diff --git "a/checkpoint-50354/trainer_state.json" "b/checkpoint-50354/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-50354/trainer_state.json" @@ -0,0 +1,374828 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.010843229932082, + "global_step": 50354, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-07, + "loss": 10.8952, + "theoretical_loss": 20.81281780154715, + "tokens_seen": 65536 + }, + { + "epoch": 0.0, + "learning_rate": 1.984126984126984e-06, + "loss": 10.8925, + "theoretical_loss": 17.566201104328645, + "tokens_seen": 131072 + }, + { + "epoch": 0.0, + "learning_rate": 2.9761904761904763e-06, + "loss": 10.8113, + "theoretical_loss": 15.939477092836569, + "tokens_seen": 196608 + }, + { + "epoch": 0.0, + "learning_rate": 3.968253968253968e-06, + "loss": 10.7096, + "theoretical_loss": 14.89231675598857, + "tokens_seen": 262144 + }, + { + "epoch": 0.0, + "learning_rate": 4.96031746031746e-06, + "loss": 10.516, + "theoretical_loss": 14.136216937762974, + "tokens_seen": 327680 + }, + { + "epoch": 0.0, + "learning_rate": 5.9523809523809525e-06, + "loss": 10.5507, + "theoretical_loss": 13.552561472550224, + "tokens_seen": 393216 + }, + { + "epoch": 0.0, + "learning_rate": 6.944444444444444e-06, + "loss": 10.1736, + "theoretical_loss": 13.08180900140119, + "tokens_seen": 458752 + }, + { + "epoch": 0.0, + "learning_rate": 7.936507936507936e-06, + "loss": 10.0954, + "theoretical_loss": 12.690129625483323, + "tokens_seen": 524288 + }, + { + "epoch": 0.0, + "learning_rate": 8.928571428571428e-06, + "loss": 9.9354, + "theoretical_loss": 12.356592463873625, + "tokens_seen": 589824 + }, + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-06, + "loss": 9.9176, + "theoretical_loss": 12.067412607035077, + "tokens_seen": 655360 + }, + { + "epoch": 0.0, + "learning_rate": 1.0912698412698412e-05, + "loss": 9.8329, + "theoretical_loss": 11.813066231101676, + "tokens_seen": 720896 + }, + { + "epoch": 0.0, + "learning_rate": 1.1904761904761905e-05, + "loss": 9.5392, + "theoretical_loss": 11.586719208706729, + "tokens_seen": 786432 + }, + { + "epoch": 0.0, + "learning_rate": 1.2896825396825396e-05, + "loss": 9.6937, + "theoretical_loss": 11.383314140186787, + "tokens_seen": 851968 + }, + { + "epoch": 0.0, + "learning_rate": 1.3888888888888888e-05, + "loss": 9.4581, + "theoretical_loss": 11.199011702111871, + "tokens_seen": 917504 + }, + { + "epoch": 0.0, + "learning_rate": 1.4880952380952381e-05, + "loss": 9.4229, + "theoretical_loss": 11.030833917977912, + "tokens_seen": 983040 + }, + { + "epoch": 0.0, + "learning_rate": 1.5873015873015872e-05, + "loss": 9.4341, + "theoretical_loss": 10.87642808645695, + "tokens_seen": 1048576 + }, + { + "epoch": 0.0, + "learning_rate": 1.6865079365079364e-05, + "loss": 9.3235, + "theoretical_loss": 10.733905740062724, + "tokens_seen": 1114112 + }, + { + "epoch": 0.0, + "learning_rate": 1.7857142857142855e-05, + "loss": 9.2909, + "theoretical_loss": 10.60172987623028, + "tokens_seen": 1179648 + }, + { + "epoch": 0.0, + "learning_rate": 1.884920634920635e-05, + "loss": 9.2573, + "theoretical_loss": 10.478634172356642, + "tokens_seen": 1245184 + }, + { + "epoch": 0.0, + "learning_rate": 1.984126984126984e-05, + "loss": 9.159, + "theoretical_loss": 10.36356394376333, + "tokens_seen": 1310720 + }, + { + "epoch": 0.0, + "learning_rate": 2.0833333333333333e-05, + "loss": 9.0708, + "theoretical_loss": 10.255632220896747, + "tokens_seen": 1376256 + }, + { + "epoch": 0.0, + "learning_rate": 2.1825396825396824e-05, + "loss": 9.2882, + "theoretical_loss": 10.15408655327002, + "tokens_seen": 1441792 + }, + { + "epoch": 0.0, + "learning_rate": 2.2817460317460315e-05, + "loss": 8.981, + "theoretical_loss": 10.058283561732598, + "tokens_seen": 1507328 + }, + { + "epoch": 0.0, + "learning_rate": 2.380952380952381e-05, + "loss": 9.1814, + "theoretical_loss": 9.967669178840278, + "tokens_seen": 1572864 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 67480, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 9.154304504394531, + "objective/train/theoretical_loss": 9.881763126393109, + "objective/train/tokens_used": 22098400, + "theoretical_loss": 9.881763126393109, + "tokens_seen": 1638400 + }, + { + "epoch": 0.0, + "learning_rate": 2.48015873015873e-05, + "loss": 9.2744, + "theoretical_loss": 9.881763126393109, + "tokens_seen": 1638400 + }, + { + "epoch": 0.0, + "learning_rate": 2.5793650793650793e-05, + "loss": 9.1924, + "theoretical_loss": 9.80014659154056, + "tokens_seen": 1703936 + }, + { + "epoch": 0.0, + "learning_rate": 2.6785714285714284e-05, + "loss": 9.1481, + "theoretical_loss": 9.722452346907446, + "tokens_seen": 1769472 + }, + { + "epoch": 0.0, + "learning_rate": 2.7777777777777776e-05, + "loss": 8.8718, + "theoretical_loss": 9.648356759081546, + "tokens_seen": 1835008 + }, + { + "epoch": 0.0, + "learning_rate": 2.876984126984127e-05, + "loss": 9.0305, + "theoretical_loss": 9.577573271145639, + "tokens_seen": 1900544 + }, + { + "epoch": 0.0, + "learning_rate": 2.9761904761904762e-05, + "loss": 8.9527, + "theoretical_loss": 9.509847046764852, + "tokens_seen": 1966080 + }, + { + "epoch": 0.0, + "learning_rate": 3.075396825396825e-05, + "loss": 8.8801, + "theoretical_loss": 9.444950537631936, + "tokens_seen": 2031616 + }, + { + "epoch": 0.0, + "learning_rate": 3.1746031746031745e-05, + "loss": 8.7157, + "theoretical_loss": 9.382679790910457, + "tokens_seen": 2097152 + }, + { + "epoch": 0.0, + "learning_rate": 3.273809523809524e-05, + "loss": 8.6865, + "theoretical_loss": 9.32285135423398, + "tokens_seen": 2162688 + }, + { + "epoch": 0.0, + "learning_rate": 3.373015873015873e-05, + "loss": 8.7653, + "theoretical_loss": 9.265299666660276, + "tokens_seen": 2228224 + }, + { + "epoch": 0.0, + "learning_rate": 3.472222222222222e-05, + "loss": 8.6092, + "theoretical_loss": 9.209874847444755, + "tokens_seen": 2293760 + }, + { + "epoch": 0.0, + "learning_rate": 3.571428571428571e-05, + "loss": 8.7534, + "theoretical_loss": 9.156440812508292, + "tokens_seen": 2359296 + }, + { + "epoch": 0.0, + "learning_rate": 3.670634920634921e-05, + "loss": 8.5526, + "theoretical_loss": 9.10487366241335, + "tokens_seen": 2424832 + }, + { + "epoch": 0.0, + "learning_rate": 3.76984126984127e-05, + "loss": 8.4992, + "theoretical_loss": 9.055060296533734, + "tokens_seen": 2490368 + }, + { + "epoch": 0.0, + "learning_rate": 3.8690476190476195e-05, + "loss": 8.5932, + "theoretical_loss": 9.006897216643829, + "tokens_seen": 2555904 + }, + { + "epoch": 0.0, + "learning_rate": 3.968253968253968e-05, + "loss": 8.5997, + "theoretical_loss": 8.960289489909357, + "tokens_seen": 2621440 + }, + { + "epoch": 0.0, + "learning_rate": 4.067460317460318e-05, + "loss": 8.3689, + "theoretical_loss": 8.915149846640611, + "tokens_seen": 2686976 + }, + { + "epoch": 0.0, + "learning_rate": 4.1666666666666665e-05, + "loss": 8.3045, + "theoretical_loss": 8.871397892478225, + "tokens_seen": 2752512 + }, + { + "epoch": 0.0, + "learning_rate": 4.265873015873016e-05, + "loss": 8.3696, + "theoretical_loss": 8.828959418153499, + "tokens_seen": 2818048 + }, + { + "epoch": 0.0, + "learning_rate": 4.365079365079365e-05, + "loss": 8.3294, + "theoretical_loss": 8.787765792778412, + "tokens_seen": 2883584 + }, + { + "epoch": 0.0, + "learning_rate": 4.464285714285714e-05, + "loss": 7.9479, + "theoretical_loss": 8.747753428911455, + "tokens_seen": 2949120 + }, + { + "epoch": 0.0, + "learning_rate": 4.563492063492063e-05, + "loss": 7.9269, + "theoretical_loss": 8.708863309520833, + "tokens_seen": 3014656 + }, + { + "epoch": 0.0, + "learning_rate": 4.6626984126984126e-05, + "loss": 7.9812, + "theoretical_loss": 8.671040568508847, + "tokens_seen": 3080192 + }, + { + "epoch": 0.0, + "learning_rate": 4.761904761904762e-05, + "loss": 8.0261, + "theoretical_loss": 8.634234117735474, + "tokens_seen": 3145728 + }, + { + "epoch": 0.0, + "learning_rate": 4.8611111111111115e-05, + "loss": 8.2961, + "theoretical_loss": 8.598396314536323, + "tokens_seen": 3211264 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 72733, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 8.41878604888916, + "objective/train/theoretical_loss": 8.563482664611069, + "objective/train/tokens_used": 23736800, + "theoretical_loss": 8.563482664611069, + "tokens_seen": 3276800 + }, + { + "epoch": 0.0, + "learning_rate": 4.96031746031746e-05, + "loss": 8.2388, + "theoretical_loss": 8.563482664611069, + "tokens_seen": 3276800 + }, + { + "epoch": 0.0, + "learning_rate": 5.05952380952381e-05, + "loss": 7.8592, + "theoretical_loss": 8.529451555895115, + "tokens_seen": 3342336 + }, + { + "epoch": 0.0, + "learning_rate": 5.1587301587301586e-05, + "loss": 7.7882, + "theoretical_loss": 8.496264019646002, + "tokens_seen": 3407872 + }, + { + "epoch": 0.0, + "learning_rate": 5.257936507936508e-05, + "loss": 8.0092, + "theoretical_loss": 8.463883515497187, + "tokens_seen": 3473408 + }, + { + "epoch": 0.0, + "learning_rate": 5.357142857142857e-05, + "loss": 7.8125, + "theoretical_loss": 8.432275737672779, + "tokens_seen": 3538944 + }, + { + "epoch": 0.0, + "learning_rate": 5.4563492063492063e-05, + "loss": 8.1055, + "theoretical_loss": 8.401408439930716, + "tokens_seen": 3604480 + }, + { + "epoch": 0.0, + "learning_rate": 5.555555555555555e-05, + "loss": 7.6534, + "theoretical_loss": 8.371251277120209, + "tokens_seen": 3670016 + }, + { + "epoch": 0.0, + "learning_rate": 5.6547619047619046e-05, + "loss": 7.8082, + "theoretical_loss": 8.341775661511075, + "tokens_seen": 3735552 + }, + { + "epoch": 0.0, + "learning_rate": 5.753968253968254e-05, + "loss": 7.7956, + "theoretical_loss": 8.31295463228533, + "tokens_seen": 3801088 + }, + { + "epoch": 0.0, + "learning_rate": 5.8531746031746036e-05, + "loss": 7.7599, + "theoretical_loss": 8.284762736781182, + "tokens_seen": 3866624 + }, + { + "epoch": 0.0, + "learning_rate": 5.9523809523809524e-05, + "loss": 7.683, + "theoretical_loss": 8.257175922251864, + "tokens_seen": 3932160 + }, + { + "epoch": 0.0, + "learning_rate": 6.051587301587302e-05, + "loss": 7.5917, + "theoretical_loss": 8.230171437050114, + "tokens_seen": 3997696 + }, + { + "epoch": 0.0, + "learning_rate": 6.15079365079365e-05, + "loss": 7.6448, + "theoretical_loss": 8.20372774027797, + "tokens_seen": 4063232 + }, + { + "epoch": 0.0, + "learning_rate": 6.25e-05, + "loss": 7.5636, + "theoretical_loss": 8.177824419053046, + "tokens_seen": 4128768 + }, + { + "epoch": 0.0, + "learning_rate": 6.349206349206349e-05, + "loss": 7.3527, + "theoretical_loss": 8.152442112639616, + "tokens_seen": 4194304 + }, + { + "epoch": 0.0, + "learning_rate": 6.448412698412699e-05, + "loss": 7.6386, + "theoretical_loss": 8.1275624427775, + "tokens_seen": 4259840 + }, + { + "epoch": 0.0, + "learning_rate": 6.547619047619048e-05, + "loss": 7.4188, + "theoretical_loss": 8.10316794961571, + "tokens_seen": 4325376 + }, + { + "epoch": 0.0, + "learning_rate": 6.646825396825397e-05, + "loss": 7.5008, + "theoretical_loss": 8.07924203272264, + "tokens_seen": 4390912 + }, + { + "epoch": 0.0, + "learning_rate": 6.746031746031745e-05, + "loss": 7.5319, + "theoretical_loss": 8.055768896701416, + "tokens_seen": 4456448 + }, + { + "epoch": 0.0, + "learning_rate": 6.845238095238096e-05, + "loss": 7.048, + "theoretical_loss": 8.032733500989007, + "tokens_seen": 4521984 + }, + { + "epoch": 0.0, + "learning_rate": 6.944444444444444e-05, + "loss": 7.2613, + "theoretical_loss": 8.010121513461836, + "tokens_seen": 4587520 + }, + { + "epoch": 0.0, + "learning_rate": 7.043650793650793e-05, + "loss": 7.2432, + "theoretical_loss": 7.987919267509379, + "tokens_seen": 4653056 + }, + { + "epoch": 0.0, + "learning_rate": 7.142857142857142e-05, + "loss": 7.2, + "theoretical_loss": 7.966113722271801, + "tokens_seen": 4718592 + }, + { + "epoch": 0.0, + "learning_rate": 7.242063492063492e-05, + "loss": 7.2711, + "theoretical_loss": 7.944692425767988, + "tokens_seen": 4784128 + }, + { + "epoch": 0.0, + "learning_rate": 7.341269841269842e-05, + "loss": 7.2298, + "theoretical_loss": 7.9236434806675184, + "tokens_seen": 4849664 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 77606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 7.1540398597717285, + "objective/train/theoretical_loss": 7.902955512484067, + "objective/train/tokens_used": 25375200, + "theoretical_loss": 7.902955512484067, + "tokens_seen": 4915200 + }, + { + "epoch": 0.0, + "learning_rate": 7.440476190476191e-05, + "loss": 7.0498, + "theoretical_loss": 7.902955512484067, + "tokens_seen": 4915200 + }, + { + "epoch": 0.0, + "learning_rate": 7.53968253968254e-05, + "loss": 6.8919, + "theoretical_loss": 7.882617639989203, + "tokens_seen": 4980736 + }, + { + "epoch": 0.0, + "learning_rate": 7.63888888888889e-05, + "loss": 7.1864, + "theoretical_loss": 7.862619447664628, + "tokens_seen": 5046272 + }, + { + "epoch": 0.0, + "learning_rate": 7.738095238095239e-05, + "loss": 7.153, + "theoretical_loss": 7.842950960027937, + "tokens_seen": 5111808 + }, + { + "epoch": 0.0, + "learning_rate": 7.837301587301588e-05, + "loss": 7.0043, + "theoretical_loss": 7.823602617682313, + "tokens_seen": 5177344 + }, + { + "epoch": 0.0, + "learning_rate": 7.936507936507937e-05, + "loss": 7.067, + "theoretical_loss": 7.804565254954165, + "tokens_seen": 5242880 + }, + { + "epoch": 0.0, + "learning_rate": 8.035714285714287e-05, + "loss": 7.0251, + "theoretical_loss": 7.7858300789950725, + "tokens_seen": 5308416 + }, + { + "epoch": 0.0, + "learning_rate": 8.134920634920635e-05, + "loss": 6.7699, + "theoretical_loss": 7.767388650235364, + "tokens_seen": 5373952 + }, + { + "epoch": 0.0, + "learning_rate": 8.234126984126984e-05, + "loss": 6.8388, + "theoretical_loss": 7.749232864086619, + "tokens_seen": 5439488 + }, + { + "epoch": 0.0, + "learning_rate": 8.333333333333333e-05, + "loss": 6.9893, + "theoretical_loss": 7.731354933799318, + "tokens_seen": 5505024 + }, + { + "epoch": 0.0, + "learning_rate": 8.432539682539683e-05, + "loss": 6.819, + "theoretical_loss": 7.71374737438992, + "tokens_seen": 5570560 + }, + { + "epoch": 0.0, + "learning_rate": 8.531746031746032e-05, + "loss": 6.6608, + "theoretical_loss": 7.696402987558934, + "tokens_seen": 5636096 + }, + { + "epoch": 0.0, + "learning_rate": 8.630952380952381e-05, + "loss": 6.5322, + "theoretical_loss": 7.679314847528181, + "tokens_seen": 5701632 + }, + { + "epoch": 0.0, + "learning_rate": 8.73015873015873e-05, + "loss": 6.7129, + "theoretical_loss": 7.662476287731328, + "tokens_seen": 5767168 + }, + { + "epoch": 0.0, + "learning_rate": 8.82936507936508e-05, + "loss": 6.7745, + "theoretical_loss": 7.645880888297279, + "tokens_seen": 5832704 + }, + { + "epoch": 0.0, + "learning_rate": 8.928571428571429e-05, + "loss": 6.8321, + "theoretical_loss": 7.629522464270861, + "tokens_seen": 5898240 + }, + { + "epoch": 0.0, + "learning_rate": 9.027777777777777e-05, + "loss": 6.6706, + "theoretical_loss": 7.613395054519696, + "tokens_seen": 5963776 + }, + { + "epoch": 0.0, + "learning_rate": 9.126984126984126e-05, + "loss": 6.6371, + "theoretical_loss": 7.59749291128028, + "tokens_seen": 6029312 + }, + { + "epoch": 0.0, + "learning_rate": 9.226190476190476e-05, + "loss": 6.6738, + "theoretical_loss": 7.581810490299888, + "tokens_seen": 6094848 + }, + { + "epoch": 0.0, + "learning_rate": 9.325396825396825e-05, + "loss": 6.6876, + "theoretical_loss": 7.5663424415343705, + "tokens_seen": 6160384 + }, + { + "epoch": 0.0, + "learning_rate": 9.424603174603175e-05, + "loss": 6.5013, + "theoretical_loss": 7.551083600364949, + "tokens_seen": 6225920 + }, + { + "epoch": 0.0, + "learning_rate": 9.523809523809524e-05, + "loss": 6.5264, + "theoretical_loss": 7.536028979299919, + "tokens_seen": 6291456 + }, + { + "epoch": 0.0, + "learning_rate": 9.623015873015874e-05, + "loss": 6.4506, + "theoretical_loss": 7.521173760129762, + "tokens_seen": 6356992 + }, + { + "epoch": 0.0, + "learning_rate": 9.722222222222223e-05, + "loss": 6.4093, + "theoretical_loss": 7.506513286506497, + "tokens_seen": 6422528 + }, + { + "epoch": 0.0, + "learning_rate": 9.821428571428572e-05, + "loss": 6.3955, + "theoretical_loss": 7.492043056920249, + "tokens_seen": 6488064 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 82513, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.486067771911621, + "objective/train/theoretical_loss": 7.4777587180480305, + "objective/train/tokens_used": 27013600, + "theoretical_loss": 7.4777587180480305, + "tokens_seen": 6553600 + }, + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-05, + "loss": 6.6458, + "theoretical_loss": 7.4777587180480305, + "tokens_seen": 6553600 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010019841269841271, + "loss": 6.6237, + "theoretical_loss": 7.463656058451462, + "tokens_seen": 6619136 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001011904761904762, + "loss": 6.5474, + "theoretical_loss": 7.449731002601916, + "tokens_seen": 6684672 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010218253968253968, + "loss": 6.3712, + "theoretical_loss": 7.435979605213019, + "tokens_seen": 6750208 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010317460317460317, + "loss": 6.7222, + "theoretical_loss": 7.422398045861905, + "tokens_seen": 6815744 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010416666666666667, + "loss": 6.7042, + "theoretical_loss": 7.408982623881875, + "tokens_seen": 6881280 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010515873015873016, + "loss": 6.5424, + "theoretical_loss": 7.395729753510345, + "tokens_seen": 6946816 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010615079365079365, + "loss": 6.4443, + "theoretical_loss": 7.3826359592770325, + "tokens_seen": 7012352 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010714285714285714, + "loss": 6.4673, + "theoretical_loss": 7.369697871618373, + "tokens_seen": 7077888 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010813492063492064, + "loss": 6.7002, + "theoretical_loss": 7.3569122227050885, + "tokens_seen": 7143424 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010912698412698413, + "loss": 6.2387, + "theoretical_loss": 7.3442758424706875, + "tokens_seen": 7208960 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011011904761904761, + "loss": 6.3967, + "theoretical_loss": 7.331785654829519, + "tokens_seen": 7274496 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001111111111111111, + "loss": 6.3247, + "theoretical_loss": 7.319438674073677, + "tokens_seen": 7340032 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001121031746031746, + "loss": 6.4478, + "theoretical_loss": 7.307232001438824, + "tokens_seen": 7405568 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011309523809523809, + "loss": 6.3822, + "theoretical_loss": 7.295162821829564, + "tokens_seen": 7471104 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011408730158730158, + "loss": 6.57, + "theoretical_loss": 7.283228400695652, + "tokens_seen": 7536640 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011507936507936508, + "loss": 6.351, + "theoretical_loss": 7.271426081050832, + "tokens_seen": 7602176 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011607142857142858, + "loss": 6.1891, + "theoretical_loss": 7.259753280626623, + "tokens_seen": 7667712 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011706349206349207, + "loss": 6.2692, + "theoretical_loss": 7.24820748915387, + "tokens_seen": 7733248 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011805555555555556, + "loss": 6.3516, + "theoretical_loss": 7.236786265765262, + "tokens_seen": 7798784 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011904761904761905, + "loss": 6.1213, + "theoretical_loss": 7.225487236512497, + "tokens_seen": 7864320 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012003968253968255, + "loss": 6.3916, + "theoretical_loss": 7.21430809199212, + "tokens_seen": 7929856 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012103174603174604, + "loss": 6.4588, + "theoretical_loss": 7.2032465850744005, + "tokens_seen": 7995392 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012202380952380953, + "loss": 6.1372, + "theoretical_loss": 7.192300528730015, + "tokens_seen": 8060928 + }, + { + "epoch": 0.0, + "learning_rate": 0.000123015873015873, + "loss": 6.2326, + "theoretical_loss": 7.1814677939495155, + "tokens_seen": 8126464 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 87567, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.155417442321777, + "objective/train/theoretical_loss": 7.1707463077509646, + "objective/train/tokens_used": 28652000, + "theoretical_loss": 7.1707463077509646, + "tokens_seen": 8192000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001240079365079365, + "loss": 6.3427, + "theoretical_loss": 7.1707463077509646, + "tokens_seen": 8192000 + }, + { + "epoch": 0.0, + "learning_rate": 0.000125, + "loss": 6.1035, + "theoretical_loss": 7.160134051271272, + "tokens_seen": 8257536 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001259920634920635, + "loss": 6.2199, + "theoretical_loss": 7.149629057937138, + "tokens_seen": 8323072 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012698412698412698, + "loss": 6.1004, + "theoretical_loss": 7.139229411711638, + "tokens_seen": 8388608 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012797619047619048, + "loss": 6.084, + "theoretical_loss": 7.128933245412794, + "tokens_seen": 8454144 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012896825396825398, + "loss": 5.9073, + "theoretical_loss": 7.118738739100616, + "tokens_seen": 8519680 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012996031746031748, + "loss": 6.1843, + "theoretical_loss": 7.1086441185293445, + "tokens_seen": 8585216 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013095238095238096, + "loss": 5.9909, + "theoretical_loss": 7.09864765366177, + "tokens_seen": 8650752 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013194444444444446, + "loss": 5.8824, + "theoretical_loss": 7.088747657242693, + "tokens_seen": 8716288 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013293650793650793, + "loss": 6.1145, + "theoretical_loss": 7.078942483428749, + "tokens_seen": 8781824 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013392857142857144, + "loss": 6.1675, + "theoretical_loss": 7.069230526471966, + "tokens_seen": 8847360 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001349206349206349, + "loss": 6.1415, + "theoretical_loss": 7.059610219454568, + "tokens_seen": 8912896 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001359126984126984, + "loss": 6.1579, + "theoretical_loss": 7.0500800330726685, + "tokens_seen": 8978432 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001369047619047619, + "loss": 6.2736, + "theoretical_loss": 7.040638474466625, + "tokens_seen": 9043968 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013789682539682541, + "loss": 6.0392, + "theoretical_loss": 7.031284086095933, + "tokens_seen": 9109504 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001388888888888889, + "loss": 6.2167, + "theoretical_loss": 7.022015444656678, + "tokens_seen": 9175040 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001398809523809524, + "loss": 5.9301, + "theoretical_loss": 7.012831160039609, + "tokens_seen": 9240576 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014087301587301586, + "loss": 5.9091, + "theoretical_loss": 7.003729874327071, + "tokens_seen": 9306112 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014186507936507937, + "loss": 6.1878, + "theoretical_loss": 6.994710260827057, + "tokens_seen": 9371648 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014285714285714284, + "loss": 5.9379, + "theoretical_loss": 6.98577102314278, + "tokens_seen": 9437184 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014384920634920634, + "loss": 5.8658, + "theoretical_loss": 6.976910894276189, + "tokens_seen": 9502720 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014484126984126984, + "loss": 5.9478, + "theoretical_loss": 6.968128635764015, + "tokens_seen": 9568256 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014583333333333335, + "loss": 6.1221, + "theoretical_loss": 6.959423036844894, + "tokens_seen": 9633792 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014682539682539685, + "loss": 5.9375, + "theoretical_loss": 6.950792913656309, + "tokens_seen": 9699328 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014781746031746032, + "loss": 5.9392, + "theoretical_loss": 6.942237108460029, + "tokens_seen": 9764864 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 92738, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.9081549644470215, + "objective/train/theoretical_loss": 6.9337544888949, + "objective/train/tokens_used": 30290400, + "theoretical_loss": 6.9337544888949, + "tokens_seen": 9830400 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014880952380952382, + "loss": 5.9356, + "theoretical_loss": 6.9337544888949, + "tokens_seen": 9830400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001498015873015873, + "loss": 6.1536, + "theoretical_loss": 6.925343947255817, + "tokens_seen": 9895936 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001507936507936508, + "loss": 5.9417, + "theoretical_loss": 6.917004399797798, + "tokens_seen": 9961472 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015178571428571427, + "loss": 6.0151, + "theoretical_loss": 6.908734786064147, + "tokens_seen": 10027008 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001527777777777778, + "loss": 6.0507, + "theoretical_loss": 6.900534068237688, + "tokens_seen": 10092544 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015376984126984128, + "loss": 5.9881, + "theoretical_loss": 6.89240123051416, + "tokens_seen": 10158080 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015476190476190478, + "loss": 6.0754, + "theoretical_loss": 6.884335278496871, + "tokens_seen": 10223616 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015575396825396825, + "loss": 5.7266, + "theoretical_loss": 6.87633523861175, + "tokens_seen": 10289152 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015674603174603175, + "loss": 5.9508, + "theoretical_loss": 6.868400157541997, + "tokens_seen": 10354688 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015773809523809523, + "loss": 5.7484, + "theoretical_loss": 6.860529101681551, + "tokens_seen": 10420224 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015873015873015873, + "loss": 5.9385, + "theoretical_loss": 6.85272115660663, + "tokens_seen": 10485760 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001597222222222222, + "loss": 5.9719, + "theoretical_loss": 6.844975426564642, + "tokens_seen": 10551296 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016071428571428573, + "loss": 5.9549, + "theoretical_loss": 6.8372910339797945, + "tokens_seen": 10616832 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001617063492063492, + "loss": 6.0978, + "theoretical_loss": 6.829667118974749, + "tokens_seen": 10682368 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001626984126984127, + "loss": 5.7867, + "theoretical_loss": 6.8221028389077185, + "tokens_seen": 10747904 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016369047619047618, + "loss": 5.8499, + "theoretical_loss": 6.814597367924395, + "tokens_seen": 10813440 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016468253968253969, + "loss": 5.9166, + "theoretical_loss": 6.807149896524181, + "tokens_seen": 10878976 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016567460317460316, + "loss": 5.7915, + "theoretical_loss": 6.799759631140145, + "tokens_seen": 10944512 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016666666666666666, + "loss": 5.7431, + "theoretical_loss": 6.7924257937322245, + "tokens_seen": 11010048 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016765873015873016, + "loss": 5.8272, + "theoretical_loss": 6.785147621393148, + "tokens_seen": 11075584 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016865079365079366, + "loss": 5.6441, + "theoretical_loss": 6.777924365966638, + "tokens_seen": 11141120 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016964285714285717, + "loss": 5.733, + "theoretical_loss": 6.770755293677423, + "tokens_seen": 11206656 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017063492063492064, + "loss": 5.8183, + "theoretical_loss": 6.763639684772625, + "tokens_seen": 11272192 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017162698412698414, + "loss": 5.7902, + "theoretical_loss": 6.756576833174123, + "tokens_seen": 11337728 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017261904761904762, + "loss": 5.7469, + "theoretical_loss": 6.749566046141486, + "tokens_seen": 11403264 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 97786, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.325038433074951, + "objective/train/theoretical_loss": 6.7426066439450905, + "objective/train/tokens_used": 31928800, + "theoretical_loss": 6.7426066439450905, + "tokens_seen": 11468800 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017361111111111112, + "loss": 5.8377, + "theoretical_loss": 6.7426066439450905, + "tokens_seen": 11468800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001746031746031746, + "loss": 6.0515, + "theoretical_loss": 6.735697959549075, + "tokens_seen": 11534336 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001755952380952381, + "loss": 5.8069, + "theoretical_loss": 6.728839338303761, + "tokens_seen": 11599872 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001765873015873016, + "loss": 5.6908, + "theoretical_loss": 6.722030137647226, + "tokens_seen": 11665408 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001775793650793651, + "loss": 5.5015, + "theoretical_loss": 6.715269726815689, + "tokens_seen": 11730944 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017857142857142857, + "loss": 5.6095, + "theoretical_loss": 6.7085574865624125, + "tokens_seen": 11796480 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017956349206349207, + "loss": 5.588, + "theoretical_loss": 6.701892808884824, + "tokens_seen": 11862016 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018055555555555555, + "loss": 5.7956, + "theoretical_loss": 6.695275096759559, + "tokens_seen": 11927552 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018154761904761905, + "loss": 5.759, + "theoretical_loss": 6.68870376388518, + "tokens_seen": 11993088 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018253968253968252, + "loss": 5.7904, + "theoretical_loss": 6.682178234432274, + "tokens_seen": 12058624 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018353174603174602, + "loss": 5.8681, + "theoretical_loss": 6.675697942800715, + "tokens_seen": 12124160 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018452380952380953, + "loss": 5.6942, + "theoretical_loss": 6.669262333383815, + "tokens_seen": 12189696 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018551587301587303, + "loss": 5.612, + "theoretical_loss": 6.662870860339158, + "tokens_seen": 12255232 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001865079365079365, + "loss": 5.8235, + "theoretical_loss": 6.656522987365879, + "tokens_seen": 12320768 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001875, + "loss": 5.7374, + "theoretical_loss": 6.6502181874881705, + "tokens_seen": 12386304 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001884920634920635, + "loss": 5.624, + "theoretical_loss": 6.643955942844831, + "tokens_seen": 12451840 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018948412698412698, + "loss": 5.7172, + "theoretical_loss": 6.637735744484626, + "tokens_seen": 12517376 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019047619047619048, + "loss": 5.6884, + "theoretical_loss": 6.631557092167304, + "tokens_seen": 12582912 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019146825396825398, + "loss": 5.5524, + "theoretical_loss": 6.625419494170049, + "tokens_seen": 12648448 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019246031746031748, + "loss": 5.9368, + "theoretical_loss": 6.619322467099223, + "tokens_seen": 12713984 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019345238095238096, + "loss": 5.6623, + "theoretical_loss": 6.613265535707211, + "tokens_seen": 12779520 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019444444444444446, + "loss": 5.4595, + "theoretical_loss": 6.607248232714213, + "tokens_seen": 12845056 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019543650793650793, + "loss": 5.2926, + "theoretical_loss": 6.60127009863481, + "tokens_seen": 12910592 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019642857142857144, + "loss": 5.5284, + "theoretical_loss": 6.59533068160918, + "tokens_seen": 12976128 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001974206349206349, + "loss": 5.7083, + "theoretical_loss": 6.589429537238785, + "tokens_seen": 13041664 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 102516, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.887484073638916, + "objective/train/theoretical_loss": 6.583566228426414, + "objective/train/tokens_used": 33567200, + "theoretical_loss": 6.583566228426414, + "tokens_seen": 13107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001984126984126984, + "loss": 5.7401, + "theoretical_loss": 6.583566228426414, + "tokens_seen": 13107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019940476190476191, + "loss": 5.6309, + "theoretical_loss": 6.5777403252204305, + "tokens_seen": 13172736 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020039682539682542, + "loss": 5.5628, + "theoretical_loss": 6.571951404663098, + "tokens_seen": 13238272 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002013888888888889, + "loss": 5.7721, + "theoretical_loss": 6.566199050642863, + "tokens_seen": 13303808 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002023809523809524, + "loss": 5.3532, + "theoretical_loss": 6.560482853750463, + "tokens_seen": 13369344 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020337301587301587, + "loss": 5.6038, + "theoretical_loss": 6.554802411138745, + "tokens_seen": 13434880 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020436507936507937, + "loss": 5.8779, + "theoretical_loss": 6.549157326386091, + "tokens_seen": 13500416 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020535714285714284, + "loss": 5.8416, + "theoretical_loss": 6.54354720936333, + "tokens_seen": 13565952 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020634920634920634, + "loss": 5.3702, + "theoretical_loss": 6.537971676104026, + "tokens_seen": 13631488 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020734126984126985, + "loss": 5.8494, + "theoretical_loss": 6.532430348678068, + "tokens_seen": 13697024 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020833333333333335, + "loss": 5.6039, + "theoretical_loss": 6.5269228550684195, + "tokens_seen": 13762560 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020932539682539685, + "loss": 5.5831, + "theoretical_loss": 6.521448829050978, + "tokens_seen": 13828096 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021031746031746032, + "loss": 5.5982, + "theoretical_loss": 6.516007910077416, + "tokens_seen": 13893632 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021130952380952382, + "loss": 5.4289, + "theoretical_loss": 6.51059974316095, + "tokens_seen": 13959168 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002123015873015873, + "loss": 5.5763, + "theoretical_loss": 6.50522397876491, + "tokens_seen": 14024704 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002132936507936508, + "loss": 5.5622, + "theoretical_loss": 6.499880272694068, + "tokens_seen": 14090240 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021428571428571427, + "loss": 5.6438, + "theoretical_loss": 6.494568285988618, + "tokens_seen": 14155776 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002152777777777778, + "loss": 5.662, + "theoretical_loss": 6.489287684820745, + "tokens_seen": 14221312 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021626984126984128, + "loss": 5.3087, + "theoretical_loss": 6.484038140393699, + "tokens_seen": 14286848 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021726190476190478, + "loss": 5.3336, + "theoretical_loss": 6.4788193288433105, + "tokens_seen": 14352384 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021825396825396825, + "loss": 5.4532, + "theoretical_loss": 6.473630931141869, + "tokens_seen": 14417920 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021924603174603176, + "loss": 5.6727, + "theoretical_loss": 6.468472633004308, + "tokens_seen": 14483456 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022023809523809523, + "loss": 5.3546, + "theoretical_loss": 6.463344124796616, + "tokens_seen": 14548992 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022123015873015873, + "loss": 5.469, + "theoretical_loss": 6.45824510144643, + "tokens_seen": 14614528 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002222222222222222, + "loss": 5.7059, + "theoretical_loss": 6.45317526235573, + "tokens_seen": 14680064 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 107633, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.52135705947876, + "objective/train/theoretical_loss": 6.448134311315593, + "objective/train/tokens_used": 35205600, + "theoretical_loss": 6.448134311315593, + "tokens_seen": 14745600 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022321428571428573, + "loss": 5.493, + "theoretical_loss": 6.448134311315593, + "tokens_seen": 14745600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002242063492063492, + "loss": 5.5841, + "theoretical_loss": 6.443121956422939, + "tokens_seen": 14811136 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002251984126984127, + "loss": 5.329, + "theoretical_loss": 6.438137909999214, + "tokens_seen": 14876672 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022619047619047618, + "loss": 5.5932, + "theoretical_loss": 6.433181888510964, + "tokens_seen": 14942208 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022718253968253969, + "loss": 5.3925, + "theoretical_loss": 6.428253612492239, + "tokens_seen": 15007744 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022817460317460316, + "loss": 5.3742, + "theoretical_loss": 6.4233528064687855, + "tokens_seen": 15073280 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022916666666666666, + "loss": 5.6301, + "theoretical_loss": 6.418479198883969, + "tokens_seen": 15138816 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023015873015873016, + "loss": 5.0119, + "theoretical_loss": 6.413632522026391, + "tokens_seen": 15204352 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023115079365079367, + "loss": 5.4315, + "theoretical_loss": 6.40881251195914, + "tokens_seen": 15269888 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023214285714285717, + "loss": 5.401, + "theoretical_loss": 6.404018908450656, + "tokens_seen": 15335424 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023313492063492064, + "loss": 5.2261, + "theoretical_loss": 6.399251454907132, + "tokens_seen": 15400960 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023412698412698414, + "loss": 5.44, + "theoretical_loss": 6.394509898306452, + "tokens_seen": 15466496 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023511904761904762, + "loss": 5.4848, + "theoretical_loss": 6.389793989133574, + "tokens_seen": 15532032 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023611111111111112, + "loss": 5.5739, + "theoretical_loss": 6.385103481317387, + "tokens_seen": 15597568 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002371031746031746, + "loss": 5.3712, + "theoretical_loss": 6.380438132168923, + "tokens_seen": 15663104 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002380952380952381, + "loss": 5.3527, + "theoretical_loss": 6.375797702320966, + "tokens_seen": 15728640 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002390873015873016, + "loss": 5.4, + "theoretical_loss": 6.371181955668966, + "tokens_seen": 15794176 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002400793650793651, + "loss": 5.3654, + "theoretical_loss": 6.366590659313248, + "tokens_seen": 15859712 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024107142857142857, + "loss": 5.4837, + "theoretical_loss": 6.36202358350248, + "tokens_seen": 15925248 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024206349206349207, + "loss": 5.2794, + "theoretical_loss": 6.357480501578371, + "tokens_seen": 15990784 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024305555555555555, + "loss": 5.2403, + "theoretical_loss": 6.352961189921553, + "tokens_seen": 16056320 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024404761904761905, + "loss": 5.3106, + "theoretical_loss": 6.348465427898629, + "tokens_seen": 16121856 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024503968253968255, + "loss": 5.3799, + "theoretical_loss": 6.343992997810366, + "tokens_seen": 16187392 + }, + { + "epoch": 0.0, + "learning_rate": 0.000246031746031746, + "loss": 5.3256, + "theoretical_loss": 6.33954368484097, + "tokens_seen": 16252928 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024702380952380955, + "loss": 5.5652, + "theoretical_loss": 6.33511727700846, + "tokens_seen": 16318464 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 112492, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.672967910766602, + "objective/train/theoretical_loss": 6.330713565116083, + "objective/train/tokens_used": 36844000, + "theoretical_loss": 6.330713565116083, + "tokens_seen": 16384000 + }, + { + "epoch": 0.0, + "learning_rate": 0.000248015873015873, + "loss": 5.5336, + "theoretical_loss": 6.330713565116083, + "tokens_seen": 16384000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002490079365079365, + "loss": 5.1247, + "theoretical_loss": 6.326332342704751, + "tokens_seen": 16449536 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025, + "loss": 4.9896, + "theoretical_loss": 6.32197340600647, + "tokens_seen": 16515072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002509920634920635, + "loss": 5.323, + "theoretical_loss": 6.3176365538987636, + "tokens_seen": 16580608 + }, + { + "epoch": 0.01, + "learning_rate": 0.000251984126984127, + "loss": 5.0875, + "theoretical_loss": 6.313321587860021, + "tokens_seen": 16646144 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025297619047619046, + "loss": 5.4551, + "theoretical_loss": 6.309028311925785, + "tokens_seen": 16711680 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025396825396825396, + "loss": 5.3549, + "theoretical_loss": 6.304756532645939, + "tokens_seen": 16777216 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025496031746031746, + "loss": 5.1, + "theoretical_loss": 6.300506059042775, + "tokens_seen": 16842752 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025595238095238096, + "loss": 5.4827, + "theoretical_loss": 6.296276702569918, + "tokens_seen": 16908288 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002569444444444444, + "loss": 5.4362, + "theoretical_loss": 6.292068277072099, + "tokens_seen": 16973824 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025793650793650796, + "loss": 5.3388, + "theoretical_loss": 6.28788059874573, + "tokens_seen": 17039360 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025892857142857146, + "loss": 5.2424, + "theoretical_loss": 6.283713486100297, + "tokens_seen": 17104896 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025992063492063497, + "loss": 5.3369, + "theoretical_loss": 6.279566759920507, + "tokens_seen": 17170432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002609126984126984, + "loss": 5.3566, + "theoretical_loss": 6.275440243229228, + "tokens_seen": 17235968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002619047619047619, + "loss": 5.1989, + "theoretical_loss": 6.271333761251142, + "tokens_seen": 17301504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002628968253968254, + "loss": 5.2989, + "theoretical_loss": 6.267247141377137, + "tokens_seen": 17367040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002638888888888889, + "loss": 5.1155, + "theoretical_loss": 6.2631802131294085, + "tokens_seen": 17432576 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026488095238095237, + "loss": 5.0634, + "theoretical_loss": 6.259132808127246, + "tokens_seen": 17498112 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026587301587301587, + "loss": 5.2512, + "theoretical_loss": 6.255104760053497, + "tokens_seen": 17563648 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026686507936507937, + "loss": 5.2924, + "theoretical_loss": 6.251095904621689, + "tokens_seen": 17629184 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026785714285714287, + "loss": 5.1212, + "theoretical_loss": 6.247106079543801, + "tokens_seen": 17694720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002688492063492063, + "loss": 5.2689, + "theoretical_loss": 6.243135124498652, + "tokens_seen": 17760256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002698412698412698, + "loss": 5.2765, + "theoretical_loss": 6.239182881100916, + "tokens_seen": 17825792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002708333333333333, + "loss": 5.2352, + "theoretical_loss": 6.235249192870732, + "tokens_seen": 17891328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002718253968253968, + "loss": 5.5256, + "theoretical_loss": 6.231333905203899, + "tokens_seen": 17956864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 117713, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.518486022949219, + "objective/train/theoretical_loss": 6.227436865342643, + "objective/train/tokens_used": 38482400, + "theoretical_loss": 6.227436865342643, + "tokens_seen": 18022400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002728174603174603, + "loss": 5.1989, + "theoretical_loss": 6.227436865342643, + "tokens_seen": 18022400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002738095238095238, + "loss": 5.243, + "theoretical_loss": 6.223557922346955, + "tokens_seen": 18087936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002748015873015873, + "loss": 5.205, + "theoretical_loss": 6.219696927066456, + "tokens_seen": 18153472 + }, + { + "epoch": 0.01, + "learning_rate": 0.00027579365079365083, + "loss": 5.068, + "theoretical_loss": 6.215853732112821, + "tokens_seen": 18219008 + }, + { + "epoch": 0.01, + "learning_rate": 0.00027678571428571433, + "loss": 5.1643, + "theoretical_loss": 6.212028191832702, + "tokens_seen": 18284544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002777777777777778, + "loss": 5.1496, + "theoretical_loss": 6.208220162281178, + "tokens_seen": 18350080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002787698412698413, + "loss": 5.0765, + "theoretical_loss": 6.204429501195701, + "tokens_seen": 18415616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002797619047619048, + "loss": 4.9861, + "theoretical_loss": 6.20065606797053, + "tokens_seen": 18481152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002807539682539683, + "loss": 5.2004, + "theoretical_loss": 6.19689972363164, + "tokens_seen": 18546688 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028174603174603173, + "loss": 5.3659, + "theoretical_loss": 6.1931603308120975, + "tokens_seen": 18612224 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028273809523809523, + "loss": 5.1814, + "theoretical_loss": 6.189437753727901, + "tokens_seen": 18677760 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028373015873015873, + "loss": 5.1199, + "theoretical_loss": 6.185731858154261, + "tokens_seen": 18743296 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028472222222222223, + "loss": 5.2977, + "theoretical_loss": 6.182042511402313, + "tokens_seen": 18808832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002857142857142857, + "loss": 5.1583, + "theoretical_loss": 6.17836958229627, + "tokens_seen": 18874368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002867063492063492, + "loss": 5.0914, + "theoretical_loss": 6.1747129411509825, + "tokens_seen": 18939904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002876984126984127, + "loss": 4.8355, + "theoretical_loss": 6.171072459749913, + "tokens_seen": 19005440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002886904761904762, + "loss": 5.1587, + "theoretical_loss": 6.1674480113235095, + "tokens_seen": 19070976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002896825396825397, + "loss": 5.0374, + "theoretical_loss": 6.163839470527964, + "tokens_seen": 19136512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002906746031746032, + "loss": 5.3875, + "theoretical_loss": 6.160246713424372, + "tokens_seen": 19202048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002916666666666667, + "loss": 5.1875, + "theoretical_loss": 6.156669617458243, + "tokens_seen": 19267584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002926587301587302, + "loss": 5.3545, + "theoretical_loss": 6.153108061439397, + "tokens_seen": 19333120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002936507936507937, + "loss": 5.1569, + "theoretical_loss": 6.149561925522211, + "tokens_seen": 19398656 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029464285714285714, + "loss": 5.2313, + "theoretical_loss": 6.146031091186222, + "tokens_seen": 19464192 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029563492063492064, + "loss": 5.3309, + "theoretical_loss": 6.142515441217064, + "tokens_seen": 19529728 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029662698412698414, + "loss": 5.2886, + "theoretical_loss": 6.1390148596877605, + "tokens_seen": 19595264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 122717, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.544928073883057, + "objective/train/theoretical_loss": 6.135529231940326, + "objective/train/tokens_used": 40120800, + "theoretical_loss": 6.135529231940326, + "tokens_seen": 19660800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029761904761904765, + "loss": 5.3891, + "theoretical_loss": 6.135529231940326, + "tokens_seen": 19660800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002986111111111111, + "loss": 5.1961, + "theoretical_loss": 6.132058444567705, + "tokens_seen": 19726336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002996031746031746, + "loss": 5.166, + "theoretical_loss": 6.128602385396022, + "tokens_seen": 19791872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003005952380952381, + "loss": 5.1897, + "theoretical_loss": 6.125160943467138, + "tokens_seen": 19857408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003015873015873016, + "loss": 5.1994, + "theoretical_loss": 6.121734009021521, + "tokens_seen": 19922944 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030257936507936505, + "loss": 5.0567, + "theoretical_loss": 6.118321473481398, + "tokens_seen": 19988480 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030357142857142855, + "loss": 5.1657, + "theoretical_loss": 6.114923229434213, + "tokens_seen": 20054016 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030456349206349205, + "loss": 5.1727, + "theoretical_loss": 6.111539170616359, + "tokens_seen": 20119552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003055555555555556, + "loss": 4.7328, + "theoretical_loss": 6.108169191897195, + "tokens_seen": 20185088 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030654761904761905, + "loss": 5.2345, + "theoretical_loss": 6.104813189263336, + "tokens_seen": 20250624 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030753968253968255, + "loss": 5.1383, + "theoretical_loss": 6.101471059803204, + "tokens_seen": 20316160 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030853174603174605, + "loss": 5.1325, + "theoretical_loss": 6.098142701691856, + "tokens_seen": 20381696 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030952380952380956, + "loss": 4.9321, + "theoretical_loss": 6.094828014176053, + "tokens_seen": 20447232 + }, + { + "epoch": 0.01, + "learning_rate": 0.000310515873015873, + "loss": 5.3554, + "theoretical_loss": 6.091526897559593, + "tokens_seen": 20512768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003115079365079365, + "loss": 5.1831, + "theoretical_loss": 6.088239253188885, + "tokens_seen": 20578304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003125, + "loss": 5.2792, + "theoretical_loss": 6.084964983438763, + "tokens_seen": 20643840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003134920634920635, + "loss": 4.9628, + "theoretical_loss": 6.0817039916985465, + "tokens_seen": 20709376 + }, + { + "epoch": 0.01, + "learning_rate": 0.000314484126984127, + "loss": 5.0786, + "theoretical_loss": 6.078456182358325, + "tokens_seen": 20774912 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031547619047619046, + "loss": 5.3443, + "theoretical_loss": 6.075221460795472, + "tokens_seen": 20840448 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031646825396825396, + "loss": 5.2776, + "theoretical_loss": 6.071999733361386, + "tokens_seen": 20905984 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031746031746031746, + "loss": 4.7741, + "theoretical_loss": 6.068790907368448, + "tokens_seen": 20971520 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031845238095238096, + "loss": 5.0769, + "theoretical_loss": 6.0655948910771915, + "tokens_seen": 21037056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003194444444444444, + "loss": 5.2176, + "theoretical_loss": 6.062411593683687, + "tokens_seen": 21102592 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032043650793650796, + "loss": 5.2326, + "theoretical_loss": 6.059240925307134, + "tokens_seen": 21168128 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032142857142857147, + "loss": 5.2254, + "theoretical_loss": 6.056082796977648, + "tokens_seen": 21233664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 127954, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.949924468994141, + "objective/train/theoretical_loss": 6.052937120624258, + "objective/train/tokens_used": 41759200, + "theoretical_loss": 6.052937120624258, + "tokens_seen": 21299200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032242063492063497, + "loss": 5.1471, + "theoretical_loss": 6.052937120624258, + "tokens_seen": 21299200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003234126984126984, + "loss": 4.9762, + "theoretical_loss": 6.049803809063083, + "tokens_seen": 21364736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003244047619047619, + "loss": 4.7374, + "theoretical_loss": 6.0466827759857145, + "tokens_seen": 21430272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003253968253968254, + "loss": 4.8236, + "theoretical_loss": 6.04357393594778, + "tokens_seen": 21495808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003263888888888889, + "loss": 5.0896, + "theoretical_loss": 6.040477204357686, + "tokens_seen": 21561344 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032738095238095237, + "loss": 4.9351, + "theoretical_loss": 6.037392497465552, + "tokens_seen": 21626880 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032837301587301587, + "loss": 4.9554, + "theoretical_loss": 6.034319732352309, + "tokens_seen": 21692416 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032936507936507937, + "loss": 5.2635, + "theoretical_loss": 6.031258826918979, + "tokens_seen": 21757952 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033035714285714287, + "loss": 5.0873, + "theoretical_loss": 6.0282096998761245, + "tokens_seen": 21823488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003313492063492063, + "loss": 5.1317, + "theoretical_loss": 6.025172270733464, + "tokens_seen": 21889024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003323412698412698, + "loss": 4.7562, + "theoretical_loss": 6.0221464597896475, + "tokens_seen": 21954560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003333333333333333, + "loss": 5.0833, + "theoretical_loss": 6.0191321881221995, + "tokens_seen": 22020096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003343253968253968, + "loss": 4.9952, + "theoretical_loss": 6.016129377577614, + "tokens_seen": 22085632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003353174603174603, + "loss": 5.141, + "theoretical_loss": 6.01313795076161, + "tokens_seen": 22151168 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003363095238095238, + "loss": 4.8368, + "theoretical_loss": 6.010157831029533, + "tokens_seen": 22216704 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033730158730158733, + "loss": 5.0532, + "theoretical_loss": 6.007188942476907, + "tokens_seen": 22282240 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033829365079365083, + "loss": 5.0954, + "theoretical_loss": 6.0042312099301425, + "tokens_seen": 22347776 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033928571428571433, + "loss": 5.0574, + "theoretical_loss": 6.001284558937368, + "tokens_seen": 22413312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003402777777777778, + "loss": 4.8983, + "theoretical_loss": 5.998348915759426, + "tokens_seen": 22478848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003412698412698413, + "loss": 4.9248, + "theoretical_loss": 5.995424207360987, + "tokens_seen": 22544384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003422619047619048, + "loss": 4.8788, + "theoretical_loss": 5.992510361401818, + "tokens_seen": 22609920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003432539682539683, + "loss": 5.0268, + "theoretical_loss": 5.989607306228168, + "tokens_seen": 22675456 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034424603174603173, + "loss": 4.9311, + "theoretical_loss": 5.986714970864292, + "tokens_seen": 22740992 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034523809523809523, + "loss": 4.9288, + "theoretical_loss": 5.983833285004112, + "tokens_seen": 22806528 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034623015873015873, + "loss": 5.1659, + "theoretical_loss": 5.980962179002983, + "tokens_seen": 22872064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 130738, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.168276786804199, + "objective/train/theoretical_loss": 5.978101583869607, + "objective/train/tokens_used": 43397600, + "theoretical_loss": 5.978101583869607, + "tokens_seen": 22937600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034722222222222224, + "loss": 4.6862, + "theoretical_loss": 5.978101583869607, + "tokens_seen": 22937600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003482142857142857, + "loss": 4.9681, + "theoretical_loss": 5.975251431258057, + "tokens_seen": 23003136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003492063492063492, + "loss": 5.004, + "theoretical_loss": 5.972411653459913, + "tokens_seen": 23068672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003501984126984127, + "loss": 5.0517, + "theoretical_loss": 5.9695821833965335, + "tokens_seen": 23134208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003511904761904762, + "loss": 5.2897, + "theoretical_loss": 5.966762954611432, + "tokens_seen": 23199744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003521825396825397, + "loss": 5.0365, + "theoretical_loss": 5.963953901262764, + "tokens_seen": 23265280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003531746031746032, + "loss": 4.9465, + "theoretical_loss": 5.961154958115937, + "tokens_seen": 23330816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003541666666666667, + "loss": 5.1461, + "theoretical_loss": 5.958366060536315, + "tokens_seen": 23396352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003551587301587302, + "loss": 4.8917, + "theoretical_loss": 5.955587144482044, + "tokens_seen": 23461888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003561507936507937, + "loss": 4.9521, + "theoretical_loss": 5.952818146496978, + "tokens_seen": 23527424 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035714285714285714, + "loss": 4.8648, + "theoretical_loss": 5.950059003703704, + "tokens_seen": 23592960 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035813492063492064, + "loss": 5.081, + "theoretical_loss": 5.94730965379668, + "tokens_seen": 23658496 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035912698412698415, + "loss": 4.9531, + "theoretical_loss": 5.944570035035458, + "tokens_seen": 23724032 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036011904761904765, + "loss": 4.8974, + "theoretical_loss": 5.941840086238027, + "tokens_seen": 23789568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003611111111111111, + "loss": 4.9338, + "theoretical_loss": 5.939119746774228, + "tokens_seen": 23855104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003621031746031746, + "loss": 4.9457, + "theoretical_loss": 5.936408956559284, + "tokens_seen": 23920640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003630952380952381, + "loss": 4.8877, + "theoretical_loss": 5.933707656047414, + "tokens_seen": 23986176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003640873015873016, + "loss": 5.2136, + "theoretical_loss": 5.93101578622554, + "tokens_seen": 24051712 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036507936507936505, + "loss": 5.1442, + "theoretical_loss": 5.928333288607086, + "tokens_seen": 24117248 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036607142857142855, + "loss": 4.939, + "theoretical_loss": 5.925660105225867, + "tokens_seen": 24182784 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036706349206349205, + "loss": 4.9773, + "theoretical_loss": 5.92299617863006, + "tokens_seen": 24248320 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003680555555555556, + "loss": 4.8856, + "theoretical_loss": 5.920341451876267, + "tokens_seen": 24313856 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036904761904761905, + "loss": 4.8102, + "theoretical_loss": 5.9176958685236585, + "tokens_seen": 24379392 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037003968253968255, + "loss": 4.9364, + "theoretical_loss": 5.9150593726282015, + "tokens_seen": 24444928 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037103174603174606, + "loss": 4.7276, + "theoretical_loss": 5.912431908736972, + "tokens_seen": 24510464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 131522, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.190422534942627, + "objective/train/theoretical_loss": 5.909813421882534, + "objective/train/tokens_used": 45036000, + "theoretical_loss": 5.909813421882534, + "tokens_seen": 24576000 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037202380952380956, + "loss": 4.8887, + "theoretical_loss": 5.909813421882534, + "tokens_seen": 24576000 + }, + { + "epoch": 0.01, + "learning_rate": 0.000373015873015873, + "loss": 4.9041, + "theoretical_loss": 5.907203857577422, + "tokens_seen": 24641536 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003740079365079365, + "loss": 4.8034, + "theoretical_loss": 5.9046031618086765, + "tokens_seen": 24707072 + }, + { + "epoch": 0.01, + "learning_rate": 0.000375, + "loss": 5.0023, + "theoretical_loss": 5.902011281032472, + "tokens_seen": 24772608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003759920634920635, + "loss": 5.0488, + "theoretical_loss": 5.899428162168808, + "tokens_seen": 24838144 + }, + { + "epoch": 0.01, + "learning_rate": 0.000376984126984127, + "loss": 5.0453, + "theoretical_loss": 5.896853752596286, + "tokens_seen": 24903680 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037797619047619046, + "loss": 4.813, + "theoretical_loss": 5.894288000146949, + "tokens_seen": 24969216 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037896825396825396, + "loss": 4.8831, + "theoretical_loss": 5.891730853101199, + "tokens_seen": 25034752 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037996031746031746, + "loss": 5.263, + "theoretical_loss": 5.88918226018278, + "tokens_seen": 25100288 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038095238095238096, + "loss": 5.1195, + "theoretical_loss": 5.8866421705538325, + "tokens_seen": 25165824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003819444444444444, + "loss": 4.9757, + "theoretical_loss": 5.8841105338100155, + "tokens_seen": 25231360 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038293650793650797, + "loss": 5.1901, + "theoretical_loss": 5.881587299975694, + "tokens_seen": 25296896 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038392857142857147, + "loss": 4.8656, + "theoretical_loss": 5.8790724194991935, + "tokens_seen": 25362432 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038492063492063497, + "loss": 5.0726, + "theoretical_loss": 5.876565843248124, + "tokens_seen": 25427968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003859126984126984, + "loss": 5.1417, + "theoretical_loss": 5.8740675225047525, + "tokens_seen": 25493504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003869047619047619, + "loss": 5.1355, + "theoretical_loss": 5.871577408961457, + "tokens_seen": 25559040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003878968253968254, + "loss": 5.0327, + "theoretical_loss": 5.869095454716231, + "tokens_seen": 25624576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003888888888888889, + "loss": 5.139, + "theoretical_loss": 5.866621612268246, + "tokens_seen": 25690112 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038988095238095237, + "loss": 4.9834, + "theoretical_loss": 5.864155834513486, + "tokens_seen": 25755648 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039087301587301587, + "loss": 5.187, + "theoretical_loss": 5.8616980747404295, + "tokens_seen": 25821184 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039186507936507937, + "loss": 5.0414, + "theoretical_loss": 5.859248286625787, + "tokens_seen": 25886720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003928571428571429, + "loss": 5.0615, + "theoretical_loss": 5.856806424230314, + "tokens_seen": 25952256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003938492063492063, + "loss": 4.9987, + "theoretical_loss": 5.854372441994654, + "tokens_seen": 26017792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003948412698412698, + "loss": 5.0551, + "theoretical_loss": 5.851946294735258, + "tokens_seen": 26083328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003958333333333333, + "loss": 4.8932, + "theoretical_loss": 5.849527937640345, + "tokens_seen": 26148864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 132646, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.532196998596191, + "objective/train/theoretical_loss": 5.8471173262659235, + "objective/train/tokens_used": 46674400, + "theoretical_loss": 5.8471173262659235, + "tokens_seen": 26214400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003968253968253968, + "loss": 4.9406, + "theoretical_loss": 5.8471173262659235, + "tokens_seen": 26214400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003978174603174603, + "loss": 5.0843, + "theoretical_loss": 5.84471441653186, + "tokens_seen": 26279936 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039880952380952383, + "loss": 5.0651, + "theoretical_loss": 5.842319164718004, + "tokens_seen": 26345472 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039980158730158733, + "loss": 5.2335, + "theoretical_loss": 5.83993152746036, + "tokens_seen": 26411008 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040079365079365083, + "loss": 4.9606, + "theoretical_loss": 5.83755146174731, + "tokens_seen": 26476544 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040178571428571433, + "loss": 4.8502, + "theoretical_loss": 5.835178924915889, + "tokens_seen": 26542080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004027777777777778, + "loss": 5.1473, + "theoretical_loss": 5.832813874648102, + "tokens_seen": 26607616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004037698412698413, + "loss": 4.9566, + "theoretical_loss": 5.8304562689673, + "tokens_seen": 26673152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004047619047619048, + "loss": 4.7435, + "theoretical_loss": 5.828106066234588, + "tokens_seen": 26738688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004057539682539683, + "loss": 5.1525, + "theoretical_loss": 5.825763225145295, + "tokens_seen": 26804224 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040674603174603173, + "loss": 5.0371, + "theoretical_loss": 5.823427704725473, + "tokens_seen": 26869760 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040773809523809523, + "loss": 5.0707, + "theoretical_loss": 5.82109946432846, + "tokens_seen": 26935296 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040873015873015874, + "loss": 4.9852, + "theoretical_loss": 5.818778463631473, + "tokens_seen": 27000832 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040972222222222224, + "loss": 5.0129, + "theoretical_loss": 5.816464662632243, + "tokens_seen": 27066368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004107142857142857, + "loss": 4.8632, + "theoretical_loss": 5.8141580216457065, + "tokens_seen": 27131904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004117063492063492, + "loss": 5.2237, + "theoretical_loss": 5.811858501300729, + "tokens_seen": 27197440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004126984126984127, + "loss": 4.7749, + "theoretical_loss": 5.809566062536868, + "tokens_seen": 27262976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004136904761904762, + "loss": 4.9421, + "theoretical_loss": 5.807280666601191, + "tokens_seen": 27328512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004146825396825397, + "loss": 5.3685, + "theoretical_loss": 5.805002275045111, + "tokens_seen": 27394048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004156746031746032, + "loss": 4.9844, + "theoretical_loss": 5.8027308497212875, + "tokens_seen": 27459584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004166666666666667, + "loss": 5.0175, + "theoretical_loss": 5.800466352780546, + "tokens_seen": 27525120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004176587301587302, + "loss": 5.2404, + "theoretical_loss": 5.798208746668847, + "tokens_seen": 27590656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004186507936507937, + "loss": 4.9851, + "theoretical_loss": 5.795957994124291, + "tokens_seen": 27656192 + }, + { + "epoch": 0.01, + "learning_rate": 0.00041964285714285714, + "loss": 5.0305, + "theoretical_loss": 5.7937140581741575, + "tokens_seen": 27721728 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042063492063492065, + "loss": 4.947, + "theoretical_loss": 5.791476902131985, + "tokens_seen": 27787264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 133227, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.931251525878906, + "objective/train/theoretical_loss": 5.789246489594688, + "objective/train/tokens_used": 48312800, + "theoretical_loss": 5.789246489594688, + "tokens_seen": 27852800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042162698412698415, + "loss": 5.0547, + "theoretical_loss": 5.789246489594688, + "tokens_seen": 27852800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042261904761904765, + "loss": 5.0383, + "theoretical_loss": 5.787022784439701, + "tokens_seen": 27918336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004236111111111111, + "loss": 5.0555, + "theoretical_loss": 5.784805750822171, + "tokens_seen": 27983872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004246031746031746, + "loss": 4.825, + "theoretical_loss": 5.782595353172176, + "tokens_seen": 28049408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004255952380952381, + "loss": 5.0085, + "theoretical_loss": 5.780391556191977, + "tokens_seen": 28114944 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004265873015873016, + "loss": 4.7223, + "theoretical_loss": 5.778194324853311, + "tokens_seen": 28180480 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042757936507936505, + "loss": 4.9563, + "theoretical_loss": 5.776003624394711, + "tokens_seen": 28246016 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042857142857142855, + "loss": 4.8565, + "theoretical_loss": 5.773819420318858, + "tokens_seen": 28311552 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042956349206349205, + "loss": 4.653, + "theoretical_loss": 5.771641678389971, + "tokens_seen": 28377088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004305555555555556, + "loss": 4.9409, + "theoretical_loss": 5.769470364631225, + "tokens_seen": 28442624 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043154761904761905, + "loss": 5.1167, + "theoretical_loss": 5.767305445322201, + "tokens_seen": 28508160 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043253968253968256, + "loss": 4.933, + "theoretical_loss": 5.765146886996363, + "tokens_seen": 28573696 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043353174603174606, + "loss": 4.8283, + "theoretical_loss": 5.762994656438579, + "tokens_seen": 28639232 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043452380952380956, + "loss": 5.0357, + "theoretical_loss": 5.760848720682651, + "tokens_seen": 28704768 + }, + { + "epoch": 0.01, + "learning_rate": 0.000435515873015873, + "loss": 5.0796, + "theoretical_loss": 5.758709047008894, + "tokens_seen": 28770304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004365079365079365, + "loss": 4.8349, + "theoretical_loss": 5.756575602941732, + "tokens_seen": 28835840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004375, + "loss": 5.0019, + "theoretical_loss": 5.75444835624733, + "tokens_seen": 28901376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004384920634920635, + "loss": 4.8212, + "theoretical_loss": 5.752327274931249, + "tokens_seen": 28966912 + }, + { + "epoch": 0.01, + "learning_rate": 0.000439484126984127, + "loss": 4.9358, + "theoretical_loss": 5.750212327236129, + "tokens_seen": 29032448 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044047619047619046, + "loss": 4.9691, + "theoretical_loss": 5.7481034816394105, + "tokens_seen": 29097984 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044146825396825396, + "loss": 4.9269, + "theoretical_loss": 5.7460007068510635, + "tokens_seen": 29163520 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044246031746031746, + "loss": 5.0363, + "theoretical_loss": 5.74390397181136, + "tokens_seen": 29229056 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044345238095238096, + "loss": 4.6776, + "theoretical_loss": 5.741813245688668, + "tokens_seen": 29294592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004444444444444444, + "loss": 4.8552, + "theoretical_loss": 5.739728497877267, + "tokens_seen": 29360128 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044543650793650797, + "loss": 5.2274, + "theoretical_loss": 5.737649697995197, + "tokens_seen": 29425664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 134422, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.431413650512695, + "objective/train/theoretical_loss": 5.7355768158821245, + "objective/train/tokens_used": 49951200, + "theoretical_loss": 5.7355768158821245, + "tokens_seen": 29491200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044642857142857147, + "loss": 5.112, + "theoretical_loss": 5.7355768158821245, + "tokens_seen": 29491200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044742063492063497, + "loss": 5.1022, + "theoretical_loss": 5.73350982159724, + "tokens_seen": 29556736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004484126984126984, + "loss": 5.0531, + "theoretical_loss": 5.731448685417178, + "tokens_seen": 29622272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004494047619047619, + "loss": 4.8186, + "theoretical_loss": 5.729393377833956, + "tokens_seen": 29687808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004503968253968254, + "loss": 4.9572, + "theoretical_loss": 5.7273438695529535, + "tokens_seen": 29753344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004513888888888889, + "loss": 4.9607, + "theoretical_loss": 5.725300131490888, + "tokens_seen": 29818880 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045238095238095237, + "loss": 4.8473, + "theoretical_loss": 5.7232621347738455, + "tokens_seen": 29884416 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045337301587301587, + "loss": 5.0085, + "theoretical_loss": 5.721229850735305, + "tokens_seen": 29949952 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045436507936507937, + "loss": 4.938, + "theoretical_loss": 5.719203250914208, + "tokens_seen": 30015488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004553571428571429, + "loss": 4.8577, + "theoretical_loss": 5.717182307053037, + "tokens_seen": 30081024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004563492063492063, + "loss": 5.0517, + "theoretical_loss": 5.715166991095922, + "tokens_seen": 30146560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004573412698412698, + "loss": 4.8027, + "theoretical_loss": 5.713157275186761, + "tokens_seen": 30212096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004583333333333333, + "loss": 5.0271, + "theoretical_loss": 5.71115313166738, + "tokens_seen": 30277632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004593253968253968, + "loss": 5.0041, + "theoretical_loss": 5.709154533075688, + "tokens_seen": 30343168 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046031746031746033, + "loss": 4.9309, + "theoretical_loss": 5.707161452143879, + "tokens_seen": 30408704 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046130952380952383, + "loss": 4.9085, + "theoretical_loss": 5.7051738617966326, + "tokens_seen": 30474240 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046230158730158733, + "loss": 4.6103, + "theoretical_loss": 5.7031917351493515, + "tokens_seen": 30539776 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046329365079365083, + "loss": 5.0814, + "theoretical_loss": 5.701215045506411, + "tokens_seen": 30605312 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046428571428571433, + "loss": 5.0333, + "theoretical_loss": 5.699243766359421, + "tokens_seen": 30670848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004652777777777778, + "loss": 5.0393, + "theoretical_loss": 5.697277871385534, + "tokens_seen": 30736384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004662698412698413, + "loss": 4.813, + "theoretical_loss": 5.695317334445736, + "tokens_seen": 30801920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004672619047619048, + "loss": 4.8031, + "theoretical_loss": 5.693362129583184, + "tokens_seen": 30867456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004682539682539683, + "loss": 4.6799, + "theoretical_loss": 5.691412231021549, + "tokens_seen": 30932992 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046924603174603173, + "loss": 4.9719, + "theoretical_loss": 5.689467613163388, + "tokens_seen": 30998528 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047023809523809523, + "loss": 4.7362, + "theoretical_loss": 5.687528250588518, + "tokens_seen": 31064064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 135224, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.8863983154296875, + "objective/train/theoretical_loss": 5.6855941180524265, + "objective/train/tokens_used": 51589600, + "theoretical_loss": 5.6855941180524265, + "tokens_seen": 31129600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047123015873015874, + "loss": 4.893, + "theoretical_loss": 5.6855941180524265, + "tokens_seen": 31129600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047222222222222224, + "loss": 4.9991, + "theoretical_loss": 5.683665190484683, + "tokens_seen": 31195136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004732142857142857, + "loss": 5.1718, + "theoretical_loss": 5.681741442987381, + "tokens_seen": 31260672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004742063492063492, + "loss": 4.9702, + "theoretical_loss": 5.679822850833591, + "tokens_seen": 31326208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004751984126984127, + "loss": 4.9426, + "theoretical_loss": 5.677909389465831, + "tokens_seen": 31391744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004761904761904762, + "loss": 4.5549, + "theoretical_loss": 5.676001034494554, + "tokens_seen": 31457280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004771825396825397, + "loss": 4.8956, + "theoretical_loss": 5.674097761696653, + "tokens_seen": 31522816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004781746031746032, + "loss": 4.7593, + "theoretical_loss": 5.672199547013983, + "tokens_seen": 31588352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004791666666666667, + "loss": 4.7632, + "theoretical_loss": 5.670306366551898, + "tokens_seen": 31653888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004801587301587302, + "loss": 4.8197, + "theoretical_loss": 5.6684181965778, + "tokens_seen": 31719424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004811507936507937, + "loss": 5.0023, + "theoretical_loss": 5.666535013519715, + "tokens_seen": 31784960 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048214285714285715, + "loss": 4.265, + "theoretical_loss": 5.6646567939648715, + "tokens_seen": 31850496 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048313492063492065, + "loss": 4.6841, + "theoretical_loss": 5.6627835146583045, + "tokens_seen": 31916032 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048412698412698415, + "loss": 4.9755, + "theoretical_loss": 5.660915152501465, + "tokens_seen": 31981568 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048511904761904765, + "loss": 4.838, + "theoretical_loss": 5.659051684550857, + "tokens_seen": 32047104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004861111111111111, + "loss": 4.8988, + "theoretical_loss": 5.657193088016677, + "tokens_seen": 32112640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004871031746031746, + "loss": 4.8451, + "theoretical_loss": 5.655339340261474, + "tokens_seen": 32178176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004880952380952381, + "loss": 4.9273, + "theoretical_loss": 5.653490418798825, + "tokens_seen": 32243712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004890873015873016, + "loss": 4.6617, + "theoretical_loss": 5.651646301292022, + "tokens_seen": 32309248 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004900793650793651, + "loss": 4.6989, + "theoretical_loss": 5.649806965552774, + "tokens_seen": 32374784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004910714285714286, + "loss": 4.6449, + "theoretical_loss": 5.6479723895399205, + "tokens_seen": 32440320 + }, + { + "epoch": 0.01, + "learning_rate": 0.000492063492063492, + "loss": 4.9494, + "theoretical_loss": 5.6461425513581665, + "tokens_seen": 32505856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004930555555555556, + "loss": 4.6846, + "theoretical_loss": 5.6443174292568195, + "tokens_seen": 32571392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004940476190476191, + "loss": 4.6929, + "theoretical_loss": 5.6424970016285485, + "tokens_seen": 32636928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004950396825396826, + "loss": 4.6288, + "theoretical_loss": 5.640681247008156, + "tokens_seen": 32702464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 136821, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.2028489112854, + "objective/train/theoretical_loss": 5.638870144071353, + "objective/train/tokens_used": 53228000, + "theoretical_loss": 5.638870144071353, + "tokens_seen": 32768000 + }, + { + "epoch": 0.01, + "learning_rate": 0.000496031746031746, + "loss": 4.6949, + "theoretical_loss": 5.638870144071353, + "tokens_seen": 32768000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004970238095238095, + "loss": 4.5009, + "theoretical_loss": 5.637063671633564, + "tokens_seen": 32833536 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498015873015873, + "loss": 4.7564, + "theoretical_loss": 5.635261808648728, + "tokens_seen": 32899072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990079365079365, + "loss": 5.0523, + "theoretical_loss": 5.6334645342081195, + "tokens_seen": 32964608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0005, + "loss": 4.6206, + "theoretical_loss": 5.631671827539186, + "tokens_seen": 33030144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999899699097292, + "loss": 4.7802, + "theoretical_loss": 5.629883668004389, + "tokens_seen": 33095680 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999799398194584, + "loss": 4.7891, + "theoretical_loss": 5.628100035100061, + "tokens_seen": 33161216 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999699097291876, + "loss": 4.8953, + "theoretical_loss": 5.626320908455279, + "tokens_seen": 33226752 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999598796389167, + "loss": 4.5953, + "theoretical_loss": 5.6245462678307385, + "tokens_seen": 33292288 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499949849548646, + "loss": 4.67, + "theoretical_loss": 5.622776093117652, + "tokens_seen": 33357824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999398194583751, + "loss": 4.8824, + "theoretical_loss": 5.621010364336651, + "tokens_seen": 33423360 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999297893681044, + "loss": 4.7663, + "theoretical_loss": 5.619249061636698, + "tokens_seen": 33488896 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999197592778335, + "loss": 4.7263, + "theoretical_loss": 5.61749216529402, + "tokens_seen": 33554432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999097291875627, + "loss": 4.9792, + "theoretical_loss": 5.615739655711037, + "tokens_seen": 33619968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998996990972919, + "loss": 4.9381, + "theoretical_loss": 5.61399151341532, + "tokens_seen": 33685504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998896690070211, + "loss": 4.9476, + "theoretical_loss": 5.6122477190585425, + "tokens_seen": 33751040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998796389167503, + "loss": 4.5086, + "theoretical_loss": 5.610508253415453, + "tokens_seen": 33816576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998696088264795, + "loss": 4.5371, + "theoretical_loss": 5.6087730973828585, + "tokens_seen": 33882112 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998595787362087, + "loss": 4.5089, + "theoretical_loss": 5.6070422319786095, + "tokens_seen": 33947648 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998495486459378, + "loss": 4.8347, + "theoretical_loss": 5.605315638340606, + "tokens_seen": 34013184 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499839518555667, + "loss": 4.7779, + "theoretical_loss": 5.603593297725807, + "tokens_seen": 34078720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998294884653962, + "loss": 4.7224, + "theoretical_loss": 5.601875191509249, + "tokens_seen": 34144256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998194583751254, + "loss": 4.8306, + "theoretical_loss": 5.600161301183084, + "tokens_seen": 34209792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998094282848546, + "loss": 5.0491, + "theoretical_loss": 5.598451608355614, + "tokens_seen": 34275328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997993981945837, + "loss": 4.6312, + "theoretical_loss": 5.596746094750342, + "tokens_seen": 34340864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 137555, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.4468183517456055, + "objective/train/theoretical_loss": 5.595044742205037, + "objective/train/tokens_used": 54866400, + "theoretical_loss": 5.595044742205037, + "tokens_seen": 34406400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997893681043129, + "loss": 4.5163, + "theoretical_loss": 5.595044742205037, + "tokens_seen": 34406400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997793380140421, + "loss": 4.9292, + "theoretical_loss": 5.5933475326707995, + "tokens_seen": 34471936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997693079237714, + "loss": 4.8149, + "theoretical_loss": 5.591654448211143, + "tokens_seen": 34537472 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997592778335005, + "loss": 4.954, + "theoretical_loss": 5.589965471001077, + "tokens_seen": 34603008 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997492477432298, + "loss": 4.9477, + "theoretical_loss": 5.5882805833262115, + "tokens_seen": 34668544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997392176529588, + "loss": 4.7965, + "theoretical_loss": 5.586599767581859, + "tokens_seen": 34734080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997291875626881, + "loss": 5.0125, + "theoretical_loss": 5.584923006272151, + "tokens_seen": 34799616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997191574724173, + "loss": 4.7564, + "theoretical_loss": 5.583250282009159, + "tokens_seen": 34865152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997091273821465, + "loss": 4.7508, + "theoretical_loss": 5.581581577512031, + "tokens_seen": 34930688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996990972918757, + "loss": 4.7624, + "theoretical_loss": 5.579916875606134, + "tokens_seen": 34996224 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996890672016048, + "loss": 4.6123, + "theoretical_loss": 5.578256159222196, + "tokens_seen": 35061760 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499679037111334, + "loss": 4.8336, + "theoretical_loss": 5.576599411395472, + "tokens_seen": 35127296 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996690070210632, + "loss": 4.9514, + "theoretical_loss": 5.574946615264906, + "tokens_seen": 35192832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996589769307924, + "loss": 4.7192, + "theoretical_loss": 5.5732977540723105, + "tokens_seen": 35258368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996489468405216, + "loss": 4.7461, + "theoretical_loss": 5.571652811161542, + "tokens_seen": 35323904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996389167502507, + "loss": 4.6751, + "theoretical_loss": 5.570011769977693, + "tokens_seen": 35389440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996288866599799, + "loss": 4.6333, + "theoretical_loss": 5.568374614066299, + "tokens_seen": 35454976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996188565697091, + "loss": 4.6241, + "theoretical_loss": 5.566741327072535, + "tokens_seen": 35520512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996088264794383, + "loss": 4.8779, + "theoretical_loss": 5.565111892740433, + "tokens_seen": 35586048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995987963891675, + "loss": 4.4276, + "theoretical_loss": 5.563486294912105, + "tokens_seen": 35651584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995887662988968, + "loss": 4.6749, + "theoretical_loss": 5.56186451752697, + "tokens_seen": 35717120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995787362086258, + "loss": 4.665, + "theoretical_loss": 5.560246544620993, + "tokens_seen": 35782656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995687061183551, + "loss": 4.7076, + "theoretical_loss": 5.558632360325929, + "tokens_seen": 35848192 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995586760280842, + "loss": 4.7311, + "theoretical_loss": 5.557021948868571, + "tokens_seen": 35913728 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995486459378135, + "loss": 4.6598, + "theoretical_loss": 5.555415294570011, + "tokens_seen": 35979264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 138886, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.407133102416992, + "objective/train/theoretical_loss": 5.553812381844907, + "objective/train/tokens_used": 56504800, + "theoretical_loss": 5.553812381844907, + "tokens_seen": 36044800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995386158475427, + "loss": 4.5711, + "theoretical_loss": 5.553812381844907, + "tokens_seen": 36044800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995285857572718, + "loss": 4.7261, + "theoretical_loss": 5.552213195200755, + "tokens_seen": 36110336 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499518555667001, + "loss": 4.7503, + "theoretical_loss": 5.550617719237167, + "tokens_seen": 36175872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995085255767302, + "loss": 4.7123, + "theoretical_loss": 5.549025938645155, + "tokens_seen": 36241408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994984954864594, + "loss": 4.8397, + "theoretical_loss": 5.547437838206435, + "tokens_seen": 36306944 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994884653961886, + "loss": 4.8057, + "theoretical_loss": 5.545853402792717, + "tokens_seen": 36372480 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994784353059178, + "loss": 4.711, + "theoretical_loss": 5.544272617365014, + "tokens_seen": 36438016 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994684052156469, + "loss": 4.758, + "theoretical_loss": 5.542695466972956, + "tokens_seen": 36503552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994583751253761, + "loss": 4.9674, + "theoretical_loss": 5.541121936754111, + "tokens_seen": 36569088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994483450351053, + "loss": 4.8971, + "theoretical_loss": 5.539552011933312, + "tokens_seen": 36634624 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994383149448345, + "loss": 4.6474, + "theoretical_loss": 5.537985677821986, + "tokens_seen": 36700160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994282848545637, + "loss": 4.8243, + "theoretical_loss": 5.536422919817495, + "tokens_seen": 36765696 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994182547642928, + "loss": 4.7595, + "theoretical_loss": 5.5348637234024824, + "tokens_seen": 36831232 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994082246740221, + "loss": 4.8208, + "theoretical_loss": 5.53330807414422, + "tokens_seen": 36896768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993981945837512, + "loss": 4.5145, + "theoretical_loss": 5.5317559576939725, + "tokens_seen": 36962304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993881644934805, + "loss": 4.5815, + "theoretical_loss": 5.530207359786353, + "tokens_seen": 37027840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993781344032096, + "loss": 4.7233, + "theoretical_loss": 5.5286622662386975, + "tokens_seen": 37093376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993681043129389, + "loss": 4.7717, + "theoretical_loss": 5.52712066295044, + "tokens_seen": 37158912 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499358074222668, + "loss": 4.3869, + "theoretical_loss": 5.525582535902489, + "tokens_seen": 37224448 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993480441323972, + "loss": 4.5429, + "theoretical_loss": 5.524047871156618, + "tokens_seen": 37289984 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993380140421264, + "loss": 4.8437, + "theoretical_loss": 5.52251665485486, + "tokens_seen": 37355520 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993279839518556, + "loss": 4.7926, + "theoretical_loss": 5.520988873218897, + "tokens_seen": 37421056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993179538615848, + "loss": 4.8248, + "theoretical_loss": 5.519464512549478, + "tokens_seen": 37486592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993079237713139, + "loss": 4.7691, + "theoretical_loss": 5.5179435592258095, + "tokens_seen": 37552128 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992978936810431, + "loss": 4.6871, + "theoretical_loss": 5.516425999704987, + "tokens_seen": 37617664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 139553, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.058813571929932, + "objective/train/theoretical_loss": 5.514911820521407, + "objective/train/tokens_used": 58143200, + "theoretical_loss": 5.514911820521407, + "tokens_seen": 37683200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992878635907723, + "loss": 4.5181, + "theoretical_loss": 5.514911820521407, + "tokens_seen": 37683200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992778335005015, + "loss": 4.4909, + "theoretical_loss": 5.5134010082861895, + "tokens_seen": 37748736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992678034102307, + "loss": 4.5355, + "theoretical_loss": 5.511893549686616, + "tokens_seen": 37814272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992577733199598, + "loss": 4.7593, + "theoretical_loss": 5.51038943148556, + "tokens_seen": 37879808 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499247743229689, + "loss": 4.6539, + "theoretical_loss": 5.508888640520928, + "tokens_seen": 37945344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992377131394183, + "loss": 4.5281, + "theoretical_loss": 5.50739116370511, + "tokens_seen": 38010880 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992276830491475, + "loss": 4.4719, + "theoretical_loss": 5.505896988024423, + "tokens_seen": 38076416 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992176529588767, + "loss": 4.8288, + "theoretical_loss": 5.5044061005385725, + "tokens_seen": 38141952 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992076228686059, + "loss": 4.376, + "theoretical_loss": 5.502918488380116, + "tokens_seen": 38207488 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499197592778335, + "loss": 4.5187, + "theoretical_loss": 5.501434138753918, + "tokens_seen": 38273024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991875626880642, + "loss": 4.5366, + "theoretical_loss": 5.499953038936635, + "tokens_seen": 38338560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991775325977934, + "loss": 4.5867, + "theoretical_loss": 5.498475176276176, + "tokens_seen": 38404096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991675025075226, + "loss": 4.5793, + "theoretical_loss": 5.497000538191195, + "tokens_seen": 38469632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991574724172518, + "loss": 4.4392, + "theoretical_loss": 5.495529112170568, + "tokens_seen": 38535168 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499147442326981, + "loss": 4.6516, + "theoretical_loss": 5.494060885772887, + "tokens_seen": 38600704 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991374122367101, + "loss": 4.5034, + "theoretical_loss": 5.492595846625951, + "tokens_seen": 38666240 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991273821464393, + "loss": 4.8256, + "theoretical_loss": 5.491133982426266, + "tokens_seen": 38731776 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991173520561685, + "loss": 5.0062, + "theoretical_loss": 5.489675280938547, + "tokens_seen": 38797312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991073219658977, + "loss": 4.6335, + "theoretical_loss": 5.488219729995227, + "tokens_seen": 38862848 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499097291875627, + "loss": 4.2841, + "theoretical_loss": 5.486767317495966, + "tokens_seen": 38928384 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499087261785356, + "loss": 4.8828, + "theoretical_loss": 5.48531803140717, + "tokens_seen": 38993920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990772316950853, + "loss": 4.6358, + "theoretical_loss": 5.483871859761511, + "tokens_seen": 39059456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990672016048144, + "loss": 4.7315, + "theoretical_loss": 5.482428790657449, + "tokens_seen": 39124992 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990571715145437, + "loss": 4.8301, + "theoretical_loss": 5.480988812258763, + "tokens_seen": 39190528 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990471414242729, + "loss": 4.6629, + "theoretical_loss": 5.479551912794086, + "tokens_seen": 39256064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 141085, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.329990386962891, + "objective/train/theoretical_loss": 5.478118080556438, + "objective/train/tokens_used": 59781600, + "theoretical_loss": 5.478118080556438, + "tokens_seen": 39321600 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499037111334002, + "loss": 4.4817, + "theoretical_loss": 5.478118080556438, + "tokens_seen": 39321600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990270812437312, + "loss": 4.7964, + "theoretical_loss": 5.476687303902768, + "tokens_seen": 39387136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990170511534604, + "loss": 4.6476, + "theoretical_loss": 5.475259571253502, + "tokens_seen": 39452672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990070210631896, + "loss": 4.5687, + "theoretical_loss": 5.473834871092089, + "tokens_seen": 39518208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989969909729188, + "loss": 4.6718, + "theoretical_loss": 5.4724131919645576, + "tokens_seen": 39583744 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498986960882648, + "loss": 4.5519, + "theoretical_loss": 5.470994522479069, + "tokens_seen": 39649280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989769307923771, + "loss": 4.8962, + "theoretical_loss": 5.4695788513054815, + "tokens_seen": 39714816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989669007021063, + "loss": 4.7691, + "theoretical_loss": 5.468166167174912, + "tokens_seen": 39780352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989568706118355, + "loss": 4.8426, + "theoretical_loss": 5.466756458879306, + "tokens_seen": 39845888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989468405215647, + "loss": 4.3942, + "theoretical_loss": 5.465349715271013, + "tokens_seen": 39911424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989368104312939, + "loss": 4.5906, + "theoretical_loss": 5.463945925262355, + "tokens_seen": 39976960 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498926780341023, + "loss": 4.7871, + "theoretical_loss": 5.462545077825214, + "tokens_seen": 40042496 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989167502507523, + "loss": 4.3864, + "theoretical_loss": 5.461147161990611, + "tokens_seen": 40108032 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989067201604814, + "loss": 4.5497, + "theoretical_loss": 5.459752166848292, + "tokens_seen": 40173568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988966900702107, + "loss": 4.45, + "theoretical_loss": 5.458360081546321, + "tokens_seen": 40239104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988866599799398, + "loss": 4.5501, + "theoretical_loss": 5.456970895290674, + "tokens_seen": 40304640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988766298896691, + "loss": 4.6729, + "theoretical_loss": 5.455584597344835, + "tokens_seen": 40370176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988665997993982, + "loss": 4.7367, + "theoretical_loss": 5.454201177029395, + "tokens_seen": 40435712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988565697091274, + "loss": 4.5589, + "theoretical_loss": 5.452820623721662, + "tokens_seen": 40501248 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988465396188566, + "loss": 4.6359, + "theoretical_loss": 5.45144292685526, + "tokens_seen": 40566784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988365095285858, + "loss": 4.5998, + "theoretical_loss": 5.450068075919752, + "tokens_seen": 40632320 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498826479438315, + "loss": 4.6231, + "theoretical_loss": 5.44869606046024, + "tokens_seen": 40697856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988164493480441, + "loss": 4.5891, + "theoretical_loss": 5.447326870076996, + "tokens_seen": 40763392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988064192577733, + "loss": 4.7368, + "theoretical_loss": 5.445960494425072, + "tokens_seen": 40828928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987963891675025, + "loss": 4.4854, + "theoretical_loss": 5.444596923213931, + "tokens_seen": 40894464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 141824, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.5050435066223145, + "objective/train/theoretical_loss": 5.443236146207074, + "objective/train/tokens_used": 61420000, + "theoretical_loss": 5.443236146207074, + "tokens_seen": 40960000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987863590772317, + "loss": 4.6512, + "theoretical_loss": 5.443236146207074, + "tokens_seen": 40960000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987763289869609, + "loss": 4.5578, + "theoretical_loss": 5.441878153221662, + "tokens_seen": 41025536 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049876629889669, + "loss": 4.7668, + "theoretical_loss": 5.440522934128164, + "tokens_seen": 41091072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987562688064192, + "loss": 4.523, + "theoretical_loss": 5.439170478849976, + "tokens_seen": 41156608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987462387161484, + "loss": 4.6254, + "theoretical_loss": 5.437820777363078, + "tokens_seen": 41222144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987362086258777, + "loss": 4.6625, + "theoretical_loss": 5.4364738196956655, + "tokens_seen": 41287680 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987261785356068, + "loss": 4.6117, + "theoretical_loss": 5.435129595927794, + "tokens_seen": 41353216 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987161484453361, + "loss": 4.7588, + "theoretical_loss": 5.433788096191039, + "tokens_seen": 41418752 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987061183550651, + "loss": 4.6583, + "theoretical_loss": 5.432449310668134, + "tokens_seen": 41484288 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986960882647944, + "loss": 4.5594, + "theoretical_loss": 5.4311132295926345, + "tokens_seen": 41549824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986860581745236, + "loss": 4.682, + "theoretical_loss": 5.42977984324857, + "tokens_seen": 41615360 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986760280842528, + "loss": 4.5311, + "theoretical_loss": 5.428449141970107, + "tokens_seen": 41680896 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498665997993982, + "loss": 4.6285, + "theoretical_loss": 5.427121116141212, + "tokens_seen": 41746432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986559679037111, + "loss": 4.4254, + "theoretical_loss": 5.42579575619531, + "tokens_seen": 41811968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986459378134403, + "loss": 4.3521, + "theoretical_loss": 5.424473052614967, + "tokens_seen": 41877504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986359077231695, + "loss": 4.2651, + "theoretical_loss": 5.423152995931552, + "tokens_seen": 41943040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986258776328987, + "loss": 4.4635, + "theoretical_loss": 5.421835576724906, + "tokens_seen": 42008576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986158475426279, + "loss": 4.4402, + "theoretical_loss": 5.420520785623031, + "tokens_seen": 42074112 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498605817452357, + "loss": 4.4318, + "theoretical_loss": 5.4192086133017625, + "tokens_seen": 42139648 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985957873620862, + "loss": 4.6869, + "theoretical_loss": 5.417899050484451, + "tokens_seen": 42205184 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985857572718154, + "loss": 4.5541, + "theoretical_loss": 5.416592087941646, + "tokens_seen": 42270720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985757271815446, + "loss": 4.6914, + "theoretical_loss": 5.415287716490787, + "tokens_seen": 42336256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985656970912738, + "loss": 4.4525, + "theoretical_loss": 5.413985926995892, + "tokens_seen": 42401792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985556670010031, + "loss": 4.5715, + "theoretical_loss": 5.412686710367245, + "tokens_seen": 42467328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985456369107321, + "loss": 4.4437, + "theoretical_loss": 5.411390057561097, + "tokens_seen": 42532864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 143445, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.786459922790527, + "objective/train/theoretical_loss": 5.410095959579362, + "objective/train/tokens_used": 63058400, + "theoretical_loss": 5.410095959579362, + "tokens_seen": 42598400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985356068204614, + "loss": 4.5962, + "theoretical_loss": 5.410095959579362, + "tokens_seen": 42598400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985255767301905, + "loss": 4.64, + "theoretical_loss": 5.408804407469308, + "tokens_seen": 42663936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985155466399198, + "loss": 4.639, + "theoretical_loss": 5.407515392323276, + "tokens_seen": 42729472 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498505516549649, + "loss": 4.5622, + "theoretical_loss": 5.406228905278368, + "tokens_seen": 42795008 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984954864593782, + "loss": 4.7461, + "theoretical_loss": 5.404944937516161, + "tokens_seen": 42860544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984854563691073, + "loss": 4.6478, + "theoretical_loss": 5.403663480262418, + "tokens_seen": 42926080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984754262788365, + "loss": 4.8657, + "theoretical_loss": 5.402384524786797, + "tokens_seen": 42991616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984653961885657, + "loss": 4.6469, + "theoretical_loss": 5.401108062402562, + "tokens_seen": 43057152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984553660982949, + "loss": 4.3658, + "theoretical_loss": 5.399834084466306, + "tokens_seen": 43122688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984453360080241, + "loss": 4.6816, + "theoretical_loss": 5.398562582377666, + "tokens_seen": 43188224 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984353059177532, + "loss": 4.3855, + "theoretical_loss": 5.397293547579041, + "tokens_seen": 43253760 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984252758274825, + "loss": 4.568, + "theoretical_loss": 5.396026971555319, + "tokens_seen": 43319296 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984152457372116, + "loss": 4.4991, + "theoretical_loss": 5.394762845833601, + "tokens_seen": 43384832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984052156469409, + "loss": 4.583, + "theoretical_loss": 5.393501161982926, + "tokens_seen": 43450368 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049839518555667, + "loss": 4.2179, + "theoretical_loss": 5.392241911614005, + "tokens_seen": 43515904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983851554663993, + "loss": 4.5224, + "theoretical_loss": 5.390985086378949, + "tokens_seen": 43581440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983751253761284, + "loss": 4.7938, + "theoretical_loss": 5.389730677971002, + "tokens_seen": 43646976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983650952858576, + "loss": 4.4534, + "theoretical_loss": 5.388478678124285, + "tokens_seen": 43712512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983550651955868, + "loss": 4.2336, + "theoretical_loss": 5.387229078613521, + "tokens_seen": 43778048 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498345035105316, + "loss": 4.6225, + "theoretical_loss": 5.385981871253785, + "tokens_seen": 43843584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983350050150452, + "loss": 4.4481, + "theoretical_loss": 5.384737047900243, + "tokens_seen": 43909120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983249749247743, + "loss": 4.5426, + "theoretical_loss": 5.3834946004478965, + "tokens_seen": 43974656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983149448345035, + "loss": 4.722, + "theoretical_loss": 5.382254520831328, + "tokens_seen": 44040192 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983049147442327, + "loss": 4.4346, + "theoretical_loss": 5.381016801024449, + "tokens_seen": 44105728 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982948846539619, + "loss": 4.5081, + "theoretical_loss": 5.379781433040252, + "tokens_seen": 44171264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 144147, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.279332637786865, + "objective/train/theoretical_loss": 5.378548408930558, + "objective/train/tokens_used": 64696800, + "theoretical_loss": 5.378548408930558, + "tokens_seen": 44236800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982848545636911, + "loss": 4.6137, + "theoretical_loss": 5.378548408930558, + "tokens_seen": 44236800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982748244734202, + "loss": 4.4353, + "theoretical_loss": 5.377317720785777, + "tokens_seen": 44302336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982647943831494, + "loss": 4.4919, + "theoretical_loss": 5.37608936073466, + "tokens_seen": 44367872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982547642928786, + "loss": 4.3873, + "theoretical_loss": 5.374863320944057, + "tokens_seen": 44433408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982447342026079, + "loss": 4.5235, + "theoretical_loss": 5.373639593618675, + "tokens_seen": 44498944 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498234704112337, + "loss": 4.5877, + "theoretical_loss": 5.372418171000847, + "tokens_seen": 44564480 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982246740220663, + "loss": 4.3387, + "theoretical_loss": 5.371199045370283, + "tokens_seen": 44630016 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982146439317953, + "loss": 4.4968, + "theoretical_loss": 5.369982209043851, + "tokens_seen": 44695552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982046138415246, + "loss": 4.5509, + "theoretical_loss": 5.368767654375327, + "tokens_seen": 44761088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981945837512538, + "loss": 4.6583, + "theoretical_loss": 5.367555373755179, + "tokens_seen": 44826624 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498184553660983, + "loss": 4.2686, + "theoretical_loss": 5.366345359610327, + "tokens_seen": 44892160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981745235707122, + "loss": 4.5189, + "theoretical_loss": 5.365137604403923, + "tokens_seen": 44957696 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981644934804413, + "loss": 4.7356, + "theoretical_loss": 5.363932100635117, + "tokens_seen": 45023232 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981544633901705, + "loss": 4.5647, + "theoretical_loss": 5.362728840838843, + "tokens_seen": 45088768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981444332998997, + "loss": 4.4656, + "theoretical_loss": 5.361527817585586, + "tokens_seen": 45154304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981344032096289, + "loss": 4.6026, + "theoretical_loss": 5.360329023481169, + "tokens_seen": 45219840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981243731193581, + "loss": 4.4202, + "theoretical_loss": 5.359132451166534, + "tokens_seen": 45285376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981143430290873, + "loss": 4.5494, + "theoretical_loss": 5.357938093317518, + "tokens_seen": 45350912 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981043129388164, + "loss": 4.3413, + "theoretical_loss": 5.356745942644645, + "tokens_seen": 45416448 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980942828485456, + "loss": 4.2947, + "theoretical_loss": 5.355555991892905, + "tokens_seen": 45481984 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980842527582748, + "loss": 4.7378, + "theoretical_loss": 5.35436823384155, + "tokens_seen": 45547520 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498074222668004, + "loss": 4.5112, + "theoretical_loss": 5.353182661303873, + "tokens_seen": 45613056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980641925777333, + "loss": 4.6471, + "theoretical_loss": 5.35199926712701, + "tokens_seen": 45678592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980541624874623, + "loss": 4.4652, + "theoretical_loss": 5.350818044191721, + "tokens_seen": 45744128 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980441323971916, + "loss": 4.3502, + "theoretical_loss": 5.349638985412193, + "tokens_seen": 45809664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 144842, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.692098617553711, + "objective/train/theoretical_loss": 5.348462083735834, + "objective/train/tokens_used": 66335200, + "theoretical_loss": 5.348462083735834, + "tokens_seen": 45875200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980341023069207, + "loss": 4.4348, + "theoretical_loss": 5.348462083735834, + "tokens_seen": 45875200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049802407221665, + "loss": 4.45, + "theoretical_loss": 5.347287332143064, + "tokens_seen": 45940736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980140421263792, + "loss": 4.4526, + "theoretical_loss": 5.346114723647119, + "tokens_seen": 46006272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980040120361084, + "loss": 4.5448, + "theoretical_loss": 5.344944251293852, + "tokens_seen": 46071808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979939819458375, + "loss": 4.3139, + "theoretical_loss": 5.343775908161532, + "tokens_seen": 46137344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979839518555667, + "loss": 4.5183, + "theoretical_loss": 5.342609687360644, + "tokens_seen": 46202880 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979739217652959, + "loss": 4.2806, + "theoretical_loss": 5.341445582033705, + "tokens_seen": 46268416 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979638916750251, + "loss": 4.6082, + "theoretical_loss": 5.3402835853550545, + "tokens_seen": 46333952 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979538615847543, + "loss": 4.286, + "theoretical_loss": 5.339123690530673, + "tokens_seen": 46399488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979438314944834, + "loss": 4.4594, + "theoretical_loss": 5.337965890797989, + "tokens_seen": 46465024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979338014042126, + "loss": 4.4121, + "theoretical_loss": 5.336810179425685, + "tokens_seen": 46530560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979237713139418, + "loss": 4.3507, + "theoretical_loss": 5.335656549713516, + "tokens_seen": 46596096 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497913741223671, + "loss": 4.4211, + "theoretical_loss": 5.334504994992115, + "tokens_seen": 46661632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979037111334002, + "loss": 4.548, + "theoretical_loss": 5.333355508622814, + "tokens_seen": 46727168 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978936810431293, + "loss": 4.4084, + "theoretical_loss": 5.332208083997459, + "tokens_seen": 46792704 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978836509528586, + "loss": 4.3348, + "theoretical_loss": 5.33106271453822, + "tokens_seen": 46858240 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978736208625877, + "loss": 4.3399, + "theoretical_loss": 5.329919393697422, + "tokens_seen": 46923776 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497863590772317, + "loss": 4.2743, + "theoretical_loss": 5.328778114957351, + "tokens_seen": 46989312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978535606820461, + "loss": 4.3191, + "theoretical_loss": 5.327638871830089, + "tokens_seen": 47054848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978435305917754, + "loss": 4.3063, + "theoretical_loss": 5.326501657857326, + "tokens_seen": 47120384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978335005015045, + "loss": 4.7146, + "theoretical_loss": 5.32536646661019, + "tokens_seen": 47185920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978234704112337, + "loss": 4.5358, + "theoretical_loss": 5.324233291689069, + "tokens_seen": 47251456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978134403209629, + "loss": 4.3223, + "theoretical_loss": 5.323102126723439, + "tokens_seen": 47316992 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978034102306921, + "loss": 4.3505, + "theoretical_loss": 5.321972965371691, + "tokens_seen": 47382528 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977933801404213, + "loss": 4.515, + "theoretical_loss": 5.320845801320959, + "tokens_seen": 47448064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 146139, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.262459754943848, + "objective/train/theoretical_loss": 5.319720628286955, + "objective/train/tokens_used": 67973600, + "theoretical_loss": 5.319720628286955, + "tokens_seen": 47513600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977833500501504, + "loss": 4.315, + "theoretical_loss": 5.319720628286955, + "tokens_seen": 47513600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977733199598796, + "loss": 4.3331, + "theoretical_loss": 5.318597440013795, + "tokens_seen": 47579136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977632898696088, + "loss": 4.458, + "theoretical_loss": 5.317476230273831, + "tokens_seen": 47644672 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497753259779338, + "loss": 4.3781, + "theoretical_loss": 5.316356992867491, + "tokens_seen": 47710208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977432296890672, + "loss": 4.3273, + "theoretical_loss": 5.31523972162311, + "tokens_seen": 47775744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977331995987965, + "loss": 4.613, + "theoretical_loss": 5.314124410396767, + "tokens_seen": 47841280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977231695085255, + "loss": 4.3982, + "theoretical_loss": 5.31301105307212, + "tokens_seen": 47906816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977131394182548, + "loss": 4.3032, + "theoretical_loss": 5.311899643560251, + "tokens_seen": 47972352 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497703109327984, + "loss": 4.449, + "theoretical_loss": 5.310790175799497, + "tokens_seen": 48037888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976930792377132, + "loss": 4.4828, + "theoretical_loss": 5.3096826437553, + "tokens_seen": 48103424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976830491474424, + "loss": 3.9312, + "theoretical_loss": 5.308577041420046, + "tokens_seen": 48168960 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976730190571715, + "loss": 4.3421, + "theoretical_loss": 5.3074733628129005, + "tokens_seen": 48234496 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976629889669007, + "loss": 4.5473, + "theoretical_loss": 5.3063716019796665, + "tokens_seen": 48300032 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976529588766299, + "loss": 4.5363, + "theoretical_loss": 5.305271752992619, + "tokens_seen": 48365568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976429287863591, + "loss": 4.5261, + "theoretical_loss": 5.304173809950358, + "tokens_seen": 48431104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976328986960883, + "loss": 4.2883, + "theoretical_loss": 5.303077766977653, + "tokens_seen": 48496640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976228686058175, + "loss": 4.7128, + "theoretical_loss": 5.3019836182252895, + "tokens_seen": 48562176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976128385155466, + "loss": 4.4607, + "theoretical_loss": 5.300891357869929, + "tokens_seen": 48627712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976028084252758, + "loss": 4.5213, + "theoretical_loss": 5.299800980113945, + "tokens_seen": 48693248 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497592778335005, + "loss": 4.3922, + "theoretical_loss": 5.298712479185288, + "tokens_seen": 48758784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975827482447342, + "loss": 4.3163, + "theoretical_loss": 5.297625849337331, + "tokens_seen": 48824320 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975727181544635, + "loss": 4.2912, + "theoretical_loss": 5.296541084848727, + "tokens_seen": 48889856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975626880641925, + "loss": 4.47, + "theoretical_loss": 5.295458180023262, + "tokens_seen": 48955392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975526579739218, + "loss": 4.3223, + "theoretical_loss": 5.294377129189715, + "tokens_seen": 49020928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975426278836509, + "loss": 4.2547, + "theoretical_loss": 5.293297926701706, + "tokens_seen": 49086464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 146831, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.401537895202637, + "objective/train/theoretical_loss": 5.292220566937567, + "objective/train/tokens_used": 69612000, + "theoretical_loss": 5.292220566937567, + "tokens_seen": 49152000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975325977933802, + "loss": 4.4595, + "theoretical_loss": 5.292220566937567, + "tokens_seen": 49152000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975225677031094, + "loss": 4.2178, + "theoretical_loss": 5.29114504430019, + "tokens_seen": 49217536 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975125376128386, + "loss": 4.2229, + "theoretical_loss": 5.290071353216895, + "tokens_seen": 49283072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975025075225677, + "loss": 4.3304, + "theoretical_loss": 5.288999488139284, + "tokens_seen": 49348608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004974924774322969, + "loss": 4.2719, + "theoretical_loss": 5.28792944354311, + "tokens_seen": 49414144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004974824473420261, + "loss": 4.4284, + "theoretical_loss": 5.286861213928137, + "tokens_seen": 49479680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974724172517553, + "loss": 4.5986, + "theoretical_loss": 5.285794793817999, + "tokens_seen": 49545216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974623871614845, + "loss": 4.2011, + "theoretical_loss": 5.284730177760077, + "tokens_seen": 49610752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974523570712136, + "loss": 4.3674, + "theoretical_loss": 5.283667360325351, + "tokens_seen": 49676288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974423269809428, + "loss": 4.5679, + "theoretical_loss": 5.2826063361082785, + "tokens_seen": 49741824 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497432296890672, + "loss": 4.6443, + "theoretical_loss": 5.281547099726654, + "tokens_seen": 49807360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974222668004012, + "loss": 4.2155, + "theoretical_loss": 5.280489645821483, + "tokens_seen": 49872896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974122367101304, + "loss": 4.2667, + "theoretical_loss": 5.279433969056848, + "tokens_seen": 49938432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974022066198595, + "loss": 4.4801, + "theoretical_loss": 5.278380064119782, + "tokens_seen": 50003968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973921765295888, + "loss": 4.2357, + "theoretical_loss": 5.277327925720137, + "tokens_seen": 50069504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973821464393179, + "loss": 4.7003, + "theoretical_loss": 5.276277548590457, + "tokens_seen": 50135040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973721163490472, + "loss": 4.1875, + "theoretical_loss": 5.275228927485855, + "tokens_seen": 50200576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973620862587763, + "loss": 4.3904, + "theoretical_loss": 5.2741820571838804, + "tokens_seen": 50266112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973520561685056, + "loss": 4.3429, + "theoretical_loss": 5.273136932484399, + "tokens_seen": 50331648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973420260782347, + "loss": 4.6097, + "theoretical_loss": 5.272093548209467, + "tokens_seen": 50397184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973319959879639, + "loss": 4.1553, + "theoretical_loss": 5.271051899203207, + "tokens_seen": 50462720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973219658976931, + "loss": 4.3946, + "theoretical_loss": 5.270011980331685, + "tokens_seen": 50528256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973119358074223, + "loss": 4.1563, + "theoretical_loss": 5.268973786482794, + "tokens_seen": 50593792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973019057171515, + "loss": 4.4526, + "theoretical_loss": 5.267937312566123, + "tokens_seen": 50659328 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972918756268806, + "loss": 4.3346, + "theoretical_loss": 5.266902553512847, + "tokens_seen": 50724864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 148451, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5394937992095947, + "objective/train/theoretical_loss": 5.265869504275602, + "objective/train/tokens_used": 71250400, + "theoretical_loss": 5.265869504275602, + "tokens_seen": 50790400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972818455366098, + "loss": 4.2104, + "theoretical_loss": 5.265869504275602, + "tokens_seen": 50790400 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497271815446339, + "loss": 4.6599, + "theoretical_loss": 5.264838159828369, + "tokens_seen": 50855936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972617853560682, + "loss": 4.5403, + "theoretical_loss": 5.263808515166355, + "tokens_seen": 50921472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972517552657974, + "loss": 3.8993, + "theoretical_loss": 5.262780565305875, + "tokens_seen": 50987008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972417251755266, + "loss": 4.4009, + "theoretical_loss": 5.261754305284241, + "tokens_seen": 51052544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972316950852557, + "loss": 4.4019, + "theoretical_loss": 5.260729730159641, + "tokens_seen": 51118080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972216649949849, + "loss": 4.191, + "theoretical_loss": 5.259706835011027, + "tokens_seen": 51183616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972116349047142, + "loss": 4.3533, + "theoretical_loss": 5.2586856149380035, + "tokens_seen": 51249152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972016048144433, + "loss": 4.1959, + "theoretical_loss": 5.257666065060709, + "tokens_seen": 51314688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971915747241726, + "loss": 4.5251, + "theoretical_loss": 5.256648180519708, + "tokens_seen": 51380224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971815446339017, + "loss": 4.348, + "theoretical_loss": 5.255631956475881, + "tokens_seen": 51445760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971715145436309, + "loss": 4.4193, + "theoretical_loss": 5.25461738811031, + "tokens_seen": 51511296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971614844533601, + "loss": 4.2312, + "theoretical_loss": 5.25360447062417, + "tokens_seen": 51576832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971514543630893, + "loss": 4.6115, + "theoretical_loss": 5.252593199238619, + "tokens_seen": 51642368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971414242728185, + "loss": 4.2891, + "theoretical_loss": 5.2515835691946915, + "tokens_seen": 51707904 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971313941825477, + "loss": 4.0652, + "theoretical_loss": 5.2505755757531904, + "tokens_seen": 51773440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971213640922768, + "loss": 4.5308, + "theoretical_loss": 5.24956921419458, + "tokens_seen": 51838976 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497111334002006, + "loss": 4.2823, + "theoretical_loss": 5.248564479818876, + "tokens_seen": 51904512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971013039117352, + "loss": 4.3883, + "theoretical_loss": 5.247561367945544, + "tokens_seen": 51970048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970912738214644, + "loss": 4.3249, + "theoretical_loss": 5.246559873913396, + "tokens_seen": 52035584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970812437311936, + "loss": 4.3358, + "theoretical_loss": 5.245559993080484, + "tokens_seen": 52101120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970712136409227, + "loss": 4.0621, + "theoretical_loss": 5.24456172082399, + "tokens_seen": 52166656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970611835506519, + "loss": 4.5677, + "theoretical_loss": 5.243565052540136, + "tokens_seen": 52232192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970511534603811, + "loss": 4.4829, + "theoretical_loss": 5.242569983644074, + "tokens_seen": 52297728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970411233701103, + "loss": 4.1669, + "theoretical_loss": 5.241576509569784, + "tokens_seen": 52363264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 149172, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.183032989501953, + "objective/train/theoretical_loss": 5.240584625769978, + "objective/train/tokens_used": 72888800, + "theoretical_loss": 5.240584625769978, + "tokens_seen": 52428800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970310932798396, + "loss": 4.3346, + "theoretical_loss": 5.240584625769978, + "tokens_seen": 52428800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970210631895686, + "loss": 4.3921, + "theoretical_loss": 5.239594327715992, + "tokens_seen": 52494336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970110330992979, + "loss": 4.4333, + "theoretical_loss": 5.238605610897698, + "tokens_seen": 52559872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970010030090271, + "loss": 4.2883, + "theoretical_loss": 5.237618470823394, + "tokens_seen": 52625408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969909729187563, + "loss": 4.1672, + "theoretical_loss": 5.2366329030197125, + "tokens_seen": 52690944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969809428284855, + "loss": 4.335, + "theoretical_loss": 5.235648903031521, + "tokens_seen": 52756480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969709127382147, + "loss": 4.1939, + "theoretical_loss": 5.2346664664218245, + "tokens_seen": 52822016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969608826479438, + "loss": 4.2423, + "theoretical_loss": 5.233685588771669, + "tokens_seen": 52887552 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496950852557673, + "loss": 4.1699, + "theoretical_loss": 5.232706265680049, + "tokens_seen": 52953088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969408224674022, + "loss": 4.3159, + "theoretical_loss": 5.231728492763811, + "tokens_seen": 53018624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969307923771314, + "loss": 3.9781, + "theoretical_loss": 5.230752265657554, + "tokens_seen": 53084160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969207622868606, + "loss": 4.0656, + "theoretical_loss": 5.229777580013545, + "tokens_seen": 53149696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969107321965897, + "loss": 4.6733, + "theoretical_loss": 5.228804431501619, + "tokens_seen": 53215232 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496900702106319, + "loss": 4.4798, + "theoretical_loss": 5.227832815809087, + "tokens_seen": 53280768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968906720160481, + "loss": 4.4063, + "theoretical_loss": 5.226862728640651, + "tokens_seen": 53346304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968806419257774, + "loss": 4.3503, + "theoretical_loss": 5.2258941657183, + "tokens_seen": 53411840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968706118355065, + "loss": 4.346, + "theoretical_loss": 5.2249271227812315, + "tokens_seen": 53477376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968605817452358, + "loss": 4.3345, + "theoretical_loss": 5.223961595585755, + "tokens_seen": 53542912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968505516549649, + "loss": 4.2838, + "theoretical_loss": 5.222997579905204, + "tokens_seen": 53608448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968405215646941, + "loss": 3.9395, + "theoretical_loss": 5.222035071529845, + "tokens_seen": 53673984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968304914744233, + "loss": 4.2813, + "theoretical_loss": 5.2210740662667945, + "tokens_seen": 53739520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968204613841525, + "loss": 4.2565, + "theoretical_loss": 5.220114559939923, + "tokens_seen": 53805056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968104312938817, + "loss": 4.2532, + "theoretical_loss": 5.219156548389775, + "tokens_seen": 53870592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968004012036108, + "loss": 4.1641, + "theoretical_loss": 5.218200027473481, + "tokens_seen": 53936128 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049679037111334, + "loss": 4.2325, + "theoretical_loss": 5.217244993064664, + "tokens_seen": 54001664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 150441, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.059125900268555, + "objective/train/theoretical_loss": 5.216291441053366, + "objective/train/tokens_used": 74527200, + "theoretical_loss": 5.216291441053366, + "tokens_seen": 54067200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967803410230692, + "loss": 4.3793, + "theoretical_loss": 5.216291441053366, + "tokens_seen": 54067200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967703109327984, + "loss": 4.3139, + "theoretical_loss": 5.215339367345955, + "tokens_seen": 54132736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967602808425276, + "loss": 4.4466, + "theoretical_loss": 5.214388767865036, + "tokens_seen": 54198272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967502507522568, + "loss": 4.2178, + "theoretical_loss": 5.2134396385493815, + "tokens_seen": 54263808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967402206619859, + "loss": 4.3888, + "theoretical_loss": 5.212491975353835, + "tokens_seen": 54329344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967301905717151, + "loss": 4.4649, + "theoretical_loss": 5.211545774249233, + "tokens_seen": 54394880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967201604814444, + "loss": 4.5714, + "theoretical_loss": 5.210601031222324, + "tokens_seen": 54460416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967101303911735, + "loss": 4.3355, + "theoretical_loss": 5.209657742275683, + "tokens_seen": 54525952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967001003009028, + "loss": 4.2998, + "theoretical_loss": 5.208715903427631, + "tokens_seen": 54591488 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496690070210632, + "loss": 4.4595, + "theoretical_loss": 5.207775510712159, + "tokens_seen": 54657024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966800401203611, + "loss": 4.4266, + "theoretical_loss": 5.2068365601788384, + "tokens_seen": 54722560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966700100300903, + "loss": 4.3538, + "theoretical_loss": 5.205899047892753, + "tokens_seen": 54788096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966599799398195, + "loss": 4.2137, + "theoretical_loss": 5.2049629699344075, + "tokens_seen": 54853632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966499498495487, + "loss": 4.4532, + "theoretical_loss": 5.204028322399658, + "tokens_seen": 54919168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966399197592779, + "loss": 4.365, + "theoretical_loss": 5.203095101399628, + "tokens_seen": 54984704 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496629889669007, + "loss": 4.4775, + "theoretical_loss": 5.202163303060633, + "tokens_seen": 55050240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966198595787362, + "loss": 4.6992, + "theoretical_loss": 5.201232923524104, + "tokens_seen": 55115776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966098294884654, + "loss": 4.293, + "theoretical_loss": 5.20030395894651, + "tokens_seen": 55181312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965997993981946, + "loss": 4.3133, + "theoretical_loss": 5.199376405499277, + "tokens_seen": 55246848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965897693079238, + "loss": 4.4571, + "theoretical_loss": 5.198450259368721, + "tokens_seen": 55312384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965797392176529, + "loss": 4.3994, + "theoretical_loss": 5.197525516755965, + "tokens_seen": 55377920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965697091273821, + "loss": 3.9741, + "theoretical_loss": 5.196602173876867, + "tokens_seen": 55443456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965596790371113, + "loss": 4.0186, + "theoretical_loss": 5.195680226961947, + "tokens_seen": 55508992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965496489468405, + "loss": 4.2808, + "theoretical_loss": 5.194759672256309, + "tokens_seen": 55574528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965396188565698, + "loss": 4.4242, + "theoretical_loss": 5.19384050601957, + "tokens_seen": 55640064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 151052, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9295265674591064, + "objective/train/theoretical_loss": 5.192922724525789, + "objective/train/tokens_used": 76165600, + "theoretical_loss": 5.192922724525789, + "tokens_seen": 55705600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965295887662988, + "loss": 4.1666, + "theoretical_loss": 5.192922724525789, + "tokens_seen": 55705600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965195586760281, + "loss": 4.4752, + "theoretical_loss": 5.19200632406339, + "tokens_seen": 55771136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965095285857573, + "loss": 4.162, + "theoretical_loss": 5.19109130093509, + "tokens_seen": 55836672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964994984954865, + "loss": 4.268, + "theoretical_loss": 5.190177651457833, + "tokens_seen": 55902208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964894684052157, + "loss": 4.2668, + "theoretical_loss": 5.189265371962712, + "tokens_seen": 55967744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964794383149449, + "loss": 4.1351, + "theoretical_loss": 5.188354458794902, + "tokens_seen": 56033280 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496469408224674, + "loss": 4.3194, + "theoretical_loss": 5.187444908313586, + "tokens_seen": 56098816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964593781344032, + "loss": 4.3667, + "theoretical_loss": 5.186536716891892, + "tokens_seen": 56164352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964493480441324, + "loss": 4.3239, + "theoretical_loss": 5.185629880916814, + "tokens_seen": 56229888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964393179538616, + "loss": 4.1802, + "theoretical_loss": 5.18472439678915, + "tokens_seen": 56295424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964292878635908, + "loss": 4.1018, + "theoretical_loss": 5.18382026092343, + "tokens_seen": 56360960 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049641925777332, + "loss": 4.2962, + "theoretical_loss": 5.182917469747851, + "tokens_seen": 56426496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964092276830491, + "loss": 3.9407, + "theoretical_loss": 5.182016019704204, + "tokens_seen": 56492032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963991975927783, + "loss": 4.468, + "theoretical_loss": 5.1811159072478095, + "tokens_seen": 56557568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963891675025075, + "loss": 4.2165, + "theoretical_loss": 5.180217128847451, + "tokens_seen": 56623104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963791374122367, + "loss": 4.3633, + "theoretical_loss": 5.17931968098531, + "tokens_seen": 56688640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963691073219659, + "loss": 4.2837, + "theoretical_loss": 5.178423560156894, + "tokens_seen": 56754176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963590772316951, + "loss": 4.5775, + "theoretical_loss": 5.177528762870973, + "tokens_seen": 56819712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963490471414242, + "loss": 4.2386, + "theoretical_loss": 5.176635285649521, + "tokens_seen": 56885248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963390170511535, + "loss": 4.4926, + "theoretical_loss": 5.175743125027638, + "tokens_seen": 56950784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963289869608827, + "loss": 4.2936, + "theoretical_loss": 5.174852277553498, + "tokens_seen": 57016320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963189568706119, + "loss": 4.3052, + "theoretical_loss": 5.173962739788276, + "tokens_seen": 57081856 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496308926780341, + "loss": 4.2385, + "theoretical_loss": 5.17307450830609, + "tokens_seen": 57147392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962988966900702, + "loss": 4.0804, + "theoretical_loss": 5.172187579693933, + "tokens_seen": 57212928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962888665997994, + "loss": 4.2481, + "theoretical_loss": 5.1713019505516105, + "tokens_seen": 57278464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 152615, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.15744161605835, + "objective/train/theoretical_loss": 5.170417617491682, + "objective/train/tokens_used": 77804000, + "theoretical_loss": 5.170417617491682, + "tokens_seen": 57344000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962788365095286, + "loss": 4.1901, + "theoretical_loss": 5.170417617491682, + "tokens_seen": 57344000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962688064192578, + "loss": 4.4107, + "theoretical_loss": 5.169534577139395, + "tokens_seen": 57409536 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496258776328987, + "loss": 4.4229, + "theoretical_loss": 5.168652826132623, + "tokens_seen": 57475072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962487462387161, + "loss": 4.2645, + "theoretical_loss": 5.167772361121805, + "tokens_seen": 57540608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962387161484453, + "loss": 4.0974, + "theoretical_loss": 5.166893178769884, + "tokens_seen": 57606144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962286860581746, + "loss": 4.5159, + "theoretical_loss": 5.1660152757522475, + "tokens_seen": 57671680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962186559679037, + "loss": 4.3628, + "theoretical_loss": 5.165138648756665, + "tokens_seen": 57737216 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496208625877633, + "loss": 3.9702, + "theoretical_loss": 5.164263294483226, + "tokens_seen": 57802752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961985957873621, + "loss": 4.1025, + "theoretical_loss": 5.163389209644287, + "tokens_seen": 57868288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961885656970913, + "loss": 4.0294, + "theoretical_loss": 5.162516390964408, + "tokens_seen": 57933824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961785356068205, + "loss": 4.155, + "theoretical_loss": 5.1616448351802875, + "tokens_seen": 57999360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961685055165497, + "loss": 4.1206, + "theoretical_loss": 5.160774539040716, + "tokens_seen": 58064896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961584754262789, + "loss": 4.2619, + "theoretical_loss": 5.159905499306511, + "tokens_seen": 58130432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961484453360081, + "loss": 4.2084, + "theoretical_loss": 5.159037712750455, + "tokens_seen": 58195968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961384152457372, + "loss": 4.3638, + "theoretical_loss": 5.158171176157245, + "tokens_seen": 58261504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961283851554664, + "loss": 4.4218, + "theoretical_loss": 5.157305886323435, + "tokens_seen": 58327040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961183550651956, + "loss": 4.4587, + "theoretical_loss": 5.156441840057371, + "tokens_seen": 58392576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961083249749248, + "loss": 4.1511, + "theoretical_loss": 5.155579034179144, + "tokens_seen": 58458112 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496098294884654, + "loss": 3.9374, + "theoretical_loss": 5.15471746552053, + "tokens_seen": 58523648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960882647943831, + "loss": 4.1491, + "theoretical_loss": 5.153857130924929, + "tokens_seen": 58589184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960782347041123, + "loss": 4.3473, + "theoretical_loss": 5.1529980272473175, + "tokens_seen": 58654720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960682046138415, + "loss": 4.2407, + "theoretical_loss": 5.152140151354191, + "tokens_seen": 58720256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960581745235707, + "loss": 4.2574, + "theoretical_loss": 5.151283500123505, + "tokens_seen": 58785792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960481444333, + "loss": 4.196, + "theoretical_loss": 5.150428070444621, + "tokens_seen": 58851328 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496038114343029, + "loss": 4.1719, + "theoretical_loss": 5.149573859218261, + "tokens_seen": 58916864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 153245, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.41132926940918, + "objective/train/theoretical_loss": 5.1487208633564405, + "objective/train/tokens_used": 79442400, + "theoretical_loss": 5.1487208633564405, + "tokens_seen": 58982400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960280842527583, + "loss": 4.2262, + "theoretical_loss": 5.1487208633564405, + "tokens_seen": 58982400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960180541624875, + "loss": 4.3165, + "theoretical_loss": 5.147869079782423, + "tokens_seen": 59047936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960080240722167, + "loss": 4.1407, + "theoretical_loss": 5.147018505430666, + "tokens_seen": 59113472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959979939819459, + "loss": 4.2968, + "theoretical_loss": 5.146169137246765, + "tokens_seen": 59179008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959879638916751, + "loss": 4.3721, + "theoretical_loss": 5.145320972187402, + "tokens_seen": 59244544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959779338014042, + "loss": 3.9537, + "theoretical_loss": 5.144474007220293, + "tokens_seen": 59310080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959679037111334, + "loss": 4.0455, + "theoretical_loss": 5.143628239324139, + "tokens_seen": 59375616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959578736208626, + "loss": 4.1954, + "theoretical_loss": 5.142783665488567, + "tokens_seen": 59441152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959478435305918, + "loss": 4.4969, + "theoretical_loss": 5.1419402827140885, + "tokens_seen": 59506688 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495937813440321, + "loss": 4.0657, + "theoretical_loss": 5.141098088012036, + "tokens_seen": 59572224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959277833500501, + "loss": 4.0912, + "theoretical_loss": 5.140257078404524, + "tokens_seen": 59637760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959177532597793, + "loss": 4.2994, + "theoretical_loss": 5.13941725092439, + "tokens_seen": 59703296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959077231695085, + "loss": 4.0762, + "theoretical_loss": 5.138578602615146, + "tokens_seen": 59768832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958976930792377, + "loss": 4.3081, + "theoretical_loss": 5.137741130530934, + "tokens_seen": 59834368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958876629889669, + "loss": 4.1723, + "theoretical_loss": 5.1369048317364685, + "tokens_seen": 59899904 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495877632898696, + "loss": 4.1926, + "theoretical_loss": 5.13606970330699, + "tokens_seen": 59965440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958676028084253, + "loss": 4.3296, + "theoretical_loss": 5.135235742328217, + "tokens_seen": 60030976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958575727181544, + "loss": 4.2648, + "theoretical_loss": 5.134402945896297, + "tokens_seen": 60096512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958475426278837, + "loss": 4.0622, + "theoretical_loss": 5.133571311117755, + "tokens_seen": 60162048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958375125376129, + "loss": 4.1772, + "theoretical_loss": 5.132740835109448, + "tokens_seen": 60227584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958274824473421, + "loss": 3.9454, + "theoretical_loss": 5.131911514998518, + "tokens_seen": 60293120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958174523570712, + "loss": 4.3073, + "theoretical_loss": 5.131083347922338, + "tokens_seen": 60358656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958074222668004, + "loss": 4.0057, + "theoretical_loss": 5.130256331028474, + "tokens_seen": 60424192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957973921765296, + "loss": 4.3214, + "theoretical_loss": 5.129430461474628, + "tokens_seen": 60489728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957873620862588, + "loss": 4.1685, + "theoretical_loss": 5.128605736428597, + "tokens_seen": 60555264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 154602, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.488969802856445, + "objective/train/theoretical_loss": 5.127782153068225, + "objective/train/tokens_used": 81080800, + "theoretical_loss": 5.127782153068225, + "tokens_seen": 60620800 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495777331995988, + "loss": 4.1876, + "theoretical_loss": 5.127782153068225, + "tokens_seen": 60620800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957673019057172, + "loss": 4.1921, + "theoretical_loss": 5.126959708581356, + "tokens_seen": 60686336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957572718154463, + "loss": 4.2615, + "theoretical_loss": 5.1261384001657895, + "tokens_seen": 60751872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957472417251755, + "loss": 4.156, + "theoretical_loss": 5.125318225029231, + "tokens_seen": 60817408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957372116349047, + "loss": 4.1728, + "theoretical_loss": 5.124499180389249, + "tokens_seen": 60882944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957271815446339, + "loss": 4.2523, + "theoretical_loss": 5.12368126347323, + "tokens_seen": 60948480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957171514543631, + "loss": 4.1074, + "theoretical_loss": 5.122864471518334, + "tokens_seen": 61014016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957071213640923, + "loss": 4.2443, + "theoretical_loss": 5.122048801771443, + "tokens_seen": 61079552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956970912738214, + "loss": 3.8773, + "theoretical_loss": 5.121234251489128, + "tokens_seen": 61145088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956870611835507, + "loss": 4.0361, + "theoretical_loss": 5.120420817937591, + "tokens_seen": 61210624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956770310932798, + "loss": 3.9077, + "theoretical_loss": 5.119608498392633, + "tokens_seen": 61276160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956670010030091, + "loss": 3.9712, + "theoretical_loss": 5.118797290139605, + "tokens_seen": 61341696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956569709127383, + "loss": 4.1847, + "theoretical_loss": 5.117987190473361, + "tokens_seen": 61407232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956469408224674, + "loss": 3.9935, + "theoretical_loss": 5.1171781966982195, + "tokens_seen": 61472768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956369107321966, + "loss": 3.932, + "theoretical_loss": 5.116370306127921, + "tokens_seen": 61538304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956268806419258, + "loss": 4.3194, + "theoretical_loss": 5.11556351608558, + "tokens_seen": 61603840 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495616850551655, + "loss": 4.322, + "theoretical_loss": 5.114757823903647, + "tokens_seen": 61669376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956068204613842, + "loss": 4.0708, + "theoretical_loss": 5.113953226923864, + "tokens_seen": 61734912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955967903711133, + "loss": 4.3149, + "theoretical_loss": 5.113149722497221, + "tokens_seen": 61800448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955867602808425, + "loss": 4.3968, + "theoretical_loss": 5.112347307983919, + "tokens_seen": 61865984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955767301905717, + "loss": 4.1143, + "theoretical_loss": 5.111545980753322, + "tokens_seen": 61931520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955667001003009, + "loss": 3.9537, + "theoretical_loss": 5.110745738183919, + "tokens_seen": 61997056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955566700100301, + "loss": 4.1058, + "theoretical_loss": 5.109946577663284, + "tokens_seen": 62062592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955466399197592, + "loss": 4.3783, + "theoretical_loss": 5.109148496588032, + "tokens_seen": 62128128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955366098294884, + "loss": 4.1335, + "theoretical_loss": 5.108351492363779, + "tokens_seen": 62193664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 155165, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9781312942504883, + "objective/train/theoretical_loss": 5.107555562405102, + "objective/train/tokens_used": 82719200, + "theoretical_loss": 5.107555562405102, + "tokens_seen": 62259200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955265797392177, + "loss": 4.1683, + "theoretical_loss": 5.107555562405102, + "tokens_seen": 62259200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955165496489468, + "loss": 4.3202, + "theoretical_loss": 5.106760704135499, + "tokens_seen": 62324736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955065195586761, + "loss": 4.1865, + "theoretical_loss": 5.105966914987349, + "tokens_seen": 62390272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954964894684052, + "loss": 4.0606, + "theoretical_loss": 5.1051741924018685, + "tokens_seen": 62455808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954864593781344, + "loss": 4.1826, + "theoretical_loss": 5.10438253382908, + "tokens_seen": 62521344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954764292878636, + "loss": 4.123, + "theoretical_loss": 5.103591936727762, + "tokens_seen": 62586880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954663991975928, + "loss": 3.9976, + "theoretical_loss": 5.102802398565418, + "tokens_seen": 62652416 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495456369107322, + "loss": 4.1302, + "theoretical_loss": 5.102013916818235, + "tokens_seen": 62717952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954463390170512, + "loss": 4.1543, + "theoretical_loss": 5.101226488971042, + "tokens_seen": 62783488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954363089267803, + "loss": 3.8957, + "theoretical_loss": 5.100440112517276, + "tokens_seen": 62849024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954262788365095, + "loss": 4.1473, + "theoretical_loss": 5.09965478495894, + "tokens_seen": 62914560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954162487462387, + "loss": 4.3325, + "theoretical_loss": 5.098870503806567, + "tokens_seen": 62980096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954062186559679, + "loss": 4.268, + "theoretical_loss": 5.09808726657918, + "tokens_seen": 63045632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953961885656971, + "loss": 4.096, + "theoretical_loss": 5.097305070804255, + "tokens_seen": 63111168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953861584754263, + "loss": 4.0601, + "theoretical_loss": 5.096523914017688, + "tokens_seen": 63176704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953761283851555, + "loss": 3.6987, + "theoretical_loss": 5.095743793763747, + "tokens_seen": 63242240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953660982948846, + "loss": 4.1201, + "theoretical_loss": 5.094964707595047, + "tokens_seen": 63307776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953560682046139, + "loss": 3.6626, + "theoretical_loss": 5.094186653072505, + "tokens_seen": 63373312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953460381143431, + "loss": 4.0223, + "theoretical_loss": 5.093409627765306, + "tokens_seen": 63438848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953360080240723, + "loss": 4.2156, + "theoretical_loss": 5.092633629250866, + "tokens_seen": 63504384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953259779338014, + "loss": 3.9654, + "theoretical_loss": 5.091858655114796, + "tokens_seen": 63569920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953159478435306, + "loss": 4.0676, + "theoretical_loss": 5.091084702950868, + "tokens_seen": 63635456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953059177532598, + "loss": 4.0842, + "theoretical_loss": 5.090311770360971, + "tokens_seen": 63700992 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495295887662989, + "loss": 4.2643, + "theoretical_loss": 5.089539854955088, + "tokens_seen": 63766528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952858575727182, + "loss": 3.9339, + "theoretical_loss": 5.088768954351249, + "tokens_seen": 63832064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 156647, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6908023357391357, + "objective/train/theoretical_loss": 5.087999066175502, + "objective/train/tokens_used": 84357600, + "theoretical_loss": 5.087999066175502, + "tokens_seen": 63897600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952758274824474, + "loss": 3.8197, + "theoretical_loss": 5.087999066175502, + "tokens_seen": 63897600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952657973921765, + "loss": 4.0257, + "theoretical_loss": 5.0872301880618735, + "tokens_seen": 63963136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952557673019057, + "loss": 4.062, + "theoretical_loss": 5.086462317652341, + "tokens_seen": 64028672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952457372116349, + "loss": 4.1515, + "theoretical_loss": 5.085695452596788, + "tokens_seen": 64094208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952357071213641, + "loss": 4.1296, + "theoretical_loss": 5.084929590552976, + "tokens_seen": 64159744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952256770310933, + "loss": 3.8944, + "theoretical_loss": 5.0841647291865115, + "tokens_seen": 64225280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952156469408225, + "loss": 4.2949, + "theoretical_loss": 5.083400866170806, + "tokens_seen": 64290816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952056168505516, + "loss": 3.9266, + "theoretical_loss": 5.082637999187046, + "tokens_seen": 64356352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951955867602809, + "loss": 4.0743, + "theoretical_loss": 5.081876125924159, + "tokens_seen": 64421888 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049518555667001, + "loss": 3.9946, + "theoretical_loss": 5.0811152440787755, + "tokens_seen": 64487424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951755265797393, + "loss": 4.1916, + "theoretical_loss": 5.0803553513552036, + "tokens_seen": 64552960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951654964894685, + "loss": 4.2992, + "theoretical_loss": 5.079596445465386, + "tokens_seen": 64618496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951554663991976, + "loss": 3.9929, + "theoretical_loss": 5.078838524128878, + "tokens_seen": 64684032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951454363089268, + "loss": 3.7549, + "theoretical_loss": 5.078081585072802, + "tokens_seen": 64749568 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495135406218656, + "loss": 4.0781, + "theoretical_loss": 5.077325626031826, + "tokens_seen": 64815104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951253761283852, + "loss": 3.9946, + "theoretical_loss": 5.076570644748123, + "tokens_seen": 64880640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951153460381144, + "loss": 3.966, + "theoretical_loss": 5.075816638971341, + "tokens_seen": 64946176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951053159478435, + "loss": 4.3961, + "theoretical_loss": 5.075063606458576, + "tokens_seen": 65011712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950952858575727, + "loss": 4.1845, + "theoretical_loss": 5.074311544974331, + "tokens_seen": 65077248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950852557673019, + "loss": 4.0004, + "theoretical_loss": 5.07356045229049, + "tokens_seen": 65142784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950752256770311, + "loss": 3.7156, + "theoretical_loss": 5.072810326186285, + "tokens_seen": 65208320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950651955867603, + "loss": 4.0024, + "theoretical_loss": 5.072061164448261, + "tokens_seen": 65273856 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950551654964894, + "loss": 4.0715, + "theoretical_loss": 5.071312964870252, + "tokens_seen": 65339392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950451354062186, + "loss": 4.07, + "theoretical_loss": 5.070565725253344, + "tokens_seen": 65404928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950351053159479, + "loss": 4.1316, + "theoretical_loss": 5.069819443405842, + "tokens_seen": 65470464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 157489, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.333700180053711, + "objective/train/theoretical_loss": 5.069074117143246, + "objective/train/tokens_used": 85996000, + "theoretical_loss": 5.069074117143246, + "tokens_seen": 65536000 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495025075225677, + "loss": 3.9793, + "theoretical_loss": 5.069074117143246, + "tokens_seen": 65536000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950150451354063, + "loss": 3.8486, + "theoretical_loss": 5.068329744288216, + "tokens_seen": 65601536 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950050150451354, + "loss": 4.252, + "theoretical_loss": 5.067586322670541, + "tokens_seen": 65667072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949949849548646, + "loss": 4.079, + "theoretical_loss": 5.0668438501271105, + "tokens_seen": 65732608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949849548645938, + "loss": 4.09, + "theoretical_loss": 5.066102324501883, + "tokens_seen": 65798144 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494974924774323, + "loss": 4.1249, + "theoretical_loss": 5.065361743645855, + "tokens_seen": 65863680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949648946840522, + "loss": 3.8237, + "theoretical_loss": 5.064622105417033, + "tokens_seen": 65929216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949548645937814, + "loss": 4.0333, + "theoretical_loss": 5.063883407680405, + "tokens_seen": 65994752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949448345035105, + "loss": 4.0507, + "theoretical_loss": 5.063145648307904, + "tokens_seen": 66060288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949348044132397, + "loss": 3.5886, + "theoretical_loss": 5.062408825178388, + "tokens_seen": 66125824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949247743229689, + "loss": 3.9934, + "theoretical_loss": 5.061672936177604, + "tokens_seen": 66191360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949147442326981, + "loss": 3.8668, + "theoretical_loss": 5.06093797919816, + "tokens_seen": 66256896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949047141424273, + "loss": 4.1734, + "theoretical_loss": 5.060203952139497, + "tokens_seen": 66322432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948946840521565, + "loss": 3.8566, + "theoretical_loss": 5.059470852907861, + "tokens_seen": 66387968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948846539618856, + "loss": 3.9639, + "theoretical_loss": 5.0587386794162725, + "tokens_seen": 66453504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948746238716148, + "loss": 3.9601, + "theoretical_loss": 5.058007429584498, + "tokens_seen": 66519040 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494864593781344, + "loss": 3.7703, + "theoretical_loss": 5.057277101339023, + "tokens_seen": 66584576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948545636910733, + "loss": 4.193, + "theoretical_loss": 5.056547692613021, + "tokens_seen": 66650112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948445336008024, + "loss": 3.8359, + "theoretical_loss": 5.055819201346331, + "tokens_seen": 66715648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948345035105316, + "loss": 4.019, + "theoretical_loss": 5.055091625485421, + "tokens_seen": 66781184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948244734202607, + "loss": 3.6005, + "theoretical_loss": 5.054364962983367, + "tokens_seen": 66846720 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049481444332999, + "loss": 4.065, + "theoretical_loss": 5.053639211799824, + "tokens_seen": 66912256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948044132397192, + "loss": 4.0457, + "theoretical_loss": 5.052914369900997, + "tokens_seen": 66977792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947943831494484, + "loss": 3.8273, + "theoretical_loss": 5.052190435259614, + "tokens_seen": 67043328 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947843530591776, + "loss": 3.9551, + "theoretical_loss": 5.051467405854897, + "tokens_seen": 67108864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 158845, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.314136505126953, + "objective/train/theoretical_loss": 5.05074527967254, + "objective/train/tokens_used": 87634400, + "theoretical_loss": 5.05074527967254, + "tokens_seen": 67174400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947743229689067, + "loss": 3.8036, + "theoretical_loss": 5.05074527967254, + "tokens_seen": 67174400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947642928786359, + "loss": 4.2425, + "theoretical_loss": 5.050024054704677, + "tokens_seen": 67239936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947542627883651, + "loss": 3.6902, + "theoretical_loss": 5.049303728949859, + "tokens_seen": 67305472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947442326980943, + "loss": 4.0812, + "theoretical_loss": 5.048584300413019, + "tokens_seen": 67371008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947342026078235, + "loss": 3.9309, + "theoretical_loss": 5.04786576710546, + "tokens_seen": 67436544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947241725175527, + "loss": 4.0231, + "theoretical_loss": 5.0471481270448155, + "tokens_seen": 67502080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947141424272818, + "loss": 3.7707, + "theoretical_loss": 5.046431378255027, + "tokens_seen": 67567616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947041123370111, + "loss": 4.1525, + "theoretical_loss": 5.045715518766322, + "tokens_seen": 67633152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946940822467402, + "loss": 3.7804, + "theoretical_loss": 5.0450005466151815, + "tokens_seen": 67698688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946840521564695, + "loss": 3.9759, + "theoretical_loss": 5.044286459844319, + "tokens_seen": 67764224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946740220661987, + "loss": 4.0479, + "theoretical_loss": 5.043573256502652, + "tokens_seen": 67829760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946639919759278, + "loss": 4.1937, + "theoretical_loss": 5.0428609346452795, + "tokens_seen": 67895296 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494653961885657, + "loss": 4.1032, + "theoretical_loss": 5.042149492333452, + "tokens_seen": 67960832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946439317953862, + "loss": 3.7257, + "theoretical_loss": 5.041438927634549, + "tokens_seen": 68026368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946339017051154, + "loss": 4.3465, + "theoretical_loss": 5.040729238622053, + "tokens_seen": 68091904 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946238716148446, + "loss": 3.7946, + "theoretical_loss": 5.040020423375525, + "tokens_seen": 68157440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946138415245737, + "loss": 3.8581, + "theoretical_loss": 5.039312479980579, + "tokens_seen": 68222976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946038114343029, + "loss": 3.9283, + "theoretical_loss": 5.038605406528857, + "tokens_seen": 68288512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945937813440321, + "loss": 3.9674, + "theoretical_loss": 5.037899201118005, + "tokens_seen": 68354048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945837512537613, + "loss": 3.9114, + "theoretical_loss": 5.037193861851646, + "tokens_seen": 68419584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945737211634905, + "loss": 4.0903, + "theoretical_loss": 5.03648938683936, + "tokens_seen": 68485120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945636910732196, + "loss": 4.0617, + "theoretical_loss": 5.035785774196654, + "tokens_seen": 68550656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945536609829488, + "loss": 3.9602, + "theoretical_loss": 5.035083022044944, + "tokens_seen": 68616192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945436308926781, + "loss": 3.7551, + "theoretical_loss": 5.034381128511525, + "tokens_seen": 68681728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945336008024072, + "loss": 3.728, + "theoretical_loss": 5.0336800917295506, + "tokens_seen": 68747264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 159466, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.931746244430542, + "objective/train/theoretical_loss": 5.032979909838007, + "objective/train/tokens_used": 89272800, + "theoretical_loss": 5.032979909838007, + "tokens_seen": 68812800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945235707121365, + "loss": 4.1672, + "theoretical_loss": 5.032979909838007, + "tokens_seen": 68812800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945135406218656, + "loss": 3.7009, + "theoretical_loss": 5.032280580981691, + "tokens_seen": 68878336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945035105315948, + "loss": 3.9125, + "theoretical_loss": 5.031582103311187, + "tokens_seen": 68943872 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494493480441324, + "loss": 3.8562, + "theoretical_loss": 5.030884474982842, + "tokens_seen": 69009408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944834503510532, + "loss": 4.0789, + "theoretical_loss": 5.030187694158739, + "tokens_seen": 69074944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944734202607824, + "loss": 3.6415, + "theoretical_loss": 5.02949175900668, + "tokens_seen": 69140480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944633901705116, + "loss": 3.9927, + "theoretical_loss": 5.028796667700159, + "tokens_seen": 69206016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944533600802407, + "loss": 3.6965, + "theoretical_loss": 5.0281024184183405, + "tokens_seen": 69271552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944433299899699, + "loss": 3.9956, + "theoretical_loss": 5.0274090093460355, + "tokens_seen": 69337088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944332998996991, + "loss": 4.0111, + "theoretical_loss": 5.026716438673677, + "tokens_seen": 69402624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944232698094283, + "loss": 3.8318, + "theoretical_loss": 5.0260247045973045, + "tokens_seen": 69468160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944132397191575, + "loss": 4.0343, + "theoretical_loss": 5.02533380531853, + "tokens_seen": 69533696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944032096288867, + "loss": 3.6072, + "theoretical_loss": 5.024643739044526, + "tokens_seen": 69599232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943931795386158, + "loss": 4.0637, + "theoretical_loss": 5.023954503987998, + "tokens_seen": 69664768 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494383149448345, + "loss": 3.9123, + "theoretical_loss": 5.023266098367161, + "tokens_seen": 69730304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943731193580742, + "loss": 3.7934, + "theoretical_loss": 5.022578520405721, + "tokens_seen": 69795840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943630892678035, + "loss": 3.8608, + "theoretical_loss": 5.0218917683328534, + "tokens_seen": 69861376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943530591775326, + "loss": 3.8826, + "theoretical_loss": 5.021205840383175, + "tokens_seen": 69926912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943430290872618, + "loss": 3.7947, + "theoretical_loss": 5.020520734796728, + "tokens_seen": 69992448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943329989969909, + "loss": 3.8164, + "theoretical_loss": 5.019836449818957, + "tokens_seen": 70057984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943229689067202, + "loss": 3.6969, + "theoretical_loss": 5.019152983700687, + "tokens_seen": 70123520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943129388164494, + "loss": 3.8535, + "theoretical_loss": 5.018470334698101, + "tokens_seen": 70189056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943029087261786, + "loss": 3.9834, + "theoretical_loss": 5.01778850107272, + "tokens_seen": 70254592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942928786359078, + "loss": 3.7347, + "theoretical_loss": 5.017107481091379, + "tokens_seen": 70320128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942828485456369, + "loss": 3.6688, + "theoretical_loss": 5.016427273026212, + "tokens_seen": 70385664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 160845, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9590885639190674, + "objective/train/theoretical_loss": 5.015747875154622, + "objective/train/tokens_used": 90911200, + "theoretical_loss": 5.015747875154622, + "tokens_seen": 70451200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942728184553661, + "loss": 4.0648, + "theoretical_loss": 5.015747875154622, + "tokens_seen": 70451200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942627883650953, + "loss": 3.7767, + "theoretical_loss": 5.015069285759269, + "tokens_seen": 70516736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942527582748245, + "loss": 4.1511, + "theoretical_loss": 5.01439150312804, + "tokens_seen": 70582272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942427281845537, + "loss": 3.8699, + "theoretical_loss": 5.0137145255540405, + "tokens_seen": 70647808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942326980942828, + "loss": 3.8057, + "theoretical_loss": 5.013038351335559, + "tokens_seen": 70713344 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494222668004012, + "loss": 3.8407, + "theoretical_loss": 5.012362978776057, + "tokens_seen": 70778880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942126379137412, + "loss": 3.7493, + "theoretical_loss": 5.011688406184147, + "tokens_seen": 70844416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942026078234704, + "loss": 3.9297, + "theoretical_loss": 5.011014631873566, + "tokens_seen": 70909952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941925777331996, + "loss": 4.0194, + "theoretical_loss": 5.010341654163167, + "tokens_seen": 70975488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941825476429289, + "loss": 3.9314, + "theoretical_loss": 5.009669471376882, + "tokens_seen": 71041024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941725175526579, + "loss": 3.553, + "theoretical_loss": 5.008998081843721, + "tokens_seen": 71106560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941624874623872, + "loss": 4.1107, + "theoretical_loss": 5.008327483897736, + "tokens_seen": 71172096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941524573721163, + "loss": 3.8063, + "theoretical_loss": 5.00765767587801, + "tokens_seen": 71237632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941424272818456, + "loss": 3.7438, + "theoretical_loss": 5.006988656128635, + "tokens_seen": 71303168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941323971915748, + "loss": 3.6967, + "theoretical_loss": 5.006320422998691, + "tokens_seen": 71368704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941223671013039, + "loss": 3.9589, + "theoretical_loss": 5.00565297484223, + "tokens_seen": 71434240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941123370110331, + "loss": 3.9641, + "theoretical_loss": 5.004986310018252, + "tokens_seen": 71499776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941023069207623, + "loss": 4.093, + "theoretical_loss": 5.004320426890686, + "tokens_seen": 71565312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940922768304915, + "loss": 3.8581, + "theoretical_loss": 5.003655323828376, + "tokens_seen": 71630848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940822467402207, + "loss": 3.9614, + "theoretical_loss": 5.002990999205057, + "tokens_seen": 71696384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940722166499498, + "loss": 3.9838, + "theoretical_loss": 5.002327451399335, + "tokens_seen": 71761920 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494062186559679, + "loss": 4.1628, + "theoretical_loss": 5.001664678794671, + "tokens_seen": 71827456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940521564694082, + "loss": 3.9386, + "theoretical_loss": 5.001002679779363, + "tokens_seen": 71892992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940421263791374, + "loss": 3.9657, + "theoretical_loss": 5.0003414527465235, + "tokens_seen": 71958528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940320962888666, + "loss": 4.0775, + "theoretical_loss": 4.99968099609406, + "tokens_seen": 72024064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 161502, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.11187744140625, + "objective/train/theoretical_loss": 4.999021308224664, + "objective/train/tokens_used": 92549600, + "theoretical_loss": 4.999021308224664, + "tokens_seen": 72089600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940220661985958, + "loss": 3.9657, + "theoretical_loss": 4.999021308224664, + "tokens_seen": 72089600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940120361083249, + "loss": 3.9299, + "theoretical_loss": 4.998362387545782, + "tokens_seen": 72155136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940020060180542, + "loss": 3.833, + "theoretical_loss": 4.997704232469606, + "tokens_seen": 72220672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939919759277834, + "loss": 3.6457, + "theoretical_loss": 4.997046841413049, + "tokens_seen": 72286208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939819458375126, + "loss": 4.0112, + "theoretical_loss": 4.996390212797728, + "tokens_seen": 72351744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939719157472418, + "loss": 3.6914, + "theoretical_loss": 4.995734345049949, + "tokens_seen": 72417280 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493961885656971, + "loss": 3.8694, + "theoretical_loss": 4.995079236600686, + "tokens_seen": 72482816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939518555667001, + "loss": 3.9413, + "theoretical_loss": 4.994424885885564, + "tokens_seen": 72548352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939418254764293, + "loss": 4.0597, + "theoretical_loss": 4.993771291344839, + "tokens_seen": 72613888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939317953861585, + "loss": 4.0831, + "theoretical_loss": 4.993118451423381, + "tokens_seen": 72679424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939217652958877, + "loss": 3.8775, + "theoretical_loss": 4.992466364570659, + "tokens_seen": 72744960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939117352056169, + "loss": 4.0318, + "theoretical_loss": 4.991815029240721, + "tokens_seen": 72810496 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493901705115346, + "loss": 3.9009, + "theoretical_loss": 4.991164443892175, + "tokens_seen": 72876032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938916750250752, + "loss": 3.8903, + "theoretical_loss": 4.990514606988173, + "tokens_seen": 72941568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938816449348044, + "loss": 3.8758, + "theoretical_loss": 4.989865516996396, + "tokens_seen": 73007104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938716148445337, + "loss": 3.6876, + "theoretical_loss": 4.98921717238903, + "tokens_seen": 73072640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938615847542628, + "loss": 3.9345, + "theoretical_loss": 4.988569571642756, + "tokens_seen": 73138176 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493851554663992, + "loss": 3.6821, + "theoretical_loss": 4.98792271323873, + "tokens_seen": 73203712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938415245737211, + "loss": 3.9708, + "theoretical_loss": 4.9872765956625615, + "tokens_seen": 73269248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938314944834504, + "loss": 3.8872, + "theoretical_loss": 4.9866312174043035, + "tokens_seen": 73334784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938214643931796, + "loss": 3.799, + "theoretical_loss": 4.9859865769584335, + "tokens_seen": 73400320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938114343029088, + "loss": 3.9449, + "theoretical_loss": 4.9853426728238315, + "tokens_seen": 73465856 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493801404212638, + "loss": 3.5261, + "theoretical_loss": 4.984699503503771, + "tokens_seen": 73531392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937913741223671, + "loss": 3.9521, + "theoretical_loss": 4.984057067505898, + "tokens_seen": 73596928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937813440320963, + "loss": 3.977, + "theoretical_loss": 4.9834153633422105, + "tokens_seen": 73662464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 162491, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.735835313796997, + "objective/train/theoretical_loss": 4.982774389529053, + "objective/train/tokens_used": 94188000, + "theoretical_loss": 4.982774389529053, + "tokens_seen": 73728000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937713139418255, + "loss": 3.751, + "theoretical_loss": 4.982774389529053, + "tokens_seen": 73728000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937612838515547, + "loss": 3.8968, + "theoretical_loss": 4.9821341445870875, + "tokens_seen": 73793536 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937512537612839, + "loss": 3.5854, + "theoretical_loss": 4.981494627041286, + "tokens_seen": 73859072 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493741223671013, + "loss": 3.5525, + "theoretical_loss": 4.98085583542091, + "tokens_seen": 73924608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937311935807422, + "loss": 3.9853, + "theoretical_loss": 4.980217768259496, + "tokens_seen": 73990144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937211634904714, + "loss": 3.9015, + "theoretical_loss": 4.979580424094836, + "tokens_seen": 74055680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937111334002006, + "loss": 4.0376, + "theoretical_loss": 4.978943801468967, + "tokens_seen": 74121216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937011033099298, + "loss": 3.5555, + "theoretical_loss": 4.978307898928149, + "tokens_seen": 74186752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936910732196591, + "loss": 3.9641, + "theoretical_loss": 4.977672715022855, + "tokens_seen": 74252288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936810431293881, + "loss": 3.8699, + "theoretical_loss": 4.97703824830775, + "tokens_seen": 74317824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936710130391174, + "loss": 3.9669, + "theoretical_loss": 4.976404497341676, + "tokens_seen": 74383360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936609829488465, + "loss": 3.6727, + "theoretical_loss": 4.975771460687641, + "tokens_seen": 74448896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936509528585758, + "loss": 3.4212, + "theoretical_loss": 4.975139136912794, + "tokens_seen": 74514432 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493640922768305, + "loss": 3.8349, + "theoretical_loss": 4.974507524588424, + "tokens_seen": 74579968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936308926780341, + "loss": 3.9229, + "theoretical_loss": 4.973876622289927, + "tokens_seen": 74645504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936208625877633, + "loss": 3.4796, + "theoretical_loss": 4.973246428596802, + "tokens_seen": 74711040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936108324974925, + "loss": 3.7328, + "theoretical_loss": 4.972616942092634, + "tokens_seen": 74776576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936008024072217, + "loss": 4.1417, + "theoretical_loss": 4.971988161365077, + "tokens_seen": 74842112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935907723169509, + "loss": 3.5812, + "theoretical_loss": 4.9713600850058395, + "tokens_seen": 74907648 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049358074222668, + "loss": 3.661, + "theoretical_loss": 4.970732711610667, + "tokens_seen": 74973184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935707121364092, + "loss": 3.9342, + "theoretical_loss": 4.97010603977933, + "tokens_seen": 75038720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935606820461384, + "loss": 3.7614, + "theoretical_loss": 4.96948006811561, + "tokens_seen": 75104256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935506519558676, + "loss": 3.6541, + "theoretical_loss": 4.968854795227281, + "tokens_seen": 75169792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935406218655968, + "loss": 3.8975, + "theoretical_loss": 4.968230219726093, + "tokens_seen": 75235328 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493530591775326, + "loss": 3.8834, + "theoretical_loss": 4.967606340227765, + "tokens_seen": 75300864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 163075, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.548557996749878, + "objective/train/theoretical_loss": 4.966983155351962, + "objective/train/tokens_used": 95826400, + "theoretical_loss": 4.966983155351962, + "tokens_seen": 75366400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935205616850551, + "loss": 3.6221, + "theoretical_loss": 4.966983155351962, + "tokens_seen": 75366400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935105315947844, + "loss": 4.1199, + "theoretical_loss": 4.966360663722287, + "tokens_seen": 75431936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935005015045135, + "loss": 3.9567, + "theoretical_loss": 4.96573886396626, + "tokens_seen": 75497472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934904714142428, + "loss": 4.2001, + "theoretical_loss": 4.965117754715307, + "tokens_seen": 75563008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934804413239719, + "loss": 3.8199, + "theoretical_loss": 4.964497334604748, + "tokens_seen": 75628544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934704112337011, + "loss": 3.8548, + "theoretical_loss": 4.963877602273776, + "tokens_seen": 75694080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934603811434303, + "loss": 4.0167, + "theoretical_loss": 4.963258556365449, + "tokens_seen": 75759616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934503510531595, + "loss": 3.5764, + "theoretical_loss": 4.962640195526673, + "tokens_seen": 75825152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934403209628887, + "loss": 3.9003, + "theoretical_loss": 4.962022518408183, + "tokens_seen": 75890688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934302908726179, + "loss": 3.9603, + "theoretical_loss": 4.96140552366454, + "tokens_seen": 75956224 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493420260782347, + "loss": 3.6928, + "theoretical_loss": 4.9607892099541075, + "tokens_seen": 76021760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934102306920762, + "loss": 3.9968, + "theoretical_loss": 4.9601735759390415, + "tokens_seen": 76087296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934002006018054, + "loss": 3.5467, + "theoretical_loss": 4.959558620285274, + "tokens_seen": 76152832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933901705115346, + "loss": 3.4065, + "theoretical_loss": 4.958944341662502, + "tokens_seen": 76218368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933801404212638, + "loss": 4.0583, + "theoretical_loss": 4.958330738744172, + "tokens_seen": 76283904 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493370110330993, + "loss": 3.7728, + "theoretical_loss": 4.957717810207466, + "tokens_seen": 76349440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933600802407221, + "loss": 3.7919, + "theoretical_loss": 4.957105554733289, + "tokens_seen": 76414976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933500501504513, + "loss": 3.9052, + "theoretical_loss": 4.956493971006253, + "tokens_seen": 76480512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933400200601805, + "loss": 3.7208, + "theoretical_loss": 4.955883057714669, + "tokens_seen": 76546048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933299899699098, + "loss": 3.7532, + "theoretical_loss": 4.955272813550524, + "tokens_seen": 76611584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933199598796389, + "loss": 3.8077, + "theoretical_loss": 4.954663237209477, + "tokens_seen": 76677120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933099297893682, + "loss": 3.8766, + "theoretical_loss": 4.954054327390841, + "tokens_seen": 76742656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932998996990972, + "loss": 3.9885, + "theoretical_loss": 4.9534460827975675, + "tokens_seen": 76808192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932898696088265, + "loss": 3.7149, + "theoretical_loss": 4.952838502136241, + "tokens_seen": 76873728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932798395185557, + "loss": 3.8799, + "theoretical_loss": 4.952231584117056, + "tokens_seen": 76939264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 164442, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9509365558624268, + "objective/train/theoretical_loss": 4.951625327453812, + "objective/train/tokens_used": 97464800, + "theoretical_loss": 4.951625327453812, + "tokens_seen": 77004800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932698094282849, + "loss": 3.6636, + "theoretical_loss": 4.951625327453812, + "tokens_seen": 77004800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932597793380141, + "loss": 3.8467, + "theoretical_loss": 4.951019730863894, + "tokens_seen": 77070336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932497492477432, + "loss": 3.7791, + "theoretical_loss": 4.950414793068266, + "tokens_seen": 77135872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932397191574724, + "loss": 3.6553, + "theoretical_loss": 4.94981051279145, + "tokens_seen": 77201408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932296890672016, + "loss": 4.0068, + "theoretical_loss": 4.94920688876152, + "tokens_seen": 77266944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932196589769308, + "loss": 3.4971, + "theoretical_loss": 4.948603919710088, + "tokens_seen": 77332480 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049320962888666, + "loss": 3.5554, + "theoretical_loss": 4.948001604372287, + "tokens_seen": 77398016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931995987963893, + "loss": 3.8427, + "theoretical_loss": 4.947399941486762, + "tokens_seen": 77463552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931895687061183, + "loss": 3.7901, + "theoretical_loss": 4.946798929795658, + "tokens_seen": 77529088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931795386158476, + "loss": 3.6006, + "theoretical_loss": 4.946198568044602, + "tokens_seen": 77594624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931695085255767, + "loss": 3.7489, + "theoretical_loss": 4.945598854982698, + "tokens_seen": 77660160 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493159478435306, + "loss": 3.6273, + "theoretical_loss": 4.944999789362508, + "tokens_seen": 77725696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931494483450352, + "loss": 3.9755, + "theoretical_loss": 4.944401369940043, + "tokens_seen": 77791232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931394182547643, + "loss": 3.8917, + "theoretical_loss": 4.9438035954747495, + "tokens_seen": 77856768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931293881644935, + "loss": 3.9238, + "theoretical_loss": 4.9432064647294975, + "tokens_seen": 77922304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931193580742227, + "loss": 3.8091, + "theoretical_loss": 4.942609976470566, + "tokens_seen": 77987840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931093279839519, + "loss": 3.6847, + "theoretical_loss": 4.942014129467637, + "tokens_seen": 78053376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930992978936811, + "loss": 3.7321, + "theoretical_loss": 4.941418922493774, + "tokens_seen": 78118912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930892678034102, + "loss": 3.9275, + "theoretical_loss": 4.940824354325419, + "tokens_seen": 78184448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930792377131394, + "loss": 3.5937, + "theoretical_loss": 4.940230423742372, + "tokens_seen": 78249984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930692076228686, + "loss": 3.9973, + "theoretical_loss": 4.939637129527789, + "tokens_seen": 78315520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930591775325978, + "loss": 3.6964, + "theoretical_loss": 4.939044470468156, + "tokens_seen": 78381056 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493049147442327, + "loss": 3.8794, + "theoretical_loss": 4.938452445353294, + "tokens_seen": 78446592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930391173520562, + "loss": 3.6823, + "theoretical_loss": 4.937861052976332, + "tokens_seen": 78512128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930290872617853, + "loss": 4.0123, + "theoretical_loss": 4.937270292133704, + "tokens_seen": 78577664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 165209, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9666755199432373, + "objective/train/theoretical_loss": 4.9366801616251355, + "objective/train/tokens_used": 99103200, + "theoretical_loss": 4.9366801616251355, + "tokens_seen": 78643200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930190571715146, + "loss": 3.7987, + "theoretical_loss": 4.9366801616251355, + "tokens_seen": 78643200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930090270812437, + "loss": 3.7133, + "theoretical_loss": 4.93609066025363, + "tokens_seen": 78708736 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492998996990973, + "loss": 3.8611, + "theoretical_loss": 4.935501786825457, + "tokens_seen": 78774272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929889669007021, + "loss": 3.7493, + "theoretical_loss": 4.934913540150143, + "tokens_seen": 78839808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929789368104313, + "loss": 3.8265, + "theoretical_loss": 4.934325919040461, + "tokens_seen": 78905344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929689067201605, + "loss": 3.7146, + "theoretical_loss": 4.933738922312413, + "tokens_seen": 78970880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929588766298897, + "loss": 3.8796, + "theoretical_loss": 4.933152548785222, + "tokens_seen": 79036416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929488465396189, + "loss": 3.6061, + "theoretical_loss": 4.932566797281324, + "tokens_seen": 79101952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929388164493481, + "loss": 3.8318, + "theoretical_loss": 4.931981666626351, + "tokens_seen": 79167488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929287863590773, + "loss": 3.8867, + "theoretical_loss": 4.931397155649121, + "tokens_seen": 79233024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929187562688064, + "loss": 3.6513, + "theoretical_loss": 4.930813263181631, + "tokens_seen": 79298560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929087261785356, + "loss": 4.1337, + "theoretical_loss": 4.93022998805904, + "tokens_seen": 79364096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928986960882648, + "loss": 3.7265, + "theoretical_loss": 4.929647329119659, + "tokens_seen": 79429632 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492888665997994, + "loss": 3.6387, + "theoretical_loss": 4.9290652852049455, + "tokens_seen": 79495168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928786359077232, + "loss": 3.569, + "theoretical_loss": 4.928483855159485, + "tokens_seen": 79560704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928686058174523, + "loss": 4.0239, + "theoretical_loss": 4.927903037830983, + "tokens_seen": 79626240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928585757271815, + "loss": 3.9572, + "theoretical_loss": 4.9273228320702565, + "tokens_seen": 79691776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928485456369107, + "loss": 3.7791, + "theoretical_loss": 4.926743236731218, + "tokens_seen": 79757312 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049283851554664, + "loss": 3.8708, + "theoretical_loss": 4.926164250670868, + "tokens_seen": 79822848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928284854563691, + "loss": 3.6422, + "theoretical_loss": 4.925585872749284, + "tokens_seen": 79888384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928184553660984, + "loss": 3.9702, + "theoretical_loss": 4.925008101829608, + "tokens_seen": 79953920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928084252758274, + "loss": 3.745, + "theoretical_loss": 4.9244309367780374, + "tokens_seen": 80019456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927983951855567, + "loss": 3.6731, + "theoretical_loss": 4.923854376463816, + "tokens_seen": 80084992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927883650952859, + "loss": 3.8668, + "theoretical_loss": 4.923278419759217, + "tokens_seen": 80150528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927783350050151, + "loss": 3.8823, + "theoretical_loss": 4.92270306553954, + "tokens_seen": 80216064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 165995, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.573359489440918, + "objective/train/theoretical_loss": 4.922128312683096, + "objective/train/tokens_used": 100741600, + "theoretical_loss": 4.922128312683096, + "tokens_seen": 80281600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927683049147443, + "loss": 3.6868, + "theoretical_loss": 4.922128312683096, + "tokens_seen": 80281600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927582748244734, + "loss": 3.8374, + "theoretical_loss": 4.921554160071194, + "tokens_seen": 80347136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927482447342026, + "loss": 3.6288, + "theoretical_loss": 4.920980606588142, + "tokens_seen": 80412672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927382146439318, + "loss": 3.7754, + "theoretical_loss": 4.920407651121222, + "tokens_seen": 80478208 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492728184553661, + "loss": 3.869, + "theoretical_loss": 4.919835292560689, + "tokens_seen": 80543744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927181544633902, + "loss": 3.8912, + "theoretical_loss": 4.919263529799759, + "tokens_seen": 80609280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927081243731193, + "loss": 3.8343, + "theoretical_loss": 4.918692361734598, + "tokens_seen": 80674816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926980942828485, + "loss": 3.8784, + "theoretical_loss": 4.91812178726431, + "tokens_seen": 80740352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926880641925777, + "loss": 3.6742, + "theoretical_loss": 4.917551805290929, + "tokens_seen": 80805888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926780341023069, + "loss": 3.7316, + "theoretical_loss": 4.916982414719408, + "tokens_seen": 80871424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926680040120361, + "loss": 3.9762, + "theoretical_loss": 4.9164136144576105, + "tokens_seen": 80936960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926579739217654, + "loss": 3.5511, + "theoretical_loss": 4.915845403416299, + "tokens_seen": 81002496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926479438314944, + "loss": 4.2042, + "theoretical_loss": 4.915277780509124, + "tokens_seen": 81068032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926379137412237, + "loss": 3.7369, + "theoretical_loss": 4.914710744652614, + "tokens_seen": 81133568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926278836509528, + "loss": 3.7712, + "theoretical_loss": 4.914144294766169, + "tokens_seen": 81199104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926178535606821, + "loss": 3.7433, + "theoretical_loss": 4.913578429772047, + "tokens_seen": 81264640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926078234704113, + "loss": 3.6981, + "theoretical_loss": 4.913013148595355, + "tokens_seen": 81330176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925977933801404, + "loss": 3.6646, + "theoretical_loss": 4.912448450164041, + "tokens_seen": 81395712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925877632898696, + "loss": 3.775, + "theoretical_loss": 4.91188433340888, + "tokens_seen": 81461248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925777331995988, + "loss": 3.7487, + "theoretical_loss": 4.911320797263471, + "tokens_seen": 81526784 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492567703109328, + "loss": 3.6504, + "theoretical_loss": 4.910757840664219, + "tokens_seen": 81592320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925576730190572, + "loss": 3.6375, + "theoretical_loss": 4.910195462550334, + "tokens_seen": 81657856 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925476429287864, + "loss": 3.6929, + "theoretical_loss": 4.909633661863811, + "tokens_seen": 81723392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925376128385155, + "loss": 3.7826, + "theoretical_loss": 4.909072437549434, + "tokens_seen": 81788928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925275827482447, + "loss": 3.561, + "theoretical_loss": 4.908511788554753, + "tokens_seen": 81854464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 167243, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.2206902503967285, + "objective/train/theoretical_loss": 4.907951713830082, + "objective/train/tokens_used": 102380000, + "theoretical_loss": 4.907951713830082, + "tokens_seen": 81920000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925175526579739, + "loss": 4.1068, + "theoretical_loss": 4.907951713830082, + "tokens_seen": 81920000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925075225677031, + "loss": 3.8568, + "theoretical_loss": 4.907392212328489, + "tokens_seen": 81985536 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924974924774323, + "loss": 3.5104, + "theoretical_loss": 4.906833283005785, + "tokens_seen": 82051072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924874623871615, + "loss": 3.5485, + "theoretical_loss": 4.906274924820515, + "tokens_seen": 82116608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924774322968907, + "loss": 4.0778, + "theoretical_loss": 4.90571713673395, + "tokens_seen": 82182144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924674022066199, + "loss": 3.9291, + "theoretical_loss": 4.905159917710073, + "tokens_seen": 82247680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924573721163491, + "loss": 3.6175, + "theoretical_loss": 4.904603266715578, + "tokens_seen": 82313216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924473420260783, + "loss": 3.8755, + "theoretical_loss": 4.904047182719854, + "tokens_seen": 82378752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924373119358075, + "loss": 3.7729, + "theoretical_loss": 4.903491664694977, + "tokens_seen": 82444288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004924272818455366, + "loss": 3.8916, + "theoretical_loss": 4.902936711615702, + "tokens_seen": 82509824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004924172517552658, + "loss": 3.7129, + "theoretical_loss": 4.902382322459456, + "tokens_seen": 82575360 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492407221664995, + "loss": 3.5962, + "theoretical_loss": 4.901828496206322, + "tokens_seen": 82640896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923971915747242, + "loss": 3.7346, + "theoretical_loss": 4.90127523183904, + "tokens_seen": 82706432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923871614844534, + "loss": 3.8739, + "theoretical_loss": 4.900722528342988, + "tokens_seen": 82771968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923771313941825, + "loss": 3.7026, + "theoretical_loss": 4.900170384706181, + "tokens_seen": 82837504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923671013039117, + "loss": 3.6768, + "theoretical_loss": 4.899618799919256, + "tokens_seen": 82903040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923570712136409, + "loss": 3.5168, + "theoretical_loss": 4.899067772975469, + "tokens_seen": 82968576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923470411233702, + "loss": 3.357, + "theoretical_loss": 4.898517302870679, + "tokens_seen": 83034112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923370110330993, + "loss": 3.6077, + "theoretical_loss": 4.897967388603346, + "tokens_seen": 83099648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923269809428286, + "loss": 3.879, + "theoretical_loss": 4.897418029174519, + "tokens_seen": 83165184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923169508525576, + "loss": 3.5125, + "theoretical_loss": 4.896869223587828, + "tokens_seen": 83230720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923069207622869, + "loss": 3.7582, + "theoretical_loss": 4.896320970849472, + "tokens_seen": 83296256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922968906720161, + "loss": 3.9325, + "theoretical_loss": 4.895773269968219, + "tokens_seen": 83361792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922868605817453, + "loss": 3.6416, + "theoretical_loss": 4.895226119955386, + "tokens_seen": 83427328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922768304914745, + "loss": 3.7782, + "theoretical_loss": 4.894679519824841, + "tokens_seen": 83492864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 167845, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8555917739868164, + "objective/train/theoretical_loss": 4.894133468592984, + "objective/train/tokens_used": 104018400, + "theoretical_loss": 4.894133468592984, + "tokens_seen": 83558400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922668004012036, + "loss": 3.9438, + "theoretical_loss": 4.894133468592984, + "tokens_seen": 83558400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922567703109328, + "loss": 3.5061, + "theoretical_loss": 4.8935879652787495, + "tokens_seen": 83623936 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492246740220662, + "loss": 3.6214, + "theoretical_loss": 4.893043008903591, + "tokens_seen": 83689472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922367101303912, + "loss": 3.9424, + "theoretical_loss": 4.892498598491473, + "tokens_seen": 83755008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922266800401204, + "loss": 3.4731, + "theoretical_loss": 4.891954733068863, + "tokens_seen": 83820544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922166499498495, + "loss": 3.7306, + "theoretical_loss": 4.891411411664727, + "tokens_seen": 83886080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922066198595787, + "loss": 3.8514, + "theoretical_loss": 4.890868633310515, + "tokens_seen": 83951616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921965897693079, + "loss": 3.7332, + "theoretical_loss": 4.890326397040158, + "tokens_seen": 84017152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921865596790371, + "loss": 3.6843, + "theoretical_loss": 4.889784701890056, + "tokens_seen": 84082688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921765295887663, + "loss": 3.5876, + "theoretical_loss": 4.8892435468990705, + "tokens_seen": 84148224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921664994984956, + "loss": 3.6961, + "theoretical_loss": 4.88870293110852, + "tokens_seen": 84213760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921564694082246, + "loss": 3.6685, + "theoretical_loss": 4.888162853562166, + "tokens_seen": 84279296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921464393179539, + "loss": 3.4346, + "theoretical_loss": 4.88762331330621, + "tokens_seen": 84344832 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492136409227683, + "loss": 3.5563, + "theoretical_loss": 4.88708430938928, + "tokens_seen": 84410368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921263791374123, + "loss": 3.7265, + "theoretical_loss": 4.8865458408624285, + "tokens_seen": 84475904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921163490471415, + "loss": 3.5372, + "theoretical_loss": 4.8860079067791204, + "tokens_seen": 84541440 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921063189568706, + "loss": 3.4829, + "theoretical_loss": 4.885470506195227, + "tokens_seen": 84606976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920962888665998, + "loss": 3.533, + "theoretical_loss": 4.884933638169014, + "tokens_seen": 84672512 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492086258776329, + "loss": 3.9744, + "theoretical_loss": 4.88439730176114, + "tokens_seen": 84738048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920762286860582, + "loss": 3.674, + "theoretical_loss": 4.883861496034644, + "tokens_seen": 84803584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920661985957874, + "loss": 3.9751, + "theoretical_loss": 4.88332622005494, + "tokens_seen": 84869120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920561685055166, + "loss": 3.6296, + "theoretical_loss": 4.8827914728898065, + "tokens_seen": 84934656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920461384152457, + "loss": 3.6182, + "theoretical_loss": 4.88225725360938, + "tokens_seen": 85000192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920361083249749, + "loss": 3.7477, + "theoretical_loss": 4.881723561286149, + "tokens_seen": 85065728 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920260782347041, + "loss": 3.6688, + "theoretical_loss": 4.881190394994943, + "tokens_seen": 85131264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 169231, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.038014888763428, + "objective/train/theoretical_loss": 4.880657753812926, + "objective/train/tokens_used": 105656800, + "theoretical_loss": 4.880657753812926, + "tokens_seen": 85196800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920160481444333, + "loss": 3.8814, + "theoretical_loss": 4.880657753812926, + "tokens_seen": 85196800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920060180541625, + "loss": 3.6396, + "theoretical_loss": 4.880125636819594, + "tokens_seen": 85262336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919959879638916, + "loss": 3.7983, + "theoretical_loss": 4.879594043096755, + "tokens_seen": 85327872 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919859578736209, + "loss": 3.8557, + "theoretical_loss": 4.879062971728534, + "tokens_seen": 85393408 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049197592778335, + "loss": 3.5785, + "theoretical_loss": 4.87853242180136, + "tokens_seen": 85458944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919658976930793, + "loss": 3.4825, + "theoretical_loss": 4.878002392403959, + "tokens_seen": 85524480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919558676028084, + "loss": 3.5934, + "theoretical_loss": 4.877472882627343, + "tokens_seen": 85590016 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919458375125377, + "loss": 3.7652, + "theoretical_loss": 4.8769438915648085, + "tokens_seen": 85655552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919358074222668, + "loss": 3.5099, + "theoretical_loss": 4.876415418311928, + "tokens_seen": 85721088 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491925777331996, + "loss": 3.4595, + "theoretical_loss": 4.875887461966537, + "tokens_seen": 85786624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919157472417252, + "loss": 3.7013, + "theoretical_loss": 4.875360021628733, + "tokens_seen": 85852160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919057171514544, + "loss": 3.8989, + "theoretical_loss": 4.874833096400865, + "tokens_seen": 85917696 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918956870611836, + "loss": 3.8334, + "theoretical_loss": 4.874306685387525, + "tokens_seen": 85983232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918856569709127, + "loss": 3.4766, + "theoretical_loss": 4.873780787695547, + "tokens_seen": 86048768 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918756268806419, + "loss": 3.6543, + "theoretical_loss": 4.87325540243399, + "tokens_seen": 86114304 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918655967903711, + "loss": 3.5469, + "theoretical_loss": 4.872730528714139, + "tokens_seen": 86179840 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918555667001003, + "loss": 3.7906, + "theoretical_loss": 4.872206165649493, + "tokens_seen": 86245376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918455366098295, + "loss": 3.7537, + "theoretical_loss": 4.871682312355761, + "tokens_seen": 86310912 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918355065195586, + "loss": 3.8324, + "theoretical_loss": 4.871158967950852, + "tokens_seen": 86376448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918254764292878, + "loss": 3.5791, + "theoretical_loss": 4.870636131554869, + "tokens_seen": 86441984 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491815446339017, + "loss": 3.6966, + "theoretical_loss": 4.8701138022901045, + "tokens_seen": 86507520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918054162487463, + "loss": 3.89, + "theoretical_loss": 4.869591979281028, + "tokens_seen": 86573056 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917953861584754, + "loss": 3.9098, + "theoretical_loss": 4.8690706616542805, + "tokens_seen": 86638592 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917853560682047, + "loss": 3.6221, + "theoretical_loss": 4.868549848538675, + "tokens_seen": 86704128 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917753259779337, + "loss": 3.7127, + "theoretical_loss": 4.868029539065176, + "tokens_seen": 86769664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 169899, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6036012172698975, + "objective/train/theoretical_loss": 4.867509732366907, + "objective/train/tokens_used": 107295200, + "theoretical_loss": 4.867509732366907, + "tokens_seen": 86835200 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491765295887663, + "loss": 3.6755, + "theoretical_loss": 4.867509732366907, + "tokens_seen": 86835200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917552657973922, + "loss": 3.8447, + "theoretical_loss": 4.866990427579129, + "tokens_seen": 86900736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917452357071214, + "loss": 3.8378, + "theoretical_loss": 4.866471623839248, + "tokens_seen": 86966272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917352056168506, + "loss": 3.7024, + "theoretical_loss": 4.8659533202867955, + "tokens_seen": 87031808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917251755265797, + "loss": 3.7791, + "theoretical_loss": 4.86543551606343, + "tokens_seen": 87097344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917151454363089, + "loss": 3.7572, + "theoretical_loss": 4.864918210312927, + "tokens_seen": 87162880 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917051153460381, + "loss": 3.4891, + "theoretical_loss": 4.864401402181173, + "tokens_seen": 87228416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916950852557673, + "loss": 3.7334, + "theoretical_loss": 4.863885090816158, + "tokens_seen": 87293952 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916850551654965, + "loss": 3.818, + "theoretical_loss": 4.863369275367968, + "tokens_seen": 87359488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916750250752258, + "loss": 3.6889, + "theoretical_loss": 4.862853954988781, + "tokens_seen": 87425024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916649949849548, + "loss": 3.6571, + "theoretical_loss": 4.862339128832857, + "tokens_seen": 87490560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916549648946841, + "loss": 3.6037, + "theoretical_loss": 4.861824796056533, + "tokens_seen": 87556096 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916449348044132, + "loss": 4.1055, + "theoretical_loss": 4.861310955818219, + "tokens_seen": 87621632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916349047141425, + "loss": 3.9186, + "theoretical_loss": 4.860797607278385, + "tokens_seen": 87687168 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916248746238717, + "loss": 3.6732, + "theoretical_loss": 4.86028474959956, + "tokens_seen": 87752704 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916148445336008, + "loss": 3.6389, + "theoretical_loss": 4.859772381946323, + "tokens_seen": 87818240 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049160481444333, + "loss": 3.4738, + "theoretical_loss": 4.859260503485298, + "tokens_seen": 87883776 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915947843530592, + "loss": 3.6106, + "theoretical_loss": 4.858749113385144, + "tokens_seen": 87949312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915847542627884, + "loss": 3.8513, + "theoretical_loss": 4.858238210816554, + "tokens_seen": 88014848 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915747241725176, + "loss": 3.5283, + "theoretical_loss": 4.8577277949522415, + "tokens_seen": 88080384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915646940822468, + "loss": 3.6318, + "theoretical_loss": 4.857217864966943, + "tokens_seen": 88145920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915546639919759, + "loss": 3.9574, + "theoretical_loss": 4.856708420037402, + "tokens_seen": 88211456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915446339017051, + "loss": 3.6005, + "theoretical_loss": 4.8561994593423705, + "tokens_seen": 88276992 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915346038114343, + "loss": 3.6561, + "theoretical_loss": 4.8556909820625975, + "tokens_seen": 88342528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915245737211635, + "loss": 3.5624, + "theoretical_loss": 4.855182987380823, + "tokens_seen": 88408064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 170786, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6651480197906494, + "objective/train/theoretical_loss": 4.854675474481779, + "objective/train/tokens_used": 108933600, + "theoretical_loss": 4.854675474481779, + "tokens_seen": 88473600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915145436308927, + "loss": 3.5245, + "theoretical_loss": 4.854675474481779, + "tokens_seen": 88473600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915045135406218, + "loss": 3.9545, + "theoretical_loss": 4.8541684425521705, + "tokens_seen": 88539136 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914944834503511, + "loss": 3.7957, + "theoretical_loss": 4.85366189078068, + "tokens_seen": 88604672 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914844533600802, + "loss": 3.9079, + "theoretical_loss": 4.853155818357957, + "tokens_seen": 88670208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914744232698095, + "loss": 3.9184, + "theoretical_loss": 4.852650224476609, + "tokens_seen": 88735744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914643931795386, + "loss": 3.731, + "theoretical_loss": 4.852145108331205, + "tokens_seen": 88801280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914543630892679, + "loss": 3.7566, + "theoretical_loss": 4.851640469118255, + "tokens_seen": 88866816 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491444332998997, + "loss": 3.6482, + "theoretical_loss": 4.851136306036219, + "tokens_seen": 88932352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914343029087262, + "loss": 3.7222, + "theoretical_loss": 4.850632618285486, + "tokens_seen": 88997888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914242728184554, + "loss": 3.6696, + "theoretical_loss": 4.850129405068383, + "tokens_seen": 89063424 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914142427281846, + "loss": 3.8089, + "theoretical_loss": 4.849626665589156, + "tokens_seen": 89128960 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914042126379138, + "loss": 3.5295, + "theoretical_loss": 4.849124399053969, + "tokens_seen": 89194496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913941825476429, + "loss": 3.6736, + "theoretical_loss": 4.8486226046709024, + "tokens_seen": 89260032 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913841524573721, + "loss": 3.5729, + "theoretical_loss": 4.8481212816499415, + "tokens_seen": 89325568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913741223671013, + "loss": 3.7998, + "theoretical_loss": 4.847620429202967, + "tokens_seen": 89391104 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913640922768305, + "loss": 3.4795, + "theoretical_loss": 4.847120046543763, + "tokens_seen": 89456640 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913540621865597, + "loss": 3.8715, + "theoretical_loss": 4.846620132887992, + "tokens_seen": 89522176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913440320962888, + "loss": 3.7386, + "theoretical_loss": 4.8461206874532055, + "tokens_seen": 89587712 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491334002006018, + "loss": 3.6659, + "theoretical_loss": 4.845621709458831, + "tokens_seen": 89653248 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913239719157472, + "loss": 3.4551, + "theoretical_loss": 4.845123198126162, + "tokens_seen": 89718784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913139418254765, + "loss": 3.6237, + "theoretical_loss": 4.844625152678364, + "tokens_seen": 89784320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913039117352056, + "loss": 3.8513, + "theoretical_loss": 4.844127572340455, + "tokens_seen": 89849856 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912938816449349, + "loss": 3.6724, + "theoretical_loss": 4.84363045633931, + "tokens_seen": 89915392 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912838515546639, + "loss": 3.8743, + "theoretical_loss": 4.843133803903651, + "tokens_seen": 89980928 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912738214643932, + "loss": 3.6873, + "theoretical_loss": 4.84263761426404, + "tokens_seen": 90046464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 171435, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2105295658111572, + "objective/train/theoretical_loss": 4.842141886652876, + "objective/train/tokens_used": 110572000, + "theoretical_loss": 4.842141886652876, + "tokens_seen": 90112000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912637913741224, + "loss": 3.2592, + "theoretical_loss": 4.842141886652876, + "tokens_seen": 90112000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912537612838516, + "loss": 3.6091, + "theoretical_loss": 4.841646620304388, + "tokens_seen": 90177536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912437311935808, + "loss": 3.7829, + "theoretical_loss": 4.841151814454632, + "tokens_seen": 90243072 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049123370110331, + "loss": 3.8082, + "theoretical_loss": 4.840657468341476, + "tokens_seen": 90308608 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912236710130391, + "loss": 3.5862, + "theoretical_loss": 4.84016358120461, + "tokens_seen": 90374144 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912136409227683, + "loss": 3.523, + "theoretical_loss": 4.839670152285526, + "tokens_seen": 90439680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912036108324975, + "loss": 3.3392, + "theoretical_loss": 4.8391771808275195, + "tokens_seen": 90505216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911935807422267, + "loss": 3.6086, + "theoretical_loss": 4.838684666075682, + "tokens_seen": 90570752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911835506519559, + "loss": 3.7421, + "theoretical_loss": 4.838192607276896, + "tokens_seen": 90636288 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491173520561685, + "loss": 3.549, + "theoretical_loss": 4.837701003679829, + "tokens_seen": 90701824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911634904714142, + "loss": 3.6988, + "theoretical_loss": 4.8372098545349305, + "tokens_seen": 90767360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911534603811434, + "loss": 3.5152, + "theoretical_loss": 4.836719159094422, + "tokens_seen": 90832896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911434302908726, + "loss": 3.5065, + "theoretical_loss": 4.836228916612292, + "tokens_seen": 90898432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911334002006019, + "loss": 3.8109, + "theoretical_loss": 4.835739126344298, + "tokens_seen": 90963968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911233701103309, + "loss": 3.9112, + "theoretical_loss": 4.8352497875479505, + "tokens_seen": 91029504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911133400200602, + "loss": 3.8912, + "theoretical_loss": 4.834760899482514, + "tokens_seen": 91095040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911033099297893, + "loss": 3.9841, + "theoretical_loss": 4.834272461409001, + "tokens_seen": 91160576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910932798395186, + "loss": 3.6817, + "theoretical_loss": 4.833784472590165, + "tokens_seen": 91226112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910832497492478, + "loss": 3.795, + "theoretical_loss": 4.833296932290495, + "tokens_seen": 91291648 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491073219658977, + "loss": 3.8488, + "theoretical_loss": 4.832809839776213, + "tokens_seen": 91357184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910631895687061, + "loss": 4.0801, + "theoretical_loss": 4.832323194315265, + "tokens_seen": 91422720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910531594784353, + "loss": 3.8667, + "theoretical_loss": 4.831836995177319, + "tokens_seen": 91488256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910431293881645, + "loss": 3.849, + "theoretical_loss": 4.831351241633756, + "tokens_seen": 91553792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910330992978937, + "loss": 3.8149, + "theoretical_loss": 4.8308659329576695, + "tokens_seen": 91619328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910230692076229, + "loss": 3.7799, + "theoretical_loss": 4.830381068423856, + "tokens_seen": 91684864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 171740, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2981529235839844, + "objective/train/theoretical_loss": 4.8298966473088125, + "objective/train/tokens_used": 112210400, + "theoretical_loss": 4.8298966473088125, + "tokens_seen": 91750400 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491013039117352, + "loss": 3.4557, + "theoretical_loss": 4.8298966473088125, + "tokens_seen": 91750400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910030090270812, + "loss": 4.0207, + "theoretical_loss": 4.829412668890729, + "tokens_seen": 91815936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909929789368104, + "loss": 3.8245, + "theoretical_loss": 4.8289291324494865, + "tokens_seen": 91881472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909829488465397, + "loss": 3.7635, + "theoretical_loss": 4.828446037266647, + "tokens_seen": 91947008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909729187562688, + "loss": 3.6794, + "theoretical_loss": 4.827963382625454, + "tokens_seen": 92012544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909628886659981, + "loss": 3.7506, + "theoretical_loss": 4.827481167810825, + "tokens_seen": 92078080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909528585757272, + "loss": 4.0619, + "theoretical_loss": 4.826999392109344, + "tokens_seen": 92143616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909428284854564, + "loss": 3.8418, + "theoretical_loss": 4.826518054809259, + "tokens_seen": 92209152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909327983951856, + "loss": 3.809, + "theoretical_loss": 4.826037155200478, + "tokens_seen": 92274688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909227683049148, + "loss": 3.9797, + "theoretical_loss": 4.825556692574562, + "tokens_seen": 92340224 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490912738214644, + "loss": 3.8061, + "theoretical_loss": 4.825076666224717, + "tokens_seen": 92405760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909027081243731, + "loss": 3.7777, + "theoretical_loss": 4.824597075445799, + "tokens_seen": 92471296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908926780341023, + "loss": 3.8168, + "theoretical_loss": 4.824117919534297, + "tokens_seen": 92536832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908826479438315, + "loss": 3.8435, + "theoretical_loss": 4.823639197788334, + "tokens_seen": 92602368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908726178535607, + "loss": 3.5213, + "theoretical_loss": 4.823160909507665, + "tokens_seen": 92667904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908625877632899, + "loss": 3.6417, + "theoretical_loss": 4.822683053993664, + "tokens_seen": 92733440 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490852557673019, + "loss": 3.4858, + "theoretical_loss": 4.822205630549329, + "tokens_seen": 92798976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908425275827482, + "loss": 3.7943, + "theoretical_loss": 4.821728638479267, + "tokens_seen": 92864512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908324974924774, + "loss": 3.7895, + "theoretical_loss": 4.821252077089696, + "tokens_seen": 92930048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908224674022067, + "loss": 3.753, + "theoretical_loss": 4.820775945688437, + "tokens_seen": 92995584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908124373119358, + "loss": 3.9838, + "theoretical_loss": 4.820300243584913, + "tokens_seen": 93061120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908024072216651, + "loss": 3.8889, + "theoretical_loss": 4.819824970090138, + "tokens_seen": 93126656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907923771313941, + "loss": 3.8958, + "theoretical_loss": 4.819350124516717, + "tokens_seen": 93192192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907823470411234, + "loss": 3.6089, + "theoretical_loss": 4.818875706178841, + "tokens_seen": 93257728 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907723169508526, + "loss": 3.9225, + "theoretical_loss": 4.818401714392279, + "tokens_seen": 93323264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 171740, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6480627059936523, + "objective/train/theoretical_loss": 4.817928148474378, + "objective/train/tokens_used": 113848800, + "theoretical_loss": 4.817928148474378, + "tokens_seen": 93388800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907622868605818, + "loss": 3.544, + "theoretical_loss": 4.817928148474378, + "tokens_seen": 93388800 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490752256770311, + "loss": 3.8504, + "theoretical_loss": 4.817455007744052, + "tokens_seen": 93454336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907422266800401, + "loss": 3.6831, + "theoretical_loss": 4.816982291521785, + "tokens_seen": 93519872 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907321965897693, + "loss": 3.7159, + "theoretical_loss": 4.816509999129618, + "tokens_seen": 93585408 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907221664994985, + "loss": 3.8475, + "theoretical_loss": 4.816038129891151, + "tokens_seen": 93650944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907121364092277, + "loss": 3.867, + "theoretical_loss": 4.815566683131536, + "tokens_seen": 93716480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907021063189569, + "loss": 3.7402, + "theoretical_loss": 4.815095658177472, + "tokens_seen": 93782016 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490692076228686, + "loss": 3.6066, + "theoretical_loss": 4.814625054357199, + "tokens_seen": 93847552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906820461384152, + "loss": 3.7494, + "theoretical_loss": 4.814154871000497, + "tokens_seen": 93913088 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906720160481444, + "loss": 3.996, + "theoretical_loss": 4.813685107438679, + "tokens_seen": 93978624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906619859578736, + "loss": 3.7818, + "theoretical_loss": 4.813215763004585, + "tokens_seen": 94044160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906519558676028, + "loss": 3.8077, + "theoretical_loss": 4.812746837032582, + "tokens_seen": 94109696 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906419257773321, + "loss": 3.8845, + "theoretical_loss": 4.812278328858554, + "tokens_seen": 94175232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906318956870611, + "loss": 3.8249, + "theoretical_loss": 4.811810237819904, + "tokens_seen": 94240768 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906218655967904, + "loss": 3.8511, + "theoretical_loss": 4.81134256325554, + "tokens_seen": 94306304 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906118355065195, + "loss": 3.5025, + "theoretical_loss": 4.810875304505881, + "tokens_seen": 94371840 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906018054162488, + "loss": 3.9814, + "theoretical_loss": 4.810408460912846, + "tokens_seen": 94437376 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490591775325978, + "loss": 3.7851, + "theoretical_loss": 4.809942031819853, + "tokens_seen": 94502912 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905817452357072, + "loss": 3.7676, + "theoretical_loss": 4.809476016571809, + "tokens_seen": 94568448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905717151454363, + "loss": 3.8481, + "theoretical_loss": 4.809010414515113, + "tokens_seen": 94633984 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905616850551655, + "loss": 3.8722, + "theoretical_loss": 4.808545224997644, + "tokens_seen": 94699520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905516549648947, + "loss": 4.0819, + "theoretical_loss": 4.808080447368766, + "tokens_seen": 94765056 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905416248746239, + "loss": 3.7927, + "theoretical_loss": 4.807616080979315, + "tokens_seen": 94830592 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905315947843531, + "loss": 3.8727, + "theoretical_loss": 4.807152125181597, + "tokens_seen": 94896128 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905215646940822, + "loss": 3.7942, + "theoretical_loss": 4.806688579329387, + "tokens_seen": 94961664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 173310, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.714308500289917, + "objective/train/theoretical_loss": 4.8062254427779205, + "objective/train/tokens_used": 115487200, + "theoretical_loss": 4.8062254427779205, + "tokens_seen": 95027200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905115346038114, + "loss": 3.8544, + "theoretical_loss": 4.8062254427779205, + "tokens_seen": 95027200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905015045135406, + "loss": 3.7483, + "theoretical_loss": 4.80576271488389, + "tokens_seen": 95092736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904914744232698, + "loss": 3.8588, + "theoretical_loss": 4.805300395005444, + "tokens_seen": 95158272 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490481444332999, + "loss": 4.0297, + "theoretical_loss": 4.804838482502181, + "tokens_seen": 95223808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904714142427281, + "loss": 3.9291, + "theoretical_loss": 4.8043769767351385, + "tokens_seen": 95289344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904613841524574, + "loss": 3.7934, + "theoretical_loss": 4.8039158770668005, + "tokens_seen": 95354880 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904513540621865, + "loss": 3.911, + "theoretical_loss": 4.803455182861087, + "tokens_seen": 95420416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904413239719158, + "loss": 3.6903, + "theoretical_loss": 4.802994893483348, + "tokens_seen": 95485952 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904312938816449, + "loss": 3.3024, + "theoretical_loss": 4.802535008300364, + "tokens_seen": 95551488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904212637913742, + "loss": 3.9319, + "theoretical_loss": 4.802075526680335, + "tokens_seen": 95617024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904112337011033, + "loss": 3.905, + "theoretical_loss": 4.801616447992888, + "tokens_seen": 95682560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904012036108325, + "loss": 3.6539, + "theoretical_loss": 4.801157771609061, + "tokens_seen": 95748096 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903911735205617, + "loss": 3.9673, + "theoretical_loss": 4.8006994969013, + "tokens_seen": 95813632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903811434302909, + "loss": 3.9648, + "theoretical_loss": 4.800241623243467, + "tokens_seen": 95879168 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903711133400201, + "loss": 3.6807, + "theoretical_loss": 4.799784150010819, + "tokens_seen": 95944704 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903610832497492, + "loss": 3.5081, + "theoretical_loss": 4.799327076580017, + "tokens_seen": 96010240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903510531594784, + "loss": 3.9447, + "theoretical_loss": 4.798870402329115, + "tokens_seen": 96075776 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903410230692076, + "loss": 3.8515, + "theoretical_loss": 4.798414126637558, + "tokens_seen": 96141312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903309929789368, + "loss": 3.6779, + "theoretical_loss": 4.797958248886179, + "tokens_seen": 96206848 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490320962888666, + "loss": 3.7298, + "theoretical_loss": 4.797502768457193, + "tokens_seen": 96272384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903109327983952, + "loss": 3.9204, + "theoretical_loss": 4.797047684734192, + "tokens_seen": 96337920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903009027081243, + "loss": 3.7826, + "theoretical_loss": 4.796592997102147, + "tokens_seen": 96403456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902908726178535, + "loss": 3.9417, + "theoretical_loss": 4.796138704947397, + "tokens_seen": 96468992 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902808425275828, + "loss": 3.7212, + "theoretical_loss": 4.795684807657649, + "tokens_seen": 96534528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902708124373119, + "loss": 3.97, + "theoretical_loss": 4.795231304621968, + "tokens_seen": 96600064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 173936, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.69027042388916, + "objective/train/theoretical_loss": 4.794778195230787, + "objective/train/tokens_used": 117125600, + "theoretical_loss": 4.794778195230787, + "tokens_seen": 96665600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902607823470412, + "loss": 3.7908, + "theoretical_loss": 4.794778195230787, + "tokens_seen": 96665600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902507522567703, + "loss": 3.6508, + "theoretical_loss": 4.794325478875885, + "tokens_seen": 96731136 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902407221664995, + "loss": 3.8929, + "theoretical_loss": 4.793873154950399, + "tokens_seen": 96796672 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902306920762287, + "loss": 3.9375, + "theoretical_loss": 4.793421222848808, + "tokens_seen": 96862208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902206619859579, + "loss": 3.9788, + "theoretical_loss": 4.7929696819669365, + "tokens_seen": 96927744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902106318956871, + "loss": 3.9239, + "theoretical_loss": 4.792518531701948, + "tokens_seen": 96993280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902006018054163, + "loss": 3.8721, + "theoretical_loss": 4.792067771452341, + "tokens_seen": 97058816 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901905717151454, + "loss": 3.6114, + "theoretical_loss": 4.791617400617948, + "tokens_seen": 97124352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901805416248746, + "loss": 3.7371, + "theoretical_loss": 4.791167418599925, + "tokens_seen": 97189888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901705115346038, + "loss": 3.74, + "theoretical_loss": 4.790717824800755, + "tokens_seen": 97255424 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490160481444333, + "loss": 3.9032, + "theoretical_loss": 4.790268618624239, + "tokens_seen": 97320960 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901504513540623, + "loss": 3.6149, + "theoretical_loss": 4.789819799475499, + "tokens_seen": 97386496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901404212637913, + "loss": 3.8858, + "theoretical_loss": 4.789371366760961, + "tokens_seen": 97452032 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901303911735206, + "loss": 3.722, + "theoretical_loss": 4.788923319888369, + "tokens_seen": 97517568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901203610832497, + "loss": 3.6514, + "theoretical_loss": 4.788475658266766, + "tokens_seen": 97583104 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490110330992979, + "loss": 3.6426, + "theoretical_loss": 4.788028381306497, + "tokens_seen": 97648640 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901003009027082, + "loss": 3.6375, + "theoretical_loss": 4.787581488419207, + "tokens_seen": 97714176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900902708124374, + "loss": 3.7361, + "theoretical_loss": 4.787134979017832, + "tokens_seen": 97779712 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900802407221665, + "loss": 3.676, + "theoretical_loss": 4.786688852516599, + "tokens_seen": 97845248 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900702106318957, + "loss": 3.9366, + "theoretical_loss": 4.786243108331024, + "tokens_seen": 97910784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900601805416249, + "loss": 3.7287, + "theoretical_loss": 4.7857977458779, + "tokens_seen": 97976320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900501504513541, + "loss": 3.5148, + "theoretical_loss": 4.785352764575304, + "tokens_seen": 98041856 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900401203610833, + "loss": 3.8189, + "theoretical_loss": 4.784908163842585, + "tokens_seen": 98107392 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900300902708124, + "loss": 3.741, + "theoretical_loss": 4.784463943100367, + "tokens_seen": 98172928 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900200601805416, + "loss": 3.9671, + "theoretical_loss": 4.7840201017705395, + "tokens_seen": 98238464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 175137, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.979011058807373, + "objective/train/theoretical_loss": 4.783576639276257, + "objective/train/tokens_used": 118764000, + "theoretical_loss": 4.783576639276257, + "tokens_seen": 98304000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900100300902708, + "loss": 3.7197, + "theoretical_loss": 4.783576639276257, + "tokens_seen": 98304000 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049, + "loss": 3.9945, + "theoretical_loss": 4.783133555041934, + "tokens_seen": 98369536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899899699097292, + "loss": 3.8699, + "theoretical_loss": 4.782690848493245, + "tokens_seen": 98435072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899799398194583, + "loss": 3.7009, + "theoretical_loss": 4.7822485190571165, + "tokens_seen": 98500608 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899699097291876, + "loss": 3.9709, + "theoretical_loss": 4.781806566161723, + "tokens_seen": 98566144 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899598796389167, + "loss": 3.8659, + "theoretical_loss": 4.781364989236488, + "tokens_seen": 98631680 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489949849548646, + "loss": 3.6496, + "theoretical_loss": 4.78092378771208, + "tokens_seen": 98697216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899398194583751, + "loss": 3.7254, + "theoretical_loss": 4.780482961020402, + "tokens_seen": 98762752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899297893681044, + "loss": 3.7948, + "theoretical_loss": 4.780042508594596, + "tokens_seen": 98828288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899197592778335, + "loss": 3.9568, + "theoretical_loss": 4.779602429869035, + "tokens_seen": 98893824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899097291875627, + "loss": 3.7481, + "theoretical_loss": 4.779162724279324, + "tokens_seen": 98959360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898996990972919, + "loss": 3.755, + "theoretical_loss": 4.7787233912622895, + "tokens_seen": 99024896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898896690070211, + "loss": 3.7307, + "theoretical_loss": 4.778284430255981, + "tokens_seen": 99090432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898796389167503, + "loss": 3.4225, + "theoretical_loss": 4.77784584069967, + "tokens_seen": 99155968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898696088264794, + "loss": 3.8225, + "theoretical_loss": 4.777407622033838, + "tokens_seen": 99221504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898595787362086, + "loss": 3.5734, + "theoretical_loss": 4.776969773700181, + "tokens_seen": 99287040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898495486459378, + "loss": 3.6576, + "theoretical_loss": 4.776532295141601, + "tokens_seen": 99352576 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489839518555667, + "loss": 3.916, + "theoretical_loss": 4.776095185802211, + "tokens_seen": 99418112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898294884653962, + "loss": 3.4804, + "theoretical_loss": 4.775658445127318, + "tokens_seen": 99483648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898194583751254, + "loss": 3.6648, + "theoretical_loss": 4.775222072563429, + "tokens_seen": 99549184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898094282848545, + "loss": 4.0798, + "theoretical_loss": 4.7747860675582485, + "tokens_seen": 99614720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897993981945837, + "loss": 3.7901, + "theoretical_loss": 4.77435042956067, + "tokens_seen": 99680256 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489789368104313, + "loss": 3.6989, + "theoretical_loss": 4.773915158020776, + "tokens_seen": 99745792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897793380140421, + "loss": 3.6796, + "theoretical_loss": 4.773480252389831, + "tokens_seen": 99811328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897693079237714, + "loss": 3.6931, + "theoretical_loss": 4.773045712120284, + "tokens_seen": 99876864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 175797, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.162777900695801, + "objective/train/theoretical_loss": 4.77261153666576, + "objective/train/tokens_used": 120402400, + "theoretical_loss": 4.77261153666576, + "tokens_seen": 99942400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897592778335005, + "loss": 3.6811, + "theoretical_loss": 4.77261153666576, + "tokens_seen": 99942400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897492477432297, + "loss": 3.8359, + "theoretical_loss": 4.772177725481062, + "tokens_seen": 100007936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897392176529589, + "loss": 4.1084, + "theoretical_loss": 4.77174427802216, + "tokens_seen": 100073472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897291875626881, + "loss": 3.5353, + "theoretical_loss": 4.771311193746191, + "tokens_seen": 100139008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897191574724173, + "loss": 3.6427, + "theoretical_loss": 4.770878472111465, + "tokens_seen": 100204544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897091273821465, + "loss": 3.7959, + "theoretical_loss": 4.770446112577445, + "tokens_seen": 100270080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896990972918756, + "loss": 3.7708, + "theoretical_loss": 4.770014114604756, + "tokens_seen": 100335616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896890672016048, + "loss": 3.698, + "theoretical_loss": 4.769582477655177, + "tokens_seen": 100401152 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489679037111334, + "loss": 3.8029, + "theoretical_loss": 4.769151201191641, + "tokens_seen": 100466688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896690070210632, + "loss": 3.488, + "theoretical_loss": 4.768720284678228, + "tokens_seen": 100532224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896589769307924, + "loss": 3.7797, + "theoretical_loss": 4.768289727580161, + "tokens_seen": 100597760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896489468405215, + "loss": 3.4447, + "theoretical_loss": 4.767859529363809, + "tokens_seen": 100663296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896389167502507, + "loss": 3.7995, + "theoretical_loss": 4.767429689496682, + "tokens_seen": 100728832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896288866599799, + "loss": 3.8211, + "theoretical_loss": 4.767000207447417, + "tokens_seen": 100794368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896188565697091, + "loss": 3.5972, + "theoretical_loss": 4.766571082685794, + "tokens_seen": 100859904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896088264794384, + "loss": 3.8521, + "theoretical_loss": 4.766142314682716, + "tokens_seen": 100925440 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895987963891674, + "loss": 3.7264, + "theoretical_loss": 4.765713902910214, + "tokens_seen": 100990976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895887662988967, + "loss": 3.702, + "theoretical_loss": 4.765285846841444, + "tokens_seen": 101056512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895787362086259, + "loss": 3.8745, + "theoretical_loss": 4.76485814595068, + "tokens_seen": 101122048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895687061183551, + "loss": 3.8407, + "theoretical_loss": 4.764430799713314, + "tokens_seen": 101187584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895586760280843, + "loss": 3.5995, + "theoretical_loss": 4.764003807605853, + "tokens_seen": 101253120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895486459378135, + "loss": 3.6566, + "theoretical_loss": 4.763577169105912, + "tokens_seen": 101318656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895386158475426, + "loss": 3.7968, + "theoretical_loss": 4.763150883692218, + "tokens_seen": 101384192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895285857572718, + "loss": 3.6979, + "theoretical_loss": 4.762724950844598, + "tokens_seen": 101449728 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489518555667001, + "loss": 3.7237, + "theoretical_loss": 4.762299370043984, + "tokens_seen": 101515264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 179449, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7288663387298584, + "objective/train/theoretical_loss": 4.761874140772408, + "objective/train/tokens_used": 122040800, + "theoretical_loss": 4.761874140772408, + "tokens_seen": 101580800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895085255767302, + "loss": 3.8993, + "theoretical_loss": 4.761874140772408, + "tokens_seen": 101580800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894984954864594, + "loss": 3.523, + "theoretical_loss": 4.761449262512993, + "tokens_seen": 101646336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894884653961885, + "loss": 3.4939, + "theoretical_loss": 4.761024734749958, + "tokens_seen": 101711872 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894784353059178, + "loss": 3.8311, + "theoretical_loss": 4.76060055696861, + "tokens_seen": 101777408 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894684052156469, + "loss": 3.8815, + "theoretical_loss": 4.760176728655345, + "tokens_seen": 101842944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894583751253762, + "loss": 3.8344, + "theoretical_loss": 4.75975324929764, + "tokens_seen": 101908480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894483450351053, + "loss": 3.7128, + "theoretical_loss": 4.759330118384053, + "tokens_seen": 101974016 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894383149448346, + "loss": 3.7163, + "theoretical_loss": 4.758907335404221, + "tokens_seen": 102039552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894282848545637, + "loss": 3.5908, + "theoretical_loss": 4.758484899848854, + "tokens_seen": 102105088 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894182547642929, + "loss": 3.8424, + "theoretical_loss": 4.7580628112097365, + "tokens_seen": 102170624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894082246740221, + "loss": 3.6798, + "theoretical_loss": 4.7576410689797175, + "tokens_seen": 102236160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893981945837513, + "loss": 3.6937, + "theoretical_loss": 4.757219672652717, + "tokens_seen": 102301696 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893881644934805, + "loss": 3.6042, + "theoretical_loss": 4.756798621723712, + "tokens_seen": 102367232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893781344032096, + "loss": 3.8252, + "theoretical_loss": 4.756377915688748, + "tokens_seen": 102432768 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893681043129388, + "loss": 3.6281, + "theoretical_loss": 4.755957554044917, + "tokens_seen": 102498304 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489358074222668, + "loss": 3.634, + "theoretical_loss": 4.755537536290373, + "tokens_seen": 102563840 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893480441323972, + "loss": 3.5845, + "theoretical_loss": 4.755117861924321, + "tokens_seen": 102629376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893380140421264, + "loss": 3.7616, + "theoretical_loss": 4.754698530447009, + "tokens_seen": 102694912 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893279839518556, + "loss": 3.8346, + "theoretical_loss": 4.754279541359738, + "tokens_seen": 102760448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893179538615847, + "loss": 3.4694, + "theoretical_loss": 4.753860894164845, + "tokens_seen": 102825984 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893079237713139, + "loss": 3.8897, + "theoretical_loss": 4.75344258836571, + "tokens_seen": 102891520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892978936810432, + "loss": 3.5267, + "theoretical_loss": 4.753024623466752, + "tokens_seen": 102957056 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892878635907723, + "loss": 3.4275, + "theoretical_loss": 4.752606998973421, + "tokens_seen": 103022592 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892778335005016, + "loss": 3.5392, + "theoretical_loss": 4.752189714392202, + "tokens_seen": 103088128 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892678034102307, + "loss": 3.6427, + "theoretical_loss": 4.7517727692306035, + "tokens_seen": 103153664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 184726, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9697039127349854, + "objective/train/theoretical_loss": 4.751356162997164, + "objective/train/tokens_used": 123679200, + "theoretical_loss": 4.751356162997164, + "tokens_seen": 103219200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892577733199599, + "loss": 3.525, + "theoretical_loss": 4.751356162997164, + "tokens_seen": 103219200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892477432296891, + "loss": 3.6341, + "theoretical_loss": 4.750939895201443, + "tokens_seen": 103284736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892377131394183, + "loss": 3.6469, + "theoretical_loss": 4.750523965354024, + "tokens_seen": 103350272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892276830491475, + "loss": 3.4156, + "theoretical_loss": 4.750108372966501, + "tokens_seen": 103415808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892176529588767, + "loss": 3.5315, + "theoretical_loss": 4.749693117551491, + "tokens_seen": 103481344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892076228686058, + "loss": 3.6321, + "theoretical_loss": 4.749278198622617, + "tokens_seen": 103546880 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489197592778335, + "loss": 3.54, + "theoretical_loss": 4.748863615694514, + "tokens_seen": 103612416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891875626880642, + "loss": 3.6321, + "theoretical_loss": 4.748449368282822, + "tokens_seen": 103677952 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891775325977934, + "loss": 3.7374, + "theoretical_loss": 4.748035455904185, + "tokens_seen": 103743488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891675025075226, + "loss": 3.7389, + "theoretical_loss": 4.747621878076252, + "tokens_seen": 103809024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891574724172517, + "loss": 3.8047, + "theoretical_loss": 4.747208634317664, + "tokens_seen": 103874560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891474423269809, + "loss": 3.6744, + "theoretical_loss": 4.746795724148061, + "tokens_seen": 103940096 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891374122367101, + "loss": 3.9085, + "theoretical_loss": 4.746383147088078, + "tokens_seen": 104005632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891273821464393, + "loss": 3.471, + "theoretical_loss": 4.745970902659338, + "tokens_seen": 104071168 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891173520561686, + "loss": 3.8325, + "theoretical_loss": 4.745558990384451, + "tokens_seen": 104136704 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891073219658976, + "loss": 3.523, + "theoretical_loss": 4.7451474097870125, + "tokens_seen": 104202240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890972918756269, + "loss": 3.5334, + "theoretical_loss": 4.744736160391602, + "tokens_seen": 104267776 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890872617853561, + "loss": 3.6145, + "theoretical_loss": 4.744325241723777, + "tokens_seen": 104333312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890772316950853, + "loss": 3.7004, + "theoretical_loss": 4.743914653310073, + "tokens_seen": 104398848 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890672016048145, + "loss": 3.674, + "theoretical_loss": 4.743504394678, + "tokens_seen": 104464384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890571715145437, + "loss": 3.8081, + "theoretical_loss": 4.743094465356039, + "tokens_seen": 104529920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890471414242728, + "loss": 3.7858, + "theoretical_loss": 4.742684864873641, + "tokens_seen": 104595456 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489037111334002, + "loss": 3.5326, + "theoretical_loss": 4.742275592761223, + "tokens_seen": 104660992 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890270812437312, + "loss": 3.5429, + "theoretical_loss": 4.741866648550168, + "tokens_seen": 104726528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890170511534604, + "loss": 3.4406, + "theoretical_loss": 4.741458031772817, + "tokens_seen": 104792064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 187033, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.107706069946289, + "objective/train/theoretical_loss": 4.741049741962473, + "objective/train/tokens_used": 125317600, + "theoretical_loss": 4.741049741962473, + "tokens_seen": 104857600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890070210631896, + "loss": 3.9333, + "theoretical_loss": 4.741049741962473, + "tokens_seen": 104857600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889969909729187, + "loss": 3.5559, + "theoretical_loss": 4.740641778653395, + "tokens_seen": 104923136 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889869608826479, + "loss": 3.4574, + "theoretical_loss": 4.740234141380794, + "tokens_seen": 104988672 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889769307923771, + "loss": 3.7356, + "theoretical_loss": 4.739826829680833, + "tokens_seen": 105054208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889669007021063, + "loss": 3.472, + "theoretical_loss": 4.739419843090626, + "tokens_seen": 105119744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889568706118355, + "loss": 3.7092, + "theoretical_loss": 4.739013181148229, + "tokens_seen": 105185280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889468405215647, + "loss": 3.8201, + "theoretical_loss": 4.738606843392644, + "tokens_seen": 105250816 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889368104312939, + "loss": 3.4478, + "theoretical_loss": 4.738200829363815, + "tokens_seen": 105316352 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488926780341023, + "loss": 3.9938, + "theoretical_loss": 4.737795138602624, + "tokens_seen": 105381888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889167502507523, + "loss": 3.7457, + "theoretical_loss": 4.737389770650887, + "tokens_seen": 105447424 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889067201604815, + "loss": 3.6661, + "theoretical_loss": 4.736984725051357, + "tokens_seen": 105512960 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888966900702107, + "loss": 3.6371, + "theoretical_loss": 4.736580001347717, + "tokens_seen": 105578496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888866599799398, + "loss": 3.7063, + "theoretical_loss": 4.736175599084576, + "tokens_seen": 105644032 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488876629889669, + "loss": 3.7422, + "theoretical_loss": 4.735771517807473, + "tokens_seen": 105709568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888665997993982, + "loss": 3.6926, + "theoretical_loss": 4.735367757062869, + "tokens_seen": 105775104 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888565697091274, + "loss": 3.616, + "theoretical_loss": 4.734964316398148, + "tokens_seen": 105840640 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888465396188566, + "loss": 3.5101, + "theoretical_loss": 4.734561195361609, + "tokens_seen": 105906176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888365095285858, + "loss": 3.5185, + "theoretical_loss": 4.734158393502471, + "tokens_seen": 105971712 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888264794383149, + "loss": 3.8055, + "theoretical_loss": 4.733755910370867, + "tokens_seen": 106037248 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888164493480441, + "loss": 3.6915, + "theoretical_loss": 4.73335374551784, + "tokens_seen": 106102784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888064192577733, + "loss": 3.5202, + "theoretical_loss": 4.732951898495341, + "tokens_seen": 106168320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887963891675025, + "loss": 3.675, + "theoretical_loss": 4.7325503688562325, + "tokens_seen": 106233856 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887863590772317, + "loss": 3.4758, + "theoretical_loss": 4.732149156154276, + "tokens_seen": 106299392 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488776328986961, + "loss": 3.7876, + "theoretical_loss": 4.731748259944139, + "tokens_seen": 106364928 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048876629889669, + "loss": 3.649, + "theoretical_loss": 4.731347679781386, + "tokens_seen": 106430464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 192056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8580915927886963, + "objective/train/theoretical_loss": 4.730947415222481, + "objective/train/tokens_used": 126956000, + "theoretical_loss": 4.730947415222481, + "tokens_seen": 106496000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887562688064193, + "loss": 3.6611, + "theoretical_loss": 4.730947415222481, + "tokens_seen": 106496000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887462387161484, + "loss": 3.8264, + "theoretical_loss": 4.730547465824781, + "tokens_seen": 106561536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887362086258777, + "loss": 3.863, + "theoretical_loss": 4.730147831146537, + "tokens_seen": 106627072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887261785356069, + "loss": 3.5344, + "theoretical_loss": 4.72974851074689, + "tokens_seen": 106692608 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488716148445336, + "loss": 3.4504, + "theoretical_loss": 4.729349504185867, + "tokens_seen": 106758144 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887061183550652, + "loss": 3.642, + "theoretical_loss": 4.728950811024383, + "tokens_seen": 106823680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886960882647944, + "loss": 3.6794, + "theoretical_loss": 4.7285524308242355, + "tokens_seen": 106889216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886860581745236, + "loss": 3.2107, + "theoretical_loss": 4.728154363148102, + "tokens_seen": 106954752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886760280842528, + "loss": 3.6065, + "theoretical_loss": 4.72775660755954, + "tokens_seen": 107020288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886659979939819, + "loss": 3.9955, + "theoretical_loss": 4.72735916362298, + "tokens_seen": 107085824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886559679037111, + "loss": 3.6974, + "theoretical_loss": 4.7269620309037315, + "tokens_seen": 107151360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886459378134403, + "loss": 3.728, + "theoretical_loss": 4.726565208967973, + "tokens_seen": 107216896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886359077231695, + "loss": 3.4342, + "theoretical_loss": 4.726168697382751, + "tokens_seen": 107282432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886258776328988, + "loss": 3.2861, + "theoretical_loss": 4.725772495715983, + "tokens_seen": 107347968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886158475426278, + "loss": 3.7293, + "theoretical_loss": 4.725376603536446, + "tokens_seen": 107413504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886058174523571, + "loss": 3.964, + "theoretical_loss": 4.724981020413787, + "tokens_seen": 107479040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885957873620863, + "loss": 3.6411, + "theoretical_loss": 4.724585745918505, + "tokens_seen": 107544576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885857572718155, + "loss": 3.6381, + "theoretical_loss": 4.7241907796219635, + "tokens_seen": 107610112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885757271815447, + "loss": 3.6016, + "theoretical_loss": 4.723796121096381, + "tokens_seen": 107675648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885656970912739, + "loss": 3.3598, + "theoretical_loss": 4.723401769914824, + "tokens_seen": 107741184 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488555667001003, + "loss": 3.731, + "theoretical_loss": 4.723007725651219, + "tokens_seen": 107806720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885456369107322, + "loss": 3.6714, + "theoretical_loss": 4.722613987880335, + "tokens_seen": 107872256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885356068204614, + "loss": 3.6898, + "theoretical_loss": 4.722220556177792, + "tokens_seen": 107937792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885255767301906, + "loss": 3.609, + "theoretical_loss": 4.721827430120053, + "tokens_seen": 108003328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885155466399198, + "loss": 3.4883, + "theoretical_loss": 4.721434609284424, + "tokens_seen": 108068864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 197231, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.762702465057373, + "objective/train/theoretical_loss": 4.721042093249051, + "objective/train/tokens_used": 128594400, + "theoretical_loss": 4.721042093249051, + "tokens_seen": 108134400 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488505516549649, + "loss": 3.6925, + "theoretical_loss": 4.721042093249051, + "tokens_seen": 108134400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884954864593781, + "loss": 3.4986, + "theoretical_loss": 4.720649881592919, + "tokens_seen": 108199936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884854563691073, + "loss": 3.4754, + "theoretical_loss": 4.7202579738958494, + "tokens_seen": 108265472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884754262788365, + "loss": 3.3368, + "theoretical_loss": 4.7198663697384955, + "tokens_seen": 108331008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884653961885657, + "loss": 3.6466, + "theoretical_loss": 4.719475068702346, + "tokens_seen": 108396544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884553660982949, + "loss": 3.5481, + "theoretical_loss": 4.719084070369714, + "tokens_seen": 108462080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884453360080241, + "loss": 3.5884, + "theoretical_loss": 4.718693374323747, + "tokens_seen": 108527616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884353059177532, + "loss": 3.8041, + "theoretical_loss": 4.718302980148412, + "tokens_seen": 108593152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884252758274825, + "loss": 3.8141, + "theoretical_loss": 4.717912887428501, + "tokens_seen": 108658688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884152457372117, + "loss": 3.5719, + "theoretical_loss": 4.717523095749626, + "tokens_seen": 108724224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884052156469409, + "loss": 3.7324, + "theoretical_loss": 4.717133604698222, + "tokens_seen": 108789760 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048839518555667, + "loss": 3.6457, + "theoretical_loss": 4.7167444138615355, + "tokens_seen": 108855296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883851554663992, + "loss": 3.4354, + "theoretical_loss": 4.716355522827633, + "tokens_seen": 108920832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883751253761284, + "loss": 3.6252, + "theoretical_loss": 4.715966931185388, + "tokens_seen": 108986368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883650952858576, + "loss": 3.591, + "theoretical_loss": 4.715578638524491, + "tokens_seen": 109051904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883550651955868, + "loss": 3.6158, + "theoretical_loss": 4.715190644435435, + "tokens_seen": 109117440 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488345035105316, + "loss": 3.6187, + "theoretical_loss": 4.714802948509522, + "tokens_seen": 109182976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883350050150451, + "loss": 3.7455, + "theoretical_loss": 4.71441555033886, + "tokens_seen": 109248512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883249749247743, + "loss": 3.712, + "theoretical_loss": 4.714028449516356, + "tokens_seen": 109314048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883149448345035, + "loss": 3.7868, + "theoretical_loss": 4.713641645635718, + "tokens_seen": 109379584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883049147442327, + "loss": 3.4678, + "theoretical_loss": 4.713255138291454, + "tokens_seen": 109445120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882948846539619, + "loss": 3.6966, + "theoretical_loss": 4.712868927078868, + "tokens_seen": 109510656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882848545636911, + "loss": 3.7838, + "theoretical_loss": 4.712483011594056, + "tokens_seen": 109576192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882748244734203, + "loss": 3.3161, + "theoretical_loss": 4.7120973914339075, + "tokens_seen": 109641728 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048826479438314946, + "loss": 3.8165, + "theoretical_loss": 4.7117120661961005, + "tokens_seen": 109707264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 198319, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8059911727905273, + "objective/train/theoretical_loss": 4.711327035479103, + "objective/train/tokens_used": 130232800, + "theoretical_loss": 4.711327035479103, + "tokens_seen": 109772800 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048825476429287864, + "loss": 3.6663, + "theoretical_loss": 4.711327035479103, + "tokens_seen": 109772800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882447342026078, + "loss": 3.4004, + "theoretical_loss": 4.710942298882169, + "tokens_seen": 109838336 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488234704112337, + "loss": 3.5996, + "theoretical_loss": 4.710557856005335, + "tokens_seen": 109903872 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048822467402206624, + "loss": 3.7299, + "theoretical_loss": 4.710173706449419, + "tokens_seen": 109969408 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048821464393179536, + "loss": 3.6217, + "theoretical_loss": 4.709789849816021, + "tokens_seen": 110034944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882046138415246, + "loss": 3.8175, + "theoretical_loss": 4.7094062857075185, + "tokens_seen": 110100480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881945837512537, + "loss": 3.5329, + "theoretical_loss": 4.709023013727063, + "tokens_seen": 110166016 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048818455366098296, + "loss": 3.5599, + "theoretical_loss": 4.708640033478584, + "tokens_seen": 110231552 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048817452357071214, + "loss": 3.4719, + "theoretical_loss": 4.708257344566778, + "tokens_seen": 110297088 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881644934804413, + "loss": 3.627, + "theoretical_loss": 4.7078749465971175, + "tokens_seen": 110362624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881544633901705, + "loss": 3.7143, + "theoretical_loss": 4.707492839175837, + "tokens_seen": 110428160 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048814443329989974, + "loss": 3.5303, + "theoretical_loss": 4.707111021909941, + "tokens_seen": 110493696 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048813440320962887, + "loss": 3.5631, + "theoretical_loss": 4.706729494407197, + "tokens_seen": 110559232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881243731193581, + "loss": 3.4962, + "theoretical_loss": 4.706348256276138, + "tokens_seen": 110624768 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048811434302908723, + "loss": 3.6254, + "theoretical_loss": 4.705967307126051, + "tokens_seen": 110690304 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048810431293881646, + "loss": 3.5333, + "theoretical_loss": 4.705586646566987, + "tokens_seen": 110755840 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048809428284854564, + "loss": 3.386, + "theoretical_loss": 4.705206274209751, + "tokens_seen": 110821376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004880842527582748, + "loss": 3.6736, + "theoretical_loss": 4.704826189665905, + "tokens_seen": 110886912 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488074222668004, + "loss": 3.4569, + "theoretical_loss": 4.704446392547759, + "tokens_seen": 110952448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004880641925777332, + "loss": 3.5925, + "theoretical_loss": 4.7040668824683785, + "tokens_seen": 111017984 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048805416248746237, + "loss": 3.6843, + "theoretical_loss": 4.7036876590415755, + "tokens_seen": 111083520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004880441323971916, + "loss": 3.8082, + "theoretical_loss": 4.7033087218819105, + "tokens_seen": 111149056 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048803410230692073, + "loss": 3.5422, + "theoretical_loss": 4.7029300706046895, + "tokens_seen": 111214592 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048802407221664997, + "loss": 3.7771, + "theoretical_loss": 4.702551704825957, + "tokens_seen": 111280128 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048801404212637915, + "loss": 3.6929, + "theoretical_loss": 4.702173624162507, + "tokens_seen": 111345664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 199018, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2992162704467773, + "objective/train/theoretical_loss": 4.701795828231866, + "objective/train/tokens_used": 131871200, + "theoretical_loss": 4.701795828231866, + "tokens_seen": 111411200 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048800401203610833, + "loss": 3.3269, + "theoretical_loss": 4.701795828231866, + "tokens_seen": 111411200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879939819458375, + "loss": 3.4583, + "theoretical_loss": 4.701418316652299, + "tokens_seen": 111476736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879839518555667, + "loss": 3.5179, + "theoretical_loss": 4.701041089042813, + "tokens_seen": 111542272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879739217652959, + "loss": 3.5889, + "theoretical_loss": 4.700664145023142, + "tokens_seen": 111607808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879638916750251, + "loss": 3.6855, + "theoretical_loss": 4.700287484213753, + "tokens_seen": 111673344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879538615847543, + "loss": 3.6633, + "theoretical_loss": 4.699911106235849, + "tokens_seen": 111738880 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048794383149448347, + "loss": 3.3769, + "theoretical_loss": 4.6995350107113545, + "tokens_seen": 111804416 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048793380140421265, + "loss": 3.4899, + "theoretical_loss": 4.699159197262922, + "tokens_seen": 111869952 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048792377131394183, + "loss": 3.4922, + "theoretical_loss": 4.698783665513934, + "tokens_seen": 111935488 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048791374122367107, + "loss": 3.6508, + "theoretical_loss": 4.698408415088491, + "tokens_seen": 112001024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879037111334002, + "loss": 3.644, + "theoretical_loss": 4.698033445611415, + "tokens_seen": 112066560 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048789368104312943, + "loss": 3.6637, + "theoretical_loss": 4.6976587567082495, + "tokens_seen": 112132096 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048788365095285856, + "loss": 3.3232, + "theoretical_loss": 4.697284348005253, + "tokens_seen": 112197632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004878736208625878, + "loss": 3.6476, + "theoretical_loss": 4.696910219129402, + "tokens_seen": 112263168 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048786359077231697, + "loss": 3.7732, + "theoretical_loss": 4.696536369708386, + "tokens_seen": 112328704 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048785356068204615, + "loss": 3.5794, + "theoretical_loss": 4.696162799370606, + "tokens_seen": 112394240 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048784353059177533, + "loss": 3.5167, + "theoretical_loss": 4.695789507745176, + "tokens_seen": 112459776 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048783350050150457, + "loss": 3.508, + "theoretical_loss": 4.695416494461917, + "tokens_seen": 112525312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004878234704112337, + "loss": 3.6201, + "theoretical_loss": 4.695043759151353, + "tokens_seen": 112590848 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048781344032096293, + "loss": 3.5772, + "theoretical_loss": 4.694671301444722, + "tokens_seen": 112656384 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048780341023069206, + "loss": 3.511, + "theoretical_loss": 4.694299120973957, + "tokens_seen": 112721920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877933801404213, + "loss": 3.4497, + "theoretical_loss": 4.693927217371698, + "tokens_seen": 112787456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877833500501505, + "loss": 3.6642, + "theoretical_loss": 4.693555590271282, + "tokens_seen": 112852992 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048777331995987966, + "loss": 3.1849, + "theoretical_loss": 4.693184239306744, + "tokens_seen": 112918528 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048776328986960884, + "loss": 3.7865, + "theoretical_loss": 4.692813164112819, + "tokens_seen": 112984064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 200274, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.979872226715088, + "objective/train/theoretical_loss": 4.692442364324931, + "objective/train/tokens_used": 133509600, + "theoretical_loss": 4.692442364324931, + "tokens_seen": 113049600 + }, + { + "epoch": 0.03, + "learning_rate": 0.000487753259779338, + "loss": 3.2723, + "theoretical_loss": 4.692442364324931, + "tokens_seen": 113049600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877432296890672, + "loss": 3.5444, + "theoretical_loss": 4.692071839579201, + "tokens_seen": 113115136 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048773319959879644, + "loss": 3.4821, + "theoretical_loss": 4.6917015895124425, + "tokens_seen": 113180672 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048772316950852556, + "loss": 3.8326, + "theoretical_loss": 4.691331613762153, + "tokens_seen": 113246208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877131394182548, + "loss": 3.4582, + "theoretical_loss": 4.690961911966523, + "tokens_seen": 113311744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877031093279839, + "loss": 3.3548, + "theoretical_loss": 4.690592483764427, + "tokens_seen": 113377280 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048769307923771316, + "loss": 3.4375, + "theoretical_loss": 4.690223328795424, + "tokens_seen": 113442816 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048768304914744234, + "loss": 3.5515, + "theoretical_loss": 4.689854446699757, + "tokens_seen": 113508352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004876730190571715, + "loss": 3.5217, + "theoretical_loss": 4.689485837118347, + "tokens_seen": 113573888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004876629889669007, + "loss": 3.5703, + "theoretical_loss": 4.689117499692798, + "tokens_seen": 113639424 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048765295887662994, + "loss": 3.568, + "theoretical_loss": 4.688749434065389, + "tokens_seen": 113704960 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048764292878635907, + "loss": 3.5214, + "theoretical_loss": 4.688381639879076, + "tokens_seen": 113770496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004876328986960883, + "loss": 3.7132, + "theoretical_loss": 4.68801411677749, + "tokens_seen": 113836032 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048762286860581743, + "loss": 3.6747, + "theoretical_loss": 4.687646864404934, + "tokens_seen": 113901568 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048761283851554666, + "loss": 3.7008, + "theoretical_loss": 4.687279882406381, + "tokens_seen": 113967104 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048760280842527584, + "loss": 3.6326, + "theoretical_loss": 4.686913170427477, + "tokens_seen": 114032640 + }, + { + "epoch": 0.03, + "learning_rate": 0.000487592778335005, + "loss": 3.5939, + "theoretical_loss": 4.68654672811453, + "tokens_seen": 114098176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875827482447342, + "loss": 3.5751, + "theoretical_loss": 4.68618055511452, + "tokens_seen": 114163712 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875727181544634, + "loss": 3.4383, + "theoretical_loss": 4.685814651075088, + "tokens_seen": 114229248 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048756268806419257, + "loss": 3.484, + "theoretical_loss": 4.685449015644537, + "tokens_seen": 114294784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875526579739218, + "loss": 3.5631, + "theoretical_loss": 4.685083648471835, + "tokens_seen": 114360320 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048754262788365093, + "loss": 3.592, + "theoretical_loss": 4.684718549206607, + "tokens_seen": 114425856 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048753259779338017, + "loss": 3.7434, + "theoretical_loss": 4.6843537174991345, + "tokens_seen": 114491392 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048752256770310935, + "loss": 3.5416, + "theoretical_loss": 4.6839891530003595, + "tokens_seen": 114556928 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048751253761283853, + "loss": 3.8109, + "theoretical_loss": 4.683624855361876, + "tokens_seen": 114622464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 200959, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.169064998626709, + "objective/train/theoretical_loss": 4.68326082423593, + "objective/train/tokens_used": 135148000, + "theoretical_loss": 4.68326082423593, + "tokens_seen": 114688000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875025075225677, + "loss": 3.5971, + "theoretical_loss": 4.68326082423593, + "tokens_seen": 114688000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874924774322969, + "loss": 3.6054, + "theoretical_loss": 4.682897059275422, + "tokens_seen": 114753536 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048748244734202607, + "loss": 3.5122, + "theoretical_loss": 4.682533560133901, + "tokens_seen": 114819072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874724172517553, + "loss": 3.5253, + "theoretical_loss": 4.682170326465565, + "tokens_seen": 114884608 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048746238716148443, + "loss": 3.4639, + "theoretical_loss": 4.681807357925257, + "tokens_seen": 114950144 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048745235707121367, + "loss": 3.4482, + "theoretical_loss": 4.681444654168468, + "tokens_seen": 115015680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874423269809428, + "loss": 3.4827, + "theoretical_loss": 4.68108221485133, + "tokens_seen": 115081216 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048743229689067203, + "loss": 3.5963, + "theoretical_loss": 4.680720039630617, + "tokens_seen": 115146752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874222668004012, + "loss": 3.604, + "theoretical_loss": 4.680358128163747, + "tokens_seen": 115212288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874122367101304, + "loss": 3.575, + "theoretical_loss": 4.679996480108773, + "tokens_seen": 115277824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874022066198596, + "loss": 3.6315, + "theoretical_loss": 4.6796350951243895, + "tokens_seen": 115343360 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048739217652958876, + "loss": 3.5988, + "theoretical_loss": 4.679273972869922, + "tokens_seen": 115408896 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048738214643931794, + "loss": 3.544, + "theoretical_loss": 4.678913113005333, + "tokens_seen": 115474432 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048737211634904717, + "loss": 3.434, + "theoretical_loss": 4.6785525151912175, + "tokens_seen": 115539968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873620862587763, + "loss": 3.7679, + "theoretical_loss": 4.678192179088802, + "tokens_seen": 115605504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048735205616850553, + "loss": 3.3392, + "theoretical_loss": 4.6778321043599425, + "tokens_seen": 115671040 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873420260782347, + "loss": 3.6271, + "theoretical_loss": 4.677472290667122, + "tokens_seen": 115736576 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873319959879639, + "loss": 3.5469, + "theoretical_loss": 4.677112737673453, + "tokens_seen": 115802112 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873219658976931, + "loss": 3.7895, + "theoretical_loss": 4.676753445042669, + "tokens_seen": 115867648 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048731193580742226, + "loss": 3.395, + "theoretical_loss": 4.676394412439132, + "tokens_seen": 115933184 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048730190571715144, + "loss": 3.4239, + "theoretical_loss": 4.6760356395278215, + "tokens_seen": 115998720 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872918756268807, + "loss": 3.8464, + "theoretical_loss": 4.675677125974339, + "tokens_seen": 116064256 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872818455366098, + "loss": 3.6943, + "theoretical_loss": 4.675318871444908, + "tokens_seen": 116129792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048727181544633904, + "loss": 3.2368, + "theoretical_loss": 4.674960875606366, + "tokens_seen": 116195328 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048726178535606816, + "loss": 3.5752, + "theoretical_loss": 4.674603138126168, + "tokens_seen": 116260864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 202277, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6310184001922607, + "objective/train/theoretical_loss": 4.674245658672382, + "objective/train/tokens_used": 136786400, + "theoretical_loss": 4.674245658672382, + "tokens_seen": 116326400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872517552657974, + "loss": 3.3857, + "theoretical_loss": 4.674245658672382, + "tokens_seen": 116326400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872417251755266, + "loss": 3.6198, + "theoretical_loss": 4.673888436913694, + "tokens_seen": 116391936 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048723169508525576, + "loss": 3.5595, + "theoretical_loss": 4.673531472519397, + "tokens_seen": 116457472 + }, + { + "epoch": 0.04, + "learning_rate": 0.000487221664994985, + "loss": 3.6411, + "theoretical_loss": 4.673174765159393, + "tokens_seen": 116523008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872116349047141, + "loss": 3.5595, + "theoretical_loss": 4.672818314504198, + "tokens_seen": 116588544 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048720160481444336, + "loss": 3.5692, + "theoretical_loss": 4.6724621202249335, + "tokens_seen": 116654080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048719157472417254, + "loss": 3.3932, + "theoretical_loss": 4.672106181993324, + "tokens_seen": 116719616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871815446339017, + "loss": 3.5326, + "theoretical_loss": 4.6717504994817, + "tokens_seen": 116785152 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871715145436309, + "loss": 3.2663, + "theoretical_loss": 4.671395072362996, + "tokens_seen": 116850688 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048716148445336014, + "loss": 3.565, + "theoretical_loss": 4.671039900310747, + "tokens_seen": 116916224 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048715145436308927, + "loss": 3.582, + "theoretical_loss": 4.670684982999088, + "tokens_seen": 116981760 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871414242728185, + "loss": 3.6837, + "theoretical_loss": 4.670330320102753, + "tokens_seen": 117047296 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048713139418254763, + "loss": 3.6938, + "theoretical_loss": 4.669975911297072, + "tokens_seen": 117112832 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048712136409227686, + "loss": 3.5719, + "theoretical_loss": 4.669621756257971, + "tokens_seen": 117178368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048711133400200604, + "loss": 3.6868, + "theoretical_loss": 4.669267854661973, + "tokens_seen": 117243904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871013039117352, + "loss": 3.3895, + "theoretical_loss": 4.668914206186189, + "tokens_seen": 117309440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870912738214644, + "loss": 3.4743, + "theoretical_loss": 4.6685608105083265, + "tokens_seen": 117374976 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870812437311936, + "loss": 3.4699, + "theoretical_loss": 4.66820766730668, + "tokens_seen": 117440512 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048707121364092277, + "loss": 3.727, + "theoretical_loss": 4.667854776260132, + "tokens_seen": 117506048 + }, + { + "epoch": 0.04, + "learning_rate": 0.000487061183550652, + "loss": 3.5116, + "theoretical_loss": 4.667502137048155, + "tokens_seen": 117571584 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048705115346038113, + "loss": 3.4195, + "theoretical_loss": 4.667149749350805, + "tokens_seen": 117637120 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048704112337011037, + "loss": 3.6593, + "theoretical_loss": 4.666797612848723, + "tokens_seen": 117702656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048703109327983955, + "loss": 3.3363, + "theoretical_loss": 4.666445727223134, + "tokens_seen": 117768192 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048702106318956873, + "loss": 3.5724, + "theoretical_loss": 4.666094092155843, + "tokens_seen": 117833728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870110330992979, + "loss": 3.4272, + "theoretical_loss": 4.665742707329238, + "tokens_seen": 117899264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 203088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.953324317932129, + "objective/train/theoretical_loss": 4.665391572426282, + "objective/train/tokens_used": 138424800, + "theoretical_loss": 4.665391572426282, + "tokens_seen": 117964800 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870010030090271, + "loss": 3.5257, + "theoretical_loss": 4.665391572426282, + "tokens_seen": 117964800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048699097291875627, + "loss": 3.6506, + "theoretical_loss": 4.665040687130518, + "tokens_seen": 118030336 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869809428284855, + "loss": 3.5996, + "theoretical_loss": 4.664690051126065, + "tokens_seen": 118095872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048697091273821463, + "loss": 3.5075, + "theoretical_loss": 4.664339664097617, + "tokens_seen": 118161408 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048696088264794387, + "loss": 3.6508, + "theoretical_loss": 4.66398952573044, + "tokens_seen": 118226944 + }, + { + "epoch": 0.04, + "learning_rate": 0.000486950852557673, + "loss": 3.7334, + "theoretical_loss": 4.663639635710373, + "tokens_seen": 118292480 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048694082246740223, + "loss": 3.6102, + "theoretical_loss": 4.663289993723826, + "tokens_seen": 118358016 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869307923771314, + "loss": 3.5511, + "theoretical_loss": 4.662940599457777, + "tokens_seen": 118423552 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869207622868606, + "loss": 3.5001, + "theoretical_loss": 4.662591452599774, + "tokens_seen": 118489088 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869107321965898, + "loss": 3.485, + "theoretical_loss": 4.662242552837929, + "tokens_seen": 118554624 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048690070210631896, + "loss": 3.6105, + "theoretical_loss": 4.661893899860923, + "tokens_seen": 118620160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048689067201604814, + "loss": 3.7023, + "theoretical_loss": 4.6615454933579965, + "tokens_seen": 118685696 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048688064192577737, + "loss": 3.4614, + "theoretical_loss": 4.661197333018957, + "tokens_seen": 118751232 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868706118355065, + "loss": 3.5773, + "theoretical_loss": 4.66084941853417, + "tokens_seen": 118816768 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048686058174523573, + "loss": 3.6587, + "theoretical_loss": 4.6605017495945615, + "tokens_seen": 118882304 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868505516549649, + "loss": 3.7256, + "theoretical_loss": 4.660154325891618, + "tokens_seen": 118947840 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868405215646941, + "loss": 3.6683, + "theoretical_loss": 4.659807147117382, + "tokens_seen": 119013376 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868304914744233, + "loss": 3.5433, + "theoretical_loss": 4.6594602129644525, + "tokens_seen": 119078912 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048682046138415246, + "loss": 3.6163, + "theoretical_loss": 4.659113523125981, + "tokens_seen": 119144448 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048681043129388164, + "loss": 3.6002, + "theoretical_loss": 4.6587670772956775, + "tokens_seen": 119209984 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868004012036109, + "loss": 3.744, + "theoretical_loss": 4.658420875167799, + "tokens_seen": 119275520 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048679037111334, + "loss": 3.5674, + "theoretical_loss": 4.658074916437155, + "tokens_seen": 119341056 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048678034102306924, + "loss": 3.705, + "theoretical_loss": 4.657729200799105, + "tokens_seen": 119406592 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048677031093279836, + "loss": 3.5421, + "theoretical_loss": 4.657383727949558, + "tokens_seen": 119472128 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867602808425276, + "loss": 3.2356, + "theoretical_loss": 4.657038497584967, + "tokens_seen": 119537664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 204120, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.542322874069214, + "objective/train/theoretical_loss": 4.656693509402331, + "objective/train/tokens_used": 140063200, + "theoretical_loss": 4.656693509402331, + "tokens_seen": 119603200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867502507522568, + "loss": 3.5035, + "theoretical_loss": 4.656693509402331, + "tokens_seen": 119603200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048674022066198596, + "loss": 3.6295, + "theoretical_loss": 4.6563487630991975, + "tokens_seen": 119668736 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048673019057171514, + "loss": 3.6794, + "theoretical_loss": 4.656004258373651, + "tokens_seen": 119734272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867201604814443, + "loss": 3.4914, + "theoretical_loss": 4.655659994924323, + "tokens_seen": 119799808 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867101303911735, + "loss": 3.4807, + "theoretical_loss": 4.655315972450383, + "tokens_seen": 119865344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048670010030090274, + "loss": 3.6322, + "theoretical_loss": 4.65497219065154, + "tokens_seen": 119930880 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048669007021063187, + "loss": 3.5918, + "theoretical_loss": 4.654628649228041, + "tokens_seen": 119996416 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004866800401203611, + "loss": 3.6249, + "theoretical_loss": 4.654285347880672, + "tokens_seen": 120061952 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004866700100300903, + "loss": 3.6893, + "theoretical_loss": 4.653942286310749, + "tokens_seen": 120127488 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048665997993981947, + "loss": 3.7484, + "theoretical_loss": 4.653599464220129, + "tokens_seen": 120193024 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048664994984954865, + "loss": 3.6358, + "theoretical_loss": 4.653256881311198, + "tokens_seen": 120258560 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048663991975927783, + "loss": 3.5567, + "theoretical_loss": 4.6529145372868745, + "tokens_seen": 120324096 + }, + { + "epoch": 0.04, + "learning_rate": 0.000486629889669007, + "loss": 3.519, + "theoretical_loss": 4.652572431850608, + "tokens_seen": 120389632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048661985957873624, + "loss": 3.4446, + "theoretical_loss": 4.652230564706377, + "tokens_seen": 120455168 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048660982948846537, + "loss": 3.4514, + "theoretical_loss": 4.651888935558688, + "tokens_seen": 120520704 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865997993981946, + "loss": 3.3068, + "theoretical_loss": 4.651547544112575, + "tokens_seen": 120586240 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048658976930792373, + "loss": 3.7197, + "theoretical_loss": 4.651206390073597, + "tokens_seen": 120651776 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048657973921765297, + "loss": 3.4481, + "theoretical_loss": 4.650865473147837, + "tokens_seen": 120717312 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048656970912738215, + "loss": 3.3836, + "theoretical_loss": 4.650524793041903, + "tokens_seen": 120782848 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048655967903711133, + "loss": 3.259, + "theoretical_loss": 4.650184349462922, + "tokens_seen": 120848384 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865496489468405, + "loss": 3.4753, + "theoretical_loss": 4.649844142118544, + "tokens_seen": 120913920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048653961885656975, + "loss": 3.6915, + "theoretical_loss": 4.6495041707169396, + "tokens_seen": 120979456 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865295887662989, + "loss": 3.562, + "theoretical_loss": 4.649164434966794, + "tokens_seen": 121044992 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865195586760281, + "loss": 3.4441, + "theoretical_loss": 4.648824934577313, + "tokens_seen": 121110528 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048650952858575724, + "loss": 3.7728, + "theoretical_loss": 4.648485669258216, + "tokens_seen": 121176064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 204902, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.444392681121826, + "objective/train/theoretical_loss": 4.648146638719739, + "objective/train/tokens_used": 141701600, + "theoretical_loss": 4.648146638719739, + "tokens_seen": 121241600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048649949849548647, + "loss": 3.7584, + "theoretical_loss": 4.648146638719739, + "tokens_seen": 121241600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048648946840521565, + "loss": 3.3618, + "theoretical_loss": 4.647807842672631, + "tokens_seen": 121307136 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048647943831494483, + "loss": 3.6952, + "theoretical_loss": 4.647469280828153, + "tokens_seen": 121372672 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048646940822467407, + "loss": 3.4908, + "theoretical_loss": 4.647130952898077, + "tokens_seen": 121438208 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004864593781344032, + "loss": 3.9747, + "theoretical_loss": 4.646792858594686, + "tokens_seen": 121503744 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048644934804413243, + "loss": 3.3985, + "theoretical_loss": 4.64645499763077, + "tokens_seen": 121569280 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004864393179538616, + "loss": 3.4271, + "theoretical_loss": 4.646117369719629, + "tokens_seen": 121634816 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004864292878635908, + "loss": 3.5924, + "theoretical_loss": 4.645779974575069, + "tokens_seen": 121700352 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048641925777332, + "loss": 3.4433, + "theoretical_loss": 4.6454428119113995, + "tokens_seen": 121765888 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048640922768304916, + "loss": 3.6515, + "theoretical_loss": 4.6451058814434365, + "tokens_seen": 121831424 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048639919759277834, + "loss": 3.8542, + "theoretical_loss": 4.644769182886495, + "tokens_seen": 121896960 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048638916750250757, + "loss": 3.6482, + "theoretical_loss": 4.644432715956399, + "tokens_seen": 121962496 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863791374122367, + "loss": 3.4727, + "theoretical_loss": 4.644096480369466, + "tokens_seen": 122028032 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048636910732196593, + "loss": 3.4828, + "theoretical_loss": 4.643760475842518, + "tokens_seen": 122093568 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863590772316951, + "loss": 3.3981, + "theoretical_loss": 4.6434247020928705, + "tokens_seen": 122159104 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863490471414243, + "loss": 3.4389, + "theoretical_loss": 4.643089158838341, + "tokens_seen": 122224640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863390170511535, + "loss": 3.7861, + "theoretical_loss": 4.642753845797243, + "tokens_seen": 122290176 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048632898696088266, + "loss": 3.4705, + "theoretical_loss": 4.642418762688379, + "tokens_seen": 122355712 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048631895687061184, + "loss": 3.5749, + "theoretical_loss": 4.642083909231053, + "tokens_seen": 122421248 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863089267803411, + "loss": 3.6578, + "theoretical_loss": 4.641749285145057, + "tokens_seen": 122486784 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862988966900702, + "loss": 3.407, + "theoretical_loss": 4.641414890150675, + "tokens_seen": 122552320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048628886659979944, + "loss": 3.5225, + "theoretical_loss": 4.641080723968684, + "tokens_seen": 122617856 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048627883650952857, + "loss": 3.4672, + "theoretical_loss": 4.6407467863203475, + "tokens_seen": 122683392 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862688064192578, + "loss": 3.336, + "theoretical_loss": 4.640413076927418, + "tokens_seen": 122748928 + }, + { + "epoch": 0.04, + "learning_rate": 0.000486258776328987, + "loss": 3.2779, + "theoretical_loss": 4.6400795955121374, + "tokens_seen": 122814464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 206332, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.079704761505127, + "objective/train/theoretical_loss": 4.639746341797229, + "objective/train/tokens_used": 143340000, + "theoretical_loss": 4.639746341797229, + "tokens_seen": 122880000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048624874623871616, + "loss": 3.7945, + "theoretical_loss": 4.639746341797229, + "tokens_seen": 122880000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048623871614844534, + "loss": 3.7026, + "theoretical_loss": 4.639413315505905, + "tokens_seen": 122945536 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862286860581745, + "loss": 3.7991, + "theoretical_loss": 4.639080516361861, + "tokens_seen": 123011072 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862186559679037, + "loss": 3.3407, + "theoretical_loss": 4.638747944089273, + "tokens_seen": 123076608 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048620862587763294, + "loss": 3.6945, + "theoretical_loss": 4.638415598412799, + "tokens_seen": 123142144 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048619859578736207, + "loss": 3.3499, + "theoretical_loss": 4.638083479057579, + "tokens_seen": 123207680 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861885656970913, + "loss": 3.4411, + "theoretical_loss": 4.637751585749234, + "tokens_seen": 123273216 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861785356068205, + "loss": 3.419, + "theoretical_loss": 4.6374199182138565, + "tokens_seen": 123338752 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048616850551654967, + "loss": 3.1327, + "theoretical_loss": 4.637088476178025, + "tokens_seen": 123404288 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048615847542627885, + "loss": 3.3689, + "theoretical_loss": 4.636757259368787, + "tokens_seen": 123469824 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048614844533600803, + "loss": 3.2499, + "theoretical_loss": 4.636426267513668, + "tokens_seen": 123535360 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861384152457372, + "loss": 3.443, + "theoretical_loss": 4.636095500340669, + "tokens_seen": 123600896 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048612838515546644, + "loss": 3.4505, + "theoretical_loss": 4.635764957578261, + "tokens_seen": 123666432 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048611835506519557, + "loss": 3.4719, + "theoretical_loss": 4.635434638955388, + "tokens_seen": 123731968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861083249749248, + "loss": 3.6203, + "theoretical_loss": 4.635104544201465, + "tokens_seen": 123797504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048609829488465393, + "loss": 3.1321, + "theoretical_loss": 4.634774673046376, + "tokens_seen": 123863040 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048608826479438317, + "loss": 3.3834, + "theoretical_loss": 4.634445025220475, + "tokens_seen": 123928576 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048607823470411235, + "loss": 3.5649, + "theoretical_loss": 4.634115600454582, + "tokens_seen": 123994112 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048606820461384153, + "loss": 3.3337, + "theoretical_loss": 4.633786398479983, + "tokens_seen": 124059648 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004860581745235707, + "loss": 3.3326, + "theoretical_loss": 4.6334574190284314, + "tokens_seen": 124125184 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048604814443329995, + "loss": 3.2998, + "theoretical_loss": 4.633128661832145, + "tokens_seen": 124190720 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004860381143430291, + "loss": 3.581, + "theoretical_loss": 4.632800126623803, + "tokens_seen": 124256256 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004860280842527583, + "loss": 3.6333, + "theoretical_loss": 4.632471813136547, + "tokens_seen": 124321792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048601805416248744, + "loss": 3.447, + "theoretical_loss": 4.632143721103983, + "tokens_seen": 124387328 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048600802407221667, + "loss": 3.2293, + "theoretical_loss": 4.631815850260173, + "tokens_seen": 124452864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 207169, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6358470916748047, + "objective/train/theoretical_loss": 4.631488200339643, + "objective/train/tokens_used": 144978400, + "theoretical_loss": 4.631488200339643, + "tokens_seen": 124518400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048599799398194585, + "loss": 3.4651, + "theoretical_loss": 4.631488200339643, + "tokens_seen": 124518400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048598796389167503, + "loss": 3.4181, + "theoretical_loss": 4.63116077107737, + "tokens_seen": 124583936 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859779338014042, + "loss": 3.6055, + "theoretical_loss": 4.630833562208797, + "tokens_seen": 124649472 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859679037111334, + "loss": 3.4885, + "theoretical_loss": 4.630506573469815, + "tokens_seen": 124715008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859578736208626, + "loss": 3.5052, + "theoretical_loss": 4.630179804596775, + "tokens_seen": 124780544 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859478435305918, + "loss": 3.5164, + "theoretical_loss": 4.629853255326481, + "tokens_seen": 124846080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048593781344032094, + "loss": 3.3836, + "theoretical_loss": 4.629526925396189, + "tokens_seen": 124911616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859277833500502, + "loss": 3.4527, + "theoretical_loss": 4.6292008145436085, + "tokens_seen": 124977152 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859177532597793, + "loss": 3.37, + "theoretical_loss": 4.628874922506897, + "tokens_seen": 125042688 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048590772316950854, + "loss": 3.294, + "theoretical_loss": 4.628549249024666, + "tokens_seen": 125108224 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858976930792377, + "loss": 3.3212, + "theoretical_loss": 4.628223793835975, + "tokens_seen": 125173760 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858876629889669, + "loss": 3.3518, + "theoretical_loss": 4.627898556680327, + "tokens_seen": 125239296 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858776328986961, + "loss": 3.2403, + "theoretical_loss": 4.627573537297678, + "tokens_seen": 125304832 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858676028084253, + "loss": 3.4935, + "theoretical_loss": 4.627248735428427, + "tokens_seen": 125370368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048585757271815444, + "loss": 3.4322, + "theoretical_loss": 4.6269241508134185, + "tokens_seen": 125435904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858475426278837, + "loss": 3.5357, + "theoretical_loss": 4.6265997831939405, + "tokens_seen": 125501440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858375125376128, + "loss": 3.4814, + "theoretical_loss": 4.6262756323117245, + "tokens_seen": 125566976 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048582748244734204, + "loss": 3.4993, + "theoretical_loss": 4.625951697908944, + "tokens_seen": 125632512 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858174523570712, + "loss": 3.4275, + "theoretical_loss": 4.625627979728212, + "tokens_seen": 125698048 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858074222668004, + "loss": 3.38, + "theoretical_loss": 4.625304477512584, + "tokens_seen": 125763584 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857973921765296, + "loss": 3.1327, + "theoretical_loss": 4.624981191005554, + "tokens_seen": 125829120 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048578736208625877, + "loss": 3.4219, + "theoretical_loss": 4.624658119951052, + "tokens_seen": 125894656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048577733199598795, + "loss": 3.4453, + "theoretical_loss": 4.624335264093447, + "tokens_seen": 125960192 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857673019057172, + "loss": 3.5913, + "theoretical_loss": 4.624012623177544, + "tokens_seen": 126025728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857572718154463, + "loss": 3.3552, + "theoretical_loss": 4.623690196948582, + "tokens_seen": 126091264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 208560, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.118776321411133, + "objective/train/theoretical_loss": 4.623367985152234, + "objective/train/tokens_used": 146616800, + "theoretical_loss": 4.623367985152234, + "tokens_seen": 126156800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048574724172517554, + "loss": 3.3034, + "theoretical_loss": 4.623367985152234, + "tokens_seen": 126156800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048573721163490467, + "loss": 3.4021, + "theoretical_loss": 4.623045987534609, + "tokens_seen": 126222336 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857271815446339, + "loss": 3.4968, + "theoretical_loss": 4.622724203842246, + "tokens_seen": 126287872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048571715145436314, + "loss": 3.6168, + "theoretical_loss": 4.622402633822114, + "tokens_seen": 126353408 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048570712136409227, + "loss": 3.6483, + "theoretical_loss": 4.622081277221616, + "tokens_seen": 126418944 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004856970912738215, + "loss": 3.1841, + "theoretical_loss": 4.62176013378858, + "tokens_seen": 126484480 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004856870611835507, + "loss": 3.3364, + "theoretical_loss": 4.621439203271267, + "tokens_seen": 126550016 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048567703109327987, + "loss": 3.5219, + "theoretical_loss": 4.621118485418362, + "tokens_seen": 126615552 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048566700100300905, + "loss": 3.5676, + "theoretical_loss": 4.620797979978978, + "tokens_seen": 126681088 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048565697091273823, + "loss": 3.0699, + "theoretical_loss": 4.620477686702651, + "tokens_seen": 126746624 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004856469408224674, + "loss": 3.4134, + "theoretical_loss": 4.620157605339347, + "tokens_seen": 126812160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048563691073219664, + "loss": 3.3575, + "theoretical_loss": 4.619837735639452, + "tokens_seen": 126877696 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048562688064192577, + "loss": 3.1724, + "theoretical_loss": 4.619518077353776, + "tokens_seen": 126943232 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485616850551655, + "loss": 3.5963, + "theoretical_loss": 4.619198630233547, + "tokens_seen": 127008768 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048560682046138413, + "loss": 3.2372, + "theoretical_loss": 4.6188793940304205, + "tokens_seen": 127074304 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048559679037111337, + "loss": 3.5238, + "theoretical_loss": 4.618560368496466, + "tokens_seen": 127139840 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048558676028084255, + "loss": 3.4135, + "theoretical_loss": 4.618241553384175, + "tokens_seen": 127205376 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048557673019057173, + "loss": 3.5748, + "theoretical_loss": 4.617922948446459, + "tokens_seen": 127270912 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004855667001003009, + "loss": 3.2217, + "theoretical_loss": 4.617604553436642, + "tokens_seen": 127336448 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048555667001003015, + "loss": 3.4823, + "theoretical_loss": 4.617286368108466, + "tokens_seen": 127401984 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004855466399197593, + "loss": 3.5026, + "theoretical_loss": 4.6169683922160925, + "tokens_seen": 127467520 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004855366098294885, + "loss": 3.3712, + "theoretical_loss": 4.616650625514091, + "tokens_seen": 127533056 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048552657973921764, + "loss": 3.3915, + "theoretical_loss": 4.616333067757449, + "tokens_seen": 127598592 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048551654964894687, + "loss": 3.5052, + "theoretical_loss": 4.616015718701563, + "tokens_seen": 127664128 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048550651955867605, + "loss": 3.4679, + "theoretical_loss": 4.615698578102245, + "tokens_seen": 127729664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 209446, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4149038791656494, + "objective/train/theoretical_loss": 4.615381645715717, + "objective/train/tokens_used": 148255200, + "theoretical_loss": 4.615381645715717, + "tokens_seen": 127795200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048549648946840523, + "loss": 3.3464, + "theoretical_loss": 4.615381645715717, + "tokens_seen": 127795200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854864593781344, + "loss": 3.448, + "theoretical_loss": 4.615064921298608, + "tokens_seen": 127860736 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854764292878636, + "loss": 3.4669, + "theoretical_loss": 4.61474840460796, + "tokens_seen": 127926272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854663991975928, + "loss": 3.2942, + "theoretical_loss": 4.614432095401219, + "tokens_seen": 127991808 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485456369107322, + "loss": 3.5544, + "theoretical_loss": 4.614115993436242, + "tokens_seen": 128057344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048544633901705114, + "loss": 3.6377, + "theoretical_loss": 4.613800098471291, + "tokens_seen": 128122880 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854363089267804, + "loss": 3.4673, + "theoretical_loss": 4.613484410265032, + "tokens_seen": 128188416 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854262788365095, + "loss": 3.3257, + "theoretical_loss": 4.613168928576538, + "tokens_seen": 128253952 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048541624874623874, + "loss": 3.3088, + "theoretical_loss": 4.612853653165283, + "tokens_seen": 128319488 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854062186559679, + "loss": 3.3764, + "theoretical_loss": 4.612538583791146, + "tokens_seen": 128385024 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853961885656971, + "loss": 3.347, + "theoretical_loss": 4.612223720214407, + "tokens_seen": 128450560 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853861584754263, + "loss": 3.5082, + "theoretical_loss": 4.611909062195749, + "tokens_seen": 128516096 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853761283851555, + "loss": 3.5355, + "theoretical_loss": 4.61159460949625, + "tokens_seen": 128581632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048536609829488464, + "loss": 3.5775, + "theoretical_loss": 4.611280361877393, + "tokens_seen": 128647168 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853560682046139, + "loss": 3.4158, + "theoretical_loss": 4.610966319101056, + "tokens_seen": 128712704 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485346038114343, + "loss": 3.4332, + "theoretical_loss": 4.610652480929515, + "tokens_seen": 128778240 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048533600802407224, + "loss": 3.5417, + "theoretical_loss": 4.610338847125445, + "tokens_seen": 128843776 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853259779338014, + "loss": 3.2011, + "theoretical_loss": 4.610025417451913, + "tokens_seen": 128909312 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853159478435306, + "loss": 3.3289, + "theoretical_loss": 4.6097121916723856, + "tokens_seen": 128974848 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853059177532598, + "loss": 3.4422, + "theoretical_loss": 4.609399169550718, + "tokens_seen": 129040384 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048529588766298897, + "loss": 3.4443, + "theoretical_loss": 4.609086350851165, + "tokens_seen": 129105920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048528585757271815, + "loss": 3.5788, + "theoretical_loss": 4.6087737353383655, + "tokens_seen": 129171456 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852758274824474, + "loss": 3.2671, + "theoretical_loss": 4.6084613227773605, + "tokens_seen": 129236992 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852657973921765, + "loss": 3.4653, + "theoretical_loss": 4.608149112933571, + "tokens_seen": 129302528 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048525576730190574, + "loss": 3.2752, + "theoretical_loss": 4.607837105572816, + "tokens_seen": 129368064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 210153, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.262286424636841, + "objective/train/theoretical_loss": 4.607525300461299, + "objective/train/tokens_used": 149893600, + "theoretical_loss": 4.607525300461299, + "tokens_seen": 129433600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048524573721163487, + "loss": 3.4158, + "theoretical_loss": 4.607525300461299, + "tokens_seen": 129433600 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852357071213641, + "loss": 3.84, + "theoretical_loss": 4.607213697365613, + "tokens_seen": 129499136 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852256770310933, + "loss": 3.2357, + "theoretical_loss": 4.606902296052739, + "tokens_seen": 129564672 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048521564694082247, + "loss": 3.1954, + "theoretical_loss": 4.6065910962900425, + "tokens_seen": 129630208 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048520561685055165, + "loss": 3.2395, + "theoretical_loss": 4.606280097845277, + "tokens_seen": 129695744 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851955867602809, + "loss": 3.6586, + "theoretical_loss": 4.60596930048658, + "tokens_seen": 129761280 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048518555667001, + "loss": 3.8546, + "theoretical_loss": 4.605658703982471, + "tokens_seen": 129826816 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048517552657973925, + "loss": 3.6046, + "theoretical_loss": 4.6053483081018545, + "tokens_seen": 129892352 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851654964894684, + "loss": 3.6551, + "theoretical_loss": 4.605038112614018, + "tokens_seen": 129957888 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851554663991976, + "loss": 3.7827, + "theoretical_loss": 4.604728117288631, + "tokens_seen": 130023424 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851454363089268, + "loss": 3.4682, + "theoretical_loss": 4.604418321895739, + "tokens_seen": 130088960 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048513540621865597, + "loss": 3.4146, + "theoretical_loss": 4.604108726205774, + "tokens_seen": 130154496 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048512537612838515, + "loss": 3.6829, + "theoretical_loss": 4.60379932998954, + "tokens_seen": 130220032 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048511534603811433, + "loss": 3.2773, + "theoretical_loss": 4.6034901330182265, + "tokens_seen": 130285568 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851053159478435, + "loss": 3.441, + "theoretical_loss": 4.603181135063394, + "tokens_seen": 130351104 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048509528585757275, + "loss": 3.4939, + "theoretical_loss": 4.6028723358969845, + "tokens_seen": 130416640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850852557673019, + "loss": 3.6256, + "theoretical_loss": 4.602563735291312, + "tokens_seen": 130482176 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850752256770311, + "loss": 3.3061, + "theoretical_loss": 4.602255333019068, + "tokens_seen": 130547712 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048506519558676024, + "loss": 3.4777, + "theoretical_loss": 4.6019471288533165, + "tokens_seen": 130613248 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850551654964895, + "loss": 3.5478, + "theoretical_loss": 4.601639122567497, + "tokens_seen": 130678784 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048504513540621866, + "loss": 3.2939, + "theoretical_loss": 4.601331313935418, + "tokens_seen": 130744320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048503510531594784, + "loss": 3.3667, + "theoretical_loss": 4.601023702731264, + "tokens_seen": 130809856 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485025075225677, + "loss": 3.7578, + "theoretical_loss": 4.600716288729587, + "tokens_seen": 130875392 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048501504513540625, + "loss": 3.3572, + "theoretical_loss": 4.600409071705312, + "tokens_seen": 130940928 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850050150451354, + "loss": 3.5029, + "theoretical_loss": 4.60010205143373, + "tokens_seen": 131006464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 211647, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9844844341278076, + "objective/train/theoretical_loss": 4.599795227690505, + "objective/train/tokens_used": 151532000, + "theoretical_loss": 4.599795227690505, + "tokens_seen": 131072000 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849949849548646, + "loss": 3.4689, + "theoretical_loss": 4.599795227690505, + "tokens_seen": 131072000 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849849548645938, + "loss": 3.5353, + "theoretical_loss": 4.5994886002516635, + "tokens_seen": 131137536 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484974924774323, + "loss": 3.7795, + "theoretical_loss": 4.599182168893604, + "tokens_seen": 131203072 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849648946840522, + "loss": 3.42, + "theoretical_loss": 4.598875933393089, + "tokens_seen": 131268608 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048495486459378134, + "loss": 3.4034, + "theoretical_loss": 4.5985698935272445, + "tokens_seen": 131334144 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849448345035106, + "loss": 3.5382, + "theoretical_loss": 4.598264049073565, + "tokens_seen": 131399680 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849348044132397, + "loss": 3.336, + "theoretical_loss": 4.597958399809908, + "tokens_seen": 131465216 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048492477432296894, + "loss": 3.3876, + "theoretical_loss": 4.59765294551449, + "tokens_seen": 131530752 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849147442326981, + "loss": 3.418, + "theoretical_loss": 4.597347685965897, + "tokens_seen": 131596288 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849047141424273, + "loss": 3.4876, + "theoretical_loss": 4.597042620943069, + "tokens_seen": 131661824 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848946840521565, + "loss": 3.7115, + "theoretical_loss": 4.596737750225311, + "tokens_seen": 131727360 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848846539618857, + "loss": 3.6111, + "theoretical_loss": 4.596433073592289, + "tokens_seen": 131792896 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048487462387161484, + "loss": 3.4538, + "theoretical_loss": 4.596128590824026, + "tokens_seen": 131858432 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848645937813441, + "loss": 3.2722, + "theoretical_loss": 4.595824301700904, + "tokens_seen": 131923968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848545636910732, + "loss": 3.2836, + "theoretical_loss": 4.595520206003663, + "tokens_seen": 131989504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048484453360080244, + "loss": 3.2864, + "theoretical_loss": 4.595216303513399, + "tokens_seen": 132055040 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848345035105316, + "loss": 3.3418, + "theoretical_loss": 4.594912594011566, + "tokens_seen": 132120576 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848244734202608, + "loss": 3.573, + "theoretical_loss": 4.594609077279973, + "tokens_seen": 132186112 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048481444332999, + "loss": 3.4152, + "theoretical_loss": 4.594305753100782, + "tokens_seen": 132251648 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048480441323971917, + "loss": 3.2979, + "theoretical_loss": 4.594002621256511, + "tokens_seen": 132317184 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048479438314944835, + "loss": 3.5056, + "theoretical_loss": 4.59369968153003, + "tokens_seen": 132382720 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847843530591776, + "loss": 3.6168, + "theoretical_loss": 4.593396933704562, + "tokens_seen": 132448256 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847743229689067, + "loss": 3.3142, + "theoretical_loss": 4.593094377563681, + "tokens_seen": 132513792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048476429287863594, + "loss": 3.3908, + "theoretical_loss": 4.592792012891314, + "tokens_seen": 132579328 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048475426278836507, + "loss": 3.3265, + "theoretical_loss": 4.592489839471735, + "tokens_seen": 132644864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 212401, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6346824169158936, + "objective/train/theoretical_loss": 4.592187857089571, + "objective/train/tokens_used": 153170400, + "theoretical_loss": 4.592187857089571, + "tokens_seen": 132710400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847442326980943, + "loss": 3.6193, + "theoretical_loss": 4.592187857089571, + "tokens_seen": 132710400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847342026078235, + "loss": 3.1198, + "theoretical_loss": 4.591886065529795, + "tokens_seen": 132775936 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048472417251755267, + "loss": 3.5852, + "theoretical_loss": 4.591584464577728, + "tokens_seen": 132841472 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048471414242728185, + "loss": 3.5391, + "theoretical_loss": 4.591283054019041, + "tokens_seen": 132907008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847041123370111, + "loss": 3.7466, + "theoretical_loss": 4.5909818336397485, + "tokens_seen": 132972544 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846940822467402, + "loss": 3.5896, + "theoretical_loss": 4.590680803226213, + "tokens_seen": 133038080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048468405215646945, + "loss": 3.4774, + "theoretical_loss": 4.590379962565141, + "tokens_seen": 133103616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846740220661986, + "loss": 3.5418, + "theoretical_loss": 4.590079311443583, + "tokens_seen": 133169152 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846639919759278, + "loss": 3.5195, + "theoretical_loss": 4.589778849648934, + "tokens_seen": 133234688 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484653961885657, + "loss": 3.3776, + "theoretical_loss": 4.589478576968932, + "tokens_seen": 133300224 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048464393179538617, + "loss": 3.451, + "theoretical_loss": 4.589178493191655, + "tokens_seen": 133365760 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048463390170511535, + "loss": 3.4017, + "theoretical_loss": 4.588878598105527, + "tokens_seen": 133431296 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048462387161484453, + "loss": 3.1066, + "theoretical_loss": 4.588578891499308, + "tokens_seen": 133496832 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846138415245737, + "loss": 3.5051, + "theoretical_loss": 4.588279373162101, + "tokens_seen": 133562368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048460381143430295, + "loss": 3.3603, + "theoretical_loss": 4.587980042883347, + "tokens_seen": 133627904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845937813440321, + "loss": 3.1542, + "theoretical_loss": 4.587680900452824, + "tokens_seen": 133693440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845837512537613, + "loss": 3.3772, + "theoretical_loss": 4.587381945660653, + "tokens_seen": 133758976 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048457372116349044, + "loss": 3.303, + "theoretical_loss": 4.587083178297288, + "tokens_seen": 133824512 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845636910732197, + "loss": 3.4483, + "theoretical_loss": 4.5867845981535185, + "tokens_seen": 133890048 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048455366098294886, + "loss": 3.4031, + "theoretical_loss": 4.586486205020474, + "tokens_seen": 133955584 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048454363089267804, + "loss": 3.4808, + "theoretical_loss": 4.586187998689616, + "tokens_seen": 134021120 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845336008024072, + "loss": 3.4596, + "theoretical_loss": 4.585889978952741, + "tokens_seen": 134086656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048452357071213645, + "loss": 3.2542, + "theoretical_loss": 4.58559214560198, + "tokens_seen": 134152192 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845135406218656, + "loss": 3.2377, + "theoretical_loss": 4.585294498429796, + "tokens_seen": 134217728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845035105315948, + "loss": 3.4138, + "theoretical_loss": 4.584997037228986, + "tokens_seen": 134283264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 213682, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.553767681121826, + "objective/train/theoretical_loss": 4.584699761792674, + "objective/train/tokens_used": 154808800, + "theoretical_loss": 4.584699761792674, + "tokens_seen": 134348800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048449348044132394, + "loss": 3.4115, + "theoretical_loss": 4.584699761792674, + "tokens_seen": 134348800 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844834503510532, + "loss": 3.379, + "theoretical_loss": 4.5844026719143205, + "tokens_seen": 134414336 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048447342026078236, + "loss": 3.2655, + "theoretical_loss": 4.5841057673877135, + "tokens_seen": 134479872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048446339017051154, + "loss": 3.6094, + "theoretical_loss": 4.5838090480069695, + "tokens_seen": 134545408 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844533600802407, + "loss": 3.4139, + "theoretical_loss": 4.5835125135665375, + "tokens_seen": 134610944 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844433299899699, + "loss": 3.241, + "theoretical_loss": 4.583216163861191, + "tokens_seen": 134676480 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844332998996991, + "loss": 3.4857, + "theoretical_loss": 4.58291999868603, + "tokens_seen": 134742016 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844232698094283, + "loss": 3.4424, + "theoretical_loss": 4.582624017836489, + "tokens_seen": 134807552 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048441323971915745, + "loss": 3.5497, + "theoretical_loss": 4.582328221108318, + "tokens_seen": 134873088 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844032096288867, + "loss": 3.4341, + "theoretical_loss": 4.5820326082976, + "tokens_seen": 134938624 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048439317953861586, + "loss": 3.4735, + "theoretical_loss": 4.581737179200739, + "tokens_seen": 135004160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048438314944834504, + "loss": 3.3947, + "theoretical_loss": 4.581441933614466, + "tokens_seen": 135069696 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843731193580742, + "loss": 3.6985, + "theoretical_loss": 4.581146871335832, + "tokens_seen": 135135232 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843630892678034, + "loss": 3.4447, + "theoretical_loss": 4.580851992162214, + "tokens_seen": 135200768 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843530591775326, + "loss": 3.2804, + "theoretical_loss": 4.5805572958913086, + "tokens_seen": 135266304 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843430290872618, + "loss": 3.2166, + "theoretical_loss": 4.580262782321135, + "tokens_seen": 135331840 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048433299899699095, + "loss": 3.1952, + "theoretical_loss": 4.579968451250032, + "tokens_seen": 135397376 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843229689067202, + "loss": 3.6351, + "theoretical_loss": 4.579674302476661, + "tokens_seen": 135462912 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843129388164493, + "loss": 3.5732, + "theoretical_loss": 4.579380335800001, + "tokens_seen": 135528448 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048430290872617855, + "loss": 3.2331, + "theoretical_loss": 4.579086551019348, + "tokens_seen": 135593984 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048429287863590773, + "loss": 3.3013, + "theoretical_loss": 4.5787929479343195, + "tokens_seen": 135659520 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842828485456369, + "loss": 3.3744, + "theoretical_loss": 4.578499526344848, + "tokens_seen": 135725056 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842728184553661, + "loss": 3.4363, + "theoretical_loss": 4.578206286051184, + "tokens_seen": 135790592 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048426278836509527, + "loss": 3.4997, + "theoretical_loss": 4.5779132268538945, + "tokens_seen": 135856128 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048425275827482445, + "loss": 3.3039, + "theoretical_loss": 4.577620348553859, + "tokens_seen": 135921664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 214514, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6967029571533203, + "objective/train/theoretical_loss": 4.577327650952276, + "objective/train/tokens_used": 156447200, + "theoretical_loss": 4.577327650952276, + "tokens_seen": 135987200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842427281845537, + "loss": 3.7825, + "theoretical_loss": 4.577327650952276, + "tokens_seen": 135987200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048423269809428287, + "loss": 3.2603, + "theoretical_loss": 4.5770351338506545, + "tokens_seen": 136052736 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048422266800401205, + "loss": 3.4404, + "theoretical_loss": 4.57674279705082, + "tokens_seen": 136118272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842126379137413, + "loss": 3.39, + "theoretical_loss": 4.57645064035491, + "tokens_seen": 136183808 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842026078234704, + "loss": 3.1578, + "theoretical_loss": 4.576158663565371, + "tokens_seen": 136249344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048419257773319965, + "loss": 3.1521, + "theoretical_loss": 4.575866866484967, + "tokens_seen": 136314880 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841825476429288, + "loss": 3.3245, + "theoretical_loss": 4.575575248916767, + "tokens_seen": 136380416 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484172517552658, + "loss": 3.2286, + "theoretical_loss": 4.575283810664155, + "tokens_seen": 136445952 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841624874623872, + "loss": 3.1527, + "theoretical_loss": 4.574992551530822, + "tokens_seen": 136511488 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048415245737211637, + "loss": 3.3615, + "theoretical_loss": 4.574701471320768, + "tokens_seen": 136577024 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048414242728184555, + "loss": 3.2323, + "theoretical_loss": 4.574410569838304, + "tokens_seen": 136642560 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048413239719157473, + "loss": 3.5178, + "theoretical_loss": 4.574119846888045, + "tokens_seen": 136708096 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841223671013039, + "loss": 3.4339, + "theoretical_loss": 4.573829302274915, + "tokens_seen": 136773632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048411233701103315, + "loss": 3.3785, + "theoretical_loss": 4.573538935804146, + "tokens_seen": 136839168 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841023069207623, + "loss": 3.4305, + "theoretical_loss": 4.573248747281273, + "tokens_seen": 136904704 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840922768304915, + "loss": 3.7156, + "theoretical_loss": 4.5729587365121365, + "tokens_seen": 136970240 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048408224674022064, + "loss": 3.1826, + "theoretical_loss": 4.572668903302886, + "tokens_seen": 137035776 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840722166499499, + "loss": 3.291, + "theoretical_loss": 4.572379247459969, + "tokens_seen": 137101312 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048406218655967906, + "loss": 3.4151, + "theoretical_loss": 4.57208976879014, + "tokens_seen": 137166848 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048405215646940824, + "loss": 3.4274, + "theoretical_loss": 4.571800467100456, + "tokens_seen": 137232384 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840421263791374, + "loss": 3.5986, + "theoretical_loss": 4.5715113421982725, + "tokens_seen": 137297920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048403209628886665, + "loss": 3.6963, + "theoretical_loss": 4.571222393891253, + "tokens_seen": 137363456 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840220661985958, + "loss": 3.3796, + "theoretical_loss": 4.570933621987356, + "tokens_seen": 137428992 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484012036108325, + "loss": 3.1538, + "theoretical_loss": 4.570645026294844, + "tokens_seen": 137494528 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048400200601805414, + "loss": 3.3521, + "theoretical_loss": 4.570356606622278, + "tokens_seen": 137560064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 215745, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.805065631866455, + "objective/train/theoretical_loss": 4.570068362778516, + "objective/train/tokens_used": 158085600, + "theoretical_loss": 4.570068362778516, + "tokens_seen": 137625600 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839919759277834, + "loss": 3.7218, + "theoretical_loss": 4.570068362778516, + "tokens_seen": 137625600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048398194583751256, + "loss": 3.2555, + "theoretical_loss": 4.569780294572718, + "tokens_seen": 137691136 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048397191574724174, + "loss": 3.5768, + "theoretical_loss": 4.569492401814339, + "tokens_seen": 137756672 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839618856569709, + "loss": 3.6795, + "theoretical_loss": 4.569204684313133, + "tokens_seen": 137822208 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839518555667001, + "loss": 3.3141, + "theoretical_loss": 4.568917141879149, + "tokens_seen": 137887744 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839418254764293, + "loss": 3.3816, + "theoretical_loss": 4.568629774322736, + "tokens_seen": 137953280 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839317953861585, + "loss": 3.3705, + "theoretical_loss": 4.568342581454532, + "tokens_seen": 138018816 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048392176529588765, + "loss": 3.5989, + "theoretical_loss": 4.568055563085476, + "tokens_seen": 138084352 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839117352056169, + "loss": 3.3891, + "theoretical_loss": 4.567768719026797, + "tokens_seen": 138149888 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048390170511534606, + "loss": 3.2672, + "theoretical_loss": 4.567482049090019, + "tokens_seen": 138215424 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048389167502507524, + "loss": 3.3265, + "theoretical_loss": 4.567195553086961, + "tokens_seen": 138280960 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838816449348044, + "loss": 3.2513, + "theoretical_loss": 4.566909230829729, + "tokens_seen": 138346496 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838716148445336, + "loss": 3.4663, + "theoretical_loss": 4.566623082130729, + "tokens_seen": 138412032 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838615847542628, + "loss": 3.5158, + "theoretical_loss": 4.566337106802651, + "tokens_seen": 138477568 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483851554663992, + "loss": 3.558, + "theoretical_loss": 4.56605130465848, + "tokens_seen": 138543104 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048384152457372115, + "loss": 3.5835, + "theoretical_loss": 4.565765675511487, + "tokens_seen": 138608640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838314944834504, + "loss": 3.4223, + "theoretical_loss": 4.565480219175237, + "tokens_seen": 138674176 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838214643931795, + "loss": 3.6202, + "theoretical_loss": 4.56519493546358, + "tokens_seen": 138739712 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048381143430290875, + "loss": 3.3628, + "theoretical_loss": 4.56490982419066, + "tokens_seen": 138805248 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048380140421263793, + "loss": 3.5243, + "theoretical_loss": 4.564624885170902, + "tokens_seen": 138870784 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837913741223671, + "loss": 3.2818, + "theoretical_loss": 4.564340118219022, + "tokens_seen": 138936320 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837813440320963, + "loss": 3.3265, + "theoretical_loss": 4.56405552315002, + "tokens_seen": 139001856 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048377131394182547, + "loss": 3.5035, + "theoretical_loss": 4.563771099779187, + "tokens_seen": 139067392 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048376128385155465, + "loss": 3.5528, + "theoretical_loss": 4.563486847922093, + "tokens_seen": 139132928 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837512537612839, + "loss": 3.5953, + "theoretical_loss": 4.563202767394597, + "tokens_seen": 139198464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 216543, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1856539249420166, + "objective/train/theoretical_loss": 4.562918858012843, + "objective/train/tokens_used": 159724000, + "theoretical_loss": 4.562918858012843, + "tokens_seen": 139264000 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483741223671013, + "loss": 3.1963, + "theoretical_loss": 4.562918858012843, + "tokens_seen": 139264000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048373119358074225, + "loss": 3.7505, + "theoretical_loss": 4.562635119593255, + "tokens_seen": 139329536 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048372116349047143, + "loss": 3.3943, + "theoretical_loss": 4.562351551952542, + "tokens_seen": 139395072 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837111334002006, + "loss": 3.427, + "theoretical_loss": 4.5620681549076965, + "tokens_seen": 139460608 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837011033099298, + "loss": 3.3626, + "theoretical_loss": 4.561784928275992, + "tokens_seen": 139526144 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483691073219659, + "loss": 3.2654, + "theoretical_loss": 4.561501871874984, + "tokens_seen": 139591680 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048368104312938816, + "loss": 3.3665, + "theoretical_loss": 4.561218985522507, + "tokens_seen": 139657216 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836710130391174, + "loss": 3.4153, + "theoretical_loss": 4.560936269036679, + "tokens_seen": 139722752 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836609829488465, + "loss": 3.5138, + "theoretical_loss": 4.560653722235895, + "tokens_seen": 139788288 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048365095285857575, + "loss": 3.5586, + "theoretical_loss": 4.560371344938831, + "tokens_seen": 139853824 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836409227683049, + "loss": 3.5356, + "theoretical_loss": 4.560089136964439, + "tokens_seen": 139919360 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836308926780341, + "loss": 3.6322, + "theoretical_loss": 4.559807098131953, + "tokens_seen": 139984896 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836208625877633, + "loss": 3.3173, + "theoretical_loss": 4.559525228260882, + "tokens_seen": 140050432 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836108324974925, + "loss": 3.5052, + "theoretical_loss": 4.559243527171011, + "tokens_seen": 140115968 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048360080240722166, + "loss": 3.6296, + "theoretical_loss": 4.558961994682403, + "tokens_seen": 140181504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048359077231695084, + "loss": 3.6015, + "theoretical_loss": 4.558680630615397, + "tokens_seen": 140247040 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048358074222668, + "loss": 3.2321, + "theoretical_loss": 4.558399434790607, + "tokens_seen": 140312576 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048357071213640926, + "loss": 3.5938, + "theoretical_loss": 4.558118407028921, + "tokens_seen": 140378112 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835606820461384, + "loss": 3.2905, + "theoretical_loss": 4.557837547151502, + "tokens_seen": 140443648 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835506519558676, + "loss": 3.4072, + "theoretical_loss": 4.557556854979786, + "tokens_seen": 140509184 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835406218655968, + "loss": 3.4305, + "theoretical_loss": 4.5572763303354815, + "tokens_seen": 140574720 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483530591775326, + "loss": 3.4042, + "theoretical_loss": 4.556995973040574, + "tokens_seen": 140640256 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048352056168505516, + "loss": 3.6395, + "theoretical_loss": 4.556715782917314, + "tokens_seen": 140705792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048351053159478434, + "loss": 3.4382, + "theoretical_loss": 4.556435759788229, + "tokens_seen": 140771328 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835005015045135, + "loss": 3.4248, + "theoretical_loss": 4.556155903476114, + "tokens_seen": 140836864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 217936, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1173794269561768, + "objective/train/theoretical_loss": 4.555876213804037, + "objective/train/tokens_used": 161362400, + "theoretical_loss": 4.555876213804037, + "tokens_seen": 140902400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048349047141424276, + "loss": 3.3518, + "theoretical_loss": 4.555876213804037, + "tokens_seen": 140902400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048348044132397194, + "loss": 3.3859, + "theoretical_loss": 4.555596690595333, + "tokens_seen": 140967936 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834704112337011, + "loss": 3.2391, + "theoretical_loss": 4.555317333673611, + "tokens_seen": 141033472 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834603811434303, + "loss": 3.2149, + "theoretical_loss": 4.555038142862742, + "tokens_seen": 141099008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834503510531595, + "loss": 3.5474, + "theoretical_loss": 4.5547591179868725, + "tokens_seen": 141164544 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834403209628887, + "loss": 3.4493, + "theoretical_loss": 4.554480258870409, + "tokens_seen": 141230080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048343029087261785, + "loss": 3.3747, + "theoretical_loss": 4.554201565338033, + "tokens_seen": 141295616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834202607823471, + "loss": 3.6331, + "theoretical_loss": 4.5539230372146875, + "tokens_seen": 141361152 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048341023069207626, + "loss": 3.3845, + "theoretical_loss": 4.553644674325584, + "tokens_seen": 141426688 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048340020060180544, + "loss": 3.5964, + "theoretical_loss": 4.553366476496198, + "tokens_seen": 141492224 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833901705115346, + "loss": 3.3062, + "theoretical_loss": 4.553088443552269, + "tokens_seen": 141557760 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833801404212638, + "loss": 3.4501, + "theoretical_loss": 4.552810575319806, + "tokens_seen": 141623296 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483370110330993, + "loss": 3.2792, + "theoretical_loss": 4.552532871625077, + "tokens_seen": 141688832 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833600802407222, + "loss": 3.1857, + "theoretical_loss": 4.5522553322946155, + "tokens_seen": 141754368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048335005015045135, + "loss": 3.2261, + "theoretical_loss": 4.551977957155217, + "tokens_seen": 141819904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833400200601806, + "loss": 3.6914, + "theoretical_loss": 4.5517007460339425, + "tokens_seen": 141885440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833299899699097, + "loss": 3.3864, + "theoretical_loss": 4.551423698758111, + "tokens_seen": 141950976 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048331995987963895, + "loss": 3.309, + "theoretical_loss": 4.551146815155304, + "tokens_seen": 142016512 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048330992978936813, + "loss": 3.6274, + "theoretical_loss": 4.550870095053366, + "tokens_seen": 142082048 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832998996990973, + "loss": 3.3515, + "theoretical_loss": 4.550593538280398, + "tokens_seen": 142147584 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832898696088265, + "loss": 3.2589, + "theoretical_loss": 4.550317144664766, + "tokens_seen": 142213120 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048327983951855567, + "loss": 3.4483, + "theoretical_loss": 4.55004091403509, + "tokens_seen": 142278656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048326980942828485, + "loss": 3.3192, + "theoretical_loss": 4.5497648462202545, + "tokens_seen": 142344192 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832597793380141, + "loss": 3.42, + "theoretical_loss": 4.549488941049397, + "tokens_seen": 142409728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832497492477432, + "loss": 3.4013, + "theoretical_loss": 4.549213198351914, + "tokens_seen": 142475264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 218572, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.226104259490967, + "objective/train/theoretical_loss": 4.548937617957463, + "objective/train/tokens_used": 163000800, + "theoretical_loss": 4.548937617957463, + "tokens_seen": 142540800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048323971915747245, + "loss": 3.3476, + "theoretical_loss": 4.548937617957463, + "tokens_seen": 142540800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048322968906720163, + "loss": 3.1424, + "theoretical_loss": 4.548662199695954, + "tokens_seen": 142606336 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832196589769308, + "loss": 3.4049, + "theoretical_loss": 4.548386943397556, + "tokens_seen": 142671872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048320962888666, + "loss": 3.3425, + "theoretical_loss": 4.548111848892693, + "tokens_seen": 142737408 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831995987963892, + "loss": 3.6242, + "theoretical_loss": 4.547836916012042, + "tokens_seen": 142802944 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048318956870611836, + "loss": 3.1838, + "theoretical_loss": 4.547562144586539, + "tokens_seen": 142868480 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831795386158476, + "loss": 3.4277, + "theoretical_loss": 4.547287534447372, + "tokens_seen": 142934016 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831695085255767, + "loss": 3.2606, + "theoretical_loss": 4.5470130854259825, + "tokens_seen": 142999552 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048315947843530595, + "loss": 3.3748, + "theoretical_loss": 4.546738797354065, + "tokens_seen": 143065088 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831494483450351, + "loss": 3.4503, + "theoretical_loss": 4.546464670063569, + "tokens_seen": 143130624 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831394182547643, + "loss": 3.3705, + "theoretical_loss": 4.546190703386695, + "tokens_seen": 143196160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831293881644935, + "loss": 3.4002, + "theoretical_loss": 4.545916897155894, + "tokens_seen": 143261696 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831193580742227, + "loss": 3.6006, + "theoretical_loss": 4.54564325120387, + "tokens_seen": 143327232 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048310932798395186, + "loss": 3.614, + "theoretical_loss": 4.545369765363578, + "tokens_seen": 143392768 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048309929789368104, + "loss": 3.3347, + "theoretical_loss": 4.545096439468223, + "tokens_seen": 143458304 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830892678034102, + "loss": 3.4975, + "theoretical_loss": 4.544823273351257, + "tokens_seen": 143523840 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048307923771313946, + "loss": 3.5284, + "theoretical_loss": 4.544550266846388, + "tokens_seen": 143589376 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830692076228686, + "loss": 3.5193, + "theoretical_loss": 4.544277419787566, + "tokens_seen": 143654912 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830591775325978, + "loss": 3.3756, + "theoretical_loss": 4.544004732008993, + "tokens_seen": 143720448 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483049147442327, + "loss": 3.0692, + "theoretical_loss": 4.543732203345119, + "tokens_seen": 143785984 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830391173520562, + "loss": 3.3665, + "theoretical_loss": 4.543459833630639, + "tokens_seen": 143851520 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048302908726178536, + "loss": 3.429, + "theoretical_loss": 4.543187622700497, + "tokens_seen": 143917056 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048301905717151454, + "loss": 3.3939, + "theoretical_loss": 4.542915570389884, + "tokens_seen": 143982592 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830090270812437, + "loss": 3.4513, + "theoretical_loss": 4.542643676534234, + "tokens_seen": 144048128 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048299899699097296, + "loss": 3.3302, + "theoretical_loss": 4.542371940969231, + "tokens_seen": 144113664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 219918, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.464306116104126, + "objective/train/theoretical_loss": 4.542100363530799, + "objective/train/tokens_used": 164639200, + "theoretical_loss": 4.542100363530799, + "tokens_seen": 144179200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829889669007021, + "loss": 3.5182, + "theoretical_loss": 4.542100363530799, + "tokens_seen": 144179200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829789368104313, + "loss": 3.3918, + "theoretical_loss": 4.54182894405511, + "tokens_seen": 144244736 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048296890672016045, + "loss": 3.2309, + "theoretical_loss": 4.5415576823785795, + "tokens_seen": 144310272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829588766298897, + "loss": 3.3555, + "theoretical_loss": 4.541286578337866, + "tokens_seen": 144375808 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048294884653961887, + "loss": 3.485, + "theoretical_loss": 4.541015631769872, + "tokens_seen": 144441344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048293881644934805, + "loss": 3.382, + "theoretical_loss": 4.5407448425117405, + "tokens_seen": 144506880 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048292878635907723, + "loss": 3.5303, + "theoretical_loss": 4.540474210400859, + "tokens_seen": 144572416 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048291875626880646, + "loss": 3.3492, + "theoretical_loss": 4.540203735274855, + "tokens_seen": 144637952 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829087261785356, + "loss": 3.2747, + "theoretical_loss": 4.5399334169716, + "tokens_seen": 144703488 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828986960882648, + "loss": 3.4229, + "theoretical_loss": 4.539663255329202, + "tokens_seen": 144769024 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048288866599799395, + "loss": 3.3941, + "theoretical_loss": 4.539393250186015, + "tokens_seen": 144834560 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828786359077232, + "loss": 3.5768, + "theoretical_loss": 4.539123401380625, + "tokens_seen": 144900096 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048286860581745237, + "loss": 3.376, + "theoretical_loss": 4.538853708751866, + "tokens_seen": 144965632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048285857572718155, + "loss": 3.1908, + "theoretical_loss": 4.538584172138804, + "tokens_seen": 145031168 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048284854563691073, + "loss": 3.4312, + "theoretical_loss": 4.538314791380748, + "tokens_seen": 145096704 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828385155466399, + "loss": 3.2382, + "theoretical_loss": 4.538045566317242, + "tokens_seen": 145162240 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828284854563691, + "loss": 3.2605, + "theoretical_loss": 4.537776496788071, + "tokens_seen": 145227776 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048281845536609833, + "loss": 3.4595, + "theoretical_loss": 4.537507582633253, + "tokens_seen": 145293312 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048280842527582746, + "loss": 3.4714, + "theoretical_loss": 4.537238823693045, + "tokens_seen": 145358848 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827983951855567, + "loss": 3.4331, + "theoretical_loss": 4.536970219807939, + "tokens_seen": 145424384 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827883650952858, + "loss": 3.3043, + "theoretical_loss": 4.536701770818665, + "tokens_seen": 145489920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048277833500501505, + "loss": 3.5044, + "theoretical_loss": 4.536433476566185, + "tokens_seen": 145555456 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048276830491474423, + "loss": 3.4038, + "theoretical_loss": 4.536165336891699, + "tokens_seen": 145620992 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827582748244734, + "loss": 3.2575, + "theoretical_loss": 4.535897351636638, + "tokens_seen": 145686528 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827482447342026, + "loss": 3.6485, + "theoretical_loss": 4.53562952064267, + "tokens_seen": 145752064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 220709, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8053040504455566, + "objective/train/theoretical_loss": 4.535361843751696, + "objective/train/tokens_used": 166277600, + "theoretical_loss": 4.535361843751696, + "tokens_seen": 145817600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048273821464393183, + "loss": 3.4441, + "theoretical_loss": 4.535361843751696, + "tokens_seen": 145817600 + }, + { + "epoch": 0.04, + "learning_rate": 0.000482728184553661, + "loss": 3.1346, + "theoretical_loss": 4.535094320805847, + "tokens_seen": 145883136 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827181544633902, + "loss": 3.3252, + "theoretical_loss": 4.534826951647489, + "tokens_seen": 145948672 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827081243731194, + "loss": 3.4838, + "theoretical_loss": 4.5345597361192205, + "tokens_seen": 146014208 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048269809428284856, + "loss": 3.6738, + "theoretical_loss": 4.53429267406387, + "tokens_seen": 146079744 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826880641925778, + "loss": 3.4136, + "theoretical_loss": 4.5340257653244995, + "tokens_seen": 146145280 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826780341023069, + "loss": 3.5642, + "theoretical_loss": 4.5337590097444, + "tokens_seen": 146210816 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048266800401203615, + "loss": 3.2791, + "theoretical_loss": 4.533492407167093, + "tokens_seen": 146276352 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826579739217653, + "loss": 3.4304, + "theoretical_loss": 4.53322595743633, + "tokens_seen": 146341888 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826479438314945, + "loss": 3.4174, + "theoretical_loss": 4.5329596603960916, + "tokens_seen": 146407424 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826379137412237, + "loss": 3.3817, + "theoretical_loss": 4.53269351589059, + "tokens_seen": 146472960 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826278836509529, + "loss": 3.2871, + "theoretical_loss": 4.532427523764261, + "tokens_seen": 146538496 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048261785356068206, + "loss": 3.303, + "theoretical_loss": 4.532161683861773, + "tokens_seen": 146604032 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048260782347041124, + "loss": 3.5583, + "theoretical_loss": 4.5318959960280205, + "tokens_seen": 146669568 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825977933801404, + "loss": 3.0956, + "theoretical_loss": 4.531630460108125, + "tokens_seen": 146735104 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048258776328986966, + "loss": 3.6116, + "theoretical_loss": 4.531365075947434, + "tokens_seen": 146800640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825777331995988, + "loss": 3.4407, + "theoretical_loss": 4.531099843391524, + "tokens_seen": 146866176 + }, + { + "epoch": 0.04, + "learning_rate": 0.000482567703109328, + "loss": 3.5475, + "theoretical_loss": 4.5308347622861955, + "tokens_seen": 146931712 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825576730190572, + "loss": 3.3681, + "theoretical_loss": 4.5305698324774735, + "tokens_seen": 146997248 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825476429287864, + "loss": 3.3835, + "theoretical_loss": 4.530305053811611, + "tokens_seen": 147062784 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048253761283851556, + "loss": 3.2951, + "theoretical_loss": 4.530040426135084, + "tokens_seen": 147128320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048252758274824474, + "loss": 3.4417, + "theoretical_loss": 4.529775949294593, + "tokens_seen": 147193856 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825175526579739, + "loss": 3.1626, + "theoretical_loss": 4.529511623137061, + "tokens_seen": 147259392 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048250752256770316, + "loss": 3.2913, + "theoretical_loss": 4.529247447509637, + "tokens_seen": 147324928 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824974924774323, + "loss": 3.4218, + "theoretical_loss": 4.528983422259691, + "tokens_seen": 147390464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 222333, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.542501926422119, + "objective/train/theoretical_loss": 4.528719547234816, + "objective/train/tokens_used": 167916000, + "theoretical_loss": 4.528719547234816, + "tokens_seen": 147456000 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824874623871615, + "loss": 3.6895, + "theoretical_loss": 4.528719547234816, + "tokens_seen": 147456000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048247743229689065, + "loss": 3.1272, + "theoretical_loss": 4.528455822282828, + "tokens_seen": 147521536 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824674022066199, + "loss": 3.3643, + "theoretical_loss": 4.528192247251763, + "tokens_seen": 147587072 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048245737211634907, + "loss": 3.3044, + "theoretical_loss": 4.52792882198988, + "tokens_seen": 147652608 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048244734202607825, + "loss": 3.2101, + "theoretical_loss": 4.527665546345656, + "tokens_seen": 147718144 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048243731193580743, + "loss": 3.4948, + "theoretical_loss": 4.5274024201677925, + "tokens_seen": 147783680 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048242728184553666, + "loss": 3.286, + "theoretical_loss": 4.527139443305209, + "tokens_seen": 147849216 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824172517552658, + "loss": 3.3363, + "theoretical_loss": 4.526876615607042, + "tokens_seen": 147914752 + }, + { + "epoch": 0.04, + "learning_rate": 0.000482407221664995, + "loss": 3.5136, + "theoretical_loss": 4.526613936922654, + "tokens_seen": 147980288 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048239719157472415, + "loss": 3.4325, + "theoretical_loss": 4.526351407101618, + "tokens_seen": 148045824 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004823871614844534, + "loss": 3.3161, + "theoretical_loss": 4.526089025993732, + "tokens_seen": 148111360 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048237713139418257, + "loss": 3.341, + "theoretical_loss": 4.525826793449008, + "tokens_seen": 148176896 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048236710130391175, + "loss": 3.5225, + "theoretical_loss": 4.525564709317678, + "tokens_seen": 148242432 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048235707121364093, + "loss": 3.511, + "theoretical_loss": 4.525302773450187, + "tokens_seen": 148307968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004823470411233701, + "loss": 3.2191, + "theoretical_loss": 4.525040985697203, + "tokens_seen": 148373504 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004823370110330993, + "loss": 3.1906, + "theoretical_loss": 4.524779345909604, + "tokens_seen": 148439040 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048232698094282853, + "loss": 3.4767, + "theoretical_loss": 4.524517853938489, + "tokens_seen": 148504576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048231695085255766, + "loss": 3.328, + "theoretical_loss": 4.524256509635169, + "tokens_seen": 148570112 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004823069207622869, + "loss": 3.5906, + "theoretical_loss": 4.523995312851174, + "tokens_seen": 148635648 + }, + { + "epoch": 0.05, + "learning_rate": 0.000482296890672016, + "loss": 3.1019, + "theoretical_loss": 4.523734263438241, + "tokens_seen": 148701184 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048228686058174525, + "loss": 3.4911, + "theoretical_loss": 4.52347336124833, + "tokens_seen": 148766720 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048227683049147443, + "loss": 3.5419, + "theoretical_loss": 4.52321260613361, + "tokens_seen": 148832256 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822668004012036, + "loss": 3.4556, + "theoretical_loss": 4.522951997946466, + "tokens_seen": 148897792 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822567703109328, + "loss": 3.438, + "theoretical_loss": 4.522691536539492, + "tokens_seen": 148963328 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048224674022066203, + "loss": 3.5173, + "theoretical_loss": 4.522431221765498, + "tokens_seen": 149028864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 222973, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.872528314590454, + "objective/train/theoretical_loss": 4.522171053477507, + "objective/train/tokens_used": 169554400, + "theoretical_loss": 4.522171053477507, + "tokens_seen": 149094400 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048223671013039116, + "loss": 3.1564, + "theoretical_loss": 4.522171053477507, + "tokens_seen": 149094400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822266800401204, + "loss": 3.4409, + "theoretical_loss": 4.5219110315287505, + "tokens_seen": 149159936 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822166499498495, + "loss": 3.4656, + "theoretical_loss": 4.521651155772675, + "tokens_seen": 149225472 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048220661985957876, + "loss": 3.5566, + "theoretical_loss": 4.521391426062934, + "tokens_seen": 149291008 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048219658976930794, + "loss": 3.3884, + "theoretical_loss": 4.521131842253396, + "tokens_seen": 149356544 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821865596790371, + "loss": 3.6665, + "theoretical_loss": 4.520872404198139, + "tokens_seen": 149422080 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821765295887663, + "loss": 3.4762, + "theoretical_loss": 4.520613111751445, + "tokens_seen": 149487616 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821664994984955, + "loss": 3.3686, + "theoretical_loss": 4.520353964767814, + "tokens_seen": 149553152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048215646940822466, + "loss": 3.5076, + "theoretical_loss": 4.5200949631019505, + "tokens_seen": 149618688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821464393179539, + "loss": 3.4631, + "theoretical_loss": 4.519836106608768, + "tokens_seen": 149684224 + }, + { + "epoch": 0.05, + "learning_rate": 0.000482136409227683, + "loss": 3.2705, + "theoretical_loss": 4.519577395143388, + "tokens_seen": 149749760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048212637913741226, + "loss": 3.5931, + "theoretical_loss": 4.519318828561142, + "tokens_seen": 149815296 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821163490471414, + "loss": 3.3086, + "theoretical_loss": 4.519060406717565, + "tokens_seen": 149880832 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821063189568706, + "loss": 3.2906, + "theoretical_loss": 4.518802129468405, + "tokens_seen": 149946368 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820962888665998, + "loss": 3.3824, + "theoretical_loss": 4.51854399666961, + "tokens_seen": 150011904 + }, + { + "epoch": 0.05, + "learning_rate": 0.000482086258776329, + "loss": 3.3789, + "theoretical_loss": 4.518286008177341, + "tokens_seen": 150077440 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048207622868605816, + "loss": 3.284, + "theoretical_loss": 4.51802816384796, + "tokens_seen": 150142976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820661985957874, + "loss": 3.2642, + "theoretical_loss": 4.517770463538038, + "tokens_seen": 150208512 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048205616850551653, + "loss": 3.3741, + "theoretical_loss": 4.517512907104347, + "tokens_seen": 150274048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048204613841524576, + "loss": 3.4381, + "theoretical_loss": 4.517255494403868, + "tokens_seen": 150339584 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820361083249749, + "loss": 3.3189, + "theoretical_loss": 4.516998225293785, + "tokens_seen": 150405120 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820260782347041, + "loss": 3.3564, + "theoretical_loss": 4.516741099631485, + "tokens_seen": 150470656 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820160481444333, + "loss": 3.2927, + "theoretical_loss": 4.51648411727456, + "tokens_seen": 150536192 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820060180541625, + "loss": 3.1088, + "theoretical_loss": 4.5162272780808035, + "tokens_seen": 150601728 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048199598796389167, + "loss": 3.3767, + "theoretical_loss": 4.515970581908216, + "tokens_seen": 150667264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 224256, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.578481435775757, + "objective/train/theoretical_loss": 4.515714028614996, + "objective/train/tokens_used": 171192800, + "theoretical_loss": 4.515714028614996, + "tokens_seen": 150732800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048198595787362085, + "loss": 3.3921, + "theoretical_loss": 4.515714028614996, + "tokens_seen": 150732800 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004819759277833501, + "loss": 3.6899, + "theoretical_loss": 4.515457618059546, + "tokens_seen": 150798336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048196589769307927, + "loss": 3.365, + "theoretical_loss": 4.515201350100471, + "tokens_seen": 150863872 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048195586760280845, + "loss": 3.4232, + "theoretical_loss": 4.514945224596577, + "tokens_seen": 150929408 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048194583751253763, + "loss": 3.2746, + "theoretical_loss": 4.5146892414068684, + "tokens_seen": 150994944 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048193580742226686, + "loss": 3.3062, + "theoretical_loss": 4.514433400390554, + "tokens_seen": 151060480 + }, + { + "epoch": 0.05, + "learning_rate": 0.000481925777331996, + "loss": 3.5527, + "theoretical_loss": 4.514177701407042, + "tokens_seen": 151126016 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004819157472417252, + "loss": 3.4158, + "theoretical_loss": 4.51392214431594, + "tokens_seen": 151191552 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048190571715145435, + "loss": 3.3167, + "theoretical_loss": 4.513666728977054, + "tokens_seen": 151257088 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818956870611836, + "loss": 3.261, + "theoretical_loss": 4.51341145525039, + "tokens_seen": 151322624 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048188565697091277, + "loss": 3.2799, + "theoretical_loss": 4.513156322996155, + "tokens_seen": 151388160 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048187562688064195, + "loss": 3.0428, + "theoretical_loss": 4.512901332074751, + "tokens_seen": 151453696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048186559679037113, + "loss": 3.4207, + "theoretical_loss": 4.5126464823467805, + "tokens_seen": 151519232 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818555667001003, + "loss": 3.4559, + "theoretical_loss": 4.512391773673042, + "tokens_seen": 151584768 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818455366098295, + "loss": 3.4414, + "theoretical_loss": 4.5121372059145335, + "tokens_seen": 151650304 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048183550651955873, + "loss": 3.1644, + "theoretical_loss": 4.511882778932447, + "tokens_seen": 151715840 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048182547642928786, + "loss": 3.1517, + "theoretical_loss": 4.511628492588174, + "tokens_seen": 151781376 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818154463390171, + "loss": 3.3785, + "theoretical_loss": 4.5113743467433, + "tokens_seen": 151846912 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818054162487462, + "loss": 3.5209, + "theoretical_loss": 4.511120341259608, + "tokens_seen": 151912448 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048179538615847545, + "loss": 3.5419, + "theoretical_loss": 4.510866475999077, + "tokens_seen": 151977984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048178535606820463, + "loss": 3.4455, + "theoretical_loss": 4.510612750823878, + "tokens_seen": 152043520 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004817753259779338, + "loss": 3.1911, + "theoretical_loss": 4.5103591655963795, + "tokens_seen": 152109056 + }, + { + "epoch": 0.05, + "learning_rate": 0.000481765295887663, + "loss": 3.3913, + "theoretical_loss": 4.510105720179144, + "tokens_seen": 152174592 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048175526579739223, + "loss": 3.1391, + "theoretical_loss": 4.5098524144349295, + "tokens_seen": 152240128 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048174523570712136, + "loss": 3.2863, + "theoretical_loss": 4.509599248226683, + "tokens_seen": 152305664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 224258, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.519541025161743, + "objective/train/theoretical_loss": 4.509346221417552, + "objective/train/tokens_used": 172831200, + "theoretical_loss": 4.509346221417552, + "tokens_seen": 152371200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004817352056168506, + "loss": 3.1877, + "theoretical_loss": 4.509346221417552, + "tokens_seen": 152371200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004817251755265797, + "loss": 3.5031, + "theoretical_loss": 4.509093333870869, + "tokens_seen": 152436736 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048171514543630896, + "loss": 3.4969, + "theoretical_loss": 4.508840585450166, + "tokens_seen": 152502272 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048170511534603814, + "loss": 3.4823, + "theoretical_loss": 4.508587976019164, + "tokens_seen": 152567808 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816950852557673, + "loss": 3.6068, + "theoretical_loss": 4.508335505441774, + "tokens_seen": 152633344 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816850551654965, + "loss": 3.2556, + "theoretical_loss": 4.508083173582105, + "tokens_seen": 152698880 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816750250752257, + "loss": 3.4433, + "theoretical_loss": 4.507830980304451, + "tokens_seen": 152764416 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048166499498495486, + "loss": 3.4769, + "theoretical_loss": 4.5075789254733, + "tokens_seen": 152829952 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816549648946841, + "loss": 3.3988, + "theoretical_loss": 4.507327008953329, + "tokens_seen": 152895488 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816449348044132, + "loss": 3.4335, + "theoretical_loss": 4.507075230609407, + "tokens_seen": 152961024 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048163490471414246, + "loss": 3.6205, + "theoretical_loss": 4.506823590306591, + "tokens_seen": 153026560 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816248746238716, + "loss": 3.2815, + "theoretical_loss": 4.506572087910127, + "tokens_seen": 153092096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816148445336008, + "loss": 3.424, + "theoretical_loss": 4.506320723285455, + "tokens_seen": 153157632 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048160481444333, + "loss": 3.234, + "theoretical_loss": 4.506069496298198, + "tokens_seen": 153223168 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815947843530592, + "loss": 3.5371, + "theoretical_loss": 4.5058184068141705, + "tokens_seen": 153288704 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048158475426278837, + "loss": 3.4785, + "theoretical_loss": 4.505567454699373, + "tokens_seen": 153354240 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815747241725176, + "loss": 3.3644, + "theoretical_loss": 4.505316639819997, + "tokens_seen": 153419776 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048156469408224673, + "loss": 3.3134, + "theoretical_loss": 4.505065962042418, + "tokens_seen": 153485312 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048155466399197596, + "loss": 3.2391, + "theoretical_loss": 4.504815421233202, + "tokens_seen": 153550848 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815446339017051, + "loss": 3.4874, + "theoretical_loss": 4.504565017259097, + "tokens_seen": 153616384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815346038114343, + "loss": 3.3429, + "theoretical_loss": 4.504314749987044, + "tokens_seen": 153681920 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815245737211635, + "loss": 3.2746, + "theoretical_loss": 4.504064619284163, + "tokens_seen": 153747456 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815145436308927, + "loss": 3.4142, + "theoretical_loss": 4.503814625017766, + "tokens_seen": 153812992 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048150451354062187, + "loss": 3.1693, + "theoretical_loss": 4.5035647670553445, + "tokens_seen": 153878528 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048149448345035105, + "loss": 3.3487, + "theoretical_loss": 4.503315045264581, + "tokens_seen": 153944064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 224258, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4950404167175293, + "objective/train/theoretical_loss": 4.503065459513339, + "objective/train/tokens_used": 174469600, + "theoretical_loss": 4.503065459513339, + "tokens_seen": 154009600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048148445336008023, + "loss": 3.3584, + "theoretical_loss": 4.503065459513339, + "tokens_seen": 154009600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048147442326980947, + "loss": 3.3349, + "theoretical_loss": 4.502816009669665, + "tokens_seen": 154075136 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004814643931795386, + "loss": 3.4341, + "theoretical_loss": 4.502566695601795, + "tokens_seen": 154140672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048145436308926783, + "loss": 3.5226, + "theoretical_loss": 4.502317517178142, + "tokens_seen": 154206208 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048144433299899696, + "loss": 3.5991, + "theoretical_loss": 4.502068474267309, + "tokens_seen": 154271744 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004814343029087262, + "loss": 3.3909, + "theoretical_loss": 4.501819566738076, + "tokens_seen": 154337280 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048142427281845537, + "loss": 3.2495, + "theoretical_loss": 4.501570794459411, + "tokens_seen": 154402816 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048141424272818455, + "loss": 3.3363, + "theoretical_loss": 4.501322157300461, + "tokens_seen": 154468352 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048140421263791373, + "loss": 3.2894, + "theoretical_loss": 4.501073655130554, + "tokens_seen": 154533888 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048139418254764297, + "loss": 3.352, + "theoretical_loss": 4.500825287819205, + "tokens_seen": 154599424 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813841524573721, + "loss": 3.268, + "theoretical_loss": 4.500577055236104, + "tokens_seen": 154664960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048137412236710133, + "loss": 3.3665, + "theoretical_loss": 4.500328957251128, + "tokens_seen": 154730496 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048136409227683046, + "loss": 3.2677, + "theoretical_loss": 4.500080993734329, + "tokens_seen": 154796032 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813540621865597, + "loss": 3.2372, + "theoretical_loss": 4.499833164555944, + "tokens_seen": 154861568 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813440320962889, + "loss": 3.5336, + "theoretical_loss": 4.499585469586387, + "tokens_seen": 154927104 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048133400200601806, + "loss": 3.2678, + "theoretical_loss": 4.499337908696255, + "tokens_seen": 154992640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048132397191574724, + "loss": 3.2836, + "theoretical_loss": 4.499090481756321, + "tokens_seen": 155058176 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813139418254764, + "loss": 3.4351, + "theoretical_loss": 4.498843188637538, + "tokens_seen": 155123712 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813039117352056, + "loss": 3.361, + "theoretical_loss": 4.498596029211041, + "tokens_seen": 155189248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048129388164493483, + "loss": 3.3694, + "theoretical_loss": 4.498349003348137, + "tokens_seen": 155254784 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048128385155466396, + "loss": 3.4364, + "theoretical_loss": 4.4981021109203185, + "tokens_seen": 155320320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812738214643932, + "loss": 3.3872, + "theoretical_loss": 4.49785535179925, + "tokens_seen": 155385856 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812637913741223, + "loss": 3.4293, + "theoretical_loss": 4.497608725856776, + "tokens_seen": 155451392 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048125376128385156, + "loss": 3.2014, + "theoretical_loss": 4.497362232964919, + "tokens_seen": 155516928 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048124373119358074, + "loss": 3.1822, + "theoretical_loss": 4.497115872995876, + "tokens_seen": 155582464 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 225763, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.025784730911255, + "objective/train/theoretical_loss": 4.496869645822022, + "objective/train/tokens_used": 176108000, + "theoretical_loss": 4.496869645822022, + "tokens_seen": 155648000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812337011033099, + "loss": 3.4056, + "theoretical_loss": 4.496869645822022, + "tokens_seen": 155648000 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048122367101303916, + "loss": 3.2566, + "theoretical_loss": 4.496623551315908, + "tokens_seen": 155713536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048121364092276834, + "loss": 3.3016, + "theoretical_loss": 4.496377589350261, + "tokens_seen": 155779072 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812036108324975, + "loss": 3.3609, + "theoretical_loss": 4.496131759797984, + "tokens_seen": 155844608 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811935807422267, + "loss": 3.1468, + "theoretical_loss": 4.495886062532153, + "tokens_seen": 155910144 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811835506519559, + "loss": 3.6107, + "theoretical_loss": 4.495640497426023, + "tokens_seen": 155975680 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048117352056168506, + "loss": 3.3142, + "theoretical_loss": 4.495395064353019, + "tokens_seen": 156041216 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811634904714143, + "loss": 3.3829, + "theoretical_loss": 4.4951497631867445, + "tokens_seen": 156106752 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811534603811434, + "loss": 3.4919, + "theoretical_loss": 4.494904593800973, + "tokens_seen": 156172288 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048114343029087266, + "loss": 3.4889, + "theoretical_loss": 4.4946595560696565, + "tokens_seen": 156237824 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811334002006018, + "loss": 3.4904, + "theoretical_loss": 4.494414649866915, + "tokens_seen": 156303360 + }, + { + "epoch": 0.05, + "learning_rate": 0.000481123370110331, + "loss": 3.24, + "theoretical_loss": 4.494169875067046, + "tokens_seen": 156368896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811133400200602, + "loss": 3.2623, + "theoretical_loss": 4.493925231544516, + "tokens_seen": 156434432 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811033099297894, + "loss": 3.172, + "theoretical_loss": 4.493680719173968, + "tokens_seen": 156499968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048109327983951857, + "loss": 3.5037, + "theoretical_loss": 4.4934363378302145, + "tokens_seen": 156565504 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810832497492478, + "loss": 3.3995, + "theoretical_loss": 4.493192087388239, + "tokens_seen": 156631040 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048107321965897693, + "loss": 3.2766, + "theoretical_loss": 4.4929479677232, + "tokens_seen": 156696576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048106318956870616, + "loss": 3.2331, + "theoretical_loss": 4.4927039787104235, + "tokens_seen": 156762112 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810531594784353, + "loss": 3.4322, + "theoretical_loss": 4.4924601202254095, + "tokens_seen": 156827648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810431293881645, + "loss": 3.2955, + "theoretical_loss": 4.492216392143826, + "tokens_seen": 156893184 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810330992978937, + "loss": 3.5907, + "theoretical_loss": 4.491972794341514, + "tokens_seen": 156958720 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810230692076229, + "loss": 3.2891, + "theoretical_loss": 4.49172932669448, + "tokens_seen": 157024256 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048101303911735207, + "loss": 3.4427, + "theoretical_loss": 4.491485989078906, + "tokens_seen": 157089792 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048100300902708125, + "loss": 3.3958, + "theoretical_loss": 4.491242781371138, + "tokens_seen": 157155328 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048099297893681043, + "loss": 3.4415, + "theoretical_loss": 4.490999703447697, + "tokens_seen": 157220864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 226317, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2634286880493164, + "objective/train/theoretical_loss": 4.4907567551852665, + "objective/train/tokens_used": 177746400, + "theoretical_loss": 4.4907567551852665, + "tokens_seen": 157286400 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048098294884653967, + "loss": 3.2088, + "theoretical_loss": 4.4907567551852665, + "tokens_seen": 157286400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004809729187562688, + "loss": 3.4287, + "theoretical_loss": 4.490513936460702, + "tokens_seen": 157351936 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048096288866599803, + "loss": 3.0838, + "theoretical_loss": 4.490271247151027, + "tokens_seen": 157417472 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048095285857572716, + "loss": 3.349, + "theoretical_loss": 4.490028687133432, + "tokens_seen": 157483008 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004809428284854564, + "loss": 3.5244, + "theoretical_loss": 4.489786256285276, + "tokens_seen": 157548544 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048093279839518557, + "loss": 3.3918, + "theoretical_loss": 4.489543954484084, + "tokens_seen": 157614080 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048092276830491475, + "loss": 3.4131, + "theoretical_loss": 4.489301781607551, + "tokens_seen": 157679616 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048091273821464393, + "loss": 3.3327, + "theoretical_loss": 4.489059737533534, + "tokens_seen": 157745152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048090270812437317, + "loss": 3.4728, + "theoretical_loss": 4.48881782214006, + "tokens_seen": 157810688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808926780341023, + "loss": 3.2697, + "theoretical_loss": 4.48857603530532, + "tokens_seen": 157876224 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048088264794383153, + "loss": 3.3249, + "theoretical_loss": 4.488334376907673, + "tokens_seen": 157941760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048087261785356066, + "loss": 3.4597, + "theoretical_loss": 4.4880928468256425, + "tokens_seen": 158007296 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808625877632899, + "loss": 3.5744, + "theoretical_loss": 4.487851444937916, + "tokens_seen": 158072832 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808525576730191, + "loss": 3.3033, + "theoretical_loss": 4.487610171123347, + "tokens_seen": 158138368 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048084252758274826, + "loss": 3.3512, + "theoretical_loss": 4.487369025260954, + "tokens_seen": 158203904 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048083249749247744, + "loss": 3.0467, + "theoretical_loss": 4.48712800722992, + "tokens_seen": 158269440 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808224674022066, + "loss": 3.4032, + "theoretical_loss": 4.48688711690959, + "tokens_seen": 158334976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808124373119358, + "loss": 3.3788, + "theoretical_loss": 4.486646354179475, + "tokens_seen": 158400512 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048080240722166503, + "loss": 3.4075, + "theoretical_loss": 4.48640571891925, + "tokens_seen": 158466048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048079237713139416, + "loss": 3.4903, + "theoretical_loss": 4.48616521100875, + "tokens_seen": 158531584 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807823470411234, + "loss": 3.3345, + "theoretical_loss": 4.485924830327974, + "tokens_seen": 158597120 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807723169508525, + "loss": 3.3844, + "theoretical_loss": 4.485684576757087, + "tokens_seen": 158662656 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048076228686058176, + "loss": 3.4555, + "theoretical_loss": 4.485444450176413, + "tokens_seen": 158728192 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048075225677031094, + "loss": 3.5851, + "theoretical_loss": 4.485204450466437, + "tokens_seen": 158793728 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807422266800401, + "loss": 3.4905, + "theoretical_loss": 4.484964577507808, + "tokens_seen": 158859264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 227589, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5188961029052734, + "objective/train/theoretical_loss": 4.484724831181337, + "objective/train/tokens_used": 179384800, + "theoretical_loss": 4.484724831181337, + "tokens_seen": 158924800 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807321965897693, + "loss": 2.9862, + "theoretical_loss": 4.484724831181337, + "tokens_seen": 158924800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048072216649949854, + "loss": 3.3628, + "theoretical_loss": 4.4844852113679945, + "tokens_seen": 158990336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048071213640922766, + "loss": 3.5553, + "theoretical_loss": 4.484245717948913, + "tokens_seen": 159055872 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807021063189569, + "loss": 3.0604, + "theoretical_loss": 4.484006350805385, + "tokens_seen": 159121408 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480692076228686, + "loss": 3.1705, + "theoretical_loss": 4.483767109818862, + "tokens_seen": 159186944 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048068204613841526, + "loss": 3.4456, + "theoretical_loss": 4.483527994870958, + "tokens_seen": 159252480 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048067201604814444, + "loss": 2.9752, + "theoretical_loss": 4.483289005843445, + "tokens_seen": 159318016 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004806619859578736, + "loss": 3.2205, + "theoretical_loss": 4.483050142618255, + "tokens_seen": 159383552 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004806519558676028, + "loss": 3.5844, + "theoretical_loss": 4.482811405077482, + "tokens_seen": 159449088 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480641925777332, + "loss": 3.5894, + "theoretical_loss": 4.482572793103373, + "tokens_seen": 159514624 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048063189568706117, + "loss": 3.3414, + "theoretical_loss": 4.482334306578339, + "tokens_seen": 159580160 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004806218655967904, + "loss": 3.3352, + "theoretical_loss": 4.482095945384946, + "tokens_seen": 159645696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048061183550651953, + "loss": 3.4557, + "theoretical_loss": 4.481857709405919, + "tokens_seen": 159711232 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048060180541624877, + "loss": 3.2868, + "theoretical_loss": 4.4816195985241425, + "tokens_seen": 159776768 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048059177532597795, + "loss": 3.1668, + "theoretical_loss": 4.481381612622657, + "tokens_seen": 159842304 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048058174523570713, + "loss": 3.4842, + "theoretical_loss": 4.481143751584659, + "tokens_seen": 159907840 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805717151454363, + "loss": 3.4874, + "theoretical_loss": 4.480906015293505, + "tokens_seen": 159973376 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805616850551655, + "loss": 3.6502, + "theoretical_loss": 4.480668403632706, + "tokens_seen": 160038912 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048055165496489467, + "loss": 3.4696, + "theoretical_loss": 4.480430916485929, + "tokens_seen": 160104448 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805416248746239, + "loss": 3.4085, + "theoretical_loss": 4.480193553736999, + "tokens_seen": 160169984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048053159478435303, + "loss": 3.296, + "theoretical_loss": 4.479956315269897, + "tokens_seen": 160235520 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048052156469408227, + "loss": 3.3078, + "theoretical_loss": 4.479719200968757, + "tokens_seen": 160301056 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805115346038114, + "loss": 3.4221, + "theoretical_loss": 4.479482210717871, + "tokens_seen": 160366592 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048050150451354063, + "loss": 3.3827, + "theoretical_loss": 4.479245344401685, + "tokens_seen": 160432128 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004804914744232698, + "loss": 3.3653, + "theoretical_loss": 4.479008601904798, + "tokens_seen": 160497664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 228344, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3748011589050293, + "objective/train/theoretical_loss": 4.478771983111967, + "objective/train/tokens_used": 181023200, + "theoretical_loss": 4.478771983111967, + "tokens_seen": 160563200 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480481444332999, + "loss": 3.298, + "theoretical_loss": 4.478771983111967, + "tokens_seen": 160563200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048047141424272823, + "loss": 3.2943, + "theoretical_loss": 4.478535487908101, + "tokens_seen": 160628736 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048046138415245736, + "loss": 3.4003, + "theoretical_loss": 4.478299116178265, + "tokens_seen": 160694272 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004804513540621866, + "loss": 3.6019, + "theoretical_loss": 4.478062867807674, + "tokens_seen": 160759808 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048044132397191577, + "loss": 3.4234, + "theoretical_loss": 4.4778267426817, + "tokens_seen": 160825344 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048043129388164495, + "loss": 3.3492, + "theoretical_loss": 4.477590740685867, + "tokens_seen": 160890880 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048042126379137413, + "loss": 3.3196, + "theoretical_loss": 4.47735486170585, + "tokens_seen": 160956416 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048041123370110337, + "loss": 3.2311, + "theoretical_loss": 4.47711910562748, + "tokens_seen": 161021952 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004804012036108325, + "loss": 3.2435, + "theoretical_loss": 4.4768834723367394, + "tokens_seen": 161087488 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048039117352056173, + "loss": 3.3293, + "theoretical_loss": 4.4766479617197605, + "tokens_seen": 161153024 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048038114343029086, + "loss": 3.3614, + "theoretical_loss": 4.476412573662829, + "tokens_seen": 161218560 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004803711133400201, + "loss": 3.4204, + "theoretical_loss": 4.4761773080523835, + "tokens_seen": 161284096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004803610832497493, + "loss": 3.266, + "theoretical_loss": 4.475942164775013, + "tokens_seen": 161349632 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048035105315947846, + "loss": 3.3162, + "theoretical_loss": 4.475707143717455, + "tokens_seen": 161415168 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048034102306920764, + "loss": 3.2002, + "theoretical_loss": 4.475472244766601, + "tokens_seen": 161480704 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004803309929789368, + "loss": 3.2076, + "theoretical_loss": 4.475237467809492, + "tokens_seen": 161546240 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480320962888666, + "loss": 3.3402, + "theoretical_loss": 4.47500281273332, + "tokens_seen": 161611776 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048031093279839523, + "loss": 3.405, + "theoretical_loss": 4.474768279425424, + "tokens_seen": 161677312 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048030090270812436, + "loss": 3.3508, + "theoretical_loss": 4.474533867773299, + "tokens_seen": 161742848 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802908726178536, + "loss": 3.2164, + "theoretical_loss": 4.474299577664581, + "tokens_seen": 161808384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802808425275827, + "loss": 3.2979, + "theoretical_loss": 4.474065408987063, + "tokens_seen": 161873920 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048027081243731196, + "loss": 3.16, + "theoretical_loss": 4.473831361628682, + "tokens_seen": 161939456 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048026078234704114, + "loss": 3.4809, + "theoretical_loss": 4.473597435477526, + "tokens_seen": 162004992 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802507522567703, + "loss": 3.3, + "theoretical_loss": 4.473363630421831, + "tokens_seen": 162070528 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802407221664995, + "loss": 3.0521, + "theoretical_loss": 4.473129946349982, + "tokens_seen": 162136064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 229659, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4476068019866943, + "objective/train/theoretical_loss": 4.472896383150508, + "objective/train/tokens_used": 182661600, + "theoretical_loss": 4.472896383150508, + "tokens_seen": 162201600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048023069207622874, + "loss": 3.5461, + "theoretical_loss": 4.472896383150508, + "tokens_seen": 162201600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048022066198595786, + "loss": 3.4397, + "theoretical_loss": 4.472662940712091, + "tokens_seen": 162267136 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802106318956871, + "loss": 3.5161, + "theoretical_loss": 4.472429618923558, + "tokens_seen": 162332672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048020060180541623, + "loss": 3.2219, + "theoretical_loss": 4.472196417673883, + "tokens_seen": 162398208 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048019057171514546, + "loss": 3.2822, + "theoretical_loss": 4.471963336852187, + "tokens_seen": 162463744 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048018054162487464, + "loss": 3.2824, + "theoretical_loss": 4.471730376347738, + "tokens_seen": 162529280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004801705115346038, + "loss": 3.3961, + "theoretical_loss": 4.4714975360499505, + "tokens_seen": 162594816 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480160481444333, + "loss": 3.5644, + "theoretical_loss": 4.471264815848384, + "tokens_seen": 162660352 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004801504513540622, + "loss": 3.3472, + "theoretical_loss": 4.471032215632746, + "tokens_seen": 162725888 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048014042126379137, + "loss": 3.6874, + "theoretical_loss": 4.470799735292889, + "tokens_seen": 162791424 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004801303911735206, + "loss": 3.1886, + "theoretical_loss": 4.470567374718808, + "tokens_seen": 162856960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048012036108324973, + "loss": 3.1756, + "theoretical_loss": 4.470335133800649, + "tokens_seen": 162922496 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048011033099297897, + "loss": 3.3402, + "theoretical_loss": 4.470103012428696, + "tokens_seen": 162988032 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048010030090270815, + "loss": 3.2792, + "theoretical_loss": 4.469871010493383, + "tokens_seen": 163053568 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048009027081243733, + "loss": 3.3714, + "theoretical_loss": 4.469639127885287, + "tokens_seen": 163119104 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800802407221665, + "loss": 3.269, + "theoretical_loss": 4.4694073644951295, + "tokens_seen": 163184640 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800702106318957, + "loss": 3.2126, + "theoretical_loss": 4.469175720213771, + "tokens_seen": 163250176 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048006018054162487, + "loss": 3.2064, + "theoretical_loss": 4.468944194932225, + "tokens_seen": 163315712 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800501504513541, + "loss": 3.6002, + "theoretical_loss": 4.468712788541639, + "tokens_seen": 163381248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048004012036108323, + "loss": 3.4055, + "theoretical_loss": 4.46848150093331, + "tokens_seen": 163446784 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048003009027081247, + "loss": 3.4893, + "theoretical_loss": 4.468250331998676, + "tokens_seen": 163512320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800200601805416, + "loss": 3.422, + "theoretical_loss": 4.468019281629316, + "tokens_seen": 163577856 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048001003009027083, + "loss": 3.19, + "theoretical_loss": 4.467788349716955, + "tokens_seen": 163643392 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048, + "loss": 3.137, + "theoretical_loss": 4.467557536153457, + "tokens_seen": 163708928 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799899699097292, + "loss": 3.2519, + "theoretical_loss": 4.467326840830829, + "tokens_seen": 163774464 + }, + { + "debugging/Self-BLEU-5": 0.28281542061774223, + "debugging/distinct-1-grams": 0.7326597736651554, + "debugging/distinct-2-grams": 0.9067393674201281, + "debugging/entropy-1-grams": 5.525931828352874, + "debugging/entropy-2-grams": 6.085041606065698, + "debugging/length": 468.25, + "debugging/num_segments": 8, + "debugging/score": 0.00998370262687916, + "debugging/score_std": 0.008263218592709398, + "epoch": 0.05, + "objective/train/docs_used": 230304, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3832874298095703, + "objective/train/theoretical_loss": 4.467096263641219, + "objective/train/tokens_used": 184300000, + "theoretical_loss": 4.467096263641219, + "tokens_seen": 163840000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799799398194584, + "loss": 3.1978, + "theoretical_loss": 4.467096263641219, + "tokens_seen": 163840000 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047996990972918756, + "loss": 3.112, + "theoretical_loss": 4.466865804476919, + "tokens_seen": 163905536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047995987963891674, + "loss": 3.2774, + "theoretical_loss": 4.466635463230359, + "tokens_seen": 163971072 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047994984954864597, + "loss": 3.3353, + "theoretical_loss": 4.466405239794113, + "tokens_seen": 164036608 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799398194583751, + "loss": 3.2677, + "theoretical_loss": 4.466175134060894, + "tokens_seen": 164102144 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047992978936810433, + "loss": 3.4542, + "theoretical_loss": 4.465945145923554, + "tokens_seen": 164167680 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799197592778335, + "loss": 3.4061, + "theoretical_loss": 4.4657152752750875, + "tokens_seen": 164233216 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799097291875627, + "loss": 3.3211, + "theoretical_loss": 4.465485522008629, + "tokens_seen": 164298752 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798996990972919, + "loss": 3.1246, + "theoretical_loss": 4.465255886017452, + "tokens_seen": 164364288 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047988966900702106, + "loss": 3.3521, + "theoretical_loss": 4.465026367194971, + "tokens_seen": 164429824 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047987963891675024, + "loss": 3.406, + "theoretical_loss": 4.464796965434738, + "tokens_seen": 164495360 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798696088264795, + "loss": 3.2096, + "theoretical_loss": 4.464567680630443, + "tokens_seen": 164560896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798595787362086, + "loss": 3.3375, + "theoretical_loss": 4.464338512675919, + "tokens_seen": 164626432 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047984954864593784, + "loss": 3.3015, + "theoretical_loss": 4.464109461465133, + "tokens_seen": 164691968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047983951855566696, + "loss": 3.1817, + "theoretical_loss": 4.4638805268921935, + "tokens_seen": 164757504 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798294884653962, + "loss": 3.3159, + "theoretical_loss": 4.463651708851346, + "tokens_seen": 164823040 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798194583751254, + "loss": 3.3364, + "theoretical_loss": 4.463423007236974, + "tokens_seen": 164888576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047980942828485456, + "loss": 3.5, + "theoretical_loss": 4.4631944219436, + "tokens_seen": 164954112 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047979939819458374, + "loss": 3.5072, + "theoretical_loss": 4.462965952865879, + "tokens_seen": 165019648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797893681043129, + "loss": 3.541, + "theoretical_loss": 4.46273759989861, + "tokens_seen": 165085184 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797793380140421, + "loss": 3.034, + "theoretical_loss": 4.462509362936723, + "tokens_seen": 165150720 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047976930792377134, + "loss": 3.3458, + "theoretical_loss": 4.46228124187529, + "tokens_seen": 165216256 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047975927783350047, + "loss": 3.3679, + "theoretical_loss": 4.462053236609516, + "tokens_seen": 165281792 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797492477432297, + "loss": 3.3828, + "theoretical_loss": 4.461825347034742, + "tokens_seen": 165347328 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797392176529589, + "loss": 3.3819, + "theoretical_loss": 4.461597573046449, + "tokens_seen": 165412864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 231649, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.920825958251953, + "objective/train/theoretical_loss": 4.461369914540247, + "objective/train/tokens_used": 185938400, + "theoretical_loss": 4.461369914540247, + "tokens_seen": 165478400 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047972918756268807, + "loss": 3.1735, + "theoretical_loss": 4.461369914540247, + "tokens_seen": 165478400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797191574724173, + "loss": 3.4608, + "theoretical_loss": 4.4611423714118885, + "tokens_seen": 165543936 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047970912738214643, + "loss": 3.1453, + "theoretical_loss": 4.460914943557256, + "tokens_seen": 165609472 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047969909729187566, + "loss": 3.3385, + "theoretical_loss": 4.460687630872371, + "tokens_seen": 165675008 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047968906720160484, + "loss": 3.3347, + "theoretical_loss": 4.46046043325339, + "tokens_seen": 165740544 + }, + { + "epoch": 0.05, + "learning_rate": 0.000479679037111334, + "loss": 3.1779, + "theoretical_loss": 4.460233350596599, + "tokens_seen": 165806080 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004796690070210632, + "loss": 3.3418, + "theoretical_loss": 4.460006382798425, + "tokens_seen": 165871616 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004796589769307924, + "loss": 3.1079, + "theoretical_loss": 4.459779529755423, + "tokens_seen": 165937152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047964894684052157, + "loss": 3.378, + "theoretical_loss": 4.459552791364288, + "tokens_seen": 166002688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004796389167502508, + "loss": 3.5243, + "theoretical_loss": 4.459326167521844, + "tokens_seen": 166068224 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047962888665997993, + "loss": 3.4306, + "theoretical_loss": 4.4590996581250515, + "tokens_seen": 166133760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047961885656970917, + "loss": 3.3535, + "theoretical_loss": 4.458873263071002, + "tokens_seen": 166199296 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047960882647943835, + "loss": 3.311, + "theoretical_loss": 4.458646982256921, + "tokens_seen": 166264832 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047959879638916753, + "loss": 3.0939, + "theoretical_loss": 4.458420815580169, + "tokens_seen": 166330368 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795887662988967, + "loss": 3.1869, + "theoretical_loss": 4.458194762938234, + "tokens_seen": 166395904 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795787362086259, + "loss": 3.3403, + "theoretical_loss": 4.457968824228743, + "tokens_seen": 166461440 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047956870611835507, + "loss": 3.3175, + "theoretical_loss": 4.457742999349449, + "tokens_seen": 166526976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795586760280843, + "loss": 3.4267, + "theoretical_loss": 4.4575172881982414, + "tokens_seen": 166592512 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047954864593781343, + "loss": 3.3691, + "theoretical_loss": 4.457291690673139, + "tokens_seen": 166658048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047953861584754267, + "loss": 3.4988, + "theoretical_loss": 4.457066206672291, + "tokens_seen": 166723584 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795285857572718, + "loss": 3.1917, + "theoretical_loss": 4.456840836093983, + "tokens_seen": 166789120 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047951855566700103, + "loss": 3.1793, + "theoretical_loss": 4.456615578836625, + "tokens_seen": 166854656 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795085255767302, + "loss": 3.4376, + "theoretical_loss": 4.456390434798762, + "tokens_seen": 166920192 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794984954864594, + "loss": 3.3839, + "theoretical_loss": 4.45616540387907, + "tokens_seen": 166985728 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794884653961886, + "loss": 3.4359, + "theoretical_loss": 4.4559404859763525, + "tokens_seen": 167051264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 232239, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8191466331481934, + "objective/train/theoretical_loss": 4.455715680989545, + "objective/train/tokens_used": 187576800, + "theoretical_loss": 4.455715680989545, + "tokens_seen": 167116800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047947843530591776, + "loss": 3.3247, + "theoretical_loss": 4.455715680989545, + "tokens_seen": 167116800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047946840521564694, + "loss": 3.442, + "theoretical_loss": 4.455490988817713, + "tokens_seen": 167182336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047945837512537617, + "loss": 3.4273, + "theoretical_loss": 4.4552664093600525, + "tokens_seen": 167247872 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794483450351053, + "loss": 3.3412, + "theoretical_loss": 4.455041942515887, + "tokens_seen": 167313408 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047943831494483453, + "loss": 3.2788, + "theoretical_loss": 4.454817588184669, + "tokens_seen": 167378944 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794282848545637, + "loss": 3.3126, + "theoretical_loss": 4.454593346265984, + "tokens_seen": 167444480 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794182547642929, + "loss": 3.5354, + "theoretical_loss": 4.454369216659542, + "tokens_seen": 167510016 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794082246740221, + "loss": 3.2904, + "theoretical_loss": 4.454145199265183, + "tokens_seen": 167575552 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047939819458375126, + "loss": 3.3353, + "theoretical_loss": 4.453921293982877, + "tokens_seen": 167641088 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047938816449348044, + "loss": 3.3379, + "theoretical_loss": 4.453697500712722, + "tokens_seen": 167706624 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793781344032097, + "loss": 3.1888, + "theoretical_loss": 4.453473819354942, + "tokens_seen": 167772160 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793681043129388, + "loss": 3.3098, + "theoretical_loss": 4.453250249809889, + "tokens_seen": 167837696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047935807422266804, + "loss": 3.3648, + "theoretical_loss": 4.453026791978045, + "tokens_seen": 167903232 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047934804413239716, + "loss": 3.4082, + "theoretical_loss": 4.4528034457600185, + "tokens_seen": 167968768 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793380140421264, + "loss": 3.472, + "theoretical_loss": 4.452580211056542, + "tokens_seen": 168034304 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793279839518556, + "loss": 3.1374, + "theoretical_loss": 4.452357087768481, + "tokens_seen": 168099840 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047931795386158476, + "loss": 3.3576, + "theoretical_loss": 4.45213407579682, + "tokens_seen": 168165376 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047930792377131394, + "loss": 3.4063, + "theoretical_loss": 4.451911175042679, + "tokens_seen": 168230912 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792978936810431, + "loss": 3.5888, + "theoretical_loss": 4.451688385407296, + "tokens_seen": 168296448 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792878635907723, + "loss": 3.4353, + "theoretical_loss": 4.451465706792041, + "tokens_seen": 168361984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047927783350050154, + "loss": 3.6028, + "theoretical_loss": 4.4512431390984055, + "tokens_seen": 168427520 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047926780341023067, + "loss": 3.5768, + "theoretical_loss": 4.451020682228011, + "tokens_seen": 168493056 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792577733199599, + "loss": 3.1725, + "theoretical_loss": 4.450798336082601, + "tokens_seen": 168558592 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792477432296891, + "loss": 3.0739, + "theoretical_loss": 4.450576100564046, + "tokens_seen": 168624128 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047923771313941827, + "loss": 3.16, + "theoretical_loss": 4.450353975574341, + "tokens_seen": 168689664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 233635, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.160566568374634, + "objective/train/theoretical_loss": 4.450131961015606, + "objective/train/tokens_used": 189215200, + "theoretical_loss": 4.450131961015606, + "tokens_seen": 168755200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047922768304914745, + "loss": 3.2785, + "theoretical_loss": 4.450131961015606, + "tokens_seen": 168755200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047921765295887663, + "loss": 3.401, + "theoretical_loss": 4.449910056790086, + "tokens_seen": 168820736 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792076228686058, + "loss": 3.122, + "theoretical_loss": 4.44968826280015, + "tokens_seen": 168886272 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047919759277833504, + "loss": 3.4615, + "theoretical_loss": 4.4494665789482895, + "tokens_seen": 168951808 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047918756268806417, + "loss": 3.2752, + "theoretical_loss": 4.449245005137125, + "tokens_seen": 169017344 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004791775325977934, + "loss": 3.2905, + "theoretical_loss": 4.449023541269395, + "tokens_seen": 169082880 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047916750250752253, + "loss": 3.3756, + "theoretical_loss": 4.448802187247966, + "tokens_seen": 169148416 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047915747241725177, + "loss": 3.5968, + "theoretical_loss": 4.448580942975825, + "tokens_seen": 169213952 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047914744232698095, + "loss": 3.2809, + "theoretical_loss": 4.448359808356084, + "tokens_seen": 169279488 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047913741223671013, + "loss": 3.5359, + "theoretical_loss": 4.448138783291979, + "tokens_seen": 169345024 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004791273821464393, + "loss": 3.4119, + "theoretical_loss": 4.447917867686863, + "tokens_seen": 169410560 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047911735205616855, + "loss": 3.4354, + "theoretical_loss": 4.44769706144422, + "tokens_seen": 169476096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004791073219658977, + "loss": 3.4738, + "theoretical_loss": 4.44747636446765, + "tokens_seen": 169541632 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790972918756269, + "loss": 3.6068, + "theoretical_loss": 4.447255776660878, + "tokens_seen": 169607168 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047908726178535604, + "loss": 3.2765, + "theoretical_loss": 4.44703529792775, + "tokens_seen": 169672704 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047907723169508527, + "loss": 3.2577, + "theoretical_loss": 4.446814928172234, + "tokens_seen": 169738240 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047906720160481445, + "loss": 3.3136, + "theoretical_loss": 4.446594667298421, + "tokens_seen": 169803776 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047905717151454363, + "loss": 3.4342, + "theoretical_loss": 4.446374515210521, + "tokens_seen": 169869312 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790471414242728, + "loss": 3.305, + "theoretical_loss": 4.446154471812866, + "tokens_seen": 169934848 + }, + { + "epoch": 0.05, + "learning_rate": 0.000479037111334002, + "loss": 3.4233, + "theoretical_loss": 4.445934537009911, + "tokens_seen": 170000384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790270812437312, + "loss": 3.5624, + "theoretical_loss": 4.445714710706228, + "tokens_seen": 170065920 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790170511534604, + "loss": 3.4564, + "theoretical_loss": 4.445494992806513, + "tokens_seen": 170131456 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047900702106318954, + "loss": 3.4515, + "theoretical_loss": 4.44527538321558, + "tokens_seen": 170196992 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789969909729188, + "loss": 3.3197, + "theoretical_loss": 4.445055881838365, + "tokens_seen": 170262528 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789869608826479, + "loss": 3.4949, + "theoretical_loss": 4.444836488579924, + "tokens_seen": 170328064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 234408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2099924087524414, + "objective/train/theoretical_loss": 4.44461720334543, + "objective/train/tokens_used": 190853600, + "theoretical_loss": 4.44461720334543, + "tokens_seen": 170393600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047897693079237714, + "loss": 3.2463, + "theoretical_loss": 4.44461720334543, + "tokens_seen": 170393600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047896690070210637, + "loss": 3.4087, + "theoretical_loss": 4.444398026040179, + "tokens_seen": 170459136 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789568706118355, + "loss": 3.4015, + "theoretical_loss": 4.444178956569585, + "tokens_seen": 170524672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047894684052156473, + "loss": 3.2236, + "theoretical_loss": 4.443959994839181, + "tokens_seen": 170590208 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789368104312939, + "loss": 3.3624, + "theoretical_loss": 4.44374114075462, + "tokens_seen": 170655744 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789267803410231, + "loss": 3.4993, + "theoretical_loss": 4.443522394221671, + "tokens_seen": 170721280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789167502507523, + "loss": 3.3937, + "theoretical_loss": 4.443303755146225, + "tokens_seen": 170786816 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047890672016048146, + "loss": 3.2099, + "theoretical_loss": 4.443085223434291, + "tokens_seen": 170852352 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047889669007021064, + "loss": 3.2229, + "theoretical_loss": 4.442866798991993, + "tokens_seen": 170917888 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788866599799399, + "loss": 3.38, + "theoretical_loss": 4.442648481725577, + "tokens_seen": 170983424 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478876629889669, + "loss": 3.5082, + "theoretical_loss": 4.442430271541404, + "tokens_seen": 171048960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047886659979939824, + "loss": 3.2701, + "theoretical_loss": 4.442212168345956, + "tokens_seen": 171114496 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047885656970912736, + "loss": 3.405, + "theoretical_loss": 4.4419941720458285, + "tokens_seen": 171180032 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788465396188566, + "loss": 3.0497, + "theoretical_loss": 4.441776282547736, + "tokens_seen": 171245568 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788365095285858, + "loss": 3.2281, + "theoretical_loss": 4.441558499758511, + "tokens_seen": 171311104 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047882647943831496, + "loss": 3.3353, + "theoretical_loss": 4.441340823585101, + "tokens_seen": 171376640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047881644934804414, + "loss": 3.2502, + "theoretical_loss": 4.441123253934572, + "tokens_seen": 171442176 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788064192577733, + "loss": 3.36, + "theoretical_loss": 4.440905790714105, + "tokens_seen": 171507712 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004787963891675025, + "loss": 3.2246, + "theoretical_loss": 4.440688433830999, + "tokens_seen": 171573248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047878635907723174, + "loss": 3.1875, + "theoretical_loss": 4.440471183192667, + "tokens_seen": 171638784 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047877632898696087, + "loss": 3.2297, + "theoretical_loss": 4.440254038706639, + "tokens_seen": 171704320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004787662988966901, + "loss": 3.4318, + "theoretical_loss": 4.440037000280561, + "tokens_seen": 171769856 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004787562688064193, + "loss": 3.298, + "theoretical_loss": 4.439820067822195, + "tokens_seen": 171835392 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047874623871614847, + "loss": 3.5067, + "theoretical_loss": 4.439603241239416, + "tokens_seen": 171900928 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047873620862587765, + "loss": 3.1999, + "theoretical_loss": 4.439386520440218, + "tokens_seen": 171966464 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 235647, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.488489866256714, + "objective/train/theoretical_loss": 4.439169905332706, + "objective/train/tokens_used": 192492000, + "theoretical_loss": 4.439169905332706, + "tokens_seen": 172032000 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047872617853560683, + "loss": 3.6249, + "theoretical_loss": 4.439169905332706, + "tokens_seen": 172032000 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478716148445336, + "loss": 3.4185, + "theoretical_loss": 4.438953395825102, + "tokens_seen": 172097536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047870611835506524, + "loss": 3.4638, + "theoretical_loss": 4.438736991825744, + "tokens_seen": 172163072 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047869608826479437, + "loss": 2.9901, + "theoretical_loss": 4.438520693243079, + "tokens_seen": 172228608 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786860581745236, + "loss": 3.3508, + "theoretical_loss": 4.4383044999856756, + "tokens_seen": 172294144 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047867602808425273, + "loss": 3.006, + "theoretical_loss": 4.438088411962211, + "tokens_seen": 172359680 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047866599799398197, + "loss": 3.2261, + "theoretical_loss": 4.437872429081477, + "tokens_seen": 172425216 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047865596790371115, + "loss": 3.3128, + "theoretical_loss": 4.437656551252381, + "tokens_seen": 172490752 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047864593781344033, + "loss": 3.1811, + "theoretical_loss": 4.4374407783839445, + "tokens_seen": 172556288 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786359077231695, + "loss": 3.6537, + "theoretical_loss": 4.437225110385297, + "tokens_seen": 172621824 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047862587763289875, + "loss": 3.3197, + "theoretical_loss": 4.4370095471656885, + "tokens_seen": 172687360 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786158475426279, + "loss": 3.2659, + "theoretical_loss": 4.436794088634477, + "tokens_seen": 172752896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786058174523571, + "loss": 3.432, + "theoretical_loss": 4.4365787347011345, + "tokens_seen": 172818432 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047859578736208624, + "loss": 3.4144, + "theoretical_loss": 4.436363485275246, + "tokens_seen": 172883968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047858575727181547, + "loss": 3.4017, + "theoretical_loss": 4.436148340266508, + "tokens_seen": 172949504 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047857572718154465, + "loss": 3.7135, + "theoretical_loss": 4.435933299584729, + "tokens_seen": 173015040 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047856569709127383, + "loss": 3.2931, + "theoretical_loss": 4.4357183631398325, + "tokens_seen": 173080576 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478555667001003, + "loss": 3.3342, + "theoretical_loss": 4.435503530841849, + "tokens_seen": 173146112 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004785456369107322, + "loss": 3.1036, + "theoretical_loss": 4.435288802600926, + "tokens_seen": 173211648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004785356068204614, + "loss": 3.4167, + "theoretical_loss": 4.4350741783273175, + "tokens_seen": 173277184 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004785255767301906, + "loss": 3.3526, + "theoretical_loss": 4.434859657931392, + "tokens_seen": 173342720 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047851554663991974, + "loss": 3.3756, + "theoretical_loss": 4.434645241323629, + "tokens_seen": 173408256 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478505516549649, + "loss": 3.3329, + "theoretical_loss": 4.434430928414617, + "tokens_seen": 173473792 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784954864593781, + "loss": 3.2552, + "theoretical_loss": 4.434216719115057, + "tokens_seen": 173539328 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047848545636910734, + "loss": 3.1126, + "theoretical_loss": 4.43400261333576, + "tokens_seen": 173604864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 236140, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.18463397026062, + "objective/train/theoretical_loss": 4.433788610987646, + "objective/train/tokens_used": 194130400, + "theoretical_loss": 4.433788610987646, + "tokens_seen": 173670400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784754262788365, + "loss": 3.3947, + "theoretical_loss": 4.433788610987646, + "tokens_seen": 173670400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784653961885657, + "loss": 3.4461, + "theoretical_loss": 4.433574711981749, + "tokens_seen": 173735936 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784553660982949, + "loss": 3.2924, + "theoretical_loss": 4.433360916229209, + "tokens_seen": 173801472 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784453360080241, + "loss": 3.3511, + "theoretical_loss": 4.433147223641278, + "tokens_seen": 173867008 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047843530591775324, + "loss": 3.1406, + "theoretical_loss": 4.432933634129318, + "tokens_seen": 173932544 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784252758274825, + "loss": 3.3653, + "theoretical_loss": 4.4327201476047975, + "tokens_seen": 173998080 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784152457372116, + "loss": 3.1941, + "theoretical_loss": 4.432506763979299, + "tokens_seen": 174063616 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047840521564694084, + "loss": 3.2088, + "theoretical_loss": 4.432293483164512, + "tokens_seen": 174129152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047839518555667, + "loss": 3.4825, + "theoretical_loss": 4.432080305072233, + "tokens_seen": 174194688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783851554663992, + "loss": 3.1969, + "theoretical_loss": 4.43186722961437, + "tokens_seen": 174260224 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783751253761284, + "loss": 3.3585, + "theoretical_loss": 4.431654256702938, + "tokens_seen": 174325760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047836509528585756, + "loss": 3.2937, + "theoretical_loss": 4.431441386250063, + "tokens_seen": 174391296 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047835506519558675, + "loss": 3.3733, + "theoretical_loss": 4.4312286181679745, + "tokens_seen": 174456832 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478345035105316, + "loss": 3.1708, + "theoretical_loss": 4.431015952369016, + "tokens_seen": 174522368 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783350050150451, + "loss": 3.4721, + "theoretical_loss": 4.430803388765636, + "tokens_seen": 174587904 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047832497492477434, + "loss": 3.3855, + "theoretical_loss": 4.430590927270388, + "tokens_seen": 174653440 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047831494483450347, + "loss": 3.2205, + "theoretical_loss": 4.430378567795938, + "tokens_seen": 174718976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783049147442327, + "loss": 3.3129, + "theoretical_loss": 4.430166310255057, + "tokens_seen": 174784512 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782948846539619, + "loss": 3.6459, + "theoretical_loss": 4.429954154560624, + "tokens_seen": 174850048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047828485456369107, + "loss": 3.3131, + "theoretical_loss": 4.429742100625624, + "tokens_seen": 174915584 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047827482447342025, + "loss": 3.2811, + "theoretical_loss": 4.429530148363151, + "tokens_seen": 174981120 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782647943831495, + "loss": 3.2996, + "theoretical_loss": 4.429318297686402, + "tokens_seen": 175046656 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782547642928786, + "loss": 3.3326, + "theoretical_loss": 4.429106548508685, + "tokens_seen": 175112192 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047824473420260785, + "loss": 3.1878, + "theoretical_loss": 4.428894900743411, + "tokens_seen": 175177728 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478234704112337, + "loss": 3.214, + "theoretical_loss": 4.428683354304098, + "tokens_seen": 175243264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 237488, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5905725955963135, + "objective/train/theoretical_loss": 4.428471909104372, + "objective/train/tokens_used": 195768800, + "theoretical_loss": 4.428471909104372, + "tokens_seen": 175308800 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782246740220662, + "loss": 3.1411, + "theoretical_loss": 4.428471909104372, + "tokens_seen": 175308800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047821464393179544, + "loss": 3.3937, + "theoretical_loss": 4.428260565057964, + "tokens_seen": 175374336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047820461384152457, + "loss": 3.3474, + "theoretical_loss": 4.428049322078708, + "tokens_seen": 175439872 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781945837512538, + "loss": 3.3299, + "theoretical_loss": 4.427838180080547, + "tokens_seen": 175505408 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047818455366098293, + "loss": 3.3159, + "theoretical_loss": 4.4276271389775275, + "tokens_seen": 175570944 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047817452357071217, + "loss": 3.3342, + "theoretical_loss": 4.427416198683803, + "tokens_seen": 175636480 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047816449348044135, + "loss": 3.2288, + "theoretical_loss": 4.427205359113629, + "tokens_seen": 175702016 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047815446339017053, + "loss": 3.5414, + "theoretical_loss": 4.42699462018137, + "tokens_seen": 175767552 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781444332998997, + "loss": 3.389, + "theoretical_loss": 4.42678398180149, + "tokens_seen": 175833088 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047813440320962895, + "loss": 3.2056, + "theoretical_loss": 4.426573443888563, + "tokens_seen": 175898624 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781243731193581, + "loss": 3.2972, + "theoretical_loss": 4.426363006357263, + "tokens_seen": 175964160 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781143430290873, + "loss": 3.4419, + "theoretical_loss": 4.426152669122374, + "tokens_seen": 176029696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047810431293881644, + "loss": 3.285, + "theoretical_loss": 4.425942432098774, + "tokens_seen": 176095232 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047809428284854567, + "loss": 3.3969, + "theoretical_loss": 4.425732295201455, + "tokens_seen": 176160768 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047808425275827485, + "loss": 3.2307, + "theoretical_loss": 4.425522258345508, + "tokens_seen": 176226304 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047807422266800403, + "loss": 3.4295, + "theoretical_loss": 4.425312321446127, + "tokens_seen": 176291840 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780641925777332, + "loss": 3.1458, + "theoretical_loss": 4.425102484418613, + "tokens_seen": 176357376 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780541624874624, + "loss": 3.3972, + "theoretical_loss": 4.424892747178365, + "tokens_seen": 176422912 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780441323971916, + "loss": 3.527, + "theoretical_loss": 4.42468310964089, + "tokens_seen": 176488448 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780341023069208, + "loss": 3.4912, + "theoretical_loss": 4.424473571721794, + "tokens_seen": 176553984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047802407221664994, + "loss": 3.2588, + "theoretical_loss": 4.42426413333679, + "tokens_seen": 176619520 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780140421263792, + "loss": 3.398, + "theoretical_loss": 4.424054794401689, + "tokens_seen": 176685056 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780040120361083, + "loss": 3.3066, + "theoretical_loss": 4.423845554832406, + "tokens_seen": 176750592 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047799398194583754, + "loss": 3.4778, + "theoretical_loss": 4.42363641454496, + "tokens_seen": 176816128 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779839518555667, + "loss": 3.4736, + "theoretical_loss": 4.423427373455471, + "tokens_seen": 176881664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 238146, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.402047634124756, + "objective/train/theoretical_loss": 4.42321843148016, + "objective/train/tokens_used": 197407200, + "theoretical_loss": 4.42321843148016, + "tokens_seen": 176947200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779739217652959, + "loss": 3.4011, + "theoretical_loss": 4.42321843148016, + "tokens_seen": 176947200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779638916750251, + "loss": 3.2247, + "theoretical_loss": 4.423009588535351, + "tokens_seen": 177012736 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779538615847543, + "loss": 3.5019, + "theoretical_loss": 4.422800844537466, + "tokens_seen": 177078272 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047794383149448344, + "loss": 3.3271, + "theoretical_loss": 4.422592199403036, + "tokens_seen": 177143808 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779338014042127, + "loss": 3.1612, + "theoretical_loss": 4.422383653048685, + "tokens_seen": 177209344 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779237713139418, + "loss": 3.5149, + "theoretical_loss": 4.422175205391145, + "tokens_seen": 177274880 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047791374122367104, + "loss": 3.4788, + "theoretical_loss": 4.421966856347243, + "tokens_seen": 177340416 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779037111334002, + "loss": 3.3556, + "theoretical_loss": 4.421758605833912, + "tokens_seen": 177405952 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778936810431294, + "loss": 3.2858, + "theoretical_loss": 4.421550453768181, + "tokens_seen": 177471488 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778836509528586, + "loss": 3.2064, + "theoretical_loss": 4.421342400067183, + "tokens_seen": 177537024 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047787362086258776, + "loss": 3.2573, + "theoretical_loss": 4.42113444464815, + "tokens_seen": 177602560 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047786359077231695, + "loss": 3.1305, + "theoretical_loss": 4.420926587428411, + "tokens_seen": 177668096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778535606820462, + "loss": 3.3526, + "theoretical_loss": 4.420718828325403, + "tokens_seen": 177733632 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778435305917753, + "loss": 3.4669, + "theoretical_loss": 4.420511167256656, + "tokens_seen": 177799168 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047783350050150454, + "loss": 3.2721, + "theoretical_loss": 4.4203036041398, + "tokens_seen": 177864704 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047782347041123367, + "loss": 3.4746, + "theoretical_loss": 4.420096138892568, + "tokens_seen": 177930240 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778134403209629, + "loss": 3.2158, + "theoretical_loss": 4.419888771432789, + "tokens_seen": 177995776 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778034102306921, + "loss": 3.5373, + "theoretical_loss": 4.419681501678395, + "tokens_seen": 178061312 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047779338014042127, + "loss": 3.2966, + "theoretical_loss": 4.419474329547413, + "tokens_seen": 178126848 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047778335005015045, + "loss": 3.4379, + "theoretical_loss": 4.419267254957971, + "tokens_seen": 178192384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777733199598797, + "loss": 3.5645, + "theoretical_loss": 4.419060277828295, + "tokens_seen": 178257920 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777632898696088, + "loss": 3.3582, + "theoretical_loss": 4.41885339807671, + "tokens_seen": 178323456 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047775325977933805, + "loss": 3.2314, + "theoretical_loss": 4.4186466156216415, + "tokens_seen": 178388992 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777432296890672, + "loss": 3.3675, + "theoretical_loss": 4.418439930381609, + "tokens_seen": 178454528 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777331995987964, + "loss": 3.1722, + "theoretical_loss": 4.418233342275233, + "tokens_seen": 178520064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 239453, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.536768674850464, + "objective/train/theoretical_loss": 4.418026851221231, + "objective/train/tokens_used": 199045600, + "theoretical_loss": 4.418026851221231, + "tokens_seen": 178585600 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777231695085256, + "loss": 3.5599, + "theoretical_loss": 4.418026851221231, + "tokens_seen": 178585600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047771313941825477, + "loss": 3.3849, + "theoretical_loss": 4.4178204571384185, + "tokens_seen": 178651136 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047770310932798395, + "loss": 3.2237, + "theoretical_loss": 4.41761415994571, + "tokens_seen": 178716672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047769307923771313, + "loss": 3.3179, + "theoretical_loss": 4.417407959562116, + "tokens_seen": 178782208 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776830491474423, + "loss": 3.0157, + "theoretical_loss": 4.417201855906742, + "tokens_seen": 178847744 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047767301905717155, + "loss": 3.4205, + "theoretical_loss": 4.416995848898797, + "tokens_seen": 178913280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776629889669007, + "loss": 3.5319, + "theoretical_loss": 4.4167899384575815, + "tokens_seen": 178978816 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776529588766299, + "loss": 3.3096, + "theoretical_loss": 4.416584124502495, + "tokens_seen": 179044352 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047764292878635904, + "loss": 3.5886, + "theoretical_loss": 4.416378406953033, + "tokens_seen": 179109888 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776328986960883, + "loss": 3.4965, + "theoretical_loss": 4.41617278572879, + "tokens_seen": 179175424 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047762286860581746, + "loss": 3.2969, + "theoretical_loss": 4.4159672607494524, + "tokens_seen": 179240960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047761283851554664, + "loss": 3.3703, + "theoretical_loss": 4.415761831934808, + "tokens_seen": 179306496 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776028084252758, + "loss": 3.363, + "theoretical_loss": 4.415556499204737, + "tokens_seen": 179372032 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047759277833500505, + "loss": 3.4131, + "theoretical_loss": 4.415351262479216, + "tokens_seen": 179437568 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775827482447342, + "loss": 3.2837, + "theoretical_loss": 4.415146121678321, + "tokens_seen": 179503104 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775727181544634, + "loss": 3.3687, + "theoretical_loss": 4.414941076722219, + "tokens_seen": 179568640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047756268806419254, + "loss": 3.2755, + "theoretical_loss": 4.4147361275311745, + "tokens_seen": 179634176 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775526579739218, + "loss": 3.4791, + "theoretical_loss": 4.414531274025548, + "tokens_seen": 179699712 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047754262788365096, + "loss": 3.3726, + "theoretical_loss": 4.414326516125795, + "tokens_seen": 179765248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047753259779338014, + "loss": 3.4113, + "theoretical_loss": 4.414121853752466, + "tokens_seen": 179830784 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775225677031093, + "loss": 3.3512, + "theoretical_loss": 4.413917286826205, + "tokens_seen": 179896320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775125376128385, + "loss": 3.4069, + "theoretical_loss": 4.413712815267752, + "tokens_seen": 179961856 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775025075225677, + "loss": 3.1901, + "theoretical_loss": 4.413508438997944, + "tokens_seen": 180027392 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774924774322969, + "loss": 3.5482, + "theoretical_loss": 4.4133041579377075, + "tokens_seen": 180092928 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047748244734202605, + "loss": 3.3269, + "theoretical_loss": 4.413099972008068, + "tokens_seen": 180158464 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 240101, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.189091444015503, + "objective/train/theoretical_loss": 4.412895881130142, + "objective/train/tokens_used": 200684000, + "theoretical_loss": 4.412895881130142, + "tokens_seen": 180224000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774724172517553, + "loss": 3.2624, + "theoretical_loss": 4.412895881130142, + "tokens_seen": 180224000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774623871614845, + "loss": 3.4578, + "theoretical_loss": 4.412691885225141, + "tokens_seen": 180289536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047745235707121364, + "loss": 3.3567, + "theoretical_loss": 4.412487984214373, + "tokens_seen": 180355072 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774423269809429, + "loss": 3.2258, + "theoretical_loss": 4.412284178019235, + "tokens_seen": 180420608 + }, + { + "epoch": 0.05, + "learning_rate": 0.000477432296890672, + "loss": 3.2944, + "theoretical_loss": 4.412080466561221, + "tokens_seen": 180486144 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047742226680040124, + "loss": 3.3614, + "theoretical_loss": 4.411876849761917, + "tokens_seen": 180551680 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774122367101304, + "loss": 3.1865, + "theoretical_loss": 4.411673327543005, + "tokens_seen": 180617216 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774022066198596, + "loss": 3.517, + "theoretical_loss": 4.4114698998262565, + "tokens_seen": 180682752 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773921765295888, + "loss": 3.5934, + "theoretical_loss": 4.411266566533539, + "tokens_seen": 180748288 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047738214643931797, + "loss": 3.0994, + "theoretical_loss": 4.41106332758681, + "tokens_seen": 180813824 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047737211634904715, + "loss": 3.6293, + "theoretical_loss": 4.41086018290812, + "tokens_seen": 180879360 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773620862587764, + "loss": 3.3159, + "theoretical_loss": 4.410657132419617, + "tokens_seen": 180944896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773520561685055, + "loss": 3.2151, + "theoretical_loss": 4.410454176043537, + "tokens_seen": 181010432 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047734202607823474, + "loss": 3.3866, + "theoretical_loss": 4.410251313702208, + "tokens_seen": 181075968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047733199598796387, + "loss": 3.3264, + "theoretical_loss": 4.410048545318052, + "tokens_seen": 181141504 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773219658976931, + "loss": 3.3692, + "theoretical_loss": 4.409845870813582, + "tokens_seen": 181207040 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773119358074223, + "loss": 3.4274, + "theoretical_loss": 4.409643290111404, + "tokens_seen": 181272576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047730190571715147, + "loss": 3.2483, + "theoretical_loss": 4.409440803134215, + "tokens_seen": 181338112 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047729187562688065, + "loss": 3.2365, + "theoretical_loss": 4.409238409804804, + "tokens_seen": 181403648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004772818455366099, + "loss": 3.2074, + "theoretical_loss": 4.409036110046051, + "tokens_seen": 181469184 + }, + { + "epoch": 0.06, + "learning_rate": 0.000477271815446339, + "loss": 3.0851, + "theoretical_loss": 4.408833903780926, + "tokens_seen": 181534720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047726178535606825, + "loss": 3.2768, + "theoretical_loss": 4.408631790932494, + "tokens_seen": 181600256 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004772517552657974, + "loss": 3.7316, + "theoretical_loss": 4.408429771423909, + "tokens_seen": 181665792 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004772417251755266, + "loss": 3.1774, + "theoretical_loss": 4.408227845178414, + "tokens_seen": 181731328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004772316950852558, + "loss": 3.3076, + "theoretical_loss": 4.408026012119344, + "tokens_seen": 181796864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 241412, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7472660541534424, + "objective/train/theoretical_loss": 4.407824272170128, + "objective/train/tokens_used": 202322400, + "theoretical_loss": 4.407824272170128, + "tokens_seen": 181862400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047722166499498497, + "loss": 3.2786, + "theoretical_loss": 4.407824272170128, + "tokens_seen": 181862400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047721163490471415, + "loss": 3.4129, + "theoretical_loss": 4.407622625254279, + "tokens_seen": 181927936 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047720160481444333, + "loss": 2.838, + "theoretical_loss": 4.407421071295406, + "tokens_seen": 181993472 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771915747241725, + "loss": 3.2404, + "theoretical_loss": 4.407219610217206, + "tokens_seen": 182059008 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047718154463390175, + "loss": 3.4927, + "theoretical_loss": 4.407018241943467, + "tokens_seen": 182124544 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771715145436309, + "loss": 3.1684, + "theoretical_loss": 4.406816966398064, + "tokens_seen": 182190080 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771614844533601, + "loss": 3.3542, + "theoretical_loss": 4.406615783504965, + "tokens_seen": 182255616 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047715145436308924, + "loss": 3.2972, + "theoretical_loss": 4.4064146931882275, + "tokens_seen": 182321152 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771414242728185, + "loss": 3.0567, + "theoretical_loss": 4.406213695371996, + "tokens_seen": 182386688 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047713139418254766, + "loss": 3.2913, + "theoretical_loss": 4.406012789980506, + "tokens_seen": 182452224 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047712136409227684, + "loss": 3.425, + "theoretical_loss": 4.405811976938084, + "tokens_seen": 182517760 + }, + { + "epoch": 0.06, + "learning_rate": 0.000477111334002006, + "loss": 3.3947, + "theoretical_loss": 4.405611256169143, + "tokens_seen": 182583296 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047710130391173525, + "loss": 3.2788, + "theoretical_loss": 4.405410627598185, + "tokens_seen": 182648832 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770912738214644, + "loss": 3.2396, + "theoretical_loss": 4.405210091149802, + "tokens_seen": 182714368 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770812437311936, + "loss": 3.6049, + "theoretical_loss": 4.405009646748674, + "tokens_seen": 182779904 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047707121364092274, + "loss": 3.2774, + "theoretical_loss": 4.404809294319572, + "tokens_seen": 182845440 + }, + { + "epoch": 0.06, + "learning_rate": 0.000477061183550652, + "loss": 3.1352, + "theoretical_loss": 4.40460903378735, + "tokens_seen": 182910976 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047705115346038116, + "loss": 3.5256, + "theoretical_loss": 4.404408865076955, + "tokens_seen": 182976512 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047704112337011034, + "loss": 3.3179, + "theoretical_loss": 4.404208788113422, + "tokens_seen": 183042048 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770310932798395, + "loss": 3.368, + "theoretical_loss": 4.404008802821871, + "tokens_seen": 183107584 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770210631895687, + "loss": 3.4253, + "theoretical_loss": 4.4038089091275125, + "tokens_seen": 183173120 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770110330992979, + "loss": 3.3234, + "theoretical_loss": 4.403609106955645, + "tokens_seen": 183238656 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770010030090271, + "loss": 3.3547, + "theoretical_loss": 4.403409396231651, + "tokens_seen": 183304192 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047699097291875625, + "loss": 3.3308, + "theoretical_loss": 4.403209776881004, + "tokens_seen": 183369728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769809428284855, + "loss": 3.3826, + "theoretical_loss": 4.403010248829265, + "tokens_seen": 183435264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 241901, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.666914463043213, + "objective/train/theoretical_loss": 4.4028108120020795, + "objective/train/tokens_used": 203960800, + "theoretical_loss": 4.4028108120020795, + "tokens_seen": 183500800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047697091273821466, + "loss": 3.347, + "theoretical_loss": 4.4028108120020795, + "tokens_seen": 183500800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047696088264794384, + "loss": 3.3715, + "theoretical_loss": 4.402611466325182, + "tokens_seen": 183566336 + }, + { + "epoch": 0.06, + "learning_rate": 0.000476950852557673, + "loss": 3.5634, + "theoretical_loss": 4.4024122117243945, + "tokens_seen": 183631872 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769408224674022, + "loss": 3.0663, + "theoretical_loss": 4.402213048125624, + "tokens_seen": 183697408 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769307923771314, + "loss": 3.059, + "theoretical_loss": 4.4020139754548655, + "tokens_seen": 183762944 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769207622868606, + "loss": 3.1995, + "theoretical_loss": 4.401814993638199, + "tokens_seen": 183828480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047691073219658975, + "loss": 3.1642, + "theoretical_loss": 4.4016161026017935, + "tokens_seen": 183894016 + }, + { + "epoch": 0.06, + "learning_rate": 0.000476900702106319, + "loss": 2.9642, + "theoretical_loss": 4.401417302271902, + "tokens_seen": 183959552 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768906720160481, + "loss": 3.405, + "theoretical_loss": 4.401218592574865, + "tokens_seen": 184025088 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047688064192577735, + "loss": 3.362, + "theoretical_loss": 4.401019973437108, + "tokens_seen": 184090624 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047687061183550653, + "loss": 3.416, + "theoretical_loss": 4.400821444785143, + "tokens_seen": 184156160 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768605817452357, + "loss": 3.1617, + "theoretical_loss": 4.400623006545567, + "tokens_seen": 184221696 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768505516549649, + "loss": 3.4902, + "theoretical_loss": 4.400424658645065, + "tokens_seen": 184287232 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047684052156469407, + "loss": 3.2861, + "theoretical_loss": 4.400226401010404, + "tokens_seen": 184352768 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047683049147442325, + "loss": 3.3124, + "theoretical_loss": 4.40002823356844, + "tokens_seen": 184418304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768204613841525, + "loss": 2.9837, + "theoretical_loss": 4.39983015624611, + "tokens_seen": 184483840 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768104312938816, + "loss": 3.2424, + "theoretical_loss": 4.39963216897044, + "tokens_seen": 184549376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047680040120361085, + "loss": 3.2719, + "theoretical_loss": 4.3994342716685395, + "tokens_seen": 184614912 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047679037111334003, + "loss": 3.1485, + "theoretical_loss": 4.399236464267602, + "tokens_seen": 184680448 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767803410230692, + "loss": 3.3108, + "theoretical_loss": 4.399038746694908, + "tokens_seen": 184745984 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767703109327984, + "loss": 3.135, + "theoretical_loss": 4.398841118877819, + "tokens_seen": 184811520 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767602808425276, + "loss": 3.4858, + "theoretical_loss": 4.398643580743785, + "tokens_seen": 184877056 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047675025075225676, + "loss": 3.272, + "theoretical_loss": 4.398446132220338, + "tokens_seen": 184942592 + }, + { + "epoch": 0.06, + "learning_rate": 0.000476740220661986, + "loss": 3.4381, + "theoretical_loss": 4.3982487732350934, + "tokens_seen": 185008128 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767301905717151, + "loss": 3.3967, + "theoretical_loss": 4.398051503715753, + "tokens_seen": 185073664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 243537, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3711562156677246, + "objective/train/theoretical_loss": 4.397854323590102, + "objective/train/tokens_used": 205599200, + "theoretical_loss": 4.397854323590102, + "tokens_seen": 185139200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047672016048144435, + "loss": 3.0928, + "theoretical_loss": 4.397854323590102, + "tokens_seen": 185139200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047671013039117353, + "loss": 3.2136, + "theoretical_loss": 4.397657232786008, + "tokens_seen": 185204736 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767001003009027, + "loss": 3.3584, + "theoretical_loss": 4.397460231231424, + "tokens_seen": 185270272 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047669007021063195, + "loss": 3.4504, + "theoretical_loss": 4.397263318854384, + "tokens_seen": 185335808 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766800401203611, + "loss": 3.3679, + "theoretical_loss": 4.39706649558301, + "tokens_seen": 185401344 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766700100300903, + "loss": 3.254, + "theoretical_loss": 4.396869761345503, + "tokens_seen": 185466880 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047665997993981944, + "loss": 3.0167, + "theoretical_loss": 4.396673116070147, + "tokens_seen": 185532416 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766499498495487, + "loss": 3.397, + "theoretical_loss": 4.396476559685315, + "tokens_seen": 185597952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047663991975927786, + "loss": 3.1028, + "theoretical_loss": 4.396280092119455, + "tokens_seen": 185663488 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047662988966900704, + "loss": 3.3503, + "theoretical_loss": 4.3960837133011035, + "tokens_seen": 185729024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766198595787362, + "loss": 3.2027, + "theoretical_loss": 4.395887423158877, + "tokens_seen": 185794560 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047660982948846545, + "loss": 3.36, + "theoretical_loss": 4.395691221621476, + "tokens_seen": 185860096 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765997993981946, + "loss": 3.4191, + "theoretical_loss": 4.395495108617682, + "tokens_seen": 185925632 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765897693079238, + "loss": 3.4474, + "theoretical_loss": 4.39529908407636, + "tokens_seen": 185991168 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047657973921765294, + "loss": 3.2397, + "theoretical_loss": 4.3951031479264575, + "tokens_seen": 186056704 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765697091273822, + "loss": 3.4729, + "theoretical_loss": 4.394907300097002, + "tokens_seen": 186122240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047655967903711136, + "loss": 3.2495, + "theoretical_loss": 4.394711540517106, + "tokens_seen": 186187776 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047654964894684054, + "loss": 3.4181, + "theoretical_loss": 4.39451586911596, + "tokens_seen": 186253312 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765396188565697, + "loss": 3.5468, + "theoretical_loss": 4.39432028582284, + "tokens_seen": 186318848 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765295887662989, + "loss": 3.111, + "theoretical_loss": 4.394124790567101, + "tokens_seen": 186384384 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765195586760281, + "loss": 3.3198, + "theoretical_loss": 4.3939293832781825, + "tokens_seen": 186449920 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765095285857573, + "loss": 3.2602, + "theoretical_loss": 4.393734063885599, + "tokens_seen": 186515456 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047649949849548645, + "loss": 3.0038, + "theoretical_loss": 4.3935388323189555, + "tokens_seen": 186580992 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764894684052157, + "loss": 3.371, + "theoretical_loss": 4.39334368850793, + "tokens_seen": 186646528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047647943831494486, + "loss": 3.4606, + "theoretical_loss": 4.3931486323822835, + "tokens_seen": 186712064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 246703, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3764562606811523, + "objective/train/theoretical_loss": 4.392953663871862, + "objective/train/tokens_used": 207237600, + "theoretical_loss": 4.392953663871862, + "tokens_seen": 186777600 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047646940822467404, + "loss": 3.2311, + "theoretical_loss": 4.392953663871862, + "tokens_seen": 186777600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764593781344032, + "loss": 3.2135, + "theoretical_loss": 4.392758782906586, + "tokens_seen": 186843136 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764493480441324, + "loss": 3.5582, + "theoretical_loss": 4.392563989416462, + "tokens_seen": 186908672 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764393179538616, + "loss": 3.4679, + "theoretical_loss": 4.392369283331574, + "tokens_seen": 186974208 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764292878635908, + "loss": 3.3004, + "theoretical_loss": 4.392174664582085, + "tokens_seen": 187039744 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047641925777331995, + "loss": 3.3981, + "theoretical_loss": 4.391980133098244, + "tokens_seen": 187105280 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764092276830492, + "loss": 3.248, + "theoretical_loss": 4.391785688810373, + "tokens_seen": 187170816 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763991975927783, + "loss": 3.6385, + "theoretical_loss": 4.391591331648879, + "tokens_seen": 187236352 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047638916750250755, + "loss": 3.2769, + "theoretical_loss": 4.391397061544247, + "tokens_seen": 187301888 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047637913741223673, + "loss": 3.1636, + "theoretical_loss": 4.391202878427042, + "tokens_seen": 187367424 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763691073219659, + "loss": 3.3382, + "theoretical_loss": 4.3910087822279085, + "tokens_seen": 187432960 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763590772316951, + "loss": 3.3282, + "theoretical_loss": 4.390814772877571, + "tokens_seen": 187498496 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047634904714142427, + "loss": 3.4497, + "theoretical_loss": 4.390620850306832, + "tokens_seen": 187564032 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047633901705115345, + "loss": 3.5448, + "theoretical_loss": 4.390427014446575, + "tokens_seen": 187629568 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763289869608827, + "loss": 3.5607, + "theoretical_loss": 4.390233265227764, + "tokens_seen": 187695104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763189568706118, + "loss": 3.2674, + "theoretical_loss": 4.390039602581437, + "tokens_seen": 187760640 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047630892678034105, + "loss": 3.3578, + "theoretical_loss": 4.389846026438715, + "tokens_seen": 187826176 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047629889669007023, + "loss": 3.3827, + "theoretical_loss": 4.3896525367307975, + "tokens_seen": 187891712 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762888665997994, + "loss": 3.2597, + "theoretical_loss": 4.389459133388962, + "tokens_seen": 187957248 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762788365095286, + "loss": 3.1751, + "theoretical_loss": 4.3892658163445635, + "tokens_seen": 188022784 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762688064192578, + "loss": 3.3537, + "theoretical_loss": 4.389072585529037, + "tokens_seen": 188088320 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047625877632898696, + "loss": 2.8638, + "theoretical_loss": 4.388879440873897, + "tokens_seen": 188153856 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762487462387162, + "loss": 3.4062, + "theoretical_loss": 4.388686382310732, + "tokens_seen": 188219392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762387161484453, + "loss": 3.1585, + "theoretical_loss": 4.388493409771213, + "tokens_seen": 188284928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047622868605817455, + "loss": 3.134, + "theoretical_loss": 4.388300523187087, + "tokens_seen": 188350464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 251659, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.317779541015625, + "objective/train/theoretical_loss": 4.3881077224901786, + "objective/train/tokens_used": 208876000, + "theoretical_loss": 4.3881077224901786, + "tokens_seen": 188416000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762186559679037, + "loss": 3.4901, + "theoretical_loss": 4.3881077224901786, + "tokens_seen": 188416000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762086258776329, + "loss": 3.4647, + "theoretical_loss": 4.38791500761239, + "tokens_seen": 188481536 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761985957873621, + "loss": 3.1316, + "theoretical_loss": 4.387722378485703, + "tokens_seen": 188547072 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761885656970913, + "loss": 3.1812, + "theoretical_loss": 4.3875298350421765, + "tokens_seen": 188612608 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047617853560682046, + "loss": 3.2664, + "theoretical_loss": 4.387337377213943, + "tokens_seen": 188678144 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047616850551654964, + "loss": 3.4667, + "theoretical_loss": 4.387145004933218, + "tokens_seen": 188743680 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761584754262788, + "loss": 3.1033, + "theoretical_loss": 4.38695271813229, + "tokens_seen": 188809216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047614844533600806, + "loss": 3.425, + "theoretical_loss": 4.386760516743526, + "tokens_seen": 188874752 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761384152457372, + "loss": 3.301, + "theoretical_loss": 4.38656840069937, + "tokens_seen": 188940288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761283851554664, + "loss": 3.3707, + "theoretical_loss": 4.386376369932344, + "tokens_seen": 189005824 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761183550651956, + "loss": 3.3374, + "theoretical_loss": 4.386184424375044, + "tokens_seen": 189071360 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761083249749248, + "loss": 3.279, + "theoretical_loss": 4.385992563960145, + "tokens_seen": 189136896 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047609829488465396, + "loss": 3.101, + "theoretical_loss": 4.385800788620397, + "tokens_seen": 189202432 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047608826479438314, + "loss": 3.205, + "theoretical_loss": 4.385609098288628, + "tokens_seen": 189267968 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760782347041123, + "loss": 3.2755, + "theoretical_loss": 4.385417492897741, + "tokens_seen": 189333504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047606820461384156, + "loss": 3.3262, + "theoretical_loss": 4.385225972380715, + "tokens_seen": 189399040 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760581745235707, + "loss": 3.2513, + "theoretical_loss": 4.385034536670606, + "tokens_seen": 189464576 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760481444332999, + "loss": 3.422, + "theoretical_loss": 4.384843185700544, + "tokens_seen": 189530112 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047603811434302905, + "loss": 3.6571, + "theoretical_loss": 4.384651919403739, + "tokens_seen": 189595648 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760280842527583, + "loss": 3.1847, + "theoretical_loss": 4.384460737713471, + "tokens_seen": 189661184 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047601805416248746, + "loss": 3.3644, + "theoretical_loss": 4.384269640563101, + "tokens_seen": 189726720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047600802407221665, + "loss": 3.4833, + "theoretical_loss": 4.384078627886062, + "tokens_seen": 189792256 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759979939819458, + "loss": 3.2601, + "theoretical_loss": 4.383887699615863, + "tokens_seen": 189857792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047598796389167506, + "loss": 3.3371, + "theoretical_loss": 4.38369685568609, + "tokens_seen": 189923328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759779338014042, + "loss": 3.2443, + "theoretical_loss": 4.383506096030401, + "tokens_seen": 189988864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 256551, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.438608169555664, + "objective/train/theoretical_loss": 4.383315420582533, + "objective/train/tokens_used": 210514400, + "theoretical_loss": 4.383315420582533, + "tokens_seen": 190054400 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759679037111334, + "loss": 3.2904, + "theoretical_loss": 4.383315420582533, + "tokens_seen": 190054400 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759578736208626, + "loss": 3.0384, + "theoretical_loss": 4.383124829276294, + "tokens_seen": 190119936 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759478435305918, + "loss": 3.1615, + "theoretical_loss": 4.38293432204557, + "tokens_seen": 190185472 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475937813440321, + "loss": 3.4141, + "theoretical_loss": 4.382743898824321, + "tokens_seen": 190251008 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047592778335005015, + "loss": 3.2814, + "theoretical_loss": 4.3825535595465785, + "tokens_seen": 190316544 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759177532597794, + "loss": 3.3771, + "theoretical_loss": 4.382363304146453, + "tokens_seen": 190382080 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759077231695085, + "loss": 3.4016, + "theoretical_loss": 4.382173132558126, + "tokens_seen": 190447616 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047589769307923775, + "loss": 3.306, + "theoretical_loss": 4.381983044715856, + "tokens_seen": 190513152 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047588766298896693, + "loss": 3.2248, + "theoretical_loss": 4.381793040553973, + "tokens_seen": 190578688 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004758776328986961, + "loss": 3.2466, + "theoretical_loss": 4.381603120006883, + "tokens_seen": 190644224 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004758676028084253, + "loss": 3.325, + "theoretical_loss": 4.381413283009065, + "tokens_seen": 190709760 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047585757271815447, + "loss": 3.1018, + "theoretical_loss": 4.381223529495073, + "tokens_seen": 190775296 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047584754262788365, + "loss": 3.2762, + "theoretical_loss": 4.381033859399532, + "tokens_seen": 190840832 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004758375125376129, + "loss": 3.5844, + "theoretical_loss": 4.380844272657145, + "tokens_seen": 190906368 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475827482447342, + "loss": 3.3999, + "theoretical_loss": 4.380654769202683, + "tokens_seen": 190971904 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047581745235707125, + "loss": 3.3852, + "theoretical_loss": 4.380465348970995, + "tokens_seen": 191037440 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047580742226680043, + "loss": 3.2935, + "theoretical_loss": 4.380276011897003, + "tokens_seen": 191102976 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757973921765296, + "loss": 3.2371, + "theoretical_loss": 4.380086757915698, + "tokens_seen": 191168512 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757873620862588, + "loss": 3.5679, + "theoretical_loss": 4.379897586962148, + "tokens_seen": 191234048 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475777331995988, + "loss": 3.338, + "theoretical_loss": 4.379708498971494, + "tokens_seen": 191299584 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047576730190571716, + "loss": 3.1515, + "theoretical_loss": 4.379519493878948, + "tokens_seen": 191365120 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757572718154464, + "loss": 3.1375, + "theoretical_loss": 4.379330571619795, + "tokens_seen": 191430656 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757472417251755, + "loss": 3.2006, + "theoretical_loss": 4.379141732129394, + "tokens_seen": 191496192 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047573721163490475, + "loss": 3.5164, + "theoretical_loss": 4.378952975343175, + "tokens_seen": 191561728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757271815446339, + "loss": 3.4549, + "theoretical_loss": 4.378764301196642, + "tokens_seen": 191627264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 261606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4967339038848877, + "objective/train/theoretical_loss": 4.37857570962537, + "objective/train/tokens_used": 212152800, + "theoretical_loss": 4.37857570962537, + "tokens_seen": 191692800 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757171514543631, + "loss": 3.2403, + "theoretical_loss": 4.37857570962537, + "tokens_seen": 191692800 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757071213640923, + "loss": 3.305, + "theoretical_loss": 4.378387200565006, + "tokens_seen": 191758336 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756970912738215, + "loss": 3.175, + "theoretical_loss": 4.378198773951272, + "tokens_seen": 191823872 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047568706118355066, + "loss": 3.3906, + "theoretical_loss": 4.378010429719957, + "tokens_seen": 191889408 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047567703109327984, + "loss": 3.4164, + "theoretical_loss": 4.377822167806928, + "tokens_seen": 191954944 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475667001003009, + "loss": 3.2804, + "theoretical_loss": 4.377633988148117, + "tokens_seen": 192020480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047565697091273826, + "loss": 3.2627, + "theoretical_loss": 4.377445890679534, + "tokens_seen": 192086016 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756469408224674, + "loss": 3.1732, + "theoretical_loss": 4.377257875337257, + "tokens_seen": 192151552 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756369107321966, + "loss": 3.4658, + "theoretical_loss": 4.377069942057436, + "tokens_seen": 192217088 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756268806419258, + "loss": 3.3786, + "theoretical_loss": 4.376882090776293, + "tokens_seen": 192282624 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475616850551655, + "loss": 3.1876, + "theoretical_loss": 4.376694321430121, + "tokens_seen": 192348160 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047560682046138416, + "loss": 3.2698, + "theoretical_loss": 4.376506633955286, + "tokens_seen": 192413696 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047559679037111334, + "loss": 3.3082, + "theoretical_loss": 4.376319028288219, + "tokens_seen": 192479232 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755867602808425, + "loss": 3.4333, + "theoretical_loss": 4.37613150436543, + "tokens_seen": 192544768 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047557673019057176, + "loss": 3.4606, + "theoretical_loss": 4.375944062123496, + "tokens_seen": 192610304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755667001003009, + "loss": 3.0892, + "theoretical_loss": 4.375756701499063, + "tokens_seen": 192675840 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755566700100301, + "loss": 3.2806, + "theoretical_loss": 4.3755694224288515, + "tokens_seen": 192741376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047554663991975925, + "loss": 3.4885, + "theoretical_loss": 4.375382224849648, + "tokens_seen": 192806912 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755366098294885, + "loss": 3.217, + "theoretical_loss": 4.375195108698316, + "tokens_seen": 192872448 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047552657973921766, + "loss": 3.277, + "theoretical_loss": 4.375008073911781, + "tokens_seen": 192937984 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047551654964894685, + "loss": 3.6578, + "theoretical_loss": 4.374821120427047, + "tokens_seen": 193003520 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047550651955867603, + "loss": 3.4625, + "theoretical_loss": 4.374634248181182, + "tokens_seen": 193069056 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047549648946840526, + "loss": 3.3939, + "theoretical_loss": 4.3744474571113265, + "tokens_seen": 193134592 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754864593781344, + "loss": 3.4205, + "theoretical_loss": 4.374260747154692, + "tokens_seen": 193200128 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754764292878636, + "loss": 3.0036, + "theoretical_loss": 4.374074118248559, + "tokens_seen": 193265664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 266689, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.22265625, + "objective/train/theoretical_loss": 4.373887570330275, + "objective/train/tokens_used": 213791200, + "theoretical_loss": 4.373887570330275, + "tokens_seen": 193331200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047546639919759275, + "loss": 3.3538, + "theoretical_loss": 4.373887570330275, + "tokens_seen": 193331200 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475456369107322, + "loss": 3.0157, + "theoretical_loss": 4.373701103337263, + "tokens_seen": 193396736 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047544633901705117, + "loss": 3.3198, + "theoretical_loss": 4.373514717207009, + "tokens_seen": 193462272 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047543630892678035, + "loss": 3.303, + "theoretical_loss": 4.373328411877073, + "tokens_seen": 193527808 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047542627883650953, + "loss": 3.4534, + "theoretical_loss": 4.373142187285083, + "tokens_seen": 193593344 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754162487462387, + "loss": 3.442, + "theoretical_loss": 4.372956043368736, + "tokens_seen": 193658880 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754062186559679, + "loss": 3.0814, + "theoretical_loss": 4.372769980065797, + "tokens_seen": 193724416 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047539618856569713, + "loss": 3.2632, + "theoretical_loss": 4.372583997314104, + "tokens_seen": 193789952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047538615847542625, + "loss": 3.0204, + "theoretical_loss": 4.372398095051559, + "tokens_seen": 193855488 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753761283851555, + "loss": 3.4108, + "theoretical_loss": 4.372212273216136, + "tokens_seen": 193921024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753660982948846, + "loss": 3.3715, + "theoretical_loss": 4.372026531745877, + "tokens_seen": 193986560 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047535606820461385, + "loss": 3.1585, + "theoretical_loss": 4.371840870578891, + "tokens_seen": 194052096 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047534603811434303, + "loss": 3.3463, + "theoretical_loss": 4.37165528965336, + "tokens_seen": 194117632 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753360080240722, + "loss": 3.1645, + "theoretical_loss": 4.371469788907529, + "tokens_seen": 194183168 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753259779338014, + "loss": 3.542, + "theoretical_loss": 4.371284368279714, + "tokens_seen": 194248704 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047531594784353063, + "loss": 3.4428, + "theoretical_loss": 4.3710990277083, + "tokens_seen": 194314240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047530591775325976, + "loss": 3.2136, + "theoretical_loss": 4.3709137671317375, + "tokens_seen": 194379776 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475295887662989, + "loss": 3.5123, + "theoretical_loss": 4.37072858648855, + "tokens_seen": 194445312 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752858575727181, + "loss": 3.4088, + "theoretical_loss": 4.370543485717322, + "tokens_seen": 194510848 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047527582748244736, + "loss": 3.2998, + "theoretical_loss": 4.370358464756713, + "tokens_seen": 194576384 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047526579739217654, + "loss": 3.0609, + "theoretical_loss": 4.370173523545443, + "tokens_seen": 194641920 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752557673019057, + "loss": 3.1257, + "theoretical_loss": 4.3699886620223065, + "tokens_seen": 194707456 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752457372116349, + "loss": 3.3654, + "theoretical_loss": 4.369803880126162, + "tokens_seen": 194772992 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752357071213641, + "loss": 3.362, + "theoretical_loss": 4.3696191777959354, + "tokens_seen": 194838528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047522567703109326, + "loss": 3.3996, + "theoretical_loss": 4.369434554970621, + "tokens_seen": 194904064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 269630, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.62288236618042, + "objective/train/theoretical_loss": 4.369250011589279, + "objective/train/tokens_used": 215429600, + "theoretical_loss": 4.369250011589279, + "tokens_seen": 194969600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752156469408225, + "loss": 3.4504, + "theoretical_loss": 4.369250011589279, + "tokens_seen": 194969600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752056168505517, + "loss": 3.3655, + "theoretical_loss": 4.369065547591038, + "tokens_seen": 195035136 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047519558676028086, + "loss": 3.2661, + "theoretical_loss": 4.368881162915095, + "tokens_seen": 195100672 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047518555667001004, + "loss": 3.3123, + "theoretical_loss": 4.36869685750071, + "tokens_seen": 195166208 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751755265797392, + "loss": 3.2071, + "theoretical_loss": 4.3685126312872145, + "tokens_seen": 195231744 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047516549648946846, + "loss": 3.5141, + "theoretical_loss": 4.368328484214002, + "tokens_seen": 195297280 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751554663991976, + "loss": 3.4057, + "theoretical_loss": 4.368144416220538, + "tokens_seen": 195362816 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751454363089268, + "loss": 3.34, + "theoretical_loss": 4.3679604272463495, + "tokens_seen": 195428352 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475135406218656, + "loss": 3.2522, + "theoretical_loss": 4.367776517231033, + "tokens_seen": 195493888 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751253761283852, + "loss": 3.1821, + "theoretical_loss": 4.367592686114252, + "tokens_seen": 195559424 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047511534603811436, + "loss": 3.3659, + "theoretical_loss": 4.367408933835733, + "tokens_seen": 195624960 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047510531594784354, + "loss": 3.3274, + "theoretical_loss": 4.367225260335272, + "tokens_seen": 195690496 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750952858575727, + "loss": 3.1391, + "theoretical_loss": 4.36704166555273, + "tokens_seen": 195756032 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047508525576730196, + "loss": 3.2901, + "theoretical_loss": 4.366858149428032, + "tokens_seen": 195821568 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750752256770311, + "loss": 3.0, + "theoretical_loss": 4.366674711901173, + "tokens_seen": 195887104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750651955867603, + "loss": 3.3856, + "theoretical_loss": 4.366491352912211, + "tokens_seen": 195952640 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047505516549648945, + "loss": 3.369, + "theoretical_loss": 4.366308072401271, + "tokens_seen": 196018176 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750451354062187, + "loss": 3.1375, + "theoretical_loss": 4.366124870308541, + "tokens_seen": 196083712 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047503510531594787, + "loss": 3.3023, + "theoretical_loss": 4.365941746574278, + "tokens_seen": 196149248 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047502507522567705, + "loss": 3.4409, + "theoretical_loss": 4.3657587011388035, + "tokens_seen": 196214784 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047501504513540623, + "loss": 3.4023, + "theoretical_loss": 4.365575733942503, + "tokens_seen": 196280320 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047500501504513546, + "loss": 3.1508, + "theoretical_loss": 4.365392844925829, + "tokens_seen": 196345856 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749949849548646, + "loss": 3.2641, + "theoretical_loss": 4.365210034029298, + "tokens_seen": 196411392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749849548645938, + "loss": 3.2689, + "theoretical_loss": 4.365027301193491, + "tokens_seen": 196476928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047497492477432295, + "loss": 3.3907, + "theoretical_loss": 4.364844646359056, + "tokens_seen": 196542464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 270311, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.453747272491455, + "objective/train/theoretical_loss": 4.364662069466704, + "objective/train/tokens_used": 217068000, + "theoretical_loss": 4.364662069466704, + "tokens_seen": 196608000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749648946840522, + "loss": 3.4165, + "theoretical_loss": 4.364662069466704, + "tokens_seen": 196608000 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047495486459378137, + "loss": 3.2901, + "theoretical_loss": 4.364479570457213, + "tokens_seen": 196673536 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047494483450351055, + "loss": 3.3529, + "theoretical_loss": 4.364297149271423, + "tokens_seen": 196739072 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047493480441323973, + "loss": 3.4174, + "theoretical_loss": 4.3641148058502415, + "tokens_seen": 196804608 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749247743229689, + "loss": 3.203, + "theoretical_loss": 4.363932540134638, + "tokens_seen": 196870144 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749147442326981, + "loss": 3.1933, + "theoretical_loss": 4.363750352065647, + "tokens_seen": 196935680 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047490471414242733, + "loss": 3.2939, + "theoretical_loss": 4.363568241584368, + "tokens_seen": 197001216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047489468405215646, + "loss": 3.1258, + "theoretical_loss": 4.363386208631966, + "tokens_seen": 197066752 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748846539618857, + "loss": 3.2392, + "theoretical_loss": 4.363204253149667, + "tokens_seen": 197132288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748746238716148, + "loss": 3.283, + "theoretical_loss": 4.3630223750787644, + "tokens_seen": 197197824 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047486459378134405, + "loss": 3.3828, + "theoretical_loss": 4.362840574360612, + "tokens_seen": 197263360 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047485456369107323, + "loss": 3.0395, + "theoretical_loss": 4.362658850936631, + "tokens_seen": 197328896 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748445336008024, + "loss": 3.2909, + "theoretical_loss": 4.362477204748305, + "tokens_seen": 197394432 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748345035105316, + "loss": 3.1958, + "theoretical_loss": 4.362295635737179, + "tokens_seen": 197459968 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047482447342026083, + "loss": 2.9771, + "theoretical_loss": 4.362114143844867, + "tokens_seen": 197525504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047481444332998996, + "loss": 3.289, + "theoretical_loss": 4.3619327290130405, + "tokens_seen": 197591040 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748044132397192, + "loss": 3.2772, + "theoretical_loss": 4.3617513911834385, + "tokens_seen": 197656576 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747943831494483, + "loss": 3.149, + "theoretical_loss": 4.361570130297863, + "tokens_seen": 197722112 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047478435305917756, + "loss": 3.3416, + "theoretical_loss": 4.3613889462981765, + "tokens_seen": 197787648 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047477432296890674, + "loss": 3.4184, + "theoretical_loss": 4.361207839126308, + "tokens_seen": 197853184 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747642928786359, + "loss": 3.3547, + "theoretical_loss": 4.361026808724247, + "tokens_seen": 197918720 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747542627883651, + "loss": 3.1468, + "theoretical_loss": 4.360845855034049, + "tokens_seen": 197984256 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747442326980943, + "loss": 3.2702, + "theoretical_loss": 4.360664977997828, + "tokens_seen": 198049792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047473420260782346, + "loss": 2.9289, + "theoretical_loss": 4.360484177557766, + "tokens_seen": 198115328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747241725175527, + "loss": 3.3127, + "theoretical_loss": 4.360303453656103, + "tokens_seen": 198180864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 271760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.187415838241577, + "objective/train/theoretical_loss": 4.360122806235145, + "objective/train/tokens_used": 218706400, + "theoretical_loss": 4.360122806235145, + "tokens_seen": 198246400 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747141424272818, + "loss": 3.2371, + "theoretical_loss": 4.360122806235145, + "tokens_seen": 198246400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047470411233701106, + "loss": 3.1704, + "theoretical_loss": 4.359942235237257, + "tokens_seen": 198311936 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746940822467402, + "loss": 3.375, + "theoretical_loss": 4.359761740604871, + "tokens_seen": 198377472 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746840521564694, + "loss": 3.0178, + "theoretical_loss": 4.359581322280479, + "tokens_seen": 198443008 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746740220661986, + "loss": 2.9285, + "theoretical_loss": 4.359400980206634, + "tokens_seen": 198508544 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746639919759278, + "loss": 3.3454, + "theoretical_loss": 4.359220714325954, + "tokens_seen": 198574080 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047465396188565696, + "loss": 3.3081, + "theoretical_loss": 4.359040524581116, + "tokens_seen": 198639616 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746439317953862, + "loss": 3.4269, + "theoretical_loss": 4.358860410914861, + "tokens_seen": 198705152 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746339017051153, + "loss": 3.2979, + "theoretical_loss": 4.358680373269993, + "tokens_seen": 198770688 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047462387161484456, + "loss": 3.2567, + "theoretical_loss": 4.358500411589375, + "tokens_seen": 198836224 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746138415245737, + "loss": 3.373, + "theoretical_loss": 4.358320525815934, + "tokens_seen": 198901760 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746038114343029, + "loss": 3.4155, + "theoretical_loss": 4.358140715892658, + "tokens_seen": 198967296 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745937813440321, + "loss": 3.5005, + "theoretical_loss": 4.357960981762595, + "tokens_seen": 199032832 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745837512537613, + "loss": 3.145, + "theoretical_loss": 4.357781323368857, + "tokens_seen": 199098368 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047457372116349047, + "loss": 3.1356, + "theoretical_loss": 4.357601740654617, + "tokens_seen": 199163904 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047456369107321965, + "loss": 3.3813, + "theoretical_loss": 4.357422233563106, + "tokens_seen": 199229440 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047455366098294883, + "loss": 3.2667, + "theoretical_loss": 4.357242802037623, + "tokens_seen": 199294976 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047454363089267807, + "loss": 3.5508, + "theoretical_loss": 4.35706344602152, + "tokens_seen": 199360512 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745336008024072, + "loss": 3.3062, + "theoretical_loss": 4.356884165458217, + "tokens_seen": 199426048 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047452357071213643, + "loss": 3.3419, + "theoretical_loss": 4.356704960291191, + "tokens_seen": 199491584 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047451354062186555, + "loss": 3.1005, + "theoretical_loss": 4.35652583046398, + "tokens_seen": 199557120 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745035105315948, + "loss": 3.1586, + "theoretical_loss": 4.356346775920185, + "tokens_seen": 199622656 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047449348044132397, + "loss": 2.9514, + "theoretical_loss": 4.356167796603467, + "tokens_seen": 199688192 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047448345035105315, + "loss": 3.5216, + "theoretical_loss": 4.355988892457546, + "tokens_seen": 199753728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004744734202607824, + "loss": 3.4248, + "theoretical_loss": 4.355810063426204, + "tokens_seen": 199819264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 272554, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5305120944976807, + "objective/train/theoretical_loss": 4.355631309453283, + "objective/train/tokens_used": 220344800, + "theoretical_loss": 4.355631309453283, + "tokens_seen": 199884800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047446339017051157, + "loss": 3.2918, + "theoretical_loss": 4.355631309453283, + "tokens_seen": 199884800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047445336008024075, + "loss": 3.3689, + "theoretical_loss": 4.355452630482685, + "tokens_seen": 199950336 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047444332998996993, + "loss": 3.1337, + "theoretical_loss": 4.355274026458375, + "tokens_seen": 200015872 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004744332998996991, + "loss": 3.1743, + "theoretical_loss": 4.355095497324373, + "tokens_seen": 200081408 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004744232698094283, + "loss": 3.3082, + "theoretical_loss": 4.354917043024765, + "tokens_seen": 200146944 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047441323971915753, + "loss": 3.4852, + "theoretical_loss": 4.354738663503692, + "tokens_seen": 200212480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047440320962888666, + "loss": 3.2587, + "theoretical_loss": 4.354560358705358, + "tokens_seen": 200278016 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743931795386159, + "loss": 2.9646, + "theoretical_loss": 4.354382128574027, + "tokens_seen": 200343552 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474383149448345, + "loss": 3.1041, + "theoretical_loss": 4.35420397305402, + "tokens_seen": 200409088 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047437311935807425, + "loss": 3.532, + "theoretical_loss": 4.35402589208972, + "tokens_seen": 200474624 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047436308926780343, + "loss": 3.3644, + "theoretical_loss": 4.353847885625571, + "tokens_seen": 200540160 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743530591775326, + "loss": 3.3038, + "theoretical_loss": 4.353669953606072, + "tokens_seen": 200605696 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743430290872618, + "loss": 3.1885, + "theoretical_loss": 4.353492095975787, + "tokens_seen": 200671232 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047433299899699103, + "loss": 3.1056, + "theoretical_loss": 4.353314312679333, + "tokens_seen": 200736768 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047432296890672016, + "loss": 3.1679, + "theoretical_loss": 4.353136603661392, + "tokens_seen": 200802304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743129388164494, + "loss": 3.3277, + "theoretical_loss": 4.352958968866704, + "tokens_seen": 200867840 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743029087261785, + "loss": 3.3515, + "theoretical_loss": 4.352781408240065, + "tokens_seen": 200933376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047429287863590776, + "loss": 3.4612, + "theoretical_loss": 4.352603921726334, + "tokens_seen": 200998912 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047428284854563694, + "loss": 3.2698, + "theoretical_loss": 4.352426509270425, + "tokens_seen": 201064448 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742728184553661, + "loss": 3.2832, + "theoretical_loss": 4.352249170817315, + "tokens_seen": 201129984 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742627883650953, + "loss": 3.2757, + "theoretical_loss": 4.352071906312037, + "tokens_seen": 201195520 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742527582748245, + "loss": 3.2153, + "theoretical_loss": 4.351894715699684, + "tokens_seen": 201261056 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047424272818455366, + "loss": 3.5588, + "theoretical_loss": 4.351717598925406, + "tokens_seen": 201326592 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742326980942829, + "loss": 3.1392, + "theoretical_loss": 4.351540555934414, + "tokens_seen": 201392128 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474222668004012, + "loss": 3.2376, + "theoretical_loss": 4.351363586671976, + "tokens_seen": 201457664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 274046, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8291563987731934, + "objective/train/theoretical_loss": 4.351186691083417, + "objective/train/tokens_used": 221983200, + "theoretical_loss": 4.351186691083417, + "tokens_seen": 201523200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047421263791374126, + "loss": 3.179, + "theoretical_loss": 4.351186691083417, + "tokens_seen": 201523200 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742026078234704, + "loss": 3.047, + "theoretical_loss": 4.351009869114124, + "tokens_seen": 201588736 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741925777331996, + "loss": 3.2204, + "theoretical_loss": 4.350833120709539, + "tokens_seen": 201654272 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741825476429288, + "loss": 3.2638, + "theoretical_loss": 4.350656445815164, + "tokens_seen": 201719808 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474172517552658, + "loss": 3.3704, + "theoretical_loss": 4.350479844376557, + "tokens_seen": 201785344 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047416248746238716, + "loss": 3.2403, + "theoretical_loss": 4.350303316339337, + "tokens_seen": 201850880 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741524573721164, + "loss": 3.1801, + "theoretical_loss": 4.350126861649178, + "tokens_seen": 201916416 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741424272818455, + "loss": 3.0457, + "theoretical_loss": 4.349950480251813, + "tokens_seen": 201981952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047413239719157476, + "loss": 3.3629, + "theoretical_loss": 4.349774172093033, + "tokens_seen": 202047488 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741223671013039, + "loss": 3.2814, + "theoretical_loss": 4.349597937118687, + "tokens_seen": 202113024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741123370110331, + "loss": 3.6036, + "theoretical_loss": 4.3494217752746795, + "tokens_seen": 202178560 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741023069207623, + "loss": 3.2269, + "theoretical_loss": 4.349245686506976, + "tokens_seen": 202244096 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004740922768304915, + "loss": 3.4228, + "theoretical_loss": 4.349069670761597, + "tokens_seen": 202309632 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047408224674022067, + "loss": 3.4884, + "theoretical_loss": 4.348893727984619, + "tokens_seen": 202375168 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047407221664994985, + "loss": 3.2129, + "theoretical_loss": 4.348717858122178, + "tokens_seen": 202440704 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047406218655967903, + "loss": 3.1145, + "theoretical_loss": 4.348542061120469, + "tokens_seen": 202506240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047405215646940827, + "loss": 3.4848, + "theoretical_loss": 4.348366336925739, + "tokens_seen": 202571776 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004740421263791374, + "loss": 3.3138, + "theoretical_loss": 4.3481906854842975, + "tokens_seen": 202637312 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047403209628886663, + "loss": 3.3594, + "theoretical_loss": 4.348015106742507, + "tokens_seen": 202702848 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047402206619859575, + "loss": 3.4342, + "theoretical_loss": 4.347839600646786, + "tokens_seen": 202768384 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474012036108325, + "loss": 3.2728, + "theoretical_loss": 4.347664167143615, + "tokens_seen": 202833920 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047400200601805417, + "loss": 3.491, + "theoretical_loss": 4.347488806179528, + "tokens_seen": 202899456 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047399197592778335, + "loss": 3.2498, + "theoretical_loss": 4.347313517701114, + "tokens_seen": 202964992 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047398194583751253, + "loss": 3.2725, + "theoretical_loss": 4.347138301655021, + "tokens_seen": 203030528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047397191574724177, + "loss": 3.2918, + "theoretical_loss": 4.346963157987954, + "tokens_seen": 203096064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 274684, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.094592332839966, + "objective/train/theoretical_loss": 4.346788086646671, + "objective/train/tokens_used": 223621600, + "theoretical_loss": 4.346788086646671, + "tokens_seen": 203161600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004739618856569709, + "loss": 3.2821, + "theoretical_loss": 4.346788086646671, + "tokens_seen": 203161600 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047395185556670013, + "loss": 3.1994, + "theoretical_loss": 4.346613087577991, + "tokens_seen": 203227136 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047394182547642926, + "loss": 3.276, + "theoretical_loss": 4.346438160728785, + "tokens_seen": 203292672 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004739317953861585, + "loss": 3.4814, + "theoretical_loss": 4.346263306045983, + "tokens_seen": 203358208 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004739217652958877, + "loss": 3.3933, + "theoretical_loss": 4.346088523476569, + "tokens_seen": 203423744 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047391173520561686, + "loss": 3.1577, + "theoretical_loss": 4.345913812967584, + "tokens_seen": 203489280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047390170511534604, + "loss": 3.2551, + "theoretical_loss": 4.345739174466127, + "tokens_seen": 203554816 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738916750250752, + "loss": 3.3351, + "theoretical_loss": 4.345564607919348, + "tokens_seen": 203620352 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738816449348044, + "loss": 3.2087, + "theoretical_loss": 4.3453901132744575, + "tokens_seen": 203685888 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047387161484453363, + "loss": 3.5707, + "theoretical_loss": 4.345215690478719, + "tokens_seen": 203751424 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047386158475426276, + "loss": 3.2015, + "theoretical_loss": 4.345041339479453, + "tokens_seen": 203816960 + }, + { + "epoch": 0.06, + "learning_rate": 0.000473851554663992, + "loss": 3.2998, + "theoretical_loss": 4.3448670602240345, + "tokens_seen": 203882496 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738415245737211, + "loss": 3.2642, + "theoretical_loss": 4.344692852659895, + "tokens_seen": 203948032 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047383149448345036, + "loss": 3.307, + "theoretical_loss": 4.34451871673452, + "tokens_seen": 204013568 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047382146439317954, + "loss": 3.3531, + "theoretical_loss": 4.344344652395451, + "tokens_seen": 204079104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738114343029087, + "loss": 3.3066, + "theoretical_loss": 4.3441706595902865, + "tokens_seen": 204144640 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738014042126379, + "loss": 3.0495, + "theoretical_loss": 4.343996738266677, + "tokens_seen": 204210176 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047379137412236714, + "loss": 3.3121, + "theoretical_loss": 4.343822888372331, + "tokens_seen": 204275712 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047378134403209626, + "loss": 3.32, + "theoretical_loss": 4.343649109855009, + "tokens_seen": 204341248 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737713139418255, + "loss": 3.2897, + "theoretical_loss": 4.343475402662529, + "tokens_seen": 204406784 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737612838515546, + "loss": 3.325, + "theoretical_loss": 4.343301766742763, + "tokens_seen": 204472320 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047375125376128386, + "loss": 3.2351, + "theoretical_loss": 4.343128202043638, + "tokens_seen": 204537856 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047374122367101304, + "loss": 3.4129, + "theoretical_loss": 4.342954708513136, + "tokens_seen": 204603392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737311935807422, + "loss": 3.2237, + "theoretical_loss": 4.342781286099291, + "tokens_seen": 204668928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047372116349047146, + "loss": 3.224, + "theoretical_loss": 4.3426079347501965, + "tokens_seen": 204734464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 275206, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4942283630371094, + "objective/train/theoretical_loss": 4.342434654413995, + "objective/train/tokens_used": 225260000, + "theoretical_loss": 4.342434654413995, + "tokens_seen": 204800000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737111334002006, + "loss": 3.4119, + "theoretical_loss": 4.342434654413995, + "tokens_seen": 204800000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737011033099298, + "loss": 3.3759, + "theoretical_loss": 4.342261445038888, + "tokens_seen": 204865536 + }, + { + "epoch": 0.06, + "learning_rate": 0.000473691073219659, + "loss": 3.1406, + "theoretical_loss": 4.342088306573128, + "tokens_seen": 204931072 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736810431293882, + "loss": 3.2741, + "theoretical_loss": 4.341915238965026, + "tokens_seen": 204996608 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047367101303911736, + "loss": 3.2403, + "theoretical_loss": 4.34174224216294, + "tokens_seen": 205062144 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736609829488466, + "loss": 3.1585, + "theoretical_loss": 4.34156931611529, + "tokens_seen": 205127680 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736509528585757, + "loss": 3.2647, + "theoretical_loss": 4.341396460770547, + "tokens_seen": 205193216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047364092276830496, + "loss": 3.3709, + "theoretical_loss": 4.341223676077232, + "tokens_seen": 205258752 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736308926780341, + "loss": 3.2875, + "theoretical_loss": 4.341050961983926, + "tokens_seen": 205324288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736208625877633, + "loss": 3.2638, + "theoretical_loss": 4.340878318439261, + "tokens_seen": 205389824 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736108324974925, + "loss": 3.563, + "theoretical_loss": 4.340705745391922, + "tokens_seen": 205455360 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736008024072217, + "loss": 3.3661, + "theoretical_loss": 4.3405332427906504, + "tokens_seen": 205520896 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047359077231695087, + "loss": 3.5278, + "theoretical_loss": 4.340360810584238, + "tokens_seen": 205586432 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047358074222668005, + "loss": 3.3486, + "theoretical_loss": 4.340188448721532, + "tokens_seen": 205651968 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047357071213640923, + "loss": 3.3382, + "theoretical_loss": 4.3400161571514335, + "tokens_seen": 205717504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047356068204613847, + "loss": 3.156, + "theoretical_loss": 4.339843935822895, + "tokens_seen": 205783040 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004735506519558676, + "loss": 3.4644, + "theoretical_loss": 4.339671784684923, + "tokens_seen": 205848576 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047354062186559683, + "loss": 3.0953, + "theoretical_loss": 4.339499703686579, + "tokens_seen": 205914112 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047353059177532595, + "loss": 3.1301, + "theoretical_loss": 4.339327692776977, + "tokens_seen": 205979648 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004735205616850552, + "loss": 3.3351, + "theoretical_loss": 4.339155751905282, + "tokens_seen": 206045184 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047351053159478437, + "loss": 3.3348, + "theoretical_loss": 4.338983881020713, + "tokens_seen": 206110720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047350050150451355, + "loss": 3.3371, + "theoretical_loss": 4.338812080072545, + "tokens_seen": 206176256 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047349047141424273, + "loss": 3.2751, + "theoretical_loss": 4.338640349010101, + "tokens_seen": 206241792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047348044132397197, + "loss": 3.1762, + "theoretical_loss": 4.3384686877827585, + "tokens_seen": 206307328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734704112337011, + "loss": 3.3436, + "theoretical_loss": 4.338297096339951, + "tokens_seen": 206372864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 276466, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.012336015701294, + "objective/train/theoretical_loss": 4.33812557463116, + "objective/train/tokens_used": 226898400, + "theoretical_loss": 4.33812557463116, + "tokens_seen": 206438400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047346038114343033, + "loss": 3.0366, + "theoretical_loss": 4.33812557463116, + "tokens_seen": 206438400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047345035105315946, + "loss": 3.2347, + "theoretical_loss": 4.3379541226059235, + "tokens_seen": 206503936 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734403209628887, + "loss": 3.1413, + "theoretical_loss": 4.337782740213827, + "tokens_seen": 206569472 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734302908726179, + "loss": 3.0938, + "theoretical_loss": 4.337611427404514, + "tokens_seen": 206635008 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047342026078234706, + "loss": 3.0309, + "theoretical_loss": 4.337440184127679, + "tokens_seen": 206700544 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047341023069207624, + "loss": 3.3016, + "theoretical_loss": 4.337269010333065, + "tokens_seen": 206766080 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734002006018054, + "loss": 3.315, + "theoretical_loss": 4.337097905970471, + "tokens_seen": 206831616 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733901705115346, + "loss": 3.5151, + "theoretical_loss": 4.336926870989748, + "tokens_seen": 206897152 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047338014042126383, + "loss": 3.2054, + "theoretical_loss": 4.336755905340797, + "tokens_seen": 206962688 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047337011033099296, + "loss": 3.1878, + "theoretical_loss": 4.336585008973573, + "tokens_seen": 207028224 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733600802407222, + "loss": 3.2425, + "theoretical_loss": 4.336414181838082, + "tokens_seen": 207093760 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733500501504513, + "loss": 3.5244, + "theoretical_loss": 4.336243423884382, + "tokens_seen": 207159296 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047334002006018056, + "loss": 3.153, + "theoretical_loss": 4.336072735062583, + "tokens_seen": 207224832 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047332998996990974, + "loss": 3.2415, + "theoretical_loss": 4.335902115322847, + "tokens_seen": 207290368 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733199598796389, + "loss": 3.0556, + "theoretical_loss": 4.335731564615387, + "tokens_seen": 207355904 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733099297893681, + "loss": 3.1923, + "theoretical_loss": 4.335561082890468, + "tokens_seen": 207421440 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047329989969909734, + "loss": 3.2044, + "theoretical_loss": 4.335390670098407, + "tokens_seen": 207486976 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047328986960882646, + "loss": 3.5337, + "theoretical_loss": 4.335220326189571, + "tokens_seen": 207552512 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732798395185557, + "loss": 3.2499, + "theoretical_loss": 4.335050051114379, + "tokens_seen": 207618048 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732698094282848, + "loss": 3.1384, + "theoretical_loss": 4.334879844823304, + "tokens_seen": 207683584 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047325977933801406, + "loss": 3.1074, + "theoretical_loss": 4.334709707266865, + "tokens_seen": 207749120 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047324974924774324, + "loss": 3.1987, + "theoretical_loss": 4.334539638395636, + "tokens_seen": 207814656 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732397191574724, + "loss": 3.1185, + "theoretical_loss": 4.334369638160242, + "tokens_seen": 207880192 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732296890672016, + "loss": 3.2328, + "theoretical_loss": 4.334199706511358, + "tokens_seen": 207945728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732196589769308, + "loss": 3.0938, + "theoretical_loss": 4.334029843399709, + "tokens_seen": 208011264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 277178, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.992138385772705, + "objective/train/theoretical_loss": 4.333860048776074, + "objective/train/tokens_used": 228536800, + "theoretical_loss": 4.333860048776074, + "tokens_seen": 208076800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047320962888665997, + "loss": 3.1004, + "theoretical_loss": 4.333860048776074, + "tokens_seen": 208076800 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731995987963892, + "loss": 3.2221, + "theoretical_loss": 4.33369032259128, + "tokens_seen": 208142336 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047318956870611833, + "loss": 3.2591, + "theoretical_loss": 4.333520664796206, + "tokens_seen": 208207872 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047317953861584757, + "loss": 3.4427, + "theoretical_loss": 4.33335107534178, + "tokens_seen": 208273408 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047316950852557675, + "loss": 3.2497, + "theoretical_loss": 4.333181554178985, + "tokens_seen": 208338944 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047315947843530593, + "loss": 3.293, + "theoretical_loss": 4.3330121012588485, + "tokens_seen": 208404480 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731494483450351, + "loss": 3.5665, + "theoretical_loss": 4.332842716532454, + "tokens_seen": 208470016 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731394182547643, + "loss": 3.0097, + "theoretical_loss": 4.332673399950932, + "tokens_seen": 208535552 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047312938816449347, + "loss": 3.1251, + "theoretical_loss": 4.332504151465464, + "tokens_seen": 208601088 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731193580742227, + "loss": 3.0934, + "theoretical_loss": 4.332334971027284, + "tokens_seen": 208666624 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047310932798395183, + "loss": 3.4214, + "theoretical_loss": 4.332165858587672, + "tokens_seen": 208732160 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047309929789368107, + "loss": 3.0211, + "theoretical_loss": 4.331996814097963, + "tokens_seen": 208797696 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730892678034102, + "loss": 3.2003, + "theoretical_loss": 4.331827837509538, + "tokens_seen": 208863232 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047307923771313943, + "loss": 3.3196, + "theoretical_loss": 4.331658928773831, + "tokens_seen": 208928768 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730692076228686, + "loss": 3.2202, + "theoretical_loss": 4.331490087842324, + "tokens_seen": 208994304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730591775325978, + "loss": 3.1351, + "theoretical_loss": 4.33132131466655, + "tokens_seen": 209059840 + }, + { + "epoch": 0.06, + "learning_rate": 0.000473049147442327, + "loss": 3.212, + "theoretical_loss": 4.3311526091980905, + "tokens_seen": 209125376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047303911735205615, + "loss": 3.2426, + "theoretical_loss": 4.330983971388578, + "tokens_seen": 209190912 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047302908726178534, + "loss": 3.2021, + "theoretical_loss": 4.330815401189695, + "tokens_seen": 209256448 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047301905717151457, + "loss": 3.2868, + "theoretical_loss": 4.330646898553173, + "tokens_seen": 209321984 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730090270812437, + "loss": 3.2098, + "theoretical_loss": 4.330478463430792, + "tokens_seen": 209387520 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047299899699097293, + "loss": 3.0737, + "theoretical_loss": 4.330310095774383, + "tokens_seen": 209453056 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729889669007021, + "loss": 3.3302, + "theoretical_loss": 4.330141795535828, + "tokens_seen": 209518592 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729789368104313, + "loss": 3.1422, + "theoretical_loss": 4.329973562667053, + "tokens_seen": 209584128 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047296890672016053, + "loss": 3.3357, + "theoretical_loss": 4.3298053971200385, + "tokens_seen": 209649664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 278592, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8049657344818115, + "objective/train/theoretical_loss": 4.329637298846812, + "objective/train/tokens_used": 230175200, + "theoretical_loss": 4.329637298846812, + "tokens_seen": 209715200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047295887662988966, + "loss": 2.8862, + "theoretical_loss": 4.329637298846812, + "tokens_seen": 209715200 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729488465396189, + "loss": 3.3877, + "theoretical_loss": 4.329469267799451, + "tokens_seen": 209780736 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729388164493481, + "loss": 3.2828, + "theoretical_loss": 4.32930130393008, + "tokens_seen": 209846272 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047292878635907726, + "loss": 3.2585, + "theoretical_loss": 4.329133407190876, + "tokens_seen": 209911808 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047291875626880644, + "loss": 3.21, + "theoretical_loss": 4.3289655775340625, + "tokens_seen": 209977344 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729087261785356, + "loss": 3.2672, + "theoretical_loss": 4.328797814911912, + "tokens_seen": 210042880 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728986960882648, + "loss": 3.3022, + "theoretical_loss": 4.328630119276747, + "tokens_seen": 210108416 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047288866599799403, + "loss": 3.0099, + "theoretical_loss": 4.328462490580938, + "tokens_seen": 210173952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047287863590772316, + "loss": 3.2968, + "theoretical_loss": 4.328294928776903, + "tokens_seen": 210239488 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728686058174524, + "loss": 3.1646, + "theoretical_loss": 4.328127433817112, + "tokens_seen": 210305024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728585757271815, + "loss": 3.2345, + "theoretical_loss": 4.327960005654081, + "tokens_seen": 210370560 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047284854563691076, + "loss": 3.3589, + "theoretical_loss": 4.327792644240374, + "tokens_seen": 210436096 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047283851554663994, + "loss": 3.0922, + "theoretical_loss": 4.327625349528605, + "tokens_seen": 210501632 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728284854563691, + "loss": 3.3172, + "theoretical_loss": 4.327458121471436, + "tokens_seen": 210567168 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728184553660983, + "loss": 3.3431, + "theoretical_loss": 4.3272909600215765, + "tokens_seen": 210632704 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047280842527582754, + "loss": 3.1134, + "theoretical_loss": 4.327123865131786, + "tokens_seen": 210698240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047279839518555666, + "loss": 3.1697, + "theoretical_loss": 4.326956836754871, + "tokens_seen": 210763776 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727883650952859, + "loss": 3.4017, + "theoretical_loss": 4.326789874843685, + "tokens_seen": 210829312 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472778335005015, + "loss": 3.1166, + "theoretical_loss": 4.326622979351132, + "tokens_seen": 210894848 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047276830491474426, + "loss": 3.0739, + "theoretical_loss": 4.326456150230163, + "tokens_seen": 210960384 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047275827482447344, + "loss": 3.2434, + "theoretical_loss": 4.326289387433776, + "tokens_seen": 211025920 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727482447342026, + "loss": 3.1457, + "theoretical_loss": 4.326122690915017, + "tokens_seen": 211091456 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727382146439318, + "loss": 3.5019, + "theoretical_loss": 4.325956060626982, + "tokens_seen": 211156992 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472728184553661, + "loss": 3.1354, + "theoretical_loss": 4.325789496522812, + "tokens_seen": 211222528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047271815446339017, + "loss": 3.1215, + "theoretical_loss": 4.325622998555697, + "tokens_seen": 211288064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 279319, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.034355640411377, + "objective/train/theoretical_loss": 4.3254565666788745, + "objective/train/tokens_used": 231813600, + "theoretical_loss": 4.3254565666788745, + "tokens_seen": 211353600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727081243731194, + "loss": 3.3646, + "theoretical_loss": 4.3254565666788745, + "tokens_seen": 211353600 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047269809428284853, + "loss": 3.1249, + "theoretical_loss": 4.325290200845629, + "tokens_seen": 211419136 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047268806419257777, + "loss": 2.903, + "theoretical_loss": 4.3251239010092934, + "tokens_seen": 211484672 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047267803410230695, + "loss": 2.9262, + "theoretical_loss": 4.324957667123249, + "tokens_seen": 211550208 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047266800401203613, + "loss": 3.339, + "theoretical_loss": 4.32479149914092, + "tokens_seen": 211615744 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004726579739217653, + "loss": 3.2618, + "theoretical_loss": 4.324625397015783, + "tokens_seen": 211681280 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004726479438314945, + "loss": 3.1819, + "theoretical_loss": 4.3244593607013595, + "tokens_seen": 211746816 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047263791374122367, + "loss": 2.9808, + "theoretical_loss": 4.324293390151218, + "tokens_seen": 211812352 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004726278836509529, + "loss": 3.3896, + "theoretical_loss": 4.324127485318975, + "tokens_seen": 211877888 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047261785356068203, + "loss": 3.5236, + "theoretical_loss": 4.323961646158294, + "tokens_seen": 211943424 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047260782347041127, + "loss": 3.0748, + "theoretical_loss": 4.323795872622884, + "tokens_seen": 212008960 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725977933801404, + "loss": 3.2701, + "theoretical_loss": 4.323630164666502, + "tokens_seen": 212074496 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047258776328986963, + "loss": 3.0506, + "theoretical_loss": 4.323464522242954, + "tokens_seen": 212140032 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725777331995988, + "loss": 3.3146, + "theoretical_loss": 4.323298945306089, + "tokens_seen": 212205568 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472567703109328, + "loss": 3.3863, + "theoretical_loss": 4.3231334338098035, + "tokens_seen": 212271104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725576730190572, + "loss": 3.5083, + "theoretical_loss": 4.322967987708043, + "tokens_seen": 212336640 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047254764292878636, + "loss": 3.0861, + "theoretical_loss": 4.322802606954798, + "tokens_seen": 212402176 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047253761283851554, + "loss": 3.2103, + "theoretical_loss": 4.322637291504106, + "tokens_seen": 212467712 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047252758274824477, + "loss": 3.2339, + "theoretical_loss": 4.32247204131005, + "tokens_seen": 212533248 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725175526579739, + "loss": 3.3825, + "theoretical_loss": 4.322306856326761, + "tokens_seen": 212598784 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047250752256770313, + "loss": 3.2205, + "theoretical_loss": 4.322141736508415, + "tokens_seen": 212664320 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724974924774323, + "loss": 3.2196, + "theoretical_loss": 4.321976681809236, + "tokens_seen": 212729856 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724874623871615, + "loss": 3.1189, + "theoretical_loss": 4.321811692183491, + "tokens_seen": 212795392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724774322968907, + "loss": 3.2039, + "theoretical_loss": 4.321646767585497, + "tokens_seen": 212860928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047246740220661986, + "loss": 3.1653, + "theoretical_loss": 4.3214819079696145, + "tokens_seen": 212926464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 280599, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3949620723724365, + "objective/train/theoretical_loss": 4.321317113290252, + "objective/train/tokens_used": 233452000, + "theoretical_loss": 4.321317113290252, + "tokens_seen": 212992000 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047245737211634904, + "loss": 3.3408, + "theoretical_loss": 4.321317113290252, + "tokens_seen": 212992000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724473420260783, + "loss": 3.1235, + "theoretical_loss": 4.321152383501863, + "tokens_seen": 213057536 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724373119358074, + "loss": 3.2593, + "theoretical_loss": 4.320987718558945, + "tokens_seen": 213123072 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047242728184553664, + "loss": 3.3497, + "theoretical_loss": 4.320823118416046, + "tokens_seen": 213188608 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047241725175526576, + "loss": 3.5142, + "theoretical_loss": 4.320658583027755, + "tokens_seen": 213254144 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472407221664995, + "loss": 3.4399, + "theoretical_loss": 4.32049411234871, + "tokens_seen": 213319680 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723971915747242, + "loss": 3.2726, + "theoretical_loss": 4.3203297063335935, + "tokens_seen": 213385216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047238716148445336, + "loss": 3.2783, + "theoretical_loss": 4.320165364937134, + "tokens_seen": 213450752 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047237713139418254, + "loss": 3.2021, + "theoretical_loss": 4.320001088114105, + "tokens_seen": 213516288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723671013039117, + "loss": 3.2311, + "theoretical_loss": 4.319836875819325, + "tokens_seen": 213581824 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723570712136409, + "loss": 2.9671, + "theoretical_loss": 4.31967272800766, + "tokens_seen": 213647360 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047234704112337014, + "loss": 3.1442, + "theoretical_loss": 4.319508644634021, + "tokens_seen": 213712896 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047233701103309927, + "loss": 2.8623, + "theoretical_loss": 4.319344625653361, + "tokens_seen": 213778432 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723269809428285, + "loss": 3.1813, + "theoretical_loss": 4.319180671020684, + "tokens_seen": 213843968 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723169508525577, + "loss": 3.1773, + "theoretical_loss": 4.319016780691033, + "tokens_seen": 213909504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047230692076228686, + "loss": 3.2263, + "theoretical_loss": 4.318852954619501, + "tokens_seen": 213975040 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047229689067201605, + "loss": 3.2179, + "theoretical_loss": 4.318689192761225, + "tokens_seen": 214040576 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004722868605817452, + "loss": 3.3982, + "theoretical_loss": 4.318525495071385, + "tokens_seen": 214106112 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004722768304914744, + "loss": 3.2654, + "theoretical_loss": 4.318361861505207, + "tokens_seen": 214171648 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047226680040120364, + "loss": 3.0422, + "theoretical_loss": 4.318198292017964, + "tokens_seen": 214237184 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047225677031093277, + "loss": 3.2805, + "theoretical_loss": 4.318034786564971, + "tokens_seen": 214302720 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472246740220662, + "loss": 3.115, + "theoretical_loss": 4.31787134510159, + "tokens_seen": 214368256 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047223671013039113, + "loss": 3.4185, + "theoretical_loss": 4.3177079675832255, + "tokens_seen": 214433792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047222668004012037, + "loss": 3.2431, + "theoretical_loss": 4.317544653965329, + "tokens_seen": 214499328 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004722166499498496, + "loss": 3.2717, + "theoretical_loss": 4.3173814042033944, + "tokens_seen": 214564864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 281081, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1554346084594727, + "objective/train/theoretical_loss": 4.317218218252963, + "objective/train/tokens_used": 235090400, + "theoretical_loss": 4.317218218252963, + "tokens_seen": 214630400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047220661985957873, + "loss": 3.2487, + "theoretical_loss": 4.317218218252963, + "tokens_seen": 214630400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047219658976930797, + "loss": 3.2627, + "theoretical_loss": 4.317055096069618, + "tokens_seen": 214695936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047218655967903715, + "loss": 3.132, + "theoretical_loss": 4.316892037608987, + "tokens_seen": 214761472 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047217652958876633, + "loss": 3.2713, + "theoretical_loss": 4.316729042826745, + "tokens_seen": 214827008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721664994984955, + "loss": 3.2406, + "theoretical_loss": 4.316566111678609, + "tokens_seen": 214892544 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721564694082247, + "loss": 3.3419, + "theoretical_loss": 4.316403244120339, + "tokens_seen": 214958080 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047214643931795387, + "loss": 3.0689, + "theoretical_loss": 4.3162404401077445, + "tokens_seen": 215023616 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721364092276831, + "loss": 3.3983, + "theoretical_loss": 4.316077699596671, + "tokens_seen": 215089152 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047212637913741223, + "loss": 3.0837, + "theoretical_loss": 4.315915022543016, + "tokens_seen": 215154688 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047211634904714147, + "loss": 3.1345, + "theoretical_loss": 4.315752408902716, + "tokens_seen": 215220224 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721063189568706, + "loss": 3.2947, + "theoretical_loss": 4.315589858631755, + "tokens_seen": 215285760 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047209628886659983, + "loss": 3.2647, + "theoretical_loss": 4.315427371686157, + "tokens_seen": 215351296 + }, + { + "epoch": 0.07, + "learning_rate": 0.000472086258776329, + "loss": 3.1737, + "theoretical_loss": 4.315264948021994, + "tokens_seen": 215416832 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720762286860582, + "loss": 3.1847, + "theoretical_loss": 4.315102587595379, + "tokens_seen": 215482368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720661985957874, + "loss": 3.0922, + "theoretical_loss": 4.31494029036247, + "tokens_seen": 215547904 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047205616850551656, + "loss": 3.287, + "theoretical_loss": 4.314778056279468, + "tokens_seen": 215613440 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047204613841524574, + "loss": 2.8694, + "theoretical_loss": 4.314615885302619, + "tokens_seen": 215678976 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047203610832497497, + "loss": 3.373, + "theoretical_loss": 4.314453777388209, + "tokens_seen": 215744512 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720260782347041, + "loss": 3.2566, + "theoretical_loss": 4.314291732492573, + "tokens_seen": 215810048 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047201604814443333, + "loss": 2.9877, + "theoretical_loss": 4.314129750572087, + "tokens_seen": 215875584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720060180541625, + "loss": 3.3091, + "theoretical_loss": 4.3139678315831675, + "tokens_seen": 215941120 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719959879638917, + "loss": 3.0783, + "theoretical_loss": 4.313805975482278, + "tokens_seen": 216006656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719859578736209, + "loss": 3.1844, + "theoretical_loss": 4.313644182225926, + "tokens_seen": 216072192 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047197592778335006, + "loss": 3.2515, + "theoretical_loss": 4.313482451770659, + "tokens_seen": 216137728 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047196589769307924, + "loss": 2.9306, + "theoretical_loss": 4.313320784073069, + "tokens_seen": 216203264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 282379, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2120323181152344, + "objective/train/theoretical_loss": 4.3131591790897925, + "objective/train/tokens_used": 236728800, + "theoretical_loss": 4.3131591790897925, + "tokens_seen": 216268800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719558676028085, + "loss": 3.1633, + "theoretical_loss": 4.3131591790897925, + "tokens_seen": 216268800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719458375125376, + "loss": 3.3385, + "theoretical_loss": 4.3129976367775065, + "tokens_seen": 216334336 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047193580742226684, + "loss": 3.1109, + "theoretical_loss": 4.312836157092934, + "tokens_seen": 216399872 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047192577733199596, + "loss": 3.3772, + "theoretical_loss": 4.312674739992839, + "tokens_seen": 216465408 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719157472417252, + "loss": 3.3361, + "theoretical_loss": 4.31251338543403, + "tokens_seen": 216530944 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719057171514544, + "loss": 3.3251, + "theoretical_loss": 4.312352093373354, + "tokens_seen": 216596480 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047189568706118356, + "loss": 3.0503, + "theoretical_loss": 4.312190863767708, + "tokens_seen": 216662016 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047188565697091274, + "loss": 3.0291, + "theoretical_loss": 4.312029696574027, + "tokens_seen": 216727552 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718756268806419, + "loss": 3.0042, + "theoretical_loss": 4.311868591749287, + "tokens_seen": 216793088 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718655967903711, + "loss": 3.0656, + "theoretical_loss": 4.311707549250514, + "tokens_seen": 216858624 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047185556670010034, + "loss": 3.1287, + "theoretical_loss": 4.311546569034767, + "tokens_seen": 216924160 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047184553660982947, + "loss": 3.1405, + "theoretical_loss": 4.311385651059155, + "tokens_seen": 216989696 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718355065195587, + "loss": 3.425, + "theoretical_loss": 4.311224795280825, + "tokens_seen": 217055232 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718254764292879, + "loss": 3.282, + "theoretical_loss": 4.3110640016569715, + "tokens_seen": 217120768 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047181544633901706, + "loss": 3.4067, + "theoretical_loss": 4.310903270144825, + "tokens_seen": 217186304 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047180541624874625, + "loss": 3.1557, + "theoretical_loss": 4.310742600701664, + "tokens_seen": 217251840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717953861584754, + "loss": 3.1474, + "theoretical_loss": 4.310581993284805, + "tokens_seen": 217317376 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717853560682046, + "loss": 3.2756, + "theoretical_loss": 4.310421447851609, + "tokens_seen": 217382912 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047177532597793384, + "loss": 3.4118, + "theoretical_loss": 4.310260964359479, + "tokens_seen": 217448448 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047176529588766297, + "loss": 3.2428, + "theoretical_loss": 4.310100542765858, + "tokens_seen": 217513984 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717552657973922, + "loss": 3.2141, + "theoretical_loss": 4.309940183028236, + "tokens_seen": 217579520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047174523570712133, + "loss": 3.3118, + "theoretical_loss": 4.309779885104139, + "tokens_seen": 217645056 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047173520561685057, + "loss": 3.276, + "theoretical_loss": 4.309619648951139, + "tokens_seen": 217710592 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047172517552657975, + "loss": 2.8709, + "theoretical_loss": 4.3094594745268475, + "tokens_seen": 217776128 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047171514543630893, + "loss": 3.3411, + "theoretical_loss": 4.30929936178892, + "tokens_seen": 217841664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 283045, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1448984146118164, + "objective/train/theoretical_loss": 4.309139310695053, + "objective/train/tokens_used": 238367200, + "theoretical_loss": 4.309139310695053, + "tokens_seen": 217907200 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717051153460381, + "loss": 3.247, + "theoretical_loss": 4.309139310695053, + "tokens_seen": 217907200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047169508525576735, + "loss": 2.9108, + "theoretical_loss": 4.308979321202983, + "tokens_seen": 217972736 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716850551654965, + "loss": 3.2046, + "theoretical_loss": 4.308819393270491, + "tokens_seen": 218038272 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716750250752257, + "loss": 3.076, + "theoretical_loss": 4.308659526855396, + "tokens_seen": 218103808 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047166499498495484, + "loss": 3.245, + "theoretical_loss": 4.308499721915563, + "tokens_seen": 218169344 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047165496489468407, + "loss": 3.2956, + "theoretical_loss": 4.308339978408897, + "tokens_seen": 218234880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047164493480441325, + "loss": 3.0581, + "theoretical_loss": 4.308180296293341, + "tokens_seen": 218300416 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047163490471414243, + "loss": 3.1606, + "theoretical_loss": 4.308020675526883, + "tokens_seen": 218365952 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716248746238716, + "loss": 3.2829, + "theoretical_loss": 4.307861116067554, + "tokens_seen": 218431488 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716148445336008, + "loss": 3.3331, + "theoretical_loss": 4.30770161787342, + "tokens_seen": 218497024 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047160481444333, + "loss": 3.2521, + "theoretical_loss": 4.307542180902594, + "tokens_seen": 218562560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715947843530592, + "loss": 3.0335, + "theoretical_loss": 4.307382805113228, + "tokens_seen": 218628096 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047158475426278834, + "loss": 2.9783, + "theoretical_loss": 4.307223490463516, + "tokens_seen": 218693632 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715747241725176, + "loss": 3.1517, + "theoretical_loss": 4.307064236911692, + "tokens_seen": 218759168 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715646940822467, + "loss": 3.002, + "theoretical_loss": 4.30690504441603, + "tokens_seen": 218824704 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047155466399197594, + "loss": 3.13, + "theoretical_loss": 4.306745912934849, + "tokens_seen": 218890240 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715446339017051, + "loss": 3.2757, + "theoretical_loss": 4.306586842426504, + "tokens_seen": 218955776 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715346038114343, + "loss": 3.1729, + "theoretical_loss": 4.306427832849394, + "tokens_seen": 219021312 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715245737211635, + "loss": 2.8273, + "theoretical_loss": 4.306268884161959, + "tokens_seen": 219086848 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715145436308927, + "loss": 3.0639, + "theoretical_loss": 4.306109996322679, + "tokens_seen": 219152384 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047150451354062184, + "loss": 3.2826, + "theoretical_loss": 4.305951169290073, + "tokens_seen": 219217920 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714944834503511, + "loss": 3.2331, + "theoretical_loss": 4.305792403022703, + "tokens_seen": 219283456 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714844533600802, + "loss": 3.4157, + "theoretical_loss": 4.305633697479171, + "tokens_seen": 219348992 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047147442326980944, + "loss": 3.2171, + "theoretical_loss": 4.305475052618119, + "tokens_seen": 219414528 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714643931795387, + "loss": 3.271, + "theoretical_loss": 4.30531646839823, + "tokens_seen": 219480064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 284124, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.518115997314453, + "objective/train/theoretical_loss": 4.305157944778228, + "objective/train/tokens_used": 240005600, + "theoretical_loss": 4.305157944778228, + "tokens_seen": 219545600 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714543630892678, + "loss": 3.2974, + "theoretical_loss": 4.305157944778228, + "tokens_seen": 219545600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047144433299899704, + "loss": 3.1396, + "theoretical_loss": 4.304999481716876, + "tokens_seen": 219611136 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047143430290872616, + "loss": 3.3291, + "theoretical_loss": 4.304841079172979, + "tokens_seen": 219676672 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714242728184554, + "loss": 3.3637, + "theoretical_loss": 4.30468273710538, + "tokens_seen": 219742208 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714142427281846, + "loss": 3.22, + "theoretical_loss": 4.304524455472965, + "tokens_seen": 219807744 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047140421263791376, + "loss": 3.2553, + "theoretical_loss": 4.304366234234659, + "tokens_seen": 219873280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047139418254764294, + "loss": 3.2947, + "theoretical_loss": 4.304208073349426, + "tokens_seen": 219938816 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713841524573721, + "loss": 3.2111, + "theoretical_loss": 4.304049972776271, + "tokens_seen": 220004352 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713741223671013, + "loss": 3.2684, + "theoretical_loss": 4.30389193247424, + "tokens_seen": 220069888 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047136409227683054, + "loss": 3.1529, + "theoretical_loss": 4.303733952402419, + "tokens_seen": 220135424 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047135406218655967, + "loss": 3.1474, + "theoretical_loss": 4.303576032519931, + "tokens_seen": 220200960 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713440320962889, + "loss": 3.0742, + "theoretical_loss": 4.303418172785943, + "tokens_seen": 220266496 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713340020060181, + "loss": 3.1944, + "theoretical_loss": 4.303260373159659, + "tokens_seen": 220332032 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047132397191574726, + "loss": 3.0874, + "theoretical_loss": 4.303102633600322, + "tokens_seen": 220397568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047131394182547645, + "loss": 3.0249, + "theoretical_loss": 4.30294495406722, + "tokens_seen": 220463104 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047130391173520563, + "loss": 3.434, + "theoretical_loss": 4.3027873345196745, + "tokens_seen": 220528640 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004712938816449348, + "loss": 3.4907, + "theoretical_loss": 4.302629774917049, + "tokens_seen": 220594176 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047128385155466404, + "loss": 3.1007, + "theoretical_loss": 4.302472275218748, + "tokens_seen": 220659712 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047127382146439317, + "loss": 3.3572, + "theoretical_loss": 4.302314835384214, + "tokens_seen": 220725248 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004712637913741224, + "loss": 3.2629, + "theoretical_loss": 4.30215745537293, + "tokens_seen": 220790784 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047125376128385153, + "loss": 3.0362, + "theoretical_loss": 4.302000135144416, + "tokens_seen": 220856320 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047124373119358077, + "loss": 3.1044, + "theoretical_loss": 4.301842874658235, + "tokens_seen": 220921856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047123370110330995, + "loss": 3.2226, + "theoretical_loss": 4.301685673873987, + "tokens_seen": 220987392 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047122367101303913, + "loss": 3.2153, + "theoretical_loss": 4.301528532751312, + "tokens_seen": 221052928 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004712136409227683, + "loss": 3.5554, + "theoretical_loss": 4.301371451249888, + "tokens_seen": 221118464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 284674, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.90860652923584, + "objective/train/theoretical_loss": 4.301214429329433, + "objective/train/tokens_used": 241644000, + "theoretical_loss": 4.301214429329433, + "tokens_seen": 221184000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047120361083249755, + "loss": 3.2309, + "theoretical_loss": 4.301214429329433, + "tokens_seen": 221184000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711935807422267, + "loss": 3.2685, + "theoretical_loss": 4.301057466949707, + "tokens_seen": 221249536 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711835506519559, + "loss": 3.1091, + "theoretical_loss": 4.300900564070504, + "tokens_seen": 221315072 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047117352056168504, + "loss": 3.0911, + "theoretical_loss": 4.30074372065166, + "tokens_seen": 221380608 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047116349047141427, + "loss": 3.3026, + "theoretical_loss": 4.300586936653049, + "tokens_seen": 221446144 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047115346038114345, + "loss": 3.2759, + "theoretical_loss": 4.300430212034587, + "tokens_seen": 221511680 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047114343029087263, + "loss": 3.1257, + "theoretical_loss": 4.300273546756223, + "tokens_seen": 221577216 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711334002006018, + "loss": 3.1675, + "theoretical_loss": 4.300116940777951, + "tokens_seen": 221642752 + }, + { + "epoch": 0.07, + "learning_rate": 0.000471123370110331, + "loss": 2.9268, + "theoretical_loss": 4.299960394059799, + "tokens_seen": 221708288 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711133400200602, + "loss": 3.1183, + "theoretical_loss": 4.299803906561835, + "tokens_seen": 221773824 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711033099297894, + "loss": 3.3301, + "theoretical_loss": 4.29964747824417, + "tokens_seen": 221839360 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047109327983951854, + "loss": 3.0193, + "theoretical_loss": 4.299491109066947, + "tokens_seen": 221904896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710832497492478, + "loss": 3.4411, + "theoretical_loss": 4.299334798990351, + "tokens_seen": 221970432 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710732196589769, + "loss": 3.2296, + "theoretical_loss": 4.2991785479746065, + "tokens_seen": 222035968 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047106318956870614, + "loss": 3.0854, + "theoretical_loss": 4.299022355979974, + "tokens_seen": 222101504 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710531594784353, + "loss": 3.2255, + "theoretical_loss": 4.298866222966755, + "tokens_seen": 222167040 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710431293881645, + "loss": 3.2013, + "theoretical_loss": 4.298710148895286, + "tokens_seen": 222232576 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710330992978937, + "loss": 3.2448, + "theoretical_loss": 4.298554133725946, + "tokens_seen": 222298112 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710230692076229, + "loss": 3.1119, + "theoretical_loss": 4.298398177419149, + "tokens_seen": 222363648 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047101303911735204, + "loss": 3.1882, + "theoretical_loss": 4.298242279935349, + "tokens_seen": 222429184 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710030090270813, + "loss": 3.0531, + "theoretical_loss": 4.2980864412350375, + "tokens_seen": 222494720 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709929789368104, + "loss": 3.0501, + "theoretical_loss": 4.297930661278745, + "tokens_seen": 222560256 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047098294884653964, + "loss": 3.1275, + "theoretical_loss": 4.297774940027038, + "tokens_seen": 222625792 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709729187562688, + "loss": 3.3449, + "theoretical_loss": 4.297619277440523, + "tokens_seen": 222691328 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470962888665998, + "loss": 3.2312, + "theoretical_loss": 4.297463673479846, + "tokens_seen": 222756864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 286126, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6270840167999268, + "objective/train/theoretical_loss": 4.297308128105687, + "objective/train/tokens_used": 243282400, + "theoretical_loss": 4.297308128105687, + "tokens_seen": 222822400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709528585757272, + "loss": 3.1513, + "theoretical_loss": 4.297308128105687, + "tokens_seen": 222822400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047094282848545636, + "loss": 3.1221, + "theoretical_loss": 4.297152641278767, + "tokens_seen": 222887936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047093279839518555, + "loss": 3.2143, + "theoretical_loss": 4.296997212959842, + "tokens_seen": 222953472 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709227683049148, + "loss": 3.0636, + "theoretical_loss": 4.296841843109711, + "tokens_seen": 223019008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709127382146439, + "loss": 3.2735, + "theoretical_loss": 4.296686531689204, + "tokens_seen": 223084544 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047090270812437314, + "loss": 2.9135, + "theoretical_loss": 4.296531278659193, + "tokens_seen": 223150080 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047089267803410227, + "loss": 3.1243, + "theoretical_loss": 4.296376083980589, + "tokens_seen": 223215616 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708826479438315, + "loss": 3.0309, + "theoretical_loss": 4.296220947614337, + "tokens_seen": 223281152 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708726178535607, + "loss": 3.154, + "theoretical_loss": 4.296065869521421, + "tokens_seen": 223346688 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047086258776328987, + "loss": 2.8156, + "theoretical_loss": 4.295910849662862, + "tokens_seen": 223412224 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047085255767301905, + "loss": 3.0104, + "theoretical_loss": 4.2957558879997215, + "tokens_seen": 223477760 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708425275827483, + "loss": 3.1348, + "theoretical_loss": 4.295600984493093, + "tokens_seen": 223543296 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708324974924774, + "loss": 3.047, + "theoretical_loss": 4.295446139104112, + "tokens_seen": 223608832 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047082246740220665, + "loss": 3.3838, + "theoretical_loss": 4.295291351793951, + "tokens_seen": 223674368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708124373119358, + "loss": 3.0171, + "theoretical_loss": 4.295136622523817, + "tokens_seen": 223739904 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470802407221665, + "loss": 3.3862, + "theoretical_loss": 4.294981951254956, + "tokens_seen": 223805440 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707923771313942, + "loss": 3.1766, + "theoretical_loss": 4.294827337948651, + "tokens_seen": 223870976 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047078234704112337, + "loss": 3.0689, + "theoretical_loss": 4.294672782566224, + "tokens_seen": 223936512 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047077231695085255, + "loss": 3.1193, + "theoretical_loss": 4.29451828506903, + "tokens_seen": 224002048 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047076228686058173, + "loss": 3.4053, + "theoretical_loss": 4.294363845418465, + "tokens_seen": 224067584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707522567703109, + "loss": 3.1876, + "theoretical_loss": 4.29420946357596, + "tokens_seen": 224133120 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047074222668004015, + "loss": 3.2133, + "theoretical_loss": 4.294055139502985, + "tokens_seen": 224198656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707321965897693, + "loss": 3.2064, + "theoretical_loss": 4.293900873161043, + "tokens_seen": 224264192 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707221664994985, + "loss": 3.295, + "theoretical_loss": 4.293746664511678, + "tokens_seen": 224329728 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047071213640922775, + "loss": 2.9466, + "theoretical_loss": 4.293592513516469, + "tokens_seen": 224395264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 286836, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5235793590545654, + "objective/train/theoretical_loss": 4.293438420137031, + "objective/train/tokens_used": 244920800, + "theoretical_loss": 4.293438420137031, + "tokens_seen": 224460800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707021063189569, + "loss": 3.2967, + "theoretical_loss": 4.293438420137031, + "tokens_seen": 224460800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706920762286861, + "loss": 3.1972, + "theoretical_loss": 4.293284384335017, + "tokens_seen": 224526336 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047068204613841524, + "loss": 3.2584, + "theoretical_loss": 4.293130406072118, + "tokens_seen": 224591872 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047067201604814447, + "loss": 3.1142, + "theoretical_loss": 4.292976485310057, + "tokens_seen": 224657408 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047066198595787365, + "loss": 3.1867, + "theoretical_loss": 4.2928226220106005, + "tokens_seen": 224722944 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047065195586760283, + "loss": 3.1763, + "theoretical_loss": 4.292668816135545, + "tokens_seen": 224788480 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470641925777332, + "loss": 3.5007, + "theoretical_loss": 4.292515067646727, + "tokens_seen": 224854016 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706318956870612, + "loss": 2.8707, + "theoretical_loss": 4.29236137650602, + "tokens_seen": 224919552 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706218655967904, + "loss": 3.2017, + "theoretical_loss": 4.2922077426753305, + "tokens_seen": 224985088 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706118355065196, + "loss": 3.0439, + "theoretical_loss": 4.292054166116605, + "tokens_seen": 225050624 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047060180541624874, + "loss": 3.2209, + "theoretical_loss": 4.291900646791825, + "tokens_seen": 225116160 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470591775325978, + "loss": 3.2066, + "theoretical_loss": 4.2917471846630075, + "tokens_seen": 225181696 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705817452357071, + "loss": 2.9519, + "theoretical_loss": 4.291593779692207, + "tokens_seen": 225247232 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047057171514543634, + "loss": 3.1154, + "theoretical_loss": 4.291440431841513, + "tokens_seen": 225312768 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705616850551655, + "loss": 3.181, + "theoretical_loss": 4.291287141073053, + "tokens_seen": 225378304 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705516549648947, + "loss": 2.9915, + "theoretical_loss": 4.291133907348989, + "tokens_seen": 225443840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705416248746239, + "loss": 3.1199, + "theoretical_loss": 4.29098073063152, + "tokens_seen": 225509376 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705315947843531, + "loss": 3.3002, + "theoretical_loss": 4.29082761088288, + "tokens_seen": 225574912 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047052156469408224, + "loss": 2.7099, + "theoretical_loss": 4.290674548065338, + "tokens_seen": 225640448 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705115346038115, + "loss": 3.1867, + "theoretical_loss": 4.290521542141203, + "tokens_seen": 225705984 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705015045135406, + "loss": 3.0885, + "theoretical_loss": 4.290368593072817, + "tokens_seen": 225771520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047049147442326984, + "loss": 3.1099, + "theoretical_loss": 4.290215700822556, + "tokens_seen": 225837056 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470481444332999, + "loss": 3.1279, + "theoretical_loss": 4.290062865352837, + "tokens_seen": 225902592 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004704714142427282, + "loss": 3.0228, + "theoretical_loss": 4.289910086626108, + "tokens_seen": 225968128 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004704613841524574, + "loss": 3.3652, + "theoretical_loss": 4.289757364604855, + "tokens_seen": 226033664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 288143, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2444591522216797, + "objective/train/theoretical_loss": 4.2896046992515995, + "objective/train/tokens_used": 246559200, + "theoretical_loss": 4.2896046992515995, + "tokens_seen": 226099200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047045135406218656, + "loss": 3.2585, + "theoretical_loss": 4.2896046992515995, + "tokens_seen": 226099200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047044132397191575, + "loss": 2.841, + "theoretical_loss": 4.289452090528897, + "tokens_seen": 226164736 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470431293881645, + "loss": 3.1428, + "theoretical_loss": 4.289299538399341, + "tokens_seen": 226230272 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004704212637913741, + "loss": 3.229, + "theoretical_loss": 4.28914704282556, + "tokens_seen": 226295808 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047041123370110334, + "loss": 3.003, + "theoretical_loss": 4.288994603770215, + "tokens_seen": 226361344 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047040120361083247, + "loss": 3.1008, + "theoretical_loss": 4.288842221196007, + "tokens_seen": 226426880 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703911735205617, + "loss": 3.0763, + "theoretical_loss": 4.28868989506567, + "tokens_seen": 226492416 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703811434302909, + "loss": 3.1798, + "theoretical_loss": 4.288537625341974, + "tokens_seen": 226557952 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047037111334002007, + "loss": 3.5773, + "theoretical_loss": 4.288385411987722, + "tokens_seen": 226623488 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047036108324974925, + "loss": 3.156, + "theoretical_loss": 4.288233254965755, + "tokens_seen": 226689024 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703510531594785, + "loss": 2.9742, + "theoretical_loss": 4.2880811542389505, + "tokens_seen": 226754560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703410230692076, + "loss": 2.9553, + "theoretical_loss": 4.287929109770217, + "tokens_seen": 226820096 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047033099297893685, + "loss": 3.1251, + "theoretical_loss": 4.287777121522501, + "tokens_seen": 226885632 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470320962888666, + "loss": 3.0942, + "theoretical_loss": 4.287625189458781, + "tokens_seen": 226951168 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703109327983952, + "loss": 3.2267, + "theoretical_loss": 4.287473313542077, + "tokens_seen": 227016704 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703009027081244, + "loss": 3.0715, + "theoretical_loss": 4.287321493735438, + "tokens_seen": 227082240 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047029087261785357, + "loss": 3.2224, + "theoretical_loss": 4.287169730001949, + "tokens_seen": 227147776 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047028084252758275, + "loss": 3.1082, + "theoretical_loss": 4.287018022304733, + "tokens_seen": 227213312 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047027081243731193, + "loss": 3.2017, + "theoretical_loss": 4.286866370606943, + "tokens_seen": 227278848 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702607823470411, + "loss": 2.8765, + "theoretical_loss": 4.286714774871772, + "tokens_seen": 227344384 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047025075225677035, + "loss": 2.7387, + "theoretical_loss": 4.286563235062444, + "tokens_seen": 227409920 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702407221664995, + "loss": 2.8634, + "theoretical_loss": 4.28641175114222, + "tokens_seen": 227475456 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702306920762287, + "loss": 2.7575, + "theoretical_loss": 4.286260323074394, + "tokens_seen": 227540992 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047022066198595784, + "loss": 3.0389, + "theoretical_loss": 4.286108950822296, + "tokens_seen": 227606528 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702106318956871, + "loss": 3.0842, + "theoretical_loss": 4.285957634349289, + "tokens_seen": 227672064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 288537, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2806308269500732, + "objective/train/theoretical_loss": 4.285806373618774, + "objective/train/tokens_used": 248197600, + "theoretical_loss": 4.285806373618774, + "tokens_seen": 227737600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047020060180541626, + "loss": 3.402, + "theoretical_loss": 4.285806373618774, + "tokens_seen": 227737600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047019057171514544, + "loss": 3.2526, + "theoretical_loss": 4.285655168594182, + "tokens_seen": 227803136 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701805416248746, + "loss": 3.2338, + "theoretical_loss": 4.285504019238982, + "tokens_seen": 227868672 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047017051153460385, + "loss": 3.1311, + "theoretical_loss": 4.285352925516676, + "tokens_seen": 227934208 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470160481444333, + "loss": 3.2231, + "theoretical_loss": 4.2852018873908, + "tokens_seen": 227999744 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701504513540622, + "loss": 3.1156, + "theoretical_loss": 4.285050904824925, + "tokens_seen": 228065280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047014042126379134, + "loss": 3.2022, + "theoretical_loss": 4.284899977782658, + "tokens_seen": 228130816 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701303911735206, + "loss": 3.1321, + "theoretical_loss": 4.284749106227636, + "tokens_seen": 228196352 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047012036108324976, + "loss": 3.1009, + "theoretical_loss": 4.284598290123535, + "tokens_seen": 228261888 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047011033099297894, + "loss": 3.2566, + "theoretical_loss": 4.284447529434061, + "tokens_seen": 228327424 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701003009027081, + "loss": 3.1493, + "theoretical_loss": 4.284296824122959, + "tokens_seen": 228392960 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700902708124373, + "loss": 3.1941, + "theoretical_loss": 4.284146174154003, + "tokens_seen": 228458496 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700802407221665, + "loss": 2.9355, + "theoretical_loss": 4.283995579491004, + "tokens_seen": 228524032 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700702106318957, + "loss": 3.098, + "theoretical_loss": 4.283845040097807, + "tokens_seen": 228589568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047006018054162484, + "loss": 3.013, + "theoretical_loss": 4.28369455593829, + "tokens_seen": 228655104 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700501504513541, + "loss": 3.2876, + "theoretical_loss": 4.2835441269763646, + "tokens_seen": 228720640 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047004012036108326, + "loss": 3.1562, + "theoretical_loss": 4.283393753175979, + "tokens_seen": 228786176 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047003009027081244, + "loss": 3.1474, + "theoretical_loss": 4.283243434501112, + "tokens_seen": 228851712 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700200601805416, + "loss": 3.1408, + "theoretical_loss": 4.283093170915778, + "tokens_seen": 228917248 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700100300902708, + "loss": 3.0895, + "theoretical_loss": 4.282942962384023, + "tokens_seen": 228982784 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047, + "loss": 3.1473, + "theoretical_loss": 4.282792808869932, + "tokens_seen": 229048320 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699899699097292, + "loss": 3.2697, + "theoretical_loss": 4.282642710337618, + "tokens_seen": 229113856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046997993981945835, + "loss": 2.9681, + "theoretical_loss": 4.28249266675123, + "tokens_seen": 229179392 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699699097291876, + "loss": 3.0361, + "theoretical_loss": 4.282342678074951, + "tokens_seen": 229244928 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046995987963891676, + "loss": 3.0598, + "theoretical_loss": 4.2821927442729955, + "tokens_seen": 229310464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 289830, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.521411418914795, + "objective/train/theoretical_loss": 4.282042865309616, + "objective/train/tokens_used": 249836000, + "theoretical_loss": 4.282042865309616, + "tokens_seen": 229376000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046994984954864595, + "loss": 3.1125, + "theoretical_loss": 4.282042865309616, + "tokens_seen": 229376000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699398194583752, + "loss": 2.9385, + "theoretical_loss": 4.281893041149093, + "tokens_seen": 229441536 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699297893681043, + "loss": 3.3297, + "theoretical_loss": 4.2817432717557455, + "tokens_seen": 229507072 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046991975927783354, + "loss": 2.8433, + "theoretical_loss": 4.28159355709392, + "tokens_seen": 229572608 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046990972918756267, + "loss": 3.0393, + "theoretical_loss": 4.281443897128004, + "tokens_seen": 229638144 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698996990972919, + "loss": 3.0533, + "theoretical_loss": 4.2812942918224115, + "tokens_seen": 229703680 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698896690070211, + "loss": 3.0605, + "theoretical_loss": 4.281144741141593, + "tokens_seen": 229769216 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046987963891675027, + "loss": 3.2703, + "theoretical_loss": 4.280995245050032, + "tokens_seen": 229834752 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046986960882647945, + "loss": 3.296, + "theoretical_loss": 4.2808458035122445, + "tokens_seen": 229900288 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698595787362087, + "loss": 3.1057, + "theoretical_loss": 4.2806964164927805, + "tokens_seen": 229965824 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698495486459378, + "loss": 3.2996, + "theoretical_loss": 4.280547083956224, + "tokens_seen": 230031360 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046983951855566705, + "loss": 3.1414, + "theoretical_loss": 4.280397805867188, + "tokens_seen": 230096896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698294884653962, + "loss": 2.9225, + "theoretical_loss": 4.280248582190324, + "tokens_seen": 230162432 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698194583751254, + "loss": 3.1511, + "theoretical_loss": 4.280099412890312, + "tokens_seen": 230227968 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698094282848546, + "loss": 3.36, + "theoretical_loss": 4.279950297931869, + "tokens_seen": 230293504 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046979939819458377, + "loss": 3.1327, + "theoretical_loss": 4.27980123727974, + "tokens_seen": 230359040 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046978936810431295, + "loss": 3.1655, + "theoretical_loss": 4.279652230898709, + "tokens_seen": 230424576 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046977933801404213, + "loss": 2.9951, + "theoretical_loss": 4.279503278753586, + "tokens_seen": 230490112 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697693079237713, + "loss": 3.1871, + "theoretical_loss": 4.27935438080922, + "tokens_seen": 230555648 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046975927783350055, + "loss": 3.413, + "theoretical_loss": 4.27920553703049, + "tokens_seen": 230621184 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697492477432297, + "loss": 3.0407, + "theoretical_loss": 4.279056747382306, + "tokens_seen": 230686720 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697392176529589, + "loss": 3.2621, + "theoretical_loss": 4.278908011829613, + "tokens_seen": 230752256 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046972918756268804, + "loss": 3.0509, + "theoretical_loss": 4.27875933033739, + "tokens_seen": 230817792 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697191574724173, + "loss": 3.1669, + "theoretical_loss": 4.278610702870646, + "tokens_seen": 230883328 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046970912738214646, + "loss": 3.0121, + "theoretical_loss": 4.278462129394423, + "tokens_seen": 230948864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 290404, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.064453125, + "objective/train/theoretical_loss": 4.278313609873795, + "objective/train/tokens_used": 251474400, + "theoretical_loss": 4.278313609873795, + "tokens_seen": 231014400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046969909729187564, + "loss": 3.1929, + "theoretical_loss": 4.278313609873795, + "tokens_seen": 231014400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696890672016048, + "loss": 3.1449, + "theoretical_loss": 4.278165144273871, + "tokens_seen": 231079936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046967903711133405, + "loss": 3.1433, + "theoretical_loss": 4.27801673255979, + "tokens_seen": 231145472 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696690070210632, + "loss": 3.1905, + "theoretical_loss": 4.277868374696725, + "tokens_seen": 231211008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696589769307924, + "loss": 3.2421, + "theoretical_loss": 4.277720070649879, + "tokens_seen": 231276544 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046964894684052154, + "loss": 3.3793, + "theoretical_loss": 4.277571820384491, + "tokens_seen": 231342080 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696389167502508, + "loss": 3.0555, + "theoretical_loss": 4.277423623865829, + "tokens_seen": 231407616 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046962888665997996, + "loss": 3.0292, + "theoretical_loss": 4.277275481059195, + "tokens_seen": 231473152 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046961885656970914, + "loss": 3.2957, + "theoretical_loss": 4.2771273919299215, + "tokens_seen": 231538688 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696088264794383, + "loss": 3.2876, + "theoretical_loss": 4.276979356443377, + "tokens_seen": 231604224 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695987963891675, + "loss": 3.3123, + "theoretical_loss": 4.276831374564957, + "tokens_seen": 231669760 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695887662988967, + "loss": 3.0926, + "theoretical_loss": 4.276683446260093, + "tokens_seen": 231735296 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695787362086259, + "loss": 3.0816, + "theoretical_loss": 4.276535571494247, + "tokens_seen": 231800832 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046956870611835505, + "loss": 2.8716, + "theoretical_loss": 4.276387750232913, + "tokens_seen": 231866368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695586760280843, + "loss": 3.5923, + "theoretical_loss": 4.276239982441617, + "tokens_seen": 231931904 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046954864593781346, + "loss": 3.1727, + "theoretical_loss": 4.276092268085918, + "tokens_seen": 231997440 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046953861584754264, + "loss": 3.0979, + "theoretical_loss": 4.275944607131406, + "tokens_seen": 232062976 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695285857572718, + "loss": 3.179, + "theoretical_loss": 4.275796999543703, + "tokens_seen": 232128512 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469518555667001, + "loss": 3.0949, + "theoretical_loss": 4.275649445288461, + "tokens_seen": 232194048 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695085255767302, + "loss": 3.2259, + "theoretical_loss": 4.275501944331367, + "tokens_seen": 232259584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694984954864594, + "loss": 3.2972, + "theoretical_loss": 4.275354496638139, + "tokens_seen": 232325120 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046948846539618855, + "loss": 3.5002, + "theoretical_loss": 4.275207102174525, + "tokens_seen": 232390656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694784353059178, + "loss": 2.9256, + "theoretical_loss": 4.275059760906305, + "tokens_seen": 232456192 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694684052156469, + "loss": 3.3357, + "theoretical_loss": 4.2749124727992935, + "tokens_seen": 232521728 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046945837512537615, + "loss": 3.146, + "theoretical_loss": 4.274765237819333, + "tokens_seen": 232587264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 291491, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0524933338165283, + "objective/train/theoretical_loss": 4.274618055932298, + "objective/train/tokens_used": 253112800, + "theoretical_loss": 4.274618055932298, + "tokens_seen": 232652800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694483450351053, + "loss": 2.9909, + "theoretical_loss": 4.274618055932298, + "tokens_seen": 232652800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694383149448345, + "loss": 3.1934, + "theoretical_loss": 4.2744709271040975, + "tokens_seen": 232718336 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694282848545637, + "loss": 3.075, + "theoretical_loss": 4.27432385130067, + "tokens_seen": 232783872 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046941825476429287, + "loss": 3.3298, + "theoretical_loss": 4.274176828487984, + "tokens_seen": 232849408 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046940822467402205, + "loss": 3.1214, + "theoretical_loss": 4.2740298586320415, + "tokens_seen": 232914944 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693981945837513, + "loss": 3.2709, + "theoretical_loss": 4.273882941698876, + "tokens_seen": 232980480 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693881644934804, + "loss": 3.351, + "theoretical_loss": 4.27373607765455, + "tokens_seen": 233046016 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046937813440320965, + "loss": 3.24, + "theoretical_loss": 4.2735892664651605, + "tokens_seen": 233111552 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046936810431293883, + "loss": 3.1293, + "theoretical_loss": 4.273442508096833, + "tokens_seen": 233177088 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469358074222668, + "loss": 3.2126, + "theoretical_loss": 4.273295802515726, + "tokens_seen": 233242624 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693480441323972, + "loss": 3.1292, + "theoretical_loss": 4.273149149688028, + "tokens_seen": 233308160 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693380140421264, + "loss": 3.0769, + "theoretical_loss": 4.27300254957996, + "tokens_seen": 233373696 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046932798395185555, + "loss": 3.026, + "theoretical_loss": 4.272856002157772, + "tokens_seen": 233439232 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693179538615848, + "loss": 3.335, + "theoretical_loss": 4.272709507387748, + "tokens_seen": 233504768 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693079237713139, + "loss": 3.1597, + "theoretical_loss": 4.2725630652362, + "tokens_seen": 233570304 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046929789368104315, + "loss": 3.1306, + "theoretical_loss": 4.272416675669473, + "tokens_seen": 233635840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692878635907723, + "loss": 3.2993, + "theoretical_loss": 4.272270338653942, + "tokens_seen": 233701376 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692778335005015, + "loss": 3.2775, + "theoretical_loss": 4.272124054156014, + "tokens_seen": 233766912 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692678034102307, + "loss": 3.0821, + "theoretical_loss": 4.271977822142125, + "tokens_seen": 233832448 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692577733199599, + "loss": 3.1264, + "theoretical_loss": 4.271831642578745, + "tokens_seen": 233897984 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046924774322968906, + "loss": 3.0504, + "theoretical_loss": 4.27168551543237, + "tokens_seen": 233963520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046923771313941824, + "loss": 3.29, + "theoretical_loss": 4.271539440669532, + "tokens_seen": 234029056 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692276830491474, + "loss": 3.2624, + "theoretical_loss": 4.27139341825679, + "tokens_seen": 234094592 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046921765295887666, + "loss": 3.288, + "theoretical_loss": 4.271247448160736, + "tokens_seen": 234160128 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046920762286860584, + "loss": 3.2345, + "theoretical_loss": 4.27110153034799, + "tokens_seen": 234225664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 292210, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.441500663757324, + "objective/train/theoretical_loss": 4.270955664785207, + "objective/train/tokens_used": 254751200, + "theoretical_loss": 4.270955664785207, + "tokens_seen": 234291200 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469197592778335, + "loss": 3.2181, + "theoretical_loss": 4.270955664785207, + "tokens_seen": 234291200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046918756268806425, + "loss": 3.0661, + "theoretical_loss": 4.2708098514390676, + "tokens_seen": 234356736 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691775325977934, + "loss": 3.0524, + "theoretical_loss": 4.270664090276286, + "tokens_seen": 234422272 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691675025075226, + "loss": 3.2508, + "theoretical_loss": 4.2705183812636065, + "tokens_seen": 234487808 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046915747241725174, + "loss": 3.3243, + "theoretical_loss": 4.270372724367803, + "tokens_seen": 234553344 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469147442326981, + "loss": 3.2352, + "theoretical_loss": 4.270227119555681, + "tokens_seen": 234618880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046913741223671016, + "loss": 3.1664, + "theoretical_loss": 4.270081566794076, + "tokens_seen": 234684416 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046912738214643934, + "loss": 3.2272, + "theoretical_loss": 4.269936066049852, + "tokens_seen": 234749952 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691173520561685, + "loss": 3.403, + "theoretical_loss": 4.269790617289907, + "tokens_seen": 234815488 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691073219658977, + "loss": 3.171, + "theoretical_loss": 4.269645220481166, + "tokens_seen": 234881024 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690972918756269, + "loss": 3.1504, + "theoretical_loss": 4.269499875590587, + "tokens_seen": 234946560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690872617853561, + "loss": 3.3593, + "theoretical_loss": 4.269354582585156, + "tokens_seen": 235012096 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046907723169508525, + "loss": 3.4094, + "theoretical_loss": 4.269209341431889, + "tokens_seen": 235077632 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690672016048145, + "loss": 3.0117, + "theoretical_loss": 4.269064152097835, + "tokens_seen": 235143168 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046905717151454366, + "loss": 3.1555, + "theoretical_loss": 4.26891901455007, + "tokens_seen": 235208704 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046904714142427284, + "loss": 3.3753, + "theoretical_loss": 4.268773928755701, + "tokens_seen": 235274240 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469037111334002, + "loss": 3.1484, + "theoretical_loss": 4.268628894681868, + "tokens_seen": 235339776 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690270812437312, + "loss": 3.2826, + "theoretical_loss": 4.268483912295735, + "tokens_seen": 235405312 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690170511534604, + "loss": 3.267, + "theoretical_loss": 4.268338981564502, + "tokens_seen": 235470848 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690070210631896, + "loss": 3.0986, + "theoretical_loss": 4.268194102455395, + "tokens_seen": 235536384 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046899699097291875, + "loss": 3.0481, + "theoretical_loss": 4.26804927493567, + "tokens_seen": 235601920 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468986960882648, + "loss": 3.3586, + "theoretical_loss": 4.267904498972618, + "tokens_seen": 235667456 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689769307923771, + "loss": 3.1682, + "theoretical_loss": 4.267759774533552, + "tokens_seen": 235732992 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046896690070210635, + "loss": 3.1601, + "theoretical_loss": 4.267615101585821, + "tokens_seen": 235798528 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046895687061183553, + "loss": 3.3558, + "theoretical_loss": 4.267470480096801, + "tokens_seen": 235864064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 293604, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7816476821899414, + "objective/train/theoretical_loss": 4.267325910033897, + "objective/train/tokens_used": 256389600, + "theoretical_loss": 4.267325910033897, + "tokens_seen": 235929600 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689468405215647, + "loss": 3.0005, + "theoretical_loss": 4.267325910033897, + "tokens_seen": 235929600 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689368104312939, + "loss": 3.3922, + "theoretical_loss": 4.267181391364547, + "tokens_seen": 235995136 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046892678034102307, + "loss": 3.2998, + "theoretical_loss": 4.267036924056215, + "tokens_seen": 236060672 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046891675025075225, + "loss": 3.0282, + "theoretical_loss": 4.266892508076397, + "tokens_seen": 236126208 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689067201604815, + "loss": 3.2748, + "theoretical_loss": 4.266748143392617, + "tokens_seen": 236191744 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688966900702106, + "loss": 3.0778, + "theoretical_loss": 4.26660382997243, + "tokens_seen": 236257280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046888665997993985, + "loss": 3.3932, + "theoretical_loss": 4.26645956778342, + "tokens_seen": 236322816 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046887662988966903, + "loss": 3.5158, + "theoretical_loss": 4.2663153567932, + "tokens_seen": 236388352 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688665997993982, + "loss": 2.9413, + "theoretical_loss": 4.266171196969412, + "tokens_seen": 236453888 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688565697091274, + "loss": 3.0881, + "theoretical_loss": 4.2660270882797295, + "tokens_seen": 236519424 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688465396188566, + "loss": 3.3507, + "theoretical_loss": 4.265883030691853, + "tokens_seen": 236584960 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046883650952858575, + "loss": 3.2892, + "theoretical_loss": 4.265739024173515, + "tokens_seen": 236650496 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468826479438315, + "loss": 3.1898, + "theoretical_loss": 4.265595068692473, + "tokens_seen": 236716032 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688164493480441, + "loss": 3.0696, + "theoretical_loss": 4.26545116421652, + "tokens_seen": 236781568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046880641925777335, + "loss": 3.385, + "theoretical_loss": 4.265307310713471, + "tokens_seen": 236847104 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687963891675025, + "loss": 3.3006, + "theoretical_loss": 4.2651635081511765, + "tokens_seen": 236912640 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687863590772317, + "loss": 3.1971, + "theoretical_loss": 4.265019756497512, + "tokens_seen": 236978176 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687763289869609, + "loss": 3.0931, + "theoretical_loss": 4.264876055720386, + "tokens_seen": 237043712 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687662988966901, + "loss": 3.1088, + "theoretical_loss": 4.264732405787731, + "tokens_seen": 237109248 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046875626880641926, + "loss": 3.0559, + "theoretical_loss": 4.264588806667513, + "tokens_seen": 237174784 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046874623871614844, + "loss": 3.3792, + "theoretical_loss": 4.264445258327724, + "tokens_seen": 237240320 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687362086258776, + "loss": 3.2076, + "theoretical_loss": 4.264301760736389, + "tokens_seen": 237305856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046872617853560686, + "loss": 3.2832, + "theoretical_loss": 4.264158313861557, + "tokens_seen": 237371392 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468716148445336, + "loss": 3.3468, + "theoretical_loss": 4.264014917671309, + "tokens_seen": 237436928 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687061183550652, + "loss": 3.0554, + "theoretical_loss": 4.2638715721337554, + "tokens_seen": 237502464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 294906, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.298981189727783, + "objective/train/theoretical_loss": 4.263728277217032, + "objective/train/tokens_used": 258028000, + "theoretical_loss": 4.263728277217032, + "tokens_seen": 237568000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686960882647944, + "loss": 3.2936, + "theoretical_loss": 4.263728277217032, + "tokens_seen": 237568000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686860581745236, + "loss": 3.0715, + "theoretical_loss": 4.263585032889306, + "tokens_seen": 237633536 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046867602808425276, + "loss": 3.2912, + "theoretical_loss": 4.263441839118776, + "tokens_seen": 237699072 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046866599799398194, + "loss": 2.9441, + "theoretical_loss": 4.2632986958736625, + "tokens_seen": 237764608 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686559679037111, + "loss": 3.467, + "theoretical_loss": 4.263155603122221, + "tokens_seen": 237830144 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046864593781344036, + "loss": 3.2158, + "theoretical_loss": 4.263012560832733, + "tokens_seen": 237895680 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686359077231695, + "loss": 3.2119, + "theoretical_loss": 4.262869568973508, + "tokens_seen": 237961216 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686258776328987, + "loss": 3.2893, + "theoretical_loss": 4.262726627512886, + "tokens_seen": 238026752 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046861584754262785, + "loss": 3.4987, + "theoretical_loss": 4.262583736419234, + "tokens_seen": 238092288 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686058174523571, + "loss": 3.271, + "theoretical_loss": 4.26244089566095, + "tokens_seen": 238157824 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046859578736208626, + "loss": 3.2722, + "theoretical_loss": 4.262298105206456, + "tokens_seen": 238223360 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046858575727181545, + "loss": 3.3172, + "theoretical_loss": 4.262155365024207, + "tokens_seen": 238288896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685757271815446, + "loss": 3.2976, + "theoretical_loss": 4.262012675082685, + "tokens_seen": 238354432 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685656970912738, + "loss": 3.229, + "theoretical_loss": 4.261870035350399, + "tokens_seen": 238419968 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468555667001003, + "loss": 3.5022, + "theoretical_loss": 4.261727445795888, + "tokens_seen": 238485504 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685456369107322, + "loss": 3.1887, + "theoretical_loss": 4.26158490638772, + "tokens_seen": 238551040 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046853560682046135, + "loss": 3.1939, + "theoretical_loss": 4.261442417094488, + "tokens_seen": 238616576 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685255767301906, + "loss": 3.0544, + "theoretical_loss": 4.261299977884816, + "tokens_seen": 238682112 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046851554663991977, + "loss": 3.1491, + "theoretical_loss": 4.2611575887273565, + "tokens_seen": 238747648 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046850551654964895, + "loss": 3.215, + "theoretical_loss": 4.261015249590789, + "tokens_seen": 238813184 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046849548645937813, + "loss": 3.2087, + "theoretical_loss": 4.260872960443822, + "tokens_seen": 238878720 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684854563691073, + "loss": 3.0708, + "theoretical_loss": 4.260730721255191, + "tokens_seen": 238944256 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684754262788365, + "loss": 2.872, + "theoretical_loss": 4.260588531993662, + "tokens_seen": 239009792 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046846539618856573, + "loss": 3.329, + "theoretical_loss": 4.260446392628026, + "tokens_seen": 239075328 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684553660982949, + "loss": 3.2693, + "theoretical_loss": 4.2603043031271035, + "tokens_seen": 239140864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 295763, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9719858169555664, + "objective/train/theoretical_loss": 4.260162263459744, + "objective/train/tokens_used": 259666400, + "theoretical_loss": 4.260162263459744, + "tokens_seen": 239206400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684453360080241, + "loss": 3.251, + "theoretical_loss": 4.260162263459744, + "tokens_seen": 239206400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046843530591775327, + "loss": 3.3018, + "theoretical_loss": 4.260020273594824, + "tokens_seen": 239271936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046842527582748245, + "loss": 2.9441, + "theoretical_loss": 4.259878333501247, + "tokens_seen": 239337472 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684152457372117, + "loss": 3.3198, + "theoretical_loss": 4.259736443147946, + "tokens_seen": 239403008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684052156469408, + "loss": 3.1598, + "theoretical_loss": 4.259594602503881, + "tokens_seen": 239468544 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046839518555667005, + "loss": 3.4569, + "theoretical_loss": 4.259452811538041, + "tokens_seen": 239534080 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046838515546639923, + "loss": 3.3406, + "theoretical_loss": 4.259311070219441, + "tokens_seen": 239599616 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683751253761284, + "loss": 3.0996, + "theoretical_loss": 4.259169378517125, + "tokens_seen": 239665152 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683650952858576, + "loss": 3.2584, + "theoretical_loss": 4.259027736400165, + "tokens_seen": 239730688 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683550651955868, + "loss": 3.4028, + "theoretical_loss": 4.258886143837661, + "tokens_seen": 239796224 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046834503510531595, + "loss": 3.2419, + "theoretical_loss": 4.258744600798739, + "tokens_seen": 239861760 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683350050150452, + "loss": 3.121, + "theoretical_loss": 4.2586031072525525, + "tokens_seen": 239927296 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683249749247743, + "loss": 3.1897, + "theoretical_loss": 4.258461663168285, + "tokens_seen": 239992832 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046831494483450355, + "loss": 3.0282, + "theoretical_loss": 4.258320268515147, + "tokens_seen": 240058368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683049147442327, + "loss": 3.1978, + "theoretical_loss": 4.258178923262376, + "tokens_seen": 240123904 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682948846539619, + "loss": 3.2214, + "theoretical_loss": 4.258037627379235, + "tokens_seen": 240189440 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682848545636911, + "loss": 3.3644, + "theoretical_loss": 4.257896380835018, + "tokens_seen": 240254976 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682748244734203, + "loss": 3.1584, + "theoretical_loss": 4.257755183599045, + "tokens_seen": 240320512 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046826479438314946, + "loss": 3.1607, + "theoretical_loss": 4.257614035640662, + "tokens_seen": 240386048 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046825476429287864, + "loss": 2.992, + "theoretical_loss": 4.257472936929246, + "tokens_seen": 240451584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682447342026078, + "loss": 3.399, + "theoretical_loss": 4.257331887434198, + "tokens_seen": 240517120 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046823470411233706, + "loss": 3.076, + "theoretical_loss": 4.257190887124946, + "tokens_seen": 240582656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682246740220662, + "loss": 3.1499, + "theoretical_loss": 4.25704993597095, + "tokens_seen": 240648192 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682146439317954, + "loss": 3.3741, + "theoretical_loss": 4.256909033941691, + "tokens_seen": 240713728 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682046138415246, + "loss": 3.2284, + "theoretical_loss": 4.256768181006683, + "tokens_seen": 240779264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 296516, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0105109214782715, + "objective/train/theoretical_loss": 4.2566273771354615, + "objective/train/tokens_used": 261304800, + "theoretical_loss": 4.2566273771354615, + "tokens_seen": 240844800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681945837512538, + "loss": 3.02, + "theoretical_loss": 4.2566273771354615, + "tokens_seen": 240844800 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046818455366098296, + "loss": 3.3288, + "theoretical_loss": 4.256486622297595, + "tokens_seen": 240910336 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046817452357071214, + "loss": 3.397, + "theoretical_loss": 4.256345916462674, + "tokens_seen": 240975872 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681644934804413, + "loss": 3.0537, + "theoretical_loss": 4.256205259600321, + "tokens_seen": 241041408 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046815446339017056, + "loss": 3.077, + "theoretical_loss": 4.256064651680182, + "tokens_seen": 241106944 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681444332998997, + "loss": 3.2307, + "theoretical_loss": 4.255924092671931, + "tokens_seen": 241172480 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681344032096289, + "loss": 2.9942, + "theoretical_loss": 4.255783582545269, + "tokens_seen": 241238016 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046812437311935805, + "loss": 3.281, + "theoretical_loss": 4.255643121269924, + "tokens_seen": 241303552 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681143430290873, + "loss": 3.2357, + "theoretical_loss": 4.255502708815651, + "tokens_seen": 241369088 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046810431293881646, + "loss": 3.098, + "theoretical_loss": 4.255362345152234, + "tokens_seen": 241434624 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046809428284854565, + "loss": 3.0425, + "theoretical_loss": 4.255222030249479, + "tokens_seen": 241500160 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680842527582748, + "loss": 3.1521, + "theoretical_loss": 4.255081764077224, + "tokens_seen": 241565696 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468074222668004, + "loss": 3.3244, + "theoretical_loss": 4.25494154660533, + "tokens_seen": 241631232 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680641925777332, + "loss": 3.0535, + "theoretical_loss": 4.254801377803689, + "tokens_seen": 241696768 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680541624874624, + "loss": 3.0864, + "theoretical_loss": 4.254661257642215, + "tokens_seen": 241762304 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046804413239719155, + "loss": 3.1507, + "theoretical_loss": 4.254521186090852, + "tokens_seen": 241827840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680341023069208, + "loss": 3.1771, + "theoretical_loss": 4.254381163119568, + "tokens_seen": 241893376 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046802407221664997, + "loss": 3.3228, + "theoretical_loss": 4.254241188698361, + "tokens_seen": 241958912 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046801404212637915, + "loss": 3.1643, + "theoretical_loss": 4.2541012627972545, + "tokens_seen": 242024448 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046800401203610833, + "loss": 3.0985, + "theoretical_loss": 4.2539613853862965, + "tokens_seen": 242089984 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679939819458375, + "loss": 3.1079, + "theoretical_loss": 4.253821556435565, + "tokens_seen": 242155520 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679839518555667, + "loss": 3.1036, + "theoretical_loss": 4.253681775915161, + "tokens_seen": 242221056 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046797392176529593, + "loss": 3.1256, + "theoretical_loss": 4.253542043795215, + "tokens_seen": 242286592 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046796389167502505, + "loss": 3.0556, + "theoretical_loss": 4.253402360045882, + "tokens_seen": 242352128 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679538615847543, + "loss": 3.2891, + "theoretical_loss": 4.253262724637346, + "tokens_seen": 242417664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 297983, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.877701997756958, + "objective/train/theoretical_loss": 4.253123137539814, + "objective/train/tokens_used": 262943200, + "theoretical_loss": 4.253123137539814, + "tokens_seen": 242483200 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679438314944834, + "loss": 3.1046, + "theoretical_loss": 4.253123137539814, + "tokens_seen": 242483200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046793380140421265, + "loss": 3.1167, + "theoretical_loss": 4.252983598723521, + "tokens_seen": 242548736 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046792377131394183, + "loss": 3.4114, + "theoretical_loss": 4.25284410815873, + "tokens_seen": 242614272 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467913741223671, + "loss": 3.323, + "theoretical_loss": 4.2527046658157275, + "tokens_seen": 242679808 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679037111334002, + "loss": 2.809, + "theoretical_loss": 4.252565271664828, + "tokens_seen": 242745344 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046789368104312943, + "loss": 3.1643, + "theoretical_loss": 4.252425925676373, + "tokens_seen": 242810880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046788365095285856, + "loss": 3.4548, + "theoretical_loss": 4.252286627820727, + "tokens_seen": 242876416 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678736208625878, + "loss": 3.2465, + "theoretical_loss": 4.252147378068285, + "tokens_seen": 242941952 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678635907723169, + "loss": 3.2615, + "theoretical_loss": 4.252008176389465, + "tokens_seen": 243007488 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046785356068204616, + "loss": 3.1489, + "theoretical_loss": 4.251869022754712, + "tokens_seen": 243073024 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046784353059177534, + "loss": 3.2578, + "theoretical_loss": 4.251729917134498, + "tokens_seen": 243138560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678335005015045, + "loss": 3.0778, + "theoretical_loss": 4.251590859499322, + "tokens_seen": 243204096 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678234704112337, + "loss": 3.046, + "theoretical_loss": 4.251451849819704, + "tokens_seen": 243269632 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678134403209629, + "loss": 3.255, + "theoretical_loss": 4.251312888066197, + "tokens_seen": 243335168 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046780341023069206, + "loss": 2.9979, + "theoretical_loss": 4.251173974209375, + "tokens_seen": 243400704 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677933801404213, + "loss": 3.0556, + "theoretical_loss": 4.251035108219839, + "tokens_seen": 243466240 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677833500501504, + "loss": 3.2338, + "theoretical_loss": 4.250896290068218, + "tokens_seen": 243531776 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046777331995987966, + "loss": 3.1279, + "theoretical_loss": 4.250757519725165, + "tokens_seen": 243597312 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677632898696088, + "loss": 3.2705, + "theoretical_loss": 4.25061879716136, + "tokens_seen": 243662848 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467753259779338, + "loss": 3.1101, + "theoretical_loss": 4.250480122347507, + "tokens_seen": 243728384 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677432296890672, + "loss": 2.9749, + "theoretical_loss": 4.250341495254337, + "tokens_seen": 243793920 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677331995987964, + "loss": 3.0554, + "theoretical_loss": 4.250202915852608, + "tokens_seen": 243859456 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046772316950852556, + "loss": 2.9862, + "theoretical_loss": 4.250064384113102, + "tokens_seen": 243924992 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677131394182548, + "loss": 2.8649, + "theoretical_loss": 4.249925900006627, + "tokens_seen": 243990528 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467703109327984, + "loss": 3.533, + "theoretical_loss": 4.249787463504019, + "tokens_seen": 244056064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 298552, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.992788553237915, + "objective/train/theoretical_loss": 4.249649074576134, + "objective/train/tokens_used": 264581600, + "theoretical_loss": 4.249649074576134, + "tokens_seen": 244121600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046769307923771316, + "loss": 3.0907, + "theoretical_loss": 4.249649074576134, + "tokens_seen": 244121600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046768304914744234, + "loss": 2.9822, + "theoretical_loss": 4.249510733193862, + "tokens_seen": 244187136 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676730190571715, + "loss": 3.1675, + "theoretical_loss": 4.249372439328111, + "tokens_seen": 244252672 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046766298896690076, + "loss": 3.1947, + "theoretical_loss": 4.249234192949818, + "tokens_seen": 244318208 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676529588766299, + "loss": 3.1614, + "theoretical_loss": 4.249095994029947, + "tokens_seen": 244383744 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676429287863591, + "loss": 3.2011, + "theoretical_loss": 4.248957842539484, + "tokens_seen": 244449280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046763289869608825, + "loss": 3.1561, + "theoretical_loss": 4.248819738449442, + "tokens_seen": 244514816 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676228686058175, + "loss": 3.3678, + "theoretical_loss": 4.2486816817308615, + "tokens_seen": 244580352 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046761283851554666, + "loss": 3.1089, + "theoretical_loss": 4.248543672354805, + "tokens_seen": 244645888 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046760280842527585, + "loss": 3.2738, + "theoretical_loss": 4.248405710292364, + "tokens_seen": 244711424 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467592778335005, + "loss": 3.3828, + "theoretical_loss": 4.248267795514652, + "tokens_seen": 244776960 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675827482447342, + "loss": 3.1759, + "theoretical_loss": 4.248129927992808, + "tokens_seen": 244842496 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675727181544634, + "loss": 3.004, + "theoretical_loss": 4.247992107698002, + "tokens_seen": 244908032 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675626880641926, + "loss": 3.2486, + "theoretical_loss": 4.247854334601421, + "tokens_seen": 244973568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046755265797392175, + "loss": 3.2047, + "theoretical_loss": 4.247716608674283, + "tokens_seen": 245039104 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467542627883651, + "loss": 3.0704, + "theoretical_loss": 4.247578929887829, + "tokens_seen": 245104640 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046753259779338017, + "loss": 2.9287, + "theoretical_loss": 4.247441298213326, + "tokens_seen": 245170176 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046752256770310935, + "loss": 3.0956, + "theoretical_loss": 4.247303713622067, + "tokens_seen": 245235712 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046751253761283853, + "loss": 3.0942, + "theoretical_loss": 4.247166176085367, + "tokens_seen": 245301248 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675025075225677, + "loss": 3.0354, + "theoretical_loss": 4.247028685574569, + "tokens_seen": 245366784 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674924774322969, + "loss": 3.135, + "theoretical_loss": 4.246891242061041, + "tokens_seen": 245432320 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046748244734202613, + "loss": 3.0614, + "theoretical_loss": 4.246753845516174, + "tokens_seen": 245497856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046747241725175525, + "loss": 3.1953, + "theoretical_loss": 4.246616495911388, + "tokens_seen": 245563392 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674623871614845, + "loss": 2.9679, + "theoretical_loss": 4.246479193218123, + "tokens_seen": 245628928 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674523570712136, + "loss": 3.3495, + "theoretical_loss": 4.246341937407848, + "tokens_seen": 245694464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 299755, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3899707794189453, + "objective/train/theoretical_loss": 4.246204728452055, + "objective/train/tokens_used": 266220000, + "theoretical_loss": 4.246204728452055, + "tokens_seen": 245760000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046744232698094285, + "loss": 3.138, + "theoretical_loss": 4.246204728452055, + "tokens_seen": 245760000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046743229689067203, + "loss": 3.3381, + "theoretical_loss": 4.246067566322259, + "tokens_seen": 245825536 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674222668004012, + "loss": 2.9391, + "theoretical_loss": 4.245930450990007, + "tokens_seen": 245891072 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674122367101304, + "loss": 3.2446, + "theoretical_loss": 4.245793382426861, + "tokens_seen": 245956608 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046740220661985963, + "loss": 3.1792, + "theoretical_loss": 4.245656360604417, + "tokens_seen": 246022144 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046739217652958876, + "loss": 3.3674, + "theoretical_loss": 4.24551938549429, + "tokens_seen": 246087680 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467382146439318, + "loss": 3.1692, + "theoretical_loss": 4.2453824570681205, + "tokens_seen": 246153216 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673721163490471, + "loss": 3.3341, + "theoretical_loss": 4.245245575297577, + "tokens_seen": 246218752 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046736208625877636, + "loss": 3.3756, + "theoretical_loss": 4.2451087401543495, + "tokens_seen": 246284288 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046735205616850554, + "loss": 3.2973, + "theoretical_loss": 4.244971951610154, + "tokens_seen": 246349824 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673420260782347, + "loss": 3.376, + "theoretical_loss": 4.24483520963673, + "tokens_seen": 246415360 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673319959879639, + "loss": 2.9486, + "theoretical_loss": 4.244698514205844, + "tokens_seen": 246480896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673219658976931, + "loss": 3.0086, + "theoretical_loss": 4.244561865289285, + "tokens_seen": 246546432 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046731193580742226, + "loss": 3.1113, + "theoretical_loss": 4.244425262858867, + "tokens_seen": 246611968 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673019057171515, + "loss": 3.2665, + "theoretical_loss": 4.2442887068864295, + "tokens_seen": 246677504 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672918756268806, + "loss": 3.1233, + "theoretical_loss": 4.244152197343835, + "tokens_seen": 246743040 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046728184553660986, + "loss": 3.3192, + "theoretical_loss": 4.244015734202973, + "tokens_seen": 246808576 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467271815446339, + "loss": 3.3794, + "theoretical_loss": 4.243879317435755, + "tokens_seen": 246874112 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672617853560682, + "loss": 2.863, + "theoretical_loss": 4.243742947014117, + "tokens_seen": 246939648 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672517552657974, + "loss": 3.3378, + "theoretical_loss": 4.243606622910021, + "tokens_seen": 247005184 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672417251755266, + "loss": 2.9326, + "theoretical_loss": 4.243470345095453, + "tokens_seen": 247070720 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046723169508525576, + "loss": 3.1262, + "theoretical_loss": 4.2433341135424225, + "tokens_seen": 247136256 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467221664994985, + "loss": 3.2178, + "theoretical_loss": 4.243197928222964, + "tokens_seen": 247201792 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672116349047141, + "loss": 3.2335, + "theoretical_loss": 4.243061789109136, + "tokens_seen": 247267328 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046720160481444336, + "loss": 3.1028, + "theoretical_loss": 4.242925696173021, + "tokens_seen": 247332864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 300199, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2579872608184814, + "objective/train/theoretical_loss": 4.2427896493867285, + "objective/train/tokens_used": 267858400, + "theoretical_loss": 4.2427896493867285, + "tokens_seen": 247398400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004671915747241725, + "loss": 3.0951, + "theoretical_loss": 4.2427896493867285, + "tokens_seen": 247398400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004671815446339017, + "loss": 3.0132, + "theoretical_loss": 4.242653648722387, + "tokens_seen": 247463936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004671715145436309, + "loss": 3.3596, + "theoretical_loss": 4.242517694152154, + "tokens_seen": 247529472 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004671614844533601, + "loss": 2.881, + "theoretical_loss": 4.24238178564821, + "tokens_seen": 247595008 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046715145436308927, + "loss": 3.3289, + "theoretical_loss": 4.242245923182756, + "tokens_seen": 247660544 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046714142427281845, + "loss": 3.1006, + "theoretical_loss": 4.242110106728022, + "tokens_seen": 247726080 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046713139418254763, + "loss": 3.3098, + "theoretical_loss": 4.241974336256261, + "tokens_seen": 247791616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046712136409227686, + "loss": 3.4344, + "theoretical_loss": 4.241838611739748, + "tokens_seen": 247857152 + }, + { + "epoch": 0.08, + "learning_rate": 0.000467111334002006, + "loss": 3.3127, + "theoretical_loss": 4.241702933150783, + "tokens_seen": 247922688 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004671013039117352, + "loss": 3.2709, + "theoretical_loss": 4.241567300461693, + "tokens_seen": 247988224 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046709127382146435, + "loss": 3.2324, + "theoretical_loss": 4.241431713644823, + "tokens_seen": 248053760 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004670812437311936, + "loss": 2.8616, + "theoretical_loss": 4.241296172672547, + "tokens_seen": 248119296 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046707121364092277, + "loss": 3.1879, + "theoretical_loss": 4.24116067751726, + "tokens_seen": 248184832 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046706118355065195, + "loss": 3.2202, + "theoretical_loss": 4.241025228151383, + "tokens_seen": 248250368 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046705115346038113, + "loss": 3.2631, + "theoretical_loss": 4.24088982454736, + "tokens_seen": 248315904 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046704112337011037, + "loss": 3.0812, + "theoretical_loss": 4.240754466677659, + "tokens_seen": 248381440 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004670310932798395, + "loss": 3.2234, + "theoretical_loss": 4.240619154514771, + "tokens_seen": 248446976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046702106318956873, + "loss": 3.2165, + "theoretical_loss": 4.240483888031212, + "tokens_seen": 248512512 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046701103309929786, + "loss": 2.9284, + "theoretical_loss": 4.240348667199521, + "tokens_seen": 248578048 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004670010030090271, + "loss": 3.1256, + "theoretical_loss": 4.240213491992261, + "tokens_seen": 248643584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669909729187563, + "loss": 3.1306, + "theoretical_loss": 4.240078362382019, + "tokens_seen": 248709120 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046698094282848545, + "loss": 3.1289, + "theoretical_loss": 4.239943278341404, + "tokens_seen": 248774656 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046697091273821464, + "loss": 3.2737, + "theoretical_loss": 4.239808239843052, + "tokens_seen": 248840192 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669608826479438, + "loss": 3.1408, + "theoretical_loss": 4.239673246859619, + "tokens_seen": 248905728 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046695085255767305, + "loss": 3.2095, + "theoretical_loss": 4.239538299363788, + "tokens_seen": 248971264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 301486, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3037164211273193, + "objective/train/theoretical_loss": 4.239403397328261, + "objective/train/tokens_used": 269496800, + "theoretical_loss": 4.239403397328261, + "tokens_seen": 249036800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046694082246740223, + "loss": 3.109, + "theoretical_loss": 4.239403397328261, + "tokens_seen": 249036800 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669307923771314, + "loss": 3.2687, + "theoretical_loss": 4.239268540725769, + "tokens_seen": 249102336 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669207622868606, + "loss": 3.2338, + "theoretical_loss": 4.239133729529064, + "tokens_seen": 249167872 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046691073219658983, + "loss": 3.023, + "theoretical_loss": 4.2389989637109196, + "tokens_seen": 249233408 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046690070210631896, + "loss": 3.2841, + "theoretical_loss": 4.2388642432441355, + "tokens_seen": 249298944 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668906720160482, + "loss": 3.4195, + "theoretical_loss": 4.238729568101535, + "tokens_seen": 249364480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668806419257773, + "loss": 3.0559, + "theoretical_loss": 4.238594938255963, + "tokens_seen": 249430016 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046687061183550656, + "loss": 3.1747, + "theoretical_loss": 4.2384603536802885, + "tokens_seen": 249495552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046686058174523574, + "loss": 3.1241, + "theoretical_loss": 4.238325814347404, + "tokens_seen": 249561088 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668505516549649, + "loss": 3.3317, + "theoretical_loss": 4.238191320230227, + "tokens_seen": 249626624 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668405215646941, + "loss": 3.127, + "theoretical_loss": 4.238056871301695, + "tokens_seen": 249692160 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668304914744233, + "loss": 3.1163, + "theoretical_loss": 4.237922467534771, + "tokens_seen": 249757696 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046682046138415246, + "loss": 3.1401, + "theoretical_loss": 4.237788108902441, + "tokens_seen": 249823232 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668104312938817, + "loss": 3.285, + "theoretical_loss": 4.237653795377714, + "tokens_seen": 249888768 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668004012036108, + "loss": 3.4912, + "theoretical_loss": 4.237519526933622, + "tokens_seen": 249954304 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046679037111334006, + "loss": 2.9408, + "theoretical_loss": 4.2373853035432205, + "tokens_seen": 250019840 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667803410230692, + "loss": 3.312, + "theoretical_loss": 4.237251125179588, + "tokens_seen": 250085376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667703109327984, + "loss": 3.14, + "theoretical_loss": 4.237116991815826, + "tokens_seen": 250150912 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667602808425276, + "loss": 3.1893, + "theoretical_loss": 4.23698290342506, + "tokens_seen": 250216448 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667502507522568, + "loss": 3.1478, + "theoretical_loss": 4.236848859980437, + "tokens_seen": 250281984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046674022066198596, + "loss": 3.0303, + "theoretical_loss": 4.23671486145513, + "tokens_seen": 250347520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667301905717152, + "loss": 3.1047, + "theoretical_loss": 4.236580907822331, + "tokens_seen": 250413056 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667201604814443, + "loss": 3.0878, + "theoretical_loss": 4.236446999055257, + "tokens_seen": 250478592 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046671013039117356, + "loss": 3.123, + "theoretical_loss": 4.2363131351271495, + "tokens_seen": 250544128 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667001003009027, + "loss": 3.1892, + "theoretical_loss": 4.2361793160112695, + "tokens_seen": 250609664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 302298, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1085546016693115, + "objective/train/theoretical_loss": 4.236045541680905, + "objective/train/tokens_used": 271135200, + "theoretical_loss": 4.236045541680905, + "tokens_seen": 250675200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666900702106319, + "loss": 3.3128, + "theoretical_loss": 4.236045541680905, + "tokens_seen": 250675200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666800401203611, + "loss": 3.097, + "theoretical_loss": 4.235911812109363, + "tokens_seen": 250740736 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666700100300903, + "loss": 3.054, + "theoretical_loss": 4.235778127269976, + "tokens_seen": 250806272 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046665997993981947, + "loss": 3.2811, + "theoretical_loss": 4.235644487136098, + "tokens_seen": 250871808 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046664994984954865, + "loss": 3.2177, + "theoretical_loss": 4.235510891681108, + "tokens_seen": 250937344 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046663991975927783, + "loss": 3.1458, + "theoretical_loss": 4.235377340878404, + "tokens_seen": 251002880 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046662988966900706, + "loss": 3.2887, + "theoretical_loss": 4.23524383470141, + "tokens_seen": 251068416 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666198595787362, + "loss": 3.5902, + "theoretical_loss": 4.235110373123572, + "tokens_seen": 251133952 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046660982948846543, + "loss": 3.2736, + "theoretical_loss": 4.2349769561183574, + "tokens_seen": 251199488 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046659979939819455, + "loss": 3.3576, + "theoretical_loss": 4.2348435836592575, + "tokens_seen": 251265024 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004665897693079238, + "loss": 3.0621, + "theoretical_loss": 4.234710255719786, + "tokens_seen": 251330560 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046657973921765297, + "loss": 3.2782, + "theoretical_loss": 4.234576972273481, + "tokens_seen": 251396096 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046656970912738215, + "loss": 3.1883, + "theoretical_loss": 4.234443733293899, + "tokens_seen": 251461632 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046655967903711133, + "loss": 3.2245, + "theoretical_loss": 4.234310538754624, + "tokens_seen": 251527168 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046654964894684057, + "loss": 3.2696, + "theoretical_loss": 4.2341773886292575, + "tokens_seen": 251592704 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004665396188565697, + "loss": 3.1524, + "theoretical_loss": 4.234044282891429, + "tokens_seen": 251658240 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046652958876629893, + "loss": 3.0471, + "theoretical_loss": 4.233911221514787, + "tokens_seen": 251723776 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046651955867602806, + "loss": 3.1227, + "theoretical_loss": 4.233778204473002, + "tokens_seen": 251789312 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004665095285857573, + "loss": 3.2876, + "theoretical_loss": 4.23364523173977, + "tokens_seen": 251854848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664994984954865, + "loss": 3.2065, + "theoretical_loss": 4.233512303288807, + "tokens_seen": 251920384 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046648946840521565, + "loss": 2.8681, + "theoretical_loss": 4.233379419093851, + "tokens_seen": 251985920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046647943831494484, + "loss": 3.0819, + "theoretical_loss": 4.233246579128666, + "tokens_seen": 252051456 + }, + { + "epoch": 0.08, + "learning_rate": 0.000466469408224674, + "loss": 3.1822, + "theoretical_loss": 4.233113783367033, + "tokens_seen": 252116992 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664593781344032, + "loss": 3.1466, + "theoretical_loss": 4.232981031782761, + "tokens_seen": 252182528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046644934804413243, + "loss": 3.3105, + "theoretical_loss": 4.232848324349677, + "tokens_seen": 252248064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 303567, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1599833965301514, + "objective/train/theoretical_loss": 4.232715661041632, + "objective/train/tokens_used": 272773600, + "theoretical_loss": 4.232715661041632, + "tokens_seen": 252313600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046643931795386156, + "loss": 3.1736, + "theoretical_loss": 4.232715661041632, + "tokens_seen": 252313600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664292878635908, + "loss": 3.1568, + "theoretical_loss": 4.232583041832499, + "tokens_seen": 252379136 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664192577733199, + "loss": 3.1369, + "theoretical_loss": 4.232450466696174, + "tokens_seen": 252444672 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046640922768304916, + "loss": 3.0532, + "theoretical_loss": 4.2323179356065745, + "tokens_seen": 252510208 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046639919759277834, + "loss": 2.9697, + "theoretical_loss": 4.23218544853764, + "tokens_seen": 252575744 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663891675025075, + "loss": 3.31, + "theoretical_loss": 4.232053005463333, + "tokens_seen": 252641280 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663791374122367, + "loss": 3.2688, + "theoretical_loss": 4.231920606357638, + "tokens_seen": 252706816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046636910732196594, + "loss": 3.0768, + "theoretical_loss": 4.231788251194559, + "tokens_seen": 252772352 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046635907723169506, + "loss": 3.1853, + "theoretical_loss": 4.231655939948127, + "tokens_seen": 252837888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663490471414243, + "loss": 3.0185, + "theoretical_loss": 4.231523672592392, + "tokens_seen": 252903424 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663390170511534, + "loss": 3.2061, + "theoretical_loss": 4.231391449101425, + "tokens_seen": 252968960 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046632898696088266, + "loss": 3.0254, + "theoretical_loss": 4.231259269449322, + "tokens_seen": 253034496 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046631895687061184, + "loss": 3.4604, + "theoretical_loss": 4.231127133610198, + "tokens_seen": 253100032 + }, + { + "epoch": 0.08, + "learning_rate": 0.000466308926780341, + "loss": 3.1453, + "theoretical_loss": 4.230995041558194, + "tokens_seen": 253165568 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662988966900702, + "loss": 3.1521, + "theoretical_loss": 4.230862993267468, + "tokens_seen": 253231104 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662888665997994, + "loss": 3.2638, + "theoretical_loss": 4.230730988712205, + "tokens_seen": 253296640 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046627883650952857, + "loss": 3.3752, + "theoretical_loss": 4.230599027866606, + "tokens_seen": 253362176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662688064192578, + "loss": 3.1583, + "theoretical_loss": 4.2304671107048994, + "tokens_seen": 253427712 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046625877632898693, + "loss": 3.2989, + "theoretical_loss": 4.2303352372013325, + "tokens_seen": 253493248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046624874623871616, + "loss": 3.2402, + "theoretical_loss": 4.230203407330176, + "tokens_seen": 253558784 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046623871614844535, + "loss": 3.1902, + "theoretical_loss": 4.230071621065721, + "tokens_seen": 253624320 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662286860581745, + "loss": 2.866, + "theoretical_loss": 4.2299398783822815, + "tokens_seen": 253689856 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662186559679037, + "loss": 3.2554, + "theoretical_loss": 4.229808179254192, + "tokens_seen": 253755392 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662086258776329, + "loss": 3.2293, + "theoretical_loss": 4.22967652365581, + "tokens_seen": 253820928 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661985957873621, + "loss": 2.9234, + "theoretical_loss": 4.229544911561513, + "tokens_seen": 253886464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 304269, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1997804641723633, + "objective/train/theoretical_loss": 4.229413342945703, + "objective/train/tokens_used": 274412000, + "theoretical_loss": 4.229413342945703, + "tokens_seen": 253952000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661885656970913, + "loss": 3.353, + "theoretical_loss": 4.229413342945703, + "tokens_seen": 253952000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661785356068205, + "loss": 3.1267, + "theoretical_loss": 4.229281817782801, + "tokens_seen": 254017536 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046616850551654967, + "loss": 3.2084, + "theoretical_loss": 4.229150336047251, + "tokens_seen": 254083072 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046615847542627885, + "loss": 3.3788, + "theoretical_loss": 4.229018897713519, + "tokens_seen": 254148608 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046614844533600803, + "loss": 3.0708, + "theoretical_loss": 4.22888750275609, + "tokens_seen": 254214144 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046613841524573727, + "loss": 3.0028, + "theoretical_loss": 4.228756151149475, + "tokens_seen": 254279680 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661283851554664, + "loss": 3.3021, + "theoretical_loss": 4.228624842868202, + "tokens_seen": 254345216 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046611835506519563, + "loss": 3.0294, + "theoretical_loss": 4.228493577886824, + "tokens_seen": 254410752 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046610832497492475, + "loss": 3.1577, + "theoretical_loss": 4.228362356179913, + "tokens_seen": 254476288 + }, + { + "epoch": 0.08, + "learning_rate": 0.000466098294884654, + "loss": 3.0022, + "theoretical_loss": 4.228231177722063, + "tokens_seen": 254541824 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046608826479438317, + "loss": 3.0895, + "theoretical_loss": 4.228100042487892, + "tokens_seen": 254607360 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046607823470411235, + "loss": 3.1647, + "theoretical_loss": 4.227968950452035, + "tokens_seen": 254672896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046606820461384153, + "loss": 3.2423, + "theoretical_loss": 4.227837901589153, + "tokens_seen": 254738432 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046605817452357077, + "loss": 3.056, + "theoretical_loss": 4.227706895873924, + "tokens_seen": 254803968 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004660481444332999, + "loss": 2.8991, + "theoretical_loss": 4.227575933281051, + "tokens_seen": 254869504 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046603811434302913, + "loss": 3.0, + "theoretical_loss": 4.227445013785257, + "tokens_seen": 254935040 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046602808425275826, + "loss": 3.0826, + "theoretical_loss": 4.227314137361285, + "tokens_seen": 255000576 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004660180541624875, + "loss": 3.2037, + "theoretical_loss": 4.227183303983901, + "tokens_seen": 255066112 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004660080240722167, + "loss": 3.2881, + "theoretical_loss": 4.227052513627893, + "tokens_seen": 255131648 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046599799398194586, + "loss": 3.1299, + "theoretical_loss": 4.226921766268067, + "tokens_seen": 255197184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046598796389167504, + "loss": 3.0684, + "theoretical_loss": 4.226791061879253, + "tokens_seen": 255262720 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004659779338014042, + "loss": 3.0469, + "theoretical_loss": 4.226660400436302, + "tokens_seen": 255328256 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004659679037111334, + "loss": 3.0472, + "theoretical_loss": 4.226529781914084, + "tokens_seen": 255393792 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046595787362086263, + "loss": 3.2079, + "theoretical_loss": 4.226399206287493, + "tokens_seen": 255459328 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046594784353059176, + "loss": 3.1358, + "theoretical_loss": 4.226268673531442, + "tokens_seen": 255524864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 305849, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.743330240249634, + "objective/train/theoretical_loss": 4.226138183620867, + "objective/train/tokens_used": 276050400, + "theoretical_loss": 4.226138183620867, + "tokens_seen": 255590400 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465937813440321, + "loss": 3.1548, + "theoretical_loss": 4.226138183620867, + "tokens_seen": 255590400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004659277833500501, + "loss": 3.0213, + "theoretical_loss": 4.226007736530723, + "tokens_seen": 255655936 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046591775325977936, + "loss": 3.1984, + "theoretical_loss": 4.225877332235987, + "tokens_seen": 255721472 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046590772316950854, + "loss": 3.1091, + "theoretical_loss": 4.225746970711657, + "tokens_seen": 255787008 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658976930792377, + "loss": 3.2381, + "theoretical_loss": 4.225616651932753, + "tokens_seen": 255852544 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658876629889669, + "loss": 3.2222, + "theoretical_loss": 4.225486375874315, + "tokens_seen": 255918080 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046587763289869614, + "loss": 3.1889, + "theoretical_loss": 4.225356142511402, + "tokens_seen": 255983616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046586760280842526, + "loss": 3.0673, + "theoretical_loss": 4.225225951819099, + "tokens_seen": 256049152 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658575727181545, + "loss": 3.1606, + "theoretical_loss": 4.225095803772507, + "tokens_seen": 256114688 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658475426278836, + "loss": 3.2571, + "theoretical_loss": 4.22496569834675, + "tokens_seen": 256180224 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046583751253761286, + "loss": 3.3425, + "theoretical_loss": 4.224835635516973, + "tokens_seen": 256245760 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046582748244734204, + "loss": 3.0092, + "theoretical_loss": 4.224705615258341, + "tokens_seen": 256311296 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658174523570712, + "loss": 3.2458, + "theoretical_loss": 4.224575637546041, + "tokens_seen": 256376832 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658074222668004, + "loss": 2.9137, + "theoretical_loss": 4.224445702355279, + "tokens_seen": 256442368 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657973921765296, + "loss": 3.1672, + "theoretical_loss": 4.2243158096612845, + "tokens_seen": 256507904 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046578736208625877, + "loss": 3.1599, + "theoretical_loss": 4.224185959439305, + "tokens_seen": 256573440 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465777331995988, + "loss": 3.1255, + "theoretical_loss": 4.22405615166461, + "tokens_seen": 256638976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046576730190571713, + "loss": 3.1789, + "theoretical_loss": 4.22392638631249, + "tokens_seen": 256704512 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046575727181544636, + "loss": 3.1087, + "theoretical_loss": 4.223796663358255, + "tokens_seen": 256770048 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046574724172517555, + "loss": 3.2464, + "theoretical_loss": 4.223666982777237, + "tokens_seen": 256835584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657372116349047, + "loss": 3.327, + "theoretical_loss": 4.223537344544788, + "tokens_seen": 256901120 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657271815446339, + "loss": 3.1453, + "theoretical_loss": 4.223407748636282, + "tokens_seen": 256966656 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657171514543631, + "loss": 3.0814, + "theoretical_loss": 4.22327819502711, + "tokens_seen": 257032192 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046570712136409227, + "loss": 3.1326, + "theoretical_loss": 4.223148683692687, + "tokens_seen": 257097728 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656970912738215, + "loss": 3.2236, + "theoretical_loss": 4.223019214608446, + "tokens_seen": 257163264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 306364, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.629127264022827, + "objective/train/theoretical_loss": 4.222889787749845, + "objective/train/tokens_used": 277688800, + "theoretical_loss": 4.222889787749845, + "tokens_seen": 257228800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046568706118355063, + "loss": 3.0022, + "theoretical_loss": 4.222889787749845, + "tokens_seen": 257228800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046567703109327987, + "loss": 3.2723, + "theoretical_loss": 4.222760403092358, + "tokens_seen": 257294336 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465667001003009, + "loss": 3.1032, + "theoretical_loss": 4.22263106061148, + "tokens_seen": 257359872 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046565697091273823, + "loss": 3.2668, + "theoretical_loss": 4.222501760282729, + "tokens_seen": 257425408 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656469408224674, + "loss": 3.3694, + "theoretical_loss": 4.22237250208164, + "tokens_seen": 257490944 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656369107321966, + "loss": 3.0323, + "theoretical_loss": 4.222243285983772, + "tokens_seen": 257556480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656268806419258, + "loss": 3.0638, + "theoretical_loss": 4.222114111964703, + "tokens_seen": 257622016 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046561685055165495, + "loss": 3.2751, + "theoretical_loss": 4.221984980000029, + "tokens_seen": 257687552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046560682046138414, + "loss": 3.1497, + "theoretical_loss": 4.2218558900653695, + "tokens_seen": 257753088 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046559679037111337, + "loss": 3.2393, + "theoretical_loss": 4.221726842136364, + "tokens_seen": 257818624 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655867602808425, + "loss": 3.3169, + "theoretical_loss": 4.2215978361886695, + "tokens_seen": 257884160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046557673019057173, + "loss": 3.1882, + "theoretical_loss": 4.221468872197967, + "tokens_seen": 257949696 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655667001003009, + "loss": 3.0225, + "theoretical_loss": 4.221339950139956, + "tokens_seen": 258015232 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655566700100301, + "loss": 3.1899, + "theoretical_loss": 4.221211069990357, + "tokens_seen": 258080768 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655466399197593, + "loss": 3.2241, + "theoretical_loss": 4.221082231724908, + "tokens_seen": 258146304 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046553660982948846, + "loss": 3.1808, + "theoretical_loss": 4.22095343531937, + "tokens_seen": 258211840 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046552657973921764, + "loss": 3.4274, + "theoretical_loss": 4.220824680749525, + "tokens_seen": 258277376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655165496489469, + "loss": 3.0687, + "theoretical_loss": 4.220695967991171, + "tokens_seen": 258342912 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465506519558676, + "loss": 3.1397, + "theoretical_loss": 4.220567297020131, + "tokens_seen": 258408448 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046549648946840524, + "loss": 3.1805, + "theoretical_loss": 4.220438667812244, + "tokens_seen": 258473984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046548645937813436, + "loss": 3.1492, + "theoretical_loss": 4.220310080343373, + "tokens_seen": 258539520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654764292878636, + "loss": 2.7064, + "theoretical_loss": 4.220181534589398, + "tokens_seen": 258605056 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654663991975928, + "loss": 3.0719, + "theoretical_loss": 4.22005303052622, + "tokens_seen": 258670592 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046545636910732196, + "loss": 3.2042, + "theoretical_loss": 4.219924568129759, + "tokens_seen": 258736128 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654463390170512, + "loss": 3.0768, + "theoretical_loss": 4.219796147375957, + "tokens_seen": 258801664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 307092, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.476236343383789, + "objective/train/theoretical_loss": 4.219667768240775, + "objective/train/tokens_used": 279327200, + "theoretical_loss": 4.219667768240775, + "tokens_seen": 258867200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654363089267803, + "loss": 3.1775, + "theoretical_loss": 4.219667768240775, + "tokens_seen": 258867200 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046542627883650956, + "loss": 2.9703, + "theoretical_loss": 4.219539430700195, + "tokens_seen": 258932736 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046541624874623874, + "loss": 3.0814, + "theoretical_loss": 4.2194111347302155, + "tokens_seen": 258998272 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654062186559679, + "loss": 3.124, + "theoretical_loss": 4.219282880306859, + "tokens_seen": 259063808 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653961885656971, + "loss": 2.9946, + "theoretical_loss": 4.219154667406166, + "tokens_seen": 259129344 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046538615847542634, + "loss": 3.1049, + "theoretical_loss": 4.219026496004198, + "tokens_seen": 259194880 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046537612838515546, + "loss": 3.06, + "theoretical_loss": 4.218898366077035, + "tokens_seen": 259260416 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653660982948847, + "loss": 3.0953, + "theoretical_loss": 4.218770277600775, + "tokens_seen": 259325952 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653560682046138, + "loss": 2.8445, + "theoretical_loss": 4.218642230551541, + "tokens_seen": 259391488 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046534603811434306, + "loss": 3.0274, + "theoretical_loss": 4.218514224905472, + "tokens_seen": 259457024 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046533600802407224, + "loss": 2.9485, + "theoretical_loss": 4.218386260638727, + "tokens_seen": 259522560 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653259779338014, + "loss": 3.1876, + "theoretical_loss": 4.2182583377274865, + "tokens_seen": 259588096 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653159478435306, + "loss": 3.1376, + "theoretical_loss": 4.218130456147948, + "tokens_seen": 259653632 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653059177532598, + "loss": 3.1214, + "theoretical_loss": 4.218002615876332, + "tokens_seen": 259719168 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046529588766298897, + "loss": 3.2295, + "theoretical_loss": 4.217874816888877, + "tokens_seen": 259784704 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652858575727182, + "loss": 3.0723, + "theoretical_loss": 4.217747059161839, + "tokens_seen": 259850240 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046527582748244733, + "loss": 2.9696, + "theoretical_loss": 4.217619342671498, + "tokens_seen": 259915776 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046526579739217656, + "loss": 3.0124, + "theoretical_loss": 4.2174916673941505, + "tokens_seen": 259981312 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046525576730190575, + "loss": 2.8966, + "theoretical_loss": 4.217364033306113, + "tokens_seen": 260046848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652457372116349, + "loss": 3.1619, + "theoretical_loss": 4.217236440383724, + "tokens_seen": 260112384 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652357071213641, + "loss": 2.9974, + "theoretical_loss": 4.217108888603337, + "tokens_seen": 260177920 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652256770310933, + "loss": 2.9334, + "theoretical_loss": 4.21698137794133, + "tokens_seen": 260243456 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046521564694082247, + "loss": 3.2643, + "theoretical_loss": 4.216853908374097, + "tokens_seen": 260308992 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652056168505517, + "loss": 3.3784, + "theoretical_loss": 4.216726479878052, + "tokens_seen": 260374528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046519558676028083, + "loss": 3.1801, + "theoretical_loss": 4.216599092429631, + "tokens_seen": 260440064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 308211, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.457490921020508, + "objective/train/theoretical_loss": 4.216471746005286, + "objective/train/tokens_used": 280965600, + "theoretical_loss": 4.216471746005286, + "tokens_seen": 260505600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046518555667001007, + "loss": 3.1286, + "theoretical_loss": 4.216471746005286, + "tokens_seen": 260505600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004651755265797392, + "loss": 2.9917, + "theoretical_loss": 4.216344440581491, + "tokens_seen": 260571136 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046516549648946843, + "loss": 3.0801, + "theoretical_loss": 4.2162171761347365, + "tokens_seen": 260636672 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004651554663991976, + "loss": 3.1188, + "theoretical_loss": 4.2160899526415365, + "tokens_seen": 260702208 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004651454363089268, + "loss": 3.0944, + "theoretical_loss": 4.215962770078422, + "tokens_seen": 260767744 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465135406218656, + "loss": 3.3147, + "theoretical_loss": 4.215835628421942, + "tokens_seen": 260833280 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046512537612838515, + "loss": 3.1019, + "theoretical_loss": 4.215708527648667, + "tokens_seen": 260898816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046511534603811434, + "loss": 2.8503, + "theoretical_loss": 4.215581467735187, + "tokens_seen": 260964352 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046510531594784357, + "loss": 3.2366, + "theoretical_loss": 4.215454448658109, + "tokens_seen": 261029888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650952858575727, + "loss": 3.1674, + "theoretical_loss": 4.215327470394062, + "tokens_seen": 261095424 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046508525576730193, + "loss": 3.1104, + "theoretical_loss": 4.215200532919691, + "tokens_seen": 261160960 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650752256770311, + "loss": 3.197, + "theoretical_loss": 4.215073636211664, + "tokens_seen": 261226496 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650651955867603, + "loss": 3.1214, + "theoretical_loss": 4.214946780246666, + "tokens_seen": 261292032 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650551654964895, + "loss": 3.0783, + "theoretical_loss": 4.214819965001401, + "tokens_seen": 261357568 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046504513540621866, + "loss": 3.0267, + "theoretical_loss": 4.214693190452593, + "tokens_seen": 261423104 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046503510531594784, + "loss": 3.0135, + "theoretical_loss": 4.214566456576984, + "tokens_seen": 261488640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650250752256771, + "loss": 2.9927, + "theoretical_loss": 4.214439763351336, + "tokens_seen": 261554176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650150451354062, + "loss": 2.9392, + "theoretical_loss": 4.214313110752431, + "tokens_seen": 261619712 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046500501504513544, + "loss": 2.9189, + "theoretical_loss": 4.214186498757069, + "tokens_seen": 261685248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046499498495486456, + "loss": 3.0795, + "theoretical_loss": 4.214059927342068, + "tokens_seen": 261750784 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649849548645938, + "loss": 3.0373, + "theoretical_loss": 4.213933396484267, + "tokens_seen": 261816320 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464974924774323, + "loss": 3.1087, + "theoretical_loss": 4.213806906160523, + "tokens_seen": 261881856 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046496489468405216, + "loss": 3.2205, + "theoretical_loss": 4.213680456347712, + "tokens_seen": 261947392 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046495486459378134, + "loss": 3.438, + "theoretical_loss": 4.213554047022729, + "tokens_seen": 262012928 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649448345035105, + "loss": 3.1792, + "theoretical_loss": 4.213427678162489, + "tokens_seen": 262078464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 308640, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.578385353088379, + "objective/train/theoretical_loss": 4.213301349743924, + "objective/train/tokens_used": 282604000, + "theoretical_loss": 4.213301349743924, + "tokens_seen": 262144000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649348044132397, + "loss": 3.2087, + "theoretical_loss": 4.213301349743924, + "tokens_seen": 262144000 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046492477432296894, + "loss": 3.0893, + "theoretical_loss": 4.2131750617439865, + "tokens_seen": 262209536 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046491474423269807, + "loss": 3.1618, + "theoretical_loss": 4.213048814139647, + "tokens_seen": 262275072 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649047141424273, + "loss": 2.9118, + "theoretical_loss": 4.212922606907895, + "tokens_seen": 262340608 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004648946840521565, + "loss": 3.2177, + "theoretical_loss": 4.21279644002574, + "tokens_seen": 262406144 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046488465396188566, + "loss": 3.1045, + "theoretical_loss": 4.212670313470209, + "tokens_seen": 262471680 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046487462387161485, + "loss": 3.0042, + "theoretical_loss": 4.212544227218347, + "tokens_seen": 262537216 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464864593781344, + "loss": 3.0721, + "theoretical_loss": 4.21241818124722, + "tokens_seen": 262602752 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004648545636910732, + "loss": 3.1759, + "theoretical_loss": 4.212292175533912, + "tokens_seen": 262668288 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046484453360080244, + "loss": 3.2897, + "theoretical_loss": 4.212166210055526, + "tokens_seen": 262733824 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046483450351053157, + "loss": 3.1411, + "theoretical_loss": 4.212040284789181, + "tokens_seen": 262799360 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004648244734202608, + "loss": 3.0484, + "theoretical_loss": 4.211914399712019, + "tokens_seen": 262864896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046481444332998993, + "loss": 3.0421, + "theoretical_loss": 4.211788554801198, + "tokens_seen": 262930432 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046480441323971917, + "loss": 3.1647, + "theoretical_loss": 4.211662750033895, + "tokens_seen": 262995968 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046479438314944835, + "loss": 2.9247, + "theoretical_loss": 4.211536985387307, + "tokens_seen": 263061504 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046478435305917753, + "loss": 3.1853, + "theoretical_loss": 4.211411260838647, + "tokens_seen": 263127040 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647743229689067, + "loss": 3.0719, + "theoretical_loss": 4.2112855763651496, + "tokens_seen": 263192576 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046476429287863595, + "loss": 3.1743, + "theoretical_loss": 4.211159931944065, + "tokens_seen": 263258112 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647542627883651, + "loss": 3.3044, + "theoretical_loss": 4.211034327552666, + "tokens_seen": 263323648 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647442326980943, + "loss": 3.172, + "theoretical_loss": 4.210908763168239, + "tokens_seen": 263389184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046473420260782344, + "loss": 2.8232, + "theoretical_loss": 4.210783238768093, + "tokens_seen": 263454720 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046472417251755267, + "loss": 2.8838, + "theoretical_loss": 4.210657754329553, + "tokens_seen": 263520256 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647141424272819, + "loss": 3.1246, + "theoretical_loss": 4.210532309829965, + "tokens_seen": 263585792 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046470411233701103, + "loss": 3.0826, + "theoretical_loss": 4.21040690524669, + "tokens_seen": 263651328 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046469408224674027, + "loss": 3.0629, + "theoretical_loss": 4.21028154055711, + "tokens_seen": 263716864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 309875, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5037975311279297, + "objective/train/theoretical_loss": 4.2101562157386265, + "objective/train/tokens_used": 284242400, + "theoretical_loss": 4.2101562157386265, + "tokens_seen": 263782400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646840521564694, + "loss": 2.8095, + "theoretical_loss": 4.2101562157386265, + "tokens_seen": 263782400 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046467402206619863, + "loss": 3.2129, + "theoretical_loss": 4.210030930768655, + "tokens_seen": 263847936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646639919759278, + "loss": 3.3535, + "theoretical_loss": 4.2099056856246335, + "tokens_seen": 263913472 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464653961885657, + "loss": 3.0754, + "theoretical_loss": 4.209780480284017, + "tokens_seen": 263979008 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646439317953862, + "loss": 3.2122, + "theoretical_loss": 4.209655314724279, + "tokens_seen": 264044544 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046463390170511535, + "loss": 2.9998, + "theoretical_loss": 4.209530188922911, + "tokens_seen": 264110080 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046462387161484454, + "loss": 3.2189, + "theoretical_loss": 4.209405102857422, + "tokens_seen": 264175616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046461384152457377, + "loss": 3.1448, + "theoretical_loss": 4.209280056505342, + "tokens_seen": 264241152 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646038114343029, + "loss": 3.1553, + "theoretical_loss": 4.209155049844217, + "tokens_seen": 264306688 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046459378134403213, + "loss": 3.0602, + "theoretical_loss": 4.209030082851612, + "tokens_seen": 264372224 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645837512537613, + "loss": 3.1091, + "theoretical_loss": 4.208905155505109, + "tokens_seen": 264437760 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645737211634905, + "loss": 2.961, + "theoretical_loss": 4.20878026778231, + "tokens_seen": 264503296 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645636910732197, + "loss": 2.9389, + "theoretical_loss": 4.208655419660834, + "tokens_seen": 264568832 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046455366098294886, + "loss": 3.2368, + "theoretical_loss": 4.208530611118321, + "tokens_seen": 264634368 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046454363089267804, + "loss": 3.2025, + "theoretical_loss": 4.208405842132423, + "tokens_seen": 264699904 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645336008024073, + "loss": 3.3101, + "theoretical_loss": 4.208281112680817, + "tokens_seen": 264765440 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645235707121364, + "loss": 2.9905, + "theoretical_loss": 4.208156422741195, + "tokens_seen": 264830976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046451354062186564, + "loss": 3.3088, + "theoretical_loss": 4.208031772291265, + "tokens_seen": 264896512 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046450351053159476, + "loss": 3.1124, + "theoretical_loss": 4.207907161308757, + "tokens_seen": 264962048 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464493480441324, + "loss": 3.011, + "theoretical_loss": 4.2077825897714165, + "tokens_seen": 265027584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644834503510532, + "loss": 3.1281, + "theoretical_loss": 4.207658057657008, + "tokens_seen": 265093120 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046447342026078236, + "loss": 3.2965, + "theoretical_loss": 4.207533564943316, + "tokens_seen": 265158656 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046446339017051154, + "loss": 3.0647, + "theoretical_loss": 4.207409111608138, + "tokens_seen": 265224192 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644533600802407, + "loss": 2.9901, + "theoretical_loss": 4.2072846976292935, + "tokens_seen": 265289728 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644433299899699, + "loss": 3.0768, + "theoretical_loss": 4.2071603229846195, + "tokens_seen": 265355264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 311046, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1395962238311768, + "objective/train/theoretical_loss": 4.20703598765197, + "objective/train/tokens_used": 285880800, + "theoretical_loss": 4.20703598765197, + "tokens_seen": 265420800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046443329989969914, + "loss": 3.1445, + "theoretical_loss": 4.20703598765197, + "tokens_seen": 265420800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046442326980942827, + "loss": 3.0687, + "theoretical_loss": 4.206911691609217, + "tokens_seen": 265486336 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644132397191575, + "loss": 2.8642, + "theoretical_loss": 4.206787434834251, + "tokens_seen": 265551872 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644032096288867, + "loss": 3.3262, + "theoretical_loss": 4.20666321730498, + "tokens_seen": 265617408 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046439317953861586, + "loss": 3.095, + "theoretical_loss": 4.206539038999329, + "tokens_seen": 265682944 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046438314944834505, + "loss": 2.9848, + "theoretical_loss": 4.206414899895244, + "tokens_seen": 265748480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004643731193580742, + "loss": 3.1623, + "theoretical_loss": 4.206290799970685, + "tokens_seen": 265814016 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004643630892678034, + "loss": 3.1056, + "theoretical_loss": 4.206166739203632, + "tokens_seen": 265879552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046435305917753264, + "loss": 3.2523, + "theoretical_loss": 4.206042717572082, + "tokens_seen": 265945088 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046434302908726177, + "loss": 2.9739, + "theoretical_loss": 4.20591873505405, + "tokens_seen": 266010624 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464332998996991, + "loss": 3.0009, + "theoretical_loss": 4.20579479162757, + "tokens_seen": 266076160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046432296890672013, + "loss": 3.1793, + "theoretical_loss": 4.205670887270691, + "tokens_seen": 266141696 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046431293881644937, + "loss": 3.0394, + "theoretical_loss": 4.205547021961482, + "tokens_seen": 266207232 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046430290872617855, + "loss": 3.04, + "theoretical_loss": 4.205423195678029, + "tokens_seen": 266272768 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046429287863590773, + "loss": 2.7836, + "theoretical_loss": 4.205299408398435, + "tokens_seen": 266338304 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642828485456369, + "loss": 3.4339, + "theoretical_loss": 4.2051756601008226, + "tokens_seen": 266403840 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046427281845536615, + "loss": 3.2731, + "theoretical_loss": 4.20505195076333, + "tokens_seen": 266469376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642627883650953, + "loss": 2.9399, + "theoretical_loss": 4.204928280364115, + "tokens_seen": 266534912 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642527582748245, + "loss": 2.6865, + "theoretical_loss": 4.20480464888135, + "tokens_seen": 266600448 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046424272818455364, + "loss": 3.1693, + "theoretical_loss": 4.204681056293228, + "tokens_seen": 266665984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046423269809428287, + "loss": 3.178, + "theoretical_loss": 4.204557502577957, + "tokens_seen": 266731520 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046422266800401205, + "loss": 3.0357, + "theoretical_loss": 4.204433987713767, + "tokens_seen": 266797056 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046421263791374123, + "loss": 3.1383, + "theoretical_loss": 4.2043105116789, + "tokens_seen": 266862592 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642026078234704, + "loss": 3.1161, + "theoretical_loss": 4.204187074451617, + "tokens_seen": 266928128 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641925777331996, + "loss": 3.0181, + "theoretical_loss": 4.204063676010202, + "tokens_seen": 266993664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 311732, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2471554279327393, + "objective/train/theoretical_loss": 4.203940316332948, + "objective/train/tokens_used": 287519200, + "theoretical_loss": 4.203940316332948, + "tokens_seen": 267059200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641825476429288, + "loss": 3.2104, + "theoretical_loss": 4.203940316332948, + "tokens_seen": 267059200 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464172517552658, + "loss": 3.0556, + "theoretical_loss": 4.203816995398171, + "tokens_seen": 267124736 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046416248746238714, + "loss": 2.944, + "theoretical_loss": 4.203693713184203, + "tokens_seen": 267190272 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641524573721164, + "loss": 3.212, + "theoretical_loss": 4.203570469669392, + "tokens_seen": 267255808 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641424272818455, + "loss": 3.2094, + "theoretical_loss": 4.203447264832107, + "tokens_seen": 267321344 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046413239719157474, + "loss": 3.4161, + "theoretical_loss": 4.203324098650731, + "tokens_seen": 267386880 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641223671013039, + "loss": 3.1592, + "theoretical_loss": 4.203200971103666, + "tokens_seen": 267452416 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641123370110331, + "loss": 3.1786, + "theoretical_loss": 4.20307788216933, + "tokens_seen": 267517952 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641023069207623, + "loss": 3.2507, + "theoretical_loss": 4.202954831826159, + "tokens_seen": 267583488 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640922768304915, + "loss": 3.132, + "theoretical_loss": 4.202831820052609, + "tokens_seen": 267649024 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046408224674022064, + "loss": 3.032, + "theoretical_loss": 4.202708846827148, + "tokens_seen": 267714560 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640722166499499, + "loss": 2.9754, + "theoretical_loss": 4.202585912128266, + "tokens_seen": 267780096 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464062186559679, + "loss": 3.0707, + "theoretical_loss": 4.202463015934468, + "tokens_seen": 267845632 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046405215646940824, + "loss": 2.8317, + "theoretical_loss": 4.202340158224277, + "tokens_seen": 267911168 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640421263791374, + "loss": 3.2203, + "theoretical_loss": 4.202217338976231, + "tokens_seen": 267976704 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640320962888666, + "loss": 2.7621, + "theoretical_loss": 4.2020945581688895, + "tokens_seen": 268042240 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640220661985958, + "loss": 2.9985, + "theoretical_loss": 4.201971815780826, + "tokens_seen": 268107776 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046401203610832496, + "loss": 3.1052, + "theoretical_loss": 4.201849111790631, + "tokens_seen": 268173312 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046400200601805414, + "loss": 3.1588, + "theoretical_loss": 4.201726446176915, + "tokens_seen": 268238848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639919759277834, + "loss": 3.1762, + "theoretical_loss": 4.201603818918302, + "tokens_seen": 268304384 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639819458375125, + "loss": 2.9964, + "theoretical_loss": 4.201481229993435, + "tokens_seen": 268369920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046397191574724174, + "loss": 2.9145, + "theoretical_loss": 4.201358679380976, + "tokens_seen": 268435456 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639618856569709, + "loss": 3.1938, + "theoretical_loss": 4.201236167059601, + "tokens_seen": 268500992 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639518555667001, + "loss": 2.9321, + "theoretical_loss": 4.201113693008002, + "tokens_seen": 268566528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046394182547642934, + "loss": 2.843, + "theoretical_loss": 4.200991257204894, + "tokens_seen": 268632064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 312511, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1893441677093506, + "objective/train/theoretical_loss": 4.2008688596290025, + "objective/train/tokens_used": 289157600, + "theoretical_loss": 4.2008688596290025, + "tokens_seen": 268697600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046393179538615847, + "loss": 3.3237, + "theoretical_loss": 4.2008688596290025, + "tokens_seen": 268697600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639217652958877, + "loss": 3.1753, + "theoretical_loss": 4.200746500259073, + "tokens_seen": 268763136 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639117352056169, + "loss": 2.9467, + "theoretical_loss": 4.200624179073869, + "tokens_seen": 268828672 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046390170511534606, + "loss": 3.0959, + "theoretical_loss": 4.2005018960521685, + "tokens_seen": 268894208 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046389167502507525, + "loss": 3.1224, + "theoretical_loss": 4.200379651172769, + "tokens_seen": 268959744 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004638816449348044, + "loss": 3.1511, + "theoretical_loss": 4.200257444414483, + "tokens_seen": 269025280 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004638716148445336, + "loss": 3.2735, + "theoretical_loss": 4.200135275756139, + "tokens_seen": 269090816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046386158475426284, + "loss": 3.0709, + "theoretical_loss": 4.200013145176587, + "tokens_seen": 269156352 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046385155466399197, + "loss": 3.3529, + "theoretical_loss": 4.199891052654689, + "tokens_seen": 269221888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004638415245737212, + "loss": 3.1699, + "theoretical_loss": 4.199768998169326, + "tokens_seen": 269287424 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046383149448345033, + "loss": 3.1224, + "theoretical_loss": 4.199646981699395, + "tokens_seen": 269352960 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046382146439317957, + "loss": 3.1028, + "theoretical_loss": 4.199525003223812, + "tokens_seen": 269418496 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046381143430290875, + "loss": 2.9704, + "theoretical_loss": 4.199403062721506, + "tokens_seen": 269484032 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046380140421263793, + "loss": 3.1699, + "theoretical_loss": 4.199281160171427, + "tokens_seen": 269549568 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637913741223671, + "loss": 2.773, + "theoretical_loss": 4.1991592955525405, + "tokens_seen": 269615104 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046378134403209635, + "loss": 3.1007, + "theoretical_loss": 4.199037468843825, + "tokens_seen": 269680640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637713139418255, + "loss": 3.3052, + "theoretical_loss": 4.198915680024282, + "tokens_seen": 269746176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637612838515547, + "loss": 2.9844, + "theoretical_loss": 4.198793929072925, + "tokens_seen": 269811712 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046375125376128384, + "loss": 3.1632, + "theoretical_loss": 4.198672215968785, + "tokens_seen": 269877248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046374122367101307, + "loss": 3.0686, + "theoretical_loss": 4.198550540690912, + "tokens_seen": 269942784 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046373119358074225, + "loss": 2.9668, + "theoretical_loss": 4.198428903218371, + "tokens_seen": 270008320 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046372116349047143, + "loss": 2.9512, + "theoretical_loss": 4.198307303530243, + "tokens_seen": 270073856 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637111334002006, + "loss": 3.3384, + "theoretical_loss": 4.198185741605628, + "tokens_seen": 270139392 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637011033099298, + "loss": 3.2329, + "theoretical_loss": 4.19806421742364, + "tokens_seen": 270204928 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463691073219659, + "loss": 2.7934, + "theoretical_loss": 4.197942730963412, + "tokens_seen": 270270464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 313588, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1681175231933594, + "objective/train/theoretical_loss": 4.19782128220409, + "objective/train/tokens_used": 290796000, + "theoretical_loss": 4.19782128220409, + "tokens_seen": 270336000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636810431293882, + "loss": 3.1274, + "theoretical_loss": 4.19782128220409, + "tokens_seen": 270336000 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046367101303911734, + "loss": 3.231, + "theoretical_loss": 4.19769987112484, + "tokens_seen": 270401536 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636609829488466, + "loss": 2.9321, + "theoretical_loss": 4.1975784977048445, + "tokens_seen": 270467072 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636509528585757, + "loss": 3.1377, + "theoretical_loss": 4.1974571619233, + "tokens_seen": 270532608 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046364092276830494, + "loss": 2.8866, + "theoretical_loss": 4.197335863759422, + "tokens_seen": 270598144 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636308926780341, + "loss": 3.0662, + "theoretical_loss": 4.1972146031924416, + "tokens_seen": 270663680 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636208625877633, + "loss": 2.9822, + "theoretical_loss": 4.197093380201606, + "tokens_seen": 270729216 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636108324974925, + "loss": 3.1604, + "theoretical_loss": 4.196972194766179, + "tokens_seen": 270794752 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636008024072217, + "loss": 2.7832, + "theoretical_loss": 4.196851046865442, + "tokens_seen": 270860288 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046359077231695084, + "loss": 3.0355, + "theoretical_loss": 4.1967299364786905, + "tokens_seen": 270925824 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635807422266801, + "loss": 3.1407, + "theoretical_loss": 4.196608863585239, + "tokens_seen": 270991360 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635707121364092, + "loss": 2.861, + "theoretical_loss": 4.1964878281644165, + "tokens_seen": 271056896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046356068204613844, + "loss": 3.2408, + "theoretical_loss": 4.19636683019557, + "tokens_seen": 271122432 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635506519558676, + "loss": 3.0655, + "theoretical_loss": 4.196245869658061, + "tokens_seen": 271187968 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635406218655968, + "loss": 2.9336, + "theoretical_loss": 4.1961249465312696, + "tokens_seen": 271253504 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463530591775326, + "loss": 2.9904, + "theoretical_loss": 4.196004060794589, + "tokens_seen": 271319040 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046352056168505516, + "loss": 2.8734, + "theoretical_loss": 4.195883212427433, + "tokens_seen": 271384576 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046351053159478434, + "loss": 3.0506, + "theoretical_loss": 4.195762401409229, + "tokens_seen": 271450112 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635005015045136, + "loss": 2.8748, + "theoretical_loss": 4.19564162771942, + "tokens_seen": 271515648 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634904714142427, + "loss": 3.0623, + "theoretical_loss": 4.195520891337466, + "tokens_seen": 271581184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046348044132397194, + "loss": 3.0601, + "theoretical_loss": 4.195400192242845, + "tokens_seen": 271646720 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046347041123370107, + "loss": 2.7883, + "theoretical_loss": 4.19527953041505, + "tokens_seen": 271712256 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634603811434303, + "loss": 3.1026, + "theoretical_loss": 4.19515890583359, + "tokens_seen": 271777792 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634503510531595, + "loss": 3.1229, + "theoretical_loss": 4.195038318477989, + "tokens_seen": 271843328 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046344032096288867, + "loss": 2.9915, + "theoretical_loss": 4.194917768327789, + "tokens_seen": 271908864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 314289, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.050889253616333, + "objective/train/theoretical_loss": 4.194797255362549, + "objective/train/tokens_used": 292434400, + "theoretical_loss": 4.194797255362549, + "tokens_seen": 271974400 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046343029087261785, + "loss": 3.1228, + "theoretical_loss": 4.194797255362549, + "tokens_seen": 271974400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634202607823471, + "loss": 3.1203, + "theoretical_loss": 4.194676779561841, + "tokens_seen": 272039936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634102306920762, + "loss": 2.7881, + "theoretical_loss": 4.194556340905256, + "tokens_seen": 272105472 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046340020060180545, + "loss": 3.0799, + "theoretical_loss": 4.194435939372401, + "tokens_seen": 272171008 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046339017051153457, + "loss": 3.032, + "theoretical_loss": 4.194315574942896, + "tokens_seen": 272236544 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004633801404212638, + "loss": 3.2427, + "theoretical_loss": 4.194195247596381, + "tokens_seen": 272302080 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463370110330993, + "loss": 2.9247, + "theoretical_loss": 4.19407495731251, + "tokens_seen": 272367616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046336008024072217, + "loss": 2.9747, + "theoretical_loss": 4.193954704070952, + "tokens_seen": 272433152 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046335005015045135, + "loss": 3.1666, + "theoretical_loss": 4.193834487851396, + "tokens_seen": 272498688 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046334002006018053, + "loss": 3.0714, + "theoretical_loss": 4.193714308633542, + "tokens_seen": 272564224 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004633299899699097, + "loss": 3.0147, + "theoretical_loss": 4.1935941663971095, + "tokens_seen": 272629760 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046331995987963895, + "loss": 3.1997, + "theoretical_loss": 4.193474061121833, + "tokens_seen": 272695296 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004633099297893681, + "loss": 2.8764, + "theoretical_loss": 4.193353992787463, + "tokens_seen": 272760832 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632998996990973, + "loss": 3.2184, + "theoretical_loss": 4.193233961373766, + "tokens_seen": 272826368 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046328986960882644, + "loss": 3.1254, + "theoretical_loss": 4.1931139668605235, + "tokens_seen": 272891904 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632798395185557, + "loss": 3.0326, + "theoretical_loss": 4.192994009227535, + "tokens_seen": 272957440 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046326980942828485, + "loss": 3.0574, + "theoretical_loss": 4.192874088454613, + "tokens_seen": 273022976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046325977933801404, + "loss": 3.1175, + "theoretical_loss": 4.19275420452159, + "tokens_seen": 273088512 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632497492477432, + "loss": 3.0634, + "theoretical_loss": 4.192634357408309, + "tokens_seen": 273154048 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046323971915747245, + "loss": 2.7333, + "theoretical_loss": 4.192514547094634, + "tokens_seen": 273219584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632296890672016, + "loss": 2.9853, + "theoretical_loss": 4.192394773560441, + "tokens_seen": 273285120 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632196589769308, + "loss": 3.1364, + "theoretical_loss": 4.192275036785625, + "tokens_seen": 273350656 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046320962888666, + "loss": 2.8319, + "theoretical_loss": 4.192155336750094, + "tokens_seen": 273416192 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631995987963892, + "loss": 3.0046, + "theoretical_loss": 4.192035673433773, + "tokens_seen": 273481728 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631895687061184, + "loss": 3.1155, + "theoretical_loss": 4.191916046816605, + "tokens_seen": 273547264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 314822, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1304068565368652, + "objective/train/theoretical_loss": 4.191796456878544, + "objective/train/tokens_used": 294072800, + "theoretical_loss": 4.191796456878544, + "tokens_seen": 273612800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046317953861584754, + "loss": 2.943, + "theoretical_loss": 4.191796456878544, + "tokens_seen": 273612800 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631695085255768, + "loss": 2.8587, + "theoretical_loss": 4.191676903599563, + "tokens_seen": 273678336 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631594784353059, + "loss": 3.1101, + "theoretical_loss": 4.191557386959651, + "tokens_seen": 273743872 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046314944834503514, + "loss": 3.1475, + "theoretical_loss": 4.191437906938811, + "tokens_seen": 273809408 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631394182547643, + "loss": 3.0439, + "theoretical_loss": 4.191318463517062, + "tokens_seen": 273874944 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631293881644935, + "loss": 3.1469, + "theoretical_loss": 4.19119905667444, + "tokens_seen": 273940480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631193580742227, + "loss": 3.1943, + "theoretical_loss": 4.191079686390996, + "tokens_seen": 274006016 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631093279839519, + "loss": 3.1121, + "theoretical_loss": 4.190960352646796, + "tokens_seen": 274071552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046309929789368104, + "loss": 3.0344, + "theoretical_loss": 4.190841055421921, + "tokens_seen": 274137088 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630892678034103, + "loss": 3.0578, + "theoretical_loss": 4.19072179469647, + "tokens_seen": 274202624 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630792377131394, + "loss": 3.1027, + "theoretical_loss": 4.190602570450556, + "tokens_seen": 274268160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046306920762286864, + "loss": 3.0957, + "theoretical_loss": 4.190483382664308, + "tokens_seen": 274333696 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630591775325978, + "loss": 2.8938, + "theoretical_loss": 4.19036423131787, + "tokens_seen": 274399232 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463049147442327, + "loss": 2.8262, + "theoretical_loss": 4.190245116391403, + "tokens_seen": 274464768 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630391173520562, + "loss": 2.9228, + "theoretical_loss": 4.190126037865082, + "tokens_seen": 274530304 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046302908726178536, + "loss": 3.0006, + "theoretical_loss": 4.190006995719098, + "tokens_seen": 274595840 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046301905717151455, + "loss": 2.9758, + "theoretical_loss": 4.1898879899336565, + "tokens_seen": 274661376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630090270812438, + "loss": 3.2112, + "theoretical_loss": 4.189769020488981, + "tokens_seen": 274726912 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629989969909729, + "loss": 3.0473, + "theoretical_loss": 4.189650087365309, + "tokens_seen": 274792448 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046298896690070214, + "loss": 2.8983, + "theoretical_loss": 4.189531190542893, + "tokens_seen": 274857984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046297893681043127, + "loss": 2.7118, + "theoretical_loss": 4.189412330002001, + "tokens_seen": 274923520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629689067201605, + "loss": 2.9403, + "theoretical_loss": 4.189293505722918, + "tokens_seen": 274989056 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629588766298897, + "loss": 2.9515, + "theoretical_loss": 4.189174717685942, + "tokens_seen": 275054592 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046294884653961887, + "loss": 2.7063, + "theoretical_loss": 4.189055965871389, + "tokens_seen": 275120128 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046293881644934805, + "loss": 3.0041, + "theoretical_loss": 4.188937250259587, + "tokens_seen": 275185664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 318374, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3687500953674316, + "objective/train/theoretical_loss": 4.188818570830883, + "objective/train/tokens_used": 295711200, + "theoretical_loss": 4.188818570830883, + "tokens_seen": 275251200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629287863590773, + "loss": 2.9334, + "theoretical_loss": 4.188818570830883, + "tokens_seen": 275251200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629187562688064, + "loss": 2.9519, + "theoretical_loss": 4.188699927565638, + "tokens_seen": 275316736 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046290872617853565, + "loss": 3.24, + "theoretical_loss": 4.188581320444228, + "tokens_seen": 275382272 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046289869608826477, + "loss": 3.0205, + "theoretical_loss": 4.1884627494470426, + "tokens_seen": 275447808 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462888665997994, + "loss": 3.1389, + "theoretical_loss": 4.1883442145544905, + "tokens_seen": 275513344 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628786359077232, + "loss": 3.117, + "theoretical_loss": 4.188225715746992, + "tokens_seen": 275578880 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046286860581745237, + "loss": 3.0967, + "theoretical_loss": 4.188107253004986, + "tokens_seen": 275644416 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046285857572718155, + "loss": 3.2164, + "theoretical_loss": 4.187988826308925, + "tokens_seen": 275709952 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046284854563691073, + "loss": 3.0597, + "theoretical_loss": 4.187870435639275, + "tokens_seen": 275775488 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628385155466399, + "loss": 2.9352, + "theoretical_loss": 4.18775208097652, + "tokens_seen": 275841024 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046282848545636915, + "loss": 3.2597, + "theoretical_loss": 4.187633762301159, + "tokens_seen": 275906560 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628184553660983, + "loss": 3.0023, + "theoretical_loss": 4.187515479593704, + "tokens_seen": 275972096 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628084252758275, + "loss": 2.9841, + "theoretical_loss": 4.187397232834683, + "tokens_seen": 276037632 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046279839518555664, + "loss": 3.3956, + "theoretical_loss": 4.187279022004642, + "tokens_seen": 276103168 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627883650952859, + "loss": 3.2086, + "theoretical_loss": 4.1871608470841375, + "tokens_seen": 276168704 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046277833500501505, + "loss": 2.8913, + "theoretical_loss": 4.1870427080537445, + "tokens_seen": 276234240 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046276830491474424, + "loss": 2.9785, + "theoretical_loss": 4.1869246048940525, + "tokens_seen": 276299776 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627582748244734, + "loss": 3.138, + "theoretical_loss": 4.186806537585666, + "tokens_seen": 276365312 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046274824473420265, + "loss": 2.9597, + "theoretical_loss": 4.186688506109202, + "tokens_seen": 276430848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627382146439318, + "loss": 3.1554, + "theoretical_loss": 4.186570510445296, + "tokens_seen": 276496384 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462728184553661, + "loss": 3.1408, + "theoretical_loss": 4.186452550574599, + "tokens_seen": 276561920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046271815446339014, + "loss": 2.8797, + "theoretical_loss": 4.186334626477774, + "tokens_seen": 276627456 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627081243731194, + "loss": 2.892, + "theoretical_loss": 4.186216738135501, + "tokens_seen": 276692992 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046269809428284856, + "loss": 2.6408, + "theoretical_loss": 4.186098885528473, + "tokens_seen": 276758528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046268806419257774, + "loss": 3.0981, + "theoretical_loss": 4.185981068637401, + "tokens_seen": 276824064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.31486439704895, + "objective/train/theoretical_loss": 4.185863287443008, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.185863287443008, + "tokens_seen": 276889600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626780341023069, + "loss": 3.0755, + "theoretical_loss": 4.185863287443008, + "tokens_seen": 276889600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626680040120361, + "loss": 2.9726, + "theoretical_loss": 4.185745541926035, + "tokens_seen": 276955136 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626579739217653, + "loss": 3.1442, + "theoretical_loss": 4.185627832067237, + "tokens_seen": 277020672 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626479438314945, + "loss": 3.17, + "theoretical_loss": 4.1855101578473795, + "tokens_seen": 277086208 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046263791374122364, + "loss": 3.0287, + "theoretical_loss": 4.18539251924725, + "tokens_seen": 277151744 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626278836509529, + "loss": 2.8911, + "theoretical_loss": 4.185274916247646, + "tokens_seen": 277217280 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462617853560682, + "loss": 2.869, + "theoretical_loss": 4.185157348829383, + "tokens_seen": 277282816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046260782347041124, + "loss": 3.1912, + "theoretical_loss": 4.185039816973289, + "tokens_seen": 277348352 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625977933801404, + "loss": 2.8488, + "theoretical_loss": 4.184922320660207, + "tokens_seen": 277413888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625877632898696, + "loss": 3.0904, + "theoretical_loss": 4.184804859870997, + "tokens_seen": 277479424 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625777331995988, + "loss": 3.0076, + "theoretical_loss": 4.184687434586531, + "tokens_seen": 277544960 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462567703109328, + "loss": 2.8916, + "theoretical_loss": 4.184570044787698, + "tokens_seen": 277610496 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046255767301905715, + "loss": 2.8376, + "theoretical_loss": 4.1844526904554, + "tokens_seen": 277676032 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625476429287864, + "loss": 2.6536, + "theoretical_loss": 4.184335371570556, + "tokens_seen": 277741568 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625376128385155, + "loss": 2.97, + "theoretical_loss": 4.184218088114097, + "tokens_seen": 277807104 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046252758274824475, + "loss": 3.0261, + "theoretical_loss": 4.1841008400669715, + "tokens_seen": 277872640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625175526579739, + "loss": 3.0633, + "theoretical_loss": 4.183983627410142, + "tokens_seen": 277938176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625075225677031, + "loss": 3.0716, + "theoretical_loss": 4.183866450124584, + "tokens_seen": 278003712 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624974924774323, + "loss": 3.0718, + "theoretical_loss": 4.18374930819129, + "tokens_seen": 278069248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046248746238716147, + "loss": 3.0414, + "theoretical_loss": 4.183632201591264, + "tokens_seen": 278134784 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046247743229689065, + "loss": 2.8162, + "theoretical_loss": 4.18351513030553, + "tokens_seen": 278200320 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624674022066199, + "loss": 3.1109, + "theoretical_loss": 4.1833980943151206, + "tokens_seen": 278265856 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046245737211634907, + "loss": 3.0844, + "theoretical_loss": 4.183281093601087, + "tokens_seen": 278331392 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046244734202607825, + "loss": 3.2941, + "theoretical_loss": 4.183164128144495, + "tokens_seen": 278396928 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624373119358075, + "loss": 3.0747, + "theoretical_loss": 4.183047197926422, + "tokens_seen": 278462464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0842607021331787, + "objective/train/theoretical_loss": 4.182930302927963, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.182930302927963, + "tokens_seen": 278528000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624272818455366, + "loss": 2.8917, + "theoretical_loss": 4.182930302927963, + "tokens_seen": 278528000 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046241725175526585, + "loss": 3.085, + "theoretical_loss": 4.182813443130227, + "tokens_seen": 278593536 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462407221664995, + "loss": 2.8507, + "theoretical_loss": 4.182696618514337, + "tokens_seen": 278659072 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623971915747242, + "loss": 2.9847, + "theoretical_loss": 4.18257982906143, + "tokens_seen": 278724608 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623871614844534, + "loss": 3.0537, + "theoretical_loss": 4.1824630747526585, + "tokens_seen": 278790144 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046237713139418257, + "loss": 2.8442, + "theoretical_loss": 4.182346355569189, + "tokens_seen": 278855680 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046236710130391175, + "loss": 3.2378, + "theoretical_loss": 4.182229671492204, + "tokens_seen": 278921216 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046235707121364093, + "loss": 2.8921, + "theoretical_loss": 4.1821130225028975, + "tokens_seen": 278986752 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623470411233701, + "loss": 2.9071, + "theoretical_loss": 4.1819964085824815, + "tokens_seen": 279052288 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046233701103309935, + "loss": 2.8913, + "theoretical_loss": 4.181879829712178, + "tokens_seen": 279117824 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623269809428285, + "loss": 3.2138, + "theoretical_loss": 4.181763285873231, + "tokens_seen": 279183360 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623169508525577, + "loss": 3.0489, + "theoretical_loss": 4.181646777046889, + "tokens_seen": 279248896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046230692076228684, + "loss": 2.8127, + "theoretical_loss": 4.181530303214423, + "tokens_seen": 279314432 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622968906720161, + "loss": 3.1112, + "theoretical_loss": 4.181413864357115, + "tokens_seen": 279379968 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046228686058174525, + "loss": 2.9245, + "theoretical_loss": 4.181297460456262, + "tokens_seen": 279445504 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046227683049147444, + "loss": 2.6736, + "theoretical_loss": 4.181181091493174, + "tokens_seen": 279511040 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622668004012036, + "loss": 3.0354, + "theoretical_loss": 4.181064757449178, + "tokens_seen": 279576576 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046225677031093285, + "loss": 3.3787, + "theoretical_loss": 4.180948458305615, + "tokens_seen": 279642112 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462246740220662, + "loss": 2.9473, + "theoretical_loss": 4.180832194043836, + "tokens_seen": 279707648 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622367101303912, + "loss": 2.9305, + "theoretical_loss": 4.180715964645213, + "tokens_seen": 279773184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046222668004012034, + "loss": 3.0333, + "theoretical_loss": 4.180599770091126, + "tokens_seen": 279838720 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622166499498496, + "loss": 2.9703, + "theoretical_loss": 4.180483610362975, + "tokens_seen": 279904256 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046220661985957876, + "loss": 2.7939, + "theoretical_loss": 4.18036748544217, + "tokens_seen": 279969792 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046219658976930794, + "loss": 3.0377, + "theoretical_loss": 4.180251395310137, + "tokens_seen": 280035328 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621865596790371, + "loss": 3.0576, + "theoretical_loss": 4.1801353399483165, + "tokens_seen": 280100864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.94958758354187, + "objective/train/theoretical_loss": 4.180019319338163, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.180019319338163, + "tokens_seen": 280166400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621765295887663, + "loss": 3.0546, + "theoretical_loss": 4.180019319338163, + "tokens_seen": 280166400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621664994984955, + "loss": 2.875, + "theoretical_loss": 4.179903333461144, + "tokens_seen": 280231936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621564694082247, + "loss": 3.0677, + "theoretical_loss": 4.179787382298744, + "tokens_seen": 280297472 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046214643931795384, + "loss": 3.1748, + "theoretical_loss": 4.179671465832458, + "tokens_seen": 280363008 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621364092276831, + "loss": 3.0775, + "theoretical_loss": 4.179555584043799, + "tokens_seen": 280428544 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621263791374122, + "loss": 3.1528, + "theoretical_loss": 4.17943973691429, + "tokens_seen": 280494080 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046211634904714144, + "loss": 2.9587, + "theoretical_loss": 4.179323924425472, + "tokens_seen": 280559616 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004621063189568706, + "loss": 2.9175, + "theoretical_loss": 4.179208146558899, + "tokens_seen": 280625152 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620962888665998, + "loss": 2.9214, + "theoretical_loss": 4.1790924032961385, + "tokens_seen": 280690688 + }, + { + "epoch": 0.09, + "learning_rate": 0.000462086258776329, + "loss": 2.9464, + "theoretical_loss": 4.178976694618772, + "tokens_seen": 280756224 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620762286860582, + "loss": 3.1425, + "theoretical_loss": 4.178861020508395, + "tokens_seen": 280821760 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046206619859578735, + "loss": 2.8949, + "theoretical_loss": 4.178745380946619, + "tokens_seen": 280887296 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620561685055166, + "loss": 2.9406, + "theoretical_loss": 4.178629775915066, + "tokens_seen": 280952832 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620461384152457, + "loss": 2.9719, + "theoretical_loss": 4.178514205395376, + "tokens_seen": 281018368 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046203610832497495, + "loss": 2.9724, + "theoretical_loss": 4.178398669369201, + "tokens_seen": 281083904 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620260782347041, + "loss": 3.0159, + "theoretical_loss": 4.178283167818206, + "tokens_seen": 281149440 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620160481444333, + "loss": 3.1142, + "theoretical_loss": 4.178167700724073, + "tokens_seen": 281214976 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620060180541625, + "loss": 3.1041, + "theoretical_loss": 4.178052268068494, + "tokens_seen": 281280512 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046199598796389167, + "loss": 2.9121, + "theoretical_loss": 4.177936869833179, + "tokens_seen": 281346048 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046198595787362085, + "loss": 2.8584, + "theoretical_loss": 4.17782150599985, + "tokens_seen": 281411584 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619759277833501, + "loss": 2.9394, + "theoretical_loss": 4.1777061765502435, + "tokens_seen": 281477120 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619658976930792, + "loss": 3.0848, + "theoretical_loss": 4.1775908814661085, + "tokens_seen": 281542656 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046195586760280845, + "loss": 2.7208, + "theoretical_loss": 4.17747562072921, + "tokens_seen": 281608192 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046194583751253763, + "loss": 3.0835, + "theoretical_loss": 4.177360394321325, + "tokens_seen": 281673728 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619358074222668, + "loss": 3.1928, + "theoretical_loss": 4.177245202224246, + "tokens_seen": 281739264 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1719119548797607, + "objective/train/theoretical_loss": 4.17713004441978, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.17713004441978, + "tokens_seen": 281804800 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461925777331996, + "loss": 2.9915, + "theoretical_loss": 4.17713004441978, + "tokens_seen": 281804800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619157472417252, + "loss": 2.9478, + "theoretical_loss": 4.177014920889745, + "tokens_seen": 281870336 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046190571715145435, + "loss": 3.0954, + "theoretical_loss": 4.176899831615974, + "tokens_seen": 281935872 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618956870611836, + "loss": 2.9708, + "theoretical_loss": 4.176784776580316, + "tokens_seen": 282001408 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618856569709127, + "loss": 3.0337, + "theoretical_loss": 4.176669755764632, + "tokens_seen": 282066944 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046187562688064195, + "loss": 3.261, + "theoretical_loss": 4.176554769150796, + "tokens_seen": 282132480 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618655967903711, + "loss": 3.0503, + "theoretical_loss": 4.176439816720697, + "tokens_seen": 282198016 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618555667001003, + "loss": 2.8879, + "theoretical_loss": 4.1763248984562376, + "tokens_seen": 282263552 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618455366098295, + "loss": 2.9648, + "theoretical_loss": 4.176210014339335, + "tokens_seen": 282329088 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618355065195587, + "loss": 2.673, + "theoretical_loss": 4.17609516435192, + "tokens_seen": 282394624 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046182547642928786, + "loss": 3.2389, + "theoretical_loss": 4.1759803484759335, + "tokens_seen": 282460160 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046181544633901704, + "loss": 2.9818, + "theoretical_loss": 4.175865566693336, + "tokens_seen": 282525696 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618054162487462, + "loss": 2.7571, + "theoretical_loss": 4.175750818986098, + "tokens_seen": 282591232 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046179538615847545, + "loss": 2.9559, + "theoretical_loss": 4.1756361053362046, + "tokens_seen": 282656768 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617853560682046, + "loss": 3.1227, + "theoretical_loss": 4.1755214257256545, + "tokens_seen": 282722304 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617753259779338, + "loss": 3.1455, + "theoretical_loss": 4.17540678013646, + "tokens_seen": 282787840 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461765295887663, + "loss": 3.044, + "theoretical_loss": 4.175292168550648, + "tokens_seen": 282853376 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617552657973922, + "loss": 2.7818, + "theoretical_loss": 4.175177590950257, + "tokens_seen": 282918912 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046174523570712136, + "loss": 2.7973, + "theoretical_loss": 4.175063047317342, + "tokens_seen": 282984448 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046173520561685054, + "loss": 3.1903, + "theoretical_loss": 4.174948537633968, + "tokens_seen": 283049984 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617251755265797, + "loss": 3.2353, + "theoretical_loss": 4.174834061882218, + "tokens_seen": 283115520 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046171514543630896, + "loss": 3.0471, + "theoretical_loss": 4.1747196200441845, + "tokens_seen": 283181056 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046170511534603814, + "loss": 2.8341, + "theoretical_loss": 4.174605212101977, + "tokens_seen": 283246592 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616950852557673, + "loss": 2.8812, + "theoretical_loss": 4.174490838037716, + "tokens_seen": 283312128 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616850551654965, + "loss": 2.9857, + "theoretical_loss": 4.174376497833537, + "tokens_seen": 283377664 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5323517322540283, + "objective/train/theoretical_loss": 4.174262191471587, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.174262191471587, + "tokens_seen": 283443200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616750250752257, + "loss": 2.8143, + "theoretical_loss": 4.174262191471587, + "tokens_seen": 283443200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616649949849549, + "loss": 3.11, + "theoretical_loss": 4.17414791893403, + "tokens_seen": 283508736 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046165496489468404, + "loss": 3.0364, + "theoretical_loss": 4.17403368020304, + "tokens_seen": 283574272 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616449348044133, + "loss": 3.2199, + "theoretical_loss": 4.173919475260808, + "tokens_seen": 283639808 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616349047141424, + "loss": 3.0457, + "theoretical_loss": 4.173805304089536, + "tokens_seen": 283705344 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046162487462387164, + "loss": 2.9339, + "theoretical_loss": 4.173691166671439, + "tokens_seen": 283770880 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616148445336008, + "loss": 3.1206, + "theoretical_loss": 4.173577062988748, + "tokens_seen": 283836416 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046160481444333, + "loss": 2.7604, + "theoretical_loss": 4.173462993023706, + "tokens_seen": 283901952 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615947843530592, + "loss": 2.7994, + "theoretical_loss": 4.173348956758568, + "tokens_seen": 283967488 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615847542627884, + "loss": 2.885, + "theoretical_loss": 4.173234954175605, + "tokens_seen": 284033024 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046157472417251755, + "loss": 2.7621, + "theoretical_loss": 4.173120985257102, + "tokens_seen": 284098560 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615646940822468, + "loss": 3.1411, + "theoretical_loss": 4.173007049985352, + "tokens_seen": 284164096 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615546639919759, + "loss": 2.7966, + "theoretical_loss": 4.172893148342667, + "tokens_seen": 284229632 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046154463390170515, + "loss": 2.9813, + "theoretical_loss": 4.172779280311372, + "tokens_seen": 284295168 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615346038114343, + "loss": 3.0168, + "theoretical_loss": 4.172665445873801, + "tokens_seen": 284360704 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615245737211635, + "loss": 2.9192, + "theoretical_loss": 4.172551645012307, + "tokens_seen": 284426240 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615145436308927, + "loss": 2.8819, + "theoretical_loss": 4.1724378777092515, + "tokens_seen": 284491776 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046150451354062187, + "loss": 2.5965, + "theoretical_loss": 4.172324143947012, + "tokens_seen": 284557312 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046149448345035105, + "loss": 3.0194, + "theoretical_loss": 4.172210443707979, + "tokens_seen": 284622848 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614844533600803, + "loss": 2.7609, + "theoretical_loss": 4.1720967769745565, + "tokens_seen": 284688384 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614744232698094, + "loss": 2.9768, + "theoretical_loss": 4.171983143729159, + "tokens_seen": 284753920 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046146439317953865, + "loss": 2.7417, + "theoretical_loss": 4.1718695439542195, + "tokens_seen": 284819456 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046145436308926783, + "loss": 3.1363, + "theoretical_loss": 4.17175597763218, + "tokens_seen": 284884992 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461444332998997, + "loss": 2.9471, + "theoretical_loss": 4.171642444745497, + "tokens_seen": 284950528 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614343029087262, + "loss": 3.2626, + "theoretical_loss": 4.1715289452766395, + "tokens_seen": 285016064 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.09801983833313, + "objective/train/theoretical_loss": 4.1714154792080915, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.1714154792080915, + "tokens_seen": 285081600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614242728184554, + "loss": 3.0522, + "theoretical_loss": 4.1714154792080915, + "tokens_seen": 285081600 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046141424272818455, + "loss": 2.7098, + "theoretical_loss": 4.171302046522349, + "tokens_seen": 285147136 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614042126379138, + "loss": 3.1704, + "theoretical_loss": 4.171188647201921, + "tokens_seen": 285212672 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613941825476429, + "loss": 3.0845, + "theoretical_loss": 4.1710752812293315, + "tokens_seen": 285278208 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046138415245737215, + "loss": 2.8987, + "theoretical_loss": 4.170961948587115, + "tokens_seen": 285343744 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613741223671013, + "loss": 2.7445, + "theoretical_loss": 4.17084864925782, + "tokens_seen": 285409280 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613640922768305, + "loss": 3.0298, + "theoretical_loss": 4.1707353832240095, + "tokens_seen": 285474816 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613540621865597, + "loss": 2.9432, + "theoretical_loss": 4.170622150468258, + "tokens_seen": 285540352 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613440320962889, + "loss": 3.1768, + "theoretical_loss": 4.170508950973154, + "tokens_seen": 285605888 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046133400200601806, + "loss": 2.7724, + "theoretical_loss": 4.1703957847213, + "tokens_seen": 285671424 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046132397191574724, + "loss": 2.7251, + "theoretical_loss": 4.170282651695308, + "tokens_seen": 285736960 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613139418254764, + "loss": 2.9493, + "theoretical_loss": 4.170169551877808, + "tokens_seen": 285802496 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046130391173520566, + "loss": 2.8802, + "theoretical_loss": 4.170056485251439, + "tokens_seen": 285868032 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612938816449348, + "loss": 3.0992, + "theoretical_loss": 4.169943451798856, + "tokens_seen": 285933568 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461283851554664, + "loss": 2.9054, + "theoretical_loss": 4.169830451502724, + "tokens_seen": 285999104 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612738214643932, + "loss": 2.8996, + "theoretical_loss": 4.169717484345725, + "tokens_seen": 286064640 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612637913741224, + "loss": 2.7219, + "theoretical_loss": 4.1696045503105506, + "tokens_seen": 286130176 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046125376128385156, + "loss": 3.0844, + "theoretical_loss": 4.169491649379905, + "tokens_seen": 286195712 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046124373119358074, + "loss": 3.1184, + "theoretical_loss": 4.169378781536509, + "tokens_seen": 286261248 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612337011033099, + "loss": 2.7949, + "theoretical_loss": 4.169265946763095, + "tokens_seen": 286326784 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046122367101303916, + "loss": 3.0171, + "theoretical_loss": 4.169153145042405, + "tokens_seen": 286392320 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612136409227683, + "loss": 3.0261, + "theoretical_loss": 4.169040376357199, + "tokens_seen": 286457856 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612036108324975, + "loss": 2.9589, + "theoretical_loss": 4.168927640690246, + "tokens_seen": 286523392 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046119358074222665, + "loss": 2.9269, + "theoretical_loss": 4.16881493802433, + "tokens_seen": 286588928 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611835506519559, + "loss": 2.916, + "theoretical_loss": 4.168702268342248, + "tokens_seen": 286654464 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.830076217651367, + "objective/train/theoretical_loss": 4.168589631626808, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.168589631626808, + "tokens_seen": 286720000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046117352056168506, + "loss": 2.9636, + "theoretical_loss": 4.168589631626808, + "tokens_seen": 286720000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046116349047141425, + "loss": 2.7699, + "theoretical_loss": 4.168477027860833, + "tokens_seen": 286785536 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611534603811434, + "loss": 3.0801, + "theoretical_loss": 4.168364457027158, + "tokens_seen": 286851072 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611434302908726, + "loss": 3.0063, + "theoretical_loss": 4.168251919108632, + "tokens_seen": 286916608 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611334002006018, + "loss": 2.8081, + "theoretical_loss": 4.168139414088113, + "tokens_seen": 286982144 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461123370110331, + "loss": 3.0765, + "theoretical_loss": 4.168026941948478, + "tokens_seen": 287047680 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046111334002006015, + "loss": 3.1996, + "theoretical_loss": 4.167914502672611, + "tokens_seen": 287113216 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611033099297894, + "loss": 2.7884, + "theoretical_loss": 4.1678020962434115, + "tokens_seen": 287178752 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046109327983951857, + "loss": 2.9895, + "theoretical_loss": 4.167689722643792, + "tokens_seen": 287244288 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046108324974924775, + "loss": 3.0916, + "theoretical_loss": 4.1675773818566775, + "tokens_seen": 287309824 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046107321965897693, + "loss": 2.9913, + "theoretical_loss": 4.167465073865006, + "tokens_seen": 287375360 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610631895687061, + "loss": 2.8553, + "theoretical_loss": 4.167352798651726, + "tokens_seen": 287440896 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610531594784353, + "loss": 3.0485, + "theoretical_loss": 4.167240556199802, + "tokens_seen": 287506432 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610431293881645, + "loss": 3.1886, + "theoretical_loss": 4.167128346492211, + "tokens_seen": 287571968 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046103309929789365, + "loss": 2.9215, + "theoretical_loss": 4.16701616951194, + "tokens_seen": 287637504 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610230692076229, + "loss": 3.0773, + "theoretical_loss": 4.1669040252419896, + "tokens_seen": 287703040 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461013039117352, + "loss": 3.2776, + "theoretical_loss": 4.166791913665375, + "tokens_seen": 287768576 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046100300902708125, + "loss": 3.0554, + "theoretical_loss": 4.166679834765123, + "tokens_seen": 287834112 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046099297893681043, + "loss": 2.9973, + "theoretical_loss": 4.166567788524272, + "tokens_seen": 287899648 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609829488465396, + "loss": 3.2166, + "theoretical_loss": 4.166455774925875, + "tokens_seen": 287965184 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609729187562688, + "loss": 3.1155, + "theoretical_loss": 4.166343793952995, + "tokens_seen": 288030720 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046096288866599803, + "loss": 2.9107, + "theoretical_loss": 4.166231845588712, + "tokens_seen": 288096256 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609528585757272, + "loss": 2.8911, + "theoretical_loss": 4.166119929816113, + "tokens_seen": 288161792 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609428284854564, + "loss": 3.0025, + "theoretical_loss": 4.166008046618303, + "tokens_seen": 288227328 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609327983951856, + "loss": 2.9673, + "theoretical_loss": 4.1658961959783944, + "tokens_seen": 288292864 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.154595136642456, + "objective/train/theoretical_loss": 4.165784377879517, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.165784377879517, + "tokens_seen": 288358400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046092276830491475, + "loss": 3.198, + "theoretical_loss": 4.165784377879517, + "tokens_seen": 288358400 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460912738214644, + "loss": 3.1049, + "theoretical_loss": 4.165672592304811, + "tokens_seen": 288423936 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609027081243731, + "loss": 2.9952, + "theoretical_loss": 4.165560839237429, + "tokens_seen": 288489472 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046089267803410235, + "loss": 3.0699, + "theoretical_loss": 4.165449118660536, + "tokens_seen": 288555008 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608826479438315, + "loss": 3.0712, + "theoretical_loss": 4.16533743055731, + "tokens_seen": 288620544 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608726178535607, + "loss": 3.1647, + "theoretical_loss": 4.165225774910941, + "tokens_seen": 288686080 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608625877632899, + "loss": 3.2739, + "theoretical_loss": 4.165114151704634, + "tokens_seen": 288751616 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608525576730191, + "loss": 3.0161, + "theoretical_loss": 4.165002560921601, + "tokens_seen": 288817152 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046084252758274826, + "loss": 3.1432, + "theoretical_loss": 4.164891002545073, + "tokens_seen": 288882688 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046083249749247744, + "loss": 2.9712, + "theoretical_loss": 4.16477947655829, + "tokens_seen": 288948224 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608224674022066, + "loss": 2.8887, + "theoretical_loss": 4.164667982944504, + "tokens_seen": 289013760 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046081243731193586, + "loss": 2.7605, + "theoretical_loss": 4.164556521686981, + "tokens_seen": 289079296 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460802407221665, + "loss": 3.1795, + "theoretical_loss": 4.1644450927689975, + "tokens_seen": 289144832 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607923771313942, + "loss": 2.902, + "theoretical_loss": 4.164333696173846, + "tokens_seen": 289210368 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607823470411234, + "loss": 3.0938, + "theoretical_loss": 4.164222331884827, + "tokens_seen": 289275904 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607723169508526, + "loss": 2.9628, + "theoretical_loss": 4.164110999885256, + "tokens_seen": 289341440 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046076228686058176, + "loss": 2.8941, + "theoretical_loss": 4.163999700158462, + "tokens_seen": 289406976 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046075225677031094, + "loss": 3.0364, + "theoretical_loss": 4.163888432687784, + "tokens_seen": 289472512 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607422266800401, + "loss": 3.1758, + "theoretical_loss": 4.163777197456573, + "tokens_seen": 289538048 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046073219658976936, + "loss": 2.7997, + "theoretical_loss": 4.163665994448197, + "tokens_seen": 289603584 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607221664994985, + "loss": 2.9896, + "theoretical_loss": 4.163554823646027, + "tokens_seen": 289669120 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607121364092277, + "loss": 2.9085, + "theoretical_loss": 4.163443685033458, + "tokens_seen": 289734656 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046070210631895685, + "loss": 2.8829, + "theoretical_loss": 4.163332578593889, + "tokens_seen": 289800192 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606920762286861, + "loss": 2.846, + "theoretical_loss": 4.163221504310734, + "tokens_seen": 289865728 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046068204613841526, + "loss": 2.9586, + "theoretical_loss": 4.1631104621674195, + "tokens_seen": 289931264 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6529836654663086, + "objective/train/theoretical_loss": 4.162999452147384, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.162999452147384, + "tokens_seen": 289996800 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046067201604814445, + "loss": 2.9531, + "theoretical_loss": 4.162999452147384, + "tokens_seen": 289996800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606619859578736, + "loss": 3.323, + "theoretical_loss": 4.1628884742340775, + "tokens_seen": 290062336 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606519558676028, + "loss": 3.1529, + "theoretical_loss": 4.162777528410963, + "tokens_seen": 290127872 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460641925777332, + "loss": 3.1177, + "theoretical_loss": 4.162666614661518, + "tokens_seen": 290193408 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606318956870612, + "loss": 2.821, + "theoretical_loss": 4.162555732969227, + "tokens_seen": 290258944 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046062186559679035, + "loss": 2.7999, + "theoretical_loss": 4.162444883317591, + "tokens_seen": 290324480 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606118355065196, + "loss": 3.0117, + "theoretical_loss": 4.162334065690123, + "tokens_seen": 290390016 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046060180541624877, + "loss": 2.6194, + "theoretical_loss": 4.162223280070345, + "tokens_seen": 290455552 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046059177532597795, + "loss": 2.7548, + "theoretical_loss": 4.1621125264417955, + "tokens_seen": 290521088 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046058174523570713, + "loss": 2.8914, + "theoretical_loss": 4.162001804788021, + "tokens_seen": 290586624 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605717151454363, + "loss": 2.9791, + "theoretical_loss": 4.161891115092583, + "tokens_seen": 290652160 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605616850551655, + "loss": 3.3375, + "theoretical_loss": 4.161780457339055, + "tokens_seen": 290717696 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605516549648947, + "loss": 3.075, + "theoretical_loss": 4.161669831511022, + "tokens_seen": 290783232 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046054162487462385, + "loss": 3.0684, + "theoretical_loss": 4.16155923759208, + "tokens_seen": 290848768 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605315947843531, + "loss": 3.2013, + "theoretical_loss": 4.161448675565838, + "tokens_seen": 290914304 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605215646940822, + "loss": 2.8147, + "theoretical_loss": 4.161338145415918, + "tokens_seen": 290979840 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046051153460381145, + "loss": 2.8885, + "theoretical_loss": 4.161227647125955, + "tokens_seen": 291045376 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046050150451354063, + "loss": 2.9984, + "theoretical_loss": 4.161117180679591, + "tokens_seen": 291110912 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604914744232698, + "loss": 2.9891, + "theoretical_loss": 4.161006746060488, + "tokens_seen": 291176448 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460481444332999, + "loss": 2.8484, + "theoretical_loss": 4.160896343252311, + "tokens_seen": 291241984 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046047141424272823, + "loss": 2.7959, + "theoretical_loss": 4.160785972238745, + "tokens_seen": 291307520 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046046138415245736, + "loss": 3.0276, + "theoretical_loss": 4.160675633003484, + "tokens_seen": 291373056 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604513540621866, + "loss": 2.9596, + "theoretical_loss": 4.16056532553023, + "tokens_seen": 291438592 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604413239719157, + "loss": 2.9352, + "theoretical_loss": 4.160455049802706, + "tokens_seen": 291504128 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046043129388164495, + "loss": 3.0643, + "theoretical_loss": 4.1603448058046375, + "tokens_seen": 291569664 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.340557098388672, + "objective/train/theoretical_loss": 4.160234593519768, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.160234593519768, + "tokens_seen": 291635200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046042126379137414, + "loss": 3.2364, + "theoretical_loss": 4.160234593519768, + "tokens_seen": 291635200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604112337011033, + "loss": 3.0605, + "theoretical_loss": 4.160124412931852, + "tokens_seen": 291700736 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604012036108325, + "loss": 2.8302, + "theoretical_loss": 4.160014264024654, + "tokens_seen": 291766272 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603911735205617, + "loss": 2.7241, + "theoretical_loss": 4.159904146781952, + "tokens_seen": 291831808 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046038114343029086, + "loss": 2.7337, + "theoretical_loss": 4.159794061187536, + "tokens_seen": 291897344 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603711133400201, + "loss": 3.0597, + "theoretical_loss": 4.1596840072252075, + "tokens_seen": 291962880 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603610832497492, + "loss": 2.8511, + "theoretical_loss": 4.159573984878779, + "tokens_seen": 292028416 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046035105315947846, + "loss": 3.1022, + "theoretical_loss": 4.159463994132079, + "tokens_seen": 292093952 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603410230692076, + "loss": 3.2257, + "theoretical_loss": 4.15935403496894, + "tokens_seen": 292159488 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603309929789368, + "loss": 3.1807, + "theoretical_loss": 4.159244107373215, + "tokens_seen": 292225024 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460320962888666, + "loss": 2.9728, + "theoretical_loss": 4.159134211328765, + "tokens_seen": 292290560 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603109327983952, + "loss": 2.9467, + "theoretical_loss": 4.159024346819461, + "tokens_seen": 292356096 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046030090270812436, + "loss": 3.1047, + "theoretical_loss": 4.158914513829189, + "tokens_seen": 292421632 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602908726178536, + "loss": 3.202, + "theoretical_loss": 4.158804712341845, + "tokens_seen": 292487168 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602808425275827, + "loss": 3.202, + "theoretical_loss": 4.158694942341338, + "tokens_seen": 292552704 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046027081243731196, + "loss": 3.0828, + "theoretical_loss": 4.1585852038115885, + "tokens_seen": 292618240 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602607823470411, + "loss": 2.9317, + "theoretical_loss": 4.1584754967365285, + "tokens_seen": 292683776 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602507522567703, + "loss": 2.9311, + "theoretical_loss": 4.1583658211001016, + "tokens_seen": 292749312 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602407221664995, + "loss": 3.1576, + "theoretical_loss": 4.158256176886264, + "tokens_seen": 292814848 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602306920762287, + "loss": 2.8076, + "theoretical_loss": 4.158146564078982, + "tokens_seen": 292880384 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046022066198595787, + "loss": 3.1682, + "theoretical_loss": 4.158036982662237, + "tokens_seen": 292945920 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046021063189568705, + "loss": 2.8877, + "theoretical_loss": 4.157927432620018, + "tokens_seen": 293011456 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602006018054163, + "loss": 2.8661, + "theoretical_loss": 4.157817913936329, + "tokens_seen": 293076992 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046019057171514546, + "loss": 3.0495, + "theoretical_loss": 4.157708426595184, + "tokens_seen": 293142528 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046018054162487465, + "loss": 3.0307, + "theoretical_loss": 4.157598970580608, + "tokens_seen": 293208064 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6987125873565674, + "objective/train/theoretical_loss": 4.157489545876642, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.157489545876642, + "tokens_seen": 293273600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601705115346038, + "loss": 2.8924, + "theoretical_loss": 4.157489545876642, + "tokens_seen": 293273600 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460160481444333, + "loss": 2.7811, + "theoretical_loss": 4.157380152467333, + "tokens_seen": 293339136 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601504513540622, + "loss": 2.929, + "theoretical_loss": 4.157270790336742, + "tokens_seen": 293404672 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601404212637914, + "loss": 3.0652, + "theoretical_loss": 4.157161459468944, + "tokens_seen": 293470208 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046013039117352055, + "loss": 3.0149, + "theoretical_loss": 4.157052159848023, + "tokens_seen": 293535744 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601203610832498, + "loss": 2.8461, + "theoretical_loss": 4.156942891458074, + "tokens_seen": 293601280 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046011033099297897, + "loss": 2.9769, + "theoretical_loss": 4.156833654283207, + "tokens_seen": 293666816 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046010030090270815, + "loss": 3.1259, + "theoretical_loss": 4.15672444830754, + "tokens_seen": 293732352 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046009027081243733, + "loss": 3.0044, + "theoretical_loss": 4.156615273515205, + "tokens_seen": 293797888 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600802407221665, + "loss": 3.1242, + "theoretical_loss": 4.156506129890344, + "tokens_seen": 293863424 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600702106318957, + "loss": 2.8938, + "theoretical_loss": 4.156397017417111, + "tokens_seen": 293928960 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046006018054162493, + "loss": 2.9038, + "theoretical_loss": 4.156287936079675, + "tokens_seen": 293994496 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046005015045135405, + "loss": 2.9051, + "theoretical_loss": 4.156178885862209, + "tokens_seen": 294060032 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600401203610833, + "loss": 2.9019, + "theoretical_loss": 4.156069866748906, + "tokens_seen": 294125568 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600300902708124, + "loss": 2.8916, + "theoretical_loss": 4.155960878723965, + "tokens_seen": 294191104 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046002006018054165, + "loss": 2.8845, + "theoretical_loss": 4.155851921771598, + "tokens_seen": 294256640 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046001003009027083, + "loss": 2.8652, + "theoretical_loss": 4.155742995876029, + "tokens_seen": 294322176 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046, + "loss": 2.8854, + "theoretical_loss": 4.155634101021494, + "tokens_seen": 294387712 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599899699097292, + "loss": 3.0241, + "theoretical_loss": 4.155525237192238, + "tokens_seen": 294453248 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045997993981945843, + "loss": 3.2217, + "theoretical_loss": 4.155416404372522, + "tokens_seen": 294518784 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045996990972918756, + "loss": 2.8677, + "theoretical_loss": 4.155307602546614, + "tokens_seen": 294584320 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599598796389168, + "loss": 2.9117, + "theoretical_loss": 4.155198831698795, + "tokens_seen": 294649856 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599498495486459, + "loss": 2.8769, + "theoretical_loss": 4.155090091813358, + "tokens_seen": 294715392 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045993981945837515, + "loss": 3.2191, + "theoretical_loss": 4.154981382874608, + "tokens_seen": 294780928 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045992978936810434, + "loss": 2.9602, + "theoretical_loss": 4.154872704866859, + "tokens_seen": 294846464 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4592936038970947, + "objective/train/theoretical_loss": 4.15476405777444, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.15476405777444, + "tokens_seen": 294912000 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599197592778335, + "loss": 3.0343, + "theoretical_loss": 4.15476405777444, + "tokens_seen": 294912000 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599097291875627, + "loss": 2.9556, + "theoretical_loss": 4.154655441581687, + "tokens_seen": 294977536 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598996990972919, + "loss": 2.9162, + "theoretical_loss": 4.154546856272952, + "tokens_seen": 295043072 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045988966900702106, + "loss": 2.8154, + "theoretical_loss": 4.154438301832596, + "tokens_seen": 295108608 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598796389167503, + "loss": 3.1434, + "theoretical_loss": 4.154329778244991, + "tokens_seen": 295174144 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598696088264794, + "loss": 2.7265, + "theoretical_loss": 4.154221285494521, + "tokens_seen": 295239680 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045985957873620866, + "loss": 3.1865, + "theoretical_loss": 4.154112823565582, + "tokens_seen": 295305216 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598495486459378, + "loss": 2.9111, + "theoretical_loss": 4.15400439244258, + "tokens_seen": 295370752 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459839518555667, + "loss": 2.9867, + "theoretical_loss": 4.153895992109935, + "tokens_seen": 295436288 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598294884653962, + "loss": 2.9011, + "theoretical_loss": 4.153787622552073, + "tokens_seen": 295501824 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598194583751254, + "loss": 3.0471, + "theoretical_loss": 4.153679283753439, + "tokens_seen": 295567360 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045980942828485456, + "loss": 2.909, + "theoretical_loss": 4.15357097569848, + "tokens_seen": 295632896 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597993981945838, + "loss": 2.7348, + "theoretical_loss": 4.153462698371665, + "tokens_seen": 295698432 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597893681043129, + "loss": 3.0335, + "theoretical_loss": 4.1533544517574645, + "tokens_seen": 295763968 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045977933801404216, + "loss": 2.9854, + "theoretical_loss": 4.153246235840367, + "tokens_seen": 295829504 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597693079237713, + "loss": 2.8577, + "theoretical_loss": 4.153138050604868, + "tokens_seen": 295895040 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597592778335005, + "loss": 3.1334, + "theoretical_loss": 4.153029896035476, + "tokens_seen": 295960576 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597492477432297, + "loss": 3.1153, + "theoretical_loss": 4.152921772116712, + "tokens_seen": 296026112 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597392176529589, + "loss": 3.0102, + "theoretical_loss": 4.152813678833106, + "tokens_seen": 296091648 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045972918756268807, + "loss": 2.846, + "theoretical_loss": 4.152705616169202, + "tokens_seen": 296157184 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045971915747241725, + "loss": 3.0878, + "theoretical_loss": 4.15259758410955, + "tokens_seen": 296222720 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045970912738214643, + "loss": 3.0873, + "theoretical_loss": 4.152489582638719, + "tokens_seen": 296288256 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045969909729187566, + "loss": 3.111, + "theoretical_loss": 4.152381611741281, + "tokens_seen": 296353792 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004596890672016048, + "loss": 2.9683, + "theoretical_loss": 4.152273671401824, + "tokens_seen": 296419328 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459679037111334, + "loss": 3.2746, + "theoretical_loss": 4.152165761604948, + "tokens_seen": 296484864 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 320606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3823139667510986, + "objective/train/theoretical_loss": 4.152057882335261, + "objective/train/tokens_used": 296742368, + "theoretical_loss": 4.152057882335261, + "tokens_seen": 296550400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045966900702106315, + "loss": 2.9545, + "theoretical_loss": 4.152057882335261, + "tokens_seen": 296550400 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004596589769307924, + "loss": 2.8693, + "theoretical_loss": 4.151950033577383, + "tokens_seen": 296615936 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045964894684052157, + "loss": 3.2989, + "theoretical_loss": 4.151842215315947, + "tokens_seen": 296681472 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045963891675025075, + "loss": 3.0708, + "theoretical_loss": 4.151734427535594, + "tokens_seen": 296747008 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045962888665997993, + "loss": 3.9868, + "theoretical_loss": 4.151601419005685, + "tokens_seen": 296827904 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045961885656970917, + "loss": 3.0976, + "theoretical_loss": 4.151493699276069, + "tokens_seen": 296893440 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004596088264794383, + "loss": 3.2042, + "theoretical_loss": 4.151386009977943, + "tokens_seen": 296958976 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045959879638916753, + "loss": 3.1598, + "theoretical_loss": 4.151278351095997, + "tokens_seen": 297024512 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045958876629889666, + "loss": 3.2069, + "theoretical_loss": 4.15117072261493, + "tokens_seen": 297090048 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004595787362086259, + "loss": 2.9252, + "theoretical_loss": 4.151063124519455, + "tokens_seen": 297155584 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004595687061183551, + "loss": 3.1758, + "theoretical_loss": 4.150955556794295, + "tokens_seen": 297221120 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045955867602808425, + "loss": 3.1888, + "theoretical_loss": 4.150848019424184, + "tokens_seen": 297286656 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045954864593781344, + "loss": 3.1973, + "theoretical_loss": 4.150740512393868, + "tokens_seen": 297352192 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004595386158475426, + "loss": 3.1925, + "theoretical_loss": 4.1506330356881005, + "tokens_seen": 297417728 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004595285857572718, + "loss": 3.2762, + "theoretical_loss": 4.150525589291652, + "tokens_seen": 297483264 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045951855566700103, + "loss": 3.2021, + "theoretical_loss": 4.150418173189299, + "tokens_seen": 297548800 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045950852557673016, + "loss": 3.1805, + "theoretical_loss": 4.1503107873658305, + "tokens_seen": 297614336 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004594984954864594, + "loss": 3.3438, + "theoretical_loss": 4.150203431806046, + "tokens_seen": 297679872 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004594884653961885, + "loss": 3.1938, + "theoretical_loss": 4.150096106494758, + "tokens_seen": 297745408 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045947843530591776, + "loss": 3.1626, + "theoretical_loss": 4.149988811416788, + "tokens_seen": 297810944 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045946840521564694, + "loss": 3.1176, + "theoretical_loss": 4.149881546556971, + "tokens_seen": 297876480 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004594583751253761, + "loss": 3.0244, + "theoretical_loss": 4.149774311900147, + "tokens_seen": 297942016 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045944834503510536, + "loss": 3.0722, + "theoretical_loss": 4.149667107431174, + "tokens_seen": 298007552 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045943831494483454, + "loss": 3.0816, + "theoretical_loss": 4.149559933134916, + "tokens_seen": 298073088 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004594282848545637, + "loss": 3.2045, + "theoretical_loss": 4.149452788996252, + "tokens_seen": 298138624 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 388086, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5071983337402344, + "objective/train/theoretical_loss": 4.149372450674081, + "objective/train/tokens_used": 318647776, + "theoretical_loss": 4.149372450674081, + "tokens_seen": 298187776 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004594182547642929, + "loss": 3.0893, + "theoretical_loss": 4.149345675000067, + "tokens_seen": 298204160 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004594082246740221, + "loss": 3.2211, + "theoretical_loss": 4.149238591131261, + "tokens_seen": 298269696 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045939819458375126, + "loss": 3.1047, + "theoretical_loss": 4.149131537374743, + "tokens_seen": 298335232 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004593881644934805, + "loss": 3.1427, + "theoretical_loss": 4.149024513715434, + "tokens_seen": 298400768 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004593781344032096, + "loss": 3.2841, + "theoretical_loss": 4.148917520138264, + "tokens_seen": 298466304 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045936810431293886, + "loss": 3.1813, + "theoretical_loss": 4.148810556628176, + "tokens_seen": 298531840 + }, + { + "epoch": 1.0, + "learning_rate": 0.000459358074222668, + "loss": 3.0694, + "theoretical_loss": 4.148703623170123, + "tokens_seen": 298597376 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004593480441323972, + "loss": 3.0848, + "theoretical_loss": 4.148596719749067, + "tokens_seen": 298662912 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004593380140421264, + "loss": 3.1606, + "theoretical_loss": 4.148489846349984, + "tokens_seen": 298728448 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004593279839518556, + "loss": 3.001, + "theoretical_loss": 4.14838300295786, + "tokens_seen": 298793984 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045931795386158476, + "loss": 3.2776, + "theoretical_loss": 4.148276189557689, + "tokens_seen": 298859520 + }, + { + "epoch": 1.0, + "learning_rate": 0.000459307923771314, + "loss": 3.1101, + "theoretical_loss": 4.148169406134479, + "tokens_seen": 298925056 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004592978936810431, + "loss": 3.2241, + "theoretical_loss": 4.148062652673248, + "tokens_seen": 298990592 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045928786359077236, + "loss": 3.2768, + "theoretical_loss": 4.147955929159024, + "tokens_seen": 299056128 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004592778335005015, + "loss": 3.3648, + "theoretical_loss": 4.147849235576846, + "tokens_seen": 299121664 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004592678034102307, + "loss": 3.039, + "theoretical_loss": 4.147742571911765, + "tokens_seen": 299187200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004592577733199599, + "loss": 3.0152, + "theoretical_loss": 4.147635938148841, + "tokens_seen": 299252736 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004592477432296891, + "loss": 3.1572, + "theoretical_loss": 4.147529334273145, + "tokens_seen": 299318272 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045923771313941827, + "loss": 3.2587, + "theoretical_loss": 4.147422760269759, + "tokens_seen": 299383808 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045922768304914745, + "loss": 3.0774, + "theoretical_loss": 4.147316216123777, + "tokens_seen": 299449344 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045921765295887663, + "loss": 3.2984, + "theoretical_loss": 4.147209701820302, + "tokens_seen": 299514880 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045920762286860586, + "loss": 3.0177, + "theoretical_loss": 4.147103217344448, + "tokens_seen": 299580416 + }, + { + "epoch": 1.0, + "learning_rate": 0.000459197592778335, + "loss": 3.1661, + "theoretical_loss": 4.14699676268134, + "tokens_seen": 299645952 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004591875626880642, + "loss": 2.9458, + "theoretical_loss": 4.146890337816114, + "tokens_seen": 299711488 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045917753259779335, + "loss": 2.9969, + "theoretical_loss": 4.146783942733915, + "tokens_seen": 299777024 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 393339, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2248926162719727, + "objective/train/theoretical_loss": 4.1467041659584485, + "objective/train/tokens_used": 320286176, + "theoretical_loss": 4.1467041659584485, + "tokens_seen": 299826176 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004591675025075226, + "loss": 3.0643, + "theoretical_loss": 4.146677577419902, + "tokens_seen": 299842560 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045915747241725177, + "loss": 3.0779, + "theoretical_loss": 4.14657124185924, + "tokens_seen": 299908096 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045914744232698095, + "loss": 3.1616, + "theoretical_loss": 4.14646493603711, + "tokens_seen": 299973632 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045913741223671013, + "loss": 3.0127, + "theoretical_loss": 4.146358659938699, + "tokens_seen": 300039168 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045912738214643937, + "loss": 3.0727, + "theoretical_loss": 4.146252413549207, + "tokens_seen": 300104704 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004591173520561685, + "loss": 3.2627, + "theoretical_loss": 4.146146196853843, + "tokens_seen": 300170240 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045910732196589773, + "loss": 3.1219, + "theoretical_loss": 4.146040009837829, + "tokens_seen": 300235776 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045909729187562686, + "loss": 3.2786, + "theoretical_loss": 4.145933852486396, + "tokens_seen": 300301312 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004590872617853561, + "loss": 3.0808, + "theoretical_loss": 4.145827724784784, + "tokens_seen": 300366848 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004590772316950853, + "loss": 3.054, + "theoretical_loss": 4.145721626718247, + "tokens_seen": 300432384 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045906720160481445, + "loss": 3.0884, + "theoretical_loss": 4.1456155582720475, + "tokens_seen": 300497920 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045905717151454364, + "loss": 3.0052, + "theoretical_loss": 4.145509519431459, + "tokens_seen": 300563456 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004590471414242728, + "loss": 2.9324, + "theoretical_loss": 4.145403510181765, + "tokens_seen": 300628992 + }, + { + "epoch": 1.0, + "learning_rate": 0.000459037111334002, + "loss": 3.1987, + "theoretical_loss": 4.145297530508261, + "tokens_seen": 300694528 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045902708124373123, + "loss": 3.245, + "theoretical_loss": 4.145191580396251, + "tokens_seen": 300760064 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045901705115346036, + "loss": 3.2895, + "theoretical_loss": 4.145085659831049, + "tokens_seen": 300825600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004590070210631896, + "loss": 3.3486, + "theoretical_loss": 4.144979768797985, + "tokens_seen": 300891136 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004589969909729187, + "loss": 3.0806, + "theoretical_loss": 4.14487390728239, + "tokens_seen": 300956672 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045898696088264796, + "loss": 2.9642, + "theoretical_loss": 4.144768075269616, + "tokens_seen": 301022208 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045897693079237714, + "loss": 3.1238, + "theoretical_loss": 4.144662272745018, + "tokens_seen": 301087744 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004589669007021063, + "loss": 3.0449, + "theoretical_loss": 4.144556499693964, + "tokens_seen": 301153280 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004589568706118355, + "loss": 3.1233, + "theoretical_loss": 4.144450756101832, + "tokens_seen": 301218816 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045894684052156474, + "loss": 3.1373, + "theoretical_loss": 4.144345041954011, + "tokens_seen": 301284352 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045893681043129386, + "loss": 3.047, + "theoretical_loss": 4.1442393572359, + "tokens_seen": 301349888 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004589267803410231, + "loss": 3.0807, + "theoretical_loss": 4.14413370193291, + "tokens_seen": 301415424 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 398212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0288240909576416, + "objective/train/theoretical_loss": 4.144054479750567, + "objective/train/tokens_used": 321924576, + "theoretical_loss": 4.144054479750567, + "tokens_seen": 301464576 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004589167502507522, + "loss": 3.0242, + "theoretical_loss": 4.144028076030458, + "tokens_seen": 301480960 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045890672016048146, + "loss": 3.2372, + "theoretical_loss": 4.143922479513977, + "tokens_seen": 301546496 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045889669007021064, + "loss": 3.141, + "theoretical_loss": 4.143816912368906, + "tokens_seen": 301612032 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004588866599799398, + "loss": 3.2979, + "theoretical_loss": 4.143711374580697, + "tokens_seen": 301677568 + }, + { + "epoch": 1.0, + "learning_rate": 0.000458876629889669, + "loss": 3.1576, + "theoretical_loss": 4.143605866134811, + "tokens_seen": 301743104 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004588665997993982, + "loss": 3.1577, + "theoretical_loss": 4.1435003870167195, + "tokens_seen": 301808640 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045885656970912737, + "loss": 3.1194, + "theoretical_loss": 4.143394937211906, + "tokens_seen": 301874176 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004588465396188566, + "loss": 3.0641, + "theoretical_loss": 4.143289516705861, + "tokens_seen": 301939712 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045883650952858573, + "loss": 3.1287, + "theoretical_loss": 4.14318412548409, + "tokens_seen": 302005248 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045882647943831496, + "loss": 3.1244, + "theoretical_loss": 4.143078763532104, + "tokens_seen": 302070784 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045881644934804415, + "loss": 3.2705, + "theoretical_loss": 4.1429734308354265, + "tokens_seen": 302136320 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004588064192577733, + "loss": 3.2505, + "theoretical_loss": 4.142868127379592, + "tokens_seen": 302201856 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004587963891675025, + "loss": 3.0527, + "theoretical_loss": 4.142762853150145, + "tokens_seen": 302267392 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004587863590772317, + "loss": 3.0989, + "theoretical_loss": 4.142657608132638, + "tokens_seen": 302332928 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045877632898696087, + "loss": 3.3695, + "theoretical_loss": 4.142552392312638, + "tokens_seen": 302398464 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004587662988966901, + "loss": 3.1914, + "theoretical_loss": 4.142447205675717, + "tokens_seen": 302464000 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045875626880641923, + "loss": 3.3743, + "theoretical_loss": 4.142342048207462, + "tokens_seen": 302529536 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045874623871614847, + "loss": 3.2538, + "theoretical_loss": 4.1422369198934685, + "tokens_seen": 302595072 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004587362086258776, + "loss": 3.2982, + "theoretical_loss": 4.142131820719342, + "tokens_seen": 302660608 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045872617853560683, + "loss": 3.1992, + "theoretical_loss": 4.142026750670697, + "tokens_seen": 302726144 + }, + { + "epoch": 1.0, + "learning_rate": 0.000458716148445336, + "loss": 3.1915, + "theoretical_loss": 4.14192170973316, + "tokens_seen": 302791680 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004587061183550652, + "loss": 3.3022, + "theoretical_loss": 4.141816697892368, + "tokens_seen": 302857216 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004586960882647944, + "loss": 3.347, + "theoretical_loss": 4.141711715133967, + "tokens_seen": 302922752 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045868605817452355, + "loss": 3.2762, + "theoretical_loss": 4.1416067614436125, + "tokens_seen": 302988288 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004586760280842528, + "loss": 3.1287, + "theoretical_loss": 4.141501836806973, + "tokens_seen": 303053824 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 403119, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8905088901519775, + "objective/train/theoretical_loss": 4.141423162387376, + "objective/train/tokens_used": 323562976, + "theoretical_loss": 4.141423162387376, + "tokens_seen": 303102976 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045866599799398197, + "loss": 3.0808, + "theoretical_loss": 4.141396941209724, + "tokens_seen": 303119360 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045865596790371115, + "loss": 3.2722, + "theoretical_loss": 4.141292074637554, + "tokens_seen": 303184896 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045864593781344033, + "loss": 3.4675, + "theoretical_loss": 4.141187237076158, + "tokens_seen": 303250432 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045863590772316957, + "loss": 3.2572, + "theoretical_loss": 4.141082428511247, + "tokens_seen": 303315968 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004586258776328987, + "loss": 3.2468, + "theoretical_loss": 4.140977648928534, + "tokens_seen": 303381504 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045861584754262793, + "loss": 3.025, + "theoretical_loss": 4.14087289831375, + "tokens_seen": 303447040 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045860581745235706, + "loss": 3.2632, + "theoretical_loss": 4.140768176652632, + "tokens_seen": 303512576 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004585957873620863, + "loss": 3.4513, + "theoretical_loss": 4.1406634839309255, + "tokens_seen": 303578112 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004585857572718155, + "loss": 3.307, + "theoretical_loss": 4.140558820134391, + "tokens_seen": 303643648 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045857572718154465, + "loss": 3.2101, + "theoretical_loss": 4.140454185248797, + "tokens_seen": 303709184 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045856569709127384, + "loss": 3.3302, + "theoretical_loss": 4.140349579259919, + "tokens_seen": 303774720 + }, + { + "epoch": 1.0, + "learning_rate": 0.000458555667001003, + "loss": 3.2633, + "theoretical_loss": 4.140245002153547, + "tokens_seen": 303840256 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004585456369107322, + "loss": 3.0562, + "theoretical_loss": 4.140140453915478, + "tokens_seen": 303905792 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045853560682046143, + "loss": 3.1859, + "theoretical_loss": 4.140035934531521, + "tokens_seen": 303971328 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045852557673019056, + "loss": 3.1359, + "theoretical_loss": 4.139931443987494, + "tokens_seen": 304036864 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004585155466399198, + "loss": 3.4099, + "theoretical_loss": 4.139826982269225, + "tokens_seen": 304102400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004585055165496489, + "loss": 3.2253, + "theoretical_loss": 4.139722549362553, + "tokens_seen": 304167936 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045849548645937816, + "loss": 3.1415, + "theoretical_loss": 4.139618145253326, + "tokens_seen": 304233472 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045848545636910734, + "loss": 3.0069, + "theoretical_loss": 4.139513769927402, + "tokens_seen": 304299008 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004584754262788365, + "loss": 3.3609, + "theoretical_loss": 4.139409423370649, + "tokens_seen": 304364544 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004584653961885657, + "loss": 3.3372, + "theoretical_loss": 4.139305105568946, + "tokens_seen": 304430080 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045845536609829494, + "loss": 3.124, + "theoretical_loss": 4.139200816508181, + "tokens_seen": 304495616 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045844533600802406, + "loss": 3.1771, + "theoretical_loss": 4.139096556174252, + "tokens_seen": 304561152 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004584353059177533, + "loss": 3.2137, + "theoretical_loss": 4.138992324553068, + "tokens_seen": 304626688 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004584252758274824, + "loss": 3.2045, + "theoretical_loss": 4.138888121630545, + "tokens_seen": 304692224 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 408173, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1989989280700684, + "objective/train/theoretical_loss": 4.138809988263685, + "objective/train/tokens_used": 325201376, + "theoretical_loss": 4.138809988263685, + "tokens_seen": 304741376 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045841524573721166, + "loss": 3.3398, + "theoretical_loss": 4.138783947392613, + "tokens_seen": 304757760 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045840521564694084, + "loss": 3.1126, + "theoretical_loss": 4.13867980182521, + "tokens_seen": 304823296 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045839518555667, + "loss": 3.1801, + "theoretical_loss": 4.138575684914282, + "tokens_seen": 304888832 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004583851554663992, + "loss": 3.2703, + "theoretical_loss": 4.138471596645789, + "tokens_seen": 304954368 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004583751253761284, + "loss": 3.2164, + "theoretical_loss": 4.138367537005697, + "tokens_seen": 305019904 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045836509528585757, + "loss": 3.3049, + "theoretical_loss": 4.1382635059799835, + "tokens_seen": 305085440 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004583550651955868, + "loss": 2.9696, + "theoretical_loss": 4.138159503554638, + "tokens_seen": 305150976 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045834503510531593, + "loss": 3.3098, + "theoretical_loss": 4.138055529715655, + "tokens_seen": 305216512 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045833500501504516, + "loss": 3.1387, + "theoretical_loss": 4.137951584449044, + "tokens_seen": 305282048 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045832497492477435, + "loss": 3.1771, + "theoretical_loss": 4.13784766774082, + "tokens_seen": 305347584 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004583149448345035, + "loss": 3.171, + "theoretical_loss": 4.137743779577011, + "tokens_seen": 305413120 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004583049147442327, + "loss": 3.009, + "theoretical_loss": 4.137639919943655, + "tokens_seen": 305478656 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004582948846539619, + "loss": 3.0046, + "theoretical_loss": 4.137536088826796, + "tokens_seen": 305544192 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045828485456369107, + "loss": 3.0266, + "theoretical_loss": 4.1374322862124915, + "tokens_seen": 305609728 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004582748244734203, + "loss": 2.9813, + "theoretical_loss": 4.137328512086807, + "tokens_seen": 305675264 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045826479438314943, + "loss": 3.0888, + "theoretical_loss": 4.1372247664358195, + "tokens_seen": 305740800 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045825476429287867, + "loss": 2.9889, + "theoretical_loss": 4.137121049245614, + "tokens_seen": 305806336 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004582447342026078, + "loss": 2.9691, + "theoretical_loss": 4.137017360502286, + "tokens_seen": 305871872 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045823470411233703, + "loss": 2.893, + "theoretical_loss": 4.136913700191942, + "tokens_seen": 305937408 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004582246740220662, + "loss": 3.1183, + "theoretical_loss": 4.136810068300694, + "tokens_seen": 306002944 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004582146439317954, + "loss": 3.2618, + "theoretical_loss": 4.13670646481467, + "tokens_seen": 306068480 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045820461384152457, + "loss": 3.2482, + "theoretical_loss": 4.1366028897200025, + "tokens_seen": 306134016 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045819458375125375, + "loss": 3.1096, + "theoretical_loss": 4.1364993430028365, + "tokens_seen": 306199552 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045818455366098294, + "loss": 3.4132, + "theoretical_loss": 4.136395824649327, + "tokens_seen": 306265088 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045817452357071217, + "loss": 3.1493, + "theoretical_loss": 4.136292334645636, + "tokens_seen": 306330624 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 413344, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.146451950073242, + "objective/train/theoretical_loss": 4.136214735739118, + "objective/train/tokens_used": 326839776, + "theoretical_loss": 4.136214735739118, + "tokens_seen": 306379776 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004581644934804413, + "loss": 3.0853, + "theoretical_loss": 4.136188872977938, + "tokens_seen": 306396160 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045815446339017053, + "loss": 3.0507, + "theoretical_loss": 4.1360854396324145, + "tokens_seen": 306461696 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004581444332998997, + "loss": 3.0544, + "theoretical_loss": 4.13598203459526, + "tokens_seen": 306527232 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004581344032096289, + "loss": 3.0592, + "theoretical_loss": 4.135878657852677, + "tokens_seen": 306592768 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004581243731193581, + "loss": 3.1452, + "theoretical_loss": 4.135775309390876, + "tokens_seen": 306658304 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045811434302908726, + "loss": 2.9714, + "theoretical_loss": 4.135671989196081, + "tokens_seen": 306723840 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045810431293881644, + "loss": 3.279, + "theoretical_loss": 4.135568697254522, + "tokens_seen": 306789376 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004580942828485457, + "loss": 3.2136, + "theoretical_loss": 4.135465433552441, + "tokens_seen": 306854912 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004580842527582748, + "loss": 3.1741, + "theoretical_loss": 4.135362198076088, + "tokens_seen": 306920448 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045807422266800404, + "loss": 3.2101, + "theoretical_loss": 4.135258990811723, + "tokens_seen": 306985984 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045806419257773316, + "loss": 3.0828, + "theoretical_loss": 4.1351558117456175, + "tokens_seen": 307051520 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004580541624874624, + "loss": 3.0378, + "theoretical_loss": 4.13505266086405, + "tokens_seen": 307117056 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004580441323971916, + "loss": 3.0833, + "theoretical_loss": 4.134949538153309, + "tokens_seen": 307182592 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045803410230692076, + "loss": 3.0974, + "theoretical_loss": 4.134846443599694, + "tokens_seen": 307248128 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045802407221664994, + "loss": 3.1697, + "theoretical_loss": 4.1347433771895155, + "tokens_seen": 307313664 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004580140421263791, + "loss": 3.1566, + "theoretical_loss": 4.134640338909088, + "tokens_seen": 307379200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004580040120361083, + "loss": 2.9549, + "theoretical_loss": 4.134537328744742, + "tokens_seen": 307444736 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045799398194583754, + "loss": 3.0834, + "theoretical_loss": 4.134434346682812, + "tokens_seen": 307510272 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045798395185556667, + "loss": 2.92, + "theoretical_loss": 4.134331392709647, + "tokens_seen": 307575808 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004579739217652959, + "loss": 3.018, + "theoretical_loss": 4.134228466811601, + "tokens_seen": 307641344 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004579638916750251, + "loss": 3.0011, + "theoretical_loss": 4.1341255689750405, + "tokens_seen": 307706880 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045795386158475426, + "loss": 3.1451, + "theoretical_loss": 4.134022699186342, + "tokens_seen": 307772416 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004579438314944835, + "loss": 3.0998, + "theoretical_loss": 4.133919857431889, + "tokens_seen": 307837952 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004579338014042126, + "loss": 3.0204, + "theoretical_loss": 4.133817043698075, + "tokens_seen": 307903488 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045792377131394186, + "loss": 3.0224, + "theoretical_loss": 4.133714257971306, + "tokens_seen": 307969024 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 418392, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.903918981552124, + "objective/train/theoretical_loss": 4.133637187047677, + "objective/train/tokens_used": 328478176, + "theoretical_loss": 4.133637187047677, + "tokens_seen": 308018176 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045791374122367104, + "loss": 2.9841, + "theoretical_loss": 4.133611500237993, + "tokens_seen": 308034560 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004579037111334002, + "loss": 2.9786, + "theoretical_loss": 4.133508770484561, + "tokens_seen": 308100096 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004578936810431294, + "loss": 3.0327, + "theoretical_loss": 4.13340606869744, + "tokens_seen": 308165632 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004578836509528586, + "loss": 3.1817, + "theoretical_loss": 4.133303394863072, + "tokens_seen": 308231168 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045787362086258777, + "loss": 2.8875, + "theoretical_loss": 4.133200748967909, + "tokens_seen": 308296704 + }, + { + "epoch": 1.0, + "learning_rate": 0.000457863590772317, + "loss": 3.3509, + "theoretical_loss": 4.133098130998412, + "tokens_seen": 308362240 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045785356068204613, + "loss": 3.14, + "theoretical_loss": 4.1329955409410495, + "tokens_seen": 308427776 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045784353059177536, + "loss": 3.1157, + "theoretical_loss": 4.132892978782301, + "tokens_seen": 308493312 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045783350050150455, + "loss": 3.2767, + "theoretical_loss": 4.1327904445086565, + "tokens_seen": 308558848 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004578234704112337, + "loss": 3.1308, + "theoretical_loss": 4.132687938106614, + "tokens_seen": 308624384 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004578134403209629, + "loss": 2.9828, + "theoretical_loss": 4.132585459562681, + "tokens_seen": 308689920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004578034102306921, + "loss": 3.2487, + "theoretical_loss": 4.132483008863374, + "tokens_seen": 308755456 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045779338014042127, + "loss": 2.948, + "theoretical_loss": 4.13238058599522, + "tokens_seen": 308820992 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004577833500501505, + "loss": 3.2227, + "theoretical_loss": 4.132278190944755, + "tokens_seen": 308886528 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045777331995987963, + "loss": 3.115, + "theoretical_loss": 4.132175823698523, + "tokens_seen": 308952064 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045776328986960887, + "loss": 3.1668, + "theoretical_loss": 4.1320734842430795, + "tokens_seen": 309017600 + }, + { + "epoch": 1.0, + "learning_rate": 0.000457753259779338, + "loss": 3.0767, + "theoretical_loss": 4.131971172564989, + "tokens_seen": 309083136 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045774322968906723, + "loss": 2.9926, + "theoretical_loss": 4.131868888650824, + "tokens_seen": 309148672 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004577331995987964, + "loss": 3.0552, + "theoretical_loss": 4.1317666324871665, + "tokens_seen": 309214208 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004577231695085256, + "loss": 3.0081, + "theoretical_loss": 4.13166440406061, + "tokens_seen": 309279744 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004577131394182548, + "loss": 2.9077, + "theoretical_loss": 4.131562203357753, + "tokens_seen": 309345280 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045770310932798395, + "loss": 3.0159, + "theoretical_loss": 4.131460030365209, + "tokens_seen": 309410816 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045769307923771314, + "loss": 3.2991, + "theoretical_loss": 4.131357885069596, + "tokens_seen": 309476352 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045768304914744237, + "loss": 3.2686, + "theoretical_loss": 4.131255767457543, + "tokens_seen": 309541888 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004576730190571715, + "loss": 2.9812, + "theoretical_loss": 4.13115367751569, + "tokens_seen": 309607424 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 423122, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.105621099472046, + "objective/train/theoretical_loss": 4.131077128209836, + "objective/train/tokens_used": 330116576, + "theoretical_loss": 4.131077128209836, + "tokens_seen": 309656576 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045766298896690073, + "loss": 3.2017, + "theoretical_loss": 4.1310516152306835, + "tokens_seen": 309672960 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004576529588766299, + "loss": 3.1174, + "theoretical_loss": 4.130949580589181, + "tokens_seen": 309738496 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004576429287863591, + "loss": 3.0827, + "theoretical_loss": 4.130847573577848, + "tokens_seen": 309804032 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004576328986960883, + "loss": 3.1589, + "theoretical_loss": 4.13074559418336, + "tokens_seen": 309869568 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045762286860581746, + "loss": 3.2419, + "theoretical_loss": 4.1306436423924024, + "tokens_seen": 309935104 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045761283851554664, + "loss": 3.0622, + "theoretical_loss": 4.130541718191669, + "tokens_seen": 310000640 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004576028084252759, + "loss": 3.0627, + "theoretical_loss": 4.130439821567863, + "tokens_seen": 310066176 + }, + { + "epoch": 1.0, + "learning_rate": 0.000457592778335005, + "loss": 2.9596, + "theoretical_loss": 4.130337952507695, + "tokens_seen": 310131712 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045758274824473424, + "loss": 3.1407, + "theoretical_loss": 4.130236110997889, + "tokens_seen": 310197248 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045757271815446336, + "loss": 3.0654, + "theoretical_loss": 4.130134297025175, + "tokens_seen": 310262784 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004575626880641926, + "loss": 3.1397, + "theoretical_loss": 4.130032510576292, + "tokens_seen": 310328320 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004575526579739218, + "loss": 3.171, + "theoretical_loss": 4.129930751637991, + "tokens_seen": 310393856 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045754262788365096, + "loss": 3.2332, + "theoretical_loss": 4.129829020197029, + "tokens_seen": 310459392 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045753259779338014, + "loss": 3.0021, + "theoretical_loss": 4.1297273162401735, + "tokens_seen": 310524928 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004575225677031093, + "loss": 2.9555, + "theoretical_loss": 4.1296256397542015, + "tokens_seen": 310590464 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004575125376128385, + "loss": 3.1127, + "theoretical_loss": 4.1295239907258985, + "tokens_seen": 310656000 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045750250752256774, + "loss": 3.0528, + "theoretical_loss": 4.12942236914206, + "tokens_seen": 310721536 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045749247743229687, + "loss": 3.0231, + "theoretical_loss": 4.129320774989489, + "tokens_seen": 310787072 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004574824473420261, + "loss": 3.1342, + "theoretical_loss": 4.129219208255002, + "tokens_seen": 310852608 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004574724172517553, + "loss": 3.113, + "theoretical_loss": 4.129117668925415, + "tokens_seen": 310918144 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045746238716148446, + "loss": 2.9051, + "theoretical_loss": 4.129016156987566, + "tokens_seen": 310983680 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045745235707121364, + "loss": 2.9875, + "theoretical_loss": 4.128914672428293, + "tokens_seen": 311049216 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004574423269809428, + "loss": 3.06, + "theoretical_loss": 4.128813215234444, + "tokens_seen": 311114752 + }, + { + "epoch": 1.0, + "learning_rate": 0.000457432296890672, + "loss": 3.0676, + "theoretical_loss": 4.128711785392879, + "tokens_seen": 311180288 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045742226680040124, + "loss": 3.1011, + "theoretical_loss": 4.128610382890467, + "tokens_seen": 311245824 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 428239, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.340728282928467, + "objective/train/theoretical_loss": 4.128534348947081, + "objective/train/tokens_used": 331754976, + "theoretical_loss": 4.128534348947081, + "tokens_seen": 311294976 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045741223671013037, + "loss": 3.1619, + "theoretical_loss": 4.128509007714083, + "tokens_seen": 311311360 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004574022066198596, + "loss": 3.0496, + "theoretical_loss": 4.128407659850614, + "tokens_seen": 311376896 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045739217652958873, + "loss": 2.8642, + "theoretical_loss": 4.128306339286954, + "tokens_seen": 311442432 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045738214643931797, + "loss": 3.0832, + "theoretical_loss": 4.1282050460100095, + "tokens_seen": 311507968 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045737211634904715, + "loss": 3.1083, + "theoretical_loss": 4.12810378000669, + "tokens_seen": 311573504 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045736208625877633, + "loss": 2.8529, + "theoretical_loss": 4.12800254126392, + "tokens_seen": 311639040 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004573520561685055, + "loss": 3.1388, + "theoretical_loss": 4.12790132976863, + "tokens_seen": 311704576 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045734202607823475, + "loss": 2.9444, + "theoretical_loss": 4.127800145507759, + "tokens_seen": 311770112 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045733199598796387, + "loss": 2.8887, + "theoretical_loss": 4.127698988468259, + "tokens_seen": 311835648 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004573219658976931, + "loss": 3.1635, + "theoretical_loss": 4.127597858637085, + "tokens_seen": 311901184 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045731193580742223, + "loss": 3.0093, + "theoretical_loss": 4.1274967560012055, + "tokens_seen": 311966720 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045730190571715147, + "loss": 3.1035, + "theoretical_loss": 4.127395680547597, + "tokens_seen": 312032256 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045729187562688065, + "loss": 3.207, + "theoretical_loss": 4.127294632263243, + "tokens_seen": 312097792 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045728184553660983, + "loss": 3.0375, + "theoretical_loss": 4.127193611135139, + "tokens_seen": 312163328 + }, + { + "epoch": 1.0, + "learning_rate": 0.000457271815446339, + "loss": 2.8885, + "theoretical_loss": 4.1270926171502875, + "tokens_seen": 312228864 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004572617853560682, + "loss": 3.0487, + "theoretical_loss": 4.1269916502957, + "tokens_seen": 312294400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004572517552657974, + "loss": 3.0653, + "theoretical_loss": 4.126890710558398, + "tokens_seen": 312359936 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004572417251755266, + "loss": 2.9576, + "theoretical_loss": 4.126789797925411, + "tokens_seen": 312425472 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045723169508525574, + "loss": 3.2364, + "theoretical_loss": 4.126688912383777, + "tokens_seen": 312491008 + }, + { + "epoch": 1.0, + "learning_rate": 0.000457221664994985, + "loss": 3.2365, + "theoretical_loss": 4.126588053920545, + "tokens_seen": 312556544 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004572116349047141, + "loss": 3.1703, + "theoretical_loss": 4.126487222522771, + "tokens_seen": 312622080 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045720160481444334, + "loss": 3.1546, + "theoretical_loss": 4.12638641817752, + "tokens_seen": 312687616 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045719157472417257, + "loss": 3.2756, + "theoretical_loss": 4.126285640871867, + "tokens_seen": 312753152 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004571815446339017, + "loss": 3.1369, + "theoretical_loss": 4.126184890592894, + "tokens_seen": 312818688 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045717151454363093, + "loss": 2.9366, + "theoretical_loss": 4.126084167327694, + "tokens_seen": 312884224 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 433098, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1568994522094727, + "objective/train/theoretical_loss": 4.126008642598823, + "objective/train/tokens_used": 333393376, + "theoretical_loss": 4.126008642598823, + "tokens_seen": 312933376 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004571614844533601, + "loss": 3.0267, + "theoretical_loss": 4.125983471063368, + "tokens_seen": 312949760 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004571514543630893, + "loss": 3.0007, + "theoretical_loss": 4.125882801787025, + "tokens_seen": 313015296 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004571414242728185, + "loss": 3.1487, + "theoretical_loss": 4.125782159485785, + "tokens_seen": 313080832 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045713139418254766, + "loss": 3.1603, + "theoretical_loss": 4.125681544146774, + "tokens_seen": 313146368 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045712136409227684, + "loss": 3.1322, + "theoretical_loss": 4.12558095575713, + "tokens_seen": 313211904 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004571113340020061, + "loss": 3.1128, + "theoretical_loss": 4.125480394303996, + "tokens_seen": 313277440 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004571013039117352, + "loss": 3.0045, + "theoretical_loss": 4.125379859774528, + "tokens_seen": 313342976 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045709127382146444, + "loss": 3.0059, + "theoretical_loss": 4.1252793521558875, + "tokens_seen": 313408512 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045708124373119356, + "loss": 2.9702, + "theoretical_loss": 4.125178871435247, + "tokens_seen": 313474048 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004570712136409228, + "loss": 3.2466, + "theoretical_loss": 4.125078417599785, + "tokens_seen": 313539584 + }, + { + "epoch": 1.01, + "learning_rate": 0.000457061183550652, + "loss": 2.8958, + "theoretical_loss": 4.124977990636694, + "tokens_seen": 313605120 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045705115346038116, + "loss": 2.9426, + "theoretical_loss": 4.124877590533169, + "tokens_seen": 313670656 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045704112337011034, + "loss": 3.1383, + "theoretical_loss": 4.124777217276417, + "tokens_seen": 313736192 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004570310932798395, + "loss": 2.9676, + "theoretical_loss": 4.124676870853656, + "tokens_seen": 313801728 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004570210631895687, + "loss": 2.6934, + "theoretical_loss": 4.124576551252107, + "tokens_seen": 313867264 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045701103309929794, + "loss": 2.9895, + "theoretical_loss": 4.124476258459005, + "tokens_seen": 313932800 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045700100300902707, + "loss": 3.3011, + "theoretical_loss": 4.124375992461592, + "tokens_seen": 313998336 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004569909729187563, + "loss": 2.9931, + "theoretical_loss": 4.124275753247116, + "tokens_seen": 314063872 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004569809428284855, + "loss": 3.0106, + "theoretical_loss": 4.124175540802839, + "tokens_seen": 314129408 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045697091273821466, + "loss": 3.0805, + "theoretical_loss": 4.124075355116028, + "tokens_seen": 314194944 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045696088264794384, + "loss": 3.1193, + "theoretical_loss": 4.123975196173959, + "tokens_seen": 314260480 + }, + { + "epoch": 1.01, + "learning_rate": 0.000456950852557673, + "loss": 2.9561, + "theoretical_loss": 4.123875063963917, + "tokens_seen": 314326016 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004569408224674022, + "loss": 3.0758, + "theoretical_loss": 4.123774958473197, + "tokens_seen": 314391552 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045693079237713144, + "loss": 3.1406, + "theoretical_loss": 4.123674879689101, + "tokens_seen": 314457088 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045692076228686057, + "loss": 3.1404, + "theoretical_loss": 4.123574827598942, + "tokens_seen": 314522624 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 438319, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2052862644195557, + "objective/train/theoretical_loss": 4.12349980604159, + "objective/train/tokens_used": 335031776, + "theoretical_loss": 4.12349980604159, + "tokens_seen": 314571776 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004569107321965898, + "loss": 3.1546, + "theoretical_loss": 4.123474802190039, + "tokens_seen": 314588160 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045690070210631893, + "loss": 3.1286, + "theoretical_loss": 4.123374803449719, + "tokens_seen": 314653696 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045689067201604817, + "loss": 3.022, + "theoretical_loss": 4.123274831365322, + "tokens_seen": 314719232 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045688064192577735, + "loss": 3.0278, + "theoretical_loss": 4.123174885924192, + "tokens_seen": 314784768 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045687061183550653, + "loss": 3.1057, + "theoretical_loss": 4.123074967113686, + "tokens_seen": 314850304 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004568605817452357, + "loss": 3.2642, + "theoretical_loss": 4.122975074921165, + "tokens_seen": 314915840 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045685055165496495, + "loss": 2.8919, + "theoretical_loss": 4.122875209334002, + "tokens_seen": 314981376 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045684052156469407, + "loss": 3.119, + "theoretical_loss": 4.122775370339577, + "tokens_seen": 315046912 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004568304914744233, + "loss": 3.188, + "theoretical_loss": 4.12267555792528, + "tokens_seen": 315112448 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045682046138415243, + "loss": 3.0429, + "theoretical_loss": 4.122575772078507, + "tokens_seen": 315177984 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045681043129388167, + "loss": 3.2101, + "theoretical_loss": 4.122476012786667, + "tokens_seen": 315243520 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045680040120361085, + "loss": 3.2976, + "theoretical_loss": 4.122376280037173, + "tokens_seen": 315309056 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045679037111334003, + "loss": 3.0171, + "theoretical_loss": 4.122276573817448, + "tokens_seen": 315374592 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004567803410230692, + "loss": 3.0178, + "theoretical_loss": 4.122176894114926, + "tokens_seen": 315440128 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004567703109327984, + "loss": 3.2494, + "theoretical_loss": 4.122077240917046, + "tokens_seen": 315505664 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004567602808425276, + "loss": 2.9009, + "theoretical_loss": 4.121977614211259, + "tokens_seen": 315571200 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004567502507522568, + "loss": 3.2204, + "theoretical_loss": 4.12187801398502, + "tokens_seen": 315636736 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045674022066198594, + "loss": 3.1532, + "theoretical_loss": 4.121778440225798, + "tokens_seen": 315702272 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004567301905717152, + "loss": 3.0529, + "theoretical_loss": 4.121678892921066, + "tokens_seen": 315767808 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004567201604814443, + "loss": 2.8736, + "theoretical_loss": 4.121579372058308, + "tokens_seen": 315833344 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045671013039117354, + "loss": 3.103, + "theoretical_loss": 4.121479877625016, + "tokens_seen": 315898880 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004567001003009027, + "loss": 3.0901, + "theoretical_loss": 4.12138040960869, + "tokens_seen": 315964416 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004566900702106319, + "loss": 2.9924, + "theoretical_loss": 4.12128096799684, + "tokens_seen": 316029952 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004566800401203611, + "loss": 3.1131, + "theoretical_loss": 4.1211815527769815, + "tokens_seen": 316095488 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004566700100300903, + "loss": 3.2011, + "theoretical_loss": 4.1210821639366415, + "tokens_seen": 316161024 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 440664, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3091273307800293, + "objective/train/theoretical_loss": 4.121007639610445, + "objective/train/tokens_used": 336670176, + "theoretical_loss": 4.121007639610445, + "tokens_seen": 316210176 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045665997993981944, + "loss": 3.1517, + "theoretical_loss": 4.120982801463353, + "tokens_seen": 316226560 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004566499498495487, + "loss": 3.0728, + "theoretical_loss": 4.1208834653446615, + "tokens_seen": 316292096 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004566399197592778, + "loss": 3.2325, + "theoretical_loss": 4.120784155568115, + "tokens_seen": 316357632 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045662988966900704, + "loss": 3.1194, + "theoretical_loss": 4.120684872121277, + "tokens_seen": 316423168 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004566198595787362, + "loss": 3.1201, + "theoretical_loss": 4.120585614991712, + "tokens_seen": 316488704 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004566098294884654, + "loss": 3.1454, + "theoretical_loss": 4.120486384166998, + "tokens_seen": 316554240 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004565997993981946, + "loss": 2.9571, + "theoretical_loss": 4.120387179634721, + "tokens_seen": 316619776 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045658976930792376, + "loss": 3.1203, + "theoretical_loss": 4.120288001382474, + "tokens_seen": 316685312 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045657973921765294, + "loss": 3.1862, + "theoretical_loss": 4.1201888493978585, + "tokens_seen": 316750848 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004565697091273822, + "loss": 3.0073, + "theoretical_loss": 4.120089723668484, + "tokens_seen": 316816384 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004565596790371113, + "loss": 3.1276, + "theoretical_loss": 4.1199906241819715, + "tokens_seen": 316881920 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045654964894684054, + "loss": 3.1453, + "theoretical_loss": 4.119891550925946, + "tokens_seen": 316947456 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045653961885656967, + "loss": 3.1468, + "theoretical_loss": 4.119792503888043, + "tokens_seen": 317012992 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004565295887662989, + "loss": 3.0982, + "theoretical_loss": 4.119693483055908, + "tokens_seen": 317078528 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004565195586760281, + "loss": 2.9135, + "theoretical_loss": 4.119594488417192, + "tokens_seen": 317144064 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045650952858575727, + "loss": 3.1709, + "theoretical_loss": 4.119495519959556, + "tokens_seen": 317209600 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045649949849548645, + "loss": 3.1193, + "theoretical_loss": 4.119396577670669, + "tokens_seen": 317275136 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004564894684052157, + "loss": 3.1036, + "theoretical_loss": 4.119297661538209, + "tokens_seen": 317340672 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004564794383149448, + "loss": 2.9798, + "theoretical_loss": 4.11919877154986, + "tokens_seen": 317406208 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045646940822467405, + "loss": 3.0293, + "theoretical_loss": 4.119099907693318, + "tokens_seen": 317471744 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045645937813440317, + "loss": 3.0721, + "theoretical_loss": 4.119001069956284, + "tokens_seen": 317537280 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004564493480441324, + "loss": 3.3166, + "theoretical_loss": 4.11890225832647, + "tokens_seen": 317602816 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045643931795386164, + "loss": 2.9832, + "theoretical_loss": 4.118803472791592, + "tokens_seen": 317668352 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045642928786359077, + "loss": 3.1933, + "theoretical_loss": 4.1187047133393815, + "tokens_seen": 317733888 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045641925777332, + "loss": 3.0072, + "theoretical_loss": 4.118605979957572, + "tokens_seen": 317799424 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 445676, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2730116844177246, + "objective/train/theoretical_loss": 4.118531947022541, + "objective/train/tokens_used": 338308576, + "theoretical_loss": 4.118531947022541, + "tokens_seen": 317848576 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045640922768304913, + "loss": 3.2766, + "theoretical_loss": 4.118507272633906, + "tokens_seen": 317864960 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045639919759277837, + "loss": 3.1235, + "theoretical_loss": 4.118408591356138, + "tokens_seen": 317930496 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045638916750250755, + "loss": 2.7607, + "theoretical_loss": 4.118309936112027, + "tokens_seen": 317996032 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045637913741223673, + "loss": 3.1125, + "theoretical_loss": 4.118211306889343, + "tokens_seen": 318061568 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004563691073219659, + "loss": 3.1064, + "theoretical_loss": 4.1181127036758625, + "tokens_seen": 318127104 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045635907723169515, + "loss": 3.0764, + "theoretical_loss": 4.118014126459371, + "tokens_seen": 318192640 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045634904714142427, + "loss": 3.127, + "theoretical_loss": 4.11791557522766, + "tokens_seen": 318258176 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004563390170511535, + "loss": 3.0333, + "theoretical_loss": 4.117817049968535, + "tokens_seen": 318323712 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045632898696088263, + "loss": 3.21, + "theoretical_loss": 4.117718550669802, + "tokens_seen": 318389248 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045631895687061187, + "loss": 3.0226, + "theoretical_loss": 4.117620077319282, + "tokens_seen": 318454784 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045630892678034105, + "loss": 3.2592, + "theoretical_loss": 4.117521629904801, + "tokens_seen": 318520320 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045629889669007023, + "loss": 2.9392, + "theoretical_loss": 4.117423208414193, + "tokens_seen": 318585856 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004562888665997994, + "loss": 3.0316, + "theoretical_loss": 4.117324812835301, + "tokens_seen": 318651392 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004562788365095286, + "loss": 3.1168, + "theoretical_loss": 4.117226443155976, + "tokens_seen": 318716928 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004562688064192578, + "loss": 3.0642, + "theoretical_loss": 4.117128099364078, + "tokens_seen": 318782464 + }, + { + "epoch": 1.01, + "learning_rate": 0.000456258776328987, + "loss": 2.8883, + "theoretical_loss": 4.1170297814474734, + "tokens_seen": 318848000 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045624874623871614, + "loss": 3.0011, + "theoretical_loss": 4.116931489394039, + "tokens_seen": 318913536 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004562387161484454, + "loss": 2.9764, + "theoretical_loss": 4.116833223191657, + "tokens_seen": 318979072 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004562286860581745, + "loss": 2.9898, + "theoretical_loss": 4.116734982828222, + "tokens_seen": 319044608 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045621865596790374, + "loss": 3.1809, + "theoretical_loss": 4.116636768291631, + "tokens_seen": 319110144 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004562086258776329, + "loss": 3.1512, + "theoretical_loss": 4.116538579569793, + "tokens_seen": 319175680 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004561985957873621, + "loss": 3.0586, + "theoretical_loss": 4.116440416650627, + "tokens_seen": 319241216 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004561885656970913, + "loss": 3.0317, + "theoretical_loss": 4.116342279522055, + "tokens_seen": 319306752 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004561785356068205, + "loss": 3.145, + "theoretical_loss": 4.11624416817201, + "tokens_seen": 319372288 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045616850551654964, + "loss": 2.9133, + "theoretical_loss": 4.1161460825884335, + "tokens_seen": 319437824 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 450671, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.054537773132324, + "objective/train/theoretical_loss": 4.116072535302745, + "objective/train/tokens_used": 339946976, + "theoretical_loss": 4.116072535302745, + "tokens_seen": 319486976 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004561584754262789, + "loss": 2.8832, + "theoretical_loss": 4.116048022759274, + "tokens_seen": 319503360 + }, + { + "epoch": 1.01, + "learning_rate": 0.000456148445336008, + "loss": 3.0356, + "theoretical_loss": 4.115949988672488, + "tokens_seen": 319568896 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045613841524573724, + "loss": 3.0226, + "theoretical_loss": 4.115851980316042, + "tokens_seen": 319634432 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004561283851554664, + "loss": 2.753, + "theoretical_loss": 4.115753997677907, + "tokens_seen": 319699968 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004561183550651956, + "loss": 3.0136, + "theoretical_loss": 4.115656040746067, + "tokens_seen": 319765504 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004561083249749248, + "loss": 3.0264, + "theoretical_loss": 4.11555810950851, + "tokens_seen": 319831040 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045609829488465396, + "loss": 3.0399, + "theoretical_loss": 4.1154602039532335, + "tokens_seen": 319896576 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045608826479438314, + "loss": 2.8733, + "theoretical_loss": 4.115362324068242, + "tokens_seen": 319962112 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004560782347041124, + "loss": 3.123, + "theoretical_loss": 4.115264469841551, + "tokens_seen": 320027648 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004560682046138415, + "loss": 3.013, + "theoretical_loss": 4.115166641261181, + "tokens_seen": 320093184 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045605817452357074, + "loss": 3.0235, + "theoretical_loss": 4.115068838315162, + "tokens_seen": 320158720 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045604814443329987, + "loss": 2.9962, + "theoretical_loss": 4.1149710609915315, + "tokens_seen": 320224256 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004560381143430291, + "loss": 3.0287, + "theoretical_loss": 4.114873309278335, + "tokens_seen": 320289792 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004560280842527583, + "loss": 2.9697, + "theoretical_loss": 4.114775583163627, + "tokens_seen": 320355328 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045601805416248747, + "loss": 2.974, + "theoretical_loss": 4.114677882635468, + "tokens_seen": 320420864 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045600802407221665, + "loss": 3.1124, + "theoretical_loss": 4.114580207681929, + "tokens_seen": 320486400 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004559979939819459, + "loss": 3.2231, + "theoretical_loss": 4.114482558291087, + "tokens_seen": 320551936 + }, + { + "epoch": 1.01, + "learning_rate": 0.000455987963891675, + "loss": 2.8844, + "theoretical_loss": 4.114384934451029, + "tokens_seen": 320617472 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045597793380140425, + "loss": 3.105, + "theoretical_loss": 4.114287336149847, + "tokens_seen": 320683008 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045596790371113337, + "loss": 2.9922, + "theoretical_loss": 4.114189763375643, + "tokens_seen": 320748544 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004559578736208626, + "loss": 3.102, + "theoretical_loss": 4.1140922161165285, + "tokens_seen": 320814080 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004559478435305918, + "loss": 3.1323, + "theoretical_loss": 4.11399469436062, + "tokens_seen": 320879616 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045593781344032097, + "loss": 2.9403, + "theoretical_loss": 4.113897198096042, + "tokens_seen": 320945152 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045592778335005015, + "loss": 3.1139, + "theoretical_loss": 4.11379972731093, + "tokens_seen": 321010688 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045591775325977933, + "loss": 2.9772, + "theoretical_loss": 4.113702281993424, + "tokens_seen": 321076224 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 452128, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.911926031112671, + "objective/train/theoretical_loss": 4.113629214711283, + "objective/train/tokens_used": 341585376, + "theoretical_loss": 4.113629214711283, + "tokens_seen": 321125376 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004559077231695085, + "loss": 2.9501, + "theoretical_loss": 4.113604862131675, + "tokens_seen": 321141760 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045589769307923775, + "loss": 2.9992, + "theoretical_loss": 4.113507467713839, + "tokens_seen": 321207296 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004558876629889669, + "loss": 3.1123, + "theoretical_loss": 4.113410098728081, + "tokens_seen": 321272832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004558776328986961, + "loss": 3.1029, + "theoretical_loss": 4.1133127551625766, + "tokens_seen": 321338368 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045586760280842524, + "loss": 3.1395, + "theoretical_loss": 4.113215437005504, + "tokens_seen": 321403904 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004558575727181545, + "loss": 2.9044, + "theoretical_loss": 4.113118144245054, + "tokens_seen": 321469440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045584754262788365, + "loss": 3.1156, + "theoretical_loss": 4.113020876869424, + "tokens_seen": 321534976 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045583751253761284, + "loss": 3.1783, + "theoretical_loss": 4.112923634866817, + "tokens_seen": 321600512 + }, + { + "epoch": 1.01, + "learning_rate": 0.000455827482447342, + "loss": 3.0768, + "theoretical_loss": 4.112826418225447, + "tokens_seen": 321666048 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045581745235707125, + "loss": 2.8923, + "theoretical_loss": 4.112729226933536, + "tokens_seen": 321731584 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004558074222668004, + "loss": 2.8801, + "theoretical_loss": 4.112632060979309, + "tokens_seen": 321797120 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004557973921765296, + "loss": 2.9205, + "theoretical_loss": 4.112534920351004, + "tokens_seen": 321862656 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045578736208625874, + "loss": 2.9779, + "theoretical_loss": 4.112437805036866, + "tokens_seen": 321928192 + }, + { + "epoch": 1.01, + "learning_rate": 0.000455777331995988, + "loss": 3.0556, + "theoretical_loss": 4.112340715025147, + "tokens_seen": 321993728 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045576730190571716, + "loss": 2.9769, + "theoretical_loss": 4.112243650304106, + "tokens_seen": 322059264 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045575727181544634, + "loss": 3.1575, + "theoretical_loss": 4.11214661086201, + "tokens_seen": 322124800 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004557472417251755, + "loss": 3.2281, + "theoretical_loss": 4.112049596687136, + "tokens_seen": 322190336 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004557372116349047, + "loss": 3.2923, + "theoretical_loss": 4.111952607767767, + "tokens_seen": 322255872 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004557271815446339, + "loss": 3.1374, + "theoretical_loss": 4.111855644092194, + "tokens_seen": 322321408 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004557171514543631, + "loss": 2.9752, + "theoretical_loss": 4.111758705648716, + "tokens_seen": 322386944 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045570712136409224, + "loss": 2.9913, + "theoretical_loss": 4.11166179242564, + "tokens_seen": 322452480 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004556970912738215, + "loss": 2.955, + "theoretical_loss": 4.11156490441128, + "tokens_seen": 322518016 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004556870611835507, + "loss": 3.1211, + "theoretical_loss": 4.111468041593958, + "tokens_seen": 322583552 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045567703109327984, + "loss": 2.7598, + "theoretical_loss": 4.111371203962006, + "tokens_seen": 322649088 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004556670010030091, + "loss": 3.1663, + "theoretical_loss": 4.11127439150376, + "tokens_seen": 322714624 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 452464, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7435483932495117, + "objective/train/theoretical_loss": 4.111201798673309, + "objective/train/tokens_used": 343223776, + "theoretical_loss": 4.111201798673309, + "tokens_seen": 322763776 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004556569709127382, + "loss": 3.0986, + "theoretical_loss": 4.111177604207566, + "tokens_seen": 322780160 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045564694082246744, + "loss": 3.123, + "theoretical_loss": 4.111080842061779, + "tokens_seen": 322845696 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004556369107321966, + "loss": 2.9448, + "theoretical_loss": 4.110984105054758, + "tokens_seen": 322911232 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004556268806419258, + "loss": 2.6903, + "theoretical_loss": 4.110887393174874, + "tokens_seen": 322976768 + }, + { + "epoch": 1.01, + "learning_rate": 0.000455616850551655, + "loss": 3.3049, + "theoretical_loss": 4.110790706410502, + "tokens_seen": 323042304 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045560682046138416, + "loss": 3.0815, + "theoretical_loss": 4.110694044750028, + "tokens_seen": 323107840 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045559679037111334, + "loss": 3.0381, + "theoretical_loss": 4.110597408181842, + "tokens_seen": 323173376 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004555867602808426, + "loss": 3.1317, + "theoretical_loss": 4.110500796694346, + "tokens_seen": 323238912 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004555767301905717, + "loss": 2.9622, + "theoretical_loss": 4.110404210275947, + "tokens_seen": 323304448 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045556670010030094, + "loss": 3.0428, + "theoretical_loss": 4.110307648915059, + "tokens_seen": 323369984 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045555667001003007, + "loss": 3.0631, + "theoretical_loss": 4.110211112600107, + "tokens_seen": 323435520 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004555466399197593, + "loss": 3.0437, + "theoretical_loss": 4.11011460131952, + "tokens_seen": 323501056 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004555366098294885, + "loss": 3.08, + "theoretical_loss": 4.110018115061737, + "tokens_seen": 323566592 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045552657973921767, + "loss": 2.8587, + "theoretical_loss": 4.109921653815205, + "tokens_seen": 323632128 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045551654964894685, + "loss": 2.8589, + "theoretical_loss": 4.109825217568375, + "tokens_seen": 323697664 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004555065195586761, + "loss": 2.9758, + "theoretical_loss": 4.109728806309711, + "tokens_seen": 323763200 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004554964894684052, + "loss": 3.0621, + "theoretical_loss": 4.1096324200276815, + "tokens_seen": 323828736 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045548645937813445, + "loss": 3.2006, + "theoretical_loss": 4.109536058710763, + "tokens_seen": 323894272 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045547642928786357, + "loss": 3.2291, + "theoretical_loss": 4.109439722347439, + "tokens_seen": 323959808 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004554663991975928, + "loss": 3.0339, + "theoretical_loss": 4.109343410926202, + "tokens_seen": 324025344 + }, + { + "epoch": 1.01, + "learning_rate": 0.000455456369107322, + "loss": 3.1038, + "theoretical_loss": 4.109247124435552, + "tokens_seen": 324090880 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045544633901705117, + "loss": 3.2273, + "theoretical_loss": 4.109150862863997, + "tokens_seen": 324156416 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045543630892678035, + "loss": 3.0505, + "theoretical_loss": 4.1090546262000505, + "tokens_seen": 324221952 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045542627883650953, + "loss": 3.101, + "theoretical_loss": 4.108958414432235, + "tokens_seen": 324287488 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004554162487462387, + "loss": 3.1216, + "theoretical_loss": 4.10886222754908, + "tokens_seen": 324353024 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 453833, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.195937395095825, + "objective/train/theoretical_loss": 4.108790103710379, + "objective/train/tokens_used": 344862176, + "theoretical_loss": 4.108790103710379, + "tokens_seen": 324402176 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045540621865596795, + "loss": 3.2008, + "theoretical_loss": 4.108766065539125, + "tokens_seen": 324418560 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004553961885656971, + "loss": 3.0328, + "theoretical_loss": 4.108669928390915, + "tokens_seen": 324484096 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004553861584754263, + "loss": 3.0984, + "theoretical_loss": 4.108573816093003, + "tokens_seen": 324549632 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045537612838515544, + "loss": 2.8293, + "theoretical_loss": 4.108477728633948, + "tokens_seen": 324615168 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004553660982948847, + "loss": 3.2014, + "theoretical_loss": 4.108381666002318, + "tokens_seen": 324680704 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045535606820461385, + "loss": 3.0183, + "theoretical_loss": 4.10828562818669, + "tokens_seen": 324746240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045534603811434304, + "loss": 3.1596, + "theoretical_loss": 4.108189615175646, + "tokens_seen": 324811776 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004553360080240722, + "loss": 2.7589, + "theoretical_loss": 4.1080936269577775, + "tokens_seen": 324877312 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045532597793380145, + "loss": 2.8328, + "theoretical_loss": 4.107997663521683, + "tokens_seen": 324942848 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004553159478435306, + "loss": 2.7061, + "theoretical_loss": 4.107901724855966, + "tokens_seen": 325008384 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004553059177532598, + "loss": 2.8632, + "theoretical_loss": 4.107805810949244, + "tokens_seen": 325073920 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045529588766298894, + "loss": 2.9472, + "theoretical_loss": 4.107709921790134, + "tokens_seen": 325139456 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004552858575727182, + "loss": 2.9933, + "theoretical_loss": 4.1076140573672655, + "tokens_seen": 325204992 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045527582748244736, + "loss": 3.0292, + "theoretical_loss": 4.107518217669275, + "tokens_seen": 325270528 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045526579739217654, + "loss": 2.9263, + "theoretical_loss": 4.107422402684806, + "tokens_seen": 325336064 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004552557673019057, + "loss": 3.1346, + "theoretical_loss": 4.107326612402509, + "tokens_seen": 325401600 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004552457372116349, + "loss": 3.1047, + "theoretical_loss": 4.107230846811042, + "tokens_seen": 325467136 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004552357071213641, + "loss": 3.0838, + "theoretical_loss": 4.107135105899072, + "tokens_seen": 325532672 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004552256770310933, + "loss": 3.2631, + "theoretical_loss": 4.107039389655271, + "tokens_seen": 325598208 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045521564694082244, + "loss": 2.9412, + "theoretical_loss": 4.106943698068321, + "tokens_seen": 325663744 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004552056168505517, + "loss": 3.1445, + "theoretical_loss": 4.106848031126909, + "tokens_seen": 325729280 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004551955867602808, + "loss": 2.9177, + "theoretical_loss": 4.106752388819732, + "tokens_seen": 325794816 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045518555667001004, + "loss": 2.8788, + "theoretical_loss": 4.106656771135494, + "tokens_seen": 325860352 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004551755265797392, + "loss": 3.0533, + "theoretical_loss": 4.106561178062903, + "tokens_seen": 325925888 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004551654964894684, + "loss": 3.2877, + "theoretical_loss": 4.10646560959068, + "tokens_seen": 325991424 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 455028, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.201298236846924, + "objective/train/theoretical_loss": 4.10639394937372, + "objective/train/tokens_used": 346500576, + "theoretical_loss": 4.10639394937372, + "tokens_seen": 326040576 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004551554663991976, + "loss": 3.1642, + "theoretical_loss": 4.106370065707549, + "tokens_seen": 326056960 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004551454363089268, + "loss": 2.8599, + "theoretical_loss": 4.1062745464022425, + "tokens_seen": 326122496 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045513540621865595, + "loss": 2.9966, + "theoretical_loss": 4.106179051663502, + "tokens_seen": 326188032 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004551253761283852, + "loss": 3.0048, + "theoretical_loss": 4.106083581480076, + "tokens_seen": 326253568 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004551153460381143, + "loss": 3.1238, + "theoretical_loss": 4.105988135840717, + "tokens_seen": 326319104 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045510531594784354, + "loss": 2.9494, + "theoretical_loss": 4.105892714734191, + "tokens_seen": 326384640 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004550952858575727, + "loss": 2.8344, + "theoretical_loss": 4.105797318149266, + "tokens_seen": 326450176 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004550852557673019, + "loss": 3.2039, + "theoretical_loss": 4.105701946074721, + "tokens_seen": 326515712 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004550752256770311, + "loss": 3.1371, + "theoretical_loss": 4.105606598499339, + "tokens_seen": 326581248 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045506519558676027, + "loss": 3.0273, + "theoretical_loss": 4.105511275411914, + "tokens_seen": 326646784 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045505516549648945, + "loss": 2.9177, + "theoretical_loss": 4.105415976801243, + "tokens_seen": 326712320 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004550451354062187, + "loss": 3.1699, + "theoretical_loss": 4.105320702656137, + "tokens_seen": 326777856 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004550351053159478, + "loss": 3.0427, + "theoretical_loss": 4.105225452965406, + "tokens_seen": 326843392 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045502507522567705, + "loss": 3.0283, + "theoretical_loss": 4.105130227717876, + "tokens_seen": 326908928 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045501504513540623, + "loss": 3.0798, + "theoretical_loss": 4.105035026902372, + "tokens_seen": 326974464 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004550050150451354, + "loss": 3.1388, + "theoretical_loss": 4.104939850507733, + "tokens_seen": 327040000 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004549949849548646, + "loss": 2.9548, + "theoretical_loss": 4.104844698522801, + "tokens_seen": 327105536 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045498495486459377, + "loss": 3.1927, + "theoretical_loss": 4.104749570936429, + "tokens_seen": 327171072 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045497492477432295, + "loss": 2.939, + "theoretical_loss": 4.104654467737474, + "tokens_seen": 327236608 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004549648946840522, + "loss": 3.1495, + "theoretical_loss": 4.104559388914802, + "tokens_seen": 327302144 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045495486459378137, + "loss": 3.1292, + "theoretical_loss": 4.104464334457286, + "tokens_seen": 327367680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045494483450351055, + "loss": 3.1137, + "theoretical_loss": 4.104369304353806, + "tokens_seen": 327433216 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045493480441323973, + "loss": 3.1885, + "theoretical_loss": 4.104274298593252, + "tokens_seen": 327498752 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004549247743229689, + "loss": 3.1253, + "theoretical_loss": 4.1041793171645145, + "tokens_seen": 327564288 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045491474423269815, + "loss": 2.9915, + "theoretical_loss": 4.104084360056499, + "tokens_seen": 327629824 + }, + { + "debugging/Self-BLEU-5": 0.6136948618909069, + "debugging/distinct-1-grams": 0.7435805096055491, + "debugging/distinct-2-grams": 0.8810311841604012, + "debugging/entropy-1-grams": 6.325230867052365, + "debugging/entropy-2-grams": 7.6356135460409735, + "debugging/length": 511.3030303030303, + "debugging/num_segments": 33, + "debugging/score": 0.0005877098914164371, + "debugging/score_std": 0.0013283427361280467, + "epoch": 1.01, + "objective/train/docs_used": 455830, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0165796279907227, + "objective/train/theoretical_loss": 4.1040131581792885, + "objective/train/tokens_used": 348138976, + "theoretical_loss": 4.1040131581792885, + "tokens_seen": 327678976 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004549047141424273, + "loss": 2.8767, + "theoretical_loss": 4.103989427258114, + "tokens_seen": 327695360 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004548946840521565, + "loss": 3.0342, + "theoretical_loss": 4.103894518758277, + "tokens_seen": 327760896 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045488465396188564, + "loss": 3.0731, + "theoretical_loss": 4.1037996345459105, + "tokens_seen": 327826432 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004548746238716149, + "loss": 3.1697, + "theoretical_loss": 4.103704774609946, + "tokens_seen": 327891968 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045486459378134405, + "loss": 3.1106, + "theoretical_loss": 4.103609938939324, + "tokens_seen": 327957504 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045485456369107324, + "loss": 3.1378, + "theoretical_loss": 4.103515127522988, + "tokens_seen": 328023040 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004548445336008024, + "loss": 2.9621, + "theoretical_loss": 4.103420340349893, + "tokens_seen": 328088576 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045483450351053165, + "loss": 2.9669, + "theoretical_loss": 4.103325577408997, + "tokens_seen": 328154112 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004548244734202608, + "loss": 3.0197, + "theoretical_loss": 4.103230838689269, + "tokens_seen": 328219648 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045481444332999, + "loss": 3.0518, + "theoretical_loss": 4.103136124179684, + "tokens_seen": 328285184 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045480441323971914, + "loss": 3.2399, + "theoretical_loss": 4.103041433869223, + "tokens_seen": 328350720 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004547943831494484, + "loss": 3.0063, + "theoretical_loss": 4.102946767746875, + "tokens_seen": 328416256 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045478435305917756, + "loss": 2.907, + "theoretical_loss": 4.102852125801638, + "tokens_seen": 328481792 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045477432296890674, + "loss": 3.197, + "theoretical_loss": 4.102757508022513, + "tokens_seen": 328547328 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004547642928786359, + "loss": 3.2354, + "theoretical_loss": 4.102662914398513, + "tokens_seen": 328612864 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004547542627883651, + "loss": 2.9079, + "theoretical_loss": 4.102568344918655, + "tokens_seen": 328678400 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004547442326980943, + "loss": 2.8512, + "theoretical_loss": 4.102473799571964, + "tokens_seen": 328743936 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004547342026078235, + "loss": 3.0689, + "theoretical_loss": 4.102379278347472, + "tokens_seen": 328809472 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045472417251755264, + "loss": 2.9659, + "theoretical_loss": 4.10228478123422, + "tokens_seen": 328875008 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004547141424272819, + "loss": 3.2956, + "theoretical_loss": 4.102190308221253, + "tokens_seen": 328940544 + }, + { + "epoch": 1.01, + "learning_rate": 0.000454704112337011, + "loss": 3.0001, + "theoretical_loss": 4.102095859297625, + "tokens_seen": 329006080 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045469408224674024, + "loss": 3.0739, + "theoretical_loss": 4.102001434452398, + "tokens_seen": 329071616 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004546840521564694, + "loss": 3.0976, + "theoretical_loss": 4.101907033674639, + "tokens_seen": 329137152 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004546740220661986, + "loss": 3.0629, + "theoretical_loss": 4.101812656953424, + "tokens_seen": 329202688 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004546639919759278, + "loss": 2.9582, + "theoretical_loss": 4.101718304277834, + "tokens_seen": 329268224 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 456654, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.36710524559021, + "objective/train/theoretical_loss": 4.10164755554452, + "objective/train/tokens_used": 349777376, + "theoretical_loss": 4.10164755554452, + "tokens_seen": 329317376 + }, + { + "epoch": 1.01, + "learning_rate": 0.000454653961885657, + "loss": 2.9395, + "theoretical_loss": 4.10162397563696, + "tokens_seen": 329333760 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045464393179538615, + "loss": 3.1366, + "theoretical_loss": 4.101529671019898, + "tokens_seen": 329399296 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004546339017051154, + "loss": 2.7547, + "theoretical_loss": 4.101435390415752, + "tokens_seen": 329464832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004546238716148445, + "loss": 3.222, + "theoretical_loss": 4.101341133813632, + "tokens_seen": 329530368 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045461384152457374, + "loss": 2.9597, + "theoretical_loss": 4.101246901202655, + "tokens_seen": 329595904 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004546038114343029, + "loss": 3.0324, + "theoretical_loss": 4.101152692571949, + "tokens_seen": 329661440 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004545937813440321, + "loss": 3.1991, + "theoretical_loss": 4.101058507910645, + "tokens_seen": 329726976 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004545837512537613, + "loss": 3.0136, + "theoretical_loss": 4.100964347207881, + "tokens_seen": 329792512 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045457372116349047, + "loss": 3.1253, + "theoretical_loss": 4.100870210452804, + "tokens_seen": 329858048 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045456369107321965, + "loss": 3.1968, + "theoretical_loss": 4.100776097634567, + "tokens_seen": 329923584 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004545536609829489, + "loss": 3.0509, + "theoretical_loss": 4.100682008742331, + "tokens_seen": 329989120 + }, + { + "epoch": 1.01, + "learning_rate": 0.000454543630892678, + "loss": 2.961, + "theoretical_loss": 4.100587943765264, + "tokens_seen": 330054656 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045453360080240725, + "loss": 2.9026, + "theoretical_loss": 4.100493902692539, + "tokens_seen": 330120192 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045452357071213643, + "loss": 3.0556, + "theoretical_loss": 4.100399885513339, + "tokens_seen": 330185728 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004545135406218656, + "loss": 3.03, + "theoretical_loss": 4.10030589221685, + "tokens_seen": 330251264 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004545035105315948, + "loss": 2.9057, + "theoretical_loss": 4.100211922792271, + "tokens_seen": 330316800 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045449348044132397, + "loss": 3.1829, + "theoretical_loss": 4.100117977228804, + "tokens_seen": 330382336 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045448345035105315, + "loss": 2.9077, + "theoretical_loss": 4.1000240555156555, + "tokens_seen": 330447872 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004544734202607824, + "loss": 3.0517, + "theoretical_loss": 4.099930157642047, + "tokens_seen": 330513408 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004544633901705115, + "loss": 2.9146, + "theoretical_loss": 4.099836283597199, + "tokens_seen": 330578944 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045445336008024075, + "loss": 2.989, + "theoretical_loss": 4.099742433370342, + "tokens_seen": 330644480 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004544433299899699, + "loss": 2.8763, + "theoretical_loss": 4.0996486069507165, + "tokens_seen": 330710016 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004544332998996991, + "loss": 2.9498, + "theoretical_loss": 4.099554804327565, + "tokens_seen": 330775552 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004544232698094283, + "loss": 2.8131, + "theoretical_loss": 4.09946102549014, + "tokens_seen": 330841088 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004544132397191575, + "loss": 2.9849, + "theoretical_loss": 4.099367270427699, + "tokens_seen": 330906624 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 458161, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8100082874298096, + "objective/train/theoretical_loss": 4.099296969726746, + "objective/train/tokens_used": 351415776, + "theoretical_loss": 4.099296969726746, + "tokens_seen": 330955776 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045440320962888666, + "loss": 2.9439, + "theoretical_loss": 4.099273539129509, + "tokens_seen": 330972160 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045439317953861584, + "loss": 3.0252, + "theoretical_loss": 4.099179831584843, + "tokens_seen": 331037696 + }, + { + "epoch": 1.01, + "learning_rate": 0.000454383149448345, + "loss": 3.0193, + "theoretical_loss": 4.09908614778298, + "tokens_seen": 331103232 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045437311935807425, + "loss": 2.9958, + "theoretical_loss": 4.098992487713207, + "tokens_seen": 331168768 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004543630892678034, + "loss": 3.0734, + "theoretical_loss": 4.098898851364815, + "tokens_seen": 331234304 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004543530591775326, + "loss": 2.9824, + "theoretical_loss": 4.098805238727109, + "tokens_seen": 331299840 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004543430290872618, + "loss": 3.2124, + "theoretical_loss": 4.098711649789392, + "tokens_seen": 331365376 + }, + { + "epoch": 1.01, + "learning_rate": 0.000454332998996991, + "loss": 2.8041, + "theoretical_loss": 4.098618084540981, + "tokens_seen": 331430912 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045432296890672016, + "loss": 2.8408, + "theoretical_loss": 4.098524542971197, + "tokens_seen": 331496448 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045431293881644934, + "loss": 3.1551, + "theoretical_loss": 4.098431025069368, + "tokens_seen": 331561984 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004543029087261785, + "loss": 3.0807, + "theoretical_loss": 4.098337530824828, + "tokens_seen": 331627520 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045429287863590776, + "loss": 2.9272, + "theoretical_loss": 4.09824406022692, + "tokens_seen": 331693056 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004542828485456369, + "loss": 3.0841, + "theoretical_loss": 4.098150613264993, + "tokens_seen": 331758592 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004542728184553661, + "loss": 3.2841, + "theoretical_loss": 4.0980571899284035, + "tokens_seen": 331824128 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045426278836509525, + "loss": 2.9232, + "theoretical_loss": 4.097963790206513, + "tokens_seen": 331889664 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004542527582748245, + "loss": 2.9302, + "theoretical_loss": 4.097870414088691, + "tokens_seen": 331955200 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045424272818455366, + "loss": 3.2363, + "theoretical_loss": 4.097777061564315, + "tokens_seen": 332020736 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045423269809428284, + "loss": 3.1058, + "theoretical_loss": 4.097683732622768, + "tokens_seen": 332086272 + }, + { + "epoch": 1.01, + "learning_rate": 0.000454222668004012, + "loss": 2.9857, + "theoretical_loss": 4.09759042725344, + "tokens_seen": 332151808 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004542126379137412, + "loss": 3.1798, + "theoretical_loss": 4.097497145445729, + "tokens_seen": 332217344 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045420260782347044, + "loss": 2.9651, + "theoretical_loss": 4.097403887189038, + "tokens_seen": 332282880 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004541925777331996, + "loss": 3.1198, + "theoretical_loss": 4.097310652472778, + "tokens_seen": 332348416 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004541825476429288, + "loss": 2.9496, + "theoretical_loss": 4.097217441286367, + "tokens_seen": 332413952 + }, + { + "epoch": 1.01, + "learning_rate": 0.000454172517552658, + "loss": 3.0268, + "theoretical_loss": 4.09712425361923, + "tokens_seen": 332479488 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004541624874623872, + "loss": 2.9316, + "theoretical_loss": 4.097031089460796, + "tokens_seen": 332545024 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 459492, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3007261753082275, + "objective/train/theoretical_loss": 4.096961231763205, + "objective/train/tokens_used": 353054176, + "theoretical_loss": 4.096961231763205, + "tokens_seen": 332594176 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045415245737211635, + "loss": 3.137, + "theoretical_loss": 4.096937948800506, + "tokens_seen": 332610560 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004541424272818456, + "loss": 3.1584, + "theoretical_loss": 4.096844831627804, + "tokens_seen": 332676096 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004541323971915747, + "loss": 3.3038, + "theoretical_loss": 4.096751737932141, + "tokens_seen": 332741632 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045412236710130395, + "loss": 2.9974, + "theoretical_loss": 4.096658667702978, + "tokens_seen": 332807168 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004541123370110331, + "loss": 2.7968, + "theoretical_loss": 4.096565620929778, + "tokens_seen": 332872704 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004541023069207623, + "loss": 2.9963, + "theoretical_loss": 4.096472597602014, + "tokens_seen": 332938240 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004540922768304915, + "loss": 3.0869, + "theoretical_loss": 4.096379597709166, + "tokens_seen": 333003776 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045408224674022067, + "loss": 3.1367, + "theoretical_loss": 4.0962866212407185, + "tokens_seen": 333069312 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045407221664994985, + "loss": 2.9563, + "theoretical_loss": 4.096193668186165, + "tokens_seen": 333134848 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004540621865596791, + "loss": 2.7802, + "theoretical_loss": 4.096100738535004, + "tokens_seen": 333200384 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004540521564694082, + "loss": 3.09, + "theoretical_loss": 4.096007832276742, + "tokens_seen": 333265920 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045404212637913745, + "loss": 2.9668, + "theoretical_loss": 4.0959149494008935, + "tokens_seen": 333331456 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045403209628886663, + "loss": 3.0414, + "theoretical_loss": 4.095822089896976, + "tokens_seen": 333396992 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004540220661985958, + "loss": 2.8439, + "theoretical_loss": 4.0957292537545165, + "tokens_seen": 333462528 + }, + { + "epoch": 1.01, + "learning_rate": 0.000454012036108325, + "loss": 3.1205, + "theoretical_loss": 4.09563644096305, + "tokens_seen": 333528064 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045400200601805417, + "loss": 3.0406, + "theoretical_loss": 4.095543651512115, + "tokens_seen": 333593600 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045399197592778335, + "loss": 3.1992, + "theoretical_loss": 4.095450885391257, + "tokens_seen": 333659136 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004539819458375126, + "loss": 2.9049, + "theoretical_loss": 4.095358142590031, + "tokens_seen": 333724672 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004539719157472417, + "loss": 2.8475, + "theoretical_loss": 4.095265423097998, + "tokens_seen": 333790208 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045396188565697095, + "loss": 3.0034, + "theoretical_loss": 4.095172726904723, + "tokens_seen": 333855744 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004539518555667001, + "loss": 3.0068, + "theoretical_loss": 4.09508005399978, + "tokens_seen": 333921280 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004539418254764293, + "loss": 2.9264, + "theoretical_loss": 4.094987404372751, + "tokens_seen": 333986816 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004539317953861585, + "loss": 2.9861, + "theoretical_loss": 4.094894778013221, + "tokens_seen": 334052352 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004539217652958877, + "loss": 2.9869, + "theoretical_loss": 4.094802174910784, + "tokens_seen": 334117888 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045391173520561686, + "loss": 3.2001, + "theoretical_loss": 4.094709595055042, + "tokens_seen": 334183424 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 460159, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9135870933532715, + "objective/train/theoretical_loss": 4.094640175412626, + "objective/train/tokens_used": 354692576, + "theoretical_loss": 4.094640175412626, + "tokens_seen": 334232576 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045390170511534604, + "loss": 3.0121, + "theoretical_loss": 4.0946170384356, + "tokens_seen": 334248960 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004538916750250752, + "loss": 2.7814, + "theoretical_loss": 4.094524505042074, + "tokens_seen": 334314496 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045388164493480445, + "loss": 2.9696, + "theoretical_loss": 4.094431994864082, + "tokens_seen": 334380032 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004538716148445336, + "loss": 2.9823, + "theoretical_loss": 4.094339507891252, + "tokens_seen": 334445568 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004538615847542628, + "loss": 3.209, + "theoretical_loss": 4.094247044113219, + "tokens_seen": 334511104 + }, + { + "epoch": 1.01, + "learning_rate": 0.000453851554663992, + "loss": 3.0497, + "theoretical_loss": 4.094154603519623, + "tokens_seen": 334576640 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004538415245737212, + "loss": 3.0694, + "theoretical_loss": 4.0940621861001105, + "tokens_seen": 334642176 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045383149448345036, + "loss": 2.9195, + "theoretical_loss": 4.093969791844335, + "tokens_seen": 334707712 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045382146439317954, + "loss": 2.8378, + "theoretical_loss": 4.093877420741958, + "tokens_seen": 334773248 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004538114343029087, + "loss": 3.0146, + "theoretical_loss": 4.093785072782646, + "tokens_seen": 334838784 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045380140421263796, + "loss": 3.0526, + "theoretical_loss": 4.093692747956072, + "tokens_seen": 334904320 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004537913741223671, + "loss": 3.1662, + "theoretical_loss": 4.0936004462519175, + "tokens_seen": 334969856 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004537813440320963, + "loss": 2.9986, + "theoretical_loss": 4.093508167659869, + "tokens_seen": 335035392 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045377131394182545, + "loss": 3.1044, + "theoretical_loss": 4.09341591216962, + "tokens_seen": 335100928 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004537612838515547, + "loss": 2.893, + "theoretical_loss": 4.0933236797708705, + "tokens_seen": 335166464 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045375125376128386, + "loss": 2.9955, + "theoretical_loss": 4.093231470453327, + "tokens_seen": 335232000 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045374122367101304, + "loss": 3.1863, + "theoretical_loss": 4.093139284206703, + "tokens_seen": 335297536 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004537311935807422, + "loss": 3.2446, + "theoretical_loss": 4.09304712102072, + "tokens_seen": 335363072 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004537211634904714, + "loss": 3.3968, + "theoretical_loss": 4.092954980885102, + "tokens_seen": 335428608 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004537111334002006, + "loss": 3.0159, + "theoretical_loss": 4.092862863789584, + "tokens_seen": 335494144 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004537011033099298, + "loss": 3.1544, + "theoretical_loss": 4.092770769723905, + "tokens_seen": 335559680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045369107321965895, + "loss": 3.0962, + "theoretical_loss": 4.092678698677811, + "tokens_seen": 335625216 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004536810431293882, + "loss": 3.0672, + "theoretical_loss": 4.0925866506410555, + "tokens_seen": 335690752 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045367101303911737, + "loss": 3.2044, + "theoretical_loss": 4.092494625603397, + "tokens_seen": 335756288 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045366098294884655, + "loss": 2.981, + "theoretical_loss": 4.0924026235546025, + "tokens_seen": 335821824 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 460945, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6775364875793457, + "objective/train/theoretical_loss": 4.092333637098296, + "objective/train/tokens_used": 356330976, + "theoretical_loss": 4.092333637098296, + "tokens_seen": 335870976 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045365095285857573, + "loss": 2.8384, + "theoretical_loss": 4.092310644484444, + "tokens_seen": 335887360 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004536409227683049, + "loss": 2.7994, + "theoretical_loss": 4.092218688382701, + "tokens_seen": 335952896 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004536308926780341, + "loss": 2.7465, + "theoretical_loss": 4.092126755239159, + "tokens_seen": 336018432 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004536208625877633, + "loss": 3.0426, + "theoretical_loss": 4.092034845043608, + "tokens_seen": 336083968 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045361083249749245, + "loss": 3.0685, + "theoretical_loss": 4.09194295778585, + "tokens_seen": 336149504 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004536008024072217, + "loss": 2.9471, + "theoretical_loss": 4.091851093455689, + "tokens_seen": 336215040 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004535907723169508, + "loss": 2.9211, + "theoretical_loss": 4.091759252042936, + "tokens_seen": 336280576 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045358074222668005, + "loss": 2.8346, + "theoretical_loss": 4.091667433537408, + "tokens_seen": 336346112 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045357071213640923, + "loss": 2.8476, + "theoretical_loss": 4.0915756379289325, + "tokens_seen": 336411648 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004535606820461384, + "loss": 2.9507, + "theoretical_loss": 4.09148386520734, + "tokens_seen": 336477184 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004535506519558676, + "loss": 3.0561, + "theoretical_loss": 4.091392115362467, + "tokens_seen": 336542720 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045354062186559683, + "loss": 2.7804, + "theoretical_loss": 4.091300388384158, + "tokens_seen": 336608256 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045353059177532596, + "loss": 3.1013, + "theoretical_loss": 4.091208684262265, + "tokens_seen": 336673792 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004535205616850552, + "loss": 3.2401, + "theoretical_loss": 4.0911170029866435, + "tokens_seen": 336739328 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004535105315947843, + "loss": 3.0821, + "theoretical_loss": 4.091025344547158, + "tokens_seen": 336804864 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045350050150451355, + "loss": 3.2724, + "theoretical_loss": 4.090933708933679, + "tokens_seen": 336870400 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045349047141424274, + "loss": 3.2233, + "theoretical_loss": 4.0908420961360825, + "tokens_seen": 336935936 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004534804413239719, + "loss": 2.8302, + "theoretical_loss": 4.090750506144251, + "tokens_seen": 337001472 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004534704112337011, + "loss": 2.9115, + "theoretical_loss": 4.090658938948074, + "tokens_seen": 337067008 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004534603811434303, + "loss": 3.08, + "theoretical_loss": 4.090567394537449, + "tokens_seen": 337132544 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004534503510531595, + "loss": 2.9024, + "theoretical_loss": 4.090475872902277, + "tokens_seen": 337198080 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004534403209628887, + "loss": 3.1429, + "theoretical_loss": 4.0903843740324675, + "tokens_seen": 337263616 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004534302908726179, + "loss": 2.8771, + "theoretical_loss": 4.090292897917935, + "tokens_seen": 337329152 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045342026078234706, + "loss": 3.1249, + "theoretical_loss": 4.090201444548601, + "tokens_seen": 337394688 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045341023069207624, + "loss": 3.1044, + "theoretical_loss": 4.090110013914395, + "tokens_seen": 337460224 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 462430, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.748074531555176, + "objective/train/theoretical_loss": 4.090041455852612, + "objective/train/tokens_used": 357969376, + "theoretical_loss": 4.090041455852612, + "tokens_seen": 337509376 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004534002006018054, + "loss": 2.9316, + "theoretical_loss": 4.09001860600525, + "tokens_seen": 337525760 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045339017051153465, + "loss": 2.849, + "theoretical_loss": 4.0899272208111075, + "tokens_seen": 337591296 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004533801404212638, + "loss": 3.1926, + "theoretical_loss": 4.089835858321916, + "tokens_seen": 337656832 + }, + { + "epoch": 1.01, + "learning_rate": 0.000453370110330993, + "loss": 3.0211, + "theoretical_loss": 4.089744518527627, + "tokens_seen": 337722368 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004533600802407222, + "loss": 2.9728, + "theoretical_loss": 4.089653201418201, + "tokens_seen": 337787904 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004533500501504514, + "loss": 3.1214, + "theoretical_loss": 4.089561906983606, + "tokens_seen": 337853440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045334002006018056, + "loss": 3.0502, + "theoretical_loss": 4.089470635213814, + "tokens_seen": 337918976 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045332998996990974, + "loss": 3.1409, + "theoretical_loss": 4.089379386098804, + "tokens_seen": 337984512 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004533199598796389, + "loss": 2.8961, + "theoretical_loss": 4.089288159628562, + "tokens_seen": 338050048 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045330992978936816, + "loss": 2.851, + "theoretical_loss": 4.08919695579308, + "tokens_seen": 338115584 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004532998996990973, + "loss": 2.8503, + "theoretical_loss": 4.089105774582356, + "tokens_seen": 338181120 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004532898696088265, + "loss": 3.0414, + "theoretical_loss": 4.089014615986394, + "tokens_seen": 338246656 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045327983951855565, + "loss": 2.8973, + "theoretical_loss": 4.0889234799952066, + "tokens_seen": 338312192 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004532698094282849, + "loss": 2.8654, + "theoretical_loss": 4.088832366598811, + "tokens_seen": 338377728 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045325977933801406, + "loss": 2.9145, + "theoretical_loss": 4.08874127578723, + "tokens_seen": 338443264 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045324974924774324, + "loss": 2.9358, + "theoretical_loss": 4.0886502075504945, + "tokens_seen": 338508800 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004532397191574724, + "loss": 3.1151, + "theoretical_loss": 4.08855916187864, + "tokens_seen": 338574336 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004532296890672016, + "loss": 3.2944, + "theoretical_loss": 4.08846813876171, + "tokens_seen": 338639872 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004532196589769308, + "loss": 2.8268, + "theoretical_loss": 4.088377138189754, + "tokens_seen": 338705408 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045320962888666, + "loss": 3.0523, + "theoretical_loss": 4.088286160152827, + "tokens_seen": 338770944 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045319959879638915, + "loss": 2.8429, + "theoretical_loss": 4.088195204640989, + "tokens_seen": 338836480 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004531895687061184, + "loss": 2.8917, + "theoretical_loss": 4.088104271644311, + "tokens_seen": 338902016 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045317953861584757, + "loss": 3.2186, + "theoretical_loss": 4.088013361152865, + "tokens_seen": 338967552 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045316950852557675, + "loss": 2.8986, + "theoretical_loss": 4.087922473156732, + "tokens_seen": 339033088 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045315947843530593, + "loss": 3.0517, + "theoretical_loss": 4.087831607646, + "tokens_seen": 339098624 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 463251, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.839505434036255, + "objective/train/theoretical_loss": 4.087763473263035, + "objective/train/tokens_used": 359607776, + "theoretical_loss": 4.087763473263035, + "tokens_seen": 339147776 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004531494483450351, + "loss": 2.941, + "theoretical_loss": 4.087740764610761, + "tokens_seen": 339164160 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004531394182547643, + "loss": 2.9092, + "theoretical_loss": 4.0876499440411145, + "tokens_seen": 339229696 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004531293881644935, + "loss": 2.8542, + "theoretical_loss": 4.087559145927166, + "tokens_seen": 339295232 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045311935807422265, + "loss": 3.1179, + "theoretical_loss": 4.087468370259028, + "tokens_seen": 339360768 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004531093279839519, + "loss": 2.9362, + "theoretical_loss": 4.087377617026819, + "tokens_seen": 339426304 + }, + { + "epoch": 1.01, + "learning_rate": 0.000453099297893681, + "loss": 2.8019, + "theoretical_loss": 4.087286886220663, + "tokens_seen": 339491840 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045308926780341025, + "loss": 3.1268, + "theoretical_loss": 4.087196177830691, + "tokens_seen": 339557376 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045307923771313943, + "loss": 2.732, + "theoretical_loss": 4.08710549184704, + "tokens_seen": 339622912 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004530692076228686, + "loss": 3.1866, + "theoretical_loss": 4.087014828259853, + "tokens_seen": 339688448 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004530591775325978, + "loss": 2.9075, + "theoretical_loss": 4.086924187059279, + "tokens_seen": 339753984 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045304914744232703, + "loss": 3.0424, + "theoretical_loss": 4.086833568235474, + "tokens_seen": 339819520 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045303911735205616, + "loss": 3.0533, + "theoretical_loss": 4.086742971778601, + "tokens_seen": 339885056 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004530290872617854, + "loss": 3.0583, + "theoretical_loss": 4.086652397678827, + "tokens_seen": 339950592 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004530190571715145, + "loss": 2.9185, + "theoretical_loss": 4.086561845926326, + "tokens_seen": 340016128 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045300902708124375, + "loss": 2.9725, + "theoretical_loss": 4.086471316511281, + "tokens_seen": 340081664 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045299899699097294, + "loss": 2.917, + "theoretical_loss": 4.086380809423876, + "tokens_seen": 340147200 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004529889669007021, + "loss": 3.1826, + "theoretical_loss": 4.086290324654303, + "tokens_seen": 340212736 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004529789368104313, + "loss": 2.8471, + "theoretical_loss": 4.086199862192766, + "tokens_seen": 340278272 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004529689067201605, + "loss": 3.1347, + "theoretical_loss": 4.086109422029466, + "tokens_seen": 340343808 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045295887662988966, + "loss": 2.7183, + "theoretical_loss": 4.0860190041546165, + "tokens_seen": 340409344 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004529488465396189, + "loss": 2.9848, + "theoretical_loss": 4.085928608558435, + "tokens_seen": 340474880 + }, + { + "epoch": 1.01, + "learning_rate": 0.000452938816449348, + "loss": 3.2565, + "theoretical_loss": 4.085838235231145, + "tokens_seen": 340540416 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045292878635907726, + "loss": 3.0739, + "theoretical_loss": 4.085747884162976, + "tokens_seen": 340605952 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004529187562688064, + "loss": 2.9197, + "theoretical_loss": 4.0856575553441665, + "tokens_seen": 340671488 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004529087261785356, + "loss": 3.154, + "theoretical_loss": 4.085567248764956, + "tokens_seen": 340737024 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 464753, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.337740182876587, + "objective/train/theoretical_loss": 4.08549953341942, + "objective/train/tokens_used": 361246176, + "theoretical_loss": 4.08549953341942, + "tokens_seen": 340786176 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004528986960882648, + "loss": 3.2774, + "theoretical_loss": 4.085476964415595, + "tokens_seen": 340802560 + }, + { + "epoch": 1.01, + "learning_rate": 0.000452888665997994, + "loss": 3.0845, + "theoretical_loss": 4.085386702286338, + "tokens_seen": 340868096 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045287863590772316, + "loss": 2.9111, + "theoretical_loss": 4.085296462367445, + "tokens_seen": 340933632 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004528686058174524, + "loss": 3.3461, + "theoretical_loss": 4.085206244649184, + "tokens_seen": 340999168 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004528585757271815, + "loss": 2.8499, + "theoretical_loss": 4.085116049121828, + "tokens_seen": 341064704 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045284854563691076, + "loss": 3.4213, + "theoretical_loss": 4.085025875775655, + "tokens_seen": 341130240 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004528385155466399, + "loss": 2.9534, + "theoretical_loss": 4.0849357246009514, + "tokens_seen": 341195776 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004528284854563691, + "loss": 3.0351, + "theoretical_loss": 4.084845595588009, + "tokens_seen": 341261312 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004528184553660983, + "loss": 3.1023, + "theoretical_loss": 4.084755488727124, + "tokens_seen": 341326848 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004528084252758275, + "loss": 2.9196, + "theoretical_loss": 4.084665404008602, + "tokens_seen": 341392384 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045279839518555667, + "loss": 2.8665, + "theoretical_loss": 4.084575341422752, + "tokens_seen": 341457920 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045278836509528585, + "loss": 3.187, + "theoretical_loss": 4.084485300959889, + "tokens_seen": 341523456 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045277833500501503, + "loss": 3.1001, + "theoretical_loss": 4.084395282610337, + "tokens_seen": 341588992 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045276830491474426, + "loss": 3.0675, + "theoretical_loss": 4.0843052863644225, + "tokens_seen": 341654528 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004527582748244734, + "loss": 3.1473, + "theoretical_loss": 4.08421531221248, + "tokens_seen": 341720064 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004527482447342026, + "loss": 3.0618, + "theoretical_loss": 4.084125360144849, + "tokens_seen": 341785600 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045273821464393175, + "loss": 3.083, + "theoretical_loss": 4.084035430151879, + "tokens_seen": 341851136 + }, + { + "epoch": 1.01, + "learning_rate": 0.000452728184553661, + "loss": 2.9518, + "theoretical_loss": 4.083945522223919, + "tokens_seen": 341916672 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045271815446339017, + "loss": 3.136, + "theoretical_loss": 4.083855636351329, + "tokens_seen": 341982208 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045270812437311935, + "loss": 2.9707, + "theoretical_loss": 4.0837657725244725, + "tokens_seen": 342047744 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004526980942828486, + "loss": 2.9953, + "theoretical_loss": 4.083675930733721, + "tokens_seen": 342113280 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045268806419257777, + "loss": 2.6718, + "theoretical_loss": 4.083586110969451, + "tokens_seen": 342178816 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045267803410230695, + "loss": 3.0025, + "theoretical_loss": 4.083496313222046, + "tokens_seen": 342244352 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045266800401203613, + "loss": 3.0224, + "theoretical_loss": 4.083406537481893, + "tokens_seen": 342309888 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004526579739217653, + "loss": 2.7411, + "theoretical_loss": 4.083316783739388, + "tokens_seen": 342375424 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 465448, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2629733085632324, + "objective/train/theoretical_loss": 4.083249482862691, + "objective/train/tokens_used": 362884576, + "theoretical_loss": 4.083249482862691, + "tokens_seen": 342424576 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004526479438314945, + "loss": 2.895, + "theoretical_loss": 4.083227051984932, + "tokens_seen": 342440960 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004526379137412237, + "loss": 2.8273, + "theoretical_loss": 4.08313734220893, + "tokens_seen": 342506496 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045262788365095285, + "loss": 2.9571, + "theoretical_loss": 4.083047654401797, + "tokens_seen": 342572032 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004526178535606821, + "loss": 2.6863, + "theoretical_loss": 4.082957988553951, + "tokens_seen": 342637568 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004526078234704112, + "loss": 2.9279, + "theoretical_loss": 4.082868344655816, + "tokens_seen": 342703104 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045259779338014045, + "loss": 3.321, + "theoretical_loss": 4.082778722697825, + "tokens_seen": 342768640 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045258776328986963, + "loss": 3.1226, + "theoretical_loss": 4.082689122670413, + "tokens_seen": 342834176 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004525777331995988, + "loss": 3.1412, + "theoretical_loss": 4.082599544564024, + "tokens_seen": 342899712 + }, + { + "epoch": 1.01, + "learning_rate": 0.000452567703109328, + "loss": 2.9624, + "theoretical_loss": 4.082509988369106, + "tokens_seen": 342965248 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045255767301905723, + "loss": 3.0756, + "theoretical_loss": 4.0824204540761135, + "tokens_seen": 343030784 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045254764292878636, + "loss": 2.9775, + "theoretical_loss": 4.082330941675508, + "tokens_seen": 343096320 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004525376128385156, + "loss": 2.9741, + "theoretical_loss": 4.082241451157757, + "tokens_seen": 343161856 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004525275827482447, + "loss": 3.0864, + "theoretical_loss": 4.0821519825133326, + "tokens_seen": 343227392 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045251755265797395, + "loss": 3.3031, + "theoretical_loss": 4.082062535732713, + "tokens_seen": 343292928 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045250752256770314, + "loss": 2.8638, + "theoretical_loss": 4.081973110806383, + "tokens_seen": 343358464 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004524974924774323, + "loss": 3.0286, + "theoretical_loss": 4.081883707724835, + "tokens_seen": 343424000 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004524874623871615, + "loss": 2.8336, + "theoretical_loss": 4.081794326478563, + "tokens_seen": 343489536 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004524774322968907, + "loss": 3.2582, + "theoretical_loss": 4.081704967058071, + "tokens_seen": 343555072 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045246740220661986, + "loss": 3.1308, + "theoretical_loss": 4.081615629453868, + "tokens_seen": 343620608 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004524573721163491, + "loss": 3.0641, + "theoretical_loss": 4.081526313656466, + "tokens_seen": 343686144 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004524473420260782, + "loss": 3.0581, + "theoretical_loss": 4.081437019656389, + "tokens_seen": 343751680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045243731193580746, + "loss": 3.0062, + "theoretical_loss": 4.081347747444161, + "tokens_seen": 343817216 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004524272818455366, + "loss": 2.7659, + "theoretical_loss": 4.081258497010314, + "tokens_seen": 343882752 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004524172517552658, + "loss": 3.0713, + "theoretical_loss": 4.081169268345387, + "tokens_seen": 343948288 + }, + { + "epoch": 1.01, + "learning_rate": 0.000452407221664995, + "loss": 3.0026, + "theoretical_loss": 4.081080061439923, + "tokens_seen": 344013824 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 466745, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.469646692276001, + "objective/train/theoretical_loss": 4.0810131705347885, + "objective/train/tokens_used": 364522976, + "theoretical_loss": 4.0810131705347885, + "tokens_seen": 344062976 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004523971915747242, + "loss": 3.0142, + "theoretical_loss": 4.080990876284473, + "tokens_seen": 344079360 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045238716148445336, + "loss": 3.1162, + "theoretical_loss": 4.0809017128695935, + "tokens_seen": 344144896 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004523771313941826, + "loss": 3.3286, + "theoretical_loss": 4.080812571185845, + "tokens_seen": 344210432 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004523671013039117, + "loss": 2.7948, + "theoretical_loss": 4.080723451223795, + "tokens_seen": 344275968 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045235707121364096, + "loss": 3.0005, + "theoretical_loss": 4.080634352974018, + "tokens_seen": 344341504 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004523470411233701, + "loss": 3.055, + "theoretical_loss": 4.080545276427092, + "tokens_seen": 344407040 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004523370110330993, + "loss": 3.0788, + "theoretical_loss": 4.080456221573604, + "tokens_seen": 344472576 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004523269809428285, + "loss": 2.9626, + "theoretical_loss": 4.080367188404144, + "tokens_seen": 344538112 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004523169508525577, + "loss": 2.8527, + "theoretical_loss": 4.080278176909309, + "tokens_seen": 344603648 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045230692076228687, + "loss": 3.0898, + "theoretical_loss": 4.080189187079703, + "tokens_seen": 344669184 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045229689067201605, + "loss": 3.1361, + "theoretical_loss": 4.080100218905933, + "tokens_seen": 344734720 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045228686058174523, + "loss": 3.1205, + "theoretical_loss": 4.080011272378616, + "tokens_seen": 344800256 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045227683049147446, + "loss": 3.0475, + "theoretical_loss": 4.07992234748837, + "tokens_seen": 344865792 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004522668004012036, + "loss": 2.9471, + "theoretical_loss": 4.079833444225824, + "tokens_seen": 344931328 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004522567703109328, + "loss": 3.1468, + "theoretical_loss": 4.079744562581608, + "tokens_seen": 344996864 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045224674022066195, + "loss": 3.1542, + "theoretical_loss": 4.0796557025463605, + "tokens_seen": 345062400 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004522367101303912, + "loss": 2.932, + "theoretical_loss": 4.079566864110725, + "tokens_seen": 345127936 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045222668004012037, + "loss": 2.7915, + "theoretical_loss": 4.0794780472653525, + "tokens_seen": 345193472 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045221664994984955, + "loss": 3.0188, + "theoretical_loss": 4.0793892520008965, + "tokens_seen": 345259008 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045220661985957873, + "loss": 3.2555, + "theoretical_loss": 4.07930047830802, + "tokens_seen": 345324544 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045219658976930797, + "loss": 2.9997, + "theoretical_loss": 4.079211726177389, + "tokens_seen": 345390080 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004521865596790371, + "loss": 2.8722, + "theoretical_loss": 4.079122995599677, + "tokens_seen": 345455616 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045217652958876633, + "loss": 2.816, + "theoretical_loss": 4.079034286565563, + "tokens_seen": 345521152 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045216649949849546, + "loss": 3.0846, + "theoretical_loss": 4.078945599065731, + "tokens_seen": 345586688 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004521564694082247, + "loss": 3.057, + "theoretical_loss": 4.078856933090871, + "tokens_seen": 345652224 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 467437, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.132908344268799, + "objective/train/theoretical_loss": 4.078790447729892, + "objective/train/tokens_used": 366161376, + "theoretical_loss": 4.078790447729892, + "tokens_seen": 345701376 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045214643931795387, + "loss": 3.0499, + "theoretical_loss": 4.07876828863168, + "tokens_seen": 345717760 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045213640922768305, + "loss": 2.9937, + "theoretical_loss": 4.078679665678859, + "tokens_seen": 345783296 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045212637913741223, + "loss": 3.0931, + "theoretical_loss": 4.078591064223116, + "tokens_seen": 345848832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004521163490471414, + "loss": 2.9227, + "theoretical_loss": 4.078502484255164, + "tokens_seen": 345914368 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004521063189568706, + "loss": 2.871, + "theoretical_loss": 4.078413925765724, + "tokens_seen": 345979904 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045209628886659983, + "loss": 3.1182, + "theoretical_loss": 4.078325388745519, + "tokens_seen": 346045440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045208625877632896, + "loss": 3.0976, + "theoretical_loss": 4.07823687318528, + "tokens_seen": 346110976 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004520762286860582, + "loss": 2.9415, + "theoretical_loss": 4.078148379075744, + "tokens_seen": 346176512 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004520661985957873, + "loss": 2.9781, + "theoretical_loss": 4.078059906407653, + "tokens_seen": 346242048 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045205616850551656, + "loss": 2.825, + "theoretical_loss": 4.077971455171755, + "tokens_seen": 346307584 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045204613841524574, + "loss": 2.8616, + "theoretical_loss": 4.077883025358804, + "tokens_seen": 346373120 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004520361083249749, + "loss": 2.8784, + "theoretical_loss": 4.07779461695956, + "tokens_seen": 346438656 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004520260782347041, + "loss": 2.9527, + "theoretical_loss": 4.077706229964786, + "tokens_seen": 346504192 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045201604814443334, + "loss": 3.3518, + "theoretical_loss": 4.077617864365255, + "tokens_seen": 346569728 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045200601805416246, + "loss": 2.7224, + "theoretical_loss": 4.077529520151743, + "tokens_seen": 346635264 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004519959879638917, + "loss": 3.0247, + "theoretical_loss": 4.077441197315032, + "tokens_seen": 346700800 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004519859578736208, + "loss": 3.023, + "theoretical_loss": 4.0773528958459115, + "tokens_seen": 346766336 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045197592778335006, + "loss": 2.9417, + "theoretical_loss": 4.0772646157351735, + "tokens_seen": 346831872 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045196589769307924, + "loss": 2.6767, + "theoretical_loss": 4.077176356973618, + "tokens_seen": 346897408 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004519558676028084, + "loss": 3.0087, + "theoretical_loss": 4.077088119552052, + "tokens_seen": 346962944 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045194583751253766, + "loss": 3.1533, + "theoretical_loss": 4.076999903461283, + "tokens_seen": 347028480 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004519358074222668, + "loss": 2.7099, + "theoretical_loss": 4.07691170869213, + "tokens_seen": 347094016 + }, + { + "epoch": 1.02, + "learning_rate": 0.000451925777331996, + "loss": 2.8064, + "theoretical_loss": 4.076823535235415, + "tokens_seen": 347159552 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004519157472417252, + "loss": 3.2354, + "theoretical_loss": 4.0767353830819655, + "tokens_seen": 347225088 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004519057171514544, + "loss": 2.6779, + "theoretical_loss": 4.076647252222616, + "tokens_seen": 347290624 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 469057, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.269904375076294, + "objective/train/theoretical_loss": 4.076581168046844, + "objective/train/tokens_used": 367799776, + "theoretical_loss": 4.076581168046844, + "tokens_seen": 347339776 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045189568706118356, + "loss": 3.1138, + "theoretical_loss": 4.076559142648204, + "tokens_seen": 347356160 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004518856569709128, + "loss": 2.8085, + "theoretical_loss": 4.076471054349575, + "tokens_seen": 347421696 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004518756268806419, + "loss": 3.0591, + "theoretical_loss": 4.076382987317581, + "tokens_seen": 347487232 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045186559679037116, + "loss": 3.0139, + "theoretical_loss": 4.076294941543078, + "tokens_seen": 347552768 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004518555667001003, + "loss": 3.0779, + "theoretical_loss": 4.076206917016927, + "tokens_seen": 347618304 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004518455366098295, + "loss": 2.8424, + "theoretical_loss": 4.076118913729996, + "tokens_seen": 347683840 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004518355065195587, + "loss": 2.9054, + "theoretical_loss": 4.0760309316731576, + "tokens_seen": 347749376 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004518254764292879, + "loss": 3.2079, + "theoretical_loss": 4.075942970837292, + "tokens_seen": 347814912 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045181544633901707, + "loss": 2.9092, + "theoretical_loss": 4.075855031213283, + "tokens_seen": 347880448 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045180541624874625, + "loss": 3.1271, + "theoretical_loss": 4.075767112792021, + "tokens_seen": 347945984 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045179538615847543, + "loss": 2.9086, + "theoretical_loss": 4.075679215564401, + "tokens_seen": 348011520 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045178535606820466, + "loss": 2.8916, + "theoretical_loss": 4.075591339521326, + "tokens_seen": 348077056 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004517753259779338, + "loss": 3.1885, + "theoretical_loss": 4.0755034846537015, + "tokens_seen": 348142592 + }, + { + "epoch": 1.02, + "learning_rate": 0.000451765295887663, + "loss": 2.7859, + "theoretical_loss": 4.07541565095244, + "tokens_seen": 348208128 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045175526579739215, + "loss": 3.1342, + "theoretical_loss": 4.0753278384084615, + "tokens_seen": 348273664 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004517452357071214, + "loss": 3.0243, + "theoretical_loss": 4.075240047012688, + "tokens_seen": 348339200 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045173520561685057, + "loss": 2.9966, + "theoretical_loss": 4.07515227675605, + "tokens_seen": 348404736 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045172517552657975, + "loss": 3.2115, + "theoretical_loss": 4.075064527629483, + "tokens_seen": 348470272 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045171514543630893, + "loss": 3.2221, + "theoretical_loss": 4.074976799623926, + "tokens_seen": 348535808 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045170511534603817, + "loss": 2.8883, + "theoretical_loss": 4.074889092730325, + "tokens_seen": 348601344 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004516950852557673, + "loss": 3.1443, + "theoretical_loss": 4.074801406939635, + "tokens_seen": 348666880 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045168505516549653, + "loss": 3.1144, + "theoretical_loss": 4.07471374224281, + "tokens_seen": 348732416 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045167502507522566, + "loss": 2.9944, + "theoretical_loss": 4.074626098630815, + "tokens_seen": 348797952 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004516649949849549, + "loss": 2.8234, + "theoretical_loss": 4.074538476094617, + "tokens_seen": 348863488 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045165496489468407, + "loss": 2.834, + "theoretical_loss": 4.0744508746251915, + "tokens_seen": 348929024 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 469778, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0074195861816406, + "objective/train/theoretical_loss": 4.074385187342765, + "objective/train/tokens_used": 369438176, + "theoretical_loss": 4.074385187342765, + "tokens_seen": 348978176 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045164493480441325, + "loss": 3.1902, + "theoretical_loss": 4.0743632942135175, + "tokens_seen": 348994560 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045163490471414244, + "loss": 3.0386, + "theoretical_loss": 4.07427573485058, + "tokens_seen": 349060096 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004516248746238716, + "loss": 2.8443, + "theoretical_loss": 4.07418819652737, + "tokens_seen": 349125632 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004516148445336008, + "loss": 2.8372, + "theoretical_loss": 4.074100679234883, + "tokens_seen": 349191168 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045160481444333003, + "loss": 3.1183, + "theoretical_loss": 4.074013182964123, + "tokens_seen": 349256704 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045159478435305916, + "loss": 3.104, + "theoretical_loss": 4.0739257077060955, + "tokens_seen": 349322240 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004515847542627884, + "loss": 3.0652, + "theoretical_loss": 4.073838253451814, + "tokens_seen": 349387776 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004515747241725175, + "loss": 3.0714, + "theoretical_loss": 4.073750820192296, + "tokens_seen": 349453312 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045156469408224676, + "loss": 2.9575, + "theoretical_loss": 4.073663407918566, + "tokens_seen": 349518848 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045155466399197594, + "loss": 3.0506, + "theoretical_loss": 4.073576016621656, + "tokens_seen": 349584384 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004515446339017051, + "loss": 2.8519, + "theoretical_loss": 4.073488646292597, + "tokens_seen": 349649920 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004515346038114343, + "loss": 2.8341, + "theoretical_loss": 4.073401296922432, + "tokens_seen": 349715456 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045152457372116354, + "loss": 3.1848, + "theoretical_loss": 4.073313968502206, + "tokens_seen": 349780992 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045151454363089266, + "loss": 2.9554, + "theoretical_loss": 4.0732266610229715, + "tokens_seen": 349846528 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004515045135406219, + "loss": 2.7127, + "theoretical_loss": 4.073139374475784, + "tokens_seen": 349912064 + }, + { + "epoch": 1.02, + "learning_rate": 0.000451494483450351, + "loss": 3.185, + "theoretical_loss": 4.073052108851707, + "tokens_seen": 349977600 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045148445336008026, + "loss": 3.3413, + "theoretical_loss": 4.072964864141809, + "tokens_seen": 350043136 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045147442326980944, + "loss": 3.0749, + "theoretical_loss": 4.072877640337162, + "tokens_seen": 350108672 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004514643931795386, + "loss": 3.016, + "theoretical_loss": 4.072790437428846, + "tokens_seen": 350174208 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004514543630892678, + "loss": 3.1049, + "theoretical_loss": 4.072703255407946, + "tokens_seen": 350239744 + }, + { + "epoch": 1.02, + "learning_rate": 0.000451444332998997, + "loss": 3.0196, + "theoretical_loss": 4.07261609426555, + "tokens_seen": 350305280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045143430290872617, + "loss": 2.9661, + "theoretical_loss": 4.072528953992756, + "tokens_seen": 350370816 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004514242728184554, + "loss": 2.7829, + "theoretical_loss": 4.072441834580663, + "tokens_seen": 350436352 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045141424272818453, + "loss": 3.1238, + "theoretical_loss": 4.072354736020377, + "tokens_seen": 350501888 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045140421263791376, + "loss": 3.0012, + "theoretical_loss": 4.07226765830301, + "tokens_seen": 350567424 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 471047, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1343419551849365, + "objective/train/theoretical_loss": 4.072202363687808, + "objective/train/tokens_used": 371076576, + "theoretical_loss": 4.072202363687808, + "tokens_seen": 350616576 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045139418254764294, + "loss": 3.0134, + "theoretical_loss": 4.072180601419681, + "tokens_seen": 350632960 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004513841524573721, + "loss": 2.9707, + "theoretical_loss": 4.072093565361511, + "tokens_seen": 350698496 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004513741223671013, + "loss": 2.9995, + "theoretical_loss": 4.072006550119628, + "tokens_seen": 350764032 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004513640922768305, + "loss": 2.9866, + "theoretical_loss": 4.071919555685166, + "tokens_seen": 350829568 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045135406218655967, + "loss": 2.9988, + "theoretical_loss": 4.071832582049264, + "tokens_seen": 350895104 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004513440320962889, + "loss": 2.6705, + "theoretical_loss": 4.071745629203066, + "tokens_seen": 350960640 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045133400200601803, + "loss": 3.159, + "theoretical_loss": 4.071658697137722, + "tokens_seen": 351026176 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045132397191574727, + "loss": 2.6714, + "theoretical_loss": 4.071571785844387, + "tokens_seen": 351091712 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004513139418254764, + "loss": 3.0984, + "theoretical_loss": 4.071484895314223, + "tokens_seen": 351157248 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045130391173520563, + "loss": 3.1536, + "theoretical_loss": 4.071398025538394, + "tokens_seen": 351222784 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004512938816449348, + "loss": 2.8315, + "theoretical_loss": 4.071311176508073, + "tokens_seen": 351288320 + }, + { + "epoch": 1.02, + "learning_rate": 0.000451283851554664, + "loss": 3.047, + "theoretical_loss": 4.071224348214435, + "tokens_seen": 351353856 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045127382146439317, + "loss": 3.2294, + "theoretical_loss": 4.071137540648665, + "tokens_seen": 351419392 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045126379137412235, + "loss": 3.1709, + "theoretical_loss": 4.0710507538019485, + "tokens_seen": 351484928 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045125376128385153, + "loss": 3.2218, + "theoretical_loss": 4.070963987665479, + "tokens_seen": 351550464 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045124373119358077, + "loss": 2.9715, + "theoretical_loss": 4.0708772422304556, + "tokens_seen": 351616000 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004512337011033099, + "loss": 3.0497, + "theoretical_loss": 4.070790517488081, + "tokens_seen": 351681536 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045122367101303913, + "loss": 2.9379, + "theoretical_loss": 4.070703813429566, + "tokens_seen": 351747072 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004512136409227683, + "loss": 2.9688, + "theoretical_loss": 4.070617130046124, + "tokens_seen": 351812608 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004512036108324975, + "loss": 3.1147, + "theoretical_loss": 4.070530467328975, + "tokens_seen": 351878144 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045119358074222673, + "loss": 2.9876, + "theoretical_loss": 4.070443825269344, + "tokens_seen": 351943680 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045118355065195586, + "loss": 3.2065, + "theoretical_loss": 4.070357203858462, + "tokens_seen": 352009216 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004511735205616851, + "loss": 3.0601, + "theoretical_loss": 4.070270603087565, + "tokens_seen": 352074752 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004511634904714143, + "loss": 3.2063, + "theoretical_loss": 4.0701840229478945, + "tokens_seen": 352140288 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045115346038114345, + "loss": 2.9421, + "theoretical_loss": 4.070097463430697, + "tokens_seen": 352205824 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 471658, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1030523777008057, + "objective/train/theoretical_loss": 4.070032557321034, + "objective/train/tokens_used": 372714976, + "theoretical_loss": 4.070032557321034, + "tokens_seen": 352254976 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045114343029087264, + "loss": 2.8468, + "theoretical_loss": 4.0700109245272245, + "tokens_seen": 352271360 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004511334002006018, + "loss": 3.0807, + "theoretical_loss": 4.0699244062287345, + "tokens_seen": 352336896 + }, + { + "epoch": 1.02, + "learning_rate": 0.000451123370110331, + "loss": 2.9094, + "theoretical_loss": 4.069837908526489, + "tokens_seen": 352402432 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045111334002006023, + "loss": 3.0076, + "theoretical_loss": 4.069751431411758, + "tokens_seen": 352467968 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045110330992978936, + "loss": 3.1799, + "theoretical_loss": 4.0696649748758125, + "tokens_seen": 352533504 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004510932798395186, + "loss": 3.1436, + "theoretical_loss": 4.069578538909933, + "tokens_seen": 352599040 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004510832497492477, + "loss": 2.9335, + "theoretical_loss": 4.069492123505402, + "tokens_seen": 352664576 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045107321965897696, + "loss": 3.0457, + "theoretical_loss": 4.069405728653509, + "tokens_seen": 352730112 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045106318956870614, + "loss": 3.0376, + "theoretical_loss": 4.0693193543455495, + "tokens_seen": 352795648 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004510531594784353, + "loss": 3.1604, + "theoretical_loss": 4.069233000572823, + "tokens_seen": 352861184 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004510431293881645, + "loss": 3.2364, + "theoretical_loss": 4.069146667326635, + "tokens_seen": 352926720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045103309929789374, + "loss": 2.9682, + "theoretical_loss": 4.069060354598296, + "tokens_seen": 352992256 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045102306920762286, + "loss": 3.0206, + "theoretical_loss": 4.06897406237912, + "tokens_seen": 353057792 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004510130391173521, + "loss": 2.8432, + "theoretical_loss": 4.0688877906604315, + "tokens_seen": 353123328 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004510030090270812, + "loss": 2.9897, + "theoretical_loss": 4.068801539433554, + "tokens_seen": 353188864 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045099297893681046, + "loss": 3.0886, + "theoretical_loss": 4.06871530868982, + "tokens_seen": 353254400 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045098294884653964, + "loss": 3.1712, + "theoretical_loss": 4.068629098420567, + "tokens_seen": 353319936 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004509729187562688, + "loss": 2.9892, + "theoretical_loss": 4.068542908617136, + "tokens_seen": 353385472 + }, + { + "epoch": 1.02, + "learning_rate": 0.000450962888665998, + "loss": 3.1108, + "theoretical_loss": 4.068456739270876, + "tokens_seen": 353451008 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004509528585757272, + "loss": 3.0438, + "theoretical_loss": 4.068370590373139, + "tokens_seen": 353516544 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045094282848545637, + "loss": 2.8643, + "theoretical_loss": 4.068284461915282, + "tokens_seen": 353582080 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004509327983951856, + "loss": 2.9281, + "theoretical_loss": 4.06819835388867, + "tokens_seen": 353647616 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045092276830491473, + "loss": 3.2311, + "theoretical_loss": 4.06811226628467, + "tokens_seen": 353713152 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045091273821464396, + "loss": 3.1787, + "theoretical_loss": 4.068026199094657, + "tokens_seen": 353778688 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045090270812437314, + "loss": 3.188, + "theoretical_loss": 4.067940152310008, + "tokens_seen": 353844224 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 473221, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.438317060470581, + "objective/train/theoretical_loss": 4.06787563060736, + "objective/train/tokens_used": 374353376, + "theoretical_loss": 4.06787563060736, + "tokens_seen": 353893376 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004508926780341023, + "loss": 3.0398, + "theoretical_loss": 4.06785412592211, + "tokens_seen": 353909760 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004508826479438315, + "loss": 3.107, + "theoretical_loss": 4.06776811992235, + "tokens_seen": 353975296 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004508726178535607, + "loss": 3.1091, + "theoretical_loss": 4.067682134302124, + "tokens_seen": 354040832 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045086258776328987, + "loss": 3.1053, + "theoretical_loss": 4.06759616905283, + "tokens_seen": 354106368 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004508525576730191, + "loss": 3.3487, + "theoretical_loss": 4.0675102241658765, + "tokens_seen": 354171904 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045084252758274823, + "loss": 3.0064, + "theoretical_loss": 4.0674242996326715, + "tokens_seen": 354237440 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045083249749247747, + "loss": 2.8039, + "theoretical_loss": 4.06733839544463, + "tokens_seen": 354302976 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004508224674022066, + "loss": 3.0391, + "theoretical_loss": 4.067252511593175, + "tokens_seen": 354368512 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045081243731193583, + "loss": 2.8456, + "theoretical_loss": 4.067166648069731, + "tokens_seen": 354434048 + }, + { + "epoch": 1.02, + "learning_rate": 0.000450802407221665, + "loss": 2.9981, + "theoretical_loss": 4.067080804865728, + "tokens_seen": 354499584 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004507923771313942, + "loss": 2.7936, + "theoretical_loss": 4.066994981972604, + "tokens_seen": 354565120 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045078234704112337, + "loss": 3.2045, + "theoretical_loss": 4.066909179381801, + "tokens_seen": 354630656 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045077231695085255, + "loss": 2.9505, + "theoretical_loss": 4.066823397084764, + "tokens_seen": 354696192 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045076228686058173, + "loss": 2.6534, + "theoretical_loss": 4.066737635072946, + "tokens_seen": 354761728 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045075225677031097, + "loss": 2.7746, + "theoretical_loss": 4.066651893337804, + "tokens_seen": 354827264 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004507422266800401, + "loss": 3.0798, + "theoretical_loss": 4.066566171870799, + "tokens_seen": 354892800 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045073219658976933, + "loss": 3.0763, + "theoretical_loss": 4.066480470663401, + "tokens_seen": 354958336 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004507221664994985, + "loss": 3.2006, + "theoretical_loss": 4.06639478970708, + "tokens_seen": 355023872 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004507121364092277, + "loss": 2.9487, + "theoretical_loss": 4.066309128993316, + "tokens_seen": 355089408 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004507021063189569, + "loss": 3.2167, + "theoretical_loss": 4.0662234885135895, + "tokens_seen": 355154944 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045069207622868606, + "loss": 2.998, + "theoretical_loss": 4.066137868259391, + "tokens_seen": 355220480 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045068204613841524, + "loss": 3.0349, + "theoretical_loss": 4.0660522682222116, + "tokens_seen": 355286016 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004506720160481445, + "loss": 3.0877, + "theoretical_loss": 4.065966688393551, + "tokens_seen": 355351552 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004506619859578736, + "loss": 2.874, + "theoretical_loss": 4.065881128764912, + "tokens_seen": 355417088 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045065195586760284, + "loss": 3.1169, + "theoretical_loss": 4.065795589327804, + "tokens_seen": 355482624 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 473851, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1546261310577393, + "objective/train/theoretical_loss": 4.065731447995559, + "objective/train/tokens_used": 375991776, + "theoretical_loss": 4.065731447995559, + "tokens_seen": 355531776 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045064192577733196, + "loss": 2.968, + "theoretical_loss": 4.06571007007374, + "tokens_seen": 355548160 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004506318956870612, + "loss": 3.0017, + "theoretical_loss": 4.065624570994239, + "tokens_seen": 355613696 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004506218655967904, + "loss": 3.1213, + "theoretical_loss": 4.065539092080826, + "tokens_seen": 355679232 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045061183550651956, + "loss": 3.0196, + "theoretical_loss": 4.0654536333250295, + "tokens_seen": 355744768 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045060180541624874, + "loss": 2.9044, + "theoretical_loss": 4.065368194718383, + "tokens_seen": 355810304 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004505917753259779, + "loss": 3.0828, + "theoretical_loss": 4.065282776252427, + "tokens_seen": 355875840 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004505817452357071, + "loss": 2.883, + "theoretical_loss": 4.065197377918706, + "tokens_seen": 355941376 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045057171514543634, + "loss": 2.8719, + "theoretical_loss": 4.065111999708767, + "tokens_seen": 356006912 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045056168505516547, + "loss": 2.7669, + "theoretical_loss": 4.065026641614169, + "tokens_seen": 356072448 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004505516549648947, + "loss": 2.9525, + "theoretical_loss": 4.064941303626469, + "tokens_seen": 356137984 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004505416248746239, + "loss": 2.9686, + "theoretical_loss": 4.064855985737234, + "tokens_seen": 356203520 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045053159478435306, + "loss": 3.0237, + "theoretical_loss": 4.064770687938031, + "tokens_seen": 356269056 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045052156469408224, + "loss": 3.1213, + "theoretical_loss": 4.064685410220437, + "tokens_seen": 356334592 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004505115346038114, + "loss": 3.1203, + "theoretical_loss": 4.064600152576032, + "tokens_seen": 356400128 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004505015045135406, + "loss": 3.2214, + "theoretical_loss": 4.0645149149964, + "tokens_seen": 356465664 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045049147442326984, + "loss": 2.9673, + "theoretical_loss": 4.064429697473134, + "tokens_seen": 356531200 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045048144433299897, + "loss": 2.9293, + "theoretical_loss": 4.064344499997826, + "tokens_seen": 356596736 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004504714142427282, + "loss": 2.7086, + "theoretical_loss": 4.06425932256208, + "tokens_seen": 356662272 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045046138415245733, + "loss": 3.1569, + "theoretical_loss": 4.064174165157499, + "tokens_seen": 356727808 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045045135406218657, + "loss": 2.777, + "theoretical_loss": 4.064089027775694, + "tokens_seen": 356793344 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004504413239719158, + "loss": 2.8872, + "theoretical_loss": 4.064003910408281, + "tokens_seen": 356858880 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045043129388164493, + "loss": 3.1839, + "theoretical_loss": 4.06391881304688, + "tokens_seen": 356924416 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045042126379137416, + "loss": 3.1245, + "theoretical_loss": 4.063833735683118, + "tokens_seen": 356989952 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045041123370110334, + "loss": 2.964, + "theoretical_loss": 4.063748678308624, + "tokens_seen": 357055488 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004504012036108325, + "loss": 3.1374, + "theoretical_loss": 4.063663640915035, + "tokens_seen": 357121024 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 475208, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0372159481048584, + "objective/train/theoretical_loss": 4.063599875977284, + "objective/train/tokens_used": 377630176, + "theoretical_loss": 4.063599875977284, + "tokens_seen": 357170176 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004503911735205617, + "loss": 3.083, + "theoretical_loss": 4.063578623493992, + "tokens_seen": 357186560 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004503811434302909, + "loss": 3.0497, + "theoretical_loss": 4.06349362603714, + "tokens_seen": 357252096 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045037111334002007, + "loss": 3.3375, + "theoretical_loss": 4.0634086485361305, + "tokens_seen": 357317632 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004503610832497493, + "loss": 2.9054, + "theoretical_loss": 4.063323690982619, + "tokens_seen": 357383168 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045035105315947843, + "loss": 2.9316, + "theoretical_loss": 4.0632387533682675, + "tokens_seen": 357448704 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045034102306920767, + "loss": 3.0126, + "theoretical_loss": 4.06315383568474, + "tokens_seen": 357514240 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004503309929789368, + "loss": 3.0289, + "theoretical_loss": 4.063068937923709, + "tokens_seen": 357579776 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045032096288866603, + "loss": 2.7174, + "theoretical_loss": 4.062984060076849, + "tokens_seen": 357645312 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004503109327983952, + "loss": 3.0905, + "theoretical_loss": 4.062899202135844, + "tokens_seen": 357710848 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004503009027081244, + "loss": 2.8817, + "theoretical_loss": 4.062814364092376, + "tokens_seen": 357776384 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045029087261785357, + "loss": 2.7171, + "theoretical_loss": 4.062729545938138, + "tokens_seen": 357841920 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045028084252758275, + "loss": 2.963, + "theoretical_loss": 4.062644747664827, + "tokens_seen": 357907456 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045027081243731193, + "loss": 3.0307, + "theoretical_loss": 4.062559969264141, + "tokens_seen": 357972992 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045026078234704117, + "loss": 3.2462, + "theoretical_loss": 4.062475210727789, + "tokens_seen": 358038528 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004502507522567703, + "loss": 2.9787, + "theoretical_loss": 4.06239047204748, + "tokens_seen": 358104064 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045024072216649953, + "loss": 2.927, + "theoretical_loss": 4.06230575321493, + "tokens_seen": 358169600 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004502306920762287, + "loss": 3.0136, + "theoretical_loss": 4.06222105422186, + "tokens_seen": 358235136 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004502206619859579, + "loss": 3.1244, + "theoretical_loss": 4.062136375059996, + "tokens_seen": 358300672 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004502106318956871, + "loss": 2.8825, + "theoretical_loss": 4.062051715721069, + "tokens_seen": 358366208 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045020060180541626, + "loss": 3.108, + "theoretical_loss": 4.061967076196815, + "tokens_seen": 358431744 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045019057171514544, + "loss": 3.1374, + "theoretical_loss": 4.061882456478973, + "tokens_seen": 358497280 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004501805416248747, + "loss": 3.17, + "theoretical_loss": 4.0617978565592905, + "tokens_seen": 358562816 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004501705115346038, + "loss": 3.2081, + "theoretical_loss": 4.061713276429517, + "tokens_seen": 358628352 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045016048144433304, + "loss": 3.1158, + "theoretical_loss": 4.061628716081408, + "tokens_seen": 358693888 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045015045135406216, + "loss": 3.176, + "theoretical_loss": 4.061544175506725, + "tokens_seen": 358759424 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 475771, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.862164258956909, + "objective/train/theoretical_loss": 4.061480783047069, + "objective/train/tokens_used": 379268576, + "theoretical_loss": 4.061480783047069, + "tokens_seen": 358808576 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004501404212637914, + "loss": 3.0135, + "theoretical_loss": 4.061459654697233, + "tokens_seen": 358824960 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004501303911735206, + "loss": 3.2575, + "theoretical_loss": 4.061375153644701, + "tokens_seen": 358890496 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045012036108324976, + "loss": 3.0728, + "theoretical_loss": 4.061290672340906, + "tokens_seen": 358956032 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045011033099297894, + "loss": 3.1692, + "theoretical_loss": 4.061206210777627, + "tokens_seen": 359021568 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004501003009027081, + "loss": 3.0361, + "theoretical_loss": 4.0611217689466494, + "tokens_seen": 359087104 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004500902708124373, + "loss": 3.0446, + "theoretical_loss": 4.061037346839764, + "tokens_seen": 359152640 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045008024072216654, + "loss": 3.0256, + "theoretical_loss": 4.060952944448765, + "tokens_seen": 359218176 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045007021063189567, + "loss": 3.0506, + "theoretical_loss": 4.060868561765452, + "tokens_seen": 359283712 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004500601805416249, + "loss": 3.0259, + "theoretical_loss": 4.060784198781631, + "tokens_seen": 359349248 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004500501504513541, + "loss": 3.1077, + "theoretical_loss": 4.06069985548911, + "tokens_seen": 359414784 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045004012036108326, + "loss": 3.1462, + "theoretical_loss": 4.060615531879705, + "tokens_seen": 359480320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045003009027081244, + "loss": 3.0044, + "theoretical_loss": 4.0605312279452335, + "tokens_seen": 359545856 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004500200601805416, + "loss": 2.947, + "theoretical_loss": 4.060446943677523, + "tokens_seen": 359611392 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004500100300902708, + "loss": 3.0459, + "theoretical_loss": 4.060362679068399, + "tokens_seen": 359676928 + }, + { + "epoch": 1.02, + "learning_rate": 0.00045000000000000004, + "loss": 3.2938, + "theoretical_loss": 4.060278434109699, + "tokens_seen": 359742464 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044998996990972917, + "loss": 2.9767, + "theoretical_loss": 4.060194208793259, + "tokens_seen": 359808000 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004499799398194584, + "loss": 2.9244, + "theoretical_loss": 4.060110003110925, + "tokens_seen": 359873536 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044996990972918753, + "loss": 3.1669, + "theoretical_loss": 4.0600258170545445, + "tokens_seen": 359939072 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044995987963891677, + "loss": 3.0841, + "theoretical_loss": 4.059941650615972, + "tokens_seen": 360004608 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044994984954864595, + "loss": 3.0194, + "theoretical_loss": 4.059857503787066, + "tokens_seen": 360070144 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044993981945837513, + "loss": 2.9446, + "theoretical_loss": 4.059773376559689, + "tokens_seen": 360135680 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004499297893681043, + "loss": 2.8132, + "theoretical_loss": 4.05968926892571, + "tokens_seen": 360201216 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044991975927783355, + "loss": 3.0256, + "theoretical_loss": 4.059605180877001, + "tokens_seen": 360266752 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044990972918756267, + "loss": 3.0306, + "theoretical_loss": 4.05952111240544, + "tokens_seen": 360332288 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004498996990972919, + "loss": 2.9097, + "theoretical_loss": 4.0594370635029104, + "tokens_seen": 360397824 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 477253, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3396573066711426, + "objective/train/theoretical_loss": 4.059374039663311, + "objective/train/tokens_used": 380906976, + "theoretical_loss": 4.059374039663311, + "tokens_seen": 360446976 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044988966900702103, + "loss": 3.1348, + "theoretical_loss": 4.0593530341613, + "tokens_seen": 360463360 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044987963891675027, + "loss": 2.9424, + "theoretical_loss": 4.059269024372501, + "tokens_seen": 360528896 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044986960882647945, + "loss": 3.1762, + "theoretical_loss": 4.05918503412841, + "tokens_seen": 360594432 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044985957873620863, + "loss": 3.2145, + "theoretical_loss": 4.059101063420929, + "tokens_seen": 360659968 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004498495486459378, + "loss": 2.899, + "theoretical_loss": 4.059017112241966, + "tokens_seen": 360725504 + }, + { + "epoch": 1.02, + "learning_rate": 0.000449839518555667, + "loss": 3.1533, + "theoretical_loss": 4.058933180583431, + "tokens_seen": 360791040 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004498294884653962, + "loss": 3.015, + "theoretical_loss": 4.0588492684372435, + "tokens_seen": 360856576 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004498194583751254, + "loss": 3.1045, + "theoretical_loss": 4.058765375795321, + "tokens_seen": 360922112 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044980942828485454, + "loss": 3.291, + "theoretical_loss": 4.058681502649593, + "tokens_seen": 360987648 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044979939819458377, + "loss": 2.8875, + "theoretical_loss": 4.058597648991988, + "tokens_seen": 361053184 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004497893681043129, + "loss": 2.8632, + "theoretical_loss": 4.058513814814443, + "tokens_seen": 361118720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044977933801404213, + "loss": 2.8654, + "theoretical_loss": 4.058430000108898, + "tokens_seen": 361184256 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004497693079237713, + "loss": 3.1835, + "theoretical_loss": 4.058346204867299, + "tokens_seen": 361249792 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004497592778335005, + "loss": 3.1647, + "theoretical_loss": 4.058262429081596, + "tokens_seen": 361315328 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004497492477432297, + "loss": 3.0466, + "theoretical_loss": 4.058178672743744, + "tokens_seen": 361380864 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004497392176529589, + "loss": 3.0466, + "theoretical_loss": 4.058094935845703, + "tokens_seen": 361446400 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044972918756268804, + "loss": 3.0096, + "theoretical_loss": 4.058011218379436, + "tokens_seen": 361511936 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004497191574724173, + "loss": 3.2179, + "theoretical_loss": 4.057927520336913, + "tokens_seen": 361577472 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004497091273821464, + "loss": 3.4683, + "theoretical_loss": 4.05784384171011, + "tokens_seen": 361643008 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044969909729187564, + "loss": 3.0278, + "theoretical_loss": 4.057760182491003, + "tokens_seen": 361708544 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004496890672016049, + "loss": 2.9469, + "theoretical_loss": 4.057676542671577, + "tokens_seen": 361774080 + }, + { + "epoch": 1.02, + "learning_rate": 0.000449679037111334, + "loss": 3.0088, + "theoretical_loss": 4.0575929222438205, + "tokens_seen": 361839616 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044966900702106324, + "loss": 2.9364, + "theoretical_loss": 4.057509321199726, + "tokens_seen": 361905152 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044965897693079236, + "loss": 2.808, + "theoretical_loss": 4.057425739531292, + "tokens_seen": 361970688 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004496489468405216, + "loss": 3.017, + "theoretical_loss": 4.057342177230519, + "tokens_seen": 362036224 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 478095, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1937191486358643, + "objective/train/theoretical_loss": 4.057279518210161, + "objective/train/tokens_used": 382545376, + "theoretical_loss": 4.057279518210161, + "tokens_seen": 362085376 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004496389167502508, + "loss": 3.2181, + "theoretical_loss": 4.057258634289418, + "tokens_seen": 362101760 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044962888665997996, + "loss": 2.8883, + "theoretical_loss": 4.057175110699999, + "tokens_seen": 362167296 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044961885656970914, + "loss": 2.9905, + "theoretical_loss": 4.057091606454279, + "tokens_seen": 362232832 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004496088264794383, + "loss": 2.9349, + "theoretical_loss": 4.057008121544279, + "tokens_seen": 362298368 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004495987963891675, + "loss": 3.0119, + "theoretical_loss": 4.056924655962027, + "tokens_seen": 362363904 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044958876629889674, + "loss": 2.8529, + "theoretical_loss": 4.056841209699553, + "tokens_seen": 362429440 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044957873620862587, + "loss": 3.0478, + "theoretical_loss": 4.0567577827488925, + "tokens_seen": 362494976 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004495687061183551, + "loss": 3.0898, + "theoretical_loss": 4.056674375102086, + "tokens_seen": 362560512 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004495586760280843, + "loss": 2.9169, + "theoretical_loss": 4.05659098675118, + "tokens_seen": 362626048 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044954864593781346, + "loss": 2.6805, + "theoretical_loss": 4.056507617688223, + "tokens_seen": 362691584 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044953861584754264, + "loss": 3.002, + "theoretical_loss": 4.05642426790527, + "tokens_seen": 362757120 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004495285857572718, + "loss": 3.2354, + "theoretical_loss": 4.056340937394381, + "tokens_seen": 362822656 + }, + { + "epoch": 1.02, + "learning_rate": 0.000449518555667001, + "loss": 2.8529, + "theoretical_loss": 4.0562576261476195, + "tokens_seen": 362888192 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044950852557673024, + "loss": 3.0567, + "theoretical_loss": 4.056174334157054, + "tokens_seen": 362953728 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044949849548645937, + "loss": 3.1938, + "theoretical_loss": 4.056091061414759, + "tokens_seen": 363019264 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004494884653961886, + "loss": 2.914, + "theoretical_loss": 4.0560078079128115, + "tokens_seen": 363084800 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044947843530591773, + "loss": 2.9784, + "theoretical_loss": 4.055924573643295, + "tokens_seen": 363150336 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044946840521564697, + "loss": 2.9387, + "theoretical_loss": 4.055841358598297, + "tokens_seen": 363215872 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044945837512537615, + "loss": 3.182, + "theoretical_loss": 4.055758162769909, + "tokens_seen": 363281408 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044944834503510533, + "loss": 3.0795, + "theoretical_loss": 4.055674986150228, + "tokens_seen": 363346944 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004494383149448345, + "loss": 3.0805, + "theoretical_loss": 4.055591828731356, + "tokens_seen": 363412480 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044942828485456375, + "loss": 3.2346, + "theoretical_loss": 4.0555086905054, + "tokens_seen": 363478016 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044941825476429287, + "loss": 3.0509, + "theoretical_loss": 4.05542557146447, + "tokens_seen": 363543552 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004494082246740221, + "loss": 2.9644, + "theoretical_loss": 4.055342471600682, + "tokens_seen": 363609088 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044939819458375123, + "loss": 3.0934, + "theoretical_loss": 4.055259390906155, + "tokens_seen": 363674624 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 479451, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.949183702468872, + "objective/train/theoretical_loss": 4.05519709296035, + "objective/train/tokens_used": 384183776, + "theoretical_loss": 4.05519709296035, + "tokens_seen": 363723776 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044938816449348047, + "loss": 2.8857, + "theoretical_loss": 4.055176329373015, + "tokens_seen": 363740160 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044937813440320965, + "loss": 2.8673, + "theoretical_loss": 4.055093286993392, + "tokens_seen": 363805696 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044936810431293883, + "loss": 2.9403, + "theoretical_loss": 4.055010263759419, + "tokens_seen": 363871232 + }, + { + "epoch": 1.02, + "learning_rate": 0.000449358074222668, + "loss": 2.9892, + "theoretical_loss": 4.054927259663235, + "tokens_seen": 363936768 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004493480441323972, + "loss": 3.1854, + "theoretical_loss": 4.054844274696984, + "tokens_seen": 364002304 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004493380140421264, + "loss": 3.1805, + "theoretical_loss": 4.0547613088528145, + "tokens_seen": 364067840 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004493279839518556, + "loss": 2.9269, + "theoretical_loss": 4.054678362122878, + "tokens_seen": 364133376 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044931795386158474, + "loss": 3.0968, + "theoretical_loss": 4.0545954344993325, + "tokens_seen": 364198912 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044930792377131397, + "loss": 3.1839, + "theoretical_loss": 4.05451252597434, + "tokens_seen": 364264448 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004492978936810431, + "loss": 3.0138, + "theoretical_loss": 4.054429636540068, + "tokens_seen": 364329984 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044928786359077234, + "loss": 2.8682, + "theoretical_loss": 4.0543467661886865, + "tokens_seen": 364395520 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004492778335005015, + "loss": 2.8514, + "theoretical_loss": 4.054263914912372, + "tokens_seen": 364461056 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004492678034102307, + "loss": 2.8017, + "theoretical_loss": 4.0541810827033045, + "tokens_seen": 364526592 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004492577733199599, + "loss": 3.0815, + "theoretical_loss": 4.054098269553669, + "tokens_seen": 364592128 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004492477432296891, + "loss": 2.8874, + "theoretical_loss": 4.054015475455656, + "tokens_seen": 364657664 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044923771313941824, + "loss": 3.159, + "theoretical_loss": 4.053932700401459, + "tokens_seen": 364723200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004492276830491475, + "loss": 3.033, + "theoretical_loss": 4.053849944383279, + "tokens_seen": 364788736 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004492176529588766, + "loss": 3.0449, + "theoretical_loss": 4.0537672073933155, + "tokens_seen": 364854272 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044920762286860584, + "loss": 3.2518, + "theoretical_loss": 4.05368448942378, + "tokens_seen": 364919808 + }, + { + "epoch": 1.02, + "learning_rate": 0.000449197592778335, + "loss": 3.0096, + "theoretical_loss": 4.053601790466884, + "tokens_seen": 364985344 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004491875626880642, + "loss": 3.032, + "theoretical_loss": 4.053519110514845, + "tokens_seen": 365050880 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004491775325977934, + "loss": 3.2143, + "theoretical_loss": 4.053436449559886, + "tokens_seen": 365116416 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044916750250752256, + "loss": 2.92, + "theoretical_loss": 4.05335380759423, + "tokens_seen": 365181952 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044915747241725174, + "loss": 3.1182, + "theoretical_loss": 4.053271184610111, + "tokens_seen": 365247488 + }, + { + "epoch": 1.02, + "learning_rate": 0.000449147442326981, + "loss": 2.9261, + "theoretical_loss": 4.053188580599764, + "tokens_seen": 365313024 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 480072, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2834551334381104, + "objective/train/theoretical_loss": 4.053126640038872, + "objective/train/tokens_used": 385822176, + "theoretical_loss": 4.053126640038872, + "tokens_seen": 365362176 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004491374122367101, + "loss": 2.9938, + "theoretical_loss": 4.053105995555429, + "tokens_seen": 365378560 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044912738214643934, + "loss": 2.9087, + "theoretical_loss": 4.0530234294693495, + "tokens_seen": 365444096 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044911735205616847, + "loss": 2.8535, + "theoretical_loss": 4.052940882333777, + "tokens_seen": 365509632 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004491073219658977, + "loss": 3.0883, + "theoretical_loss": 4.052858354140964, + "tokens_seen": 365575168 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004490972918756269, + "loss": 2.8664, + "theoretical_loss": 4.052775844883168, + "tokens_seen": 365640704 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044908726178535607, + "loss": 3.0148, + "theoretical_loss": 4.0526933545526544, + "tokens_seen": 365706240 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044907723169508525, + "loss": 3.1162, + "theoretical_loss": 4.052610883141689, + "tokens_seen": 365771776 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004490672016048145, + "loss": 3.2736, + "theoretical_loss": 4.052528430642544, + "tokens_seen": 365837312 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004490571715145436, + "loss": 2.9012, + "theoretical_loss": 4.052445997047497, + "tokens_seen": 365902848 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044904714142427284, + "loss": 3.0513, + "theoretical_loss": 4.052363582348827, + "tokens_seen": 365968384 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044903711133400197, + "loss": 3.3379, + "theoretical_loss": 4.052281186538821, + "tokens_seen": 366033920 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004490270812437312, + "loss": 3.041, + "theoretical_loss": 4.052198809609769, + "tokens_seen": 366099456 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004490170511534604, + "loss": 2.924, + "theoretical_loss": 4.0521164515539665, + "tokens_seen": 366164992 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044900702106318957, + "loss": 2.9846, + "theoretical_loss": 4.052034112363711, + "tokens_seen": 366230528 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044899699097291875, + "loss": 3.087, + "theoretical_loss": 4.051951792031308, + "tokens_seen": 366296064 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044898696088264793, + "loss": 3.0914, + "theoretical_loss": 4.051869490549064, + "tokens_seen": 366361600 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004489769307923771, + "loss": 2.9215, + "theoretical_loss": 4.051787207909294, + "tokens_seen": 366427136 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044896690070210635, + "loss": 2.9358, + "theoretical_loss": 4.051704944104313, + "tokens_seen": 366492672 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004489568706118355, + "loss": 3.0429, + "theoretical_loss": 4.051622699126444, + "tokens_seen": 366558208 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004489468405215647, + "loss": 3.0332, + "theoretical_loss": 4.0515404729680125, + "tokens_seen": 366623744 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044893681043129395, + "loss": 3.057, + "theoretical_loss": 4.051458265621351, + "tokens_seen": 366689280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044892678034102307, + "loss": 3.0415, + "theoretical_loss": 4.0513760770787925, + "tokens_seen": 366754816 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004489167502507523, + "loss": 3.3661, + "theoretical_loss": 4.051293907332678, + "tokens_seen": 366820352 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044890672016048143, + "loss": 3.2608, + "theoretical_loss": 4.051211756375352, + "tokens_seen": 366885888 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044889669007021067, + "loss": 3.042, + "theoretical_loss": 4.051129624199163, + "tokens_seen": 366951424 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 481451, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.963195323944092, + "objective/train/theoretical_loss": 4.051068037387542, + "objective/train/tokens_used": 387460576, + "theoretical_loss": 4.051068037387542, + "tokens_seen": 367000576 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044888665997993985, + "loss": 3.0687, + "theoretical_loss": 4.051047510796463, + "tokens_seen": 367016960 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044887662988966903, + "loss": 2.9519, + "theoretical_loss": 4.050965416159612, + "tokens_seen": 367082496 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004488665997993982, + "loss": 2.8079, + "theoretical_loss": 4.05088334028097, + "tokens_seen": 367148032 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004488565697091274, + "loss": 3.114, + "theoretical_loss": 4.050801283152905, + "tokens_seen": 367213568 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004488465396188566, + "loss": 2.8896, + "theoretical_loss": 4.050719244767788, + "tokens_seen": 367279104 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004488365095285858, + "loss": 2.8136, + "theoretical_loss": 4.0506372251179945, + "tokens_seen": 367344640 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044882647943831494, + "loss": 2.8483, + "theoretical_loss": 4.050555224195904, + "tokens_seen": 367410176 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004488164493480442, + "loss": 3.2117, + "theoretical_loss": 4.050473241993901, + "tokens_seen": 367475712 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004488064192577733, + "loss": 3.0052, + "theoretical_loss": 4.0503912785043745, + "tokens_seen": 367541248 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044879638916750254, + "loss": 2.9371, + "theoretical_loss": 4.050309333719719, + "tokens_seen": 367606784 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004487863590772317, + "loss": 3.207, + "theoretical_loss": 4.050227407632331, + "tokens_seen": 367672320 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004487763289869609, + "loss": 3.1296, + "theoretical_loss": 4.050145500234613, + "tokens_seen": 367737856 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004487662988966901, + "loss": 2.7027, + "theoretical_loss": 4.050063611518974, + "tokens_seen": 367803392 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004487562688064193, + "loss": 3.1566, + "theoretical_loss": 4.049981741477821, + "tokens_seen": 367868928 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044874623871614844, + "loss": 2.958, + "theoretical_loss": 4.049899890103573, + "tokens_seen": 367934464 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004487362086258777, + "loss": 3.1707, + "theoretical_loss": 4.049818057388649, + "tokens_seen": 368000000 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004487261785356068, + "loss": 3.0476, + "theoretical_loss": 4.049736243325473, + "tokens_seen": 368065536 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044871614844533604, + "loss": 2.7808, + "theoretical_loss": 4.049654447906473, + "tokens_seen": 368131072 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004487061183550652, + "loss": 3.0115, + "theoretical_loss": 4.049572671124086, + "tokens_seen": 368196608 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004486960882647944, + "loss": 3.1322, + "theoretical_loss": 4.049490912970745, + "tokens_seen": 368262144 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004486860581745236, + "loss": 2.9311, + "theoretical_loss": 4.049409173438896, + "tokens_seen": 368327680 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044867602808425276, + "loss": 3.1052, + "theoretical_loss": 4.049327452520982, + "tokens_seen": 368393216 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044866599799398194, + "loss": 2.8961, + "theoretical_loss": 4.049245750209458, + "tokens_seen": 368458752 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004486559679037112, + "loss": 2.943, + "theoretical_loss": 4.049164066496776, + "tokens_seen": 368524288 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004486459378134403, + "loss": 2.9977, + "theoretical_loss": 4.049082401375397, + "tokens_seen": 368589824 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 482108, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.12469744682312, + "objective/train/theoretical_loss": 4.049021164730373, + "objective/train/tokens_used": 389098976, + "theoretical_loss": 4.049021164730373, + "tokens_seen": 368638976 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044863590772316954, + "loss": 3.2333, + "theoretical_loss": 4.049000754837786, + "tokens_seen": 368655360 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044862587763289867, + "loss": 2.8482, + "theoretical_loss": 4.04891912687641, + "tokens_seen": 368720896 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004486158475426279, + "loss": 2.9425, + "theoretical_loss": 4.048837517483744, + "tokens_seen": 368786432 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004486058174523571, + "loss": 3.0541, + "theoretical_loss": 4.048755926652262, + "tokens_seen": 368851968 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044859578736208627, + "loss": 2.9036, + "theoretical_loss": 4.04867435437445, + "tokens_seen": 368917504 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044858575727181545, + "loss": 3.1034, + "theoretical_loss": 4.0485928006427905, + "tokens_seen": 368983040 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004485757271815447, + "loss": 3.0932, + "theoretical_loss": 4.0485112654497755, + "tokens_seen": 369048576 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004485656970912738, + "loss": 3.0221, + "theoretical_loss": 4.048429748787899, + "tokens_seen": 369114112 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044855566700100304, + "loss": 2.7535, + "theoretical_loss": 4.048348250649662, + "tokens_seen": 369179648 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044854563691073217, + "loss": 3.1027, + "theoretical_loss": 4.048266771027565, + "tokens_seen": 369245184 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004485356068204614, + "loss": 3.3042, + "theoretical_loss": 4.048185309914119, + "tokens_seen": 369310720 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004485255767301906, + "loss": 3.091, + "theoretical_loss": 4.048103867301836, + "tokens_seen": 369376256 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044851554663991977, + "loss": 2.9852, + "theoretical_loss": 4.04802244318323, + "tokens_seen": 369441792 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044850551654964895, + "loss": 3.1036, + "theoretical_loss": 4.047941037550825, + "tokens_seen": 369507328 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044849548645937813, + "loss": 2.9854, + "theoretical_loss": 4.047859650397145, + "tokens_seen": 369572864 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004484854563691073, + "loss": 3.1245, + "theoretical_loss": 4.04777828171472, + "tokens_seen": 369638400 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044847542627883655, + "loss": 2.9639, + "theoretical_loss": 4.047696931496084, + "tokens_seen": 369703936 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004484653961885657, + "loss": 3.0576, + "theoretical_loss": 4.047615599733775, + "tokens_seen": 369769472 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004484553660982949, + "loss": 3.1132, + "theoretical_loss": 4.047534286420337, + "tokens_seen": 369835008 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044844533600802404, + "loss": 3.0274, + "theoretical_loss": 4.047452991548315, + "tokens_seen": 369900544 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044843530591775327, + "loss": 3.2649, + "theoretical_loss": 4.047371715110262, + "tokens_seen": 369966080 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044842527582748245, + "loss": 2.865, + "theoretical_loss": 4.047290457098733, + "tokens_seen": 370031616 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044841524573721163, + "loss": 3.3263, + "theoretical_loss": 4.047209217506289, + "tokens_seen": 370097152 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004484052156469408, + "loss": 2.9046, + "theoretical_loss": 4.047127996325493, + "tokens_seen": 370162688 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044839518555667005, + "loss": 3.1456, + "theoretical_loss": 4.0470467935489145, + "tokens_seen": 370228224 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 483097, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9543488025665283, + "objective/train/theoretical_loss": 4.04698590353978, + "objective/train/tokens_used": 390737376, + "theoretical_loss": 4.04698590353978, + "tokens_seen": 370277376 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004483851554663992, + "loss": 2.8715, + "theoretical_loss": 4.046965609169126, + "tokens_seen": 370293760 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004483751253761284, + "loss": 2.9675, + "theoretical_loss": 4.046884443178706, + "tokens_seen": 370359296 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044836509528585754, + "loss": 3.2167, + "theoretical_loss": 4.046803295570235, + "tokens_seen": 370424832 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004483550651955868, + "loss": 3.1818, + "theoretical_loss": 4.046722166336298, + "tokens_seen": 370490368 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044834503510531596, + "loss": 3.1305, + "theoretical_loss": 4.046641055469488, + "tokens_seen": 370555904 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044833500501504514, + "loss": 2.8626, + "theoretical_loss": 4.046559962962398, + "tokens_seen": 370621440 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004483249749247743, + "loss": 2.9436, + "theoretical_loss": 4.046478888807626, + "tokens_seen": 370686976 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004483149448345035, + "loss": 3.2137, + "theoretical_loss": 4.046397832997776, + "tokens_seen": 370752512 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004483049147442327, + "loss": 3.1725, + "theoretical_loss": 4.046316795525456, + "tokens_seen": 370818048 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004482948846539619, + "loss": 3.1907, + "theoretical_loss": 4.046235776383276, + "tokens_seen": 370883584 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044828485456369104, + "loss": 2.9513, + "theoretical_loss": 4.046154775563854, + "tokens_seen": 370949120 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004482748244734203, + "loss": 3.2516, + "theoretical_loss": 4.046073793059808, + "tokens_seen": 371014656 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004482647943831494, + "loss": 3.1746, + "theoretical_loss": 4.045992828863764, + "tokens_seen": 371080192 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044825476429287864, + "loss": 3.0957, + "theoretical_loss": 4.045911882968351, + "tokens_seen": 371145728 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004482447342026078, + "loss": 3.1395, + "theoretical_loss": 4.045830955366201, + "tokens_seen": 371211264 + }, + { + "epoch": 1.02, + "learning_rate": 0.000448234704112337, + "loss": 3.0624, + "theoretical_loss": 4.045750046049952, + "tokens_seen": 371276800 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004482246740220662, + "loss": 2.9705, + "theoretical_loss": 4.045669155012246, + "tokens_seen": 371342336 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004482146439317954, + "loss": 2.8102, + "theoretical_loss": 4.045588282245728, + "tokens_seen": 371407872 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044820461384152455, + "loss": 3.1322, + "theoretical_loss": 4.045507427743048, + "tokens_seen": 371473408 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004481945837512538, + "loss": 3.0764, + "theoretical_loss": 4.045426591496861, + "tokens_seen": 371538944 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044818455366098296, + "loss": 3.0931, + "theoretical_loss": 4.045345773499825, + "tokens_seen": 371604480 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044817452357071214, + "loss": 3.1696, + "theoretical_loss": 4.045264973744604, + "tokens_seen": 371670016 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004481644934804414, + "loss": 2.9068, + "theoretical_loss": 4.045184192223863, + "tokens_seen": 371735552 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004481544633901705, + "loss": 3.0335, + "theoretical_loss": 4.045103428930275, + "tokens_seen": 371801088 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044814443329989974, + "loss": 3.166, + "theoretical_loss": 4.045022683856517, + "tokens_seen": 371866624 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 483681, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6399648189544678, + "objective/train/theoretical_loss": 4.044962137003555, + "objective/train/tokens_used": 392375776, + "theoretical_loss": 4.044962137003555, + "tokens_seen": 371915776 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044813440320962887, + "loss": 2.9421, + "theoretical_loss": 4.044941956995265, + "tokens_seen": 371932160 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004481243731193581, + "loss": 3.0246, + "theoretical_loss": 4.044861248339206, + "tokens_seen": 371997696 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004481143430290873, + "loss": 3.1591, + "theoretical_loss": 4.044780557881026, + "tokens_seen": 372063232 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044810431293881647, + "loss": 2.8982, + "theoretical_loss": 4.04469988561342, + "tokens_seen": 372128768 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044809428284854565, + "loss": 2.7799, + "theoretical_loss": 4.044619231529083, + "tokens_seen": 372194304 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004480842527582749, + "loss": 3.1344, + "theoretical_loss": 4.0445385956207165, + "tokens_seen": 372259840 + }, + { + "epoch": 1.02, + "learning_rate": 0.000448074222668004, + "loss": 2.8566, + "theoretical_loss": 4.0444579778810255, + "tokens_seen": 372325376 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044806419257773324, + "loss": 2.9545, + "theoretical_loss": 4.04437737830272, + "tokens_seen": 372390912 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044805416248746237, + "loss": 3.1218, + "theoretical_loss": 4.044296796878512, + "tokens_seen": 372456448 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004480441323971916, + "loss": 3.2154, + "theoretical_loss": 4.044216233601119, + "tokens_seen": 372521984 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004480341023069208, + "loss": 3.1728, + "theoretical_loss": 4.044135688463267, + "tokens_seen": 372587520 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044802407221664997, + "loss": 2.8922, + "theoretical_loss": 4.044055161457678, + "tokens_seen": 372653056 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044801404212637915, + "loss": 3.1435, + "theoretical_loss": 4.0439746525770826, + "tokens_seen": 372718592 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044800401203610833, + "loss": 2.858, + "theoretical_loss": 4.043894161814217, + "tokens_seen": 372784128 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004479939819458375, + "loss": 2.9771, + "theoretical_loss": 4.043813689161819, + "tokens_seen": 372849664 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044798395185556675, + "loss": 3.1192, + "theoretical_loss": 4.043733234612633, + "tokens_seen": 372915200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004479739217652959, + "loss": 3.0501, + "theoretical_loss": 4.043652798159405, + "tokens_seen": 372980736 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004479638916750251, + "loss": 3.1939, + "theoretical_loss": 4.0435723797948855, + "tokens_seen": 373046272 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044795386158475424, + "loss": 3.0302, + "theoretical_loss": 4.043491979511831, + "tokens_seen": 373111808 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044794383149448347, + "loss": 2.9471, + "theoretical_loss": 4.043411597303, + "tokens_seen": 373177344 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044793380140421265, + "loss": 2.924, + "theoretical_loss": 4.043331233161158, + "tokens_seen": 373242880 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044792377131394183, + "loss": 2.9933, + "theoretical_loss": 4.043250887079072, + "tokens_seen": 373308416 + }, + { + "epoch": 1.02, + "learning_rate": 0.000447913741223671, + "loss": 3.0149, + "theoretical_loss": 4.043170559049514, + "tokens_seen": 373373952 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044790371113340025, + "loss": 2.4948, + "theoretical_loss": 4.04309024906526, + "tokens_seen": 373439488 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004478936810431294, + "loss": 3.101, + "theoretical_loss": 4.043009957119092, + "tokens_seen": 373505024 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 485048, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.050812244415283, + "objective/train/theoretical_loss": 4.042949749992618, + "objective/train/tokens_used": 394014176, + "theoretical_loss": 4.042949749992618, + "tokens_seen": 373554176 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004478836509528586, + "loss": 3.1085, + "theoretical_loss": 4.042929683203793, + "tokens_seen": 373570560 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044787362086258774, + "loss": 3.0532, + "theoretical_loss": 4.042849427312152, + "tokens_seen": 373636096 + }, + { + "epoch": 1.02, + "learning_rate": 0.000447863590772317, + "loss": 3.0551, + "theoretical_loss": 4.042769189436964, + "tokens_seen": 373701632 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044785356068204616, + "loss": 3.2651, + "theoretical_loss": 4.042688969571023, + "tokens_seen": 373767168 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044784353059177534, + "loss": 3.2799, + "theoretical_loss": 4.042608767707131, + "tokens_seen": 373832704 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004478335005015045, + "loss": 2.9216, + "theoretical_loss": 4.042528583838095, + "tokens_seen": 373898240 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004478234704112337, + "loss": 2.8679, + "theoretical_loss": 4.042448417956722, + "tokens_seen": 373963776 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004478134403209629, + "loss": 3.1769, + "theoretical_loss": 4.042368270055828, + "tokens_seen": 374029312 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004478034102306921, + "loss": 3.1326, + "theoretical_loss": 4.042288140128229, + "tokens_seen": 374094848 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044779338014042124, + "loss": 3.1083, + "theoretical_loss": 4.042208028166748, + "tokens_seen": 374160384 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004477833500501505, + "loss": 3.0138, + "theoretical_loss": 4.04212793416421, + "tokens_seen": 374225920 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004477733199598796, + "loss": 2.8839, + "theoretical_loss": 4.042047858113446, + "tokens_seen": 374291456 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044776328986960884, + "loss": 3.1341, + "theoretical_loss": 4.041967800007289, + "tokens_seen": 374356992 + }, + { + "epoch": 1.02, + "learning_rate": 0.000447753259779338, + "loss": 2.8194, + "theoretical_loss": 4.041887759838579, + "tokens_seen": 374422528 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004477432296890672, + "loss": 3.1354, + "theoretical_loss": 4.041807737600157, + "tokens_seen": 374488064 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004477331995987964, + "loss": 2.9551, + "theoretical_loss": 4.041727733284871, + "tokens_seen": 374553600 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004477231695085256, + "loss": 3.1074, + "theoretical_loss": 4.04164774688557, + "tokens_seen": 374619136 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044771313941825475, + "loss": 3.0928, + "theoretical_loss": 4.04156777839511, + "tokens_seen": 374684672 + }, + { + "epoch": 1.02, + "learning_rate": 0.000447703109327984, + "loss": 2.9912, + "theoretical_loss": 4.041487827806349, + "tokens_seen": 374750208 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004476930792377131, + "loss": 3.1209, + "theoretical_loss": 4.04140789511215, + "tokens_seen": 374815744 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044768304914744234, + "loss": 3.1041, + "theoretical_loss": 4.041327980305381, + "tokens_seen": 374881280 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004476730190571715, + "loss": 3.141, + "theoretical_loss": 4.041248083378912, + "tokens_seen": 374946816 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004476629889669007, + "loss": 3.1131, + "theoretical_loss": 4.041168204325619, + "tokens_seen": 375012352 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004476529588766299, + "loss": 3.1638, + "theoretical_loss": 4.041088343138382, + "tokens_seen": 375077888 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044764292878635907, + "loss": 3.3661, + "theoretical_loss": 4.041008499810082, + "tokens_seen": 375143424 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 485815, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0911853313446045, + "objective/train/theoretical_loss": 4.040948629029507, + "objective/train/tokens_used": 395652576, + "theoretical_loss": 4.040948629029507, + "tokens_seen": 375192576 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044763289869608825, + "loss": 3.0192, + "theoretical_loss": 4.040928674333609, + "tokens_seen": 375208960 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004476228686058175, + "loss": 3.1861, + "theoretical_loss": 4.040848866701853, + "tokens_seen": 375274496 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004476128385155466, + "loss": 3.2503, + "theoretical_loss": 4.04076907690771, + "tokens_seen": 375340032 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044760280842527585, + "loss": 3.1357, + "theoretical_loss": 4.040689304944079, + "tokens_seen": 375405568 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044759277833500503, + "loss": 3.0223, + "theoretical_loss": 4.0406095508038655, + "tokens_seen": 375471104 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004475827482447342, + "loss": 2.9539, + "theoretical_loss": 4.040529814479976, + "tokens_seen": 375536640 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004475727181544634, + "loss": 3.148, + "theoretical_loss": 4.040450095965323, + "tokens_seen": 375602176 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044756268806419257, + "loss": 2.8196, + "theoretical_loss": 4.040370395252821, + "tokens_seen": 375667712 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044755265797392175, + "loss": 3.2455, + "theoretical_loss": 4.040290712335391, + "tokens_seen": 375733248 + }, + { + "epoch": 1.02, + "learning_rate": 0.000447542627883651, + "loss": 3.2465, + "theoretical_loss": 4.0402110472059585, + "tokens_seen": 375798784 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004475325977933801, + "loss": 3.1096, + "theoretical_loss": 4.040131399857449, + "tokens_seen": 375864320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044752256770310935, + "loss": 3.0396, + "theoretical_loss": 4.040051770282796, + "tokens_seen": 375929856 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004475125376128385, + "loss": 3.2297, + "theoretical_loss": 4.039972158474936, + "tokens_seen": 375995392 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004475025075225677, + "loss": 3.0009, + "theoretical_loss": 4.0398925644268076, + "tokens_seen": 376060928 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004474924774322969, + "loss": 2.9304, + "theoretical_loss": 4.039812988131357, + "tokens_seen": 376126464 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004474824473420261, + "loss": 3.1235, + "theoretical_loss": 4.03973342958153, + "tokens_seen": 376192000 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044747241725175526, + "loss": 2.9068, + "theoretical_loss": 4.039653888770282, + "tokens_seen": 376257536 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044746238716148444, + "loss": 3.0073, + "theoretical_loss": 4.039574365690567, + "tokens_seen": 376323072 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004474523570712136, + "loss": 2.8717, + "theoretical_loss": 4.0394948603353455, + "tokens_seen": 376388608 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044744232698094285, + "loss": 2.926, + "theoretical_loss": 4.039415372697583, + "tokens_seen": 376454144 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044743229689067204, + "loss": 2.7771, + "theoretical_loss": 4.039335902770247, + "tokens_seen": 376519680 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004474222668004012, + "loss": 2.9918, + "theoretical_loss": 4.03925645054631, + "tokens_seen": 376585216 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044741223671013045, + "loss": 2.9529, + "theoretical_loss": 4.039177016018749, + "tokens_seen": 376650752 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004474022066198596, + "loss": 2.9578, + "theoretical_loss": 4.039097599180543, + "tokens_seen": 376716288 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004473921765295888, + "loss": 2.9367, + "theoretical_loss": 4.039018200024678, + "tokens_seen": 376781824 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 486601, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.832075834274292, + "objective/train/theoretical_loss": 4.038958662257597, + "objective/train/tokens_used": 397290976, + "theoretical_loss": 4.038958662257597, + "tokens_seen": 376830976 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044738214643931794, + "loss": 2.9877, + "theoretical_loss": 4.038938818544143, + "tokens_seen": 376847360 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004473721163490472, + "loss": 2.8239, + "theoretical_loss": 4.038859454731927, + "tokens_seen": 376912896 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044736208625877636, + "loss": 3.158, + "theoretical_loss": 4.0387801085810295, + "tokens_seen": 376978432 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044735205616850554, + "loss": 3.0955, + "theoretical_loss": 4.038700780084451, + "tokens_seen": 377043968 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004473420260782347, + "loss": 3.0649, + "theoretical_loss": 4.038621469235193, + "tokens_seen": 377109504 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004473319959879639, + "loss": 2.9452, + "theoretical_loss": 4.038542176026267, + "tokens_seen": 377175040 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004473219658976931, + "loss": 3.0891, + "theoretical_loss": 4.038462900450684, + "tokens_seen": 377240576 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004473119358074223, + "loss": 3.1298, + "theoretical_loss": 4.03838364250146, + "tokens_seen": 377306112 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044730190571715144, + "loss": 3.1469, + "theoretical_loss": 4.038304402171616, + "tokens_seen": 377371648 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004472918756268807, + "loss": 3.1611, + "theoretical_loss": 4.038225179454177, + "tokens_seen": 377437184 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004472818455366098, + "loss": 3.165, + "theoretical_loss": 4.038145974342169, + "tokens_seen": 377502720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044727181544633904, + "loss": 2.916, + "theoretical_loss": 4.038066786828626, + "tokens_seen": 377568256 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004472617853560682, + "loss": 2.9843, + "theoretical_loss": 4.037987616906584, + "tokens_seen": 377633792 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004472517552657974, + "loss": 3.0845, + "theoretical_loss": 4.037908464569083, + "tokens_seen": 377699328 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004472417251755266, + "loss": 3.1241, + "theoretical_loss": 4.037829329809167, + "tokens_seen": 377764864 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004472316950852558, + "loss": 3.1601, + "theoretical_loss": 4.037750212619883, + "tokens_seen": 377830400 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044722166499498495, + "loss": 3.0712, + "theoretical_loss": 4.037671112994286, + "tokens_seen": 377895936 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004472116349047142, + "loss": 2.8286, + "theoretical_loss": 4.037592030925429, + "tokens_seen": 377961472 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004472016048144433, + "loss": 2.8222, + "theoretical_loss": 4.037512966406375, + "tokens_seen": 378027008 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044719157472417254, + "loss": 2.9329, + "theoretical_loss": 4.0374339194301845, + "tokens_seen": 378092544 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004471815446339017, + "loss": 3.0938, + "theoretical_loss": 4.037354889989928, + "tokens_seen": 378158080 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004471715145436309, + "loss": 3.2318, + "theoretical_loss": 4.037275878078676, + "tokens_seen": 378223616 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004471614844533601, + "loss": 2.9692, + "theoretical_loss": 4.037196883689504, + "tokens_seen": 378289152 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044715145436308927, + "loss": 3.0049, + "theoretical_loss": 4.037117906815491, + "tokens_seen": 378354688 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044714142427281845, + "loss": 3.2511, + "theoretical_loss": 4.037038947449723, + "tokens_seen": 378420224 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 487344, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.918959140777588, + "objective/train/theoretical_loss": 4.036979739411024, + "objective/train/tokens_used": 398929376, + "theoretical_loss": 4.036979739411024, + "tokens_seen": 378469376 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004471313941825477, + "loss": 3.0742, + "theoretical_loss": 4.036960005585286, + "tokens_seen": 378485760 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004471213640922768, + "loss": 3.0475, + "theoretical_loss": 4.036881081215271, + "tokens_seen": 378551296 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044711133400200605, + "loss": 2.8166, + "theoretical_loss": 4.036802174332774, + "tokens_seen": 378616832 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044710130391173523, + "loss": 3.1203, + "theoretical_loss": 4.036723284930894, + "tokens_seen": 378682368 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004470912738214644, + "loss": 3.0322, + "theoretical_loss": 4.036644413002735, + "tokens_seen": 378747904 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004470812437311936, + "loss": 3.1208, + "theoretical_loss": 4.036565558541404, + "tokens_seen": 378813440 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044707121364092277, + "loss": 2.895, + "theoretical_loss": 4.03648672154001, + "tokens_seen": 378878976 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044706118355065195, + "loss": 3.1827, + "theoretical_loss": 4.03640790199167, + "tokens_seen": 378944512 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004470511534603812, + "loss": 3.1239, + "theoretical_loss": 4.036329099889502, + "tokens_seen": 379010048 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004470411233701103, + "loss": 3.0161, + "theoretical_loss": 4.03625031522663, + "tokens_seen": 379075584 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044703109327983955, + "loss": 3.126, + "theoretical_loss": 4.036171547996179, + "tokens_seen": 379141120 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004470210631895687, + "loss": 3.2521, + "theoretical_loss": 4.036092798191279, + "tokens_seen": 379206656 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004470110330992979, + "loss": 3.1731, + "theoretical_loss": 4.036014065805067, + "tokens_seen": 379272192 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004470010030090271, + "loss": 3.1899, + "theoretical_loss": 4.035935350830679, + "tokens_seen": 379337728 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004469909729187563, + "loss": 3.0432, + "theoretical_loss": 4.035856653261258, + "tokens_seen": 379403264 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044698094282848546, + "loss": 3.1162, + "theoretical_loss": 4.03577797308995, + "tokens_seen": 379468800 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044697091273821464, + "loss": 2.9898, + "theoretical_loss": 4.035699310309906, + "tokens_seen": 379534336 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004469608826479438, + "loss": 3.2695, + "theoretical_loss": 4.035620664914278, + "tokens_seen": 379599872 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044695085255767305, + "loss": 3.017, + "theoretical_loss": 4.035542036896226, + "tokens_seen": 379665408 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004469408224674022, + "loss": 2.9715, + "theoretical_loss": 4.035463426248908, + "tokens_seen": 379730944 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004469307923771314, + "loss": 2.9837, + "theoretical_loss": 4.0353848329654936, + "tokens_seen": 379796480 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004469207622868606, + "loss": 3.0837, + "theoretical_loss": 4.035306257039149, + "tokens_seen": 379862016 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004469107321965898, + "loss": 2.937, + "theoretical_loss": 4.03522769846305, + "tokens_seen": 379927552 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044690070210631896, + "loss": 3.0365, + "theoretical_loss": 4.0351491572303715, + "tokens_seen": 379993088 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044689067201604814, + "loss": 3.1037, + "theoretical_loss": 4.035070633334295, + "tokens_seen": 380058624 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 488451, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9471192359924316, + "objective/train/theoretical_loss": 4.035011751785283, + "objective/train/tokens_used": 400567776, + "theoretical_loss": 4.035011751785283, + "tokens_seen": 380107776 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004468806419257773, + "loss": 3.079, + "theoretical_loss": 4.034992126768006, + "tokens_seen": 380124160 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044687061183550656, + "loss": 2.9451, + "theoretical_loss": 4.034913637524692, + "tokens_seen": 380189696 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004468605817452357, + "loss": 3.2163, + "theoretical_loss": 4.0348351655975465, + "tokens_seen": 380255232 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004468505516549649, + "loss": 2.9821, + "theoretical_loss": 4.0347567109797655, + "tokens_seen": 380320768 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044684052156469405, + "loss": 3.075, + "theoretical_loss": 4.034678273664548, + "tokens_seen": 380386304 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004468304914744233, + "loss": 2.9896, + "theoretical_loss": 4.0345998536451, + "tokens_seen": 380451840 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044682046138415246, + "loss": 2.702, + "theoretical_loss": 4.034521450914628, + "tokens_seen": 380517376 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044681043129388164, + "loss": 2.9938, + "theoretical_loss": 4.034443065466345, + "tokens_seen": 380582912 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004468004012036108, + "loss": 3.0962, + "theoretical_loss": 4.034364697293466, + "tokens_seen": 380648448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044679037111334, + "loss": 3.1166, + "theoretical_loss": 4.034286346389209, + "tokens_seen": 380713984 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004467803410230692, + "loss": 3.0233, + "theoretical_loss": 4.034208012746798, + "tokens_seen": 380779520 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004467703109327984, + "loss": 3.026, + "theoretical_loss": 4.03412969635946, + "tokens_seen": 380845056 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044676028084252755, + "loss": 2.9949, + "theoretical_loss": 4.034051397220427, + "tokens_seen": 380910592 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004467502507522568, + "loss": 3.1725, + "theoretical_loss": 4.033973115322932, + "tokens_seen": 380976128 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044674022066198597, + "loss": 3.2379, + "theoretical_loss": 4.033894850660214, + "tokens_seen": 381041664 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044673019057171515, + "loss": 3.2136, + "theoretical_loss": 4.0338166032255165, + "tokens_seen": 381107200 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044672016048144433, + "loss": 3.0469, + "theoretical_loss": 4.033738373012083, + "tokens_seen": 381172736 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004467101303911735, + "loss": 3.164, + "theoretical_loss": 4.0336601600131665, + "tokens_seen": 381238272 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004467001003009027, + "loss": 3.0613, + "theoretical_loss": 4.033581964222018, + "tokens_seen": 381303808 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004466900702106319, + "loss": 3.239, + "theoretical_loss": 4.033503785631897, + "tokens_seen": 381369344 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004466800401203611, + "loss": 2.5838, + "theoretical_loss": 4.033425624236063, + "tokens_seen": 381434880 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004466700100300903, + "loss": 3.1374, + "theoretical_loss": 4.033347480027782, + "tokens_seen": 381500416 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044665997993981947, + "loss": 2.9139, + "theoretical_loss": 4.033269353000324, + "tokens_seen": 381565952 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044664994984954865, + "loss": 3.1137, + "theoretical_loss": 4.03319124314696, + "tokens_seen": 381631488 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004466399197592779, + "loss": 2.9559, + "theoretical_loss": 4.0331131504609665, + "tokens_seen": 381697024 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 489837, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9685873985290527, + "objective/train/theoretical_loss": 4.033054592208517, + "objective/train/tokens_used": 402206176, + "theoretical_loss": 4.033054592208517, + "tokens_seen": 381746176 + }, + { + "epoch": 1.03, + "learning_rate": 0.000446629889669007, + "loss": 3.1153, + "theoretical_loss": 4.033035074935626, + "tokens_seen": 381762560 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044661985957873625, + "loss": 2.9333, + "theoretical_loss": 4.03295701656422, + "tokens_seen": 381828096 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044660982948846543, + "loss": 2.8328, + "theoretical_loss": 4.0328789753400365, + "tokens_seen": 381893632 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004465997993981946, + "loss": 2.9213, + "theoretical_loss": 4.032800951256369, + "tokens_seen": 381959168 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004465897693079238, + "loss": 2.9061, + "theoretical_loss": 4.032722944306511, + "tokens_seen": 382024704 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044657973921765297, + "loss": 3.0426, + "theoretical_loss": 4.032644954483763, + "tokens_seen": 382090240 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044656970912738215, + "loss": 2.8363, + "theoretical_loss": 4.032566981781427, + "tokens_seen": 382155776 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004465596790371114, + "loss": 3.1766, + "theoretical_loss": 4.03248902619281, + "tokens_seen": 382221312 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004465496489468405, + "loss": 2.9881, + "theoretical_loss": 4.032411087711223, + "tokens_seen": 382286848 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044653961885656975, + "loss": 3.2702, + "theoretical_loss": 4.032333166329979, + "tokens_seen": 382352384 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004465295887662989, + "loss": 2.9304, + "theoretical_loss": 4.032255262042397, + "tokens_seen": 382417920 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004465195586760281, + "loss": 2.8589, + "theoretical_loss": 4.0321773748417975, + "tokens_seen": 382483456 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004465095285857573, + "loss": 2.8999, + "theoretical_loss": 4.032099504721508, + "tokens_seen": 382548992 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004464994984954865, + "loss": 3.0983, + "theoretical_loss": 4.032021651674857, + "tokens_seen": 382614528 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044648946840521566, + "loss": 3.0967, + "theoretical_loss": 4.0319438156951755, + "tokens_seen": 382680064 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044647943831494484, + "loss": 3.0136, + "theoretical_loss": 4.031865996775803, + "tokens_seen": 382745600 + }, + { + "epoch": 1.03, + "learning_rate": 0.000446469408224674, + "loss": 3.0933, + "theoretical_loss": 4.031788194910078, + "tokens_seen": 382811136 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044645937813440325, + "loss": 2.915, + "theoretical_loss": 4.031710410091346, + "tokens_seen": 382876672 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004464493480441324, + "loss": 2.9882, + "theoretical_loss": 4.031632642312955, + "tokens_seen": 382942208 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004464393179538616, + "loss": 3.0723, + "theoretical_loss": 4.031554891568255, + "tokens_seen": 383007744 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004464292878635908, + "loss": 3.1333, + "theoretical_loss": 4.031477157850603, + "tokens_seen": 383073280 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044641925777332, + "loss": 3.0266, + "theoretical_loss": 4.031399441153358, + "tokens_seen": 383138816 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044640922768304916, + "loss": 3.0681, + "theoretical_loss": 4.031321741469881, + "tokens_seen": 383204352 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044639919759277834, + "loss": 3.0067, + "theoretical_loss": 4.031244058793542, + "tokens_seen": 383269888 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004463891675025075, + "loss": 2.9739, + "theoretical_loss": 4.031166393117708, + "tokens_seen": 383335424 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 490505, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2893757820129395, + "objective/train/theoretical_loss": 4.031108155013429, + "objective/train/tokens_used": 403844576, + "theoretical_loss": 4.031108155013429, + "tokens_seen": 383384576 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044637913741223676, + "loss": 3.1077, + "theoretical_loss": 4.031088744435754, + "tokens_seen": 383400960 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004463691073219659, + "loss": 3.155, + "theoretical_loss": 4.0310111127410595, + "tokens_seen": 383466496 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004463590772316951, + "loss": 3.1105, + "theoretical_loss": 4.030933498027005, + "tokens_seen": 383532032 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044634904714142425, + "loss": 3.181, + "theoretical_loss": 4.030855900286974, + "tokens_seen": 383597568 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004463390170511535, + "loss": 2.9759, + "theoretical_loss": 4.030778319514357, + "tokens_seen": 383663104 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044632898696088266, + "loss": 3.1096, + "theoretical_loss": 4.030700755702545, + "tokens_seen": 383728640 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044631895687061184, + "loss": 3.0576, + "theoretical_loss": 4.030623208844937, + "tokens_seen": 383794176 + }, + { + "epoch": 1.03, + "learning_rate": 0.000446308926780341, + "loss": 3.1402, + "theoretical_loss": 4.030545678934931, + "tokens_seen": 383859712 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004462988966900702, + "loss": 3.1986, + "theoretical_loss": 4.030468165965931, + "tokens_seen": 383925248 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004462888665997994, + "loss": 3.0959, + "theoretical_loss": 4.030390669931345, + "tokens_seen": 383990784 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004462788365095286, + "loss": 2.9131, + "theoretical_loss": 4.030313190824582, + "tokens_seen": 384056320 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044626880641925775, + "loss": 3.1382, + "theoretical_loss": 4.030235728639059, + "tokens_seen": 384121856 + }, + { + "epoch": 1.03, + "learning_rate": 0.000446258776328987, + "loss": 3.0705, + "theoretical_loss": 4.030158283368193, + "tokens_seen": 384187392 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044624874623871617, + "loss": 2.8131, + "theoretical_loss": 4.030080855005408, + "tokens_seen": 384252928 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044623871614844535, + "loss": 3.0487, + "theoretical_loss": 4.030003443544127, + "tokens_seen": 384318464 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044622868605817453, + "loss": 3.2123, + "theoretical_loss": 4.0299260489777815, + "tokens_seen": 384384000 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004462186559679037, + "loss": 3.0109, + "theoretical_loss": 4.0298486712998045, + "tokens_seen": 384449536 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004462086258776329, + "loss": 3.0026, + "theoretical_loss": 4.029771310503632, + "tokens_seen": 384515072 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004461985957873621, + "loss": 3.0486, + "theoretical_loss": 4.029693966582705, + "tokens_seen": 384580608 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044618856569709125, + "loss": 3.036, + "theoretical_loss": 4.029616639530467, + "tokens_seen": 384646144 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004461785356068205, + "loss": 2.7565, + "theoretical_loss": 4.029539329340366, + "tokens_seen": 384711680 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004461685055165496, + "loss": 3.1694, + "theoretical_loss": 4.029462036005855, + "tokens_seen": 384777216 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044615847542627885, + "loss": 2.9726, + "theoretical_loss": 4.029384759520386, + "tokens_seen": 384842752 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044614844533600803, + "loss": 2.9842, + "theoretical_loss": 4.0293074998774205, + "tokens_seen": 384908288 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004461384152457372, + "loss": 2.8077, + "theoretical_loss": 4.02923025707042, + "tokens_seen": 384973824 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 491392, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1775639057159424, + "objective/train/theoretical_loss": 4.029172336009841, + "objective/train/tokens_used": 405482976, + "theoretical_loss": 4.029172336009841, + "tokens_seen": 385022976 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004461283851554664, + "loss": 2.9078, + "theoretical_loss": 4.029153031092851, + "tokens_seen": 385039360 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044611835506519563, + "loss": 2.9131, + "theoretical_loss": 4.029075821938182, + "tokens_seen": 385104896 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044610832497492476, + "loss": 3.0688, + "theoretical_loss": 4.028998629599888, + "tokens_seen": 385170432 + }, + { + "epoch": 1.03, + "learning_rate": 0.000446098294884654, + "loss": 3.0794, + "theoretical_loss": 4.028921454071445, + "tokens_seen": 385235968 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004460882647943831, + "loss": 3.0845, + "theoretical_loss": 4.0288442953463335, + "tokens_seen": 385301504 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044607823470411235, + "loss": 3.1702, + "theoretical_loss": 4.028767153418038, + "tokens_seen": 385367040 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044606820461384153, + "loss": 3.2518, + "theoretical_loss": 4.028690028280046, + "tokens_seen": 385432576 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004460581745235707, + "loss": 2.9465, + "theoretical_loss": 4.02861291992585, + "tokens_seen": 385498112 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004460481444332999, + "loss": 3.3349, + "theoretical_loss": 4.028535828348945, + "tokens_seen": 385563648 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004460381143430291, + "loss": 2.9932, + "theoretical_loss": 4.02845875354283, + "tokens_seen": 385629184 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044602808425275826, + "loss": 3.0814, + "theoretical_loss": 4.0283816955010066, + "tokens_seen": 385694720 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004460180541624875, + "loss": 2.9856, + "theoretical_loss": 4.0283046542169805, + "tokens_seen": 385760256 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004460080240722166, + "loss": 3.2441, + "theoretical_loss": 4.028227629684263, + "tokens_seen": 385825792 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044599799398194586, + "loss": 3.0764, + "theoretical_loss": 4.028150621896366, + "tokens_seen": 385891328 + }, + { + "epoch": 1.03, + "learning_rate": 0.000445987963891675, + "loss": 3.0117, + "theoretical_loss": 4.028073630846807, + "tokens_seen": 385956864 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004459779338014042, + "loss": 2.9714, + "theoretical_loss": 4.0279966565291065, + "tokens_seen": 386022400 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004459679037111334, + "loss": 2.8873, + "theoretical_loss": 4.027919698936788, + "tokens_seen": 386087936 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004459578736208626, + "loss": 3.2267, + "theoretical_loss": 4.02784275806338, + "tokens_seen": 386153472 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044594784353059176, + "loss": 3.2652, + "theoretical_loss": 4.027765833902413, + "tokens_seen": 386219008 + }, + { + "epoch": 1.03, + "learning_rate": 0.000445937813440321, + "loss": 3.0323, + "theoretical_loss": 4.027688926447423, + "tokens_seen": 386284544 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004459277833500502, + "loss": 3.0881, + "theoretical_loss": 4.0276120356919485, + "tokens_seen": 386350080 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044591775325977936, + "loss": 3.2264, + "theoretical_loss": 4.02753516162953, + "tokens_seen": 386415616 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044590772316950854, + "loss": 3.1626, + "theoretical_loss": 4.027458304253715, + "tokens_seen": 386481152 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004458976930792377, + "loss": 3.2694, + "theoretical_loss": 4.027381463558052, + "tokens_seen": 386546688 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044588766298896696, + "loss": 2.8776, + "theoretical_loss": 4.027304639536093, + "tokens_seen": 386612224 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 492041, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7662007808685303, + "objective/train/theoretical_loss": 4.027247032457867, + "objective/train/tokens_used": 407121376, + "theoretical_loss": 4.027247032457867, + "tokens_seen": 386661376 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004458776328986961, + "loss": 2.9938, + "theoretical_loss": 4.027227832181397, + "tokens_seen": 386677760 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004458676028084253, + "loss": 2.8942, + "theoretical_loss": 4.027151041487521, + "tokens_seen": 386743296 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044585757271815445, + "loss": 3.0691, + "theoretical_loss": 4.027074267448031, + "tokens_seen": 386808832 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004458475426278837, + "loss": 3.1324, + "theoretical_loss": 4.026997510056493, + "tokens_seen": 386874368 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044583751253761286, + "loss": 3.1412, + "theoretical_loss": 4.026920769306478, + "tokens_seen": 386939904 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044582748244734204, + "loss": 2.9207, + "theoretical_loss": 4.0268440451915595, + "tokens_seen": 387005440 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004458174523570712, + "loss": 2.9868, + "theoretical_loss": 4.0267673377053175, + "tokens_seen": 387070976 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004458074222668004, + "loss": 3.0294, + "theoretical_loss": 4.026690646841332, + "tokens_seen": 387136512 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004457973921765296, + "loss": 2.9402, + "theoretical_loss": 4.026613972593188, + "tokens_seen": 387202048 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004457873620862588, + "loss": 3.1123, + "theoretical_loss": 4.026537314954475, + "tokens_seen": 387267584 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044577733199598795, + "loss": 3.0472, + "theoretical_loss": 4.026460673918785, + "tokens_seen": 387333120 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004457673019057172, + "loss": 2.9787, + "theoretical_loss": 4.026384049479713, + "tokens_seen": 387398656 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044575727181544637, + "loss": 2.9231, + "theoretical_loss": 4.026307441630858, + "tokens_seen": 387464192 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044574724172517555, + "loss": 2.9785, + "theoretical_loss": 4.026230850365824, + "tokens_seen": 387529728 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044573721163490473, + "loss": 3.0627, + "theoretical_loss": 4.026154275678216, + "tokens_seen": 387595264 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004457271815446339, + "loss": 3.1657, + "theoretical_loss": 4.0260777175616465, + "tokens_seen": 387660800 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004457171514543631, + "loss": 2.9694, + "theoretical_loss": 4.026001176009726, + "tokens_seen": 387726336 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004457071213640923, + "loss": 3.1933, + "theoretical_loss": 4.025924651016073, + "tokens_seen": 387791872 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044569709127382145, + "loss": 2.8754, + "theoretical_loss": 4.025848142574307, + "tokens_seen": 387857408 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004456870611835507, + "loss": 3.1072, + "theoretical_loss": 4.025771650678053, + "tokens_seen": 387922944 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004456770310932798, + "loss": 3.3678, + "theoretical_loss": 4.025695175320939, + "tokens_seen": 387988480 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044566700100300905, + "loss": 3.0147, + "theoretical_loss": 4.025618716496595, + "tokens_seen": 388054016 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044565697091273823, + "loss": 3.3046, + "theoretical_loss": 4.025542274198656, + "tokens_seen": 388119552 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004456469408224674, + "loss": 3.2766, + "theoretical_loss": 4.025465848420762, + "tokens_seen": 388185088 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004456369107321966, + "loss": 3.0493, + "theoretical_loss": 4.025389439156552, + "tokens_seen": 388250624 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 492346, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0495290756225586, + "objective/train/theoretical_loss": 4.025332143041677, + "objective/train/tokens_used": 408759776, + "theoretical_loss": 4.025332143041677, + "tokens_seen": 388299776 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044562688064192583, + "loss": 3.0685, + "theoretical_loss": 4.025313046399672, + "tokens_seen": 388316160 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044561685055165496, + "loss": 3.2269, + "theoretical_loss": 4.025236670143771, + "tokens_seen": 388381696 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004456068204613842, + "loss": 3.2541, + "theoretical_loss": 4.025160310382502, + "tokens_seen": 388447232 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004455967903711133, + "loss": 2.9198, + "theoretical_loss": 4.02508396710952, + "tokens_seen": 388512768 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044558676028084255, + "loss": 2.9715, + "theoretical_loss": 4.025007640318485, + "tokens_seen": 388578304 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044557673019057173, + "loss": 3.109, + "theoretical_loss": 4.024931330003058, + "tokens_seen": 388643840 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004455667001003009, + "loss": 3.1998, + "theoretical_loss": 4.024855036156907, + "tokens_seen": 388709376 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004455566700100301, + "loss": 3.1695, + "theoretical_loss": 4.024778758773702, + "tokens_seen": 388774912 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004455466399197593, + "loss": 2.9735, + "theoretical_loss": 4.024702497847115, + "tokens_seen": 388840448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044553660982948846, + "loss": 3.1217, + "theoretical_loss": 4.0246262533708235, + "tokens_seen": 388905984 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004455265797392177, + "loss": 3.1701, + "theoretical_loss": 4.024550025338508, + "tokens_seen": 388971520 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004455165496489468, + "loss": 3.2158, + "theoretical_loss": 4.024473813743851, + "tokens_seen": 389037056 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044550651955867606, + "loss": 3.3209, + "theoretical_loss": 4.024397618580542, + "tokens_seen": 389102592 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004454964894684052, + "loss": 3.1834, + "theoretical_loss": 4.02432143984227, + "tokens_seen": 389168128 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004454864593781344, + "loss": 2.9146, + "theoretical_loss": 4.024245277522729, + "tokens_seen": 389233664 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004454764292878636, + "loss": 3.3165, + "theoretical_loss": 4.0241691316156185, + "tokens_seen": 389299200 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004454663991975928, + "loss": 3.4463, + "theoretical_loss": 4.024093002114638, + "tokens_seen": 389364736 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044545636910732196, + "loss": 3.1539, + "theoretical_loss": 4.0240168890134935, + "tokens_seen": 389430272 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004454463390170512, + "loss": 3.547, + "theoretical_loss": 4.0239407923058925, + "tokens_seen": 389495808 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004454363089267803, + "loss": 3.1451, + "theoretical_loss": 4.023864711985547, + "tokens_seen": 389561344 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044542627883650956, + "loss": 3.1436, + "theoretical_loss": 4.023788648046171, + "tokens_seen": 389626880 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004454162487462387, + "loss": 2.9621, + "theoretical_loss": 4.023712600481484, + "tokens_seen": 389692416 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004454062186559679, + "loss": 3.042, + "theoretical_loss": 4.023636569285208, + "tokens_seen": 389757952 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004453961885656971, + "loss": 3.1537, + "theoretical_loss": 4.023560554451068, + "tokens_seen": 389823488 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004453861584754263, + "loss": 3.1769, + "theoretical_loss": 4.023484555972794, + "tokens_seen": 389889024 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 492346, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5270936489105225, + "objective/train/theoretical_loss": 4.0234275678438545, + "objective/train/tokens_used": 410398176, + "theoretical_loss": 4.0234275678438545, + "tokens_seen": 389938176 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044537612838515547, + "loss": 3.1988, + "theoretical_loss": 4.023408573844117, + "tokens_seen": 389954560 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044536609829488465, + "loss": 2.8549, + "theoretical_loss": 4.023332608058775, + "tokens_seen": 390020096 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044535606820461383, + "loss": 3.1868, + "theoretical_loss": 4.023256658610505, + "tokens_seen": 390085632 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044534603811434306, + "loss": 3.2635, + "theoretical_loss": 4.02318072549305, + "tokens_seen": 390151168 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004453360080240722, + "loss": 3.1127, + "theoretical_loss": 4.0231048087001575, + "tokens_seen": 390216704 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004453259779338014, + "loss": 3.2178, + "theoretical_loss": 4.023028908225577, + "tokens_seen": 390282240 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044531594784353055, + "loss": 3.1681, + "theoretical_loss": 4.02295302406306, + "tokens_seen": 390347776 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004453059177532598, + "loss": 3.1644, + "theoretical_loss": 4.0228771562063645, + "tokens_seen": 390413312 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044529588766298897, + "loss": 3.3908, + "theoretical_loss": 4.022801304649251, + "tokens_seen": 390478848 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044528585757271815, + "loss": 3.2459, + "theoretical_loss": 4.022725469385481, + "tokens_seen": 390544384 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044527582748244733, + "loss": 2.9103, + "theoretical_loss": 4.022649650408823, + "tokens_seen": 390609920 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044526579739217657, + "loss": 3.2225, + "theoretical_loss": 4.022573847713048, + "tokens_seen": 390675456 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004452557673019057, + "loss": 2.9859, + "theoretical_loss": 4.022498061291927, + "tokens_seen": 390740992 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044524573721163493, + "loss": 3.1762, + "theoretical_loss": 4.022422291139238, + "tokens_seen": 390806528 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044523570712136406, + "loss": 3.3121, + "theoretical_loss": 4.022346537248763, + "tokens_seen": 390872064 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004452256770310933, + "loss": 3.2159, + "theoretical_loss": 4.022270799614285, + "tokens_seen": 390937600 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044521564694082247, + "loss": 3.1719, + "theoretical_loss": 4.02219507822959, + "tokens_seen": 391003136 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044520561685055165, + "loss": 3.325, + "theoretical_loss": 4.022119373088472, + "tokens_seen": 391068672 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044519558676028083, + "loss": 3.2354, + "theoretical_loss": 4.022043684184723, + "tokens_seen": 391134208 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044518555667001, + "loss": 3.0924, + "theoretical_loss": 4.021968011512141, + "tokens_seen": 391199744 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044517552657973925, + "loss": 3.1403, + "theoretical_loss": 4.0218923550645265, + "tokens_seen": 391265280 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044516549648946843, + "loss": 3.2618, + "theoretical_loss": 4.021816714835685, + "tokens_seen": 391330816 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004451554663991976, + "loss": 3.1097, + "theoretical_loss": 4.0217410908194235, + "tokens_seen": 391396352 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004451454363089268, + "loss": 3.0826, + "theoretical_loss": 4.021665483009555, + "tokens_seen": 391461888 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044513540621865603, + "loss": 3.325, + "theoretical_loss": 4.021589891399891, + "tokens_seen": 391527424 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 493102, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2480216026306152, + "objective/train/theoretical_loss": 4.02153320832031, + "objective/train/tokens_used": 412036576, + "theoretical_loss": 4.02153320832031, + "tokens_seen": 391576576 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044512537612838516, + "loss": 3.1819, + "theoretical_loss": 4.021514315984252, + "tokens_seen": 391592960 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004451153460381144, + "loss": 3.0459, + "theoretical_loss": 4.021438756756459, + "tokens_seen": 391658496 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004451053159478435, + "loss": 3.2661, + "theoretical_loss": 4.021363213710337, + "tokens_seen": 391724032 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044509528585757275, + "loss": 3.1196, + "theoretical_loss": 4.021287686839712, + "tokens_seen": 391789568 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044508525576730194, + "loss": 3.0289, + "theoretical_loss": 4.021212176138419, + "tokens_seen": 391855104 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004450752256770311, + "loss": 3.4092, + "theoretical_loss": 4.02113668160029, + "tokens_seen": 391920640 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004450651955867603, + "loss": 3.0095, + "theoretical_loss": 4.0210612032191655, + "tokens_seen": 391986176 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004450551654964895, + "loss": 3.417, + "theoretical_loss": 4.020985740988887, + "tokens_seen": 392051712 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044504513540621866, + "loss": 3.3539, + "theoretical_loss": 4.020910294903298, + "tokens_seen": 392117248 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004450351053159479, + "loss": 3.3295, + "theoretical_loss": 4.020834864956248, + "tokens_seen": 392182784 + }, + { + "epoch": 1.03, + "learning_rate": 0.000445025075225677, + "loss": 3.4475, + "theoretical_loss": 4.020759451141589, + "tokens_seen": 392248320 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044501504513540626, + "loss": 3.405, + "theoretical_loss": 4.020684053453176, + "tokens_seen": 392313856 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004450050150451354, + "loss": 3.3016, + "theoretical_loss": 4.020608671884868, + "tokens_seen": 392379392 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004449949849548646, + "loss": 3.223, + "theoretical_loss": 4.020533306430527, + "tokens_seen": 392444928 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004449849548645938, + "loss": 3.0813, + "theoretical_loss": 4.020457957084017, + "tokens_seen": 392510464 + }, + { + "epoch": 1.03, + "learning_rate": 0.000444974924774323, + "loss": 3.2733, + "theoretical_loss": 4.0203826238392075, + "tokens_seen": 392576000 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044496489468405216, + "loss": 3.4012, + "theoretical_loss": 4.020307306689972, + "tokens_seen": 392641536 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004449548645937814, + "loss": 2.9446, + "theoretical_loss": 4.020232005630183, + "tokens_seen": 392707072 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004449448345035105, + "loss": 2.8994, + "theoretical_loss": 4.020156720653722, + "tokens_seen": 392772608 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044493480441323976, + "loss": 3.1714, + "theoretical_loss": 4.020081451754469, + "tokens_seen": 392838144 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004449247743229689, + "loss": 3.33, + "theoretical_loss": 4.0200061989263105, + "tokens_seen": 392903680 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004449147442326981, + "loss": 3.2939, + "theoretical_loss": 4.019930962163135, + "tokens_seen": 392969216 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004449047141424273, + "loss": 3.3044, + "theoretical_loss": 4.019855741458835, + "tokens_seen": 393034752 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004448946840521565, + "loss": 3.1703, + "theoretical_loss": 4.019780536807305, + "tokens_seen": 393100288 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044488465396188567, + "loss": 3.1619, + "theoretical_loss": 4.019705348202446, + "tokens_seen": 393165824 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 494542, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9207966327667236, + "objective/train/theoretical_loss": 4.019648967275759, + "objective/train/tokens_used": 413674976, + "theoretical_loss": 4.019648967275759, + "tokens_seen": 393214976 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044487462387161485, + "loss": 3.1161, + "theoretical_loss": 4.019630175638158, + "tokens_seen": 393231360 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044486459378134403, + "loss": 3.1258, + "theoretical_loss": 4.0195550191083464, + "tokens_seen": 393296896 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044485456369107326, + "loss": 3.12, + "theoretical_loss": 4.019479878606921, + "tokens_seen": 393362432 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004448445336008024, + "loss": 3.2797, + "theoretical_loss": 4.019404754127793, + "tokens_seen": 393427968 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004448345035105316, + "loss": 3.1239, + "theoretical_loss": 4.01932964566488, + "tokens_seen": 393493504 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044482447342026075, + "loss": 3.118, + "theoretical_loss": 4.0192545532120985, + "tokens_seen": 393559040 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044481444332999, + "loss": 3.1753, + "theoretical_loss": 4.01917947676337, + "tokens_seen": 393624576 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044480441323971917, + "loss": 3.1646, + "theoretical_loss": 4.019104416312623, + "tokens_seen": 393690112 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044479438314944835, + "loss": 3.0934, + "theoretical_loss": 4.019029371853784, + "tokens_seen": 393755648 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044478435305917753, + "loss": 3.1208, + "theoretical_loss": 4.018954343380786, + "tokens_seen": 393821184 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044477432296890677, + "loss": 3.0607, + "theoretical_loss": 4.018879330887563, + "tokens_seen": 393886720 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004447642928786359, + "loss": 3.2782, + "theoretical_loss": 4.018804334368055, + "tokens_seen": 393952256 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044475426278836513, + "loss": 3.16, + "theoretical_loss": 4.018729353816203, + "tokens_seen": 394017792 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044474423269809426, + "loss": 3.2467, + "theoretical_loss": 4.018654389225954, + "tokens_seen": 394083328 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004447342026078235, + "loss": 3.3955, + "theoretical_loss": 4.018579440591255, + "tokens_seen": 394148864 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044472417251755267, + "loss": 3.2859, + "theoretical_loss": 4.018504507906059, + "tokens_seen": 394214400 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044471414242728185, + "loss": 3.3852, + "theoretical_loss": 4.0184295911643195, + "tokens_seen": 394279936 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044470411233701103, + "loss": 3.437, + "theoretical_loss": 4.018354690359996, + "tokens_seen": 394345472 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004446940822467402, + "loss": 3.277, + "theoretical_loss": 4.01827980548705, + "tokens_seen": 394411008 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004446840521564694, + "loss": 3.3371, + "theoretical_loss": 4.018204936539448, + "tokens_seen": 394476544 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044467402206619863, + "loss": 3.2478, + "theoretical_loss": 4.018130083511156, + "tokens_seen": 394542080 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044466399197592776, + "loss": 3.2044, + "theoretical_loss": 4.018055246396147, + "tokens_seen": 394607616 + }, + { + "epoch": 1.03, + "learning_rate": 0.000444653961885657, + "loss": 3.2246, + "theoretical_loss": 4.017980425188396, + "tokens_seen": 394673152 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004446439317953861, + "loss": 3.0988, + "theoretical_loss": 4.017905619881881, + "tokens_seen": 394738688 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044463390170511536, + "loss": 3.0712, + "theoretical_loss": 4.017830830470583, + "tokens_seen": 394804224 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 495175, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2577452659606934, + "objective/train/theoretical_loss": 4.017774748839727, + "objective/train/tokens_used": 415313376, + "theoretical_loss": 4.017774748839727, + "tokens_seen": 394853376 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044462387161484454, + "loss": 3.1142, + "theoretical_loss": 4.017756056948487, + "tokens_seen": 394869760 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004446138415245737, + "loss": 3.3469, + "theoretical_loss": 4.017681299309582, + "tokens_seen": 394935296 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004446038114343029, + "loss": 3.1336, + "theoretical_loss": 4.017606557547858, + "tokens_seen": 395000832 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044459378134403214, + "loss": 2.8472, + "theoretical_loss": 4.01753183165731, + "tokens_seen": 395066368 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044458375125376126, + "loss": 3.1497, + "theoretical_loss": 4.017457121631936, + "tokens_seen": 395131904 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004445737211634905, + "loss": 3.2117, + "theoretical_loss": 4.017382427465737, + "tokens_seen": 395197440 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004445636910732196, + "loss": 2.946, + "theoretical_loss": 4.017307749152717, + "tokens_seen": 395262976 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044455366098294886, + "loss": 3.2121, + "theoretical_loss": 4.017233086686884, + "tokens_seen": 395328512 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044454363089267804, + "loss": 2.9614, + "theoretical_loss": 4.017158440062249, + "tokens_seen": 395394048 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004445336008024072, + "loss": 3.1452, + "theoretical_loss": 4.017083809272826, + "tokens_seen": 395459584 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004445235707121364, + "loss": 2.9447, + "theoretical_loss": 4.017009194312633, + "tokens_seen": 395525120 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004445135406218656, + "loss": 2.99, + "theoretical_loss": 4.016934595175689, + "tokens_seen": 395590656 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044450351053159477, + "loss": 3.0288, + "theoretical_loss": 4.01686001185602, + "tokens_seen": 395656192 + }, + { + "epoch": 1.03, + "learning_rate": 0.000444493480441324, + "loss": 3.3945, + "theoretical_loss": 4.016785444347652, + "tokens_seen": 395721728 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044448345035105313, + "loss": 3.2868, + "theoretical_loss": 4.016710892644614, + "tokens_seen": 395787264 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044447342026078236, + "loss": 3.281, + "theoretical_loss": 4.016636356740943, + "tokens_seen": 395852800 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004444633901705115, + "loss": 3.1532, + "theoretical_loss": 4.016561836630672, + "tokens_seen": 395918336 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004444533600802407, + "loss": 3.2487, + "theoretical_loss": 4.016487332307844, + "tokens_seen": 395983872 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044444332998996996, + "loss": 3.1453, + "theoretical_loss": 4.016412843766502, + "tokens_seen": 396049408 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004444332998996991, + "loss": 3.1682, + "theoretical_loss": 4.016338371000691, + "tokens_seen": 396114944 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004444232698094283, + "loss": 3.1292, + "theoretical_loss": 4.016263914004462, + "tokens_seen": 396180480 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004444132397191575, + "loss": 3.4037, + "theoretical_loss": 4.016189472771868, + "tokens_seen": 396246016 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004444032096288867, + "loss": 3.0772, + "theoretical_loss": 4.016115047296965, + "tokens_seen": 396311552 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044439317953861587, + "loss": 3.2491, + "theoretical_loss": 4.016040637573812, + "tokens_seen": 396377088 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044438314944834505, + "loss": 3.2143, + "theoretical_loss": 4.0159662435964725, + "tokens_seen": 396442624 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 496403, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9757843017578125, + "objective/train/theoretical_loss": 4.015910458443088, + "objective/train/tokens_used": 416951776, + "theoretical_loss": 4.015910458443088, + "tokens_seen": 396491776 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044437311935807423, + "loss": 2.9239, + "theoretical_loss": 4.015891865359012, + "tokens_seen": 396508160 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044436308926780346, + "loss": 2.9201, + "theoretical_loss": 4.0158175028555, + "tokens_seen": 396573696 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004443530591775326, + "loss": 3.2929, + "theoretical_loss": 4.015743156080008, + "tokens_seen": 396639232 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004443430290872618, + "loss": 2.9639, + "theoretical_loss": 4.015668825026613, + "tokens_seen": 396704768 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044433299899699095, + "loss": 3.1691, + "theoretical_loss": 4.015594509689393, + "tokens_seen": 396770304 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004443229689067202, + "loss": 3.0509, + "theoretical_loss": 4.015520210062429, + "tokens_seen": 396835840 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044431293881644937, + "loss": 3.3591, + "theoretical_loss": 4.015445926139808, + "tokens_seen": 396901376 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044430290872617855, + "loss": 3.1861, + "theoretical_loss": 4.015371657915617, + "tokens_seen": 396966912 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044429287863590773, + "loss": 3.2149, + "theoretical_loss": 4.015297405383948, + "tokens_seen": 397032448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044428284854563697, + "loss": 3.2117, + "theoretical_loss": 4.015223168538896, + "tokens_seen": 397097984 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004442728184553661, + "loss": 3.1278, + "theoretical_loss": 4.015148947374559, + "tokens_seen": 397163520 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044426278836509533, + "loss": 3.1822, + "theoretical_loss": 4.015074741885038, + "tokens_seen": 397229056 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044425275827482446, + "loss": 3.0979, + "theoretical_loss": 4.015000552064437, + "tokens_seen": 397294592 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004442427281845537, + "loss": 3.0718, + "theoretical_loss": 4.014926377906864, + "tokens_seen": 397360128 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044423269809428287, + "loss": 3.0584, + "theoretical_loss": 4.01485221940643, + "tokens_seen": 397425664 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044422266800401205, + "loss": 3.2676, + "theoretical_loss": 4.014778076557249, + "tokens_seen": 397491200 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044421263791374123, + "loss": 3.1287, + "theoretical_loss": 4.0147039493534376, + "tokens_seen": 397556736 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004442026078234704, + "loss": 3.184, + "theoretical_loss": 4.014629837789116, + "tokens_seen": 397622272 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004441925777331996, + "loss": 3.3226, + "theoretical_loss": 4.014555741858407, + "tokens_seen": 397687808 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044418254764292883, + "loss": 3.1903, + "theoretical_loss": 4.014481661555439, + "tokens_seen": 397753344 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044417251755265796, + "loss": 3.1214, + "theoretical_loss": 4.0144075968743405, + "tokens_seen": 397818880 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004441624874623872, + "loss": 3.2588, + "theoretical_loss": 4.014333547809246, + "tokens_seen": 397884416 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004441524573721163, + "loss": 3.0024, + "theoretical_loss": 4.01425951435429, + "tokens_seen": 397949952 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044414242728184556, + "loss": 3.0515, + "theoretical_loss": 4.014185496503612, + "tokens_seen": 398015488 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044413239719157474, + "loss": 3.0877, + "theoretical_loss": 4.014111494251355, + "tokens_seen": 398081024 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 497345, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9470760822296143, + "objective/train/theoretical_loss": 4.014056002795105, + "objective/train/tokens_used": 418590176, + "theoretical_loss": 4.014056002795105, + "tokens_seen": 398130176 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004441223671013039, + "loss": 3.09, + "theoretical_loss": 4.014037507591665, + "tokens_seen": 398146560 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004441123370110331, + "loss": 3.3322, + "theoretical_loss": 4.01396353651869, + "tokens_seen": 398212096 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044410230692076234, + "loss": 3.3269, + "theoretical_loss": 4.013889581026583, + "tokens_seen": 398277632 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044409227683049146, + "loss": 3.1256, + "theoretical_loss": 4.013815641109498, + "tokens_seen": 398343168 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004440822467402207, + "loss": 2.9061, + "theoretical_loss": 4.0137417167615945, + "tokens_seen": 398408704 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004440722166499498, + "loss": 2.7541, + "theoretical_loss": 4.013667807977033, + "tokens_seen": 398474240 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044406218655967906, + "loss": 2.8525, + "theoretical_loss": 4.013593914749977, + "tokens_seen": 398539776 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044405215646940824, + "loss": 3.2677, + "theoretical_loss": 4.013520037074597, + "tokens_seen": 398605312 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004440421263791374, + "loss": 3.069, + "theoretical_loss": 4.013446174945062, + "tokens_seen": 398670848 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004440320962888666, + "loss": 3.0376, + "theoretical_loss": 4.013372328355547, + "tokens_seen": 398736384 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004440220661985958, + "loss": 3.0581, + "theoretical_loss": 4.0132984973002275, + "tokens_seen": 398801920 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044401203610832497, + "loss": 3.1568, + "theoretical_loss": 4.013224681773286, + "tokens_seen": 398867456 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004440020060180542, + "loss": 3.2621, + "theoretical_loss": 4.0131508817689046, + "tokens_seen": 398932992 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044399197592778333, + "loss": 3.0157, + "theoretical_loss": 4.013077097281269, + "tokens_seen": 398998528 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044398194583751256, + "loss": 3.3744, + "theoretical_loss": 4.013003328304571, + "tokens_seen": 399064064 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004439719157472417, + "loss": 3.2167, + "theoretical_loss": 4.012929574833002, + "tokens_seen": 399129600 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004439618856569709, + "loss": 3.2892, + "theoretical_loss": 4.012855836860759, + "tokens_seen": 399195136 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004439518555667001, + "loss": 3.3375, + "theoretical_loss": 4.01278211438204, + "tokens_seen": 399260672 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004439418254764293, + "loss": 3.1236, + "theoretical_loss": 4.012708407391049, + "tokens_seen": 399326208 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044393179538615847, + "loss": 2.9568, + "theoretical_loss": 4.012634715881989, + "tokens_seen": 399391744 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004439217652958877, + "loss": 3.2874, + "theoretical_loss": 4.012561039849069, + "tokens_seen": 399457280 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044391173520561683, + "loss": 3.1477, + "theoretical_loss": 4.012487379286502, + "tokens_seen": 399522816 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044390170511534607, + "loss": 2.9509, + "theoretical_loss": 4.012413734188501, + "tokens_seen": 399588352 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004438916750250752, + "loss": 3.1483, + "theoretical_loss": 4.012340104549285, + "tokens_seen": 399653888 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044388164493480443, + "loss": 3.11, + "theoretical_loss": 4.012266490363073, + "tokens_seen": 399719424 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 502454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3102781772613525, + "objective/train/theoretical_loss": 4.012211289860975, + "objective/train/tokens_used": 420228576, + "theoretical_loss": 4.012211289860975, + "tokens_seen": 399768576 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004438716148445336, + "loss": 3.2869, + "theoretical_loss": 4.012192891624092, + "tokens_seen": 399784960 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004438615847542628, + "loss": 2.9235, + "theoretical_loss": 4.012119308326567, + "tokens_seen": 399850496 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044385155466399197, + "loss": 3.0863, + "theoretical_loss": 4.012045740464728, + "tokens_seen": 399916032 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044384152457372115, + "loss": 3.0504, + "theoretical_loss": 4.01197218803281, + "tokens_seen": 399981568 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044383149448345033, + "loss": 3.1842, + "theoretical_loss": 4.011898651025048, + "tokens_seen": 400047104 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044382146439317957, + "loss": 3.1272, + "theoretical_loss": 4.011825129435683, + "tokens_seen": 400112640 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004438114343029087, + "loss": 3.1282, + "theoretical_loss": 4.011751623258955, + "tokens_seen": 400178176 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044380140421263793, + "loss": 3.0844, + "theoretical_loss": 4.011678132489112, + "tokens_seen": 400243712 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004437913741223671, + "loss": 2.8656, + "theoretical_loss": 4.011604657120401, + "tokens_seen": 400309248 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004437813440320963, + "loss": 3.2295, + "theoretical_loss": 4.011531197147076, + "tokens_seen": 400374784 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004437713139418255, + "loss": 3.0765, + "theoretical_loss": 4.011457752563391, + "tokens_seen": 400440320 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044376128385155466, + "loss": 3.2233, + "theoretical_loss": 4.011384323363603, + "tokens_seen": 400505856 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044375125376128384, + "loss": 3.3812, + "theoretical_loss": 4.011310909541975, + "tokens_seen": 400571392 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044374122367101307, + "loss": 3.0616, + "theoretical_loss": 4.011237511092769, + "tokens_seen": 400636928 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004437311935807422, + "loss": 3.1718, + "theoretical_loss": 4.011164128010255, + "tokens_seen": 400702464 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044372116349047143, + "loss": 3.0474, + "theoretical_loss": 4.011090760288701, + "tokens_seen": 400768000 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044371113340020056, + "loss": 3.0441, + "theoretical_loss": 4.011017407922381, + "tokens_seen": 400833536 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004437011033099298, + "loss": 3.1871, + "theoretical_loss": 4.010944070905572, + "tokens_seen": 400899072 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044369107321965903, + "loss": 3.3112, + "theoretical_loss": 4.010870749232554, + "tokens_seen": 400964608 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044368104312938816, + "loss": 2.9837, + "theoretical_loss": 4.0107974428976085, + "tokens_seen": 401030144 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004436710130391174, + "loss": 3.4095, + "theoretical_loss": 4.010724151895022, + "tokens_seen": 401095680 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004436609829488465, + "loss": 3.2873, + "theoretical_loss": 4.010650876219083, + "tokens_seen": 401161216 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044365095285857576, + "loss": 3.1268, + "theoretical_loss": 4.010577615864083, + "tokens_seen": 401226752 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044364092276830494, + "loss": 3.0925, + "theoretical_loss": 4.010504370824316, + "tokens_seen": 401292288 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004436308926780341, + "loss": 3.0146, + "theoretical_loss": 4.010431141094083, + "tokens_seen": 401357824 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 507639, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0078208446502686, + "objective/train/theoretical_loss": 4.01037622883986, + "objective/train/tokens_used": 421866976, + "theoretical_loss": 4.01037622883986, + "tokens_seen": 401406976 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004436208625877633, + "loss": 2.9291, + "theoretical_loss": 4.010357926667682, + "tokens_seen": 401423360 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044361083249749254, + "loss": 3.3317, + "theoretical_loss": 4.0102847275394184, + "tokens_seen": 401488896 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044360080240722166, + "loss": 2.7376, + "theoretical_loss": 4.0102115437035994, + "tokens_seen": 401554432 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004435907723169509, + "loss": 3.2031, + "theoretical_loss": 4.0101383751545345, + "tokens_seen": 401619968 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044358074222668, + "loss": 3.1482, + "theoretical_loss": 4.010065221886537, + "tokens_seen": 401685504 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044357071213640926, + "loss": 3.0891, + "theoretical_loss": 4.0099920838939225, + "tokens_seen": 401751040 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044356068204613844, + "loss": 3.0714, + "theoretical_loss": 4.009918961171012, + "tokens_seen": 401816576 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004435506519558676, + "loss": 3.1246, + "theoretical_loss": 4.009845853712126, + "tokens_seen": 401882112 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004435406218655968, + "loss": 3.2701, + "theoretical_loss": 4.0097727615115915, + "tokens_seen": 401947648 + }, + { + "epoch": 1.03, + "learning_rate": 0.000443530591775326, + "loss": 3.082, + "theoretical_loss": 4.009699684563736, + "tokens_seen": 402013184 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044352056168505517, + "loss": 3.2898, + "theoretical_loss": 4.00962662286289, + "tokens_seen": 402078720 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004435105315947844, + "loss": 3.0541, + "theoretical_loss": 4.00955357640339, + "tokens_seen": 402144256 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044350050150451353, + "loss": 2.9924, + "theoretical_loss": 4.009480545179572, + "tokens_seen": 402209792 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044349047141424276, + "loss": 2.8212, + "theoretical_loss": 4.0094075291857765, + "tokens_seen": 402275328 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004434804413239719, + "loss": 2.8057, + "theoretical_loss": 4.009334528416347, + "tokens_seen": 402340864 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004434704112337011, + "loss": 3.1605, + "theoretical_loss": 4.009261542865631, + "tokens_seen": 402406400 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004434603811434303, + "loss": 3.3208, + "theoretical_loss": 4.009188572527977, + "tokens_seen": 402471936 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004434503510531595, + "loss": 3.2758, + "theoretical_loss": 4.009115617397739, + "tokens_seen": 402537472 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044344032096288867, + "loss": 3.1012, + "theoretical_loss": 4.009042677469271, + "tokens_seen": 402603008 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004434302908726179, + "loss": 3.0349, + "theoretical_loss": 4.008969752736932, + "tokens_seen": 402668544 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044342026078234703, + "loss": 3.3604, + "theoretical_loss": 4.0088968431950835, + "tokens_seen": 402734080 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044341023069207627, + "loss": 3.0932, + "theoretical_loss": 4.008823948838091, + "tokens_seen": 402799616 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004434002006018054, + "loss": 3.0842, + "theoretical_loss": 4.008751069660322, + "tokens_seen": 402865152 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044339017051153463, + "loss": 3.1979, + "theoretical_loss": 4.008678205656146, + "tokens_seen": 402930688 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004433801404212638, + "loss": 3.2418, + "theoretical_loss": 4.008605356819937, + "tokens_seen": 402996224 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 512662, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.06799578666687, + "objective/train/theoretical_loss": 4.0085507301433765, + "objective/train/tokens_used": 423505376, + "theoretical_loss": 4.0085507301433765, + "tokens_seen": 403045376 + }, + { + "epoch": 1.03, + "learning_rate": 0.000443370110330993, + "loss": 3.2713, + "theoretical_loss": 4.008532523146073, + "tokens_seen": 403061760 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044336008024072217, + "loss": 3.0465, + "theoretical_loss": 4.008459704628931, + "tokens_seen": 403127296 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044335005015045135, + "loss": 3.1517, + "theoretical_loss": 4.008386901262897, + "tokens_seen": 403192832 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044334002006018053, + "loss": 3.1538, + "theoretical_loss": 4.0083141130423545, + "tokens_seen": 403258368 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044332998996990977, + "loss": 2.9468, + "theoretical_loss": 4.008241339961692, + "tokens_seen": 403323904 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004433199598796389, + "loss": 3.0336, + "theoretical_loss": 4.008168582015301, + "tokens_seen": 403389440 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044330992978936813, + "loss": 3.1535, + "theoretical_loss": 4.008095839197578, + "tokens_seen": 403454976 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004432998996990973, + "loss": 3.3492, + "theoretical_loss": 4.008023111502919, + "tokens_seen": 403520512 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004432898696088265, + "loss": 3.0574, + "theoretical_loss": 4.007950398925725, + "tokens_seen": 403586048 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004432798395185557, + "loss": 3.1388, + "theoretical_loss": 4.007877701460398, + "tokens_seen": 403651584 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044326980942828486, + "loss": 3.1729, + "theoretical_loss": 4.0078050191013475, + "tokens_seen": 403717120 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044325977933801404, + "loss": 2.9521, + "theoretical_loss": 4.007732351842981, + "tokens_seen": 403782656 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044324974924774327, + "loss": 3.2514, + "theoretical_loss": 4.007659699679711, + "tokens_seen": 403848192 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004432397191574724, + "loss": 2.8846, + "theoretical_loss": 4.007587062605954, + "tokens_seen": 403913728 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044322968906720163, + "loss": 3.2135, + "theoretical_loss": 4.007514440616128, + "tokens_seen": 403979264 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044321965897693076, + "loss": 2.8579, + "theoretical_loss": 4.007441833704654, + "tokens_seen": 404044800 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044320962888666, + "loss": 2.7827, + "theoretical_loss": 4.0073692418659554, + "tokens_seen": 404110336 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004431995987963892, + "loss": 3.0201, + "theoretical_loss": 4.007296665094462, + "tokens_seen": 404175872 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044318956870611836, + "loss": 3.2041, + "theoretical_loss": 4.007224103384603, + "tokens_seen": 404241408 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044317953861584754, + "loss": 2.8284, + "theoretical_loss": 4.007151556730811, + "tokens_seen": 404306944 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004431695085255767, + "loss": 3.0129, + "theoretical_loss": 4.007079025127523, + "tokens_seen": 404372480 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004431594784353059, + "loss": 3.0203, + "theoretical_loss": 4.007006508569178, + "tokens_seen": 404438016 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044314944834503514, + "loss": 3.1886, + "theoretical_loss": 4.0069340070502175, + "tokens_seen": 404503552 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044313941825476427, + "loss": 3.0306, + "theoretical_loss": 4.006861520565088, + "tokens_seen": 404569088 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004431293881644935, + "loss": 3.1072, + "theoretical_loss": 4.006789049108235, + "tokens_seen": 404634624 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 517837, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.787900924682617, + "objective/train/theoretical_loss": 4.006734705374566, + "objective/train/tokens_used": 425143776, + "theoretical_loss": 4.006734705374566, + "tokens_seen": 404683776 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004431193580742227, + "loss": 2.9616, + "theoretical_loss": 4.006716592674112, + "tokens_seen": 404700160 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044310932798395186, + "loss": 3.1183, + "theoretical_loss": 4.006644151257173, + "tokens_seen": 404765696 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044309929789368104, + "loss": 2.8431, + "theoretical_loss": 4.006571724851873, + "tokens_seen": 404831232 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004430892678034102, + "loss": 2.9166, + "theoretical_loss": 4.0064993134526725, + "tokens_seen": 404896768 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004430792377131394, + "loss": 3.0856, + "theoretical_loss": 4.006426917054036, + "tokens_seen": 404962304 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044306920762286864, + "loss": 3.0765, + "theoretical_loss": 4.006354535650425, + "tokens_seen": 405027840 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044305917753259777, + "loss": 3.1055, + "theoretical_loss": 4.0062821692363135, + "tokens_seen": 405093376 + }, + { + "epoch": 1.03, + "learning_rate": 0.000443049147442327, + "loss": 3.0614, + "theoretical_loss": 4.00620981780617, + "tokens_seen": 405158912 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044303911735205613, + "loss": 3.0428, + "theoretical_loss": 4.006137481354468, + "tokens_seen": 405224448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044302908726178537, + "loss": 3.0897, + "theoretical_loss": 4.006065159875687, + "tokens_seen": 405289984 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044301905717151455, + "loss": 2.9146, + "theoretical_loss": 4.005992853364307, + "tokens_seen": 405355520 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044300902708124373, + "loss": 3.0938, + "theoretical_loss": 4.005920561814811, + "tokens_seen": 405421056 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004429989969909729, + "loss": 3.0032, + "theoretical_loss": 4.005848285221685, + "tokens_seen": 405486592 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004429889669007021, + "loss": 2.8133, + "theoretical_loss": 4.005776023579418, + "tokens_seen": 405552128 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044297893681043127, + "loss": 2.9431, + "theoretical_loss": 4.005703776882503, + "tokens_seen": 405617664 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004429689067201605, + "loss": 3.0047, + "theoretical_loss": 4.005631545125434, + "tokens_seen": 405683200 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044295887662988963, + "loss": 2.989, + "theoretical_loss": 4.00555932830271, + "tokens_seen": 405748736 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044294884653961887, + "loss": 3.2523, + "theoretical_loss": 4.0054871264088305, + "tokens_seen": 405814272 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004429388164493481, + "loss": 3.0664, + "theoretical_loss": 4.0054149394383, + "tokens_seen": 405879808 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044292878635907723, + "loss": 3.2111, + "theoretical_loss": 4.005342767385625, + "tokens_seen": 405945344 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044291875626880647, + "loss": 2.8902, + "theoretical_loss": 4.005270610245315, + "tokens_seen": 406010880 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004429087261785356, + "loss": 3.4227, + "theoretical_loss": 4.005198468011882, + "tokens_seen": 406076416 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044289869608826483, + "loss": 3.1243, + "theoretical_loss": 4.005126340679842, + "tokens_seen": 406141952 + }, + { + "epoch": 1.03, + "learning_rate": 0.000442888665997994, + "loss": 3.1989, + "theoretical_loss": 4.005054228243713, + "tokens_seen": 406207488 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004428786359077232, + "loss": 2.9508, + "theoretical_loss": 4.004982130698017, + "tokens_seen": 406273024 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 518449, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.878129482269287, + "objective/train/theoretical_loss": 4.004928067307296, + "objective/train/tokens_used": 426782176, + "theoretical_loss": 4.004928067307296, + "tokens_seen": 406322176 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044286860581745237, + "loss": 3.0946, + "theoretical_loss": 4.004910048037276, + "tokens_seen": 406338560 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044285857572718155, + "loss": 2.8832, + "theoretical_loss": 4.00483798025602, + "tokens_seen": 406404096 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044284854563691073, + "loss": 3.1023, + "theoretical_loss": 4.0047659273487755, + "tokens_seen": 406469632 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044283851554663997, + "loss": 3.0547, + "theoretical_loss": 4.004693889310077, + "tokens_seen": 406535168 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004428284854563691, + "loss": 3.1845, + "theoretical_loss": 4.00462186613446, + "tokens_seen": 406600704 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044281845536609833, + "loss": 3.0654, + "theoretical_loss": 4.004549857816463, + "tokens_seen": 406666240 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004428084252758275, + "loss": 3.0538, + "theoretical_loss": 4.004477864350627, + "tokens_seen": 406731776 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004427983951855567, + "loss": 3.0571, + "theoretical_loss": 4.004405885731497, + "tokens_seen": 406797312 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004427883650952859, + "loss": 3.0495, + "theoretical_loss": 4.00433392195362, + "tokens_seen": 406862848 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044277833500501506, + "loss": 3.1538, + "theoretical_loss": 4.004261973011546, + "tokens_seen": 406928384 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044276830491474424, + "loss": 2.9353, + "theoretical_loss": 4.004190038899827, + "tokens_seen": 406993920 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044275827482447347, + "loss": 2.8766, + "theoretical_loss": 4.004118119613019, + "tokens_seen": 407059456 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004427482447342026, + "loss": 3.0604, + "theoretical_loss": 4.004046215145682, + "tokens_seen": 407124992 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044273821464393184, + "loss": 3.2647, + "theoretical_loss": 4.003974325492377, + "tokens_seen": 407190528 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044272818455366096, + "loss": 3.1432, + "theoretical_loss": 4.003902450647669, + "tokens_seen": 407256064 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004427181544633902, + "loss": 3.0507, + "theoretical_loss": 4.003830590606123, + "tokens_seen": 407321600 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004427081243731194, + "loss": 3.0707, + "theoretical_loss": 4.003758745362311, + "tokens_seen": 407387136 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044269809428284856, + "loss": 3.1622, + "theoretical_loss": 4.003686914910807, + "tokens_seen": 407452672 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044268806419257774, + "loss": 3.146, + "theoretical_loss": 4.003615099246184, + "tokens_seen": 407518208 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004426780341023069, + "loss": 3.2283, + "theoretical_loss": 4.003543298363022, + "tokens_seen": 407583744 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004426680040120361, + "loss": 2.921, + "theoretical_loss": 4.003471512255905, + "tokens_seen": 407649280 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044265797392176534, + "loss": 3.1143, + "theoretical_loss": 4.003399740919413, + "tokens_seen": 407714816 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044264794383149447, + "loss": 3.0341, + "theoretical_loss": 4.003327984348138, + "tokens_seen": 407780352 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004426379137412237, + "loss": 3.1449, + "theoretical_loss": 4.003256242536667, + "tokens_seen": 407845888 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004426278836509529, + "loss": 2.9538, + "theoretical_loss": 4.003184515479592, + "tokens_seen": 407911424 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 519624, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0527055263519287, + "objective/train/theoretical_loss": 4.00313072986611, + "objective/train/tokens_used": 428420576, + "theoretical_loss": 4.00313072986611, + "tokens_seen": 407960576 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044261785356068206, + "loss": 3.0662, + "theoretical_loss": 4.003112803171513, + "tokens_seen": 407976960 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044260782347041124, + "loss": 2.9138, + "theoretical_loss": 4.003041105607026, + "tokens_seen": 408042496 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004425977933801404, + "loss": 3.1145, + "theoretical_loss": 4.002969422780733, + "tokens_seen": 408108032 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004425877632898696, + "loss": 3.0576, + "theoretical_loss": 4.002897754687238, + "tokens_seen": 408173568 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044257773319959884, + "loss": 3.2281, + "theoretical_loss": 4.0028261013211495, + "tokens_seen": 408239104 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044256770310932797, + "loss": 3.1774, + "theoretical_loss": 4.002754462677077, + "tokens_seen": 408304640 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004425576730190572, + "loss": 3.0006, + "theoretical_loss": 4.002682838749632, + "tokens_seen": 408370176 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044254764292878633, + "loss": 3.1701, + "theoretical_loss": 4.002611229533432, + "tokens_seen": 408435712 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044253761283851557, + "loss": 3.0723, + "theoretical_loss": 4.002539635023095, + "tokens_seen": 408501248 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044252758274824475, + "loss": 3.0007, + "theoretical_loss": 4.002468055213243, + "tokens_seen": 408566784 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044251755265797393, + "loss": 3.1507, + "theoretical_loss": 4.0023964900985, + "tokens_seen": 408632320 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004425075225677031, + "loss": 3.3785, + "theoretical_loss": 4.002324939673492, + "tokens_seen": 408697856 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004424974924774323, + "loss": 3.0969, + "theoretical_loss": 4.0022534039328495, + "tokens_seen": 408763392 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044248746238716147, + "loss": 3.1206, + "theoretical_loss": 4.002181882871206, + "tokens_seen": 408828928 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004424774322968907, + "loss": 2.8046, + "theoretical_loss": 4.002110376483197, + "tokens_seen": 408894464 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044246740220661983, + "loss": 2.9415, + "theoretical_loss": 4.002038884763459, + "tokens_seen": 408960000 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044245737211634907, + "loss": 2.9706, + "theoretical_loss": 4.001967407706635, + "tokens_seen": 409025536 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044244734202607825, + "loss": 2.9634, + "theoretical_loss": 4.0018959453073695, + "tokens_seen": 409091072 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044243731193580743, + "loss": 3.2558, + "theoretical_loss": 4.001824497560308, + "tokens_seen": 409156608 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004424272818455366, + "loss": 3.1996, + "theoretical_loss": 4.0017530644601, + "tokens_seen": 409222144 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004424172517552658, + "loss": 2.9816, + "theoretical_loss": 4.001681646001399, + "tokens_seen": 409287680 + }, + { + "epoch": 1.03, + "learning_rate": 0.000442407221664995, + "loss": 3.027, + "theoretical_loss": 4.0016102421788595, + "tokens_seen": 409353216 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004423971915747242, + "loss": 2.993, + "theoretical_loss": 4.00153885298714, + "tokens_seen": 409418752 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044238716148445334, + "loss": 2.9679, + "theoretical_loss": 4.001467478420901, + "tokens_seen": 409484288 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044237713139418257, + "loss": 3.3059, + "theoretical_loss": 4.001396118474806, + "tokens_seen": 409549824 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 520212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.317723512649536, + "objective/train/theoretical_loss": 4.001342608106496, + "objective/train/tokens_used": 430058976, + "theoretical_loss": 4.001342608106496, + "tokens_seen": 409598976 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004423671013039117, + "loss": 2.9615, + "theoretical_loss": 4.001324773143522, + "tokens_seen": 409615360 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044235707121364093, + "loss": 3.0724, + "theoretical_loss": 4.001253442421718, + "tokens_seen": 409680896 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004423470411233701, + "loss": 2.965, + "theoretical_loss": 4.001182126304067, + "tokens_seen": 409746432 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004423370110330993, + "loss": 3.0159, + "theoretical_loss": 4.001110824785242, + "tokens_seen": 409811968 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004423269809428285, + "loss": 2.9357, + "theoretical_loss": 4.001039537859921, + "tokens_seen": 409877504 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004423169508525577, + "loss": 3.4018, + "theoretical_loss": 4.000968265522786, + "tokens_seen": 409943040 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044230692076228684, + "loss": 3.1943, + "theoretical_loss": 4.000897007768518, + "tokens_seen": 410008576 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004422968906720161, + "loss": 2.9675, + "theoretical_loss": 4.000825764591807, + "tokens_seen": 410074112 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004422868605817452, + "loss": 3.1719, + "theoretical_loss": 4.000754535987337, + "tokens_seen": 410139648 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044227683049147444, + "loss": 3.1564, + "theoretical_loss": 4.0006833219498015, + "tokens_seen": 410205184 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004422668004012036, + "loss": 3.1727, + "theoretical_loss": 4.000612122473896, + "tokens_seen": 410270720 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004422567703109328, + "loss": 3.0706, + "theoretical_loss": 4.000540937554316, + "tokens_seen": 410336256 + }, + { + "epoch": 1.03, + "learning_rate": 0.000442246740220662, + "loss": 3.193, + "theoretical_loss": 4.000469767185763, + "tokens_seen": 410401792 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044223671013039116, + "loss": 2.8826, + "theoretical_loss": 4.000398611362938, + "tokens_seen": 410467328 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044222668004012034, + "loss": 3.0742, + "theoretical_loss": 4.0003274700805465, + "tokens_seen": 410532864 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004422166499498496, + "loss": 3.1663, + "theoretical_loss": 4.000256343333298, + "tokens_seen": 410598400 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004422066198595787, + "loss": 3.1183, + "theoretical_loss": 4.000185231115903, + "tokens_seen": 410663936 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044219658976930794, + "loss": 3.0318, + "theoretical_loss": 4.000114133423077, + "tokens_seen": 410729472 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004421865596790371, + "loss": 2.8639, + "theoretical_loss": 4.000043050249533, + "tokens_seen": 410795008 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004421765295887663, + "loss": 3.0937, + "theoretical_loss": 3.999971981589993, + "tokens_seen": 410860544 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044216649949849554, + "loss": 2.9955, + "theoretical_loss": 3.9999009274391772, + "tokens_seen": 410926080 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044215646940822467, + "loss": 3.2867, + "theoretical_loss": 3.9998298877918126, + "tokens_seen": 410991616 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004421464393179539, + "loss": 3.0194, + "theoretical_loss": 3.999758862642625, + "tokens_seen": 411057152 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004421364092276831, + "loss": 3.0622, + "theoretical_loss": 3.9996878519863452, + "tokens_seen": 411122688 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044212637913741226, + "loss": 3.0314, + "theoretical_loss": 3.999616855817707, + "tokens_seen": 411188224 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 521565, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3985636234283447, + "objective/train/theoretical_loss": 3.9995636181955763, + "objective/train/tokens_used": 431697376, + "theoretical_loss": 3.9995636181955763, + "tokens_seen": 411237376 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044211634904714144, + "loss": 3.0621, + "theoretical_loss": 3.999545874131446, + "tokens_seen": 411253760 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004421063189568706, + "loss": 3.194, + "theoretical_loss": 3.9994749069223, + "tokens_seen": 411319296 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004420962888665998, + "loss": 2.8937, + "theoretical_loss": 3.9994039541850115, + "tokens_seen": 411384832 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044208625877632904, + "loss": 2.9052, + "theoretical_loss": 3.9993330159143237, + "tokens_seen": 411450368 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044207622868605817, + "loss": 3.1371, + "theoretical_loss": 3.999262092104984, + "tokens_seen": 411515904 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004420661985957874, + "loss": 3.1621, + "theoretical_loss": 3.9991911827517415, + "tokens_seen": 411581440 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044205616850551653, + "loss": 2.9365, + "theoretical_loss": 3.9991202878493493, + "tokens_seen": 411646976 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044204613841524577, + "loss": 3.1542, + "theoretical_loss": 3.9990494073925618, + "tokens_seen": 411712512 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044203610832497495, + "loss": 3.4033, + "theoretical_loss": 3.9989785413761365, + "tokens_seen": 411778048 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044202607823470413, + "loss": 3.1086, + "theoretical_loss": 3.9989076897948355, + "tokens_seen": 411843584 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004420160481444333, + "loss": 2.9042, + "theoretical_loss": 3.99883685264342, + "tokens_seen": 411909120 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004420060180541625, + "loss": 3.1093, + "theoretical_loss": 3.9987660299166574, + "tokens_seen": 411974656 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044199598796389167, + "loss": 3.225, + "theoretical_loss": 3.998695221609317, + "tokens_seen": 412040192 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004419859578736209, + "loss": 3.0794, + "theoretical_loss": 3.9986244277161687, + "tokens_seen": 412105728 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044197592778335003, + "loss": 2.9221, + "theoretical_loss": 3.9985536482319874, + "tokens_seen": 412171264 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044196589769307927, + "loss": 3.2153, + "theoretical_loss": 3.99848288315155, + "tokens_seen": 412236800 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044195586760280845, + "loss": 3.1729, + "theoretical_loss": 3.998412132469637, + "tokens_seen": 412302336 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044194583751253763, + "loss": 3.0724, + "theoretical_loss": 3.9983413961810292, + "tokens_seen": 412367872 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004419358074222668, + "loss": 2.8081, + "theoretical_loss": 3.9982706742805125, + "tokens_seen": 412433408 + }, + { + "epoch": 1.04, + "learning_rate": 0.000441925777331996, + "loss": 2.9037, + "theoretical_loss": 3.998199966762875, + "tokens_seen": 412498944 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004419157472417252, + "loss": 3.138, + "theoretical_loss": 3.998129273622907, + "tokens_seen": 412564480 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004419057171514544, + "loss": 2.9746, + "theoretical_loss": 3.9980585948554013, + "tokens_seen": 412630016 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044189568706118354, + "loss": 2.8928, + "theoretical_loss": 3.9979879304551544, + "tokens_seen": 412695552 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044188565697091277, + "loss": 3.0935, + "theoretical_loss": 3.9979172804169654, + "tokens_seen": 412761088 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004418756268806419, + "loss": 3.2098, + "theoretical_loss": 3.997846644735634, + "tokens_seen": 412826624 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 522883, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.032809019088745, + "objective/train/theoretical_loss": 3.997793677393199, + "objective/train/tokens_used": 433335776, + "theoretical_loss": 3.997793677393199, + "tokens_seen": 412875776 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044186559679037113, + "loss": 2.9367, + "theoretical_loss": 3.997776023405966, + "tokens_seen": 412892160 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004418555667001003, + "loss": 3.0454, + "theoretical_loss": 3.997705416422767, + "tokens_seen": 412957696 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004418455366098295, + "loss": 2.9081, + "theoretical_loss": 3.9976348237808477, + "tokens_seen": 413023232 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004418355065195587, + "loss": 3.3134, + "theoretical_loss": 3.9975642454750195, + "tokens_seen": 413088768 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004418254764292879, + "loss": 3.0579, + "theoretical_loss": 3.9974936815000977, + "tokens_seen": 413154304 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044181544633901704, + "loss": 3.1252, + "theoretical_loss": 3.997423131850899, + "tokens_seen": 413219840 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004418054162487463, + "loss": 3.1003, + "theoretical_loss": 3.9973525965222443, + "tokens_seen": 413285376 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004417953861584754, + "loss": 3.0599, + "theoretical_loss": 3.9972820755089566, + "tokens_seen": 413350912 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044178535606820464, + "loss": 2.923, + "theoretical_loss": 3.9972115688058616, + "tokens_seen": 413416448 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004417753259779338, + "loss": 3.0775, + "theoretical_loss": 3.9971410764077877, + "tokens_seen": 413481984 + }, + { + "epoch": 1.04, + "learning_rate": 0.000441765295887663, + "loss": 2.9165, + "theoretical_loss": 3.997070598309566, + "tokens_seen": 413547520 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004417552657973922, + "loss": 2.9974, + "theoretical_loss": 3.9970001345060293, + "tokens_seen": 413613056 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044174523570712136, + "loss": 3.001, + "theoretical_loss": 3.9969296849920153, + "tokens_seen": 413678592 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044173520561685054, + "loss": 3.0008, + "theoretical_loss": 3.9968592497623625, + "tokens_seen": 413744128 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004417251755265798, + "loss": 3.11, + "theoretical_loss": 3.996788828811913, + "tokens_seen": 413809664 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004417151454363089, + "loss": 3.2571, + "theoretical_loss": 3.996718422135511, + "tokens_seen": 413875200 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044170511534603814, + "loss": 2.9652, + "theoretical_loss": 3.996648029728003, + "tokens_seen": 413940736 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044169508525576727, + "loss": 3.0139, + "theoretical_loss": 3.9965776515842397, + "tokens_seen": 414006272 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004416850551654965, + "loss": 2.9294, + "theoretical_loss": 3.9965072876990737, + "tokens_seen": 414071808 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004416750250752257, + "loss": 3.0109, + "theoretical_loss": 3.99643693806736, + "tokens_seen": 414137344 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044166499498495487, + "loss": 3.1063, + "theoretical_loss": 3.9963666026839557, + "tokens_seen": 414202880 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044165496489468405, + "loss": 3.2242, + "theoretical_loss": 3.996296281543722, + "tokens_seen": 414268416 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004416449348044133, + "loss": 2.8582, + "theoretical_loss": 3.996225974641522, + "tokens_seen": 414333952 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004416349047141424, + "loss": 2.8527, + "theoretical_loss": 3.9961556819722217, + "tokens_seen": 414399488 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044162487462387164, + "loss": 3.037, + "theoretical_loss": 3.9960854035306888, + "tokens_seen": 414465024 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 523694, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1503584384918213, + "objective/train/theoretical_loss": 3.9960327040334263, + "objective/train/tokens_used": 434974176, + "theoretical_loss": 3.9960327040334263, + "tokens_seen": 414514176 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044161484453360077, + "loss": 3.2747, + "theoretical_loss": 3.996015139311795, + "tokens_seen": 414530560 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044160481444333, + "loss": 3.2075, + "theoretical_loss": 3.9959448893104144, + "tokens_seen": 414596096 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004415947843530592, + "loss": 3.0925, + "theoretical_loss": 3.9958746535214233, + "tokens_seen": 414661632 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044158475426278837, + "loss": 3.201, + "theoretical_loss": 3.9958044319397006, + "tokens_seen": 414727168 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044157472417251755, + "loss": 2.9766, + "theoretical_loss": 3.9957342245601275, + "tokens_seen": 414792704 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044156469408224673, + "loss": 3.1882, + "theoretical_loss": 3.99566403137759, + "tokens_seen": 414858240 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004415546639919759, + "loss": 3.0593, + "theoretical_loss": 3.995593852386974, + "tokens_seen": 414923776 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044154463390170515, + "loss": 3.1353, + "theoretical_loss": 3.995523687583169, + "tokens_seen": 414989312 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004415346038114343, + "loss": 3.1151, + "theoretical_loss": 3.9954535369610684, + "tokens_seen": 415054848 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004415245737211635, + "loss": 3.1027, + "theoretical_loss": 3.995383400515567, + "tokens_seen": 415120384 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044151454363089264, + "loss": 3.0921, + "theoretical_loss": 3.9953132782415617, + "tokens_seen": 415185920 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044150451354062187, + "loss": 3.1593, + "theoretical_loss": 3.9952431701339535, + "tokens_seen": 415251456 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044149448345035105, + "loss": 3.2781, + "theoretical_loss": 3.995173076187645, + "tokens_seen": 415316992 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044148445336008023, + "loss": 2.9717, + "theoretical_loss": 3.995102996397542, + "tokens_seen": 415382528 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004414744232698094, + "loss": 3.0149, + "theoretical_loss": 3.995032930758552, + "tokens_seen": 415448064 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044146439317953865, + "loss": 3.0351, + "theoretical_loss": 3.9949628792655876, + "tokens_seen": 415513600 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004414543630892678, + "loss": 3.1212, + "theoretical_loss": 3.9948928419135603, + "tokens_seen": 415579136 + }, + { + "epoch": 1.04, + "learning_rate": 0.000441444332998997, + "loss": 2.9529, + "theoretical_loss": 3.9948228186973873, + "tokens_seen": 415644672 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004414343029087262, + "loss": 3.1814, + "theoretical_loss": 3.9947528096119873, + "tokens_seen": 415710208 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004414242728184554, + "loss": 3.1078, + "theoretical_loss": 3.9946828146522817, + "tokens_seen": 415775744 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004414142427281846, + "loss": 2.9443, + "theoretical_loss": 3.9946128338131945, + "tokens_seen": 415841280 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044140421263791374, + "loss": 2.9845, + "theoretical_loss": 3.9945428670896517, + "tokens_seen": 415906816 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044139418254764297, + "loss": 3.0643, + "theoretical_loss": 3.9944729144765834, + "tokens_seen": 415972352 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004413841524573721, + "loss": 2.9436, + "theoretical_loss": 3.9944029759689204, + "tokens_seen": 416037888 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044137412236710133, + "loss": 2.895, + "theoretical_loss": 3.9943330515615987, + "tokens_seen": 416103424 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 524726, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1443862915039062, + "objective/train/theoretical_loss": 3.9942806175064094, + "objective/train/tokens_used": 436612576, + "theoretical_loss": 3.9942806175064094, + "tokens_seen": 416152576 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004413640922768305, + "loss": 2.8181, + "theoretical_loss": 3.994263141249554, + "tokens_seen": 416168960 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004413540621865597, + "loss": 3.0632, + "theoretical_loss": 3.994193245027726, + "tokens_seen": 416234496 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004413440320962889, + "loss": 3.2087, + "theoretical_loss": 3.9941233628910586, + "tokens_seen": 416300032 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004413340020060181, + "loss": 3.0843, + "theoretical_loss": 3.9940534948344952, + "tokens_seen": 416365568 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044132397191574724, + "loss": 3.115, + "theoretical_loss": 3.993983640852984, + "tokens_seen": 416431104 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004413139418254765, + "loss": 3.1099, + "theoretical_loss": 3.993913800941475, + "tokens_seen": 416496640 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004413039117352056, + "loss": 3.0741, + "theoretical_loss": 3.9938439750949213, + "tokens_seen": 416562176 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044129388164493484, + "loss": 3.0151, + "theoretical_loss": 3.9937741633082777, + "tokens_seen": 416627712 + }, + { + "epoch": 1.04, + "learning_rate": 0.000441283851554664, + "loss": 3.0126, + "theoretical_loss": 3.9937043655765025, + "tokens_seen": 416693248 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004412738214643932, + "loss": 3.0725, + "theoretical_loss": 3.993634581894556, + "tokens_seen": 416758784 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004412637913741224, + "loss": 3.2187, + "theoretical_loss": 3.9935648122574015, + "tokens_seen": 416824320 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044125376128385156, + "loss": 3.131, + "theoretical_loss": 3.993495056660005, + "tokens_seen": 416889856 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044124373119358074, + "loss": 3.0721, + "theoretical_loss": 3.993425315097335, + "tokens_seen": 416955392 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044123370110331, + "loss": 2.9879, + "theoretical_loss": 3.993355587564362, + "tokens_seen": 417020928 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004412236710130391, + "loss": 3.079, + "theoretical_loss": 3.9932858740560597, + "tokens_seen": 417086464 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044121364092276834, + "loss": 3.1582, + "theoretical_loss": 3.9932161745674035, + "tokens_seen": 417152000 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044120361083249747, + "loss": 3.2366, + "theoretical_loss": 3.9931464890933737, + "tokens_seen": 417217536 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004411935807422267, + "loss": 3.0629, + "theoretical_loss": 3.9930768176289506, + "tokens_seen": 417283072 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004411835506519559, + "loss": 3.1617, + "theoretical_loss": 3.9930071601691184, + "tokens_seen": 417348608 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044117352056168507, + "loss": 3.0412, + "theoretical_loss": 3.9929375167088637, + "tokens_seen": 417414144 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044116349047141425, + "loss": 2.9656, + "theoretical_loss": 3.9928678872431744, + "tokens_seen": 417479680 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004411534603811435, + "loss": 2.9363, + "theoretical_loss": 3.992798271767044, + "tokens_seen": 417545216 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004411434302908726, + "loss": 2.9418, + "theoretical_loss": 3.992728670275465, + "tokens_seen": 417610752 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044113340020060184, + "loss": 3.0558, + "theoretical_loss": 3.992659082763436, + "tokens_seen": 417676288 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044112337011033097, + "loss": 3.0002, + "theoretical_loss": 3.9925895092259545, + "tokens_seen": 417741824 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 525508, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.890538454055786, + "objective/train/theoretical_loss": 3.992537338240634, + "objective/train/tokens_used": 438250976, + "theoretical_loss": 3.992537338240634, + "tokens_seen": 417790976 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004411133400200602, + "loss": 2.9902, + "theoretical_loss": 3.9925199496580235, + "tokens_seen": 417807360 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004411033099297894, + "loss": 3.0079, + "theoretical_loss": 3.9924504040546474, + "tokens_seen": 417872896 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044109327983951857, + "loss": 3.1488, + "theoretical_loss": 3.992380872410833, + "tokens_seen": 417938432 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044108324974924775, + "loss": 3.2423, + "theoretical_loss": 3.9923113547215907, + "tokens_seen": 418003968 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044107321965897693, + "loss": 3.121, + "theoretical_loss": 3.992241850981932, + "tokens_seen": 418069504 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004410631895687061, + "loss": 2.8882, + "theoretical_loss": 3.9921723611868716, + "tokens_seen": 418135040 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044105315947843535, + "loss": 3.145, + "theoretical_loss": 3.992102885331427, + "tokens_seen": 418200576 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004410431293881645, + "loss": 2.9919, + "theoretical_loss": 3.9920334234106187, + "tokens_seen": 418266112 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004410330992978937, + "loss": 2.9014, + "theoretical_loss": 3.991963975419468, + "tokens_seen": 418331648 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044102306920762284, + "loss": 3.1452, + "theoretical_loss": 3.991894541353001, + "tokens_seen": 418397184 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044101303911735207, + "loss": 3.098, + "theoretical_loss": 3.9918251212062454, + "tokens_seen": 418462720 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044100300902708125, + "loss": 2.9439, + "theoretical_loss": 3.99175571497423, + "tokens_seen": 418528256 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044099297893681043, + "loss": 2.8596, + "theoretical_loss": 3.991686322651989, + "tokens_seen": 418593792 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004409829488465396, + "loss": 2.8567, + "theoretical_loss": 3.9916169442345564, + "tokens_seen": 418659328 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044097291875626885, + "loss": 3.0336, + "theoretical_loss": 3.991547579716971, + "tokens_seen": 418724864 + }, + { + "epoch": 1.04, + "learning_rate": 0.000440962888665998, + "loss": 3.01, + "theoretical_loss": 3.9914782290942723, + "tokens_seen": 418790400 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004409528585757272, + "loss": 2.718, + "theoretical_loss": 3.9914088923615036, + "tokens_seen": 418855936 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044094282848545634, + "loss": 2.9002, + "theoretical_loss": 3.9913395695137104, + "tokens_seen": 418921472 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004409327983951856, + "loss": 3.1815, + "theoretical_loss": 3.99127026054594, + "tokens_seen": 418987008 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044092276830491476, + "loss": 3.1064, + "theoretical_loss": 3.991200965453244, + "tokens_seen": 419052544 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044091273821464394, + "loss": 2.9756, + "theoretical_loss": 3.991131684230675, + "tokens_seen": 419118080 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004409027081243731, + "loss": 3.1044, + "theoretical_loss": 3.991062416873288, + "tokens_seen": 419183616 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004408926780341023, + "loss": 2.9907, + "theoretical_loss": 3.990993163376142, + "tokens_seen": 419249152 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004408826479438315, + "loss": 2.5876, + "theoretical_loss": 3.990923923734297, + "tokens_seen": 419314688 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004408726178535607, + "loss": 2.9581, + "theoretical_loss": 3.990854697942816, + "tokens_seen": 419380224 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 526274, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7906696796417236, + "objective/train/theoretical_loss": 3.9908027876855394, + "objective/train/tokens_used": 439889376, + "theoretical_loss": 3.9908027876855394, + "tokens_seen": 419429376 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044086258776328984, + "loss": 2.9342, + "theoretical_loss": 3.990785485996766, + "tokens_seen": 419445760 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004408525576730191, + "loss": 2.8886, + "theoretical_loss": 3.990716287891214, + "tokens_seen": 419511296 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004408425275827482, + "loss": 2.9543, + "theoretical_loss": 3.9906471036212308, + "tokens_seen": 419576832 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044083249749247744, + "loss": 3.0434, + "theoretical_loss": 3.9905779331818905, + "tokens_seen": 419642368 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004408224674022066, + "loss": 2.7933, + "theoretical_loss": 3.9905087765682676, + "tokens_seen": 419707904 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004408124373119358, + "loss": 2.9713, + "theoretical_loss": 3.990439633775442, + "tokens_seen": 419773440 + }, + { + "epoch": 1.04, + "learning_rate": 0.000440802407221665, + "loss": 3.2049, + "theoretical_loss": 3.9903705047984936, + "tokens_seen": 419838976 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004407923771313942, + "loss": 3.0042, + "theoretical_loss": 3.9903013896325064, + "tokens_seen": 419904512 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044078234704112335, + "loss": 3.002, + "theoretical_loss": 3.9902322882725656, + "tokens_seen": 419970048 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004407723169508526, + "loss": 3.2834, + "theoretical_loss": 3.9901632007137593, + "tokens_seen": 420035584 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004407622868605817, + "loss": 3.0347, + "theoretical_loss": 3.9900941269511794, + "tokens_seen": 420101120 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044075225677031094, + "loss": 3.0477, + "theoretical_loss": 3.990025066979919, + "tokens_seen": 420166656 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004407422266800401, + "loss": 2.8861, + "theoretical_loss": 3.9899560207950744, + "tokens_seen": 420232192 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004407321965897693, + "loss": 2.878, + "theoretical_loss": 3.989886988391743, + "tokens_seen": 420297728 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004407221664994985, + "loss": 3.1336, + "theoretical_loss": 3.989817969765027, + "tokens_seen": 420363264 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044071213640922767, + "loss": 3.0215, + "theoretical_loss": 3.9897489649100284, + "tokens_seen": 420428800 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044070210631895685, + "loss": 2.9364, + "theoretical_loss": 3.9896799738218545, + "tokens_seen": 420494336 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004406920762286861, + "loss": 3.0076, + "theoretical_loss": 3.9896109964956135, + "tokens_seen": 420559872 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044068204613841527, + "loss": 2.8764, + "theoretical_loss": 3.9895420329264155, + "tokens_seen": 420625408 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044067201604814445, + "loss": 3.2049, + "theoretical_loss": 3.989473083109375, + "tokens_seen": 420690944 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004406619859578737, + "loss": 3.239, + "theoretical_loss": 3.9894041470396076, + "tokens_seen": 420756480 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004406519558676028, + "loss": 3.1395, + "theoretical_loss": 3.989335224712232, + "tokens_seen": 420822016 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044064192577733204, + "loss": 3.0907, + "theoretical_loss": 3.989266316122369, + "tokens_seen": 420887552 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044063189568706117, + "loss": 3.1008, + "theoretical_loss": 3.989197421265141, + "tokens_seen": 420953088 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004406218655967904, + "loss": 3.0425, + "theoretical_loss": 3.9891285401356757, + "tokens_seen": 421018624 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 527775, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.216331958770752, + "objective/train/theoretical_loss": 3.9890768882944894, + "objective/train/tokens_used": 441527776, + "theoretical_loss": 3.9890768882944894, + "tokens_seen": 421067776 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004406118355065196, + "loss": 3.0228, + "theoretical_loss": 3.9890596727291, + "tokens_seen": 421084160 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044060180541624877, + "loss": 2.7891, + "theoretical_loss": 3.988990819040546, + "tokens_seen": 421149696 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044059177532597795, + "loss": 3.0307, + "theoretical_loss": 3.9889219790651462, + "tokens_seen": 421215232 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044058174523570713, + "loss": 3.0988, + "theoretical_loss": 3.988853152798037, + "tokens_seen": 421280768 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004405717151454363, + "loss": 2.8487, + "theoretical_loss": 3.9887843402343566, + "tokens_seen": 421346304 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044056168505516555, + "loss": 3.0812, + "theoretical_loss": 3.9887155413692463, + "tokens_seen": 421411840 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004405516549648947, + "loss": 3.0347, + "theoretical_loss": 3.9886467561978476, + "tokens_seen": 421477376 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004405416248746239, + "loss": 2.7433, + "theoretical_loss": 3.9885779847153087, + "tokens_seen": 421542912 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044053159478435304, + "loss": 3.1027, + "theoretical_loss": 3.9885092269167766, + "tokens_seen": 421608448 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044052156469408227, + "loss": 3.1315, + "theoretical_loss": 3.9884404827974023, + "tokens_seen": 421673984 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044051153460381145, + "loss": 2.9011, + "theoretical_loss": 3.9883717523523385, + "tokens_seen": 421739520 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044050150451354063, + "loss": 2.9213, + "theoretical_loss": 3.9883030355767417, + "tokens_seen": 421805056 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004404914744232698, + "loss": 3.0712, + "theoretical_loss": 3.98823433246577, + "tokens_seen": 421870592 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044048144433299905, + "loss": 3.0033, + "theoretical_loss": 3.9881656430145833, + "tokens_seen": 421936128 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004404714142427282, + "loss": 3.0171, + "theoretical_loss": 3.988096967218345, + "tokens_seen": 422001664 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004404613841524574, + "loss": 2.9677, + "theoretical_loss": 3.9880283050722207, + "tokens_seen": 422067200 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044045135406218654, + "loss": 2.9678, + "theoretical_loss": 3.9879596565713786, + "tokens_seen": 422132736 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004404413239719158, + "loss": 2.8646, + "theoretical_loss": 3.987891021710989, + "tokens_seen": 422198272 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044043129388164496, + "loss": 3.17, + "theoretical_loss": 3.987822400486225, + "tokens_seen": 422263808 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044042126379137414, + "loss": 3.1096, + "theoretical_loss": 3.9877537928922613, + "tokens_seen": 422329344 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004404112337011033, + "loss": 3.0704, + "theoretical_loss": 3.9876851989242765, + "tokens_seen": 422394880 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004404012036108325, + "loss": 3.0149, + "theoretical_loss": 3.9876166185774506, + "tokens_seen": 422460416 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004403911735205617, + "loss": 3.1474, + "theoretical_loss": 3.987548051846966, + "tokens_seen": 422525952 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004403811434302909, + "loss": 2.9211, + "theoretical_loss": 3.987479498728009, + "tokens_seen": 422591488 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044037111334002004, + "loss": 3.144, + "theoretical_loss": 3.987410959215766, + "tokens_seen": 422657024 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 528406, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4659383296966553, + "objective/train/theoretical_loss": 3.9873595635080967, + "objective/train/tokens_used": 443166176, + "theoretical_loss": 3.9873595635080967, + "tokens_seen": 422706176 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004403610832497493, + "loss": 3.073, + "theoretical_loss": 3.987342433305428, + "tokens_seen": 422722560 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004403510531594784, + "loss": 3.0885, + "theoretical_loss": 3.9872739209921866, + "tokens_seen": 422788096 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044034102306920764, + "loss": 3.0353, + "theoretical_loss": 3.987205422271238, + "tokens_seen": 422853632 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004403309929789368, + "loss": 3.1106, + "theoretical_loss": 3.9871369371377785, + "tokens_seen": 422919168 + }, + { + "epoch": 1.04, + "learning_rate": 0.000440320962888666, + "loss": 2.91, + "theoretical_loss": 3.987068465587009, + "tokens_seen": 422984704 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004403109327983952, + "loss": 2.991, + "theoretical_loss": 3.9870000076141303, + "tokens_seen": 423050240 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004403009027081244, + "loss": 3.0972, + "theoretical_loss": 3.9869315632143483, + "tokens_seen": 423115776 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044029087261785355, + "loss": 3.0585, + "theoretical_loss": 3.9868631323828705, + "tokens_seen": 423181312 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004402808425275828, + "loss": 2.8196, + "theoretical_loss": 3.9867947151149052, + "tokens_seen": 423246848 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004402708124373119, + "loss": 3.1558, + "theoretical_loss": 3.9867263114056657, + "tokens_seen": 423312384 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044026078234704114, + "loss": 3.2312, + "theoretical_loss": 3.9866579212503654, + "tokens_seen": 423377920 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004402507522567703, + "loss": 3.2391, + "theoretical_loss": 3.986589544644222, + "tokens_seen": 423443456 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004402407221664995, + "loss": 3.1316, + "theoretical_loss": 3.986521181582455, + "tokens_seen": 423508992 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004402306920762287, + "loss": 3.2101, + "theoretical_loss": 3.9864528320602854, + "tokens_seen": 423574528 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044022066198595787, + "loss": 2.9653, + "theoretical_loss": 3.9863844960729375, + "tokens_seen": 423640064 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044021063189568705, + "loss": 3.0989, + "theoretical_loss": 3.9863161736156387, + "tokens_seen": 423705600 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004402006018054163, + "loss": 2.7849, + "theoretical_loss": 3.9862478646836172, + "tokens_seen": 423771136 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004401905717151454, + "loss": 2.9124, + "theoretical_loss": 3.986179569272105, + "tokens_seen": 423836672 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044018054162487465, + "loss": 3.0429, + "theoretical_loss": 3.9861112873763354, + "tokens_seen": 423902208 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044017051153460383, + "loss": 2.9653, + "theoretical_loss": 3.986043018991545, + "tokens_seen": 423967744 + }, + { + "epoch": 1.04, + "learning_rate": 0.000440160481444333, + "loss": 3.0573, + "theoretical_loss": 3.9859747641129726, + "tokens_seen": 424033280 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004401504513540622, + "loss": 3.0092, + "theoretical_loss": 3.9859065227358585, + "tokens_seen": 424098816 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044014042126379137, + "loss": 2.9568, + "theoretical_loss": 3.985838294855448, + "tokens_seen": 424164352 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044013039117352055, + "loss": 2.8587, + "theoretical_loss": 3.9857700804669847, + "tokens_seen": 424229888 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004401203610832498, + "loss": 2.9795, + "theoretical_loss": 3.9857018795657186, + "tokens_seen": 424295424 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 530052, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4916634559631348, + "objective/train/theoretical_loss": 3.985650737737885, + "objective/train/tokens_used": 444804576, + "theoretical_loss": 3.985650737737885, + "tokens_seen": 424344576 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004401103309929789, + "loss": 3.1533, + "theoretical_loss": 3.9856336921469007, + "tokens_seen": 424360960 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044010030090270815, + "loss": 2.9523, + "theoretical_loss": 3.9855655182057825, + "tokens_seen": 424426496 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004400902708124373, + "loss": 2.8567, + "theoretical_loss": 3.9854973577376214, + "tokens_seen": 424492032 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004400802407221665, + "loss": 2.8743, + "theoretical_loss": 3.9854292107376734, + "tokens_seen": 424557568 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004400702106318957, + "loss": 2.8946, + "theoretical_loss": 3.9853610772012007, + "tokens_seen": 424623104 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004400601805416249, + "loss": 2.9646, + "theoretical_loss": 3.985292957123465, + "tokens_seen": 424688640 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044005015045135406, + "loss": 2.9647, + "theoretical_loss": 3.9852248504997325, + "tokens_seen": 424754176 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044004012036108324, + "loss": 2.936, + "theoretical_loss": 3.985156757325269, + "tokens_seen": 424819712 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004400300902708124, + "loss": 2.8415, + "theoretical_loss": 3.9850886775953462, + "tokens_seen": 424885248 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044002006018054165, + "loss": 3.0282, + "theoretical_loss": 3.9850206113052353, + "tokens_seen": 424950784 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004400100300902708, + "loss": 3.0696, + "theoretical_loss": 3.9849525584502112, + "tokens_seen": 425016320 + }, + { + "epoch": 1.04, + "learning_rate": 0.00044, + "loss": 3.1165, + "theoretical_loss": 3.984884519025552, + "tokens_seen": 425081856 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004399899699097292, + "loss": 3.2719, + "theoretical_loss": 3.9848164930265364, + "tokens_seen": 425147392 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004399799398194584, + "loss": 2.9398, + "theoretical_loss": 3.9847484804484457, + "tokens_seen": 425212928 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043996990972918756, + "loss": 2.7438, + "theoretical_loss": 3.9846804812865653, + "tokens_seen": 425278464 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043995987963891674, + "loss": 3.0575, + "theoretical_loss": 3.9846124955361812, + "tokens_seen": 425344000 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004399498495486459, + "loss": 2.8866, + "theoretical_loss": 3.9845445231925827, + "tokens_seen": 425409536 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043993981945837516, + "loss": 3.0092, + "theoretical_loss": 3.984476564251061, + "tokens_seen": 425475072 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043992978936810434, + "loss": 3.1991, + "theoretical_loss": 3.9844086187069108, + "tokens_seen": 425540608 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004399197592778335, + "loss": 3.1595, + "theoretical_loss": 3.9843406865554267, + "tokens_seen": 425606144 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004399097291875627, + "loss": 3.1975, + "theoretical_loss": 3.984272767791908, + "tokens_seen": 425671680 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004398996990972919, + "loss": 2.9193, + "theoretical_loss": 3.984204862411657, + "tokens_seen": 425737216 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004398896690070211, + "loss": 2.92, + "theoretical_loss": 3.9841369704099745, + "tokens_seen": 425802752 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043987963891675024, + "loss": 2.9934, + "theoretical_loss": 3.9840690917821675, + "tokens_seen": 425868288 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004398696088264795, + "loss": 3.1759, + "theoretical_loss": 3.984001226523545, + "tokens_seen": 425933824 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 530759, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.535398006439209, + "objective/train/theoretical_loss": 3.983950336350283, + "objective/train/tokens_used": 446442976, + "theoretical_loss": 3.983950336350283, + "tokens_seen": 425982976 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004398595787362086, + "loss": 2.9939, + "theoretical_loss": 3.983933374629416, + "tokens_seen": 425999360 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043984954864593784, + "loss": 3.0605, + "theoretical_loss": 3.9838655360950925, + "tokens_seen": 426064896 + }, + { + "epoch": 1.04, + "learning_rate": 0.000439839518555667, + "loss": 3.1724, + "theoretical_loss": 3.983797710915892, + "tokens_seen": 426130432 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004398294884653962, + "loss": 2.8873, + "theoretical_loss": 3.9837298990871304, + "tokens_seen": 426195968 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004398194583751254, + "loss": 2.9109, + "theoretical_loss": 3.9836621006041284, + "tokens_seen": 426261504 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004398094282848546, + "loss": 3.051, + "theoretical_loss": 3.9835943154622075, + "tokens_seen": 426327040 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043979939819458375, + "loss": 2.8254, + "theoretical_loss": 3.9835265436566925, + "tokens_seen": 426392576 + }, + { + "epoch": 1.04, + "learning_rate": 0.000439789368104313, + "loss": 3.2306, + "theoretical_loss": 3.9834587851829104, + "tokens_seen": 426458112 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004397793380140421, + "loss": 3.1756, + "theoretical_loss": 3.9833910400361905, + "tokens_seen": 426523648 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043976930792377134, + "loss": 2.9631, + "theoretical_loss": 3.9833233082118644, + "tokens_seen": 426589184 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004397592778335005, + "loss": 3.1489, + "theoretical_loss": 3.9832555897052666, + "tokens_seen": 426654720 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004397492477432297, + "loss": 3.0471, + "theoretical_loss": 3.9831878845117323, + "tokens_seen": 426720256 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004397392176529589, + "loss": 3.0495, + "theoretical_loss": 3.9831201926266018, + "tokens_seen": 426785792 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043972918756268807, + "loss": 3.1702, + "theoretical_loss": 3.983052514045215, + "tokens_seen": 426851328 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043971915747241725, + "loss": 2.9261, + "theoretical_loss": 3.982984848762915, + "tokens_seen": 426916864 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004397091273821465, + "loss": 3.0344, + "theoretical_loss": 3.982917196775049, + "tokens_seen": 426982400 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004396990972918756, + "loss": 3.1101, + "theoretical_loss": 3.9828495580769636, + "tokens_seen": 427047936 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043968906720160485, + "loss": 2.8813, + "theoretical_loss": 3.98278193266401, + "tokens_seen": 427113472 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043967903711133403, + "loss": 2.9827, + "theoretical_loss": 3.9827143205315405, + "tokens_seen": 427179008 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004396690070210632, + "loss": 2.8908, + "theoretical_loss": 3.982646721674911, + "tokens_seen": 427244544 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004396589769307924, + "loss": 3.0251, + "theoretical_loss": 3.9825791360894787, + "tokens_seen": 427310080 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043964894684052157, + "loss": 2.885, + "theoretical_loss": 3.9825115637706023, + "tokens_seen": 427375616 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043963891675025075, + "loss": 2.9362, + "theoretical_loss": 3.982444004713645, + "tokens_seen": 427441152 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043962888665998, + "loss": 2.9634, + "theoretical_loss": 3.982376458913971, + "tokens_seen": 427506688 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004396188565697091, + "loss": 3.0015, + "theoretical_loss": 3.982308926366947, + "tokens_seen": 427572224 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 532253, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.377031087875366, + "objective/train/theoretical_loss": 3.9822582856509445, + "objective/train/tokens_used": 448081376, + "theoretical_loss": 3.9822582856509445, + "tokens_seen": 427621376 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043960882647943835, + "loss": 2.8872, + "theoretical_loss": 3.982241407067942, + "tokens_seen": 427637760 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004395987963891675, + "loss": 3.1085, + "theoretical_loss": 3.9821739010123274, + "tokens_seen": 427703296 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004395887662988967, + "loss": 3.1265, + "theoretical_loss": 3.9821064081954773, + "tokens_seen": 427768832 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004395787362086259, + "loss": 2.9826, + "theoretical_loss": 3.9820389286127678, + "tokens_seen": 427834368 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004395687061183551, + "loss": 3.0233, + "theoretical_loss": 3.981971462259577, + "tokens_seen": 427899904 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043955867602808426, + "loss": 2.8907, + "theoretical_loss": 3.981904009131285, + "tokens_seen": 427965440 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043954864593781344, + "loss": 2.9427, + "theoretical_loss": 3.9818365692232764, + "tokens_seen": 428030976 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004395386158475426, + "loss": 2.6778, + "theoretical_loss": 3.981769142530936, + "tokens_seen": 428096512 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043952858575727185, + "loss": 3.1932, + "theoretical_loss": 3.98170172904965, + "tokens_seen": 428162048 + }, + { + "epoch": 1.04, + "learning_rate": 0.000439518555667001, + "loss": 2.9245, + "theoretical_loss": 3.9816343287748106, + "tokens_seen": 428227584 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004395085255767302, + "loss": 2.8477, + "theoretical_loss": 3.9815669417018085, + "tokens_seen": 428293120 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004394984954864594, + "loss": 3.1402, + "theoretical_loss": 3.981499567826039, + "tokens_seen": 428358656 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004394884653961886, + "loss": 2.8769, + "theoretical_loss": 3.981432207142899, + "tokens_seen": 428424192 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043947843530591776, + "loss": 2.8358, + "theoretical_loss": 3.9813648596477877, + "tokens_seen": 428489728 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043946840521564694, + "loss": 3.0459, + "theoretical_loss": 3.9812975253361067, + "tokens_seen": 428555264 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004394583751253761, + "loss": 3.0235, + "theoretical_loss": 3.9812302042032597, + "tokens_seen": 428620800 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043944834503510536, + "loss": 3.1087, + "theoretical_loss": 3.981162896244653, + "tokens_seen": 428686336 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004394383149448345, + "loss": 3.0048, + "theoretical_loss": 3.9810956014556944, + "tokens_seen": 428751872 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004394282848545637, + "loss": 3.1698, + "theoretical_loss": 3.981028319831796, + "tokens_seen": 428817408 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043941825476429285, + "loss": 3.0103, + "theoretical_loss": 3.9809610513683698, + "tokens_seen": 428882944 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004394082246740221, + "loss": 3.2602, + "theoretical_loss": 3.9808937960608315, + "tokens_seen": 428948480 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043939819458375126, + "loss": 2.9979, + "theoretical_loss": 3.9808265539045986, + "tokens_seen": 429014016 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043938816449348044, + "loss": 2.995, + "theoretical_loss": 3.9807593248950917, + "tokens_seen": 429079552 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004393781344032096, + "loss": 3.0497, + "theoretical_loss": 3.980692109027732, + "tokens_seen": 429145088 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004393681043129388, + "loss": 3.0166, + "theoretical_loss": 3.980624906297945, + "tokens_seen": 429210624 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 533007, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.870386838912964, + "objective/train/theoretical_loss": 3.980574512869385, + "objective/train/tokens_used": 449719776, + "theoretical_loss": 3.980574512869385, + "tokens_seen": 429259776 + }, + { + "epoch": 1.04, + "learning_rate": 0.000439358074222668, + "loss": 2.9974, + "theoretical_loss": 3.9805577167011563, + "tokens_seen": 429276160 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004393480441323972, + "loss": 2.8626, + "theoretical_loss": 3.980490540232797, + "tokens_seen": 429341696 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043933801404212635, + "loss": 2.9723, + "theoretical_loss": 3.9804233768882966, + "tokens_seen": 429407232 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004393279839518556, + "loss": 2.8119, + "theoretical_loss": 3.9803562266630896, + "tokens_seen": 429472768 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043931795386158477, + "loss": 2.9868, + "theoretical_loss": 3.9802890895526124, + "tokens_seen": 429538304 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043930792377131395, + "loss": 2.909, + "theoretical_loss": 3.9802219655523023, + "tokens_seen": 429603840 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043929789368104313, + "loss": 3.1078, + "theoretical_loss": 3.9801548546576004, + "tokens_seen": 429669376 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004392878635907723, + "loss": 2.9401, + "theoretical_loss": 3.9800877568639494, + "tokens_seen": 429734912 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004392778335005015, + "loss": 3.1711, + "theoretical_loss": 3.9800206721667952, + "tokens_seen": 429800448 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004392678034102307, + "loss": 3.2001, + "theoretical_loss": 3.979953600561584, + "tokens_seen": 429865984 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043925777331995985, + "loss": 3.0095, + "theoretical_loss": 3.979886542043766, + "tokens_seen": 429931520 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004392477432296891, + "loss": 3.0156, + "theoretical_loss": 3.979819496608793, + "tokens_seen": 429997056 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004392377131394182, + "loss": 2.8932, + "theoretical_loss": 3.9797524642521194, + "tokens_seen": 430062592 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043922768304914745, + "loss": 3.1111, + "theoretical_loss": 3.979685444969202, + "tokens_seen": 430128128 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043921765295887663, + "loss": 3.0072, + "theoretical_loss": 3.979618438755498, + "tokens_seen": 430193664 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004392076228686058, + "loss": 2.9538, + "theoretical_loss": 3.9795514456064702, + "tokens_seen": 430259200 + }, + { + "epoch": 1.04, + "learning_rate": 0.000439197592778335, + "loss": 3.0761, + "theoretical_loss": 3.979484465517581, + "tokens_seen": 430324736 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043918756268806423, + "loss": 2.9912, + "theoretical_loss": 3.979417498484297, + "tokens_seen": 430390272 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004391775325977934, + "loss": 2.8828, + "theoretical_loss": 3.9793505445020845, + "tokens_seen": 430455808 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004391675025075226, + "loss": 2.9818, + "theoretical_loss": 3.9792836035664148, + "tokens_seen": 430521344 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043915747241725177, + "loss": 3.1237, + "theoretical_loss": 3.9792166756727596, + "tokens_seen": 430586880 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043914744232698095, + "loss": 3.0967, + "theoretical_loss": 3.9791497608165933, + "tokens_seen": 430652416 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004391374122367102, + "loss": 3.1895, + "theoretical_loss": 3.9790828589933933, + "tokens_seen": 430717952 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004391273821464393, + "loss": 2.8996, + "theoretical_loss": 3.9790159701986387, + "tokens_seen": 430783488 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043911735205616855, + "loss": 3.0491, + "theoretical_loss": 3.978949094427811, + "tokens_seen": 430849024 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 534288, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8886990547180176, + "objective/train/theoretical_loss": 3.9788989461439246, + "objective/train/tokens_used": 451358176, + "theoretical_loss": 3.9788989461439246, + "tokens_seen": 430898176 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004391073219658977, + "loss": 2.7626, + "theoretical_loss": 3.978882231676393, + "tokens_seen": 430914560 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004390972918756269, + "loss": 2.8555, + "theoretical_loss": 3.9788153819398717, + "tokens_seen": 430980096 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004390872617853561, + "loss": 3.1994, + "theoretical_loss": 3.978748545213734, + "tokens_seen": 431045632 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004390772316950853, + "loss": 3.0032, + "theoretical_loss": 3.9786817214934715, + "tokens_seen": 431111168 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043906720160481446, + "loss": 2.9658, + "theoretical_loss": 3.9786149107745765, + "tokens_seen": 431176704 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043905717151454364, + "loss": 3.1315, + "theoretical_loss": 3.9785481130525433, + "tokens_seen": 431242240 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004390471414242728, + "loss": 3.1294, + "theoretical_loss": 3.9784813283228693, + "tokens_seen": 431307776 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043903711133400205, + "loss": 3.1198, + "theoretical_loss": 3.978414556581054, + "tokens_seen": 431373312 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004390270812437312, + "loss": 3.0879, + "theoretical_loss": 3.9783477978225994, + "tokens_seen": 431438848 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004390170511534604, + "loss": 3.0557, + "theoretical_loss": 3.9782810520430085, + "tokens_seen": 431504384 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004390070210631896, + "loss": 2.8589, + "theoretical_loss": 3.9782143192377886, + "tokens_seen": 431569920 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004389969909729188, + "loss": 2.7922, + "theoretical_loss": 3.9781475994024467, + "tokens_seen": 431635456 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043898696088264796, + "loss": 2.9462, + "theoretical_loss": 3.9780808925324935, + "tokens_seen": 431700992 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043897693079237714, + "loss": 3.2187, + "theoretical_loss": 3.978014198623443, + "tokens_seen": 431766528 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004389669007021063, + "loss": 2.6678, + "theoretical_loss": 3.977947517670809, + "tokens_seen": 431832064 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043895687061183556, + "loss": 3.0396, + "theoretical_loss": 3.9778808496701092, + "tokens_seen": 431897600 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004389468405215647, + "loss": 3.0952, + "theoretical_loss": 3.9778141946168635, + "tokens_seen": 431963136 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004389368104312939, + "loss": 2.9717, + "theoretical_loss": 3.9777475525065933, + "tokens_seen": 432028672 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043892678034102305, + "loss": 3.0584, + "theoretical_loss": 3.9776809233348223, + "tokens_seen": 432094208 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004389167502507523, + "loss": 2.9005, + "theoretical_loss": 3.977614307097077, + "tokens_seen": 432159744 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043890672016048146, + "loss": 2.6508, + "theoretical_loss": 3.977547703788886, + "tokens_seen": 432225280 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043889669007021064, + "loss": 3.1334, + "theoretical_loss": 3.9774811134057795, + "tokens_seen": 432290816 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004388866599799398, + "loss": 3.1029, + "theoretical_loss": 3.97741453594329, + "tokens_seen": 432356352 + }, + { + "epoch": 1.04, + "learning_rate": 0.000438876629889669, + "loss": 2.8582, + "theoretical_loss": 3.977347971396954, + "tokens_seen": 432421888 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004388665997993982, + "loss": 3.0586, + "theoretical_loss": 3.977281419762307, + "tokens_seen": 432487424 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 535120, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.702747344970703, + "objective/train/theoretical_loss": 3.9772315145069355, + "objective/train/tokens_used": 452996576, + "theoretical_loss": 3.9772315145069355, + "tokens_seen": 432536576 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004388565697091274, + "loss": 2.9321, + "theoretical_loss": 3.9772148810348904, + "tokens_seen": 432552960 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043884653961885655, + "loss": 2.9524, + "theoretical_loss": 3.9771483552102445, + "tokens_seen": 432618496 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004388365095285858, + "loss": 2.8339, + "theoretical_loss": 3.977081842283913, + "tokens_seen": 432684032 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043882647943831497, + "loss": 2.6309, + "theoretical_loss": 3.977015342251444, + "tokens_seen": 432749568 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043881644934804415, + "loss": 2.8433, + "theoretical_loss": 3.976948855108384, + "tokens_seen": 432815104 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043880641925777333, + "loss": 3.1073, + "theoretical_loss": 3.9768823808502836, + "tokens_seen": 432880640 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004387963891675025, + "loss": 2.8288, + "theoretical_loss": 3.976815919472697, + "tokens_seen": 432946176 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004387863590772317, + "loss": 2.9447, + "theoretical_loss": 3.976749470971178, + "tokens_seen": 433011712 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004387763289869609, + "loss": 2.929, + "theoretical_loss": 3.976683035341284, + "tokens_seen": 433077248 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043876629889669005, + "loss": 3.1009, + "theoretical_loss": 3.9766166125785753, + "tokens_seen": 433142784 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004387562688064193, + "loss": 2.8277, + "theoretical_loss": 3.976550202678612, + "tokens_seen": 433208320 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004387462387161484, + "loss": 2.8715, + "theoretical_loss": 3.9764838056369594, + "tokens_seen": 433273856 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043873620862587765, + "loss": 2.9434, + "theoretical_loss": 3.9764174214491828, + "tokens_seen": 433339392 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043872617853560683, + "loss": 3.0301, + "theoretical_loss": 3.97635105011085, + "tokens_seen": 433404928 + }, + { + "epoch": 1.04, + "learning_rate": 0.000438716148445336, + "loss": 2.9667, + "theoretical_loss": 3.976284691617532, + "tokens_seen": 433470464 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004387061183550652, + "loss": 2.8421, + "theoretical_loss": 3.976218345964801, + "tokens_seen": 433536000 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043869608826479443, + "loss": 3.0121, + "theoretical_loss": 3.9761520131482326, + "tokens_seen": 433601536 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043868605817452356, + "loss": 3.2531, + "theoretical_loss": 3.976085693163403, + "tokens_seen": 433667072 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004386760280842528, + "loss": 3.0608, + "theoretical_loss": 3.9760193860058917, + "tokens_seen": 433732608 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004386659979939819, + "loss": 3.1406, + "theoretical_loss": 3.97595309167128, + "tokens_seen": 433798144 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043865596790371115, + "loss": 2.7294, + "theoretical_loss": 3.975886810155152, + "tokens_seen": 433863680 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043864593781344033, + "loss": 2.9871, + "theoretical_loss": 3.9758205414530923, + "tokens_seen": 433929216 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004386359077231695, + "loss": 2.9434, + "theoretical_loss": 3.9757542855606895, + "tokens_seen": 433994752 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004386258776328987, + "loss": 2.856, + "theoretical_loss": 3.975688042473534, + "tokens_seen": 434060288 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004386158475426279, + "loss": 3.0601, + "theoretical_loss": 3.975621812187218, + "tokens_seen": 434125824 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 536351, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.160634756088257, + "objective/train/theoretical_loss": 3.9755721478703814, + "objective/train/tokens_used": 454634976, + "theoretical_loss": 3.9755721478703814, + "tokens_seen": 434174976 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043860581745235706, + "loss": 3.0205, + "theoretical_loss": 3.9755555946973358, + "tokens_seen": 434191360 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004385957873620863, + "loss": 2.9008, + "theoretical_loss": 3.975489389999484, + "tokens_seen": 434256896 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004385857572718154, + "loss": 2.9809, + "theoretical_loss": 3.9754231980892616, + "tokens_seen": 434322432 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043857572718154466, + "loss": 3.0585, + "theoretical_loss": 3.97535701896227, + "tokens_seen": 434387968 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004385656970912738, + "loss": 3.105, + "theoretical_loss": 3.9752908526141115, + "tokens_seen": 434453504 + }, + { + "epoch": 1.04, + "learning_rate": 0.000438555667001003, + "loss": 3.0696, + "theoretical_loss": 3.9752246990403926, + "tokens_seen": 434519040 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004385456369107322, + "loss": 3.1542, + "theoretical_loss": 3.9751585582367195, + "tokens_seen": 434584576 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004385356068204614, + "loss": 2.8699, + "theoretical_loss": 3.9750924301987034, + "tokens_seen": 434650112 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043852557673019056, + "loss": 3.0068, + "theoretical_loss": 3.975026314921955, + "tokens_seen": 434715648 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004385155466399198, + "loss": 2.9832, + "theoretical_loss": 3.974960212402089, + "tokens_seen": 434781184 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004385055165496489, + "loss": 2.9908, + "theoretical_loss": 3.974894122634722, + "tokens_seen": 434846720 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043849548645937816, + "loss": 2.838, + "theoretical_loss": 3.9748280456154714, + "tokens_seen": 434912256 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004384854563691073, + "loss": 3.1155, + "theoretical_loss": 3.9747619813399586, + "tokens_seen": 434977792 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004384754262788365, + "loss": 3.0884, + "theoretical_loss": 3.974695929803806, + "tokens_seen": 435043328 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004384653961885657, + "loss": 3.1434, + "theoretical_loss": 3.9746298910026385, + "tokens_seen": 435108864 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004384553660982949, + "loss": 3.0162, + "theoretical_loss": 3.974563864932083, + "tokens_seen": 435174400 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043844533600802407, + "loss": 2.9452, + "theoretical_loss": 3.9744978515877696, + "tokens_seen": 435239936 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043843530591775325, + "loss": 3.0573, + "theoretical_loss": 3.9744318509653285, + "tokens_seen": 435305472 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004384252758274825, + "loss": 3.0258, + "theoretical_loss": 3.9743658630603935, + "tokens_seen": 435371008 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043841524573721166, + "loss": 2.9923, + "theoretical_loss": 3.9742998878686007, + "tokens_seen": 435436544 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043840521564694084, + "loss": 3.0141, + "theoretical_loss": 3.974233925385588, + "tokens_seen": 435502080 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043839518555667, + "loss": 2.7964, + "theoretical_loss": 3.974167975606995, + "tokens_seen": 435567616 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004383851554663992, + "loss": 2.9255, + "theoretical_loss": 3.974102038528464, + "tokens_seen": 435633152 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004383751253761284, + "loss": 2.8229, + "theoretical_loss": 3.9740361141456395, + "tokens_seen": 435698688 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004383650952858576, + "loss": 3.1127, + "theoretical_loss": 3.9739702024541677, + "tokens_seen": 435764224 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 537149, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.811138868331909, + "objective/train/theoretical_loss": 3.9739207770116467, + "objective/train/tokens_used": 456273376, + "theoretical_loss": 3.9739207770116467, + "tokens_seen": 435813376 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043835506519558675, + "loss": 2.8946, + "theoretical_loss": 3.9739043034496975, + "tokens_seen": 435829760 + }, + { + "epoch": 1.04, + "learning_rate": 0.000438345035105316, + "loss": 2.9828, + "theoretical_loss": 3.9738384171278787, + "tokens_seen": 435895296 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043833500501504517, + "loss": 3.0393, + "theoretical_loss": 3.9737725434843654, + "tokens_seen": 435960832 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043832497492477435, + "loss": 2.9571, + "theoretical_loss": 3.9737066825148126, + "tokens_seen": 436026368 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043831494483450353, + "loss": 3.052, + "theoretical_loss": 3.9736408342148763, + "tokens_seen": 436091904 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004383049147442327, + "loss": 2.8974, + "theoretical_loss": 3.973574998580217, + "tokens_seen": 436157440 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004382948846539619, + "loss": 3.1101, + "theoretical_loss": 3.973509175606495, + "tokens_seen": 436222976 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004382848545636911, + "loss": 2.8409, + "theoretical_loss": 3.9734433652893753, + "tokens_seen": 436288512 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043827482447342025, + "loss": 3.0663, + "theoretical_loss": 3.9733775676245227, + "tokens_seen": 436354048 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004382647943831495, + "loss": 3.0505, + "theoretical_loss": 3.973311782607605, + "tokens_seen": 436419584 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004382547642928786, + "loss": 3.0826, + "theoretical_loss": 3.973246010234292, + "tokens_seen": 436485120 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043824473420260785, + "loss": 3.0302, + "theoretical_loss": 3.973180250500257, + "tokens_seen": 436550656 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043823470411233703, + "loss": 2.9014, + "theoretical_loss": 3.973114503401174, + "tokens_seen": 436616192 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004382246740220662, + "loss": 3.0467, + "theoretical_loss": 3.973048768932718, + "tokens_seen": 436681728 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004382146439317954, + "loss": 3.1691, + "theoretical_loss": 3.972983047090569, + "tokens_seen": 436747264 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043820461384152463, + "loss": 2.8118, + "theoretical_loss": 3.972917337870407, + "tokens_seen": 436812800 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043819458375125376, + "loss": 2.9634, + "theoretical_loss": 3.9728516412679147, + "tokens_seen": 436878336 + }, + { + "epoch": 1.04, + "learning_rate": 0.000438184553660983, + "loss": 3.2248, + "theoretical_loss": 3.9727859572787763, + "tokens_seen": 436943872 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004381745235707121, + "loss": 2.7478, + "theoretical_loss": 3.9727202858986805, + "tokens_seen": 437009408 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043816449348044135, + "loss": 2.9089, + "theoretical_loss": 3.972654627123316, + "tokens_seen": 437074944 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043815446339017053, + "loss": 3.0526, + "theoretical_loss": 3.972588980948373, + "tokens_seen": 437140480 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004381444332998997, + "loss": 3.0026, + "theoretical_loss": 3.9725233473695454, + "tokens_seen": 437206016 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004381344032096289, + "loss": 2.8952, + "theoretical_loss": 3.9724577263825296, + "tokens_seen": 437271552 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004381243731193581, + "loss": 2.9139, + "theoretical_loss": 3.9723921179830217, + "tokens_seen": 437337088 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043811434302908726, + "loss": 2.9151, + "theoretical_loss": 3.972326522166722, + "tokens_seen": 437402624 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 538542, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2445921897888184, + "objective/train/theoretical_loss": 3.972277333559642, + "objective/train/tokens_used": 457911776, + "theoretical_loss": 3.972277333559642, + "tokens_seen": 437451776 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004381043129388165, + "loss": 3.0233, + "theoretical_loss": 3.972260938929333, + "tokens_seen": 437468160 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004380942828485456, + "loss": 2.7278, + "theoretical_loss": 3.972195368266558, + "tokens_seen": 437533696 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043808425275827486, + "loss": 3.0123, + "theoretical_loss": 3.9721298101741027, + "tokens_seen": 437599232 + }, + { + "epoch": 1.04, + "learning_rate": 0.000438074222668004, + "loss": 2.9663, + "theoretical_loss": 3.972064264647676, + "tokens_seen": 437664768 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004380641925777332, + "loss": 3.1004, + "theoretical_loss": 3.971998731682988, + "tokens_seen": 437730304 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004380541624874624, + "loss": 2.9891, + "theoretical_loss": 3.9719332112757515, + "tokens_seen": 437795840 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004380441323971916, + "loss": 3.0848, + "theoretical_loss": 3.9718677034216796, + "tokens_seen": 437861376 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043803410230692076, + "loss": 2.9775, + "theoretical_loss": 3.97180220811649, + "tokens_seen": 437926912 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043802407221665, + "loss": 2.7554, + "theoretical_loss": 3.9717367253559015, + "tokens_seen": 437992448 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004380140421263791, + "loss": 3.0661, + "theoretical_loss": 3.971671255135634, + "tokens_seen": 438057984 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043800401203610836, + "loss": 3.0481, + "theoretical_loss": 3.971605797451411, + "tokens_seen": 438123520 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004379939819458375, + "loss": 2.7857, + "theoretical_loss": 3.971540352298958, + "tokens_seen": 438189056 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004379839518555667, + "loss": 3.0102, + "theoretical_loss": 3.971474919674001, + "tokens_seen": 438254592 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004379739217652959, + "loss": 2.9069, + "theoretical_loss": 3.97140949957227, + "tokens_seen": 438320128 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004379638916750251, + "loss": 3.0839, + "theoretical_loss": 3.9713440919894962, + "tokens_seen": 438385664 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043795386158475427, + "loss": 2.8571, + "theoretical_loss": 3.971278696921412, + "tokens_seen": 438451200 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043794383149448345, + "loss": 3.0024, + "theoretical_loss": 3.971213314363754, + "tokens_seen": 438516736 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043793380140421263, + "loss": 2.8872, + "theoretical_loss": 3.9711479443122593, + "tokens_seen": 438582272 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043792377131394186, + "loss": 3.0489, + "theoretical_loss": 3.9710825867626673, + "tokens_seen": 438647808 + }, + { + "epoch": 1.04, + "learning_rate": 0.000437913741223671, + "loss": 2.8011, + "theoretical_loss": 3.9710172417107206, + "tokens_seen": 438713344 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004379037111334002, + "loss": 2.8608, + "theoretical_loss": 3.970951909152162, + "tokens_seen": 438778880 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043789368104312935, + "loss": 3.0229, + "theoretical_loss": 3.9708865890827383, + "tokens_seen": 438844416 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004378836509528586, + "loss": 3.0305, + "theoretical_loss": 3.9708212814981962, + "tokens_seen": 438909952 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043787362086258777, + "loss": 2.5924, + "theoretical_loss": 3.970755986394287, + "tokens_seen": 438975488 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043786359077231695, + "loss": 2.9254, + "theoretical_loss": 3.970690703766763, + "tokens_seen": 439041024 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 539178, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.423835277557373, + "objective/train/theoretical_loss": 3.9706417499811923, + "objective/train/tokens_used": 459550176, + "theoretical_loss": 3.9706417499811923, + "tokens_seen": 439090176 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043785356068204613, + "loss": 2.8201, + "theoretical_loss": 3.970625433611377, + "tokens_seen": 439106560 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043784353059177537, + "loss": 3.0262, + "theoretical_loss": 3.9705601759238864, + "tokens_seen": 439172096 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004378335005015045, + "loss": 3.1224, + "theoretical_loss": 3.97049493070005, + "tokens_seen": 439237632 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043782347041123373, + "loss": 3.0464, + "theoretical_loss": 3.970429697935627, + "tokens_seen": 439303168 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043781344032096286, + "loss": 2.9987, + "theoretical_loss": 3.9703644776263802, + "tokens_seen": 439368704 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004378034102306921, + "loss": 3.0031, + "theoretical_loss": 3.9702992697680752, + "tokens_seen": 439434240 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043779338014042127, + "loss": 2.8794, + "theoretical_loss": 3.970234074356477, + "tokens_seen": 439499776 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043778335005015045, + "loss": 3.1284, + "theoretical_loss": 3.9701688913873565, + "tokens_seen": 439565312 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043777331995987963, + "loss": 2.988, + "theoretical_loss": 3.970103720856483, + "tokens_seen": 439630848 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004377632898696088, + "loss": 2.8583, + "theoretical_loss": 3.9700385627596297, + "tokens_seen": 439696384 + }, + { + "epoch": 1.04, + "learning_rate": 0.000437753259779338, + "loss": 3.0441, + "theoretical_loss": 3.969973417092571, + "tokens_seen": 439761920 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043774322968906723, + "loss": 2.8557, + "theoretical_loss": 3.969908283851085, + "tokens_seen": 439827456 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043773319959879636, + "loss": 2.9863, + "theoretical_loss": 3.9698431630309505, + "tokens_seen": 439892992 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004377231695085256, + "loss": 3.1955, + "theoretical_loss": 3.969778054627948, + "tokens_seen": 439958528 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004377131394182547, + "loss": 2.8304, + "theoretical_loss": 3.969712958637862, + "tokens_seen": 440024064 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043770310932798396, + "loss": 2.9701, + "theoretical_loss": 3.969647875056476, + "tokens_seen": 440089600 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043769307923771314, + "loss": 2.954, + "theoretical_loss": 3.9695828038795784, + "tokens_seen": 440155136 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004376830491474423, + "loss": 2.7975, + "theoretical_loss": 3.969517745102958, + "tokens_seen": 440220672 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043767301905717155, + "loss": 2.7506, + "theoretical_loss": 3.9694526987224075, + "tokens_seen": 440286208 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043766298896690073, + "loss": 2.9735, + "theoretical_loss": 3.9693876647337185, + "tokens_seen": 440351744 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004376529588766299, + "loss": 2.7676, + "theoretical_loss": 3.969322643132688, + "tokens_seen": 440417280 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004376429287863591, + "loss": 2.9377, + "theoretical_loss": 3.969257633915113, + "tokens_seen": 440482816 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004376328986960883, + "loss": 2.7283, + "theoretical_loss": 3.9691926370767936, + "tokens_seen": 440548352 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043762286860581746, + "loss": 2.9292, + "theoretical_loss": 3.969127652613531, + "tokens_seen": 440613888 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004376128385155467, + "loss": 3.1896, + "theoretical_loss": 3.9690626805211293, + "tokens_seen": 440679424 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 540524, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3463034629821777, + "objective/train/theoretical_loss": 3.9690139595676817, + "objective/train/tokens_used": 461188576, + "theoretical_loss": 3.9690139595676817, + "tokens_seen": 440728576 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004376028084252758, + "loss": 3.082, + "theoretical_loss": 3.9689977207953935, + "tokens_seen": 440744960 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043759277833500506, + "loss": 2.7782, + "theoretical_loss": 3.9689327734321327, + "tokens_seen": 440810496 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004375827482447342, + "loss": 3.0849, + "theoretical_loss": 3.9688678384271556, + "tokens_seen": 440876032 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004375727181544634, + "loss": 3.0616, + "theoretical_loss": 3.9688029157762745, + "tokens_seen": 440941568 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004375626880641926, + "loss": 3.1015, + "theoretical_loss": 3.968738005475304, + "tokens_seen": 441007104 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004375526579739218, + "loss": 2.913, + "theoretical_loss": 3.9686731075200594, + "tokens_seen": 441072640 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043754262788365096, + "loss": 2.9542, + "theoretical_loss": 3.9686082219063588, + "tokens_seen": 441138176 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004375325977933802, + "loss": 2.8682, + "theoretical_loss": 3.968543348630022, + "tokens_seen": 441203712 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004375225677031093, + "loss": 2.7976, + "theoretical_loss": 3.968478487686872, + "tokens_seen": 441269248 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043751253761283856, + "loss": 2.8125, + "theoretical_loss": 3.9684136390727325, + "tokens_seen": 441334784 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004375025075225677, + "loss": 2.6787, + "theoretical_loss": 3.9683488027834293, + "tokens_seen": 441400320 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004374924774322969, + "loss": 2.9182, + "theoretical_loss": 3.9682839788147906, + "tokens_seen": 441465856 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004374824473420261, + "loss": 2.6642, + "theoretical_loss": 3.9682191671626477, + "tokens_seen": 441531392 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004374724172517553, + "loss": 2.9396, + "theoretical_loss": 3.968154367822832, + "tokens_seen": 441596928 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043746238716148447, + "loss": 2.8495, + "theoretical_loss": 3.968089580791178, + "tokens_seen": 441662464 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043745235707121365, + "loss": 3.2537, + "theoretical_loss": 3.9680248060635215, + "tokens_seen": 441728000 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043744232698094283, + "loss": 3.1128, + "theoretical_loss": 3.967960043635702, + "tokens_seen": 441793536 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043743229689067206, + "loss": 2.9745, + "theoretical_loss": 3.9678952935035587, + "tokens_seen": 441859072 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004374222668004012, + "loss": 2.9838, + "theoretical_loss": 3.967830555662935, + "tokens_seen": 441924608 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004374122367101304, + "loss": 3.0596, + "theoretical_loss": 3.9677658301096743, + "tokens_seen": 441990144 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043740220661985955, + "loss": 3.0076, + "theoretical_loss": 3.967701116839624, + "tokens_seen": 442055680 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004373921765295888, + "loss": 3.0611, + "theoretical_loss": 3.9676364158486326, + "tokens_seen": 442121216 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043738214643931797, + "loss": 2.8529, + "theoretical_loss": 3.9675717271325492, + "tokens_seen": 442186752 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043737211634904715, + "loss": 2.7567, + "theoretical_loss": 3.967507050687228, + "tokens_seen": 442252288 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043736208625877633, + "loss": 3.0194, + "theoretical_loss": 3.967442386508522, + "tokens_seen": 442317824 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 541315, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.549229145050049, + "objective/train/theoretical_loss": 3.9673938964219673, + "objective/train/tokens_used": 462826976, + "theoretical_loss": 3.9673938964219673, + "tokens_seen": 442366976 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043735205616850557, + "loss": 2.6802, + "theoretical_loss": 3.967377734592289, + "tokens_seen": 442383360 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004373420260782347, + "loss": 2.9142, + "theoretical_loss": 3.9673130949343873, + "tokens_seen": 442448896 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043733199598796393, + "loss": 3.0676, + "theoretical_loss": 3.967248467530677, + "tokens_seen": 442514432 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043732196589769306, + "loss": 3.0618, + "theoretical_loss": 3.967183852377021, + "tokens_seen": 442579968 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004373119358074223, + "loss": 3.0316, + "theoretical_loss": 3.9671192494692837, + "tokens_seen": 442645504 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043730190571715147, + "loss": 3.0938, + "theoretical_loss": 3.9670546588033315, + "tokens_seen": 442711040 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043729187562688065, + "loss": 2.8994, + "theoretical_loss": 3.9669900803750338, + "tokens_seen": 442776576 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043728184553660983, + "loss": 3.0618, + "theoretical_loss": 3.96692551418026, + "tokens_seen": 442842112 + }, + { + "epoch": 1.04, + "learning_rate": 0.000437271815446339, + "loss": 3.1426, + "theoretical_loss": 3.966860960214883, + "tokens_seen": 442907648 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004372617853560682, + "loss": 2.9978, + "theoretical_loss": 3.966796418474779, + "tokens_seen": 442973184 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043725175526579743, + "loss": 2.6247, + "theoretical_loss": 3.966731888955823, + "tokens_seen": 443038720 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043724172517552656, + "loss": 2.8093, + "theoretical_loss": 3.966667371653893, + "tokens_seen": 443104256 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004372316950852558, + "loss": 2.9878, + "theoretical_loss": 3.9666028665648714, + "tokens_seen": 443169792 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004372216649949849, + "loss": 2.8664, + "theoretical_loss": 3.96653837368464, + "tokens_seen": 443235328 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043721163490471416, + "loss": 2.9273, + "theoretical_loss": 3.9664738930090833, + "tokens_seen": 443300864 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043720160481444334, + "loss": 2.9889, + "theoretical_loss": 3.9664094245340875, + "tokens_seen": 443366400 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004371915747241725, + "loss": 2.9439, + "theoretical_loss": 3.9663449682555423, + "tokens_seen": 443431936 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004371815446339017, + "loss": 3.0401, + "theoretical_loss": 3.966280524169337, + "tokens_seen": 443497472 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043717151454363093, + "loss": 3.0757, + "theoretical_loss": 3.966216092271365, + "tokens_seen": 443563008 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043716148445336006, + "loss": 2.9564, + "theoretical_loss": 3.966151672557521, + "tokens_seen": 443628544 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004371514543630893, + "loss": 2.8941, + "theoretical_loss": 3.9660872650237016, + "tokens_seen": 443694080 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004371414242728184, + "loss": 3.0021, + "theoretical_loss": 3.966022869665804, + "tokens_seen": 443759616 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043713139418254766, + "loss": 3.0318, + "theoretical_loss": 3.96595848647973, + "tokens_seen": 443825152 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043712136409227684, + "loss": 3.0716, + "theoretical_loss": 3.9658941154613823, + "tokens_seen": 443890688 + }, + { + "epoch": 1.04, + "learning_rate": 0.000437111334002006, + "loss": 2.9925, + "theoretical_loss": 3.9658297566066647, + "tokens_seen": 443956224 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 542939, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.203843355178833, + "objective/train/theoretical_loss": 3.965781495445546, + "objective/train/tokens_used": 464465376, + "theoretical_loss": 3.965781495445546, + "tokens_seen": 444005376 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004371013039117352, + "loss": 3.0367, + "theoretical_loss": 3.9657654099114836, + "tokens_seen": 444021760 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004370912738214644, + "loss": 3.0506, + "theoretical_loss": 3.965701075371748, + "tokens_seen": 444087296 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043708124373119356, + "loss": 2.9351, + "theoretical_loss": 3.965636752983368, + "tokens_seen": 444152832 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004370712136409228, + "loss": 2.8604, + "theoretical_loss": 3.965572442742256, + "tokens_seen": 444218368 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004370611835506519, + "loss": 3.1753, + "theoretical_loss": 3.9655081446443265, + "tokens_seen": 444283904 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043705115346038116, + "loss": 2.8183, + "theoretical_loss": 3.9654438586854965, + "tokens_seen": 444349440 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004370411233701103, + "loss": 3.1016, + "theoretical_loss": 3.965379584861684, + "tokens_seen": 444414976 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004370310932798395, + "loss": 2.6801, + "theoretical_loss": 3.965315323168808, + "tokens_seen": 444480512 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004370210631895687, + "loss": 2.9384, + "theoretical_loss": 3.9652510736027926, + "tokens_seen": 444546048 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004370110330992979, + "loss": 2.8253, + "theoretical_loss": 3.9651868361595612, + "tokens_seen": 444611584 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043700100300902707, + "loss": 2.925, + "theoretical_loss": 3.9651226108350395, + "tokens_seen": 444677120 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004369909729187563, + "loss": 2.9335, + "theoretical_loss": 3.965058397625157, + "tokens_seen": 444742656 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043698094282848543, + "loss": 2.9976, + "theoretical_loss": 3.9649941965258435, + "tokens_seen": 444808192 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043697091273821467, + "loss": 2.7717, + "theoretical_loss": 3.9649300075330305, + "tokens_seen": 444873728 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004369608826479438, + "loss": 2.8541, + "theoretical_loss": 3.964865830642653, + "tokens_seen": 444939264 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043695085255767303, + "loss": 3.0042, + "theoretical_loss": 3.964801665850646, + "tokens_seen": 445004800 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004369408224674022, + "loss": 3.0289, + "theoretical_loss": 3.964737513152949, + "tokens_seen": 445070336 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004369307923771314, + "loss": 3.0322, + "theoretical_loss": 3.9646733725455006, + "tokens_seen": 445135872 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004369207622868606, + "loss": 3.0371, + "theoretical_loss": 3.964609244024243, + "tokens_seen": 445201408 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043691073219658975, + "loss": 2.8602, + "theoretical_loss": 3.9645451275851205, + "tokens_seen": 445266944 + }, + { + "epoch": 1.05, + "learning_rate": 0.000436900702106319, + "loss": 2.9065, + "theoretical_loss": 3.9644810232240792, + "tokens_seen": 445332480 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043689067201604817, + "loss": 2.9665, + "theoretical_loss": 3.9644169309370665, + "tokens_seen": 445398016 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043688064192577735, + "loss": 2.9056, + "theoretical_loss": 3.964352850720032, + "tokens_seen": 445463552 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043687061183550653, + "loss": 2.8584, + "theoretical_loss": 3.9642887825689286, + "tokens_seen": 445529088 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043686058174523577, + "loss": 2.9096, + "theoretical_loss": 3.964224726479708, + "tokens_seen": 445594624 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 543579, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.705620765686035, + "objective/train/theoretical_loss": 3.964176692325972, + "objective/train/tokens_used": 466103776, + "theoretical_loss": 3.964176692325972, + "tokens_seen": 445643776 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004368505516549649, + "loss": 3.0128, + "theoretical_loss": 3.9641606824483278, + "tokens_seen": 445660160 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043684052156469413, + "loss": 3.0941, + "theoretical_loss": 3.9640966504707444, + "tokens_seen": 445725696 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043683049147442326, + "loss": 2.9789, + "theoretical_loss": 3.964032630542918, + "tokens_seen": 445791232 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004368204613841525, + "loss": 2.81, + "theoretical_loss": 3.96396862266081, + "tokens_seen": 445856768 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043681043129388167, + "loss": 2.9512, + "theoretical_loss": 3.9639046268203835, + "tokens_seen": 445922304 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043680040120361085, + "loss": 3.0781, + "theoretical_loss": 3.963840643017604, + "tokens_seen": 445987840 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043679037111334003, + "loss": 2.7061, + "theoretical_loss": 3.9637766712484384, + "tokens_seen": 446053376 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004367803410230692, + "loss": 3.125, + "theoretical_loss": 3.9637127115088573, + "tokens_seen": 446118912 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004367703109327984, + "loss": 2.981, + "theoretical_loss": 3.963648763794831, + "tokens_seen": 446184448 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043676028084252763, + "loss": 2.7527, + "theoretical_loss": 3.9635848281023325, + "tokens_seen": 446249984 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043675025075225676, + "loss": 2.8724, + "theoretical_loss": 3.9635209044273365, + "tokens_seen": 446315520 + }, + { + "epoch": 1.05, + "learning_rate": 0.000436740220661986, + "loss": 2.9677, + "theoretical_loss": 3.9634569927658214, + "tokens_seen": 446381056 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004367301905717151, + "loss": 2.9814, + "theoretical_loss": 3.9633930931137655, + "tokens_seen": 446446592 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043672016048144436, + "loss": 2.9977, + "theoretical_loss": 3.963329205467149, + "tokens_seen": 446512128 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043671013039117354, + "loss": 2.8533, + "theoretical_loss": 3.9632653298219562, + "tokens_seen": 446577664 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004367001003009027, + "loss": 2.7676, + "theoretical_loss": 3.96320146617417, + "tokens_seen": 446643200 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004366900702106319, + "loss": 3.0384, + "theoretical_loss": 3.9631376145197796, + "tokens_seen": 446708736 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043668004012036113, + "loss": 3.0348, + "theoretical_loss": 3.963073774854771, + "tokens_seen": 446774272 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043667001003009026, + "loss": 3.0659, + "theoretical_loss": 3.9630099471751365, + "tokens_seen": 446839808 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004366599799398195, + "loss": 3.0177, + "theoretical_loss": 3.962946131476868, + "tokens_seen": 446905344 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004366499498495486, + "loss": 2.9298, + "theoretical_loss": 3.9628823277559597, + "tokens_seen": 446970880 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043663991975927786, + "loss": 2.9698, + "theoretical_loss": 3.9628185360084087, + "tokens_seen": 447036416 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043662988966900704, + "loss": 3.0568, + "theoretical_loss": 3.962754756230213, + "tokens_seen": 447101952 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004366198595787362, + "loss": 2.9263, + "theoretical_loss": 3.9626909884173727, + "tokens_seen": 447167488 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004366098294884654, + "loss": 3.1906, + "theoretical_loss": 3.9626272325658896, + "tokens_seen": 447233024 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 544310, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0421924591064453, + "objective/train/theoretical_loss": 3.962579423524514, + "objective/train/tokens_used": 467742176, + "theoretical_loss": 3.962579423524514, + "tokens_seen": 447282176 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004365997993981946, + "loss": 3.0931, + "theoretical_loss": 3.9625634886717678, + "tokens_seen": 447298560 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043658976930792376, + "loss": 2.9716, + "theoretical_loss": 3.962499756731014, + "tokens_seen": 447364096 + }, + { + "epoch": 1.05, + "learning_rate": 0.000436579739217653, + "loss": 3.0392, + "theoretical_loss": 3.9624360367396347, + "tokens_seen": 447429632 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043656970912738213, + "loss": 2.9594, + "theoretical_loss": 3.9623723286936414, + "tokens_seen": 447495168 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043655967903711136, + "loss": 3.1369, + "theoretical_loss": 3.9623086325890444, + "tokens_seen": 447560704 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004365496489468405, + "loss": 2.9239, + "theoretical_loss": 3.9622449484218585, + "tokens_seen": 447626240 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004365396188565697, + "loss": 3.2243, + "theoretical_loss": 3.9621812761880975, + "tokens_seen": 447691776 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004365295887662989, + "loss": 2.8872, + "theoretical_loss": 3.9621176158837814, + "tokens_seen": 447757312 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004365195586760281, + "loss": 3.1265, + "theoretical_loss": 3.9620539675049278, + "tokens_seen": 447822848 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043650952858575727, + "loss": 2.9101, + "theoretical_loss": 3.9619903310475575, + "tokens_seen": 447888384 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004364994984954865, + "loss": 2.9973, + "theoretical_loss": 3.9619267065076955, + "tokens_seen": 447953920 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043648946840521563, + "loss": 2.9611, + "theoretical_loss": 3.961863093881366, + "tokens_seen": 448019456 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043647943831494487, + "loss": 3.2598, + "theoretical_loss": 3.9617994931645955, + "tokens_seen": 448084992 + }, + { + "epoch": 1.05, + "learning_rate": 0.000436469408224674, + "loss": 2.9044, + "theoretical_loss": 3.9617359043534135, + "tokens_seen": 448150528 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043645937813440323, + "loss": 2.9648, + "theoretical_loss": 3.961672327443851, + "tokens_seen": 448216064 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004364493480441324, + "loss": 2.9154, + "theoretical_loss": 3.96160876243194, + "tokens_seen": 448281600 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004364393179538616, + "loss": 2.9511, + "theoretical_loss": 3.961545209313715, + "tokens_seen": 448347136 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043642928786359077, + "loss": 2.8572, + "theoretical_loss": 3.961481668085214, + "tokens_seen": 448412672 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043641925777331995, + "loss": 3.2968, + "theoretical_loss": 3.9614181387424745, + "tokens_seen": 448478208 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043640922768304913, + "loss": 2.8314, + "theoretical_loss": 3.9613546212815365, + "tokens_seen": 448543744 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043639919759277837, + "loss": 2.9011, + "theoretical_loss": 3.961291115698442, + "tokens_seen": 448609280 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004363891675025075, + "loss": 2.757, + "theoretical_loss": 3.9612276219892366, + "tokens_seen": 448674816 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043637913741223673, + "loss": 2.8699, + "theoretical_loss": 3.9611641401499647, + "tokens_seen": 448740352 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004363691073219659, + "loss": 3.0423, + "theoretical_loss": 3.961100670176675, + "tokens_seen": 448805888 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004363590772316951, + "loss": 2.9675, + "theoretical_loss": 3.961037212065418, + "tokens_seen": 448871424 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 544864, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.588275909423828, + "objective/train/theoretical_loss": 3.9609896262640576, + "objective/train/tokens_used": 469380576, + "theoretical_loss": 3.9609896262640576, + "tokens_seen": 448920576 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004363490471414243, + "loss": 2.9927, + "theoretical_loss": 3.960973765812244, + "tokens_seen": 448936960 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043633901705115346, + "loss": 3.0827, + "theoretical_loss": 3.9609103314132064, + "tokens_seen": 449002496 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043632898696088264, + "loss": 3.0377, + "theoretical_loss": 3.960846908864362, + "tokens_seen": 449068032 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043631895687061187, + "loss": 2.8994, + "theoretical_loss": 3.9607834981617676, + "tokens_seen": 449133568 + }, + { + "epoch": 1.05, + "learning_rate": 0.000436308926780341, + "loss": 2.9719, + "theoretical_loss": 3.9607200993014824, + "tokens_seen": 449199104 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043629889669007023, + "loss": 2.9095, + "theoretical_loss": 3.960656712279567, + "tokens_seen": 449264640 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043628886659979936, + "loss": 2.9354, + "theoretical_loss": 3.960593337092086, + "tokens_seen": 449330176 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004362788365095286, + "loss": 3.012, + "theoretical_loss": 3.9605299737351025, + "tokens_seen": 449395712 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004362688064192578, + "loss": 3.1365, + "theoretical_loss": 3.9604666222046845, + "tokens_seen": 449461248 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043625877632898696, + "loss": 2.8716, + "theoretical_loss": 3.9604032824968995, + "tokens_seen": 449526784 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043624874623871614, + "loss": 2.8419, + "theoretical_loss": 3.960339954607819, + "tokens_seen": 449592320 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004362387161484453, + "loss": 3.0657, + "theoretical_loss": 3.9602766385335153, + "tokens_seen": 449657856 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004362286860581745, + "loss": 3.0267, + "theoretical_loss": 3.9602133342700623, + "tokens_seen": 449723392 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043621865596790374, + "loss": 3.0393, + "theoretical_loss": 3.9601500418135367, + "tokens_seen": 449788928 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043620862587763286, + "loss": 3.1022, + "theoretical_loss": 3.960086761160016, + "tokens_seen": 449854464 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004361985957873621, + "loss": 2.9125, + "theoretical_loss": 3.960023492305581, + "tokens_seen": 449920000 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004361885656970913, + "loss": 2.9061, + "theoretical_loss": 3.959960235246312, + "tokens_seen": 449985536 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043617853560682046, + "loss": 3.0702, + "theoretical_loss": 3.9598969899782936, + "tokens_seen": 450051072 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004361685055165497, + "loss": 2.9372, + "theoretical_loss": 3.9598337564976114, + "tokens_seen": 450116608 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004361584754262788, + "loss": 3.0303, + "theoretical_loss": 3.9597705348003527, + "tokens_seen": 450182144 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043614844533600806, + "loss": 2.8191, + "theoretical_loss": 3.959707324882607, + "tokens_seen": 450247680 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043613841524573724, + "loss": 3.0286, + "theoretical_loss": 3.9596441267404647, + "tokens_seen": 450313216 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004361283851554664, + "loss": 2.9917, + "theoretical_loss": 3.95958094037002, + "tokens_seen": 450378752 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004361183550651956, + "loss": 3.238, + "theoretical_loss": 3.959517765767366, + "tokens_seen": 450444288 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004361083249749248, + "loss": 2.9412, + "theoretical_loss": 3.9594546029286013, + "tokens_seen": 450509824 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 544864, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3638360500335693, + "objective/train/theoretical_loss": 3.9594072385172328, + "objective/train/tokens_used": 471018976, + "theoretical_loss": 3.9594072385172328, + "tokens_seen": 450558976 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043609829488465397, + "loss": 2.9778, + "theoretical_loss": 3.9593914518498234, + "tokens_seen": 450575360 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004360882647943832, + "loss": 3.1076, + "theoretical_loss": 3.9593283125271332, + "tokens_seen": 450640896 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043607823470411233, + "loss": 3.0533, + "theoretical_loss": 3.959265184956633, + "tokens_seen": 450706432 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043606820461384156, + "loss": 3.0076, + "theoretical_loss": 3.9592020691344265, + "tokens_seen": 450771968 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004360581745235707, + "loss": 3.2244, + "theoretical_loss": 3.9591389650566207, + "tokens_seen": 450837504 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004360481444332999, + "loss": 3.0192, + "theoretical_loss": 3.959075872719322, + "tokens_seen": 450903040 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004360381143430291, + "loss": 3.082, + "theoretical_loss": 3.9590127921186413, + "tokens_seen": 450968576 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004360280842527583, + "loss": 3.1096, + "theoretical_loss": 3.95894972325069, + "tokens_seen": 451034112 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043601805416248747, + "loss": 2.8023, + "theoretical_loss": 3.9588866661115816, + "tokens_seen": 451099648 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004360080240722167, + "loss": 3.124, + "theoretical_loss": 3.958823620697431, + "tokens_seen": 451165184 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043599799398194583, + "loss": 2.9392, + "theoretical_loss": 3.958760587004355, + "tokens_seen": 451230720 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043598796389167507, + "loss": 3.0512, + "theoretical_loss": 3.958697565028474, + "tokens_seen": 451296256 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004359779338014042, + "loss": 2.9383, + "theoretical_loss": 3.958634554765908, + "tokens_seen": 451361792 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043596790371113343, + "loss": 3.0894, + "theoretical_loss": 3.95857155621278, + "tokens_seen": 451427328 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004359578736208626, + "loss": 3.1975, + "theoretical_loss": 3.958508569365214, + "tokens_seen": 451492864 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004359478435305918, + "loss": 3.0222, + "theoretical_loss": 3.9584455942193366, + "tokens_seen": 451558400 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043593781344032097, + "loss": 2.9316, + "theoretical_loss": 3.9583826307712764, + "tokens_seen": 451623936 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043592778335005015, + "loss": 3.0144, + "theoretical_loss": 3.958319679017163, + "tokens_seen": 451689472 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043591775325977933, + "loss": 2.9815, + "theoretical_loss": 3.9582567389531285, + "tokens_seen": 451755008 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043590772316950857, + "loss": 2.9248, + "theoretical_loss": 3.9581938105753065, + "tokens_seen": 451820544 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004358976930792377, + "loss": 2.924, + "theoretical_loss": 3.958130893879833, + "tokens_seen": 451886080 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043588766298896693, + "loss": 2.9269, + "theoretical_loss": 3.9580679888628456, + "tokens_seen": 451951616 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004358776328986961, + "loss": 3.0172, + "theoretical_loss": 3.958005095520483, + "tokens_seen": 452017152 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004358676028084253, + "loss": 3.0345, + "theoretical_loss": 3.9579422138488862, + "tokens_seen": 452082688 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004358575727181545, + "loss": 3.2033, + "theoretical_loss": 3.9578793438441986, + "tokens_seen": 452148224 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 545628, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9914121627807617, + "objective/train/theoretical_loss": 3.9578321989947725, + "objective/train/tokens_used": 472657376, + "theoretical_loss": 3.9578321989947725, + "tokens_seen": 452197376 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043584754262788366, + "loss": 2.9125, + "theoretical_loss": 3.9578164855025646, + "tokens_seen": 452213760 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043583751253761284, + "loss": 2.7379, + "theoretical_loss": 3.9577536388201313, + "tokens_seen": 452279296 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043582748244734207, + "loss": 3.1926, + "theoretical_loss": 3.957690803793047, + "tokens_seen": 452344832 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004358174523570712, + "loss": 3.1275, + "theoretical_loss": 3.9576279804174614, + "tokens_seen": 452410368 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043580742226680043, + "loss": 3.0179, + "theoretical_loss": 3.957565168689528, + "tokens_seen": 452475904 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043579739217652956, + "loss": 3.0056, + "theoretical_loss": 3.9575023686053985, + "tokens_seen": 452541440 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004357873620862588, + "loss": 3.1728, + "theoretical_loss": 3.957439580161231, + "tokens_seen": 452606976 + }, + { + "epoch": 1.05, + "learning_rate": 0.000435777331995988, + "loss": 3.027, + "theoretical_loss": 3.9573768033531813, + "tokens_seen": 452672512 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043576730190571716, + "loss": 2.8647, + "theoretical_loss": 3.95731403817741, + "tokens_seen": 452738048 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043575727181544634, + "loss": 2.8473, + "theoretical_loss": 3.9572512846300776, + "tokens_seen": 452803584 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004357472417251755, + "loss": 2.6547, + "theoretical_loss": 3.9571885427073474, + "tokens_seen": 452869120 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004357372116349047, + "loss": 2.7046, + "theoretical_loss": 3.9571258124053843, + "tokens_seen": 452934656 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043572718154463394, + "loss": 2.9029, + "theoretical_loss": 3.957063093720355, + "tokens_seen": 453000192 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043571715145436306, + "loss": 3.0426, + "theoretical_loss": 3.957000386648428, + "tokens_seen": 453065728 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004357071213640923, + "loss": 2.8844, + "theoretical_loss": 3.9569376911857734, + "tokens_seen": 453131264 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004356970912738215, + "loss": 3.0411, + "theoretical_loss": 3.9568750073285637, + "tokens_seen": 453196800 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043568706118355066, + "loss": 2.8895, + "theoretical_loss": 3.9568123350729727, + "tokens_seen": 453262336 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043567703109327984, + "loss": 3.0225, + "theoretical_loss": 3.956749674415176, + "tokens_seen": 453327872 + }, + { + "epoch": 1.05, + "learning_rate": 0.000435667001003009, + "loss": 3.0718, + "theoretical_loss": 3.9566870253513513, + "tokens_seen": 453393408 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004356569709127382, + "loss": 3.0213, + "theoretical_loss": 3.956624387877678, + "tokens_seen": 453458944 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043564694082246744, + "loss": 3.0282, + "theoretical_loss": 3.956561761990338, + "tokens_seen": 453524480 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043563691073219657, + "loss": 2.9475, + "theoretical_loss": 3.9564991476855136, + "tokens_seen": 453590016 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004356268806419258, + "loss": 2.8315, + "theoretical_loss": 3.956436544959389, + "tokens_seen": 453655552 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043561685055165493, + "loss": 3.0489, + "theoretical_loss": 3.9563739538081517, + "tokens_seen": 453721088 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043560682046138417, + "loss": 2.9775, + "theoretical_loss": 3.9563113742279907, + "tokens_seen": 453786624 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 546923, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7837202548980713, + "objective/train/theoretical_loss": 3.956264447134096, + "objective/train/tokens_used": 474295776, + "theoretical_loss": 3.956264447134096, + "tokens_seen": 453835776 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043559679037111335, + "loss": 2.8202, + "theoretical_loss": 3.956248806215095, + "tokens_seen": 453852160 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043558676028084253, + "loss": 2.867, + "theoretical_loss": 3.9561862497656572, + "tokens_seen": 453917696 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004355767301905717, + "loss": 2.8822, + "theoretical_loss": 3.9561237048758713, + "tokens_seen": 453983232 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004355667001003009, + "loss": 2.8726, + "theoretical_loss": 3.9560611715419327, + "tokens_seen": 454048768 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043555667001003007, + "loss": 3.0744, + "theoretical_loss": 3.955998649760039, + "tokens_seen": 454114304 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004355466399197593, + "loss": 2.7415, + "theoretical_loss": 3.9559361395263895, + "tokens_seen": 454179840 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043553660982948843, + "loss": 2.862, + "theoretical_loss": 3.9558736408371855, + "tokens_seen": 454245376 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043552657973921767, + "loss": 2.7807, + "theoretical_loss": 3.9558111536886287, + "tokens_seen": 454310912 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043551654964894685, + "loss": 2.9495, + "theoretical_loss": 3.955748678076925, + "tokens_seen": 454376448 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043550651955867603, + "loss": 3.0138, + "theoretical_loss": 3.9556862139982805, + "tokens_seen": 454441984 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004354964894684052, + "loss": 2.9964, + "theoretical_loss": 3.955623761448903, + "tokens_seen": 454507520 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004354864593781344, + "loss": 2.9618, + "theoretical_loss": 3.955561320425004, + "tokens_seen": 454573056 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004354764292878636, + "loss": 2.9674, + "theoretical_loss": 3.9554988909227933, + "tokens_seen": 454638592 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004354663991975928, + "loss": 2.9339, + "theoretical_loss": 3.955436472938486, + "tokens_seen": 454704128 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043545636910732194, + "loss": 2.8827, + "theoretical_loss": 3.9553740664682966, + "tokens_seen": 454769664 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043544633901705117, + "loss": 2.8341, + "theoretical_loss": 3.9553116715084427, + "tokens_seen": 454835200 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004354363089267803, + "loss": 3.045, + "theoretical_loss": 3.955249288055143, + "tokens_seen": 454900736 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043542627883650953, + "loss": 3.152, + "theoretical_loss": 3.9551869161046187, + "tokens_seen": 454966272 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043541624874623877, + "loss": 2.8714, + "theoretical_loss": 3.955124555653092, + "tokens_seen": 455031808 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004354062186559679, + "loss": 2.9861, + "theoretical_loss": 3.9550622066967875, + "tokens_seen": 455097344 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043539618856569713, + "loss": 3.0678, + "theoretical_loss": 3.9549998692319317, + "tokens_seen": 455162880 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004353861584754263, + "loss": 3.1052, + "theoretical_loss": 3.9549375432547516, + "tokens_seen": 455228416 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004353761283851555, + "loss": 3.0939, + "theoretical_loss": 3.9548752287614772, + "tokens_seen": 455293952 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004353660982948847, + "loss": 3.2468, + "theoretical_loss": 3.9548129257483406, + "tokens_seen": 455359488 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043535606820461386, + "loss": 2.7976, + "theoretical_loss": 3.9547506342115737, + "tokens_seen": 455425024 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 547608, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1719532012939453, + "objective/train/theoretical_loss": 3.9547039230881023, + "objective/train/tokens_used": 475934176, + "theoretical_loss": 3.9547039230881023, + "tokens_seen": 455474176 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043534603811434304, + "loss": 3.0834, + "theoretical_loss": 3.954688354147413, + "tokens_seen": 455490560 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043533600802407227, + "loss": 2.8907, + "theoretical_loss": 3.9546260855520945, + "tokens_seen": 455556096 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004353259779338014, + "loss": 3.0901, + "theoretical_loss": 3.9545638284218567, + "tokens_seen": 455621632 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043531594784353063, + "loss": 2.9834, + "theoretical_loss": 3.9545015827529406, + "tokens_seen": 455687168 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043530591775325976, + "loss": 2.9154, + "theoretical_loss": 3.9544393485415874, + "tokens_seen": 455752704 + }, + { + "epoch": 1.05, + "learning_rate": 0.000435295887662989, + "loss": 3.0887, + "theoretical_loss": 3.954377125784042, + "tokens_seen": 455818240 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004352858575727182, + "loss": 2.8017, + "theoretical_loss": 3.954314914476549, + "tokens_seen": 455883776 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043527582748244736, + "loss": 2.9674, + "theoretical_loss": 3.9542527146153565, + "tokens_seen": 455949312 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043526579739217654, + "loss": 2.8448, + "theoretical_loss": 3.954190526196714, + "tokens_seen": 456014848 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004352557673019057, + "loss": 2.9596, + "theoretical_loss": 3.9541283492168717, + "tokens_seen": 456080384 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004352457372116349, + "loss": 3.2475, + "theoretical_loss": 3.954066183672083, + "tokens_seen": 456145920 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043523570712136414, + "loss": 2.9562, + "theoretical_loss": 3.9540040295586016, + "tokens_seen": 456211456 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043522567703109326, + "loss": 3.0325, + "theoretical_loss": 3.9539418868726846, + "tokens_seen": 456276992 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004352156469408225, + "loss": 2.9111, + "theoretical_loss": 3.95387975561059, + "tokens_seen": 456342528 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004352056168505517, + "loss": 3.1916, + "theoretical_loss": 3.9538176357685764, + "tokens_seen": 456408064 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043519558676028086, + "loss": 2.9947, + "theoretical_loss": 3.953755527342907, + "tokens_seen": 456473600 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043518555667001004, + "loss": 2.9115, + "theoretical_loss": 3.953693430329844, + "tokens_seen": 456539136 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004351755265797392, + "loss": 3.0481, + "theoretical_loss": 3.953631344725653, + "tokens_seen": 456604672 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004351654964894684, + "loss": 2.8957, + "theoretical_loss": 3.953569270526601, + "tokens_seen": 456670208 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043515546639919764, + "loss": 2.9168, + "theoretical_loss": 3.953507207728956, + "tokens_seen": 456735744 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043514543630892677, + "loss": 2.9412, + "theoretical_loss": 3.9534451563289883, + "tokens_seen": 456801280 + }, + { + "epoch": 1.05, + "learning_rate": 0.000435135406218656, + "loss": 2.9884, + "theoretical_loss": 3.9533831163229705, + "tokens_seen": 456866816 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043512537612838513, + "loss": 3.1086, + "theoretical_loss": 3.953321087707177, + "tokens_seen": 456932352 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043511534603811437, + "loss": 3.0378, + "theoretical_loss": 3.9532590704778823, + "tokens_seen": 456997888 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043510531594784355, + "loss": 3.1419, + "theoretical_loss": 3.9531970646313637, + "tokens_seen": 457063424 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 548950, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.220716714859009, + "objective/train/theoretical_loss": 3.9531505677141836, + "objective/train/tokens_used": 477572576, + "theoretical_loss": 3.9531505677141836, + "tokens_seen": 457112576 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043509528585757273, + "loss": 3.0836, + "theoretical_loss": 3.953135070163901, + "tokens_seen": 457128960 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004350852557673019, + "loss": 2.9593, + "theoretical_loss": 3.9530730870717745, + "tokens_seen": 457194496 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004350752256770311, + "loss": 3.2264, + "theoretical_loss": 3.9530111153512677, + "tokens_seen": 457260032 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043506519558676027, + "loss": 3.0847, + "theoretical_loss": 3.9529491549986644, + "tokens_seen": 457325568 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004350551654964895, + "loss": 2.9372, + "theoretical_loss": 3.9528872060102502, + "tokens_seen": 457391104 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043504513540621863, + "loss": 3.154, + "theoretical_loss": 3.952825268382314, + "tokens_seen": 457456640 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043503510531594787, + "loss": 3.0226, + "theoretical_loss": 3.9527633421111448, + "tokens_seen": 457522176 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043502507522567705, + "loss": 3.0597, + "theoretical_loss": 3.952701427193033, + "tokens_seen": 457587712 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043501504513540623, + "loss": 3.0086, + "theoretical_loss": 3.9526395236242737, + "tokens_seen": 457653248 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004350050150451354, + "loss": 3.2069, + "theoretical_loss": 3.9525776314011605, + "tokens_seen": 457718784 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004349949849548646, + "loss": 3.0413, + "theoretical_loss": 3.95251575051999, + "tokens_seen": 457784320 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004349849548645938, + "loss": 2.9921, + "theoretical_loss": 3.9524538809770604, + "tokens_seen": 457849856 + }, + { + "epoch": 1.05, + "learning_rate": 0.000434974924774323, + "loss": 3.0149, + "theoretical_loss": 3.952392022768672, + "tokens_seen": 457915392 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043496489468405214, + "loss": 2.9887, + "theoretical_loss": 3.952330175891127, + "tokens_seen": 457980928 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043495486459378137, + "loss": 2.9086, + "theoretical_loss": 3.952268340340728, + "tokens_seen": 458046464 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004349448345035105, + "loss": 3.1181, + "theoretical_loss": 3.952206516113781, + "tokens_seen": 458112000 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043493480441323973, + "loss": 3.0618, + "theoretical_loss": 3.952144703206592, + "tokens_seen": 458177536 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004349247743229689, + "loss": 2.952, + "theoretical_loss": 3.952082901615471, + "tokens_seen": 458243072 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004349147442326981, + "loss": 3.0284, + "theoretical_loss": 3.9520211113367276, + "tokens_seen": 458308608 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004349047141424273, + "loss": 3.1181, + "theoretical_loss": 3.9519593323666746, + "tokens_seen": 458374144 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004348946840521565, + "loss": 2.913, + "theoretical_loss": 3.951897564701625, + "tokens_seen": 458439680 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043488465396188564, + "loss": 2.6729, + "theoretical_loss": 3.951835808337895, + "tokens_seen": 458505216 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004348746238716149, + "loss": 3.3398, + "theoretical_loss": 3.9517740632718015, + "tokens_seen": 458570752 + }, + { + "epoch": 1.05, + "learning_rate": 0.000434864593781344, + "loss": 2.8925, + "theoretical_loss": 3.9517123294996646, + "tokens_seen": 458636288 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043485456369107324, + "loss": 2.9762, + "theoretical_loss": 3.951650607017804, + "tokens_seen": 458701824 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 550265, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.620121955871582, + "objective/train/theoretical_loss": 3.9516043225634405, + "objective/train/tokens_used": 479210976, + "theoretical_loss": 3.9516043225634405, + "tokens_seen": 458750976 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004348445336008024, + "loss": 3.1754, + "theoretical_loss": 3.9515888958225425, + "tokens_seen": 458767360 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004348345035105316, + "loss": 2.9674, + "theoretical_loss": 3.951527195910205, + "tokens_seen": 458832896 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004348244734202608, + "loss": 3.0026, + "theoretical_loss": 3.9514655072771165, + "tokens_seen": 458898432 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043481444332998996, + "loss": 2.9806, + "theoretical_loss": 3.951403829919606, + "tokens_seen": 458963968 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043480441323971914, + "loss": 3.0238, + "theoretical_loss": 3.951342163834001, + "tokens_seen": 459029504 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004347943831494484, + "loss": 2.9712, + "theoretical_loss": 3.9512805090166347, + "tokens_seen": 459095040 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004347843530591775, + "loss": 2.9981, + "theoretical_loss": 3.9512188654638387, + "tokens_seen": 459160576 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043477432296890674, + "loss": 2.9099, + "theoretical_loss": 3.951157233171948, + "tokens_seen": 459226112 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043476429287863587, + "loss": 3.0506, + "theoretical_loss": 3.9510956121372978, + "tokens_seen": 459291648 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004347542627883651, + "loss": 2.9484, + "theoretical_loss": 3.951034002356228, + "tokens_seen": 459357184 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004347442326980943, + "loss": 2.9222, + "theoretical_loss": 3.9509724038250775, + "tokens_seen": 459422720 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043473420260782346, + "loss": 3.1718, + "theoretical_loss": 3.950910816540187, + "tokens_seen": 459488256 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043472417251755265, + "loss": 3.1192, + "theoretical_loss": 3.9508492404979005, + "tokens_seen": 459553792 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004347141424272819, + "loss": 3.248, + "theoretical_loss": 3.950787675694562, + "tokens_seen": 459619328 + }, + { + "epoch": 1.05, + "learning_rate": 0.000434704112337011, + "loss": 2.9839, + "theoretical_loss": 3.950726122126519, + "tokens_seen": 459684864 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043469408224674024, + "loss": 3.1868, + "theoretical_loss": 3.9506645797901196, + "tokens_seen": 459750400 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004346840521564694, + "loss": 2.8167, + "theoretical_loss": 3.950603048681714, + "tokens_seen": 459815936 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004346740220661986, + "loss": 3.07, + "theoretical_loss": 3.950541528797652, + "tokens_seen": 459881472 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043466399197592784, + "loss": 2.9742, + "theoretical_loss": 3.9504800201342896, + "tokens_seen": 459947008 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043465396188565697, + "loss": 3.0656, + "theoretical_loss": 3.9504185226879804, + "tokens_seen": 460012544 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004346439317953862, + "loss": 3.0949, + "theoretical_loss": 3.950357036455081, + "tokens_seen": 460078080 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043463390170511533, + "loss": 2.9103, + "theoretical_loss": 3.950295561431951, + "tokens_seen": 460143616 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043462387161484457, + "loss": 3.0709, + "theoretical_loss": 3.95023409761495, + "tokens_seen": 460209152 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043461384152457375, + "loss": 2.99, + "theoretical_loss": 3.950172645000439, + "tokens_seen": 460274688 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043460381143430293, + "loss": 3.1795, + "theoretical_loss": 3.9501112035847834, + "tokens_seen": 460340224 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 550910, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.578948736190796, + "objective/train/theoretical_loss": 3.950065129870103, + "objective/train/tokens_used": 480849376, + "theoretical_loss": 3.950065129870103, + "tokens_seen": 460389376 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004345937813440321, + "loss": 2.9974, + "theoretical_loss": 3.950049773364347, + "tokens_seen": 460405760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004345837512537613, + "loss": 3.1578, + "theoretical_loss": 3.9499883543354977, + "tokens_seen": 460471296 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043457372116349047, + "loss": 3.0666, + "theoretical_loss": 3.9499269464946036, + "tokens_seen": 460536832 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004345636910732197, + "loss": 3.0071, + "theoretical_loss": 3.949865549838035, + "tokens_seen": 460602368 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043455366098294883, + "loss": 3.25, + "theoretical_loss": 3.9498041643621646, + "tokens_seen": 460667904 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043454363089267807, + "loss": 3.1709, + "theoretical_loss": 3.949742790063366, + "tokens_seen": 460733440 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043453360080240725, + "loss": 2.9825, + "theoretical_loss": 3.9496814269380143, + "tokens_seen": 460798976 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043452357071213643, + "loss": 2.7215, + "theoretical_loss": 3.9496200749824864, + "tokens_seen": 460864512 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004345135406218656, + "loss": 3.2174, + "theoretical_loss": 3.949558734193162, + "tokens_seen": 460930048 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004345035105315948, + "loss": 3.0671, + "theoretical_loss": 3.9494974045664213, + "tokens_seen": 460995584 + }, + { + "epoch": 1.05, + "learning_rate": 0.000434493480441324, + "loss": 2.8494, + "theoretical_loss": 3.949436086098646, + "tokens_seen": 461061120 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004344834503510532, + "loss": 3.0576, + "theoretical_loss": 3.9493747787862206, + "tokens_seen": 461126656 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043447342026078234, + "loss": 2.9423, + "theoretical_loss": 3.9493134826255303, + "tokens_seen": 461192192 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043446339017051157, + "loss": 3.0723, + "theoretical_loss": 3.9492521976129624, + "tokens_seen": 461257728 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004344533600802407, + "loss": 2.9127, + "theoretical_loss": 3.9491909237449065, + "tokens_seen": 461323264 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043444332998996993, + "loss": 2.8746, + "theoretical_loss": 3.9491296610177526, + "tokens_seen": 461388800 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004344332998996991, + "loss": 3.0022, + "theoretical_loss": 3.949068409427893, + "tokens_seen": 461454336 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004344232698094283, + "loss": 2.8165, + "theoretical_loss": 3.949007168971722, + "tokens_seen": 461519872 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004344132397191575, + "loss": 3.2636, + "theoretical_loss": 3.9489459396456343, + "tokens_seen": 461585408 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004344032096288867, + "loss": 3.1145, + "theoretical_loss": 3.9488847214460288, + "tokens_seen": 461650944 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043439317953861584, + "loss": 2.9202, + "theoretical_loss": 3.9488235143693036, + "tokens_seen": 461716480 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004343831494483451, + "loss": 2.8524, + "theoretical_loss": 3.94876231841186, + "tokens_seen": 461782016 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004343731193580742, + "loss": 3.0581, + "theoretical_loss": 3.948701133570099, + "tokens_seen": 461847552 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043436308926780344, + "loss": 3.0472, + "theoretical_loss": 3.9486399598404263, + "tokens_seen": 461913088 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004343530591775326, + "loss": 2.8361, + "theoretical_loss": 3.948578797219247, + "tokens_seen": 461978624 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 552255, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0387556552886963, + "objective/train/theoretical_loss": 3.9485329325411502, + "objective/train/tokens_used": 482487776, + "theoretical_loss": 3.9485329325411502, + "tokens_seen": 462027776 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004343430290872618, + "loss": 2.9454, + "theoretical_loss": 3.9485176457029683, + "tokens_seen": 462044160 + }, + { + "epoch": 1.05, + "learning_rate": 0.000434332998996991, + "loss": 2.9201, + "theoretical_loss": 3.948456505287999, + "tokens_seen": 462109696 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043432296890672016, + "loss": 2.7927, + "theoretical_loss": 3.9483953759707515, + "tokens_seen": 462175232 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043431293881644934, + "loss": 3.0389, + "theoretical_loss": 3.948334257747636, + "tokens_seen": 462240768 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004343029087261786, + "loss": 3.0607, + "theoretical_loss": 3.948273150615068, + "tokens_seen": 462306304 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004342928786359077, + "loss": 2.8376, + "theoretical_loss": 3.9482120545694626, + "tokens_seen": 462371840 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043428284854563694, + "loss": 3.1191, + "theoretical_loss": 3.948150969607237, + "tokens_seen": 462437376 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043427281845536607, + "loss": 3.0403, + "theoretical_loss": 3.9480898957248116, + "tokens_seen": 462502912 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004342627883650953, + "loss": 3.2992, + "theoretical_loss": 3.948028832918606, + "tokens_seen": 462568448 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004342527582748245, + "loss": 2.7688, + "theoretical_loss": 3.9479677811850427, + "tokens_seen": 462633984 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043424272818455367, + "loss": 2.912, + "theoretical_loss": 3.947906740520546, + "tokens_seen": 462699520 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043423269809428285, + "loss": 2.9298, + "theoretical_loss": 3.9478457109215412, + "tokens_seen": 462765056 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004342226680040121, + "loss": 2.8303, + "theoretical_loss": 3.947784692384457, + "tokens_seen": 462830592 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004342126379137412, + "loss": 3.0642, + "theoretical_loss": 3.9477236849057205, + "tokens_seen": 462896128 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043420260782347044, + "loss": 3.0432, + "theoretical_loss": 3.9476626884817634, + "tokens_seen": 462961664 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043419257773319957, + "loss": 2.8215, + "theoretical_loss": 3.9476017031090187, + "tokens_seen": 463027200 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004341825476429288, + "loss": 2.9692, + "theoretical_loss": 3.9475407287839195, + "tokens_seen": 463092736 + }, + { + "epoch": 1.05, + "learning_rate": 0.000434172517552658, + "loss": 3.0146, + "theoretical_loss": 3.947479765502902, + "tokens_seen": 463158272 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043416248746238717, + "loss": 3.0465, + "theoretical_loss": 3.947418813262403, + "tokens_seen": 463223808 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043415245737211635, + "loss": 3.0851, + "theoretical_loss": 3.947357872058862, + "tokens_seen": 463289344 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043414242728184553, + "loss": 2.7837, + "theoretical_loss": 3.9472969418887196, + "tokens_seen": 463354880 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004341323971915747, + "loss": 3.2644, + "theoretical_loss": 3.9472360227484176, + "tokens_seen": 463420416 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043412236710130395, + "loss": 3.228, + "theoretical_loss": 3.9471751146344003, + "tokens_seen": 463485952 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004341123370110331, + "loss": 2.8354, + "theoretical_loss": 3.947114217543113, + "tokens_seen": 463551488 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004341023069207623, + "loss": 2.8766, + "theoretical_loss": 3.947053331471003, + "tokens_seen": 463617024 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 552845, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9544570446014404, + "objective/train/theoretical_loss": 3.94700767414612, + "objective/train/tokens_used": 484126176, + "theoretical_loss": 3.94700767414612, + "tokens_seen": 463666176 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043409227683049144, + "loss": 3.0834, + "theoretical_loss": 3.9469924564145202, + "tokens_seen": 463682560 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043408224674022067, + "loss": 2.8789, + "theoretical_loss": 3.9469315923701136, + "tokens_seen": 463748096 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043407221664994985, + "loss": 2.9113, + "theoretical_loss": 3.946870739334236, + "tokens_seen": 463813632 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043406218655967903, + "loss": 2.9886, + "theoretical_loss": 3.946809897303342, + "tokens_seen": 463879168 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004340521564694082, + "loss": 2.9726, + "theoretical_loss": 3.9467490662738856, + "tokens_seen": 463944704 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043404212637913745, + "loss": 2.9747, + "theoretical_loss": 3.9466882462423243, + "tokens_seen": 464010240 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004340320962888666, + "loss": 3.2831, + "theoretical_loss": 3.9466274372051178, + "tokens_seen": 464075776 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004340220661985958, + "loss": 2.8867, + "theoretical_loss": 3.9465666391587257, + "tokens_seen": 464141312 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043401203610832494, + "loss": 3.0543, + "theoretical_loss": 3.94650585209961, + "tokens_seen": 464206848 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004340020060180542, + "loss": 2.9371, + "theoretical_loss": 3.9464450760242347, + "tokens_seen": 464272384 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043399197592778336, + "loss": 3.0978, + "theoretical_loss": 3.946384310929065, + "tokens_seen": 464337920 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043398194583751254, + "loss": 2.9277, + "theoretical_loss": 3.9463235568105675, + "tokens_seen": 464403456 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004339719157472417, + "loss": 3.1595, + "theoretical_loss": 3.946262813665211, + "tokens_seen": 464468992 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004339618856569709, + "loss": 2.8856, + "theoretical_loss": 3.9462020814894663, + "tokens_seen": 464534528 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004339518555667001, + "loss": 2.9228, + "theoretical_loss": 3.946141360279804, + "tokens_seen": 464600064 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004339418254764293, + "loss": 2.8936, + "theoretical_loss": 3.9460806500326986, + "tokens_seen": 464665600 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004339317953861585, + "loss": 3.135, + "theoretical_loss": 3.9460199507446245, + "tokens_seen": 464731136 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004339217652958877, + "loss": 3.0263, + "theoretical_loss": 3.9459592624120594, + "tokens_seen": 464796672 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004339117352056169, + "loss": 3.0817, + "theoretical_loss": 3.94589858503148, + "tokens_seen": 464862208 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043390170511534604, + "loss": 2.9917, + "theoretical_loss": 3.945837918599368, + "tokens_seen": 464927744 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004338916750250753, + "loss": 3.0115, + "theoretical_loss": 3.9457772631122037, + "tokens_seen": 464993280 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004338816449348044, + "loss": 2.9553, + "theoretical_loss": 3.945716618566472, + "tokens_seen": 465058816 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043387161484453364, + "loss": 2.8438, + "theoretical_loss": 3.945655984958656, + "tokens_seen": 465124352 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004338615847542628, + "loss": 2.9346, + "theoretical_loss": 3.9455953622852427, + "tokens_seen": 465189888 + }, + { + "epoch": 1.05, + "learning_rate": 0.000433851554663992, + "loss": 2.9295, + "theoretical_loss": 3.9455347505427207, + "tokens_seen": 465255424 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 553513, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9996495246887207, + "objective/train/theoretical_loss": 3.945489298907115, + "objective/train/tokens_used": 485764576, + "theoretical_loss": 3.945489298907115, + "tokens_seen": 465304576 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004338415245737212, + "loss": 2.9936, + "theoretical_loss": 3.9454741497275796, + "tokens_seen": 465320960 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043383149448345036, + "loss": 2.875, + "theoretical_loss": 3.945413559836311, + "tokens_seen": 465386496 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043382146439317954, + "loss": 3.1446, + "theoretical_loss": 3.9453529808654064, + "tokens_seen": 465452032 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004338114343029088, + "loss": 3.0625, + "theoretical_loss": 3.9452924128113622, + "tokens_seen": 465517568 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004338014042126379, + "loss": 3.0735, + "theoretical_loss": 3.945231855670674, + "tokens_seen": 465583104 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043379137412236714, + "loss": 3.1854, + "theoretical_loss": 3.9451713094398393, + "tokens_seen": 465648640 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043378134403209627, + "loss": 3.1381, + "theoretical_loss": 3.9451107741153577, + "tokens_seen": 465714176 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004337713139418255, + "loss": 3.0158, + "theoretical_loss": 3.9450502496937307, + "tokens_seen": 465779712 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004337612838515547, + "loss": 2.8256, + "theoretical_loss": 3.9449897361714603, + "tokens_seen": 465845248 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043375125376128387, + "loss": 2.8059, + "theoretical_loss": 3.944929233545051, + "tokens_seen": 465910784 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043374122367101305, + "loss": 2.9526, + "theoretical_loss": 3.9448687418110095, + "tokens_seen": 465976320 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004337311935807423, + "loss": 3.1095, + "theoretical_loss": 3.944808260965842, + "tokens_seen": 466041856 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004337211634904714, + "loss": 3.2617, + "theoretical_loss": 3.9447477910060584, + "tokens_seen": 466107392 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043371113340020064, + "loss": 3.0492, + "theoretical_loss": 3.94468733192817, + "tokens_seen": 466172928 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043370110330992977, + "loss": 3.0652, + "theoretical_loss": 3.944626883728688, + "tokens_seen": 466238464 + }, + { + "epoch": 1.05, + "learning_rate": 0.000433691073219659, + "loss": 3.1735, + "theoretical_loss": 3.9445664464041266, + "tokens_seen": 466304000 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004336810431293882, + "loss": 2.9657, + "theoretical_loss": 3.944506019951002, + "tokens_seen": 466369536 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043367101303911737, + "loss": 2.8674, + "theoretical_loss": 3.9444456043658302, + "tokens_seen": 466435072 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043366098294884655, + "loss": 3.032, + "theoretical_loss": 3.944385199645132, + "tokens_seen": 466500608 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043365095285857573, + "loss": 2.8132, + "theoretical_loss": 3.9443248057854254, + "tokens_seen": 466566144 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004336409227683049, + "loss": 2.8774, + "theoretical_loss": 3.9442644227832337, + "tokens_seen": 466631680 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043363089267803415, + "loss": 2.8377, + "theoretical_loss": 3.9442040506350806, + "tokens_seen": 466697216 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004336208625877633, + "loss": 3.2088, + "theoretical_loss": 3.9441436893374906, + "tokens_seen": 466762752 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004336108324974925, + "loss": 2.8835, + "theoretical_loss": 3.9440833388869914, + "tokens_seen": 466828288 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043360080240722164, + "loss": 2.9217, + "theoretical_loss": 3.9440229992801106, + "tokens_seen": 466893824 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 555014, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.009627103805542, + "objective/train/theoretical_loss": 3.943977751688987, + "objective/train/tokens_used": 487402976, + "theoretical_loss": 3.943977751688987, + "tokens_seen": 466942976 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043359077231695087, + "loss": 2.8843, + "theoretical_loss": 3.943962670513378, + "tokens_seen": 466959360 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043358074222668005, + "loss": 2.7812, + "theoretical_loss": 3.9439023525833257, + "tokens_seen": 467024896 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043357071213640923, + "loss": 3.0704, + "theoretical_loss": 3.9438420454864875, + "tokens_seen": 467090432 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004335606820461384, + "loss": 3.2732, + "theoretical_loss": 3.9437817492193963, + "tokens_seen": 467155968 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043355065195586765, + "loss": 2.9273, + "theoretical_loss": 3.9437214637785902, + "tokens_seen": 467221504 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004335406218655968, + "loss": 3.0148, + "theoretical_loss": 3.943661189160607, + "tokens_seen": 467287040 + }, + { + "epoch": 1.05, + "learning_rate": 0.000433530591775326, + "loss": 3.1349, + "theoretical_loss": 3.9436009253619853, + "tokens_seen": 467352576 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043352056168505514, + "loss": 2.9502, + "theoretical_loss": 3.9435406723792665, + "tokens_seen": 467418112 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004335105315947844, + "loss": 2.8156, + "theoretical_loss": 3.9434804302089934, + "tokens_seen": 467483648 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043350050150451356, + "loss": 2.8378, + "theoretical_loss": 3.9434201988477113, + "tokens_seen": 467549184 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043349047141424274, + "loss": 3.1404, + "theoretical_loss": 3.943359978291965, + "tokens_seen": 467614720 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004334804413239719, + "loss": 2.8589, + "theoretical_loss": 3.943299768538302, + "tokens_seen": 467680256 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004334704112337011, + "loss": 3.0278, + "theoretical_loss": 3.943239569583272, + "tokens_seen": 467745792 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004334603811434303, + "loss": 3.1748, + "theoretical_loss": 3.943179381423425, + "tokens_seen": 467811328 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004334503510531595, + "loss": 2.818, + "theoretical_loss": 3.9431192040553142, + "tokens_seen": 467876864 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043344032096288864, + "loss": 2.8783, + "theoretical_loss": 3.9430590374754924, + "tokens_seen": 467942400 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004334302908726179, + "loss": 3.0593, + "theoretical_loss": 3.9429988816805155, + "tokens_seen": 468007936 + }, + { + "epoch": 1.05, + "learning_rate": 0.000433420260782347, + "loss": 2.9382, + "theoretical_loss": 3.9429387366669406, + "tokens_seen": 468073472 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043341023069207624, + "loss": 3.0675, + "theoretical_loss": 3.9428786024313256, + "tokens_seen": 468139008 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004334002006018054, + "loss": 2.8618, + "theoretical_loss": 3.9428184789702323, + "tokens_seen": 468204544 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004333901705115346, + "loss": 2.9609, + "theoretical_loss": 3.9427583662802204, + "tokens_seen": 468270080 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004333801404212638, + "loss": 2.8331, + "theoretical_loss": 3.9426982643578548, + "tokens_seen": 468335616 + }, + { + "epoch": 1.05, + "learning_rate": 0.000433370110330993, + "loss": 2.8806, + "theoretical_loss": 3.9426381731996996, + "tokens_seen": 468401152 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043336008024072215, + "loss": 2.8969, + "theoretical_loss": 3.9425780928023215, + "tokens_seen": 468466688 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004333500501504514, + "loss": 2.82, + "theoretical_loss": 3.9425180231622883, + "tokens_seen": 468532224 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 555695, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4344749450683594, + "objective/train/theoretical_loss": 3.9424729779897074, + "objective/train/tokens_used": 489041376, + "theoretical_loss": 3.9424729779897074, + "tokens_seen": 468581376 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004333400200601805, + "loss": 2.8374, + "theoretical_loss": 3.94245796427617, + "tokens_seen": 468597760 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043332998996990974, + "loss": 2.963, + "theoretical_loss": 3.942397916140538, + "tokens_seen": 468663296 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004333199598796389, + "loss": 2.9556, + "theoretical_loss": 3.942337878751964, + "tokens_seen": 468728832 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004333099297893681, + "loss": 2.7379, + "theoretical_loss": 3.942277852107024, + "tokens_seen": 468794368 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004332998996990973, + "loss": 2.9639, + "theoretical_loss": 3.9422178362022917, + "tokens_seen": 468859904 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043328986960882647, + "loss": 3.1148, + "theoretical_loss": 3.9421578310343466, + "tokens_seen": 468925440 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043327983951855565, + "loss": 3.0438, + "theoretical_loss": 3.9420978365997668, + "tokens_seen": 468990976 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004332698094282849, + "loss": 2.9268, + "theoretical_loss": 3.942037852895133, + "tokens_seen": 469056512 + }, + { + "epoch": 1.05, + "learning_rate": 0.000433259779338014, + "loss": 2.9954, + "theoretical_loss": 3.9419778799170286, + "tokens_seen": 469122048 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043324974924774325, + "loss": 2.9419, + "theoretical_loss": 3.941917917662035, + "tokens_seen": 469187584 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043323971915747243, + "loss": 3.0639, + "theoretical_loss": 3.941857966126739, + "tokens_seen": 469253120 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004332296890672016, + "loss": 2.9824, + "theoretical_loss": 3.941798025307728, + "tokens_seen": 469318656 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004332196589769308, + "loss": 2.8335, + "theoretical_loss": 3.941738095201589, + "tokens_seen": 469384192 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043320962888665997, + "loss": 2.9499, + "theoretical_loss": 3.941678175804913, + "tokens_seen": 469449728 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043319959879638915, + "loss": 2.9987, + "theoretical_loss": 3.9416182671142908, + "tokens_seen": 469515264 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004331895687061184, + "loss": 3.0375, + "theoretical_loss": 3.9415583691263167, + "tokens_seen": 469580800 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043317953861584757, + "loss": 2.9472, + "theoretical_loss": 3.9414984818375842, + "tokens_seen": 469646336 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043316950852557675, + "loss": 3.2323, + "theoretical_loss": 3.9414386052446906, + "tokens_seen": 469711872 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043315947843530593, + "loss": 3.0547, + "theoretical_loss": 3.9413787393442328, + "tokens_seen": 469777408 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004331494483450351, + "loss": 2.8593, + "theoretical_loss": 3.9413188841328104, + "tokens_seen": 469842944 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043313941825476435, + "loss": 2.9321, + "theoretical_loss": 3.941259039607025, + "tokens_seen": 469908480 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004331293881644935, + "loss": 2.831, + "theoretical_loss": 3.9411992057634775, + "tokens_seen": 469974016 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004331193580742227, + "loss": 3.0224, + "theoretical_loss": 3.9411393825987737, + "tokens_seen": 470039552 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043310932798395184, + "loss": 2.9007, + "theoretical_loss": 3.941079570109518, + "tokens_seen": 470105088 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043309929789368107, + "loss": 2.9678, + "theoretical_loss": 3.9410197682923185, + "tokens_seen": 470170624 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 556746, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.731928586959839, + "objective/train/theoretical_loss": 3.9409749239309146, + "objective/train/tokens_used": 490679776, + "theoretical_loss": 3.9409749239309146, + "tokens_seen": 470219776 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043308926780341025, + "loss": 2.95, + "theoretical_loss": 3.940959977143783, + "tokens_seen": 470236160 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043307923771313943, + "loss": 2.7916, + "theoretical_loss": 3.9409001966605217, + "tokens_seen": 470301696 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004330692076228686, + "loss": 3.0253, + "theoretical_loss": 3.9408404268391473, + "tokens_seen": 470367232 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043305917753259785, + "loss": 2.9868, + "theoretical_loss": 3.940780667676272, + "tokens_seen": 470432768 + }, + { + "epoch": 1.05, + "learning_rate": 0.000433049147442327, + "loss": 3.1419, + "theoretical_loss": 3.9407209191685117, + "tokens_seen": 470498304 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004330391173520562, + "loss": 2.9542, + "theoretical_loss": 3.940661181312482, + "tokens_seen": 470563840 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043302908726178534, + "loss": 3.0518, + "theoretical_loss": 3.9406014541048013, + "tokens_seen": 470629376 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004330190571715146, + "loss": 2.8218, + "theoretical_loss": 3.9405417375420893, + "tokens_seen": 470694912 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043300902708124376, + "loss": 3.094, + "theoretical_loss": 3.9404820316209666, + "tokens_seen": 470760448 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043299899699097294, + "loss": 2.9356, + "theoretical_loss": 3.940422336338056, + "tokens_seen": 470825984 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004329889669007021, + "loss": 3.0012, + "theoretical_loss": 3.940362651689982, + "tokens_seen": 470891520 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004329789368104313, + "loss": 3.1362, + "theoretical_loss": 3.940302977673369, + "tokens_seen": 470957056 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004329689067201605, + "loss": 3.0018, + "theoretical_loss": 3.940243314284846, + "tokens_seen": 471022592 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004329588766298897, + "loss": 2.8662, + "theoretical_loss": 3.940183661521041, + "tokens_seen": 471088128 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043294884653961884, + "loss": 3.1666, + "theoretical_loss": 3.940124019378583, + "tokens_seen": 471153664 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004329388164493481, + "loss": 3.1978, + "theoretical_loss": 3.940064387854106, + "tokens_seen": 471219200 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004329287863590772, + "loss": 2.7946, + "theoretical_loss": 3.9400047669442424, + "tokens_seen": 471284736 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043291875626880644, + "loss": 2.9157, + "theoretical_loss": 3.939945156645627, + "tokens_seen": 471350272 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004329087261785356, + "loss": 2.9845, + "theoretical_loss": 3.9398855569548963, + "tokens_seen": 471415808 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004328986960882648, + "loss": 2.9426, + "theoretical_loss": 3.939825967868688, + "tokens_seen": 471481344 + }, + { + "epoch": 1.05, + "learning_rate": 0.000432888665997994, + "loss": 2.7147, + "theoretical_loss": 3.9397663893836428, + "tokens_seen": 471546880 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004328786359077232, + "loss": 3.2359, + "theoretical_loss": 3.9397068214964, + "tokens_seen": 471612416 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043286860581745235, + "loss": 3.0412, + "theoretical_loss": 3.939647264203604, + "tokens_seen": 471677952 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004328585757271816, + "loss": 2.9235, + "theoretical_loss": 3.9395877175018965, + "tokens_seen": 471743488 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004328485456369107, + "loss": 3.1163, + "theoretical_loss": 3.9395281813879257, + "tokens_seen": 471809024 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 557491, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.751948595046997, + "objective/train/theoretical_loss": 3.939483536248632, + "objective/train/tokens_used": 492318176, + "theoretical_loss": 3.939483536248632, + "tokens_seen": 471858176 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043283851554663994, + "loss": 2.8767, + "theoretical_loss": 3.9394686558583376, + "tokens_seen": 471874560 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004328284854563691, + "loss": 2.8283, + "theoretical_loss": 3.9394091409097807, + "tokens_seen": 471940096 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004328184553660983, + "loss": 2.7199, + "theoretical_loss": 3.939349636538905, + "tokens_seen": 472005632 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004328084252758275, + "loss": 3.1735, + "theoretical_loss": 3.9392901427423634, + "tokens_seen": 472071168 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043279839518555667, + "loss": 2.9966, + "theoretical_loss": 3.939230659516808, + "tokens_seen": 472136704 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043278836509528585, + "loss": 2.6941, + "theoretical_loss": 3.9391711868588946, + "tokens_seen": 472202240 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004327783350050151, + "loss": 3.0288, + "theoretical_loss": 3.9391117247652785, + "tokens_seen": 472267776 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004327683049147442, + "loss": 3.1079, + "theoretical_loss": 3.939052273232618, + "tokens_seen": 472333312 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043275827482447345, + "loss": 3.0203, + "theoretical_loss": 3.938992832257572, + "tokens_seen": 472398848 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043274824473420263, + "loss": 2.8275, + "theoretical_loss": 3.9389334018368025, + "tokens_seen": 472464384 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004327382146439318, + "loss": 3.2009, + "theoretical_loss": 3.938873981966971, + "tokens_seen": 472529920 + }, + { + "epoch": 1.05, + "learning_rate": 0.000432728184553661, + "loss": 2.8613, + "theoretical_loss": 3.938814572644741, + "tokens_seen": 472595456 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043271815446339017, + "loss": 2.9294, + "theoretical_loss": 3.938755173866779, + "tokens_seen": 472660992 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043270812437311935, + "loss": 3.0615, + "theoretical_loss": 3.9386957856297515, + "tokens_seen": 472726528 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004326980942828486, + "loss": 3.0866, + "theoretical_loss": 3.9386364079303267, + "tokens_seen": 472792064 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004326880641925777, + "loss": 2.9678, + "theoretical_loss": 3.938577040765175, + "tokens_seen": 472857600 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043267803410230695, + "loss": 3.0434, + "theoretical_loss": 3.9385176841309675, + "tokens_seen": 472923136 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004326680040120361, + "loss": 3.0813, + "theoretical_loss": 3.938458338024377, + "tokens_seen": 472988672 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004326579739217653, + "loss": 3.0351, + "theoretical_loss": 3.9383990024420794, + "tokens_seen": 473054208 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004326479438314945, + "loss": 3.0305, + "theoretical_loss": 3.938339677380749, + "tokens_seen": 473119744 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004326379137412237, + "loss": 3.0702, + "theoretical_loss": 3.938280362837064, + "tokens_seen": 473185280 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043262788365095286, + "loss": 2.8885, + "theoretical_loss": 3.938221058807704, + "tokens_seen": 473250816 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043261785356068204, + "loss": 3.0995, + "theoretical_loss": 3.9381617652893484, + "tokens_seen": 473316352 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004326078234704112, + "loss": 3.1313, + "theoretical_loss": 3.93810248227868, + "tokens_seen": 473381888 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043259779338014045, + "loss": 2.9592, + "theoretical_loss": 3.938043209772382, + "tokens_seen": 473447424 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 558752, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.13558030128479, + "objective/train/theoretical_loss": 3.937998762284158, + "objective/train/tokens_used": 493956576, + "theoretical_loss": 3.937998762284158, + "tokens_seen": 473496576 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004325877632898696, + "loss": 2.9736, + "theoretical_loss": 3.9379839477671403, + "tokens_seen": 473512960 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004325777331995988, + "loss": 3.0554, + "theoretical_loss": 3.937924696259641, + "tokens_seen": 473578496 + }, + { + "epoch": 1.05, + "learning_rate": 0.000432567703109328, + "loss": 3.0045, + "theoretical_loss": 3.9378654552465715, + "tokens_seen": 473644032 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004325576730190572, + "loss": 3.0406, + "theoretical_loss": 3.937806224724622, + "tokens_seen": 473709568 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043254764292878636, + "loss": 2.9355, + "theoretical_loss": 3.9377470046904834, + "tokens_seen": 473775104 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043253761283851554, + "loss": 2.8813, + "theoretical_loss": 3.9376877951408487, + "tokens_seen": 473840640 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004325275827482447, + "loss": 2.823, + "theoretical_loss": 3.937628596072412, + "tokens_seen": 473906176 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043251755265797396, + "loss": 2.9766, + "theoretical_loss": 3.9375694074818677, + "tokens_seen": 473971712 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004325075225677031, + "loss": 2.8709, + "theoretical_loss": 3.9375102293659143, + "tokens_seen": 474037248 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004324974924774323, + "loss": 3.0939, + "theoretical_loss": 3.9374510617212497, + "tokens_seen": 474102784 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043248746238716145, + "loss": 3.0702, + "theoretical_loss": 3.937391904544574, + "tokens_seen": 474168320 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004324774322968907, + "loss": 3.1023, + "theoretical_loss": 3.937332757832589, + "tokens_seen": 474233856 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043246740220661986, + "loss": 2.914, + "theoretical_loss": 3.9372736215819977, + "tokens_seen": 474299392 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043245737211634904, + "loss": 2.7269, + "theoretical_loss": 3.9372144957895046, + "tokens_seen": 474364928 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004324473420260782, + "loss": 3.1261, + "theoretical_loss": 3.9371553804518156, + "tokens_seen": 474430464 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004324373119358074, + "loss": 3.0787, + "theoretical_loss": 3.9370962755656382, + "tokens_seen": 474496000 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043242728184553664, + "loss": 3.2062, + "theoretical_loss": 3.937037181127682, + "tokens_seen": 474561536 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004324172517552658, + "loss": 3.1947, + "theoretical_loss": 3.936978097134657, + "tokens_seen": 474627072 + }, + { + "epoch": 1.05, + "learning_rate": 0.000432407221664995, + "loss": 3.0234, + "theoretical_loss": 3.9369190235832754, + "tokens_seen": 474692608 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004323971915747242, + "loss": 2.9252, + "theoretical_loss": 3.936859960470251, + "tokens_seen": 474758144 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004323871614844534, + "loss": 2.8916, + "theoretical_loss": 3.936800907792298, + "tokens_seen": 474823680 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043237713139418255, + "loss": 3.1763, + "theoretical_loss": 3.9367418655461335, + "tokens_seen": 474889216 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004323671013039118, + "loss": 3.1412, + "theoretical_loss": 3.936682833728476, + "tokens_seen": 474954752 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004323570712136409, + "loss": 3.0111, + "theoretical_loss": 3.936623812336043, + "tokens_seen": 475020288 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043234704112337014, + "loss": 3.1496, + "theoretical_loss": 3.9365648013655576, + "tokens_seen": 475085824 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 559487, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8737614154815674, + "objective/train/theoretical_loss": 3.936520549975124, + "objective/train/tokens_used": 495594976, + "theoretical_loss": 3.936520549975124, + "tokens_seen": 475134976 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004323370110330993, + "loss": 3.1738, + "theoretical_loss": 3.936505800813741, + "tokens_seen": 475151360 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004323269809428285, + "loss": 2.7831, + "theoretical_loss": 3.9364468106773174, + "tokens_seen": 475216896 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004323169508525577, + "loss": 2.8794, + "theoretical_loss": 3.936387830953013, + "tokens_seen": 475282432 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043230692076228687, + "loss": 3.0701, + "theoretical_loss": 3.936328861637553, + "tokens_seen": 475347968 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043229689067201605, + "loss": 2.9024, + "theoretical_loss": 3.936269902727667, + "tokens_seen": 475413504 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004322868605817453, + "loss": 3.3167, + "theoretical_loss": 3.9362109542200843, + "tokens_seen": 475479040 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004322768304914744, + "loss": 3.2355, + "theoretical_loss": 3.9361520161115364, + "tokens_seen": 475544576 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043226680040120365, + "loss": 3.1841, + "theoretical_loss": 3.9360930883987564, + "tokens_seen": 475610112 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043225677031093283, + "loss": 3.0284, + "theoretical_loss": 3.936034171078478, + "tokens_seen": 475675648 + }, + { + "epoch": 1.05, + "learning_rate": 0.000432246740220662, + "loss": 3.0582, + "theoretical_loss": 3.9359752641474373, + "tokens_seen": 475741184 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004322367101303912, + "loss": 3.0325, + "theoretical_loss": 3.9359163676023705, + "tokens_seen": 475806720 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043222668004012037, + "loss": 3.1134, + "theoretical_loss": 3.935857481440018, + "tokens_seen": 475872256 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043221664994984955, + "loss": 3.2472, + "theoretical_loss": 3.935798605657119, + "tokens_seen": 475937792 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004322066198595788, + "loss": 3.1808, + "theoretical_loss": 3.9357397402504146, + "tokens_seen": 476003328 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004321965897693079, + "loss": 3.0204, + "theoretical_loss": 3.935680885216649, + "tokens_seen": 476068864 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043218655967903715, + "loss": 3.1083, + "theoretical_loss": 3.9356220405525666, + "tokens_seen": 476134400 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004321765295887663, + "loss": 2.8384, + "theoretical_loss": 3.9355632062549124, + "tokens_seen": 476199936 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004321664994984955, + "loss": 3.0837, + "theoretical_loss": 3.9355043823204348, + "tokens_seen": 476265472 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004321564694082247, + "loss": 3.0754, + "theoretical_loss": 3.935445568745883, + "tokens_seen": 476331008 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004321464393179539, + "loss": 3.1409, + "theoretical_loss": 3.9353867655280066, + "tokens_seen": 476396544 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043213640922768306, + "loss": 3.2543, + "theoretical_loss": 3.935327972663558, + "tokens_seen": 476462080 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043212637913741224, + "loss": 2.7809, + "theoretical_loss": 3.935269190149291, + "tokens_seen": 476527616 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004321163490471414, + "loss": 2.9419, + "theoretical_loss": 3.93521041798196, + "tokens_seen": 476593152 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043210631895687065, + "loss": 2.8718, + "theoretical_loss": 3.935151656158321, + "tokens_seen": 476658688 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004320962888665998, + "loss": 3.0566, + "theoretical_loss": 3.9350929046751313, + "tokens_seen": 476724224 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 560707, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.231940984725952, + "objective/train/theoretical_loss": 3.9350488478467103, + "objective/train/tokens_used": 497233376, + "theoretical_loss": 3.9350488478467103, + "tokens_seen": 476773376 + }, + { + "epoch": 1.05, + "learning_rate": 0.000432086258776329, + "loss": 3.0351, + "theoretical_loss": 3.9350341635291515, + "tokens_seen": 476789760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004320762286860582, + "loss": 3.0426, + "theoretical_loss": 3.934975432717142, + "tokens_seen": 476855296 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004320661985957874, + "loss": 2.9293, + "theoretical_loss": 3.9349167122358644, + "tokens_seen": 476920832 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043205616850551656, + "loss": 3.0121, + "theoretical_loss": 3.934858002082082, + "tokens_seen": 476986368 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043204613841524574, + "loss": 3.1352, + "theoretical_loss": 3.9347993022525607, + "tokens_seen": 477051904 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004320361083249749, + "loss": 3.0302, + "theoretical_loss": 3.9347406127440663, + "tokens_seen": 477117440 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043202607823470416, + "loss": 2.9891, + "theoretical_loss": 3.9346819335533674, + "tokens_seen": 477182976 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004320160481444333, + "loss": 3.0205, + "theoretical_loss": 3.9346232646772332, + "tokens_seen": 477248512 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004320060180541625, + "loss": 2.7596, + "theoretical_loss": 3.934564606112435, + "tokens_seen": 477314048 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043199598796389165, + "loss": 3.0225, + "theoretical_loss": 3.934505957855744, + "tokens_seen": 477379584 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004319859578736209, + "loss": 2.9677, + "theoretical_loss": 3.934447319903935, + "tokens_seen": 477445120 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043197592778335006, + "loss": 2.7368, + "theoretical_loss": 3.934388692253783, + "tokens_seen": 477510656 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043196589769307924, + "loss": 3.0439, + "theoretical_loss": 3.934330074902065, + "tokens_seen": 477576192 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004319558676028084, + "loss": 2.9094, + "theoretical_loss": 3.9342714678455586, + "tokens_seen": 477641728 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004319458375125376, + "loss": 2.786, + "theoretical_loss": 3.9342128710810433, + "tokens_seen": 477707264 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004319358074222668, + "loss": 3.1414, + "theoretical_loss": 3.9341542846053006, + "tokens_seen": 477772800 + }, + { + "epoch": 1.05, + "learning_rate": 0.000431925777331996, + "loss": 2.915, + "theoretical_loss": 3.934095708415113, + "tokens_seen": 477838336 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043191574724172515, + "loss": 2.9227, + "theoretical_loss": 3.934037142507264, + "tokens_seen": 477903872 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004319057171514544, + "loss": 2.7711, + "theoretical_loss": 3.9339785868785397, + "tokens_seen": 477969408 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043189568706118357, + "loss": 2.8507, + "theoretical_loss": 3.9339200415257265, + "tokens_seen": 478034944 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043188565697091275, + "loss": 3.0298, + "theoretical_loss": 3.933861506445613, + "tokens_seen": 478100480 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043187562688064193, + "loss": 2.8273, + "theoretical_loss": 3.933802981634988, + "tokens_seen": 478166016 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004318655967903711, + "loss": 3.1108, + "theoretical_loss": 3.9337444670906434, + "tokens_seen": 478231552 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004318555667001003, + "loss": 2.9331, + "theoretical_loss": 3.9336859628093723, + "tokens_seen": 478297088 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004318455366098295, + "loss": 3.0098, + "theoretical_loss": 3.933627468787968, + "tokens_seen": 478362624 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 561422, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4702494144439697, + "objective/train/theoretical_loss": 3.933583605003024, + "objective/train/tokens_used": 498871776, + "theoretical_loss": 3.933583605003024, + "tokens_seen": 478411776 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043183550651955865, + "loss": 2.9758, + "theoretical_loss": 3.933568985023226, + "tokens_seen": 478428160 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004318254764292879, + "loss": 3.136, + "theoretical_loss": 3.933510511511943, + "tokens_seen": 478493696 + }, + { + "epoch": 1.06, + "learning_rate": 0.000431815446339017, + "loss": 3.188, + "theoretical_loss": 3.9334520482509183, + "tokens_seen": 478559232 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043180541624874625, + "loss": 3.0016, + "theoretical_loss": 3.933393595236951, + "tokens_seen": 478624768 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043179538615847543, + "loss": 2.8216, + "theoretical_loss": 3.9333351524668427, + "tokens_seen": 478690304 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004317853560682046, + "loss": 2.9534, + "theoretical_loss": 3.9332767199373957, + "tokens_seen": 478755840 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004317753259779338, + "loss": 3.0022, + "theoretical_loss": 3.9332182976454146, + "tokens_seen": 478821376 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043176529588766303, + "loss": 3.1134, + "theoretical_loss": 3.9331598855877044, + "tokens_seen": 478886912 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043175526579739215, + "loss": 2.8389, + "theoretical_loss": 3.9331014837610727, + "tokens_seen": 478952448 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004317452357071214, + "loss": 2.9801, + "theoretical_loss": 3.933043092162327, + "tokens_seen": 479017984 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004317352056168505, + "loss": 2.9627, + "theoretical_loss": 3.9329847107882787, + "tokens_seen": 479083520 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043172517552657975, + "loss": 2.7128, + "theoretical_loss": 3.9329263396357375, + "tokens_seen": 479149056 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043171514543630893, + "loss": 2.8601, + "theoretical_loss": 3.932867978701517, + "tokens_seen": 479214592 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004317051153460381, + "loss": 3.0035, + "theoretical_loss": 3.932809627982431, + "tokens_seen": 479280128 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004316950852557673, + "loss": 3.0842, + "theoretical_loss": 3.9327512874752952, + "tokens_seen": 479345664 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004316850551654965, + "loss": 3.1308, + "theoretical_loss": 3.9326929571769265, + "tokens_seen": 479411200 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004316750250752257, + "loss": 3.1854, + "theoretical_loss": 3.9326346370841434, + "tokens_seen": 479476736 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004316649949849549, + "loss": 2.9725, + "theoretical_loss": 3.932576327193766, + "tokens_seen": 479542272 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004316549648946841, + "loss": 2.7423, + "theoretical_loss": 3.9325180275026153, + "tokens_seen": 479607808 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043164493480441326, + "loss": 3.0698, + "theoretical_loss": 3.9324597380075135, + "tokens_seen": 479673344 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043163490471414244, + "loss": 2.9938, + "theoretical_loss": 3.932401458705286, + "tokens_seen": 479738880 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004316248746238716, + "loss": 3.025, + "theoretical_loss": 3.932343189592757, + "tokens_seen": 479804416 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043161484453360085, + "loss": 3.0979, + "theoretical_loss": 3.9322849306667536, + "tokens_seen": 479869952 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043160481444333, + "loss": 2.8941, + "theoretical_loss": 3.9322266819241056, + "tokens_seen": 479935488 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004315947843530592, + "loss": 3.0009, + "theoretical_loss": 3.9321684433616415, + "tokens_seen": 480001024 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 562507, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.448958158493042, + "objective/train/theoretical_loss": 3.932124771118633, + "objective/train/tokens_used": 500510176, + "theoretical_loss": 3.932124771118633, + "tokens_seen": 480050176 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004315847542627884, + "loss": 2.906, + "theoretical_loss": 3.932110214976193, + "tokens_seen": 480066560 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004315747241725176, + "loss": 2.8742, + "theoretical_loss": 3.9320519967645926, + "tokens_seen": 480132096 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043156469408224676, + "loss": 3.1216, + "theoretical_loss": 3.931993788723674, + "tokens_seen": 480197632 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043155466399197594, + "loss": 2.9462, + "theoretical_loss": 3.9319355908502738, + "tokens_seen": 480263168 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004315446339017051, + "loss": 2.9604, + "theoretical_loss": 3.931877403141228, + "tokens_seen": 480328704 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043153460381143436, + "loss": 3.1821, + "theoretical_loss": 3.9318192255933755, + "tokens_seen": 480394240 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004315245737211635, + "loss": 3.1779, + "theoretical_loss": 3.9317610582035556, + "tokens_seen": 480459776 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004315145436308927, + "loss": 3.0144, + "theoretical_loss": 3.9317029009686086, + "tokens_seen": 480525312 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043150451354062185, + "loss": 3.1989, + "theoretical_loss": 3.9316447538853794, + "tokens_seen": 480590848 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004314944834503511, + "loss": 2.9107, + "theoretical_loss": 3.9315866169507094, + "tokens_seen": 480656384 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043148445336008026, + "loss": 2.7994, + "theoretical_loss": 3.931528490161446, + "tokens_seen": 480721920 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043147442326980944, + "loss": 2.9174, + "theoretical_loss": 3.9314703735144345, + "tokens_seen": 480787456 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004314643931795386, + "loss": 2.7913, + "theoretical_loss": 3.931412267006524, + "tokens_seen": 480852992 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004314543630892678, + "loss": 3.0289, + "theoretical_loss": 3.931354170634563, + "tokens_seen": 480918528 + }, + { + "epoch": 1.06, + "learning_rate": 0.000431444332998997, + "loss": 3.0211, + "theoretical_loss": 3.9312960843954046, + "tokens_seen": 480984064 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004314343029087262, + "loss": 2.9529, + "theoretical_loss": 3.9312380082858995, + "tokens_seen": 481049600 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043142427281845535, + "loss": 3.0352, + "theoretical_loss": 3.931179942302902, + "tokens_seen": 481115136 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004314142427281846, + "loss": 2.9488, + "theoretical_loss": 3.9311218864432673, + "tokens_seen": 481180672 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043140421263791377, + "loss": 2.9354, + "theoretical_loss": 3.9310638407038523, + "tokens_seen": 481246208 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043139418254764295, + "loss": 2.8764, + "theoretical_loss": 3.931005805081515, + "tokens_seen": 481311744 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043138415245737213, + "loss": 2.8272, + "theoretical_loss": 3.9309477795731143, + "tokens_seen": 481377280 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004313741223671013, + "loss": 2.9896, + "theoretical_loss": 3.930889764175512, + "tokens_seen": 481442816 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004313640922768305, + "loss": 2.9828, + "theoretical_loss": 3.930831758885569, + "tokens_seen": 481508352 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004313540621865597, + "loss": 3.0283, + "theoretical_loss": 3.9307737637001505, + "tokens_seen": 481573888 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043134403209628885, + "loss": 2.915, + "theoretical_loss": 3.93071577861612, + "tokens_seen": 481639424 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 563271, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.949291229248047, + "objective/train/theoretical_loss": 3.9306722964302496, + "objective/train/tokens_used": 502148576, + "theoretical_loss": 3.9306722964302496, + "tokens_seen": 481688576 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004313340020060181, + "loss": 3.0166, + "theoretical_loss": 3.930657803630346, + "tokens_seen": 481704960 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004313239719157472, + "loss": 2.8268, + "theoretical_loss": 3.9305998387396945, + "tokens_seen": 481770496 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043131394182547645, + "loss": 3.1203, + "theoretical_loss": 3.9305418839410353, + "tokens_seen": 481836032 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043130391173520563, + "loss": 3.1673, + "theoretical_loss": 3.930483939231239, + "tokens_seen": 481901568 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004312938816449348, + "loss": 2.9154, + "theoretical_loss": 3.930426004607178, + "tokens_seen": 481967104 + }, + { + "epoch": 1.06, + "learning_rate": 0.000431283851554664, + "loss": 2.7331, + "theoretical_loss": 3.930368080065726, + "tokens_seen": 482032640 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043127382146439323, + "loss": 2.9744, + "theoretical_loss": 3.930310165603757, + "tokens_seen": 482098176 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043126379137412236, + "loss": 2.959, + "theoretical_loss": 3.930252261218147, + "tokens_seen": 482163712 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004312537612838516, + "loss": 2.9013, + "theoretical_loss": 3.9301943669057744, + "tokens_seen": 482229248 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004312437311935807, + "loss": 2.9071, + "theoretical_loss": 3.9301364826635186, + "tokens_seen": 482294784 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043123370110330995, + "loss": 2.7957, + "theoretical_loss": 3.930078608488259, + "tokens_seen": 482360320 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043122367101303913, + "loss": 2.822, + "theoretical_loss": 3.9300207443768773, + "tokens_seen": 482425856 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004312136409227683, + "loss": 3.0268, + "theoretical_loss": 3.929962890326258, + "tokens_seen": 482491392 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004312036108324975, + "loss": 3.1638, + "theoretical_loss": 3.9299050463332845, + "tokens_seen": 482556928 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004311935807422267, + "loss": 3.1264, + "theoretical_loss": 3.929847212394843, + "tokens_seen": 482622464 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043118355065195586, + "loss": 2.8405, + "theoretical_loss": 3.929789388507821, + "tokens_seen": 482688000 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004311735205616851, + "loss": 2.9131, + "theoretical_loss": 3.9297315746691073, + "tokens_seen": 482753536 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004311634904714142, + "loss": 2.8616, + "theoretical_loss": 3.9296737708755916, + "tokens_seen": 482819072 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043115346038114346, + "loss": 3.1964, + "theoretical_loss": 3.9296159771241657, + "tokens_seen": 482884608 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004311434302908726, + "loss": 3.1841, + "theoretical_loss": 3.9295581934117223, + "tokens_seen": 482950144 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004311334002006018, + "loss": 2.7733, + "theoretical_loss": 3.9295004197351564, + "tokens_seen": 483015680 + }, + { + "epoch": 1.06, + "learning_rate": 0.000431123370110331, + "loss": 2.8998, + "theoretical_loss": 3.9294426560913625, + "tokens_seen": 483081216 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004311133400200602, + "loss": 3.0662, + "theoretical_loss": 3.9293849024772385, + "tokens_seen": 483146752 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043110330992978936, + "loss": 3.1417, + "theoretical_loss": 3.9293271588896825, + "tokens_seen": 483212288 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004310932798395186, + "loss": 3.0213, + "theoretical_loss": 3.929269425325594, + "tokens_seen": 483277824 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 567309, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9936575889587402, + "objective/train/theoretical_loss": 3.929226131728565, + "objective/train/tokens_used": 503786976, + "theoretical_loss": 3.929226131728565, + "tokens_seen": 483326976 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004310832497492477, + "loss": 2.9944, + "theoretical_loss": 3.929211701781875, + "tokens_seen": 483343360 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043107321965897696, + "loss": 2.9511, + "theoretical_loss": 3.929153988255427, + "tokens_seen": 483408896 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004310631895687061, + "loss": 3.0354, + "theoretical_loss": 3.9290962847431548, + "tokens_seen": 483474432 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004310531594784353, + "loss": 2.9623, + "theoretical_loss": 3.9290385912419628, + "tokens_seen": 483539968 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004310431293881645, + "loss": 2.8225, + "theoretical_loss": 3.9289809077487585, + "tokens_seen": 483605504 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004310330992978937, + "loss": 3.083, + "theoretical_loss": 3.9289232342604494, + "tokens_seen": 483671040 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043102306920762286, + "loss": 3.022, + "theoretical_loss": 3.9288655707739455, + "tokens_seen": 483736576 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043101303911735205, + "loss": 2.8435, + "theoretical_loss": 3.928807917286157, + "tokens_seen": 483802112 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004310030090270812, + "loss": 2.8934, + "theoretical_loss": 3.928750273793997, + "tokens_seen": 483867648 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043099297893681046, + "loss": 3.0615, + "theoretical_loss": 3.9286926402943774, + "tokens_seen": 483933184 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004309829488465396, + "loss": 2.8151, + "theoretical_loss": 3.928635016784215, + "tokens_seen": 483998720 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004309729187562688, + "loss": 3.0473, + "theoretical_loss": 3.9285774032604244, + "tokens_seen": 484064256 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043096288866599795, + "loss": 3.172, + "theoretical_loss": 3.9285197997199246, + "tokens_seen": 484129792 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004309528585757272, + "loss": 2.8463, + "theoretical_loss": 3.928462206159634, + "tokens_seen": 484195328 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043094282848545637, + "loss": 2.9347, + "theoretical_loss": 3.928404622576472, + "tokens_seen": 484260864 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043093279839518555, + "loss": 2.9781, + "theoretical_loss": 3.928347048967362, + "tokens_seen": 484326400 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004309227683049148, + "loss": 3.0882, + "theoretical_loss": 3.928289485329227, + "tokens_seen": 484391936 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043091273821464397, + "loss": 2.8604, + "theoretical_loss": 3.92823193165899, + "tokens_seen": 484457472 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043090270812437315, + "loss": 2.9441, + "theoretical_loss": 3.928174387953579, + "tokens_seen": 484523008 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043089267803410233, + "loss": 2.9715, + "theoretical_loss": 3.9281168542099194, + "tokens_seen": 484588544 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004308826479438315, + "loss": 3.1032, + "theoretical_loss": 3.9280593304249405, + "tokens_seen": 484654080 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004308726178535607, + "loss": 2.9442, + "theoretical_loss": 3.928001816595572, + "tokens_seen": 484719616 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004308625877632899, + "loss": 3.163, + "theoretical_loss": 3.9279443127187452, + "tokens_seen": 484785152 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043085255767301905, + "loss": 3.1574, + "theoretical_loss": 3.9278868187913933, + "tokens_seen": 484850688 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004308425275827483, + "loss": 2.7887, + "theoretical_loss": 3.9278293348104496, + "tokens_seen": 484916224 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 572265, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1022114753723145, + "objective/train/theoretical_loss": 3.927786228350229, + "objective/train/tokens_used": 505425376, + "theoretical_loss": 3.927786228350229, + "tokens_seen": 484965376 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004308324974924774, + "loss": 3.0504, + "theoretical_loss": 3.9277718607728502, + "tokens_seen": 484981760 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043082246740220665, + "loss": 2.8934, + "theoretical_loss": 3.9277143966755315, + "tokens_seen": 485047296 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043081243731193583, + "loss": 2.9485, + "theoretical_loss": 3.927656942515431, + "tokens_seen": 485112832 + }, + { + "epoch": 1.06, + "learning_rate": 0.000430802407221665, + "loss": 3.1336, + "theoretical_loss": 3.927599498289489, + "tokens_seen": 485178368 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004307923771313942, + "loss": 2.9035, + "theoretical_loss": 3.927542063994646, + "tokens_seen": 485243904 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043078234704112343, + "loss": 3.1617, + "theoretical_loss": 3.927484639627844, + "tokens_seen": 485309440 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043077231695085256, + "loss": 2.9164, + "theoretical_loss": 3.927427225186027, + "tokens_seen": 485374976 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004307622868605818, + "loss": 3.2708, + "theoretical_loss": 3.927369820666139, + "tokens_seen": 485440512 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004307522567703109, + "loss": 3.16, + "theoretical_loss": 3.9273124260651278, + "tokens_seen": 485506048 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043074222668004015, + "loss": 3.1773, + "theoretical_loss": 3.927255041379939, + "tokens_seen": 485571584 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043073219658976933, + "loss": 2.9499, + "theoretical_loss": 3.9271976666075226, + "tokens_seen": 485637120 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004307221664994985, + "loss": 2.79, + "theoretical_loss": 3.9271403017448288, + "tokens_seen": 485702656 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004307121364092277, + "loss": 3.0427, + "theoretical_loss": 3.9270829467888095, + "tokens_seen": 485768192 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004307021063189569, + "loss": 3.1896, + "theoretical_loss": 3.9270256017364167, + "tokens_seen": 485833728 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043069207622868606, + "loss": 3.217, + "theoretical_loss": 3.9269682665846064, + "tokens_seen": 485899264 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004306820461384153, + "loss": 3.0031, + "theoretical_loss": 3.9269109413303322, + "tokens_seen": 485964800 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004306720160481444, + "loss": 3.174, + "theoretical_loss": 3.9268536259705527, + "tokens_seen": 486030336 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043066198595787366, + "loss": 2.8777, + "theoretical_loss": 3.926796320502225, + "tokens_seen": 486095872 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004306519558676028, + "loss": 3.1536, + "theoretical_loss": 3.926739024922311, + "tokens_seen": 486161408 + }, + { + "epoch": 1.06, + "learning_rate": 0.000430641925777332, + "loss": 2.9334, + "theoretical_loss": 3.926681739227769, + "tokens_seen": 486226944 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004306318956870612, + "loss": 2.8147, + "theoretical_loss": 3.926624463415563, + "tokens_seen": 486292480 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004306218655967904, + "loss": 2.9436, + "theoretical_loss": 3.9265671974826564, + "tokens_seen": 486358016 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043061183550651956, + "loss": 2.9908, + "theoretical_loss": 3.9265099414260147, + "tokens_seen": 486423552 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004306018054162488, + "loss": 3.0091, + "theoretical_loss": 3.9264526952426033, + "tokens_seen": 486489088 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004305917753259779, + "loss": 2.9953, + "theoretical_loss": 3.926395458929391, + "tokens_seen": 486554624 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 577157, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9272384643554688, + "objective/train/theoretical_loss": 3.9263525381699758, + "objective/train/tokens_used": 507063776, + "theoretical_loss": 3.9263525381699758, + "tokens_seen": 486603776 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043058174523570716, + "loss": 3.0174, + "theoretical_loss": 3.926338232483346, + "tokens_seen": 486620160 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004305717151454363, + "loss": 3.1603, + "theoretical_loss": 3.9262810159014396, + "tokens_seen": 486685696 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004305616850551655, + "loss": 3.1405, + "theoretical_loss": 3.926223809180643, + "tokens_seen": 486751232 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004305516549648947, + "loss": 2.9448, + "theoretical_loss": 3.9261666123179295, + "tokens_seen": 486816768 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004305416248746239, + "loss": 3.2668, + "theoretical_loss": 3.9261094253102735, + "tokens_seen": 486882304 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043053159478435306, + "loss": 2.8592, + "theoretical_loss": 3.926052248154651, + "tokens_seen": 486947840 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043052156469408225, + "loss": 2.8944, + "theoretical_loss": 3.925995080848039, + "tokens_seen": 487013376 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004305115346038114, + "loss": 2.9189, + "theoretical_loss": 3.9259379233874157, + "tokens_seen": 487078912 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043050150451354066, + "loss": 2.8615, + "theoretical_loss": 3.9258807757697607, + "tokens_seen": 487144448 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004304914744232698, + "loss": 3.1765, + "theoretical_loss": 3.9258236379920564, + "tokens_seen": 487209984 + }, + { + "epoch": 1.06, + "learning_rate": 0.000430481444332999, + "loss": 2.7061, + "theoretical_loss": 3.925766510051284, + "tokens_seen": 487275520 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043047141424272815, + "loss": 2.9585, + "theoretical_loss": 3.9257093919444275, + "tokens_seen": 487341056 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004304613841524574, + "loss": 3.07, + "theoretical_loss": 3.925652283668472, + "tokens_seen": 487406592 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043045135406218657, + "loss": 3.2781, + "theoretical_loss": 3.9255951852204047, + "tokens_seen": 487472128 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043044132397191575, + "loss": 2.9378, + "theoretical_loss": 3.9255380965972124, + "tokens_seen": 487537664 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043043129388164493, + "loss": 3.1473, + "theoretical_loss": 3.9254810177958848, + "tokens_seen": 487603200 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043042126379137417, + "loss": 2.8296, + "theoretical_loss": 3.9254239488134117, + "tokens_seen": 487668736 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004304112337011033, + "loss": 3.0285, + "theoretical_loss": 3.925366889646786, + "tokens_seen": 487734272 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043040120361083253, + "loss": 2.8856, + "theoretical_loss": 3.925309840292999, + "tokens_seen": 487799808 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043039117352056165, + "loss": 2.8328, + "theoretical_loss": 3.925252800749047, + "tokens_seen": 487865344 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004303811434302909, + "loss": 2.9618, + "theoretical_loss": 3.925195771011924, + "tokens_seen": 487930880 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043037111334002007, + "loss": 3.0661, + "theoretical_loss": 3.925138751078629, + "tokens_seen": 487996416 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043036108324974925, + "loss": 3.176, + "theoretical_loss": 3.925081740946159, + "tokens_seen": 488061952 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043035105315947843, + "loss": 3.0429, + "theoretical_loss": 3.9250247406115135, + "tokens_seen": 488127488 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004303410230692076, + "loss": 3.0888, + "theoretical_loss": 3.924967750071694, + "tokens_seen": 488193024 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 582212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2292799949645996, + "objective/train/theoretical_loss": 3.9249250135928815, + "objective/train/tokens_used": 508702176, + "theoretical_loss": 3.9249250135928815, + "tokens_seen": 488242176 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004303309929789368, + "loss": 2.9761, + "theoretical_loss": 3.9249107693237035, + "tokens_seen": 488258560 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043032096288866603, + "loss": 2.9192, + "theoretical_loss": 3.924853798364545, + "tokens_seen": 488324096 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043031093279839516, + "loss": 3.129, + "theoretical_loss": 3.9247968371912227, + "tokens_seen": 488389632 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004303009027081244, + "loss": 3.078, + "theoretical_loss": 3.9247398858007445, + "tokens_seen": 488455168 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004302908726178535, + "loss": 3.0365, + "theoretical_loss": 3.924682944190117, + "tokens_seen": 488520704 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043028084252758276, + "loss": 2.8859, + "theoretical_loss": 3.924626012356349, + "tokens_seen": 488586240 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043027081243731194, + "loss": 3.0156, + "theoretical_loss": 3.924569090296451, + "tokens_seen": 488651776 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004302607823470411, + "loss": 3.0717, + "theoretical_loss": 3.9245121780074355, + "tokens_seen": 488717312 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004302507522567703, + "loss": 2.8483, + "theoretical_loss": 3.924455275486314, + "tokens_seen": 488782848 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043024072216649953, + "loss": 2.7496, + "theoretical_loss": 3.9243983827301006, + "tokens_seen": 488848384 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043023069207622866, + "loss": 3.1034, + "theoretical_loss": 3.924341499735812, + "tokens_seen": 488913920 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004302206619859579, + "loss": 2.8161, + "theoretical_loss": 3.924284626500464, + "tokens_seen": 488979456 + }, + { + "epoch": 1.06, + "learning_rate": 0.000430210631895687, + "loss": 3.0165, + "theoretical_loss": 3.9242277630210753, + "tokens_seen": 489044992 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043020060180541626, + "loss": 3.1262, + "theoretical_loss": 3.9241709092946655, + "tokens_seen": 489110528 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043019057171514544, + "loss": 3.0549, + "theoretical_loss": 3.924114065318255, + "tokens_seen": 489176064 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004301805416248746, + "loss": 3.0324, + "theoretical_loss": 3.9240572310888657, + "tokens_seen": 489241600 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043017051153460386, + "loss": 2.9433, + "theoretical_loss": 3.924000406603521, + "tokens_seen": 489307136 + }, + { + "epoch": 1.06, + "learning_rate": 0.000430160481444333, + "loss": 2.8329, + "theoretical_loss": 3.9239435918592456, + "tokens_seen": 489372672 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004301504513540622, + "loss": 3.0254, + "theoretical_loss": 3.923886786853066, + "tokens_seen": 489438208 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004301404212637914, + "loss": 2.874, + "theoretical_loss": 3.9238299915820085, + "tokens_seen": 489503744 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004301303911735206, + "loss": 2.8958, + "theoretical_loss": 3.9237732060431023, + "tokens_seen": 489569280 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043012036108324976, + "loss": 2.9936, + "theoretical_loss": 3.9237164302333776, + "tokens_seen": 489634816 + }, + { + "epoch": 1.06, + "learning_rate": 0.000430110330992979, + "loss": 3.0332, + "theoretical_loss": 3.923659664149865, + "tokens_seen": 489700352 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004301003009027081, + "loss": 3.1645, + "theoretical_loss": 3.923602907789597, + "tokens_seen": 489765888 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043009027081243736, + "loss": 2.7748, + "theoretical_loss": 3.9235461611496083, + "tokens_seen": 489831424 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 587295, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1512694358825684, + "objective/train/theoretical_loss": 3.923503607546766, + "objective/train/tokens_used": 510340576, + "theoretical_loss": 3.923503607546766, + "tokens_seen": 489880576 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004300802407221665, + "loss": 2.8325, + "theoretical_loss": 3.9234894242269327, + "tokens_seen": 489896960 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004300702106318957, + "loss": 2.8625, + "theoretical_loss": 3.9234326970186073, + "tokens_seen": 489962496 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004300601805416249, + "loss": 3.0172, + "theoretical_loss": 3.92337597952167, + "tokens_seen": 490028032 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004300501504513541, + "loss": 3.0533, + "theoretical_loss": 3.9233192717331598, + "tokens_seen": 490093568 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043004012036108326, + "loss": 2.9677, + "theoretical_loss": 3.923262573650116, + "tokens_seen": 490159104 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043003009027081245, + "loss": 2.8643, + "theoretical_loss": 3.9232058852695815, + "tokens_seen": 490224640 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043002006018054163, + "loss": 3.1406, + "theoretical_loss": 3.9231492065885982, + "tokens_seen": 490290176 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043001003009027086, + "loss": 3.2728, + "theoretical_loss": 3.9230925376042114, + "tokens_seen": 490355712 + }, + { + "epoch": 1.06, + "learning_rate": 0.00043, + "loss": 3.1938, + "theoretical_loss": 3.923035878313465, + "tokens_seen": 490421248 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004299899699097292, + "loss": 2.9093, + "theoretical_loss": 3.9229792287134075, + "tokens_seen": 490486784 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042997993981945835, + "loss": 2.8083, + "theoretical_loss": 3.922922588801086, + "tokens_seen": 490552320 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004299699097291876, + "loss": 3.3187, + "theoretical_loss": 3.9228659585735497, + "tokens_seen": 490617856 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042995987963891677, + "loss": 2.8643, + "theoretical_loss": 3.92280933802785, + "tokens_seen": 490683392 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042994984954864595, + "loss": 2.8438, + "theoretical_loss": 3.922752727161038, + "tokens_seen": 490748928 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042993981945837513, + "loss": 3.1228, + "theoretical_loss": 3.922696125970168, + "tokens_seen": 490814464 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042992978936810437, + "loss": 2.8619, + "theoretical_loss": 3.9226395344522933, + "tokens_seen": 490880000 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004299197592778335, + "loss": 3.0386, + "theoretical_loss": 3.9225829526044707, + "tokens_seen": 490945536 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042990972918756273, + "loss": 2.8399, + "theoretical_loss": 3.922526380423757, + "tokens_seen": 491011072 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042989969909729185, + "loss": 2.8468, + "theoretical_loss": 3.922469817907211, + "tokens_seen": 491076608 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004298896690070211, + "loss": 2.9284, + "theoretical_loss": 3.9224132650518913, + "tokens_seen": 491142144 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042987963891675027, + "loss": 2.9865, + "theoretical_loss": 3.9223567218548596, + "tokens_seen": 491207680 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042986960882647945, + "loss": 2.9417, + "theoretical_loss": 3.9223001883131783, + "tokens_seen": 491273216 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042985957873620863, + "loss": 3.009, + "theoretical_loss": 3.9222436644239114, + "tokens_seen": 491338752 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004298495486459378, + "loss": 2.8589, + "theoretical_loss": 3.922187150184122, + "tokens_seen": 491404288 + }, + { + "epoch": 1.06, + "learning_rate": 0.000429839518555667, + "loss": 2.9139, + "theoretical_loss": 3.9221306455908778, + "tokens_seen": 491469824 + }, + { + "debugging/Self-BLEU-5": 0.5154921509631192, + "debugging/distinct-1-grams": 0.734842616282129, + "debugging/distinct-2-grams": 0.8846155258692667, + "debugging/entropy-1-grams": 6.056339442041576, + "debugging/entropy-2-grams": 7.145308721973038, + "debugging/length": 484.85, + "debugging/num_segments": 20, + "debugging/score": 0.003003109095205246, + "debugging/score_std": 0.004943536934620138, + "epoch": 1.06, + "objective/train/docs_used": 590236, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2846457958221436, + "objective/train/theoretical_loss": 3.922088273474726, + "objective/train/tokens_used": 511978976, + "theoretical_loss": 3.922088273474726, + "tokens_seen": 491518976 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042982948846539623, + "loss": 3.0361, + "theoretical_loss": 3.9220741506412464, + "tokens_seen": 491535360 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042981945837512536, + "loss": 2.8973, + "theoretical_loss": 3.9220176653322953, + "tokens_seen": 491600896 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004298094282848546, + "loss": 2.7277, + "theoretical_loss": 3.9219611896610944, + "tokens_seen": 491666432 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004297993981945837, + "loss": 2.8928, + "theoretical_loss": 3.921904723624716, + "tokens_seen": 491731968 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042978936810431296, + "loss": 2.8518, + "theoretical_loss": 3.9218482672202324, + "tokens_seen": 491797504 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042977933801404214, + "loss": 3.1745, + "theoretical_loss": 3.9217918204447173, + "tokens_seen": 491863040 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004297693079237713, + "loss": 2.4701, + "theoretical_loss": 3.9217353832952453, + "tokens_seen": 491928576 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004297592778335005, + "loss": 3.0984, + "theoretical_loss": 3.921678955768893, + "tokens_seen": 491994112 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042974924774322973, + "loss": 3.0063, + "theoretical_loss": 3.9216225378627385, + "tokens_seen": 492059648 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042973921765295886, + "loss": 2.8113, + "theoretical_loss": 3.92156612957386, + "tokens_seen": 492125184 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004297291875626881, + "loss": 3.1234, + "theoretical_loss": 3.921509730899338, + "tokens_seen": 492190720 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004297191574724172, + "loss": 2.9805, + "theoretical_loss": 3.9214533418362545, + "tokens_seen": 492256256 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042970912738214646, + "loss": 2.7724, + "theoretical_loss": 3.9213969623816913, + "tokens_seen": 492321792 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042969909729187564, + "loss": 2.9657, + "theoretical_loss": 3.921340592532733, + "tokens_seen": 492387328 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004296890672016048, + "loss": 3.0349, + "theoretical_loss": 3.921284232286465, + "tokens_seen": 492452864 + }, + { + "epoch": 1.06, + "learning_rate": 0.000429679037111334, + "loss": 3.1185, + "theoretical_loss": 3.9212278816399735, + "tokens_seen": 492518400 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004296690070210632, + "loss": 3.1046, + "theoretical_loss": 3.9211715405903464, + "tokens_seen": 492583936 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042965897693079236, + "loss": 2.7548, + "theoretical_loss": 3.9211152091346735, + "tokens_seen": 492649472 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004296489468405216, + "loss": 2.9028, + "theoretical_loss": 3.9210588872700436, + "tokens_seen": 492715008 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004296389167502507, + "loss": 2.9545, + "theoretical_loss": 3.9210025749935493, + "tokens_seen": 492780544 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042962888665997996, + "loss": 2.8856, + "theoretical_loss": 3.9209462723022837, + "tokens_seen": 492846080 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004296188565697091, + "loss": 2.8481, + "theoretical_loss": 3.9208899791933414, + "tokens_seen": 492911616 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004296088264794383, + "loss": 3.013, + "theoretical_loss": 3.9208336956638163, + "tokens_seen": 492977152 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004295987963891675, + "loss": 2.8045, + "theoretical_loss": 3.920777421710807, + "tokens_seen": 493042688 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004295887662988967, + "loss": 2.8337, + "theoretical_loss": 3.9207211573314096, + "tokens_seen": 493108224 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 590917, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5917956829071045, + "objective/train/theoretical_loss": 3.9206789653278, + "objective/train/tokens_used": 513617376, + "theoretical_loss": 3.9206789653278, + "tokens_seen": 493157376 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042957873620862587, + "loss": 2.8503, + "theoretical_loss": 3.9206649025227245, + "tokens_seen": 493173760 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004295687061183551, + "loss": 3.2481, + "theoretical_loss": 3.9206086572818517, + "tokens_seen": 493239296 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042955867602808423, + "loss": 3.1467, + "theoretical_loss": 3.920552421605894, + "tokens_seen": 493304832 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042954864593781347, + "loss": 2.8639, + "theoretical_loss": 3.920496195491953, + "tokens_seen": 493370368 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004295386158475426, + "loss": 3.1696, + "theoretical_loss": 3.920439978937134, + "tokens_seen": 493435904 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042952858575727183, + "loss": 3.0995, + "theoretical_loss": 3.9203837719385417, + "tokens_seen": 493501440 + }, + { + "epoch": 1.06, + "learning_rate": 0.000429518555667001, + "loss": 2.6516, + "theoretical_loss": 3.920327574493284, + "tokens_seen": 493566976 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004295085255767302, + "loss": 3.0273, + "theoretical_loss": 3.920271386598468, + "tokens_seen": 493632512 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042949849548645937, + "loss": 2.7616, + "theoretical_loss": 3.9202152082512036, + "tokens_seen": 493698048 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042948846539618855, + "loss": 2.9871, + "theoretical_loss": 3.9201590394486012, + "tokens_seen": 493763584 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042947843530591773, + "loss": 2.8426, + "theoretical_loss": 3.9201028801877724, + "tokens_seen": 493829120 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042946840521564697, + "loss": 2.9274, + "theoretical_loss": 3.920046730465831, + "tokens_seen": 493894656 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004294583751253761, + "loss": 2.8587, + "theoretical_loss": 3.919990590279891, + "tokens_seen": 493960192 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042944834503510533, + "loss": 2.7168, + "theoretical_loss": 3.9199344596270675, + "tokens_seen": 494025728 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004294383149448345, + "loss": 2.8623, + "theoretical_loss": 3.9198783385044784, + "tokens_seen": 494091264 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004294282848545637, + "loss": 2.8781, + "theoretical_loss": 3.919822226909241, + "tokens_seen": 494156800 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042941825476429293, + "loss": 3.0756, + "theoretical_loss": 3.919766124838475, + "tokens_seen": 494222336 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042940822467402206, + "loss": 2.9693, + "theoretical_loss": 3.9197100322893013, + "tokens_seen": 494287872 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004293981945837513, + "loss": 2.8422, + "theoretical_loss": 3.919653949258841, + "tokens_seen": 494353408 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042938816449348047, + "loss": 2.9434, + "theoretical_loss": 3.919597875744218, + "tokens_seen": 494418944 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042937813440320965, + "loss": 2.9088, + "theoretical_loss": 3.9195418117425564, + "tokens_seen": 494484480 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042936810431293883, + "loss": 2.8325, + "theoretical_loss": 3.919485757250982, + "tokens_seen": 494550016 + }, + { + "epoch": 1.06, + "learning_rate": 0.000429358074222668, + "loss": 2.8604, + "theoretical_loss": 3.9194297122666213, + "tokens_seen": 494615552 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004293480441323972, + "loss": 2.9683, + "theoretical_loss": 3.919373676786603, + "tokens_seen": 494681088 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042933801404212643, + "loss": 3.1226, + "theoretical_loss": 3.919317650808056, + "tokens_seen": 494746624 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 592366, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.896491527557373, + "objective/train/theoretical_loss": 3.91927563755776, + "objective/train/tokens_used": 515255776, + "theoretical_loss": 3.91927563755776, + "tokens_seen": 494795776 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042932798395185556, + "loss": 3.0733, + "theoretical_loss": 3.9192616343281106, + "tokens_seen": 494812160 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004293179538615848, + "loss": 3.0851, + "theoretical_loss": 3.9192056273438998, + "tokens_seen": 494877696 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004293079237713139, + "loss": 3.1179, + "theoretical_loss": 3.919149629852556, + "tokens_seen": 494943232 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042929789368104316, + "loss": 3.011, + "theoretical_loss": 3.919093641851214, + "tokens_seen": 495008768 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042928786359077234, + "loss": 2.7185, + "theoretical_loss": 3.9190376633370088, + "tokens_seen": 495074304 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004292778335005015, + "loss": 3.0177, + "theoretical_loss": 3.918981694307077, + "tokens_seen": 495139840 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004292678034102307, + "loss": 2.9071, + "theoretical_loss": 3.9189257347585578, + "tokens_seen": 495205376 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042925777331995993, + "loss": 3.0857, + "theoretical_loss": 3.9188697846885905, + "tokens_seen": 495270912 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042924774322968906, + "loss": 3.1196, + "theoretical_loss": 3.9188138440943145, + "tokens_seen": 495336448 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004292377131394183, + "loss": 2.9395, + "theoretical_loss": 3.9187579129728727, + "tokens_seen": 495401984 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004292276830491474, + "loss": 3.0981, + "theoretical_loss": 3.9187019913214076, + "tokens_seen": 495467520 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042921765295887666, + "loss": 2.8836, + "theoretical_loss": 3.918646079137064, + "tokens_seen": 495533056 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042920762286860584, + "loss": 3.0769, + "theoretical_loss": 3.918590176416987, + "tokens_seen": 495598592 + }, + { + "epoch": 1.06, + "learning_rate": 0.000429197592778335, + "loss": 2.9373, + "theoretical_loss": 3.918534283158323, + "tokens_seen": 495664128 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004291875626880642, + "loss": 2.8688, + "theoretical_loss": 3.9184783993582215, + "tokens_seen": 495729664 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004291775325977934, + "loss": 2.866, + "theoretical_loss": 3.91842252501383, + "tokens_seen": 495795200 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042916750250752256, + "loss": 3.0718, + "theoretical_loss": 3.9183666601222997, + "tokens_seen": 495860736 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004291574724172518, + "loss": 2.7853, + "theoretical_loss": 3.9183108046807833, + "tokens_seen": 495926272 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004291474423269809, + "loss": 2.8375, + "theoretical_loss": 3.918254958686432, + "tokens_seen": 495991808 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042913741223671016, + "loss": 3.17, + "theoretical_loss": 3.918199122136401, + "tokens_seen": 496057344 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004291273821464393, + "loss": 2.9484, + "theoretical_loss": 3.9181432950278463, + "tokens_seen": 496122880 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004291173520561685, + "loss": 2.966, + "theoretical_loss": 3.9180874773579233, + "tokens_seen": 496188416 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004291073219658977, + "loss": 2.9595, + "theoretical_loss": 3.91803166912379, + "tokens_seen": 496253952 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004290972918756269, + "loss": 2.6767, + "theoretical_loss": 3.917975870322607, + "tokens_seen": 496319488 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042908726178535607, + "loss": 2.8961, + "theoretical_loss": 3.917920080951533, + "tokens_seen": 496385024 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 593160, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7651126384735107, + "objective/train/theoretical_loss": 3.9178782451100296, + "objective/train/tokens_used": 516894176, + "theoretical_loss": 3.9178782451100296, + "tokens_seen": 496434176 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004290772316950853, + "loss": 3.037, + "theoretical_loss": 3.9178643010077305, + "tokens_seen": 496450560 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042906720160481443, + "loss": 2.8813, + "theoretical_loss": 3.9178085304883616, + "tokens_seen": 496516096 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042905717151454367, + "loss": 2.8188, + "theoretical_loss": 3.9177527693905914, + "tokens_seen": 496581632 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004290471414242728, + "loss": 3.0604, + "theoretical_loss": 3.9176970177115837, + "tokens_seen": 496647168 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042903711133400203, + "loss": 2.8691, + "theoretical_loss": 3.9176412754485064, + "tokens_seen": 496712704 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004290270812437312, + "loss": 2.9651, + "theoretical_loss": 3.917585542598527, + "tokens_seen": 496778240 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004290170511534604, + "loss": 3.1923, + "theoretical_loss": 3.9175298191588137, + "tokens_seen": 496843776 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042900702106318957, + "loss": 2.8673, + "theoretical_loss": 3.9174741051265367, + "tokens_seen": 496909312 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042899699097291875, + "loss": 2.9436, + "theoretical_loss": 3.9174184004988684, + "tokens_seen": 496974848 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042898696088264793, + "loss": 3.2724, + "theoretical_loss": 3.9173627052729803, + "tokens_seen": 497040384 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042897693079237717, + "loss": 3.1074, + "theoretical_loss": 3.9173070194460466, + "tokens_seen": 497105920 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004289669007021063, + "loss": 2.9554, + "theoretical_loss": 3.9172513430152427, + "tokens_seen": 497171456 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042895687061183553, + "loss": 2.8133, + "theoretical_loss": 3.9171956759777453, + "tokens_seen": 497236992 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004289468405215647, + "loss": 2.9769, + "theoretical_loss": 3.9171400183307306, + "tokens_seen": 497302528 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004289368104312939, + "loss": 3.1201, + "theoretical_loss": 3.917084370071378, + "tokens_seen": 497368064 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004289267803410231, + "loss": 2.8267, + "theoretical_loss": 3.9170287311968677, + "tokens_seen": 497433600 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042891675025075226, + "loss": 2.8601, + "theoretical_loss": 3.9169731017043805, + "tokens_seen": 497499136 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042890672016048144, + "loss": 3.0935, + "theoretical_loss": 3.9169174815910983, + "tokens_seen": 497564672 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042889669007021067, + "loss": 2.785, + "theoretical_loss": 3.916861870854206, + "tokens_seen": 497630208 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004288866599799398, + "loss": 3.1246, + "theoretical_loss": 3.9168062694908876, + "tokens_seen": 497695744 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042887662988966903, + "loss": 3.036, + "theoretical_loss": 3.916750677498329, + "tokens_seen": 497761280 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042886659979939816, + "loss": 3.0575, + "theoretical_loss": 3.916695094873718, + "tokens_seen": 497826816 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004288565697091274, + "loss": 2.9982, + "theoretical_loss": 3.9166395216142424, + "tokens_seen": 497892352 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004288465396188566, + "loss": 3.0071, + "theoretical_loss": 3.9165839577170924, + "tokens_seen": 497957888 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042883650952858576, + "loss": 2.8608, + "theoretical_loss": 3.916528403179459, + "tokens_seen": 498023424 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 593907, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9000375270843506, + "objective/train/theoretical_loss": 3.916486743416727, + "objective/train/tokens_used": 518532576, + "theoretical_loss": 3.916486743416727, + "tokens_seen": 498072576 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042882647943831494, + "loss": 2.9289, + "theoretical_loss": 3.9164728579985333, + "tokens_seen": 498088960 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004288164493480441, + "loss": 2.7877, + "theoretical_loss": 3.91641732217151, + "tokens_seen": 498154496 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004288064192577733, + "loss": 2.9093, + "theoretical_loss": 3.9163617956955825, + "tokens_seen": 498220032 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042879638916750254, + "loss": 2.9958, + "theoretical_loss": 3.916306278567947, + "tokens_seen": 498285568 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042878635907723166, + "loss": 2.9177, + "theoretical_loss": 3.9162507707858003, + "tokens_seen": 498351104 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004287763289869609, + "loss": 2.8919, + "theoretical_loss": 3.916195272346341, + "tokens_seen": 498416640 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004287662988966901, + "loss": 2.8702, + "theoretical_loss": 3.916139783246768, + "tokens_seen": 498482176 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042875626880641926, + "loss": 2.7663, + "theoretical_loss": 3.916084303484282, + "tokens_seen": 498547712 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042874623871614844, + "loss": 2.8986, + "theoretical_loss": 3.9160288330560844, + "tokens_seen": 498613248 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004287362086258776, + "loss": 2.9018, + "theoretical_loss": 3.9159733719593786, + "tokens_seen": 498678784 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004287261785356068, + "loss": 2.907, + "theoretical_loss": 3.9159179201913688, + "tokens_seen": 498744320 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042871614844533604, + "loss": 2.955, + "theoretical_loss": 3.9158624777492603, + "tokens_seen": 498809856 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042870611835506517, + "loss": 3.0871, + "theoretical_loss": 3.9158070446302595, + "tokens_seen": 498875392 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004286960882647944, + "loss": 2.8137, + "theoretical_loss": 3.915751620831575, + "tokens_seen": 498940928 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042868605817452353, + "loss": 2.955, + "theoretical_loss": 3.9156962063504146, + "tokens_seen": 499006464 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042867602808425276, + "loss": 2.7694, + "theoretical_loss": 3.915640801183989, + "tokens_seen": 499072000 + }, + { + "epoch": 1.06, + "learning_rate": 0.000428665997993982, + "loss": 3.0629, + "theoretical_loss": 3.9155854053295105, + "tokens_seen": 499137536 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004286559679037111, + "loss": 2.9166, + "theoretical_loss": 3.91553001878419, + "tokens_seen": 499203072 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042864593781344036, + "loss": 2.9496, + "theoretical_loss": 3.915474641545243, + "tokens_seen": 499268608 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004286359077231695, + "loss": 2.9411, + "theoretical_loss": 3.915419273609883, + "tokens_seen": 499334144 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004286258776328987, + "loss": 2.7409, + "theoretical_loss": 3.9153639149753277, + "tokens_seen": 499399680 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004286158475426279, + "loss": 2.7517, + "theoretical_loss": 3.915308565638793, + "tokens_seen": 499465216 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004286058174523571, + "loss": 2.8606, + "theoretical_loss": 3.9152532255974983, + "tokens_seen": 499530752 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042859578736208627, + "loss": 3.058, + "theoretical_loss": 3.9151978948486637, + "tokens_seen": 499596288 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004285857572718155, + "loss": 3.086, + "theoretical_loss": 3.9151425733895096, + "tokens_seen": 499661824 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 595290, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1176106929779053, + "objective/train/theoretical_loss": 3.915101088389826, + "objective/train/tokens_used": 520170976, + "theoretical_loss": 3.915101088389826, + "tokens_seen": 499710976 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042857572718154463, + "loss": 3.1203, + "theoretical_loss": 3.9150872612172583, + "tokens_seen": 499727360 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042856569709127387, + "loss": 2.8166, + "theoretical_loss": 3.9150319583291333, + "tokens_seen": 499792896 + }, + { + "epoch": 1.06, + "learning_rate": 0.000428555667001003, + "loss": 3.111, + "theoretical_loss": 3.914976664722359, + "tokens_seen": 499858432 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042854563691073223, + "loss": 3.1595, + "theoretical_loss": 3.914921380394162, + "tokens_seen": 499923968 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004285356068204614, + "loss": 2.9206, + "theoretical_loss": 3.914866105341768, + "tokens_seen": 499989504 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004285255767301906, + "loss": 2.7353, + "theoretical_loss": 3.914810839562406, + "tokens_seen": 500055040 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042851554663991977, + "loss": 3.1139, + "theoretical_loss": 3.9147555830533047, + "tokens_seen": 500120576 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042850551654964895, + "loss": 2.973, + "theoretical_loss": 3.9147003358116956, + "tokens_seen": 500186112 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042849548645937813, + "loss": 2.8314, + "theoretical_loss": 3.9146450978348093, + "tokens_seen": 500251648 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042848545636910737, + "loss": 2.9281, + "theoretical_loss": 3.9145898691198795, + "tokens_seen": 500317184 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004284754262788365, + "loss": 2.805, + "theoretical_loss": 3.91453464966414, + "tokens_seen": 500382720 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042846539618856573, + "loss": 2.773, + "theoretical_loss": 3.9144794394648263, + "tokens_seen": 500448256 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004284553660982949, + "loss": 2.9605, + "theoretical_loss": 3.9144242385191745, + "tokens_seen": 500513792 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004284453360080241, + "loss": 2.9448, + "theoretical_loss": 3.9143690468244223, + "tokens_seen": 500579328 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004284353059177533, + "loss": 3.1425, + "theoretical_loss": 3.9143138643778093, + "tokens_seen": 500644864 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042842527582748246, + "loss": 2.8884, + "theoretical_loss": 3.914258691176575, + "tokens_seen": 500710400 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042841524573721164, + "loss": 3.1058, + "theoretical_loss": 3.9142035272179605, + "tokens_seen": 500775936 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042840521564694087, + "loss": 2.7515, + "theoretical_loss": 3.9141483724992083, + "tokens_seen": 500841472 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042839518555667, + "loss": 3.033, + "theoretical_loss": 3.914093227017562, + "tokens_seen": 500907008 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042838515546639923, + "loss": 2.963, + "theoretical_loss": 3.9140380907702665, + "tokens_seen": 500972544 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042837512537612836, + "loss": 3.0991, + "theoretical_loss": 3.9139829637545676, + "tokens_seen": 501038080 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004283650952858576, + "loss": 2.987, + "theoretical_loss": 3.913927845967712, + "tokens_seen": 501103616 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004283550651955868, + "loss": 3.0178, + "theoretical_loss": 3.9138727374069493, + "tokens_seen": 501169152 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042834503510531596, + "loss": 3.0842, + "theoretical_loss": 3.9138176380695278, + "tokens_seen": 501234688 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042833500501504514, + "loss": 3.0508, + "theoretical_loss": 3.9137625479526985, + "tokens_seen": 501300224 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 595812, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.282951831817627, + "objective/train/theoretical_loss": 3.913721236414437, + "objective/train/tokens_used": 521809376, + "theoretical_loss": 3.913721236414437, + "tokens_seen": 501349376 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004283249749247743, + "loss": 3.1562, + "theoretical_loss": 3.9137074670537135, + "tokens_seen": 501365760 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004283149448345035, + "loss": 3.1112, + "theoretical_loss": 3.9136523953698257, + "tokens_seen": 501431296 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042830491474423274, + "loss": 2.9509, + "theoretical_loss": 3.913597332898289, + "tokens_seen": 501496832 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042829488465396186, + "loss": 2.7248, + "theoretical_loss": 3.9135422796363595, + "tokens_seen": 501562368 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004282848545636911, + "loss": 3.0696, + "theoretical_loss": 3.913487235581293, + "tokens_seen": 501627904 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004282748244734203, + "loss": 2.7844, + "theoretical_loss": 3.913432200730348, + "tokens_seen": 501693440 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042826479438314946, + "loss": 2.8522, + "theoretical_loss": 3.913377175080783, + "tokens_seen": 501758976 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042825476429287864, + "loss": 2.7476, + "theoretical_loss": 3.9133221586298577, + "tokens_seen": 501824512 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004282447342026078, + "loss": 2.8357, + "theoretical_loss": 3.913267151374834, + "tokens_seen": 501890048 + }, + { + "epoch": 1.06, + "learning_rate": 0.000428234704112337, + "loss": 2.9961, + "theoretical_loss": 3.913212153312974, + "tokens_seen": 501955584 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042822467402206624, + "loss": 3.0573, + "theoretical_loss": 3.9131571644415413, + "tokens_seen": 502021120 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042821464393179537, + "loss": 2.9952, + "theoretical_loss": 3.913102184757801, + "tokens_seen": 502086656 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004282046138415246, + "loss": 2.7377, + "theoretical_loss": 3.9130472142590187, + "tokens_seen": 502152192 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042819458375125373, + "loss": 2.9165, + "theoretical_loss": 3.9129922529424617, + "tokens_seen": 502217728 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042818455366098296, + "loss": 2.8041, + "theoretical_loss": 3.9129373008053983, + "tokens_seen": 502283264 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042817452357071215, + "loss": 2.9187, + "theoretical_loss": 3.9128823578450977, + "tokens_seen": 502348800 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004281644934804413, + "loss": 2.8599, + "theoretical_loss": 3.9128274240588308, + "tokens_seen": 502414336 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004281544633901705, + "loss": 3.1846, + "theoretical_loss": 3.9127724994438693, + "tokens_seen": 502479872 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004281444332998997, + "loss": 2.9606, + "theoretical_loss": 3.9127175839974866, + "tokens_seen": 502545408 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042813440320962887, + "loss": 2.8654, + "theoretical_loss": 3.912662677716956, + "tokens_seen": 502610944 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004281243731193581, + "loss": 3.0389, + "theoretical_loss": 3.9126077805995534, + "tokens_seen": 502676480 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042811434302908723, + "loss": 3.0599, + "theoretical_loss": 3.912552892642555, + "tokens_seen": 502742016 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042810431293881647, + "loss": 2.8393, + "theoretical_loss": 3.912498013843238, + "tokens_seen": 502807552 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042809428284854565, + "loss": 2.9673, + "theoretical_loss": 3.9124431441988823, + "tokens_seen": 502873088 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042808425275827483, + "loss": 2.9235, + "theoretical_loss": 3.9123882837067674, + "tokens_seen": 502938624 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 597072, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4846396446228027, + "objective/train/theoretical_loss": 3.9123471443422035, + "objective/train/tokens_used": 523447776, + "theoretical_loss": 3.9123471443422035, + "tokens_seen": 502987776 + }, + { + "epoch": 1.06, + "learning_rate": 0.000428074222668004, + "loss": 2.9092, + "theoretical_loss": 3.912333432364174, + "tokens_seen": 503004160 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004280641925777332, + "loss": 2.7859, + "theoretical_loss": 3.912278590168385, + "tokens_seen": 503069696 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004280541624874624, + "loss": 2.85, + "theoretical_loss": 3.9122237571166827, + "tokens_seen": 503135232 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004280441323971916, + "loss": 2.9421, + "theoretical_loss": 3.912168933206353, + "tokens_seen": 503200768 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042803410230692074, + "loss": 3.1732, + "theoretical_loss": 3.9121141184346806, + "tokens_seen": 503266304 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042802407221664997, + "loss": 3.1852, + "theoretical_loss": 3.912059312798953, + "tokens_seen": 503331840 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004280140421263791, + "loss": 3.0069, + "theoretical_loss": 3.9120045162964594, + "tokens_seen": 503397376 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042800401203610833, + "loss": 3.0059, + "theoretical_loss": 3.911949728924487, + "tokens_seen": 503462912 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004279939819458375, + "loss": 3.1234, + "theoretical_loss": 3.9118949506803267, + "tokens_seen": 503528448 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004279839518555667, + "loss": 2.9553, + "theoretical_loss": 3.9118401815612707, + "tokens_seen": 503593984 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004279739217652959, + "loss": 2.8518, + "theoretical_loss": 3.9117854215646117, + "tokens_seen": 503659520 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004279638916750251, + "loss": 3.1847, + "theoretical_loss": 3.9117306706876427, + "tokens_seen": 503725056 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042795386158475424, + "loss": 2.9503, + "theoretical_loss": 3.9116759289276595, + "tokens_seen": 503790592 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004279438314944835, + "loss": 2.8428, + "theoretical_loss": 3.911621196281958, + "tokens_seen": 503856128 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004279338014042126, + "loss": 2.9084, + "theoretical_loss": 3.9115664727478356, + "tokens_seen": 503921664 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042792377131394184, + "loss": 3.0378, + "theoretical_loss": 3.9115117583225905, + "tokens_seen": 503987200 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042791374122367107, + "loss": 3.0881, + "theoretical_loss": 3.911457053003523, + "tokens_seen": 504052736 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004279037111334002, + "loss": 2.8481, + "theoretical_loss": 3.9114023567879332, + "tokens_seen": 504118272 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042789368104312943, + "loss": 2.9654, + "theoretical_loss": 3.9113476696731233, + "tokens_seen": 504183808 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042788365095285856, + "loss": 3.1006, + "theoretical_loss": 3.911292991656396, + "tokens_seen": 504249344 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004278736208625878, + "loss": 3.0438, + "theoretical_loss": 3.9112383227350564, + "tokens_seen": 504314880 + }, + { + "epoch": 1.06, + "learning_rate": 0.000427863590772317, + "loss": 3.0496, + "theoretical_loss": 3.911183662906409, + "tokens_seen": 504380416 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042785356068204616, + "loss": 2.8254, + "theoretical_loss": 3.911129012167761, + "tokens_seen": 504445952 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042784353059177534, + "loss": 2.6538, + "theoretical_loss": 3.911074370516419, + "tokens_seen": 504511488 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004278335005015045, + "loss": 2.9649, + "theoretical_loss": 3.911019737949693, + "tokens_seen": 504577024 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 597784, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1036291122436523, + "objective/train/theoretical_loss": 3.9109787694848093, + "objective/train/tokens_used": 525086176, + "theoretical_loss": 3.9109787694848093, + "tokens_seen": 504626176 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004278234704112337, + "loss": 3.0501, + "theoretical_loss": 3.9109651144648927, + "tokens_seen": 504642560 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042781344032096294, + "loss": 3.2442, + "theoretical_loss": 3.910910500059329, + "tokens_seen": 504708096 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042780341023069206, + "loss": 3.1424, + "theoretical_loss": 3.910855894730314, + "tokens_seen": 504773632 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004277933801404213, + "loss": 2.6353, + "theoretical_loss": 3.9108012984751612, + "tokens_seen": 504839168 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004277833500501505, + "loss": 3.0854, + "theoretical_loss": 3.9107467112911856, + "tokens_seen": 504904704 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042777331995987966, + "loss": 3.162, + "theoretical_loss": 3.9106921331757016, + "tokens_seen": 504970240 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042776328986960884, + "loss": 3.0313, + "theoretical_loss": 3.910637564126028, + "tokens_seen": 505035776 + }, + { + "epoch": 1.06, + "learning_rate": 0.000427753259779338, + "loss": 2.8933, + "theoretical_loss": 3.9105830041394816, + "tokens_seen": 505101312 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004277432296890672, + "loss": 2.9269, + "theoretical_loss": 3.910528453213381, + "tokens_seen": 505166848 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042773319959879644, + "loss": 2.9692, + "theoretical_loss": 3.910473911345048, + "tokens_seen": 505232384 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042772316950852557, + "loss": 2.987, + "theoretical_loss": 3.9104193785318024, + "tokens_seen": 505297920 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004277131394182548, + "loss": 2.7983, + "theoretical_loss": 3.9103648547709673, + "tokens_seen": 505363456 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042770310932798393, + "loss": 2.9446, + "theoretical_loss": 3.910310340059867, + "tokens_seen": 505428992 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042769307923771316, + "loss": 2.9765, + "theoretical_loss": 3.9102558343958256, + "tokens_seen": 505494528 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042768304914744235, + "loss": 2.9479, + "theoretical_loss": 3.9102013377761695, + "tokens_seen": 505560064 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042767301905717153, + "loss": 2.8593, + "theoretical_loss": 3.910146850198225, + "tokens_seen": 505625600 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004276629889669007, + "loss": 3.1793, + "theoretical_loss": 3.910092371659321, + "tokens_seen": 505691136 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004276529588766299, + "loss": 3.1078, + "theoretical_loss": 3.9100379021567875, + "tokens_seen": 505756672 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042764292878635907, + "loss": 2.8893, + "theoretical_loss": 3.9099834416879533, + "tokens_seen": 505822208 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004276328986960883, + "loss": 3.161, + "theoretical_loss": 3.909928990250151, + "tokens_seen": 505887744 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042762286860581743, + "loss": 2.8981, + "theoretical_loss": 3.9098745478407135, + "tokens_seen": 505953280 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042761283851554667, + "loss": 3.054, + "theoretical_loss": 3.909820114456975, + "tokens_seen": 506018816 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042760280842527585, + "loss": 2.8977, + "theoretical_loss": 3.909765690096269, + "tokens_seen": 506084352 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042759277833500503, + "loss": 2.7607, + "theoretical_loss": 3.9097112747559337, + "tokens_seen": 506149888 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004275827482447342, + "loss": 2.8732, + "theoretical_loss": 3.909656868433305, + "tokens_seen": 506215424 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 599198, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1902496814727783, + "objective/train/theoretical_loss": 3.9096160696076026, + "objective/train/tokens_used": 526724576, + "theoretical_loss": 3.9096160696076026, + "tokens_seen": 506264576 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004275727181544634, + "loss": 2.8656, + "theoretical_loss": 3.9096024711257216, + "tokens_seen": 506280960 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004275626880641926, + "loss": 2.7504, + "theoretical_loss": 3.909548082830523, + "tokens_seen": 506346496 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004275526579739218, + "loss": 3.0306, + "theoretical_loss": 3.90949370354505, + "tokens_seen": 506412032 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042754262788365094, + "loss": 3.0496, + "theoretical_loss": 3.909439333266645, + "tokens_seen": 506477568 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042753259779338017, + "loss": 2.7544, + "theoretical_loss": 3.90938497199265, + "tokens_seen": 506543104 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004275225677031093, + "loss": 2.8873, + "theoretical_loss": 3.909330619720409, + "tokens_seen": 506608640 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042751253761283853, + "loss": 3.0007, + "theoretical_loss": 3.909276276447268, + "tokens_seen": 506674176 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004275025075225677, + "loss": 2.8218, + "theoretical_loss": 3.909221942170573, + "tokens_seen": 506739712 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004274924774322969, + "loss": 2.8335, + "theoretical_loss": 3.9091676168876717, + "tokens_seen": 506805248 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004274824473420261, + "loss": 2.826, + "theoretical_loss": 3.9091133005959118, + "tokens_seen": 506870784 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004274724172517553, + "loss": 2.9372, + "theoretical_loss": 3.9090589932926436, + "tokens_seen": 506936320 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042746238716148444, + "loss": 2.9096, + "theoretical_loss": 3.909004694975218, + "tokens_seen": 507001856 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004274523570712137, + "loss": 2.9741, + "theoretical_loss": 3.908950405640987, + "tokens_seen": 507067392 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004274423269809428, + "loss": 2.9571, + "theoretical_loss": 3.908896125287303, + "tokens_seen": 507132928 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042743229689067204, + "loss": 2.7355, + "theoretical_loss": 3.908841853911521, + "tokens_seen": 507198464 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004274222668004012, + "loss": 3.0665, + "theoretical_loss": 3.9087875915109955, + "tokens_seen": 507264000 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004274122367101304, + "loss": 2.9467, + "theoretical_loss": 3.908733338083084, + "tokens_seen": 507329536 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004274022066198596, + "loss": 2.7414, + "theoretical_loss": 3.9086790936251425, + "tokens_seen": 507395072 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042739217652958876, + "loss": 2.9773, + "theoretical_loss": 3.908624858134531, + "tokens_seen": 507460608 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042738214643931794, + "loss": 2.893, + "theoretical_loss": 3.908570631608609, + "tokens_seen": 507526144 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004273721163490472, + "loss": 2.8147, + "theoretical_loss": 3.9085164140447377, + "tokens_seen": 507591680 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004273620862587763, + "loss": 2.8449, + "theoretical_loss": 3.908462205440278, + "tokens_seen": 507657216 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042735205616850554, + "loss": 2.9502, + "theoretical_loss": 3.9084080057925936, + "tokens_seen": 507722752 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042734202607823467, + "loss": 2.9411, + "theoretical_loss": 3.908353815099049, + "tokens_seen": 507788288 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004273319959879639, + "loss": 2.8535, + "theoretical_loss": 3.90829963335701, + "tokens_seen": 507853824 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 599925, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.022632360458374, + "objective/train/theoretical_loss": 3.9082590029233213, + "objective/train/tokens_used": 528362976, + "theoretical_loss": 3.9082590029233213, + "tokens_seen": 507902976 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004273219658976931, + "loss": 3.1551, + "theoretical_loss": 3.908245460563842, + "tokens_seen": 507919360 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042731193580742226, + "loss": 2.7927, + "theoretical_loss": 3.908191296716913, + "tokens_seen": 507984896 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042730190571715145, + "loss": 2.7956, + "theoretical_loss": 3.908137141813592, + "tokens_seen": 508050432 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004272918756268807, + "loss": 2.7378, + "theoretical_loss": 3.9080829958512484, + "tokens_seen": 508115968 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004272818455366098, + "loss": 2.8576, + "theoretical_loss": 3.9080288588272536, + "tokens_seen": 508181504 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042727181544633904, + "loss": 2.6358, + "theoretical_loss": 3.90797473073898, + "tokens_seen": 508247040 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042726178535606817, + "loss": 2.9043, + "theoretical_loss": 3.9079206115837994, + "tokens_seen": 508312576 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004272517552657974, + "loss": 3.0203, + "theoretical_loss": 3.907866501359087, + "tokens_seen": 508378112 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004272417251755266, + "loss": 2.8236, + "theoretical_loss": 3.907812400062218, + "tokens_seen": 508443648 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042723169508525577, + "loss": 2.8062, + "theoretical_loss": 3.9077583076905684, + "tokens_seen": 508509184 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042722166499498495, + "loss": 2.9367, + "theoretical_loss": 3.907704224241517, + "tokens_seen": 508574720 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042721163490471413, + "loss": 2.758, + "theoretical_loss": 3.907650149712442, + "tokens_seen": 508640256 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004272016048144433, + "loss": 2.8615, + "theoretical_loss": 3.9075960841007227, + "tokens_seen": 508705792 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042719157472417255, + "loss": 2.8553, + "theoretical_loss": 3.9075420274037405, + "tokens_seen": 508771328 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004271815446339017, + "loss": 2.7945, + "theoretical_loss": 3.9074879796188773, + "tokens_seen": 508836864 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004271715145436309, + "loss": 2.718, + "theoretical_loss": 3.907433940743516, + "tokens_seen": 508902400 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004271614844533601, + "loss": 3.0734, + "theoretical_loss": 3.9073799107750413, + "tokens_seen": 508967936 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042715145436308927, + "loss": 2.8651, + "theoretical_loss": 3.9073258897108385, + "tokens_seen": 509033472 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004271414242728185, + "loss": 2.684, + "theoretical_loss": 3.9072718775482933, + "tokens_seen": 509099008 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042713139418254763, + "loss": 2.9408, + "theoretical_loss": 3.907217874284794, + "tokens_seen": 509164544 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042712136409227687, + "loss": 2.7014, + "theoretical_loss": 3.9071638799177295, + "tokens_seen": 509230080 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042711133400200605, + "loss": 3.0804, + "theoretical_loss": 3.907109894444489, + "tokens_seen": 509295616 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042710130391173523, + "loss": 2.8745, + "theoretical_loss": 3.907055917862463, + "tokens_seen": 509361152 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004270912738214644, + "loss": 3.088, + "theoretical_loss": 3.9070019501690445, + "tokens_seen": 509426688 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004270812437311936, + "loss": 2.8545, + "theoretical_loss": 3.9069479913616254, + "tokens_seen": 509492224 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 601205, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.895568370819092, + "objective/train/theoretical_loss": 3.906907528085932, + "objective/train/tokens_used": 530001376, + "theoretical_loss": 3.906907528085932, + "tokens_seen": 509541376 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004270712136409228, + "loss": 2.8726, + "theoretical_loss": 3.906894041437601, + "tokens_seen": 509557760 + }, + { + "epoch": 1.06, + "learning_rate": 0.000427061183550652, + "loss": 2.6773, + "theoretical_loss": 3.906840100394366, + "tokens_seen": 509623296 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042705115346038114, + "loss": 2.8422, + "theoretical_loss": 3.9067861682293166, + "tokens_seen": 509688832 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042704112337011037, + "loss": 2.8928, + "theoretical_loss": 3.9067322449398505, + "tokens_seen": 509754368 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004270310932798395, + "loss": 3.308, + "theoretical_loss": 3.9066783305233663, + "tokens_seen": 509819904 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042702106318956873, + "loss": 3.0129, + "theoretical_loss": 3.906624424977263, + "tokens_seen": 509885440 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004270110330992979, + "loss": 3.0955, + "theoretical_loss": 3.9065705282989427, + "tokens_seen": 509950976 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004270010030090271, + "loss": 3.2208, + "theoretical_loss": 3.9065166404858056, + "tokens_seen": 510016512 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004269909729187563, + "loss": 2.9995, + "theoretical_loss": 3.906462761535256, + "tokens_seen": 510082048 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004269809428284855, + "loss": 2.6179, + "theoretical_loss": 3.9064088914446975, + "tokens_seen": 510147584 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042697091273821464, + "loss": 2.9016, + "theoretical_loss": 3.9063550302115346, + "tokens_seen": 510213120 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004269608826479439, + "loss": 2.7884, + "theoretical_loss": 3.9063011778331744, + "tokens_seen": 510278656 + }, + { + "epoch": 1.06, + "learning_rate": 0.000426950852557673, + "loss": 3.0448, + "theoretical_loss": 3.9062473343070234, + "tokens_seen": 510344192 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042694082246740224, + "loss": 3.076, + "theoretical_loss": 3.9061934996304912, + "tokens_seen": 510409728 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004269307923771314, + "loss": 2.9906, + "theoretical_loss": 3.906139673800986, + "tokens_seen": 510475264 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004269207622868606, + "loss": 3.0174, + "theoretical_loss": 3.9060858568159196, + "tokens_seen": 510540800 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004269107321965898, + "loss": 2.8787, + "theoretical_loss": 3.906032048672702, + "tokens_seen": 510606336 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042690070210631896, + "loss": 2.8605, + "theoretical_loss": 3.9059782493687476, + "tokens_seen": 510671872 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042689067201604814, + "loss": 3.0283, + "theoretical_loss": 3.9059244589014694, + "tokens_seen": 510737408 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004268806419257774, + "loss": 2.945, + "theoretical_loss": 3.9058706772682825, + "tokens_seen": 510802944 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004268706118355065, + "loss": 3.2505, + "theoretical_loss": 3.905816904466603, + "tokens_seen": 510868480 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042686058174523574, + "loss": 3.0026, + "theoretical_loss": 3.905763140493848, + "tokens_seen": 510934016 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042685055165496487, + "loss": 2.9411, + "theoretical_loss": 3.905709385347436, + "tokens_seen": 510999552 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004268405215646941, + "loss": 3.2739, + "theoretical_loss": 3.9056556390247863, + "tokens_seen": 511065088 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004268304914744233, + "loss": 3.2656, + "theoretical_loss": 3.905601901523318, + "tokens_seen": 511130624 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 601687, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9587132930755615, + "objective/train/theoretical_loss": 3.905561604184567, + "objective/train/tokens_used": 531639776, + "theoretical_loss": 3.905561604184567, + "tokens_seen": 511179776 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042682046138415246, + "loss": 3.1384, + "theoretical_loss": 3.9055481728404544, + "tokens_seen": 511196160 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042681043129388165, + "loss": 3.1338, + "theoretical_loss": 3.905494452973617, + "tokens_seen": 511261696 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004268004012036109, + "loss": 2.854, + "theoretical_loss": 3.905440741920229, + "tokens_seen": 511327232 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042679037111334, + "loss": 2.9444, + "theoretical_loss": 3.9053870396777164, + "tokens_seen": 511392768 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042678034102306924, + "loss": 2.9435, + "theoretical_loss": 3.905333346243504, + "tokens_seen": 511458304 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042677031093279837, + "loss": 3.0561, + "theoretical_loss": 3.905279661615019, + "tokens_seen": 511523840 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004267602808425276, + "loss": 2.9755, + "theoretical_loss": 3.9052259857896887, + "tokens_seen": 511589376 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004267502507522568, + "loss": 2.9857, + "theoretical_loss": 3.9051723187649436, + "tokens_seen": 511654912 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042674022066198597, + "loss": 3.0649, + "theoretical_loss": 3.905118660538213, + "tokens_seen": 511720448 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042673019057171515, + "loss": 2.9349, + "theoretical_loss": 3.905065011106928, + "tokens_seen": 511785984 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042672016048144433, + "loss": 3.0933, + "theoretical_loss": 3.9050113704685208, + "tokens_seen": 511851520 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004267101303911735, + "loss": 3.0626, + "theoretical_loss": 3.904957738620425, + "tokens_seen": 511917056 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042670010030090275, + "loss": 3.0902, + "theoretical_loss": 3.9049041155600746, + "tokens_seen": 511982592 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004266900702106319, + "loss": 2.8052, + "theoretical_loss": 3.9048505012849053, + "tokens_seen": 512048128 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004266800401203611, + "loss": 2.9866, + "theoretical_loss": 3.9047968957923542, + "tokens_seen": 512113664 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042667001003009024, + "loss": 3.2107, + "theoretical_loss": 3.9047432990798585, + "tokens_seen": 512179200 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042665997993981947, + "loss": 2.9144, + "theoretical_loss": 3.9046897111448575, + "tokens_seen": 512244736 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042664994984954865, + "loss": 2.8282, + "theoretical_loss": 3.90463613198479, + "tokens_seen": 512310272 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042663991975927783, + "loss": 3.0063, + "theoretical_loss": 3.9045825615970973, + "tokens_seen": 512375808 + }, + { + "epoch": 1.07, + "learning_rate": 0.000426629889669007, + "loss": 2.9466, + "theoretical_loss": 3.904528999979222, + "tokens_seen": 512441344 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042661985957873625, + "loss": 2.9841, + "theoretical_loss": 3.9044754471286063, + "tokens_seen": 512506880 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004266098294884654, + "loss": 2.7521, + "theoretical_loss": 3.9044219030426945, + "tokens_seen": 512572416 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004265997993981946, + "loss": 2.8133, + "theoretical_loss": 3.904368367718932, + "tokens_seen": 512637952 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042658976930792374, + "loss": 2.9629, + "theoretical_loss": 3.904314841154765, + "tokens_seen": 512703488 + }, + { + "epoch": 1.07, + "learning_rate": 0.000426579739217653, + "loss": 3.0293, + "theoretical_loss": 3.9042613233476406, + "tokens_seen": 512769024 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 602985, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.070340633392334, + "objective/train/theoretical_loss": 3.904221190737572, + "objective/train/tokens_used": 533278176, + "theoretical_loss": 3.904221190737572, + "tokens_seen": 512818176 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042656970912738216, + "loss": 2.8697, + "theoretical_loss": 3.9042078142950074, + "tokens_seen": 512834560 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042655967903711134, + "loss": 2.8872, + "theoretical_loss": 3.9041543139943147, + "tokens_seen": 512900096 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004265496489468405, + "loss": 3.0673, + "theoretical_loss": 3.9041008224430134, + "tokens_seen": 512965632 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004265396188565697, + "loss": 2.8721, + "theoretical_loss": 3.9040473396385544, + "tokens_seen": 513031168 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004265295887662989, + "loss": 2.8403, + "theoretical_loss": 3.903993865578391, + "tokens_seen": 513096704 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004265195586760281, + "loss": 2.8301, + "theoretical_loss": 3.9039404002599767, + "tokens_seen": 513162240 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042650952858575724, + "loss": 2.9424, + "theoretical_loss": 3.903886943680766, + "tokens_seen": 513227776 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004264994984954865, + "loss": 3.0853, + "theoretical_loss": 3.903833495838215, + "tokens_seen": 513293312 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004264894684052156, + "loss": 3.214, + "theoretical_loss": 3.903780056729781, + "tokens_seen": 513358848 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042647943831494484, + "loss": 3.1121, + "theoretical_loss": 3.9037266263529213, + "tokens_seen": 513424384 + }, + { + "epoch": 1.07, + "learning_rate": 0.000426469408224674, + "loss": 3.105, + "theoretical_loss": 3.903673204705096, + "tokens_seen": 513489920 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004264593781344032, + "loss": 2.6414, + "theoretical_loss": 3.903619791783764, + "tokens_seen": 513555456 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004264493480441324, + "loss": 3.0392, + "theoretical_loss": 3.903566387586387, + "tokens_seen": 513620992 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004264393179538616, + "loss": 2.8988, + "theoretical_loss": 3.9035129921104277, + "tokens_seen": 513686528 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042642928786359075, + "loss": 2.9126, + "theoretical_loss": 3.9034596053533486, + "tokens_seen": 513752064 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042641925777332, + "loss": 3.1472, + "theoretical_loss": 3.9034062273126144, + "tokens_seen": 513817600 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042640922768304916, + "loss": 2.9302, + "theoretical_loss": 3.9033528579856904, + "tokens_seen": 513883136 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042639919759277834, + "loss": 3.1095, + "theoretical_loss": 3.9032994973700434, + "tokens_seen": 513948672 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004263891675025076, + "loss": 3.0708, + "theoretical_loss": 3.9032461454631404, + "tokens_seen": 514014208 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004263791374122367, + "loss": 3.0222, + "theoretical_loss": 3.9031928022624514, + "tokens_seen": 514079744 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042636910732196594, + "loss": 3.1103, + "theoretical_loss": 3.9031394677654445, + "tokens_seen": 514145280 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042635907723169507, + "loss": 2.7347, + "theoretical_loss": 3.903086141969591, + "tokens_seen": 514210816 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004263490471414243, + "loss": 2.9696, + "theoretical_loss": 3.903032824872362, + "tokens_seen": 514276352 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004263390170511535, + "loss": 3.09, + "theoretical_loss": 3.902979516471232, + "tokens_seen": 514341888 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042632898696088266, + "loss": 2.9927, + "theoretical_loss": 3.9029262167636736, + "tokens_seen": 514407424 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 603651, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3740158081054688, + "objective/train/theoretical_loss": 3.902886247686642, + "objective/train/tokens_used": 534916576, + "theoretical_loss": 3.902886247686642, + "tokens_seen": 514456576 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042631895687061185, + "loss": 2.8779, + "theoretical_loss": 3.9028729257471615, + "tokens_seen": 514472960 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004263089267803411, + "loss": 3.0081, + "theoretical_loss": 3.902819643419173, + "tokens_seen": 514538496 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004262988966900702, + "loss": 3.0532, + "theoretical_loss": 3.9027663697771837, + "tokens_seen": 514604032 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042628886659979944, + "loss": 3.1043, + "theoretical_loss": 3.9027131048186727, + "tokens_seen": 514669568 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042627883650952857, + "loss": 2.8976, + "theoretical_loss": 3.902659848541119, + "tokens_seen": 514735104 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004262688064192578, + "loss": 3.0731, + "theoretical_loss": 3.9026066009420024, + "tokens_seen": 514800640 + }, + { + "epoch": 1.07, + "learning_rate": 0.000426258776328987, + "loss": 2.867, + "theoretical_loss": 3.902553362018805, + "tokens_seen": 514866176 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042624874623871617, + "loss": 2.911, + "theoretical_loss": 3.902500131769008, + "tokens_seen": 514931712 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042623871614844535, + "loss": 3.2478, + "theoretical_loss": 3.9024469101900956, + "tokens_seen": 514997248 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042622868605817453, + "loss": 2.9697, + "theoretical_loss": 3.902393697279552, + "tokens_seen": 515062784 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004262186559679037, + "loss": 2.874, + "theoretical_loss": 3.9023404930348624, + "tokens_seen": 515128320 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042620862587763295, + "loss": 2.9714, + "theoretical_loss": 3.9022872974535137, + "tokens_seen": 515193856 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004261985957873621, + "loss": 3.0392, + "theoretical_loss": 3.9022341105329934, + "tokens_seen": 515259392 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004261885656970913, + "loss": 2.9288, + "theoretical_loss": 3.90218093227079, + "tokens_seen": 515324928 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042617853560682044, + "loss": 2.958, + "theoretical_loss": 3.902127762664393, + "tokens_seen": 515390464 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042616850551654967, + "loss": 2.8225, + "theoretical_loss": 3.902074601711294, + "tokens_seen": 515456000 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042615847542627885, + "loss": 2.7655, + "theoretical_loss": 3.9020214494089833, + "tokens_seen": 515521536 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042614844533600803, + "loss": 2.8536, + "theoretical_loss": 3.901968305754955, + "tokens_seen": 515587072 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004261384152457372, + "loss": 2.8864, + "theoretical_loss": 3.901915170746702, + "tokens_seen": 515652608 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042612838515546645, + "loss": 2.9468, + "theoretical_loss": 3.90186204438172, + "tokens_seen": 515718144 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004261183550651956, + "loss": 2.9706, + "theoretical_loss": 3.9018089266575045, + "tokens_seen": 515783680 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004261083249749248, + "loss": 2.9346, + "theoretical_loss": 3.9017558175715523, + "tokens_seen": 515849216 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042609829488465394, + "loss": 2.8477, + "theoretical_loss": 3.9017027171213616, + "tokens_seen": 515914752 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004260882647943832, + "loss": 2.8322, + "theoretical_loss": 3.9016496253044317, + "tokens_seen": 515980288 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042607823470411236, + "loss": 2.764, + "theoretical_loss": 3.901596542118263, + "tokens_seen": 516045824 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 604730, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.402284622192383, + "objective/train/theoretical_loss": 3.901556735391069, + "objective/train/tokens_used": 536554976, + "theoretical_loss": 3.901556735391069, + "tokens_seen": 516094976 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042606820461384154, + "loss": 3.1128, + "theoretical_loss": 3.901543467560355, + "tokens_seen": 516111360 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004260581745235707, + "loss": 2.903, + "theoretical_loss": 3.9014904016282124, + "tokens_seen": 516176896 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004260481444332999, + "loss": 3.158, + "theoretical_loss": 3.901437344319336, + "tokens_seen": 516242432 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004260381143430291, + "loss": 2.6293, + "theoretical_loss": 3.9013842956312317, + "tokens_seen": 516307968 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004260280842527583, + "loss": 2.7853, + "theoretical_loss": 3.901331255561405, + "tokens_seen": 516373504 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042601805416248744, + "loss": 2.7866, + "theoretical_loss": 3.9012782241073602, + "tokens_seen": 516439040 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004260080240722167, + "loss": 3.2774, + "theoretical_loss": 3.9012252012666067, + "tokens_seen": 516504576 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004259979939819458, + "loss": 2.8639, + "theoretical_loss": 3.901172187036652, + "tokens_seen": 516570112 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042598796389167504, + "loss": 2.9217, + "theoretical_loss": 3.901119181415006, + "tokens_seen": 516635648 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004259779338014042, + "loss": 2.8692, + "theoretical_loss": 3.901066184399179, + "tokens_seen": 516701184 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004259679037111334, + "loss": 2.8358, + "theoretical_loss": 3.9010131959866827, + "tokens_seen": 516766720 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004259578736208626, + "loss": 2.9191, + "theoretical_loss": 3.9009602161750294, + "tokens_seen": 516832256 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004259478435305918, + "loss": 2.8169, + "theoretical_loss": 3.9009072449617324, + "tokens_seen": 516897792 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042593781344032095, + "loss": 3.0129, + "theoretical_loss": 3.900854282344307, + "tokens_seen": 516963328 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004259277833500502, + "loss": 3.0345, + "theoretical_loss": 3.9008013283202683, + "tokens_seen": 517028864 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004259177532597793, + "loss": 2.9059, + "theoretical_loss": 3.9007483828871337, + "tokens_seen": 517094400 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042590772316950854, + "loss": 3.049, + "theoretical_loss": 3.9006954460424206, + "tokens_seen": 517159936 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004258976930792377, + "loss": 2.6156, + "theoretical_loss": 3.9006425177836475, + "tokens_seen": 517225472 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004258876629889669, + "loss": 2.9233, + "theoretical_loss": 3.9005895981083345, + "tokens_seen": 517291008 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004258776328986961, + "loss": 2.7387, + "theoretical_loss": 3.900536687014002, + "tokens_seen": 517356544 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042586760280842527, + "loss": 2.8999, + "theoretical_loss": 3.9004837844981726, + "tokens_seen": 517422080 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042585757271815445, + "loss": 2.5944, + "theoretical_loss": 3.9004308905583684, + "tokens_seen": 517487616 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004258475426278837, + "loss": 3.1158, + "theoretical_loss": 3.900378005192114, + "tokens_seen": 517553152 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004258375125376128, + "loss": 2.851, + "theoretical_loss": 3.900325128396934, + "tokens_seen": 517618688 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042582748244734205, + "loss": 2.6032, + "theoretical_loss": 3.900272260170354, + "tokens_seen": 517684224 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 605280, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3959805965423584, + "objective/train/theoretical_loss": 3.900232614622076, + "objective/train/tokens_used": 538193376, + "theoretical_loss": 3.900232614622076, + "tokens_seen": 517733376 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004258174523570712, + "loss": 3.1391, + "theoretical_loss": 3.9002194005099016, + "tokens_seen": 517749760 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004258074222668004, + "loss": 2.8197, + "theoretical_loss": 3.900166549413105, + "tokens_seen": 517815296 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004257973921765296, + "loss": 3.117, + "theoretical_loss": 3.900113706877492, + "tokens_seen": 517880832 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042578736208625877, + "loss": 2.9555, + "theoretical_loss": 3.9000608729005943, + "tokens_seen": 517946368 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042577733199598795, + "loss": 2.7419, + "theoretical_loss": 3.9000080474799415, + "tokens_seen": 518011904 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004257673019057172, + "loss": 2.6787, + "theoretical_loss": 3.8999552306130667, + "tokens_seen": 518077440 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004257572718154463, + "loss": 2.8847, + "theoretical_loss": 3.8999024222975036, + "tokens_seen": 518142976 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042574724172517555, + "loss": 2.898, + "theoretical_loss": 3.899849622530785, + "tokens_seen": 518208512 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004257372116349047, + "loss": 3.0798, + "theoretical_loss": 3.899796831310446, + "tokens_seen": 518274048 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004257271815446339, + "loss": 2.8054, + "theoretical_loss": 3.899744048634024, + "tokens_seen": 518339584 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004257171514543631, + "loss": 2.847, + "theoretical_loss": 3.899691274499056, + "tokens_seen": 518405120 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004257071213640923, + "loss": 2.9221, + "theoretical_loss": 3.8996385089030796, + "tokens_seen": 518470656 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042569709127382145, + "loss": 2.92, + "theoretical_loss": 3.899585751843635, + "tokens_seen": 518536192 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042568706118355064, + "loss": 2.9806, + "theoretical_loss": 3.899533003318261, + "tokens_seen": 518601728 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004256770310932798, + "loss": 2.9909, + "theoretical_loss": 3.8994802633245014, + "tokens_seen": 518667264 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042566700100300905, + "loss": 2.8064, + "theoretical_loss": 3.899427531859896, + "tokens_seen": 518732800 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042565697091273823, + "loss": 2.6601, + "theoretical_loss": 3.899374808921989, + "tokens_seen": 518798336 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004256469408224674, + "loss": 2.8185, + "theoretical_loss": 3.8993220945083253, + "tokens_seen": 518863872 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042563691073219665, + "loss": 2.9848, + "theoretical_loss": 3.8992693886164504, + "tokens_seen": 518929408 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004256268806419258, + "loss": 3.0078, + "theoretical_loss": 3.8992166912439092, + "tokens_seen": 518994944 + }, + { + "epoch": 1.07, + "learning_rate": 0.000425616850551655, + "loss": 2.8503, + "theoretical_loss": 3.8991640023882512, + "tokens_seen": 519060480 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042560682046138414, + "loss": 3.1139, + "theoretical_loss": 3.8991113220470233, + "tokens_seen": 519126016 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004255967903711134, + "loss": 2.8546, + "theoretical_loss": 3.8990586502177758, + "tokens_seen": 519191552 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042558676028084256, + "loss": 2.8532, + "theoretical_loss": 3.8990059868980587, + "tokens_seen": 519257088 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042557673019057174, + "loss": 2.891, + "theoretical_loss": 3.8989533320854237, + "tokens_seen": 519322624 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 606732, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3824806213378906, + "objective/train/theoretical_loss": 3.8989138465572477, + "objective/train/tokens_used": 539831776, + "theoretical_loss": 3.8989138465572477, + "tokens_seen": 519371776 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004255667001003009, + "loss": 2.845, + "theoretical_loss": 3.898900685777423, + "tokens_seen": 519388160 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004255566700100301, + "loss": 2.9526, + "theoretical_loss": 3.8988480479716108, + "tokens_seen": 519453696 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004255466399197593, + "loss": 3.05, + "theoretical_loss": 3.8987954186655402, + "tokens_seen": 519519232 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004255366098294885, + "loss": 2.9402, + "theoretical_loss": 3.8987427978567686, + "tokens_seen": 519584768 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042552657973921764, + "loss": 2.8061, + "theoretical_loss": 3.898690185542852, + "tokens_seen": 519650304 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004255165496489469, + "loss": 2.8778, + "theoretical_loss": 3.8986375817213466, + "tokens_seen": 519715840 + }, + { + "epoch": 1.07, + "learning_rate": 0.000425506519558676, + "loss": 2.7645, + "theoretical_loss": 3.8985849863898125, + "tokens_seen": 519781376 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042549648946840524, + "loss": 3.1105, + "theoretical_loss": 3.8985323995458083, + "tokens_seen": 519846912 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004254864593781344, + "loss": 2.9125, + "theoretical_loss": 3.8984798211868963, + "tokens_seen": 519912448 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004254764292878636, + "loss": 3.1081, + "theoretical_loss": 3.8984272513106357, + "tokens_seen": 519977984 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004254663991975928, + "loss": 3.0043, + "theoretical_loss": 3.89837468991459, + "tokens_seen": 520043520 + }, + { + "epoch": 1.07, + "learning_rate": 0.000425456369107322, + "loss": 2.9944, + "theoretical_loss": 3.898322136996324, + "tokens_seen": 520109056 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042544633901705115, + "loss": 2.826, + "theoretical_loss": 3.898269592553401, + "tokens_seen": 520174592 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004254363089267804, + "loss": 2.5198, + "theoretical_loss": 3.8982170565833867, + "tokens_seen": 520240128 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004254262788365095, + "loss": 2.9318, + "theoretical_loss": 3.8981645290838483, + "tokens_seen": 520305664 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042541624874623874, + "loss": 2.7986, + "theoretical_loss": 3.898112010052353, + "tokens_seen": 520371200 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004254062186559679, + "loss": 3.0719, + "theoretical_loss": 3.898059499486469, + "tokens_seen": 520436736 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004253961885656971, + "loss": 2.9799, + "theoretical_loss": 3.8980069973837677, + "tokens_seen": 520502272 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004253861584754263, + "loss": 2.9913, + "theoretical_loss": 3.8979545037418175, + "tokens_seen": 520567808 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042537612838515547, + "loss": 2.8971, + "theoretical_loss": 3.897902018558192, + "tokens_seen": 520633344 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042536609829488465, + "loss": 2.9909, + "theoretical_loss": 3.8978495418304626, + "tokens_seen": 520698880 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004253560682046139, + "loss": 3.0641, + "theoretical_loss": 3.8977970735562035, + "tokens_seen": 520764416 + }, + { + "epoch": 1.07, + "learning_rate": 0.000425346038114343, + "loss": 3.1792, + "theoretical_loss": 3.8977446137329887, + "tokens_seen": 520829952 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042533600802407225, + "loss": 3.1222, + "theoretical_loss": 3.8976921623583944, + "tokens_seen": 520895488 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004253259779338014, + "loss": 2.9062, + "theoretical_loss": 3.8976397194299977, + "tokens_seen": 520961024 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 607442, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.450615167617798, + "objective/train/theoretical_loss": 3.8976003927750593, + "objective/train/tokens_used": 541470176, + "theoretical_loss": 3.8976003927750593, + "tokens_seen": 521010176 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004253159478435306, + "loss": 2.8771, + "theoretical_loss": 3.8975872849453754, + "tokens_seen": 521026560 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004253059177532598, + "loss": 2.9847, + "theoretical_loss": 3.897534858902107, + "tokens_seen": 521092096 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042529588766298897, + "loss": 2.7599, + "theoretical_loss": 3.897482441297771, + "tokens_seen": 521157632 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042528585757271815, + "loss": 2.8196, + "theoretical_loss": 3.8974300321299484, + "tokens_seen": 521223168 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004252758274824474, + "loss": 2.7218, + "theoretical_loss": 3.897377631396222, + "tokens_seen": 521288704 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004252657973921765, + "loss": 2.7832, + "theoretical_loss": 3.8973252390941733, + "tokens_seen": 521354240 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042525576730190575, + "loss": 3.1878, + "theoretical_loss": 3.8972728552213862, + "tokens_seen": 521419776 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004252457372116349, + "loss": 2.9531, + "theoretical_loss": 3.897220479775446, + "tokens_seen": 521485312 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004252357071213641, + "loss": 2.8667, + "theoretical_loss": 3.8971681127539366, + "tokens_seen": 521550848 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004252256770310933, + "loss": 2.84, + "theoretical_loss": 3.8971157541544468, + "tokens_seen": 521616384 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004252156469408225, + "loss": 2.7896, + "theoretical_loss": 3.8970634039745624, + "tokens_seen": 521681920 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042520561685055165, + "loss": 3.1027, + "theoretical_loss": 3.897011062211873, + "tokens_seen": 521747456 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042519558676028084, + "loss": 2.7281, + "theoretical_loss": 3.896958728863968, + "tokens_seen": 521812992 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042518555667001, + "loss": 2.9021, + "theoretical_loss": 3.8969064039284387, + "tokens_seen": 521878528 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042517552657973925, + "loss": 2.8982, + "theoretical_loss": 3.8968540874028754, + "tokens_seen": 521944064 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004251654964894684, + "loss": 3.0018, + "theoretical_loss": 3.8968017792848713, + "tokens_seen": 522009600 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004251554663991976, + "loss": 3.1168, + "theoretical_loss": 3.8967494795720197, + "tokens_seen": 522075136 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004251454363089268, + "loss": 2.9336, + "theoretical_loss": 3.896697188261916, + "tokens_seen": 522140672 + }, + { + "epoch": 1.07, + "learning_rate": 0.000425135406218656, + "loss": 2.8327, + "theoretical_loss": 3.8966449053521552, + "tokens_seen": 522206208 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042512537612838516, + "loss": 2.992, + "theoretical_loss": 3.896592630840334, + "tokens_seen": 522271744 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042511534603811434, + "loss": 2.643, + "theoretical_loss": 3.896540364724049, + "tokens_seen": 522337280 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004251053159478435, + "loss": 2.9115, + "theoretical_loss": 3.8964881070009003, + "tokens_seen": 522402816 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042509528585757276, + "loss": 2.8193, + "theoretical_loss": 3.896435857668486, + "tokens_seen": 522468352 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004250852557673019, + "loss": 2.856, + "theoretical_loss": 3.8963836167244077, + "tokens_seen": 522533888 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004250752256770311, + "loss": 2.9309, + "theoretical_loss": 3.896331384166267, + "tokens_seen": 522599424 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 608749, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.052105665206909, + "objective/train/theoretical_loss": 3.896292215249489, + "objective/train/tokens_used": 543108576, + "theoretical_loss": 3.896292215249489, + "tokens_seen": 522648576 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042506519558676024, + "loss": 2.8582, + "theoretical_loss": 3.896279159991664, + "tokens_seen": 522664960 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004250551654964895, + "loss": 2.9196, + "theoretical_loss": 3.8962269441982063, + "tokens_seen": 522730496 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042504513540621866, + "loss": 2.9007, + "theoretical_loss": 3.8961747367834945, + "tokens_seen": 522796032 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042503510531594784, + "loss": 3.1226, + "theoretical_loss": 3.896122537745136, + "tokens_seen": 522861568 + }, + { + "epoch": 1.07, + "learning_rate": 0.000425025075225677, + "loss": 3.0625, + "theoretical_loss": 3.896070347080737, + "tokens_seen": 522927104 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004250150451354062, + "loss": 2.948, + "theoretical_loss": 3.896018164787905, + "tokens_seen": 522992640 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004250050150451354, + "loss": 2.7946, + "theoretical_loss": 3.895965990864247, + "tokens_seen": 523058176 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004249949849548646, + "loss": 2.8184, + "theoretical_loss": 3.895913825307374, + "tokens_seen": 523123712 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042498495486459375, + "loss": 2.6486, + "theoretical_loss": 3.895861668114896, + "tokens_seen": 523189248 + }, + { + "epoch": 1.07, + "learning_rate": 0.000424974924774323, + "loss": 2.7146, + "theoretical_loss": 3.895809519284424, + "tokens_seen": 523254784 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042496489468405216, + "loss": 3.0432, + "theoretical_loss": 3.8957573788135704, + "tokens_seen": 523320320 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042495486459378135, + "loss": 2.756, + "theoretical_loss": 3.895705246699949, + "tokens_seen": 523385856 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004249448345035105, + "loss": 2.9122, + "theoretical_loss": 3.895653122941173, + "tokens_seen": 523451392 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004249348044132397, + "loss": 3.0265, + "theoretical_loss": 3.895601007534858, + "tokens_seen": 523516928 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042492477432296894, + "loss": 3.0803, + "theoretical_loss": 3.8955489004786212, + "tokens_seen": 523582464 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004249147442326981, + "loss": 2.9926, + "theoretical_loss": 3.8954968017700793, + "tokens_seen": 523648000 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004249047141424273, + "loss": 2.8496, + "theoretical_loss": 3.89544471140685, + "tokens_seen": 523713536 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004248946840521565, + "loss": 2.8785, + "theoretical_loss": 3.8953926293865524, + "tokens_seen": 523779072 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042488465396188567, + "loss": 3.0031, + "theoretical_loss": 3.8953405557068077, + "tokens_seen": 523844608 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042487462387161485, + "loss": 2.9991, + "theoretical_loss": 3.895288490365236, + "tokens_seen": 523910144 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004248645937813441, + "loss": 2.8258, + "theoretical_loss": 3.8952364333594596, + "tokens_seen": 523975680 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004248545636910732, + "loss": 2.7998, + "theoretical_loss": 3.8951843846871013, + "tokens_seen": 524041216 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042484453360080245, + "loss": 2.7548, + "theoretical_loss": 3.895132344345787, + "tokens_seen": 524106752 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004248345035105316, + "loss": 3.0234, + "theoretical_loss": 3.895080312333139, + "tokens_seen": 524172288 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004248244734202608, + "loss": 2.9753, + "theoretical_loss": 3.8950282886467846, + "tokens_seen": 524237824 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 609143, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.270958662033081, + "objective/train/theoretical_loss": 3.894989276344722, + "objective/train/tokens_used": 544746976, + "theoretical_loss": 3.894989276344722, + "tokens_seen": 524286976 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042481444332999, + "loss": 3.1384, + "theoretical_loss": 3.8949762732843514, + "tokens_seen": 524303360 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042480441323971917, + "loss": 2.5421, + "theoretical_loss": 3.8949242662434664, + "tokens_seen": 524368896 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042479438314944835, + "loss": 3.0288, + "theoretical_loss": 3.894872267521759, + "tokens_seen": 524434432 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004247843530591776, + "loss": 3.0604, + "theoretical_loss": 3.8948202771168585, + "tokens_seen": 524499968 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004247743229689067, + "loss": 2.8957, + "theoretical_loss": 3.894768295026396, + "tokens_seen": 524565504 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042476429287863595, + "loss": 2.7608, + "theoretical_loss": 3.894716321248004, + "tokens_seen": 524631040 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004247542627883651, + "loss": 2.9537, + "theoretical_loss": 3.8946643557793146, + "tokens_seen": 524696576 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004247442326980943, + "loss": 2.9661, + "theoretical_loss": 3.8946123986179613, + "tokens_seen": 524762112 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004247342026078235, + "loss": 2.9881, + "theoretical_loss": 3.8945604497615802, + "tokens_seen": 524827648 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004247241725175527, + "loss": 2.8578, + "theoretical_loss": 3.8945085092078053, + "tokens_seen": 524893184 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042471414242728186, + "loss": 2.9858, + "theoretical_loss": 3.8944565769542745, + "tokens_seen": 524958720 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042470411233701104, + "loss": 2.8689, + "theoretical_loss": 3.8944046529986247, + "tokens_seen": 525024256 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004246940822467402, + "loss": 2.9482, + "theoretical_loss": 3.8943527373384947, + "tokens_seen": 525089792 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042468405215646945, + "loss": 3.0764, + "theoretical_loss": 3.8943008299715247, + "tokens_seen": 525155328 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004246740220661986, + "loss": 3.077, + "theoretical_loss": 3.8942489308953547, + "tokens_seen": 525220864 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004246639919759278, + "loss": 2.7081, + "theoretical_loss": 3.894197040107626, + "tokens_seen": 525286400 + }, + { + "epoch": 1.07, + "learning_rate": 0.000424653961885657, + "loss": 2.9005, + "theoretical_loss": 3.8941451576059807, + "tokens_seen": 525351936 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004246439317953862, + "loss": 2.7815, + "theoretical_loss": 3.8940932833880635, + "tokens_seen": 525417472 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042463390170511536, + "loss": 2.698, + "theoretical_loss": 3.8940414174515183, + "tokens_seen": 525483008 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042462387161484454, + "loss": 2.7497, + "theoretical_loss": 3.8939895597939898, + "tokens_seen": 525548544 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004246138415245737, + "loss": 2.8141, + "theoretical_loss": 3.893937710413125, + "tokens_seen": 525614080 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042460381143430296, + "loss": 2.9974, + "theoretical_loss": 3.893885869306571, + "tokens_seen": 525679616 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004245937813440321, + "loss": 2.6887, + "theoretical_loss": 3.8938340364719766, + "tokens_seen": 525745152 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004245837512537613, + "loss": 2.852, + "theoretical_loss": 3.89378221190699, + "tokens_seen": 525810688 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042457372116349044, + "loss": 2.9421, + "theoretical_loss": 3.8937303956092615, + "tokens_seen": 525876224 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 610436, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9925901889801025, + "objective/train/theoretical_loss": 3.893691538809941, + "objective/train/tokens_used": 546385376, + "theoretical_loss": 3.893691538809941, + "tokens_seen": 525925376 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004245636910732197, + "loss": 2.7874, + "theoretical_loss": 3.893678587576443, + "tokens_seen": 525941760 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042455366098294886, + "loss": 2.9986, + "theoretical_loss": 3.8936267878061868, + "tokens_seen": 526007296 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042454363089267804, + "loss": 2.7701, + "theoretical_loss": 3.8935749962961443, + "tokens_seen": 526072832 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004245336008024072, + "loss": 2.8532, + "theoretical_loss": 3.8935232130439714, + "tokens_seen": 526138368 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004245235707121364, + "loss": 2.8331, + "theoretical_loss": 3.893471438047322, + "tokens_seen": 526203904 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004245135406218656, + "loss": 3.127, + "theoretical_loss": 3.8934196713038522, + "tokens_seen": 526269440 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004245035105315948, + "loss": 3.0505, + "theoretical_loss": 3.8933679128112186, + "tokens_seen": 526334976 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042449348044132395, + "loss": 3.023, + "theoretical_loss": 3.89331616256708, + "tokens_seen": 526400512 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004244834503510532, + "loss": 3.0169, + "theoretical_loss": 3.893264420569094, + "tokens_seen": 526466048 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042447342026078236, + "loss": 2.9502, + "theoretical_loss": 3.893212686814921, + "tokens_seen": 526531584 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042446339017051155, + "loss": 2.9393, + "theoretical_loss": 3.893160961302222, + "tokens_seen": 526597120 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004244533600802407, + "loss": 2.765, + "theoretical_loss": 3.893109244028658, + "tokens_seen": 526662656 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004244433299899699, + "loss": 3.2206, + "theoretical_loss": 3.8930575349918923, + "tokens_seen": 526728192 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004244332998996991, + "loss": 2.9727, + "theoretical_loss": 3.893005834189588, + "tokens_seen": 526793728 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004244232698094283, + "loss": 3.0495, + "theoretical_loss": 3.8929541416194096, + "tokens_seen": 526859264 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042441323971915745, + "loss": 3.11, + "theoretical_loss": 3.8929024572790225, + "tokens_seen": 526924800 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004244032096288867, + "loss": 2.7881, + "theoretical_loss": 3.8928507811660937, + "tokens_seen": 526990336 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004243931795386158, + "loss": 3.0553, + "theoretical_loss": 3.8927991132782904, + "tokens_seen": 527055872 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042438314944834505, + "loss": 3.0207, + "theoretical_loss": 3.8927474536132802, + "tokens_seen": 527121408 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042437311935807423, + "loss": 2.9144, + "theoretical_loss": 3.8926958021687335, + "tokens_seen": 527186944 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004243630892678034, + "loss": 2.8823, + "theoretical_loss": 3.8926441589423195, + "tokens_seen": 527252480 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004243530591775326, + "loss": 3.1453, + "theoretical_loss": 3.89259252393171, + "tokens_seen": 527318016 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004243430290872618, + "loss": 2.7057, + "theoretical_loss": 3.8925408971345776, + "tokens_seen": 527383552 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042433299899699095, + "loss": 2.645, + "theoretical_loss": 3.8924892785485943, + "tokens_seen": 527449088 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004243229689067202, + "loss": 2.9985, + "theoretical_loss": 3.8924376681714348, + "tokens_seen": 527514624 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 611010, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8362579345703125, + "objective/train/theoretical_loss": 3.8923989657742073, + "objective/train/tokens_used": 548023776, + "theoretical_loss": 3.8923989657742073, + "tokens_seen": 527563776 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004243129388164493, + "loss": 2.9011, + "theoretical_loss": 3.892386066000774, + "tokens_seen": 527580160 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042430290872617855, + "loss": 2.876, + "theoretical_loss": 3.892334472034287, + "tokens_seen": 527645696 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042429287863590773, + "loss": 3.0023, + "theoretical_loss": 3.8922828862696526, + "tokens_seen": 527711232 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004242828485456369, + "loss": 2.8032, + "theoretical_loss": 3.8922313087045466, + "tokens_seen": 527776768 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004242728184553661, + "loss": 2.8308, + "theoretical_loss": 3.892179739336649, + "tokens_seen": 527842304 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004242627883650953, + "loss": 2.8076, + "theoretical_loss": 3.892128178163639, + "tokens_seen": 527907840 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042425275827482446, + "loss": 3.0146, + "theoretical_loss": 3.892076625183197, + "tokens_seen": 527973376 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004242427281845537, + "loss": 2.9239, + "theoretical_loss": 3.8920250803930054, + "tokens_seen": 528038912 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004242326980942828, + "loss": 2.7142, + "theoretical_loss": 3.891973543790746, + "tokens_seen": 528104448 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042422266800401206, + "loss": 2.8914, + "theoretical_loss": 3.891922015374103, + "tokens_seen": 528169984 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004242126379137412, + "loss": 2.6705, + "theoretical_loss": 3.8918704951407603, + "tokens_seen": 528235520 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004242026078234704, + "loss": 2.8523, + "theoretical_loss": 3.8918189830884033, + "tokens_seen": 528301056 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004241925777331996, + "loss": 2.9579, + "theoretical_loss": 3.8917674792147183, + "tokens_seen": 528366592 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004241825476429288, + "loss": 2.8322, + "theoretical_loss": 3.8917159835173933, + "tokens_seen": 528432128 + }, + { + "epoch": 1.07, + "learning_rate": 0.000424172517552658, + "loss": 2.8149, + "theoretical_loss": 3.891664495994115, + "tokens_seen": 528497664 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004241624874623872, + "loss": 3.0571, + "theoretical_loss": 3.891613016642574, + "tokens_seen": 528563200 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004241524573721164, + "loss": 3.1115, + "theoretical_loss": 3.8915615454604593, + "tokens_seen": 528628736 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042414242728184556, + "loss": 2.8828, + "theoretical_loss": 3.8915100824454627, + "tokens_seen": 528694272 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042413239719157474, + "loss": 3.0419, + "theoretical_loss": 3.8914586275952763, + "tokens_seen": 528759808 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004241223671013039, + "loss": 2.9529, + "theoretical_loss": 3.8914071809075916, + "tokens_seen": 528825344 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042411233701103316, + "loss": 2.8425, + "theoretical_loss": 3.891355742380104, + "tokens_seen": 528890880 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004241023069207623, + "loss": 2.7841, + "theoretical_loss": 3.891304312010507, + "tokens_seen": 528956416 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004240922768304915, + "loss": 2.9775, + "theoretical_loss": 3.8912528897964975, + "tokens_seen": 529021952 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042408224674022065, + "loss": 2.9298, + "theoretical_loss": 3.891201475735772, + "tokens_seen": 529087488 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004240722166499499, + "loss": 2.8611, + "theoretical_loss": 3.891150069826027, + "tokens_seen": 529153024 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 612097, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.09621000289917, + "objective/train/theoretical_loss": 3.8911115207414153, + "objective/train/tokens_used": 549662176, + "theoretical_loss": 3.8911115207414153, + "tokens_seen": 529202176 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042406218655967906, + "loss": 3.0195, + "theoretical_loss": 3.8910986720649614, + "tokens_seen": 529218560 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042405215646940824, + "loss": 3.0257, + "theoretical_loss": 3.8910472824502755, + "tokens_seen": 529284096 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004240421263791374, + "loss": 3.2495, + "theoretical_loss": 3.890995900979669, + "tokens_seen": 529349632 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004240320962888666, + "loss": 2.8085, + "theoretical_loss": 3.890944527650843, + "tokens_seen": 529415168 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004240220661985958, + "loss": 2.9752, + "theoretical_loss": 3.8908931624615004, + "tokens_seen": 529480704 + }, + { + "epoch": 1.07, + "learning_rate": 0.000424012036108325, + "loss": 2.9726, + "theoretical_loss": 3.8908418054093437, + "tokens_seen": 529546240 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042400200601805415, + "loss": 2.9715, + "theoretical_loss": 3.890790456492078, + "tokens_seen": 529611776 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004239919759277834, + "loss": 2.7099, + "theoretical_loss": 3.890739115707407, + "tokens_seen": 529677312 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042398194583751256, + "loss": 2.9256, + "theoretical_loss": 3.8906877830530373, + "tokens_seen": 529742848 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042397191574724175, + "loss": 2.8714, + "theoretical_loss": 3.8906364585266764, + "tokens_seen": 529808384 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004239618856569709, + "loss": 2.8072, + "theoretical_loss": 3.8905851421260316, + "tokens_seen": 529873920 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004239518555667001, + "loss": 2.8951, + "theoretical_loss": 3.890533833848811, + "tokens_seen": 529939456 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004239418254764293, + "loss": 2.8725, + "theoretical_loss": 3.8904825336927256, + "tokens_seen": 530004992 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004239317953861585, + "loss": 3.0388, + "theoretical_loss": 3.890431241655485, + "tokens_seen": 530070528 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042392176529588765, + "loss": 2.9665, + "theoretical_loss": 3.890379957734801, + "tokens_seen": 530136064 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004239117352056169, + "loss": 2.9384, + "theoretical_loss": 3.890328681928387, + "tokens_seen": 530201600 + }, + { + "epoch": 1.07, + "learning_rate": 0.000423901705115346, + "loss": 3.113, + "theoretical_loss": 3.890277414233955, + "tokens_seen": 530267136 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042389167502507525, + "loss": 2.801, + "theoretical_loss": 3.89022615464922, + "tokens_seen": 530332672 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042388164493480443, + "loss": 2.8593, + "theoretical_loss": 3.8901749031718973, + "tokens_seen": 530398208 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004238716148445336, + "loss": 3.1283, + "theoretical_loss": 3.890123659799703, + "tokens_seen": 530463744 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004238615847542628, + "loss": 2.9641, + "theoretical_loss": 3.890072424530354, + "tokens_seen": 530529280 + }, + { + "epoch": 1.07, + "learning_rate": 0.000423851554663992, + "loss": 3.1211, + "theoretical_loss": 3.890021197361569, + "tokens_seen": 530594816 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042384152457372115, + "loss": 3.0983, + "theoretical_loss": 3.889969978291066, + "tokens_seen": 530660352 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004238314944834504, + "loss": 3.0237, + "theoretical_loss": 3.8899187673165656, + "tokens_seen": 530725888 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004238214643931795, + "loss": 2.7202, + "theoretical_loss": 3.8898675644357885, + "tokens_seen": 530791424 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 612816, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.779944658279419, + "objective/train/theoretical_loss": 3.889829167585341, + "objective/train/tokens_used": 551300576, + "theoretical_loss": 3.889829167585341, + "tokens_seen": 530840576 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042381143430290875, + "loss": 2.7002, + "theoretical_loss": 3.8898163696464563, + "tokens_seen": 530856960 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042380140421263793, + "loss": 2.8166, + "theoretical_loss": 3.889765182946292, + "tokens_seen": 530922496 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004237913741223671, + "loss": 3.0376, + "theoretical_loss": 3.8897140043330185, + "tokens_seen": 530988032 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004237813440320963, + "loss": 3.2238, + "theoretical_loss": 3.889662833804361, + "tokens_seen": 531053568 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004237713139418255, + "loss": 2.808, + "theoretical_loss": 3.889611671358044, + "tokens_seen": 531119104 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042376128385155466, + "loss": 3.0963, + "theoretical_loss": 3.8895605169917955, + "tokens_seen": 531184640 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004237512537612839, + "loss": 2.9041, + "theoretical_loss": 3.8895093707033404, + "tokens_seen": 531250176 + }, + { + "epoch": 1.07, + "learning_rate": 0.000423741223671013, + "loss": 3.0183, + "theoretical_loss": 3.8894582324904086, + "tokens_seen": 531315712 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042373119358074226, + "loss": 2.8208, + "theoretical_loss": 3.889407102350729, + "tokens_seen": 531381248 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004237211634904714, + "loss": 2.8103, + "theoretical_loss": 3.8893559802820317, + "tokens_seen": 531446784 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004237111334002006, + "loss": 2.868, + "theoretical_loss": 3.8893048662820466, + "tokens_seen": 531512320 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004237011033099298, + "loss": 2.7113, + "theoretical_loss": 3.8892537603485065, + "tokens_seen": 531577856 + }, + { + "epoch": 1.07, + "learning_rate": 0.000423691073219659, + "loss": 2.7521, + "theoretical_loss": 3.889202662479144, + "tokens_seen": 531643392 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042368104312938816, + "loss": 2.9089, + "theoretical_loss": 3.8891515726716923, + "tokens_seen": 531708928 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004236710130391174, + "loss": 2.9575, + "theoretical_loss": 3.8891004909238873, + "tokens_seen": 531774464 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004236609829488465, + "loss": 2.8788, + "theoretical_loss": 3.889049417233463, + "tokens_seen": 531840000 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042365095285857576, + "loss": 2.7697, + "theoretical_loss": 3.8889983515981568, + "tokens_seen": 531905536 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004236409227683049, + "loss": 3.0722, + "theoretical_loss": 3.888947294015705, + "tokens_seen": 531971072 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004236308926780341, + "loss": 3.1524, + "theoretical_loss": 3.8888962444838473, + "tokens_seen": 532036608 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004236208625877633, + "loss": 3.0272, + "theoretical_loss": 3.888845203000322, + "tokens_seen": 532102144 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004236108324974925, + "loss": 2.8684, + "theoretical_loss": 3.8887941695628694, + "tokens_seen": 532167680 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042360080240722166, + "loss": 2.9734, + "theoretical_loss": 3.88874314416923, + "tokens_seen": 532233216 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042359077231695085, + "loss": 2.8621, + "theoretical_loss": 3.888692126817147, + "tokens_seen": 532298752 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042358074222668, + "loss": 2.6702, + "theoretical_loss": 3.8886411175043616, + "tokens_seen": 532364288 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042357071213640926, + "loss": 2.8799, + "theoretical_loss": 3.8885901162286185, + "tokens_seen": 532429824 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 614210, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1336987018585205, + "objective/train/theoretical_loss": 3.888551870544763, + "objective/train/tokens_used": 552938976, + "theoretical_loss": 3.888551870544763, + "tokens_seen": 532478976 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004235606820461384, + "loss": 3.1439, + "theoretical_loss": 3.8885391229876625, + "tokens_seen": 532495360 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004235506519558676, + "loss": 2.9976, + "theoretical_loss": 3.8884881377792384, + "tokens_seen": 532560896 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042354062186559675, + "loss": 3.0269, + "theoretical_loss": 3.888437160601093, + "tokens_seen": 532626432 + }, + { + "epoch": 1.07, + "learning_rate": 0.000423530591775326, + "loss": 2.83, + "theoretical_loss": 3.8883861914509743, + "tokens_seen": 532691968 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042352056168505517, + "loss": 2.9997, + "theoretical_loss": 3.88833523032663, + "tokens_seen": 532757504 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042351053159478435, + "loss": 3.0225, + "theoretical_loss": 3.888284277225809, + "tokens_seen": 532823040 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042350050150451353, + "loss": 2.8851, + "theoretical_loss": 3.888233332146262, + "tokens_seen": 532888576 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042349047141424276, + "loss": 2.8663, + "theoretical_loss": 3.8881823950857397, + "tokens_seen": 532954112 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004234804413239719, + "loss": 2.9248, + "theoretical_loss": 3.8881314660419934, + "tokens_seen": 533019648 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042347041123370113, + "loss": 2.9781, + "theoretical_loss": 3.8880805450127776, + "tokens_seen": 533085184 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042346038114343025, + "loss": 2.9781, + "theoretical_loss": 3.8880296319958445, + "tokens_seen": 533150720 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004234503510531595, + "loss": 2.7219, + "theoretical_loss": 3.8879787269889494, + "tokens_seen": 533216256 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042344032096288867, + "loss": 3.0108, + "theoretical_loss": 3.887927829989848, + "tokens_seen": 533281792 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042343029087261785, + "loss": 2.8635, + "theoretical_loss": 3.887876940996296, + "tokens_seen": 533347328 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004234202607823471, + "loss": 2.8422, + "theoretical_loss": 3.8878260600060512, + "tokens_seen": 533412864 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004234102306920762, + "loss": 3.0575, + "theoretical_loss": 3.8877751870168717, + "tokens_seen": 533478400 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042340020060180545, + "loss": 2.9074, + "theoretical_loss": 3.8877243220265174, + "tokens_seen": 533543936 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042339017051153463, + "loss": 3.0241, + "theoretical_loss": 3.887673465032748, + "tokens_seen": 533609472 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004233801404212638, + "loss": 2.865, + "theoretical_loss": 3.8876226160333234, + "tokens_seen": 533675008 + }, + { + "epoch": 1.07, + "learning_rate": 0.000423370110330993, + "loss": 3.0424, + "theoretical_loss": 3.8875717750260064, + "tokens_seen": 533740544 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004233600802407222, + "loss": 3.1135, + "theoretical_loss": 3.8875209420085595, + "tokens_seen": 533806080 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042335005015045135, + "loss": 3.0429, + "theoretical_loss": 3.887470116978747, + "tokens_seen": 533871616 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004233400200601806, + "loss": 3.0547, + "theoretical_loss": 3.887419299934333, + "tokens_seen": 533937152 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004233299899699097, + "loss": 3.0215, + "theoretical_loss": 3.887368490873083, + "tokens_seen": 534002688 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042331995987963895, + "loss": 3.0887, + "theoretical_loss": 3.887317689792763, + "tokens_seen": 534068224 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 614920, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9131345748901367, + "objective/train/theoretical_loss": 3.8872795942186653, + "objective/train/tokens_used": 554577376, + "theoretical_loss": 3.8872795942186653, + "tokens_seen": 534117376 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042330992978936813, + "loss": 2.9236, + "theoretical_loss": 3.887266896691141, + "tokens_seen": 534133760 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004232998996990973, + "loss": 2.948, + "theoretical_loss": 3.8872161115659845, + "tokens_seen": 534199296 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004232898696088265, + "loss": 2.7217, + "theoretical_loss": 3.8871653344150623, + "tokens_seen": 534264832 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004232798395185557, + "loss": 2.8982, + "theoretical_loss": 3.8871145652361454, + "tokens_seen": 534330368 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042326980942828486, + "loss": 3.1815, + "theoretical_loss": 3.887063804027004, + "tokens_seen": 534395904 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004232597793380141, + "loss": 2.967, + "theoretical_loss": 3.88701305078541, + "tokens_seen": 534461440 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004232497492477432, + "loss": 3.0584, + "theoretical_loss": 3.886962305509136, + "tokens_seen": 534526976 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042323971915747246, + "loss": 3.0766, + "theoretical_loss": 3.8869115681959556, + "tokens_seen": 534592512 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004232296890672016, + "loss": 3.0689, + "theoretical_loss": 3.886860838843643, + "tokens_seen": 534658048 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004232196589769308, + "loss": 2.9005, + "theoretical_loss": 3.886810117449974, + "tokens_seen": 534723584 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042320962888666, + "loss": 2.9064, + "theoretical_loss": 3.8867594040127242, + "tokens_seen": 534789120 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004231995987963892, + "loss": 2.9348, + "theoretical_loss": 3.8867086985296715, + "tokens_seen": 534854656 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042318956870611836, + "loss": 3.1647, + "theoretical_loss": 3.8866580009985934, + "tokens_seen": 534920192 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004231795386158476, + "loss": 2.8449, + "theoretical_loss": 3.8866073114172686, + "tokens_seen": 534985728 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004231695085255767, + "loss": 2.9154, + "theoretical_loss": 3.8865566297834775, + "tokens_seen": 535051264 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042315947843530596, + "loss": 3.0347, + "theoretical_loss": 3.886505956095, + "tokens_seen": 535116800 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004231494483450351, + "loss": 2.9126, + "theoretical_loss": 3.886455290349619, + "tokens_seen": 535182336 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004231394182547643, + "loss": 3.0763, + "theoretical_loss": 3.8864046325451156, + "tokens_seen": 535247872 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004231293881644935, + "loss": 2.9425, + "theoretical_loss": 3.8863539826792737, + "tokens_seen": 535313408 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004231193580742227, + "loss": 2.9192, + "theoretical_loss": 3.886303340749878, + "tokens_seen": 535378944 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042310932798395186, + "loss": 3.0836, + "theoretical_loss": 3.8862527067547123, + "tokens_seen": 535444480 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042309929789368105, + "loss": 2.7298, + "theoretical_loss": 3.8862020806915645, + "tokens_seen": 535510016 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004230892678034102, + "loss": 3.1905, + "theoretical_loss": 3.88615146255822, + "tokens_seen": 535575552 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042307923771313946, + "loss": 2.9487, + "theoretical_loss": 3.8861008523524676, + "tokens_seen": 535641088 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004230692076228686, + "loss": 2.7571, + "theoretical_loss": 3.8860502500720946, + "tokens_seen": 535706624 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 616369, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9449539184570312, + "objective/train/theoretical_loss": 3.886012303561517, + "objective/train/tokens_used": 556215776, + "theoretical_loss": 3.886012303561517, + "tokens_seen": 535755776 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004230591775325978, + "loss": 2.9742, + "theoretical_loss": 3.885999655714893, + "tokens_seen": 535772160 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042304914744232695, + "loss": 3.0579, + "theoretical_loss": 3.8859490692786505, + "tokens_seen": 535837696 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004230391173520562, + "loss": 3.0758, + "theoretical_loss": 3.8858984907611607, + "tokens_seen": 535903232 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042302908726178537, + "loss": 2.8493, + "theoretical_loss": 3.8858479201602147, + "tokens_seen": 535968768 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042301905717151455, + "loss": 3.0233, + "theoretical_loss": 3.8857973574736056, + "tokens_seen": 536034304 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042300902708124373, + "loss": 2.9448, + "theoretical_loss": 3.8857468026991278, + "tokens_seen": 536099840 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042299899699097297, + "loss": 2.9356, + "theoretical_loss": 3.885696255834576, + "tokens_seen": 536165376 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004229889669007021, + "loss": 2.8509, + "theoretical_loss": 3.885645716877746, + "tokens_seen": 536230912 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042297893681043133, + "loss": 2.9117, + "theoretical_loss": 3.8855951858264346, + "tokens_seen": 536296448 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042296890672016045, + "loss": 3.0141, + "theoretical_loss": 3.8855446626784396, + "tokens_seen": 536361984 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004229588766298897, + "loss": 2.8382, + "theoretical_loss": 3.8854941474315585, + "tokens_seen": 536427520 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042294884653961887, + "loss": 2.84, + "theoretical_loss": 3.885443640083592, + "tokens_seen": 536493056 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042293881644934805, + "loss": 3.0141, + "theoretical_loss": 3.8853931406323383, + "tokens_seen": 536558592 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042292878635907723, + "loss": 3.0608, + "theoretical_loss": 3.8853426490756005, + "tokens_seen": 536624128 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004229187562688064, + "loss": 3.0681, + "theoretical_loss": 3.8852921654111796, + "tokens_seen": 536689664 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004229087261785356, + "loss": 3.1048, + "theoretical_loss": 3.8852416896368784, + "tokens_seen": 536755200 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042289869608826483, + "loss": 2.8965, + "theoretical_loss": 3.8851912217505014, + "tokens_seen": 536820736 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042288866599799396, + "loss": 3.0463, + "theoretical_loss": 3.885140761749852, + "tokens_seen": 536886272 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004228786359077232, + "loss": 2.8842, + "theoretical_loss": 3.8850903096327363, + "tokens_seen": 536951808 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004228686058174523, + "loss": 3.0815, + "theoretical_loss": 3.8850398653969607, + "tokens_seen": 537017344 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042285857572718155, + "loss": 3.1825, + "theoretical_loss": 3.884989429040332, + "tokens_seen": 537082880 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042284854563691074, + "loss": 2.6999, + "theoretical_loss": 3.8849390005606597, + "tokens_seen": 537148416 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004228385155466399, + "loss": 3.094, + "theoretical_loss": 3.884888579955751, + "tokens_seen": 537213952 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004228284854563691, + "loss": 2.8036, + "theoretical_loss": 3.8848381672234167, + "tokens_seen": 537279488 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042281845536609833, + "loss": 3.1333, + "theoretical_loss": 3.8847877623614675, + "tokens_seen": 537345024 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 617122, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5995001792907715, + "objective/train/theoretical_loss": 3.8847499638786287, + "objective/train/tokens_used": 557854176, + "theoretical_loss": 3.8847499638786287, + "tokens_seen": 537394176 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042280842527582746, + "loss": 2.8701, + "theoretical_loss": 3.884737365367714, + "tokens_seen": 537410560 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004227983951855567, + "loss": 2.9849, + "theoretical_loss": 3.8846869762399705, + "tokens_seen": 537476096 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004227883650952858, + "loss": 3.0028, + "theoretical_loss": 3.8846365949760493, + "tokens_seen": 537541632 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042277833500501506, + "loss": 2.748, + "theoretical_loss": 3.8845862215737643, + "tokens_seen": 537607168 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042276830491474424, + "loss": 2.9641, + "theoretical_loss": 3.884535856030932, + "tokens_seen": 537672704 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004227582748244734, + "loss": 2.9048, + "theoretical_loss": 3.8844854983453665, + "tokens_seen": 537738240 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004227482447342026, + "loss": 2.9251, + "theoretical_loss": 3.884435148514886, + "tokens_seen": 537803776 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004227382146439318, + "loss": 2.9232, + "theoretical_loss": 3.884384806537308, + "tokens_seen": 537869312 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042272818455366096, + "loss": 3.2003, + "theoretical_loss": 3.8843344724104507, + "tokens_seen": 537934848 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004227181544633902, + "loss": 3.0039, + "theoretical_loss": 3.884284146132134, + "tokens_seen": 538000384 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004227081243731193, + "loss": 2.8426, + "theoretical_loss": 3.8842338277001778, + "tokens_seen": 538065920 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042269809428284856, + "loss": 2.8665, + "theoretical_loss": 3.884183517112404, + "tokens_seen": 538131456 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004226880641925777, + "loss": 2.8744, + "theoretical_loss": 3.884133214366634, + "tokens_seen": 538196992 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004226780341023069, + "loss": 3.132, + "theoretical_loss": 3.884082919460692, + "tokens_seen": 538262528 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042266800401203616, + "loss": 2.7897, + "theoretical_loss": 3.8840326323923993, + "tokens_seen": 538328064 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004226579739217653, + "loss": 2.889, + "theoretical_loss": 3.883982353159583, + "tokens_seen": 538393600 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004226479438314945, + "loss": 2.858, + "theoretical_loss": 3.8839320817600678, + "tokens_seen": 538459136 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004226379137412237, + "loss": 2.9575, + "theoretical_loss": 3.8838818181916803, + "tokens_seen": 538524672 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004226278836509529, + "loss": 3.1264, + "theoretical_loss": 3.8838315624522473, + "tokens_seen": 538590208 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042261785356068206, + "loss": 2.9071, + "theoretical_loss": 3.883781314539598, + "tokens_seen": 538655744 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042260782347041125, + "loss": 2.7484, + "theoretical_loss": 3.8837310744515605, + "tokens_seen": 538721280 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004225977933801404, + "loss": 2.934, + "theoretical_loss": 3.8836808421859645, + "tokens_seen": 538786816 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042258776328986966, + "loss": 3.0486, + "theoretical_loss": 3.883630617740642, + "tokens_seen": 538852352 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004225777331995988, + "loss": 2.9116, + "theoretical_loss": 3.883580401113423, + "tokens_seen": 538917888 + }, + { + "epoch": 1.07, + "learning_rate": 0.000422567703109328, + "loss": 2.8901, + "theoretical_loss": 3.8835301923021417, + "tokens_seen": 538983424 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 618589, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2161307334899902, + "objective/train/theoretical_loss": 3.8834925408215852, + "objective/train/tokens_used": 559492576, + "theoretical_loss": 3.8834925408215852, + "tokens_seen": 539032576 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042255767301905715, + "loss": 2.9765, + "theoretical_loss": 3.8834799913046303, + "tokens_seen": 539048960 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004225476429287864, + "loss": 3.1059, + "theoretical_loss": 3.883429798118723, + "tokens_seen": 539114496 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042253761283851557, + "loss": 2.8609, + "theoretical_loss": 3.8833796127422557, + "tokens_seen": 539180032 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042252758274824475, + "loss": 3.0228, + "theoretical_loss": 3.8833294351730636, + "tokens_seen": 539245568 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042251755265797393, + "loss": 2.9747, + "theoretical_loss": 3.8832792654089845, + "tokens_seen": 539311104 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042250752256770317, + "loss": 3.0639, + "theoretical_loss": 3.8832291034478548, + "tokens_seen": 539376640 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004224974924774323, + "loss": 3.0354, + "theoretical_loss": 3.8831789492875135, + "tokens_seen": 539442176 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042248746238716153, + "loss": 2.953, + "theoretical_loss": 3.8831288029258006, + "tokens_seen": 539507712 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042247743229689065, + "loss": 2.6457, + "theoretical_loss": 3.8830786643605553, + "tokens_seen": 539573248 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004224674022066199, + "loss": 3.2025, + "theoretical_loss": 3.88302853358962, + "tokens_seen": 539638784 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042245737211634907, + "loss": 2.9132, + "theoretical_loss": 3.8829784106108356, + "tokens_seen": 539704320 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042244734202607825, + "loss": 2.6975, + "theoretical_loss": 3.882928295422045, + "tokens_seen": 539769856 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042243731193580743, + "loss": 2.6874, + "theoretical_loss": 3.8828781880210927, + "tokens_seen": 539835392 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004224272818455366, + "loss": 2.9888, + "theoretical_loss": 3.882828088405823, + "tokens_seen": 539900928 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004224172517552658, + "loss": 2.7301, + "theoretical_loss": 3.8827779965740805, + "tokens_seen": 539966464 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042240722166499503, + "loss": 2.7876, + "theoretical_loss": 3.882727912523712, + "tokens_seen": 540032000 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042239719157472416, + "loss": 2.7523, + "theoretical_loss": 3.882677836252565, + "tokens_seen": 540097536 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004223871614844534, + "loss": 3.0484, + "theoretical_loss": 3.8826277677584873, + "tokens_seen": 540163072 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004223771313941825, + "loss": 3.0711, + "theoretical_loss": 3.882577707039328, + "tokens_seen": 540228608 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042236710130391176, + "loss": 2.949, + "theoretical_loss": 3.8825276540929354, + "tokens_seen": 540294144 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042235707121364094, + "loss": 2.7796, + "theoretical_loss": 3.8824776089171618, + "tokens_seen": 540359680 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004223470411233701, + "loss": 2.7581, + "theoretical_loss": 3.8824275715098575, + "tokens_seen": 540425216 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004223370110330993, + "loss": 2.9913, + "theoretical_loss": 3.882377541868875, + "tokens_seen": 540490752 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042232698094282853, + "loss": 3.1344, + "theoretical_loss": 3.882327519992068, + "tokens_seen": 540556288 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042231695085255766, + "loss": 2.8887, + "theoretical_loss": 3.88227750587729, + "tokens_seen": 540621824 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 619158, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8451883792877197, + "objective/train/theoretical_loss": 3.8822400003837476, + "objective/train/tokens_used": 561130976, + "theoretical_loss": 3.8822400003837476, + "tokens_seen": 540670976 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004223069207622869, + "loss": 2.9941, + "theoretical_loss": 3.8822274995223958, + "tokens_seen": 540687360 + }, + { + "epoch": 1.07, + "learning_rate": 0.000422296890672016, + "loss": 2.7622, + "theoretical_loss": 3.8821775009252413, + "tokens_seen": 540752896 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042228686058174526, + "loss": 2.8625, + "theoretical_loss": 3.8821275100836825, + "tokens_seen": 540818432 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042227683049147444, + "loss": 2.6665, + "theoretical_loss": 3.8820775269955776, + "tokens_seen": 540883968 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004222668004012036, + "loss": 2.8965, + "theoretical_loss": 3.8820275516587843, + "tokens_seen": 540949504 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004222567703109328, + "loss": 2.7812, + "theoretical_loss": 3.8819775840711612, + "tokens_seen": 541015040 + }, + { + "epoch": 1.07, + "learning_rate": 0.000422246740220662, + "loss": 2.7666, + "theoretical_loss": 3.8819276242305696, + "tokens_seen": 541080576 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042223671013039116, + "loss": 2.5857, + "theoretical_loss": 3.881877672134869, + "tokens_seen": 541146112 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004222266800401204, + "loss": 2.9337, + "theoretical_loss": 3.8818277277819213, + "tokens_seen": 541211648 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004222166499498495, + "loss": 2.8628, + "theoretical_loss": 3.88177779116959, + "tokens_seen": 541277184 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042220661985957876, + "loss": 2.5178, + "theoretical_loss": 3.881727862295737, + "tokens_seen": 541342720 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004221965897693079, + "loss": 2.8533, + "theoretical_loss": 3.8816779411582276, + "tokens_seen": 541408256 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004221865596790371, + "loss": 2.8706, + "theoretical_loss": 3.881628027754926, + "tokens_seen": 541473792 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004221765295887663, + "loss": 2.9555, + "theoretical_loss": 3.8815781220836985, + "tokens_seen": 541539328 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004221664994984955, + "loss": 2.957, + "theoretical_loss": 3.8815282241424116, + "tokens_seen": 541604864 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042215646940822467, + "loss": 2.5968, + "theoretical_loss": 3.881478333928933, + "tokens_seen": 541670400 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004221464393179539, + "loss": 2.8318, + "theoretical_loss": 3.8814284514411312, + "tokens_seen": 541735936 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042213640922768303, + "loss": 2.748, + "theoretical_loss": 3.881378576676876, + "tokens_seen": 541801472 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042212637913741226, + "loss": 2.952, + "theoretical_loss": 3.8813287096340363, + "tokens_seen": 541867008 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004221163490471414, + "loss": 2.8788, + "theoretical_loss": 3.881278850310484, + "tokens_seen": 541932544 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004221063189568706, + "loss": 3.0883, + "theoretical_loss": 3.88122899870409, + "tokens_seen": 541998080 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004220962888665998, + "loss": 2.8509, + "theoretical_loss": 3.8811791548127275, + "tokens_seen": 542063616 + }, + { + "epoch": 1.07, + "learning_rate": 0.000422086258776329, + "loss": 2.943, + "theoretical_loss": 3.8811293186342706, + "tokens_seen": 542129152 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042207622868605817, + "loss": 2.9274, + "theoretical_loss": 3.881079490166593, + "tokens_seen": 542194688 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042206619859578735, + "loss": 3.0352, + "theoretical_loss": 3.881029669407569, + "tokens_seen": 542260224 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 620361, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.204775810241699, + "objective/train/theoretical_loss": 3.8809923088958285, + "objective/train/tokens_used": 562769376, + "theoretical_loss": 3.8809923088958285, + "tokens_seen": 542309376 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042205616850551653, + "loss": 3.0893, + "theoretical_loss": 3.8809798563550766, + "tokens_seen": 542325760 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042204613841524577, + "loss": 3.1817, + "theoretical_loss": 3.880930051006991, + "tokens_seen": 542391296 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004220361083249749, + "loss": 2.8584, + "theoretical_loss": 3.88088025336119, + "tokens_seen": 542456832 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042202607823470413, + "loss": 2.8222, + "theoretical_loss": 3.8808304634155535, + "tokens_seen": 542522368 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004220160481444333, + "loss": 2.9794, + "theoretical_loss": 3.8807806811679595, + "tokens_seen": 542587904 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004220060180541625, + "loss": 2.7943, + "theoretical_loss": 3.8807309066162885, + "tokens_seen": 542653440 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004219959879638917, + "loss": 3.0174, + "theoretical_loss": 3.8806811397584218, + "tokens_seen": 542718976 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042198595787362085, + "loss": 3.0142, + "theoretical_loss": 3.8806313805922414, + "tokens_seen": 542784512 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042197592778335004, + "loss": 2.6437, + "theoretical_loss": 3.88058162911563, + "tokens_seen": 542850048 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042196589769307927, + "loss": 3.1293, + "theoretical_loss": 3.8805318853264708, + "tokens_seen": 542915584 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004219558676028084, + "loss": 3.0661, + "theoretical_loss": 3.8804821492226482, + "tokens_seen": 542981120 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042194583751253763, + "loss": 2.8919, + "theoretical_loss": 3.880432420802048, + "tokens_seen": 543046656 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042193580742226676, + "loss": 2.8511, + "theoretical_loss": 3.880382700062556, + "tokens_seen": 543112192 + }, + { + "epoch": 1.07, + "learning_rate": 0.000421925777331996, + "loss": 2.9383, + "theoretical_loss": 3.880332987002059, + "tokens_seen": 543177728 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042191574724172523, + "loss": 2.8971, + "theoretical_loss": 3.8802832816184445, + "tokens_seen": 543243264 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042190571715145436, + "loss": 2.8064, + "theoretical_loss": 3.880233583909602, + "tokens_seen": 543308800 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004218956870611836, + "loss": 2.9946, + "theoretical_loss": 3.8801838938734203, + "tokens_seen": 543374336 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004218856569709127, + "loss": 2.9893, + "theoretical_loss": 3.8801342115077895, + "tokens_seen": 543439872 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042187562688064196, + "loss": 3.1243, + "theoretical_loss": 3.8800845368106014, + "tokens_seen": 543505408 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042186559679037114, + "loss": 3.0212, + "theoretical_loss": 3.880034869779747, + "tokens_seen": 543570944 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004218555667001003, + "loss": 3.2732, + "theoretical_loss": 3.8799852104131194, + "tokens_seen": 543636480 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004218455366098295, + "loss": 2.9189, + "theoretical_loss": 3.879935558708613, + "tokens_seen": 543702016 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042183550651955873, + "loss": 3.024, + "theoretical_loss": 3.879885914664121, + "tokens_seen": 543767552 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042182547642928786, + "loss": 3.0686, + "theoretical_loss": 3.8798362782775393, + "tokens_seen": 543833088 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004218154463390171, + "loss": 2.9981, + "theoretical_loss": 3.879786649546764, + "tokens_seen": 543898624 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 620805, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7509706020355225, + "objective/train/theoretical_loss": 3.8797494330215407, + "objective/train/tokens_used": 564407776, + "theoretical_loss": 3.8797494330215407, + "tokens_seen": 543947776 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004218054162487462, + "loss": 2.807, + "theoretical_loss": 3.879737028469692, + "tokens_seen": 543964160 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042179538615847546, + "loss": 2.9816, + "theoretical_loss": 3.8796874150442213, + "tokens_seen": 544029696 + }, + { + "epoch": 1.07, + "learning_rate": 0.00042178535606820464, + "loss": 2.8923, + "theoretical_loss": 3.8796378092682495, + "tokens_seen": 544095232 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004217753259779338, + "loss": 2.6996, + "theoretical_loss": 3.8795882111396773, + "tokens_seen": 544160768 + }, + { + "epoch": 1.07, + "learning_rate": 0.000421765295887663, + "loss": 2.9286, + "theoretical_loss": 3.8795386206564038, + "tokens_seen": 544226304 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004217552657973922, + "loss": 2.9942, + "theoretical_loss": 3.8794890378163314, + "tokens_seen": 544291840 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042174523570712136, + "loss": 3.0129, + "theoretical_loss": 3.879439462617361, + "tokens_seen": 544357376 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004217352056168506, + "loss": 2.9842, + "theoretical_loss": 3.8793898950573955, + "tokens_seen": 544422912 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004217251755265797, + "loss": 3.0301, + "theoretical_loss": 3.8793403351343385, + "tokens_seen": 544488448 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042171514543630896, + "loss": 2.7943, + "theoretical_loss": 3.8792907828460947, + "tokens_seen": 544553984 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004217051153460381, + "loss": 2.812, + "theoretical_loss": 3.8792412381905685, + "tokens_seen": 544619520 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004216950852557673, + "loss": 2.8875, + "theoretical_loss": 3.879191701165667, + "tokens_seen": 544685056 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004216850551654965, + "loss": 2.8014, + "theoretical_loss": 3.8791421717692964, + "tokens_seen": 544750592 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004216750250752257, + "loss": 2.9776, + "theoretical_loss": 3.8790926499993645, + "tokens_seen": 544816128 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042166499498495487, + "loss": 3.0308, + "theoretical_loss": 3.87904313585378, + "tokens_seen": 544881664 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004216549648946841, + "loss": 2.9902, + "theoretical_loss": 3.878993629330452, + "tokens_seen": 544947200 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042164493480441323, + "loss": 2.8341, + "theoretical_loss": 3.878944130427291, + "tokens_seen": 545012736 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042163490471414246, + "loss": 2.7959, + "theoretical_loss": 3.878894639142208, + "tokens_seen": 545078272 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004216248746238716, + "loss": 3.1664, + "theoretical_loss": 3.8788451554731145, + "tokens_seen": 545143808 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004216148445336008, + "loss": 2.8896, + "theoretical_loss": 3.878795679417923, + "tokens_seen": 545209344 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042160481444333, + "loss": 3.0888, + "theoretical_loss": 3.8787462109745476, + "tokens_seen": 545274880 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004215947843530592, + "loss": 3.0511, + "theoretical_loss": 3.878696750140902, + "tokens_seen": 545340416 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042158475426278837, + "loss": 2.7537, + "theoretical_loss": 3.8786472969149015, + "tokens_seen": 545405952 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042157472417251755, + "loss": 2.9606, + "theoretical_loss": 3.8785978512944626, + "tokens_seen": 545471488 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042156469408224673, + "loss": 2.8147, + "theoretical_loss": 3.878548413277501, + "tokens_seen": 545537024 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 621470, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.958339214324951, + "objective/train/theoretical_loss": 3.8785113397533095, + "objective/train/tokens_used": 566046176, + "theoretical_loss": 3.8785113397533095, + "tokens_seen": 545586176 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042155466399197597, + "loss": 2.9194, + "theoretical_loss": 3.878498982861935, + "tokens_seen": 545602560 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004215446339017051, + "loss": 2.8887, + "theoretical_loss": 3.8784495600456825, + "tokens_seen": 545668096 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042153460381143433, + "loss": 3.0382, + "theoretical_loss": 3.8784001448266636, + "tokens_seen": 545733632 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004215245737211635, + "loss": 2.8951, + "theoretical_loss": 3.8783507372027977, + "tokens_seen": 545799168 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004215145436308927, + "loss": 2.7978, + "theoretical_loss": 3.8783013371720054, + "tokens_seen": 545864704 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004215045135406219, + "loss": 3.1933, + "theoretical_loss": 3.8782519447322086, + "tokens_seen": 545930240 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042149448345035105, + "loss": 2.7269, + "theoretical_loss": 3.8782025598813297, + "tokens_seen": 545995776 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042148445336008024, + "loss": 3.0344, + "theoretical_loss": 3.878153182617292, + "tokens_seen": 546061312 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042147442326980947, + "loss": 2.9469, + "theoretical_loss": 3.8781038129380203, + "tokens_seen": 546126848 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004214643931795386, + "loss": 2.9681, + "theoretical_loss": 3.8780544508414394, + "tokens_seen": 546192384 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042145436308926783, + "loss": 2.7063, + "theoretical_loss": 3.8780050963254746, + "tokens_seen": 546257920 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042144433299899696, + "loss": 2.8749, + "theoretical_loss": 3.877955749388052, + "tokens_seen": 546323456 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004214343029087262, + "loss": 2.9719, + "theoretical_loss": 3.8779064100270997, + "tokens_seen": 546388992 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004214242728184554, + "loss": 2.941, + "theoretical_loss": 3.877857078240546, + "tokens_seen": 546454528 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042141424272818456, + "loss": 3.0534, + "theoretical_loss": 3.8778077540263194, + "tokens_seen": 546520064 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042140421263791374, + "loss": 2.7779, + "theoretical_loss": 3.877758437382351, + "tokens_seen": 546585600 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004213941825476429, + "loss": 2.7718, + "theoretical_loss": 3.877709128306569, + "tokens_seen": 546651136 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004213841524573721, + "loss": 2.7369, + "theoretical_loss": 3.8776598267969073, + "tokens_seen": 546716672 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042137412236710134, + "loss": 2.6951, + "theoretical_loss": 3.877610532851297, + "tokens_seen": 546782208 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042136409227683046, + "loss": 3.0101, + "theoretical_loss": 3.8775612464676716, + "tokens_seen": 546847744 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004213540621865597, + "loss": 2.9433, + "theoretical_loss": 3.8775119676439647, + "tokens_seen": 546913280 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004213440320962889, + "loss": 2.6857, + "theoretical_loss": 3.877462696378111, + "tokens_seen": 546978816 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042133400200601806, + "loss": 2.8557, + "theoretical_loss": 3.877413432668046, + "tokens_seen": 547044352 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042132397191574724, + "loss": 3.0713, + "theoretical_loss": 3.8773641765117066, + "tokens_seen": 547109888 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004213139418254764, + "loss": 3.0662, + "theoretical_loss": 3.8773149279070296, + "tokens_seen": 547175424 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 622904, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.055859327316284, + "objective/train/theoretical_loss": 3.8772779964080595, + "objective/train/tokens_used": 567684576, + "theoretical_loss": 3.8772779964080595, + "tokens_seen": 547224576 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004213039117352056, + "loss": 2.6809, + "theoretical_loss": 3.8772656868519526, + "tokens_seen": 547240960 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042129388164493484, + "loss": 3.0167, + "theoretical_loss": 3.877216453344415, + "tokens_seen": 547306496 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042128385155466397, + "loss": 2.8911, + "theoretical_loss": 3.8771672273823556, + "tokens_seen": 547372032 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004212738214643932, + "loss": 3.198, + "theoretical_loss": 3.8771180089637154, + "tokens_seen": 547437568 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042126379137412233, + "loss": 2.8962, + "theoretical_loss": 3.8770687980864356, + "tokens_seen": 547503104 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042125376128385156, + "loss": 3.0193, + "theoretical_loss": 3.8770195947484583, + "tokens_seen": 547568640 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042124373119358075, + "loss": 2.8844, + "theoretical_loss": 3.8769703989477255, + "tokens_seen": 547634176 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004212337011033099, + "loss": 2.9321, + "theoretical_loss": 3.8769212106821813, + "tokens_seen": 547699712 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004212236710130391, + "loss": 2.9915, + "theoretical_loss": 3.8768720299497703, + "tokens_seen": 547765248 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004212136409227683, + "loss": 2.6956, + "theoretical_loss": 3.8768228567484377, + "tokens_seen": 547830784 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042120361083249747, + "loss": 2.4523, + "theoretical_loss": 3.8767736910761297, + "tokens_seen": 547896320 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004211935807422267, + "loss": 3.2263, + "theoretical_loss": 3.8767245329307927, + "tokens_seen": 547961856 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042118355065195583, + "loss": 2.8253, + "theoretical_loss": 3.8766753823103746, + "tokens_seen": 548027392 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042117352056168507, + "loss": 2.6039, + "theoretical_loss": 3.8766262392128237, + "tokens_seen": 548092928 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004211634904714143, + "loss": 2.9644, + "theoretical_loss": 3.876577103636089, + "tokens_seen": 548158464 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042115346038114343, + "loss": 2.9272, + "theoretical_loss": 3.8765279755781217, + "tokens_seen": 548224000 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042114343029087266, + "loss": 2.8769, + "theoretical_loss": 3.876478855036872, + "tokens_seen": 548289536 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004211334002006018, + "loss": 3.018, + "theoretical_loss": 3.876429742010291, + "tokens_seen": 548355072 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042112337011033103, + "loss": 3.134, + "theoretical_loss": 3.8763806364963314, + "tokens_seen": 548420608 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004211133400200602, + "loss": 2.9645, + "theoretical_loss": 3.876331538492947, + "tokens_seen": 548486144 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004211033099297894, + "loss": 2.9324, + "theoretical_loss": 3.8762824479980917, + "tokens_seen": 548551680 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042109327983951857, + "loss": 2.7315, + "theoretical_loss": 3.8762333650097203, + "tokens_seen": 548617216 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042108324974924775, + "loss": 2.7846, + "theoretical_loss": 3.8761842895257885, + "tokens_seen": 548682752 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042107321965897693, + "loss": 2.798, + "theoretical_loss": 3.8761352215442524, + "tokens_seen": 548748288 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042106318956870617, + "loss": 2.8967, + "theoretical_loss": 3.8760861610630695, + "tokens_seen": 548813824 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 623446, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.478111743927002, + "objective/train/theoretical_loss": 3.876049370623061, + "objective/train/tokens_used": 569322976, + "theoretical_loss": 3.876049370623061, + "tokens_seen": 548862976 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004210531594784353, + "loss": 3.0132, + "theoretical_loss": 3.8760371080801983, + "tokens_seen": 548879360 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042104312938816453, + "loss": 2.8714, + "theoretical_loss": 3.8759880625935974, + "tokens_seen": 548944896 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004210330992978937, + "loss": 3.1069, + "theoretical_loss": 3.8759390246012266, + "tokens_seen": 549010432 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004210230692076229, + "loss": 2.8727, + "theoretical_loss": 3.8758899941010463, + "tokens_seen": 549075968 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004210130391173521, + "loss": 2.7593, + "theoretical_loss": 3.8758409710910176, + "tokens_seen": 549141504 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042100300902708125, + "loss": 2.9946, + "theoretical_loss": 3.8757919555691025, + "tokens_seen": 549207040 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042099297893681044, + "loss": 2.9204, + "theoretical_loss": 3.875742947533264, + "tokens_seen": 549272576 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042098294884653967, + "loss": 2.8915, + "theoretical_loss": 3.8756939469814666, + "tokens_seen": 549338112 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004209729187562688, + "loss": 2.9225, + "theoretical_loss": 3.8756449539116735, + "tokens_seen": 549403648 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042096288866599803, + "loss": 3.0075, + "theoretical_loss": 3.8755959683218504, + "tokens_seen": 549469184 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042095285857572716, + "loss": 2.6968, + "theoretical_loss": 3.8755469902099633, + "tokens_seen": 549534720 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004209428284854564, + "loss": 2.8839, + "theoretical_loss": 3.875498019573979, + "tokens_seen": 549600256 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004209327983951856, + "loss": 3.1297, + "theoretical_loss": 3.875449056411866, + "tokens_seen": 549665792 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042092276830491476, + "loss": 2.8652, + "theoretical_loss": 3.875400100721592, + "tokens_seen": 549731328 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042091273821464394, + "loss": 3.0584, + "theoretical_loss": 3.8753511525011257, + "tokens_seen": 549796864 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004209027081243731, + "loss": 2.7467, + "theoretical_loss": 3.875302211748439, + "tokens_seen": 549862400 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004208926780341023, + "loss": 3.218, + "theoretical_loss": 3.8752532784615004, + "tokens_seen": 549927936 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042088264794383154, + "loss": 2.906, + "theoretical_loss": 3.8752043526382827, + "tokens_seen": 549993472 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042087261785356066, + "loss": 2.7583, + "theoretical_loss": 3.875155434276759, + "tokens_seen": 550059008 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004208625877632899, + "loss": 2.6835, + "theoretical_loss": 3.8751065233749005, + "tokens_seen": 550124544 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004208525576730191, + "loss": 2.6696, + "theoretical_loss": 3.8750576199306837, + "tokens_seen": 550190080 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042084252758274826, + "loss": 3.2374, + "theoretical_loss": 3.8750087239420807, + "tokens_seen": 550255616 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042083249749247744, + "loss": 2.8243, + "theoretical_loss": 3.8749598354070693, + "tokens_seen": 550321152 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004208224674022066, + "loss": 2.954, + "theoretical_loss": 3.8749109543236253, + "tokens_seen": 550386688 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004208124373119358, + "loss": 2.8086, + "theoretical_loss": 3.8748620806897254, + "tokens_seen": 550452224 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 624875, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.574239730834961, + "objective/train/theoretical_loss": 3.8748254303518475, + "objective/train/tokens_used": 570961376, + "theoretical_loss": 3.8748254303518475, + "tokens_seen": 550501376 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042080240722166504, + "loss": 3.106, + "theoretical_loss": 3.874813214503348, + "tokens_seen": 550517760 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042079237713139417, + "loss": 2.974, + "theoretical_loss": 3.8747643557624714, + "tokens_seen": 550583296 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004207823470411234, + "loss": 2.9065, + "theoretical_loss": 3.874715504465075, + "tokens_seen": 550648832 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042077231695085253, + "loss": 2.9074, + "theoretical_loss": 3.87466666060914, + "tokens_seen": 550714368 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042076228686058176, + "loss": 2.8743, + "theoretical_loss": 3.8746178241926468, + "tokens_seen": 550779904 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042075225677031095, + "loss": 3.0312, + "theoretical_loss": 3.8745689952135773, + "tokens_seen": 550845440 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004207422266800401, + "loss": 2.8656, + "theoretical_loss": 3.8745201736699144, + "tokens_seen": 550910976 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004207321965897693, + "loss": 3.0798, + "theoretical_loss": 3.8744713595596414, + "tokens_seen": 550976512 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004207221664994985, + "loss": 3.0155, + "theoretical_loss": 3.874422552880743, + "tokens_seen": 551042048 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042071213640922767, + "loss": 2.8655, + "theoretical_loss": 3.8743737536312035, + "tokens_seen": 551107584 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004207021063189569, + "loss": 2.895, + "theoretical_loss": 3.87432496180901, + "tokens_seen": 551173120 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042069207622868603, + "loss": 2.878, + "theoretical_loss": 3.8742761774121472, + "tokens_seen": 551238656 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042068204613841527, + "loss": 2.9071, + "theoretical_loss": 3.874227400438604, + "tokens_seen": 551304192 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042067201604814445, + "loss": 2.8582, + "theoretical_loss": 3.8741786308863677, + "tokens_seen": 551369728 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042066198595787363, + "loss": 2.8426, + "theoretical_loss": 3.8741298687534282, + "tokens_seen": 551435264 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004206519558676028, + "loss": 2.9456, + "theoretical_loss": 3.874081114037775, + "tokens_seen": 551500800 + }, + { + "epoch": 1.08, + "learning_rate": 0.000420641925777332, + "loss": 2.6779, + "theoretical_loss": 3.874032366737398, + "tokens_seen": 551566336 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004206318956870612, + "loss": 2.8382, + "theoretical_loss": 3.873983626850289, + "tokens_seen": 551631872 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004206218655967904, + "loss": 2.7917, + "theoretical_loss": 3.8739348943744396, + "tokens_seen": 551697408 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042061183550651954, + "loss": 2.7561, + "theoretical_loss": 3.8738861693078435, + "tokens_seen": 551762944 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042060180541624877, + "loss": 3.085, + "theoretical_loss": 3.8738374516484937, + "tokens_seen": 551828480 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004205917753259779, + "loss": 2.8885, + "theoretical_loss": 3.8737887413943852, + "tokens_seen": 551894016 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042058174523570713, + "loss": 2.8237, + "theoretical_loss": 3.8737400385435126, + "tokens_seen": 551959552 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004205717151454363, + "loss": 3.0146, + "theoretical_loss": 3.8736913430938724, + "tokens_seen": 552025088 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004205616850551655, + "loss": 2.8244, + "theoretical_loss": 3.8736426550434615, + "tokens_seen": 552090624 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 625640, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8349080085754395, + "objective/train/theoretical_loss": 3.8736061438601923, + "objective/train/tokens_used": 572599776, + "theoretical_loss": 3.8736061438601923, + "tokens_seen": 552139776 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004205516549648947, + "loss": 2.9482, + "theoretical_loss": 3.8735939743902765, + "tokens_seen": 552156160 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004205416248746239, + "loss": 2.7414, + "theoretical_loss": 3.873545301132317, + "tokens_seen": 552221696 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042053159478435304, + "loss": 2.9545, + "theoretical_loss": 3.873496635267581, + "tokens_seen": 552287232 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004205215646940823, + "loss": 2.6246, + "theoretical_loss": 3.8734479767940693, + "tokens_seen": 552352768 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004205115346038114, + "loss": 2.9388, + "theoretical_loss": 3.8733993257097827, + "tokens_seen": 552418304 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042050150451354064, + "loss": 3.0344, + "theoretical_loss": 3.8733506820127213, + "tokens_seen": 552483840 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004204914744232698, + "loss": 2.99, + "theoretical_loss": 3.8733020457008887, + "tokens_seen": 552549376 + }, + { + "epoch": 1.08, + "learning_rate": 0.000420481444332999, + "loss": 3.006, + "theoretical_loss": 3.8732534167722874, + "tokens_seen": 552614912 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004204714142427282, + "loss": 2.7293, + "theoretical_loss": 3.873204795224921, + "tokens_seen": 552680448 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042046138415245736, + "loss": 2.6337, + "theoretical_loss": 3.873156181056795, + "tokens_seen": 552745984 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042045135406218654, + "loss": 3.1811, + "theoretical_loss": 3.8731075742659136, + "tokens_seen": 552811520 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004204413239719158, + "loss": 3.0877, + "theoretical_loss": 3.8730589748502835, + "tokens_seen": 552877056 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004204312938816449, + "loss": 2.8474, + "theoretical_loss": 3.8730103828079114, + "tokens_seen": 552942592 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042042126379137414, + "loss": 3.0734, + "theoretical_loss": 3.8729617981368047, + "tokens_seen": 553008128 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004204112337011033, + "loss": 3.238, + "theoretical_loss": 3.872913220834973, + "tokens_seen": 553073664 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004204012036108325, + "loss": 2.9764, + "theoretical_loss": 3.872864650900424, + "tokens_seen": 553139200 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042039117352056174, + "loss": 2.9151, + "theoretical_loss": 3.8728160883311684, + "tokens_seen": 553204736 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042038114343029086, + "loss": 3.0027, + "theoretical_loss": 3.8727675331252174, + "tokens_seen": 553270272 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004203711133400201, + "loss": 2.6286, + "theoretical_loss": 3.872718985280582, + "tokens_seen": 553335808 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004203610832497493, + "loss": 3.0266, + "theoretical_loss": 3.8726704447952747, + "tokens_seen": 553401344 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042035105315947846, + "loss": 3.1303, + "theoretical_loss": 3.8726219116673084, + "tokens_seen": 553466880 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042034102306920764, + "loss": 2.82, + "theoretical_loss": 3.8725733858946967, + "tokens_seen": 553532416 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004203309929789368, + "loss": 2.9129, + "theoretical_loss": 3.8725248674754553, + "tokens_seen": 553597952 + }, + { + "epoch": 1.08, + "learning_rate": 0.000420320962888666, + "loss": 2.728, + "theoretical_loss": 3.8724763564075984, + "tokens_seen": 553663488 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042031093279839524, + "loss": 2.947, + "theoretical_loss": 3.8724278526891425, + "tokens_seen": 553729024 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 626970, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.11159610748291, + "objective/train/theoretical_loss": 3.872391479722152, + "objective/train/tokens_used": 574238176, + "theoretical_loss": 3.872391479722152, + "tokens_seen": 553778176 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042030090270812437, + "loss": 2.9388, + "theoretical_loss": 3.872379356318105, + "tokens_seen": 553794560 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004202908726178536, + "loss": 2.8643, + "theoretical_loss": 3.8723308672925025, + "tokens_seen": 553860096 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042028084252758273, + "loss": 2.8689, + "theoretical_loss": 3.8722823856103554, + "tokens_seen": 553925632 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042027081243731196, + "loss": 3.1274, + "theoretical_loss": 3.8722339112696815, + "tokens_seen": 553991168 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042026078234704115, + "loss": 2.9725, + "theoretical_loss": 3.8721854442685, + "tokens_seen": 554056704 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004202507522567703, + "loss": 2.8734, + "theoretical_loss": 3.8721369846048344, + "tokens_seen": 554122240 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004202407221664995, + "loss": 2.7302, + "theoretical_loss": 3.8720885322767042, + "tokens_seen": 554187776 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004202306920762287, + "loss": 2.8491, + "theoretical_loss": 3.872040087282132, + "tokens_seen": 554253312 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042022066198595787, + "loss": 2.9801, + "theoretical_loss": 3.8719916496191407, + "tokens_seen": 554318848 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004202106318956871, + "loss": 2.8109, + "theoretical_loss": 3.871943219285755, + "tokens_seen": 554384384 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042020060180541623, + "loss": 3.0374, + "theoretical_loss": 3.8718947962799986, + "tokens_seen": 554449920 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042019057171514547, + "loss": 2.6717, + "theoretical_loss": 3.871846380599897, + "tokens_seen": 554515456 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042018054162487465, + "loss": 2.6651, + "theoretical_loss": 3.8717979722434777, + "tokens_seen": 554580992 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042017051153460383, + "loss": 2.9567, + "theoretical_loss": 3.871749571208766, + "tokens_seen": 554646528 + }, + { + "epoch": 1.08, + "learning_rate": 0.000420160481444333, + "loss": 2.9258, + "theoretical_loss": 3.87170117749379, + "tokens_seen": 554712064 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004201504513540622, + "loss": 2.8143, + "theoretical_loss": 3.8716527910965786, + "tokens_seen": 554777600 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004201404212637914, + "loss": 3.085, + "theoretical_loss": 3.8716044120151603, + "tokens_seen": 554843136 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004201303911735206, + "loss": 2.8709, + "theoretical_loss": 3.8715560402475657, + "tokens_seen": 554908672 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042012036108324974, + "loss": 3.0459, + "theoretical_loss": 3.8715076757918254, + "tokens_seen": 554974208 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042011033099297897, + "loss": 3.0664, + "theoretical_loss": 3.8714593186459703, + "tokens_seen": 555039744 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004201003009027081, + "loss": 2.8809, + "theoretical_loss": 3.871410968808034, + "tokens_seen": 555105280 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042009027081243733, + "loss": 2.9343, + "theoretical_loss": 3.8713626262760474, + "tokens_seen": 555170816 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004200802407221665, + "loss": 2.6519, + "theoretical_loss": 3.871314291048046, + "tokens_seen": 555236352 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004200702106318957, + "loss": 2.856, + "theoretical_loss": 3.871265963122064, + "tokens_seen": 555301888 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004200601805416249, + "loss": 2.5503, + "theoretical_loss": 3.8712176424961373, + "tokens_seen": 555367424 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 627698, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.411449432373047, + "objective/train/theoretical_loss": 3.871181406816171, + "objective/train/tokens_used": 575876576, + "theoretical_loss": 3.871181406816171, + "tokens_seen": 555416576 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004200501504513541, + "loss": 3.0096, + "theoretical_loss": 3.8711693291683003, + "tokens_seen": 555432960 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042004012036108324, + "loss": 3.0007, + "theoretical_loss": 3.871121023136591, + "tokens_seen": 555498496 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004200300902708125, + "loss": 2.7156, + "theoretical_loss": 3.871072724399047, + "tokens_seen": 555564032 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004200200601805416, + "loss": 2.8436, + "theoretical_loss": 3.871024432953706, + "tokens_seen": 555629568 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042001003009027084, + "loss": 2.6881, + "theoretical_loss": 3.870976148798608, + "tokens_seen": 555695104 + }, + { + "epoch": 1.08, + "learning_rate": 0.00042, + "loss": 3.0327, + "theoretical_loss": 3.8709278719317917, + "tokens_seen": 555760640 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004199899699097292, + "loss": 2.7813, + "theoretical_loss": 3.8708796023512986, + "tokens_seen": 555826176 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004199799398194584, + "loss": 2.7957, + "theoretical_loss": 3.8708313400551697, + "tokens_seen": 555891712 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041996990972918756, + "loss": 2.955, + "theoretical_loss": 3.8707830850414475, + "tokens_seen": 555957248 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041995987963891674, + "loss": 2.7701, + "theoretical_loss": 3.8707348373081745, + "tokens_seen": 556022784 + }, + { + "epoch": 1.08, + "learning_rate": 0.000419949849548646, + "loss": 2.7127, + "theoretical_loss": 3.870686596853395, + "tokens_seen": 556088320 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004199398194583751, + "loss": 2.9101, + "theoretical_loss": 3.870638363675152, + "tokens_seen": 556153856 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041992978936810434, + "loss": 3.0043, + "theoretical_loss": 3.870590137771492, + "tokens_seen": 556219392 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041991975927783347, + "loss": 2.8558, + "theoretical_loss": 3.870541919140461, + "tokens_seen": 556284928 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004199097291875627, + "loss": 2.9357, + "theoretical_loss": 3.8704937077801045, + "tokens_seen": 556350464 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004198996990972919, + "loss": 3.0304, + "theoretical_loss": 3.8704455036884706, + "tokens_seen": 556416000 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041988966900702106, + "loss": 2.5689, + "theoretical_loss": 3.8703973068636075, + "tokens_seen": 556481536 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041987963891675025, + "loss": 2.8771, + "theoretical_loss": 3.870349117303564, + "tokens_seen": 556547072 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004198696088264795, + "loss": 2.7484, + "theoretical_loss": 3.87030093500639, + "tokens_seen": 556612608 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004198595787362086, + "loss": 2.9133, + "theoretical_loss": 3.8702527599701355, + "tokens_seen": 556678144 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041984954864593784, + "loss": 2.7249, + "theoretical_loss": 3.8702045921928523, + "tokens_seen": 556743680 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041983951855566697, + "loss": 2.9768, + "theoretical_loss": 3.8701564316725916, + "tokens_seen": 556809216 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004198294884653962, + "loss": 2.7145, + "theoretical_loss": 3.8701082784074075, + "tokens_seen": 556874752 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004198194583751254, + "loss": 3.0215, + "theoretical_loss": 3.870060132395351, + "tokens_seen": 556940288 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041980942828485457, + "loss": 2.9044, + "theoretical_loss": 3.8700119936344786, + "tokens_seen": 557005824 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 628817, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7914021015167236, + "objective/train/theoretical_loss": 3.8699758943212434, + "objective/train/tokens_used": 577514976, + "theoretical_loss": 3.8699758943212434, + "tokens_seen": 557054976 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041979939819458375, + "loss": 2.962, + "theoretical_loss": 3.8699638621228445, + "tokens_seen": 557071360 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041978936810431293, + "loss": 2.7353, + "theoretical_loss": 3.8699157378585043, + "tokens_seen": 557136896 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004197793380140421, + "loss": 2.9422, + "theoretical_loss": 3.869867620839514, + "tokens_seen": 557202432 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041976930792377135, + "loss": 3.1109, + "theoretical_loss": 3.8698195110639317, + "tokens_seen": 557267968 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041975927783350047, + "loss": 2.8179, + "theoretical_loss": 3.8697714085298145, + "tokens_seen": 557333504 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004197492477432297, + "loss": 2.9445, + "theoretical_loss": 3.8697233132352222, + "tokens_seen": 557399040 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041973921765295883, + "loss": 2.9836, + "theoretical_loss": 3.869675225178213, + "tokens_seen": 557464576 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041972918756268807, + "loss": 2.9379, + "theoretical_loss": 3.8696271443568477, + "tokens_seen": 557530112 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041971915747241725, + "loss": 2.803, + "theoretical_loss": 3.869579070769187, + "tokens_seen": 557595648 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041970912738214643, + "loss": 3.052, + "theoretical_loss": 3.8695310044132927, + "tokens_seen": 557661184 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004196990972918756, + "loss": 2.9134, + "theoretical_loss": 3.8694829452872277, + "tokens_seen": 557726720 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041968906720160485, + "loss": 2.7973, + "theoretical_loss": 3.8694348933890543, + "tokens_seen": 557792256 + }, + { + "epoch": 1.08, + "learning_rate": 0.000419679037111334, + "loss": 3.0926, + "theoretical_loss": 3.8693868487168372, + "tokens_seen": 557857792 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004196690070210632, + "loss": 3.0218, + "theoretical_loss": 3.8693388112686407, + "tokens_seen": 557923328 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004196589769307924, + "loss": 2.981, + "theoretical_loss": 3.8692907810425305, + "tokens_seen": 557988864 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004196489468405216, + "loss": 3.0689, + "theoretical_loss": 3.869242758036573, + "tokens_seen": 558054400 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004196389167502508, + "loss": 2.6322, + "theoretical_loss": 3.8691947422488333, + "tokens_seen": 558119936 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041962888665997994, + "loss": 2.9541, + "theoretical_loss": 3.8691467336773817, + "tokens_seen": 558185472 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041961885656970917, + "loss": 2.9968, + "theoretical_loss": 3.8690987323202846, + "tokens_seen": 558251008 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004196088264794383, + "loss": 3.0707, + "theoretical_loss": 3.8690507381756123, + "tokens_seen": 558316544 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041959879638916753, + "loss": 2.7787, + "theoretical_loss": 3.8690027512414344, + "tokens_seen": 558382080 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004195887662988967, + "loss": 2.7311, + "theoretical_loss": 3.868954771515821, + "tokens_seen": 558447616 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004195787362086259, + "loss": 2.9757, + "theoretical_loss": 3.8689067989968438, + "tokens_seen": 558513152 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004195687061183551, + "loss": 3.019, + "theoretical_loss": 3.868858833682575, + "tokens_seen": 558578688 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004195586760280843, + "loss": 2.808, + "theoretical_loss": 3.868810875571088, + "tokens_seen": 558644224 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 629246, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.687079429626465, + "objective/train/theoretical_loss": 3.8687749117131385, + "objective/train/tokens_used": 579153376, + "theoretical_loss": 3.8687749117131385, + "tokens_seen": 558693376 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041954864593781344, + "loss": 2.8716, + "theoretical_loss": 3.868762924660455, + "tokens_seen": 558709760 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004195386158475427, + "loss": 2.9467, + "theoretical_loss": 3.8687149809487518, + "tokens_seen": 558775296 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004195285857572718, + "loss": 2.692, + "theoretical_loss": 3.8686670444340527, + "tokens_seen": 558840832 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041951855566700104, + "loss": 2.7968, + "theoretical_loss": 3.8686191151144333, + "tokens_seen": 558906368 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004195085255767302, + "loss": 2.6929, + "theoretical_loss": 3.868571192987971, + "tokens_seen": 558971904 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004194984954864594, + "loss": 3.0387, + "theoretical_loss": 3.8685232780527423, + "tokens_seen": 559037440 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004194884653961886, + "loss": 2.7125, + "theoretical_loss": 3.868475370306826, + "tokens_seen": 559102976 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041947843530591776, + "loss": 2.8212, + "theoretical_loss": 3.8684274697483003, + "tokens_seen": 559168512 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041946840521564694, + "loss": 2.8875, + "theoretical_loss": 3.8683795763752444, + "tokens_seen": 559234048 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004194583751253762, + "loss": 3.2464, + "theoretical_loss": 3.8683316901857396, + "tokens_seen": 559299584 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004194483450351053, + "loss": 2.8747, + "theoretical_loss": 3.868283811177866, + "tokens_seen": 559365120 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041943831494483454, + "loss": 2.6532, + "theoretical_loss": 3.868235939349706, + "tokens_seen": 559430656 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041942828485456367, + "loss": 2.9353, + "theoretical_loss": 3.868188074699341, + "tokens_seen": 559496192 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004194182547642929, + "loss": 2.8988, + "theoretical_loss": 3.8681402172248553, + "tokens_seen": 559561728 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004194082246740221, + "loss": 2.664, + "theoretical_loss": 3.8680923669243326, + "tokens_seen": 559627264 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041939819458375126, + "loss": 2.8466, + "theoretical_loss": 3.8680445237958567, + "tokens_seen": 559692800 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041938816449348045, + "loss": 3.101, + "theoretical_loss": 3.8679966878375143, + "tokens_seen": 559758336 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004193781344032097, + "loss": 2.7125, + "theoretical_loss": 3.867948859047391, + "tokens_seen": 559823872 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004193681043129388, + "loss": 2.7661, + "theoretical_loss": 3.8679010374235734, + "tokens_seen": 559889408 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041935807422266804, + "loss": 2.7567, + "theoretical_loss": 3.8678532229641496, + "tokens_seen": 559954944 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041934804413239717, + "loss": 2.8698, + "theoretical_loss": 3.867805415667208, + "tokens_seen": 560020480 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004193380140421264, + "loss": 2.8603, + "theoretical_loss": 3.867757615530837, + "tokens_seen": 560086016 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004193279839518556, + "loss": 2.8624, + "theoretical_loss": 3.8677098225531266, + "tokens_seen": 560151552 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041931795386158477, + "loss": 3.0192, + "theoretical_loss": 3.867662036732168, + "tokens_seen": 560217088 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041930792377131395, + "loss": 2.552, + "theoretical_loss": 3.8676142580660517, + "tokens_seen": 560282624 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 630481, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8848719596862793, + "objective/train/theoretical_loss": 3.867578428760682, + "objective/train/tokens_used": 580791776, + "theoretical_loss": 3.867578428760682, + "tokens_seen": 560331776 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041929789368104313, + "loss": 2.9827, + "theoretical_loss": 3.8675664865528705, + "tokens_seen": 560348160 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004192878635907723, + "loss": 2.9074, + "theoretical_loss": 3.8675187221907166, + "tokens_seen": 560413696 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041927783350050155, + "loss": 2.7092, + "theoretical_loss": 3.867470964977683, + "tokens_seen": 560479232 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041926780341023067, + "loss": 2.9471, + "theoretical_loss": 3.867423214911865, + "tokens_seen": 560544768 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004192577733199599, + "loss": 2.9313, + "theoretical_loss": 3.867375471991357, + "tokens_seen": 560610304 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041924774322968904, + "loss": 2.7035, + "theoretical_loss": 3.8673277362142544, + "tokens_seen": 560675840 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041923771313941827, + "loss": 2.9026, + "theoretical_loss": 3.867280007578654, + "tokens_seen": 560741376 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041922768304914745, + "loss": 2.7299, + "theoretical_loss": 3.867232286082653, + "tokens_seen": 560806912 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041921765295887663, + "loss": 3.0076, + "theoretical_loss": 3.867184571724349, + "tokens_seen": 560872448 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004192076228686058, + "loss": 2.8349, + "theoretical_loss": 3.867136864501841, + "tokens_seen": 560937984 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041919759277833505, + "loss": 3.0804, + "theoretical_loss": 3.867089164413228, + "tokens_seen": 561003520 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004191875626880642, + "loss": 2.8924, + "theoretical_loss": 3.8670414714566093, + "tokens_seen": 561069056 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004191775325977934, + "loss": 2.9753, + "theoretical_loss": 3.866993785630087, + "tokens_seen": 561134592 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041916750250752254, + "loss": 2.901, + "theoretical_loss": 3.8669461069317617, + "tokens_seen": 561200128 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004191574724172518, + "loss": 2.9177, + "theoretical_loss": 3.866898435359736, + "tokens_seen": 561265664 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041914744232698095, + "loss": 2.7428, + "theoretical_loss": 3.866850770912113, + "tokens_seen": 561331200 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041913741223671014, + "loss": 2.6984, + "theoretical_loss": 3.8668031135869962, + "tokens_seen": 561396736 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004191273821464393, + "loss": 3.0854, + "theoretical_loss": 3.86675546338249, + "tokens_seen": 561462272 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004191173520561685, + "loss": 2.815, + "theoretical_loss": 3.866707820296699, + "tokens_seen": 561527808 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004191073219658977, + "loss": 2.8582, + "theoretical_loss": 3.8666601843277304, + "tokens_seen": 561593344 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004190972918756269, + "loss": 2.9834, + "theoretical_loss": 3.866612555473689, + "tokens_seen": 561658880 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041908726178535604, + "loss": 2.9236, + "theoretical_loss": 3.8665649337326835, + "tokens_seen": 561724416 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004190772316950853, + "loss": 2.8714, + "theoretical_loss": 3.8665173191028215, + "tokens_seen": 561789952 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004190672016048144, + "loss": 2.7921, + "theoretical_loss": 3.8664697115822118, + "tokens_seen": 561855488 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041905717151454364, + "loss": 2.8708, + "theoretical_loss": 3.866422111168964, + "tokens_seen": 561921024 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 630917, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.474799871444702, + "objective/train/theoretical_loss": 3.866386415522097, + "objective/train/tokens_used": 582430176, + "theoretical_loss": 3.866386415522097, + "tokens_seen": 561970176 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004190471414242728, + "loss": 2.7912, + "theoretical_loss": 3.8663745178611872, + "tokens_seen": 561986560 + }, + { + "epoch": 1.08, + "learning_rate": 0.000419037111334002, + "loss": 2.9873, + "theoretical_loss": 3.866326931656994, + "tokens_seen": 562052096 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004190270812437312, + "loss": 2.8812, + "theoretical_loss": 3.8662793525544954, + "tokens_seen": 562117632 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004190170511534604, + "loss": 2.6993, + "theoretical_loss": 3.8662317805518036, + "tokens_seen": 562183168 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041900702106318954, + "loss": 3.0518, + "theoretical_loss": 3.8661842156470314, + "tokens_seen": 562248704 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004189969909729188, + "loss": 2.944, + "theoretical_loss": 3.866136657838293, + "tokens_seen": 562314240 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004189869608826479, + "loss": 2.7422, + "theoretical_loss": 3.866089107123703, + "tokens_seen": 562379776 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041897693079237714, + "loss": 2.8368, + "theoretical_loss": 3.8660415635013767, + "tokens_seen": 562445312 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004189669007021063, + "loss": 2.9691, + "theoretical_loss": 3.86599402696943, + "tokens_seen": 562510848 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004189568706118355, + "loss": 2.8235, + "theoretical_loss": 3.8659464975259787, + "tokens_seen": 562576384 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004189468405215647, + "loss": 2.9171, + "theoretical_loss": 3.865898975169142, + "tokens_seen": 562641920 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041893681043129387, + "loss": 3.0027, + "theoretical_loss": 3.865851459897037, + "tokens_seen": 562707456 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041892678034102305, + "loss": 2.9641, + "theoretical_loss": 3.865803951707782, + "tokens_seen": 562772992 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004189167502507523, + "loss": 2.9334, + "theoretical_loss": 3.8657564505994975, + "tokens_seen": 562838528 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041890672016048146, + "loss": 2.9198, + "theoretical_loss": 3.8657089565703036, + "tokens_seen": 562904064 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041889669007021065, + "loss": 3.1526, + "theoretical_loss": 3.865661469618321, + "tokens_seen": 562969600 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004188866599799399, + "loss": 3.0167, + "theoretical_loss": 3.8656139897416715, + "tokens_seen": 563035136 + }, + { + "epoch": 1.08, + "learning_rate": 0.000418876629889669, + "loss": 2.8326, + "theoretical_loss": 3.865566516938478, + "tokens_seen": 563100672 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041886659979939824, + "loss": 2.9995, + "theoretical_loss": 3.8655190512068627, + "tokens_seen": 563166208 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041885656970912737, + "loss": 2.9358, + "theoretical_loss": 3.8654715925449503, + "tokens_seen": 563231744 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004188465396188566, + "loss": 2.7895, + "theoretical_loss": 3.8654241409508656, + "tokens_seen": 563297280 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004188365095285858, + "loss": 2.9822, + "theoretical_loss": 3.865376696422733, + "tokens_seen": 563362816 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041882647943831497, + "loss": 3.0189, + "theoretical_loss": 3.865329258958679, + "tokens_seen": 563428352 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041881644934804415, + "loss": 2.7797, + "theoretical_loss": 3.8652818285568307, + "tokens_seen": 563493888 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041880641925777333, + "loss": 2.861, + "theoretical_loss": 3.8652344052153147, + "tokens_seen": 563559424 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 632338, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9370810985565186, + "objective/train/theoretical_loss": 3.865198842341395, + "objective/train/tokens_used": 584068576, + "theoretical_loss": 3.865198842341395, + "tokens_seen": 563608576 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004187963891675025, + "loss": 2.935, + "theoretical_loss": 3.86518698893226, + "tokens_seen": 563624960 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041878635907723175, + "loss": 2.9378, + "theoretical_loss": 3.8651395797057946, + "tokens_seen": 563690496 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004187763289869609, + "loss": 2.7841, + "theoretical_loss": 3.8650921775340494, + "tokens_seen": 563756032 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004187662988966901, + "loss": 2.921, + "theoretical_loss": 3.8650447824151533, + "tokens_seen": 563821568 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041875626880641924, + "loss": 2.959, + "theoretical_loss": 3.864997394347238, + "tokens_seen": 563887104 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041874623871614847, + "loss": 2.6183, + "theoretical_loss": 3.864950013328435, + "tokens_seen": 563952640 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041873620862587765, + "loss": 2.9683, + "theoretical_loss": 3.864902639356877, + "tokens_seen": 564018176 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041872617853560683, + "loss": 3.0088, + "theoretical_loss": 3.864855272430697, + "tokens_seen": 564083712 + }, + { + "epoch": 1.08, + "learning_rate": 0.000418716148445336, + "loss": 3.0662, + "theoretical_loss": 3.864807912548029, + "tokens_seen": 564149248 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041870611835506525, + "loss": 2.9283, + "theoretical_loss": 3.8647605597070074, + "tokens_seen": 564214784 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004186960882647944, + "loss": 2.8571, + "theoretical_loss": 3.864713213905768, + "tokens_seen": 564280320 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004186860581745236, + "loss": 2.82, + "theoretical_loss": 3.864665875142446, + "tokens_seen": 564345856 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041867602808425274, + "loss": 3.119, + "theoretical_loss": 3.8646185434151787, + "tokens_seen": 564411392 + }, + { + "epoch": 1.08, + "learning_rate": 0.000418665997993982, + "loss": 3.0035, + "theoretical_loss": 3.864571218722103, + "tokens_seen": 564476928 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041865596790371115, + "loss": 2.8305, + "theoretical_loss": 3.8645239010613572, + "tokens_seen": 564542464 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041864593781344034, + "loss": 2.8717, + "theoretical_loss": 3.8644765904310803, + "tokens_seen": 564608000 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004186359077231695, + "loss": 2.7008, + "theoretical_loss": 3.864429286829412, + "tokens_seen": 564673536 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004186258776328987, + "loss": 2.8845, + "theoretical_loss": 3.8643819902544925, + "tokens_seen": 564739072 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004186158475426279, + "loss": 2.9494, + "theoretical_loss": 3.864334700704462, + "tokens_seen": 564804608 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004186058174523571, + "loss": 2.9526, + "theoretical_loss": 3.8642874181774634, + "tokens_seen": 564870144 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041859578736208624, + "loss": 2.8049, + "theoretical_loss": 3.8642401426716377, + "tokens_seen": 564935680 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004185857572718155, + "loss": 2.703, + "theoretical_loss": 3.8641928741851292, + "tokens_seen": 565001216 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004185757271815446, + "loss": 3.0939, + "theoretical_loss": 3.864145612716081, + "tokens_seen": 565066752 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041856569709127384, + "loss": 2.748, + "theoretical_loss": 3.8640983582626385, + "tokens_seen": 565132288 + }, + { + "epoch": 1.08, + "learning_rate": 0.000418555667001003, + "loss": 3.0962, + "theoretical_loss": 3.8640511108229454, + "tokens_seen": 565197824 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 633117, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0730414390563965, + "objective/train/theoretical_loss": 3.864015679844834, + "objective/train/tokens_used": 585706976, + "theoretical_loss": 3.864015679844834, + "tokens_seen": 565246976 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004185456369107322, + "loss": 3.0086, + "theoretical_loss": 3.8640038703951483, + "tokens_seen": 565263360 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004185356068204614, + "loss": 2.9326, + "theoretical_loss": 3.8639566369773943, + "tokens_seen": 565328896 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004185255767301906, + "loss": 2.6394, + "theoretical_loss": 3.86390941056783, + "tokens_seen": 565394432 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041851554663991974, + "loss": 2.9686, + "theoretical_loss": 3.8638621911646043, + "tokens_seen": 565459968 + }, + { + "epoch": 1.08, + "learning_rate": 0.000418505516549649, + "loss": 2.9378, + "theoretical_loss": 3.863814978765865, + "tokens_seen": 565525504 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004184954864593781, + "loss": 2.7113, + "theoretical_loss": 3.8637677733697617, + "tokens_seen": 565591040 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041848545636910734, + "loss": 2.9256, + "theoretical_loss": 3.8637205749744448, + "tokens_seen": 565656576 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004184754262788365, + "loss": 2.8471, + "theoretical_loss": 3.8636733835780652, + "tokens_seen": 565722112 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004184653961885657, + "loss": 2.896, + "theoretical_loss": 3.863626199178774, + "tokens_seen": 565787648 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004184553660982949, + "loss": 2.755, + "theoretical_loss": 3.863579021774724, + "tokens_seen": 565853184 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041844533600802407, + "loss": 2.9088, + "theoretical_loss": 3.8635318513640677, + "tokens_seen": 565918720 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041843530591775325, + "loss": 2.8796, + "theoretical_loss": 3.863484687944959, + "tokens_seen": 565984256 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004184252758274825, + "loss": 2.6829, + "theoretical_loss": 3.863437531515552, + "tokens_seen": 566049792 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004184152457372116, + "loss": 3.0297, + "theoretical_loss": 3.8633903820740016, + "tokens_seen": 566115328 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041840521564694085, + "loss": 2.7752, + "theoretical_loss": 3.863343239618464, + "tokens_seen": 566180864 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041839518555666997, + "loss": 3.0822, + "theoretical_loss": 3.863296104147096, + "tokens_seen": 566246400 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004183851554663992, + "loss": 2.9419, + "theoretical_loss": 3.8632489756580535, + "tokens_seen": 566311936 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004183751253761284, + "loss": 3.0283, + "theoretical_loss": 3.8632018541494952, + "tokens_seen": 566377472 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041836509528585757, + "loss": 2.9068, + "theoretical_loss": 3.8631547396195796, + "tokens_seen": 566443008 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041835506519558675, + "loss": 3.0821, + "theoretical_loss": 3.863107632066466, + "tokens_seen": 566508544 + }, + { + "epoch": 1.08, + "learning_rate": 0.000418345035105316, + "loss": 2.8347, + "theoretical_loss": 3.8630605314883137, + "tokens_seen": 566574080 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004183350050150451, + "loss": 2.8839, + "theoretical_loss": 3.863013437883284, + "tokens_seen": 566639616 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041832497492477435, + "loss": 2.713, + "theoretical_loss": 3.862966351249538, + "tokens_seen": 566705152 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004183149448345035, + "loss": 2.9915, + "theoretical_loss": 3.8629192715852376, + "tokens_seen": 566770688 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004183049147442327, + "loss": 2.868, + "theoretical_loss": 3.862872198888546, + "tokens_seen": 566836224 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 634194, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.527719020843506, + "objective/train/theoretical_loss": 3.8628368989374158, + "objective/train/tokens_used": 587345376, + "theoretical_loss": 3.8628368989374158, + "tokens_seen": 566885376 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004182948846539619, + "loss": 2.473, + "theoretical_loss": 3.862825133157626, + "tokens_seen": 566901760 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004182848545636911, + "loss": 2.9303, + "theoretical_loss": 3.8627780743906426, + "tokens_seen": 566967296 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041827482447342025, + "loss": 2.9108, + "theoretical_loss": 3.862731022585759, + "tokens_seen": 567032832 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041826479438314944, + "loss": 2.8899, + "theoretical_loss": 3.862683977741143, + "tokens_seen": 567098368 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004182547642928786, + "loss": 2.7323, + "theoretical_loss": 3.8626369398549585, + "tokens_seen": 567163904 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041824473420260785, + "loss": 2.9033, + "theoretical_loss": 3.862589908925374, + "tokens_seen": 567229440 + }, + { + "epoch": 1.08, + "learning_rate": 0.000418234704112337, + "loss": 3.1807, + "theoretical_loss": 3.8625428849505563, + "tokens_seen": 567294976 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004182246740220662, + "loss": 2.795, + "theoretical_loss": 3.862495867928674, + "tokens_seen": 567360512 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004182146439317954, + "loss": 3.0679, + "theoretical_loss": 3.8624488578578964, + "tokens_seen": 567426048 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004182046138415246, + "loss": 2.8983, + "theoretical_loss": 3.8624018547363925, + "tokens_seen": 567491584 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041819458375125376, + "loss": 2.9737, + "theoretical_loss": 3.862354858562333, + "tokens_seen": 567557120 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041818455366098294, + "loss": 2.8067, + "theoretical_loss": 3.862307869333889, + "tokens_seen": 567622656 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004181745235707121, + "loss": 2.8586, + "theoretical_loss": 3.8622608870492323, + "tokens_seen": 567688192 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041816449348044136, + "loss": 2.8927, + "theoretical_loss": 3.862213911706535, + "tokens_seen": 567753728 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041815446339017054, + "loss": 2.7804, + "theoretical_loss": 3.862166943303971, + "tokens_seen": 567819264 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004181444332998997, + "loss": 2.6391, + "theoretical_loss": 3.862119981839713, + "tokens_seen": 567884800 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004181344032096289, + "loss": 2.6862, + "theoretical_loss": 3.8620730273119364, + "tokens_seen": 567950336 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004181243731193581, + "loss": 2.8688, + "theoretical_loss": 3.862026079718816, + "tokens_seen": 568015872 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004181143430290873, + "loss": 3.0498, + "theoretical_loss": 3.8619791390585285, + "tokens_seen": 568081408 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041810431293881644, + "loss": 2.8153, + "theoretical_loss": 3.8619322053292495, + "tokens_seen": 568146944 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004180942828485457, + "loss": 2.723, + "theoretical_loss": 3.8618852785291566, + "tokens_seen": 568212480 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004180842527582748, + "loss": 2.8977, + "theoretical_loss": 3.8618383586564278, + "tokens_seen": 568278016 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041807422266800404, + "loss": 2.8766, + "theoretical_loss": 3.861791445709242, + "tokens_seen": 568343552 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004180641925777332, + "loss": 2.8196, + "theoretical_loss": 3.8617445396857786, + "tokens_seen": 568409088 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004180541624874624, + "loss": 2.5105, + "theoretical_loss": 3.861697640584217, + "tokens_seen": 568474624 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 634895, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1014013290405273, + "objective/train/theoretical_loss": 3.8616624707994496, + "objective/train/tokens_used": 588983776, + "theoretical_loss": 3.8616624707994496, + "tokens_seen": 568523776 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004180441323971916, + "loss": 2.6694, + "theoretical_loss": 3.861650748402738, + "tokens_seen": 568540160 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004180341023069208, + "loss": 2.9858, + "theoretical_loss": 3.861603863139524, + "tokens_seen": 568605696 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041802407221664994, + "loss": 2.4696, + "theoretical_loss": 3.861556984792756, + "tokens_seen": 568671232 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004180140421263792, + "loss": 2.7752, + "theoretical_loss": 3.861510113360618, + "tokens_seen": 568736768 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004180040120361083, + "loss": 2.6513, + "theoretical_loss": 3.861463248841292, + "tokens_seen": 568802304 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041799398194583754, + "loss": 2.6531, + "theoretical_loss": 3.861416391232963, + "tokens_seen": 568867840 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004179839518555667, + "loss": 2.8723, + "theoretical_loss": 3.8613695405338158, + "tokens_seen": 568933376 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004179739217652959, + "loss": 2.6643, + "theoretical_loss": 3.861322696742036, + "tokens_seen": 568998912 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004179638916750251, + "loss": 2.8317, + "theoretical_loss": 3.8612758598558097, + "tokens_seen": 569064448 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041795386158475427, + "loss": 2.9949, + "theoretical_loss": 3.8612290298733236, + "tokens_seen": 569129984 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041794383149448345, + "loss": 2.7817, + "theoretical_loss": 3.8611822067927655, + "tokens_seen": 569195520 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004179338014042127, + "loss": 2.8349, + "theoretical_loss": 3.861135390612324, + "tokens_seen": 569261056 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004179237713139418, + "loss": 2.6289, + "theoretical_loss": 3.8610885813301876, + "tokens_seen": 569326592 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041791374122367105, + "loss": 2.9583, + "theoretical_loss": 3.8610417789445464, + "tokens_seen": 569392128 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041790371113340017, + "loss": 2.8696, + "theoretical_loss": 3.86099498345359, + "tokens_seen": 569457664 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004178936810431294, + "loss": 2.7377, + "theoretical_loss": 3.86094819485551, + "tokens_seen": 569523200 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004178836509528586, + "loss": 2.5355, + "theoretical_loss": 3.8609014131484978, + "tokens_seen": 569588736 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041787362086258777, + "loss": 2.7731, + "theoretical_loss": 3.860854638330746, + "tokens_seen": 569654272 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041786359077231695, + "loss": 2.8211, + "theoretical_loss": 3.860807870400447, + "tokens_seen": 569719808 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004178535606820462, + "loss": 2.9783, + "theoretical_loss": 3.8607611093557956, + "tokens_seen": 569785344 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004178435305917753, + "loss": 2.6345, + "theoretical_loss": 3.860714355194986, + "tokens_seen": 569850880 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041783350050150455, + "loss": 2.885, + "theoretical_loss": 3.8606676079162128, + "tokens_seen": 569916416 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004178234704112337, + "loss": 3.0496, + "theoretical_loss": 3.860620867517672, + "tokens_seen": 569981952 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004178134403209629, + "loss": 2.9936, + "theoretical_loss": 3.86057413399756, + "tokens_seen": 570047488 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004178034102306921, + "loss": 2.5621, + "theoretical_loss": 3.8605274073540743, + "tokens_seen": 570113024 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 635428, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.758199691772461, + "objective/train/theoretical_loss": 3.860492366883161, + "objective/train/tokens_used": 590622176, + "theoretical_loss": 3.860492366883161, + "tokens_seen": 570162176 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004177933801404213, + "loss": 2.7609, + "theoretical_loss": 3.8604806875854116, + "tokens_seen": 570178560 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041778335005015045, + "loss": 2.6497, + "theoretical_loss": 3.8604339746897725, + "tokens_seen": 570244096 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041777331995987964, + "loss": 2.9305, + "theoretical_loss": 3.860387268665354, + "tokens_seen": 570309632 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004177632898696088, + "loss": 2.5576, + "theoretical_loss": 3.860340569510357, + "tokens_seen": 570375168 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041775325977933805, + "loss": 2.7111, + "theoretical_loss": 3.8602938772229827, + "tokens_seen": 570440704 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004177432296890672, + "loss": 2.9168, + "theoretical_loss": 3.8602471918014305, + "tokens_seen": 570506240 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004177331995987964, + "loss": 2.7029, + "theoretical_loss": 3.8602005132439037, + "tokens_seen": 570571776 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004177231695085256, + "loss": 2.7222, + "theoretical_loss": 3.8601538415486045, + "tokens_seen": 570637312 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004177131394182548, + "loss": 2.7994, + "theoretical_loss": 3.8601071767137363, + "tokens_seen": 570702848 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041770310932798396, + "loss": 2.7614, + "theoretical_loss": 3.8600605187375026, + "tokens_seen": 570768384 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041769307923771314, + "loss": 2.6008, + "theoretical_loss": 3.860013867618109, + "tokens_seen": 570833920 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004176830491474423, + "loss": 2.9842, + "theoretical_loss": 3.8599672233537596, + "tokens_seen": 570899456 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041767301905717156, + "loss": 2.601, + "theoretical_loss": 3.8599205859426604, + "tokens_seen": 570964992 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004176629889669007, + "loss": 2.7662, + "theoretical_loss": 3.8598739553830193, + "tokens_seen": 571030528 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004176529588766299, + "loss": 2.9515, + "theoretical_loss": 3.859827331673042, + "tokens_seen": 571096064 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041764292878635904, + "loss": 2.8044, + "theoretical_loss": 3.8597807148109378, + "tokens_seen": 571161600 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004176328986960883, + "loss": 2.6835, + "theoretical_loss": 3.8597341047949145, + "tokens_seen": 571227136 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041762286860581746, + "loss": 2.8642, + "theoretical_loss": 3.8596875016231817, + "tokens_seen": 571292672 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041761283851554664, + "loss": 2.9252, + "theoretical_loss": 3.8596409052939498, + "tokens_seen": 571358208 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004176028084252758, + "loss": 2.2926, + "theoretical_loss": 3.8595943158054284, + "tokens_seen": 571423744 + }, + { + "epoch": 1.08, + "learning_rate": 0.000417592778335005, + "loss": 2.3754, + "theoretical_loss": 3.85954773315583, + "tokens_seen": 571489280 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004175827482447342, + "loss": 3.0418, + "theoretical_loss": 3.8595011573433657, + "tokens_seen": 571554816 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004175727181544634, + "loss": 2.8298, + "theoretical_loss": 3.8594545883662494, + "tokens_seen": 571620352 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041756268806419255, + "loss": 2.6487, + "theoretical_loss": 3.8594080262226935, + "tokens_seen": 571685888 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004175526579739218, + "loss": 2.6435, + "theoretical_loss": 3.859361470910912, + "tokens_seen": 571751424 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 638980, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1796209812164307, + "objective/train/theoretical_loss": 3.859326558909354, + "objective/train/tokens_used": 592260576, + "theoretical_loss": 3.859326558909354, + "tokens_seen": 571800576 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041754262788365096, + "loss": 2.9137, + "theoretical_loss": 3.8593149224291197, + "tokens_seen": 571816960 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041753259779338015, + "loss": 2.54, + "theoretical_loss": 3.8592683807755326, + "tokens_seen": 571882496 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004175225677031093, + "loss": 2.74, + "theoretical_loss": 3.8592218459483663, + "tokens_seen": 571948032 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004175125376128385, + "loss": 2.6262, + "theoretical_loss": 3.8591753179458372, + "tokens_seen": 572013568 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004175025075225677, + "loss": 2.8701, + "theoretical_loss": 3.859128796766163, + "tokens_seen": 572079104 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004174924774322969, + "loss": 2.6111, + "theoretical_loss": 3.859082282407562, + "tokens_seen": 572144640 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041748244734202605, + "loss": 2.7199, + "theoretical_loss": 3.8590357748682527, + "tokens_seen": 572210176 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004174724172517553, + "loss": 2.4907, + "theoretical_loss": 3.858989274146454, + "tokens_seen": 572275712 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004174623871614844, + "loss": 2.8167, + "theoretical_loss": 3.858942780240387, + "tokens_seen": 572341248 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041745235707121365, + "loss": 2.6303, + "theoretical_loss": 3.858896293148272, + "tokens_seen": 572406784 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041744232698094283, + "loss": 3.005, + "theoretical_loss": 3.85884981286833, + "tokens_seen": 572472320 + }, + { + "epoch": 1.08, + "learning_rate": 0.000417432296890672, + "loss": 2.8463, + "theoretical_loss": 3.858803339398783, + "tokens_seen": 572537856 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004174222668004012, + "loss": 3.0947, + "theoretical_loss": 3.858756872737855, + "tokens_seen": 572603392 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041741223671013037, + "loss": 3.0426, + "theoretical_loss": 3.8587104128837675, + "tokens_seen": 572668928 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004174022066198596, + "loss": 2.8641, + "theoretical_loss": 3.8586639598347463, + "tokens_seen": 572734464 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004173921765295888, + "loss": 2.8205, + "theoretical_loss": 3.8586175135890155, + "tokens_seen": 572800000 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041738214643931797, + "loss": 2.9435, + "theoretical_loss": 3.8585710741448, + "tokens_seen": 572865536 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041737211634904715, + "loss": 2.9395, + "theoretical_loss": 3.8585246415003267, + "tokens_seen": 572931072 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004173620862587764, + "loss": 2.7106, + "theoretical_loss": 3.8584782156538218, + "tokens_seen": 572996608 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004173520561685055, + "loss": 2.8422, + "theoretical_loss": 3.858431796603513, + "tokens_seen": 573062144 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041734202607823475, + "loss": 2.4501, + "theoretical_loss": 3.8583853843476277, + "tokens_seen": 573127680 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004173319959879639, + "loss": 2.7858, + "theoretical_loss": 3.8583389788843956, + "tokens_seen": 573193216 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004173219658976931, + "loss": 2.5905, + "theoretical_loss": 3.858292580212045, + "tokens_seen": 573258752 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004173119358074223, + "loss": 2.659, + "theoretical_loss": 3.8582461883288075, + "tokens_seen": 573324288 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004173019057171515, + "loss": 2.6405, + "theoretical_loss": 3.858199803232913, + "tokens_seen": 573389824 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7385621070861816, + "objective/train/theoretical_loss": 3.8581650188641214, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.8581650188641214, + "tokens_seen": 573438976 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041729187562688065, + "loss": 2.822, + "theoretical_loss": 3.858153424922592, + "tokens_seen": 573455360 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041728184553660984, + "loss": 2.5345, + "theoretical_loss": 3.858107053396078, + "tokens_seen": 573520896 + }, + { + "epoch": 1.08, + "learning_rate": 0.000417271815446339, + "loss": 2.6229, + "theoretical_loss": 3.858060688651603, + "tokens_seen": 573586432 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041726178535606825, + "loss": 2.5662, + "theoretical_loss": 3.8580143306874, + "tokens_seen": 573651968 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004172517552657974, + "loss": 2.9459, + "theoretical_loss": 3.857967979501704, + "tokens_seen": 573717504 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004172417251755266, + "loss": 2.9108, + "theoretical_loss": 3.857921635092749, + "tokens_seen": 573783040 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004172316950852558, + "loss": 2.7796, + "theoretical_loss": 3.8578752974587704, + "tokens_seen": 573848576 + }, + { + "epoch": 1.08, + "learning_rate": 0.000417221664994985, + "loss": 2.9508, + "theoretical_loss": 3.857828966598005, + "tokens_seen": 573914112 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041721163490471416, + "loss": 2.4994, + "theoretical_loss": 3.857782642508688, + "tokens_seen": 573979648 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041720160481444334, + "loss": 2.8715, + "theoretical_loss": 3.857736325189058, + "tokens_seen": 574045184 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004171915747241725, + "loss": 2.666, + "theoretical_loss": 3.8576900146373525, + "tokens_seen": 574110720 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041718154463390176, + "loss": 2.6323, + "theoretical_loss": 3.8576437108518102, + "tokens_seen": 574176256 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004171715145436309, + "loss": 2.7769, + "theoretical_loss": 3.8575974138306703, + "tokens_seen": 574241792 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004171614844533601, + "loss": 2.8701, + "theoretical_loss": 3.857551123572174, + "tokens_seen": 574307328 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041715145436308924, + "loss": 2.7098, + "theoretical_loss": 3.8575048400745597, + "tokens_seen": 574372864 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004171414242728185, + "loss": 2.9271, + "theoretical_loss": 3.8574585633360705, + "tokens_seen": 574438400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041713139418254766, + "loss": 2.7082, + "theoretical_loss": 3.8574122933549475, + "tokens_seen": 574503936 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041712136409227684, + "loss": 2.8392, + "theoretical_loss": 3.857366030129434, + "tokens_seen": 574569472 + }, + { + "epoch": 1.08, + "learning_rate": 0.000417111334002006, + "loss": 2.5627, + "theoretical_loss": 3.857319773657772, + "tokens_seen": 574635008 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004171013039117352, + "loss": 2.5853, + "theoretical_loss": 3.8572735239382068, + "tokens_seen": 574700544 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004170912738214644, + "loss": 2.8761, + "theoretical_loss": 3.8572272809689823, + "tokens_seen": 574766080 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004170812437311936, + "loss": 2.9132, + "theoretical_loss": 3.857181044748344, + "tokens_seen": 574831616 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041707121364092275, + "loss": 2.6828, + "theoretical_loss": 3.857134815274538, + "tokens_seen": 574897152 + }, + { + "epoch": 1.08, + "learning_rate": 0.000417061183550652, + "loss": 2.5846, + "theoretical_loss": 3.85708859254581, + "tokens_seen": 574962688 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041705115346038116, + "loss": 2.9014, + "theoretical_loss": 3.8570423765604076, + "tokens_seen": 575028224 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5287535190582275, + "objective/train/theoretical_loss": 3.8570077189956096, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.8570077189956096, + "tokens_seen": 575077376 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041704112337011035, + "loss": 2.5591, + "theoretical_loss": 3.856996167316579, + "tokens_seen": 575093760 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004170310932798395, + "loss": 2.623, + "theoretical_loss": 3.8569499648125727, + "tokens_seen": 575159296 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004170210631895687, + "loss": 2.9332, + "theoretical_loss": 3.8569037690466375, + "tokens_seen": 575224832 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004170110330992979, + "loss": 2.7544, + "theoretical_loss": 3.8568575800170235, + "tokens_seen": 575290368 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004170010030090271, + "loss": 2.7303, + "theoretical_loss": 3.856811397721981, + "tokens_seen": 575355904 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041699097291875625, + "loss": 2.5582, + "theoretical_loss": 3.856765222159762, + "tokens_seen": 575421440 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004169809428284855, + "loss": 2.8197, + "theoretical_loss": 3.856719053328616, + "tokens_seen": 575486976 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004169709127382146, + "loss": 2.7173, + "theoretical_loss": 3.8566728912267982, + "tokens_seen": 575552512 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041696088264794385, + "loss": 2.8001, + "theoretical_loss": 3.85662673585256, + "tokens_seen": 575618048 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041695085255767303, + "loss": 2.9218, + "theoretical_loss": 3.8565805872041556, + "tokens_seen": 575683584 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004169408224674022, + "loss": 2.7323, + "theoretical_loss": 3.8565344452798396, + "tokens_seen": 575749120 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004169307923771314, + "loss": 2.8589, + "theoretical_loss": 3.856488310077866, + "tokens_seen": 575814656 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004169207622868606, + "loss": 2.8005, + "theoretical_loss": 3.8564421815964924, + "tokens_seen": 575880192 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041691073219658975, + "loss": 2.7526, + "theoretical_loss": 3.856396059833974, + "tokens_seen": 575945728 + }, + { + "epoch": 1.08, + "learning_rate": 0.000416900702106319, + "loss": 2.6452, + "theoretical_loss": 3.856349944788567, + "tokens_seen": 576011264 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004168906720160481, + "loss": 2.8848, + "theoretical_loss": 3.8563038364585314, + "tokens_seen": 576076800 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041688064192577735, + "loss": 2.548, + "theoretical_loss": 3.856257734842123, + "tokens_seen": 576142336 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041687061183550653, + "loss": 2.8584, + "theoretical_loss": 3.856211639937602, + "tokens_seen": 576207872 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004168605817452357, + "loss": 3.0388, + "theoretical_loss": 3.856165551743228, + "tokens_seen": 576273408 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004168505516549649, + "loss": 2.7249, + "theoretical_loss": 3.8561194702572603, + "tokens_seen": 576338944 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004168405215646941, + "loss": 2.5879, + "theoretical_loss": 3.856073395477962, + "tokens_seen": 576404480 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041683049147442326, + "loss": 3.0491, + "theoretical_loss": 3.856027327403592, + "tokens_seen": 576470016 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004168204613841525, + "loss": 2.7849, + "theoretical_loss": 3.8559812660324138, + "tokens_seen": 576535552 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004168104312938816, + "loss": 2.5804, + "theoretical_loss": 3.8559352113626906, + "tokens_seen": 576601088 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041680040120361085, + "loss": 2.7666, + "theoretical_loss": 3.855889163392685, + "tokens_seen": 576666624 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.898465633392334, + "objective/train/theoretical_loss": 3.8558546318108267, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.8558546318108267, + "tokens_seen": 576715776 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041679037111334, + "loss": 2.7956, + "theoretical_loss": 3.855843122120662, + "tokens_seen": 576732160 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004167803410230692, + "loss": 2.9043, + "theoretical_loss": 3.8557970875448855, + "tokens_seen": 576797696 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004167703109327984, + "loss": 2.7581, + "theoretical_loss": 3.8557510596636217, + "tokens_seen": 576863232 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004167602808425276, + "loss": 2.6658, + "theoretical_loss": 3.8557050384751363, + "tokens_seen": 576928768 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041675025075225676, + "loss": 2.7947, + "theoretical_loss": 3.855659023977696, + "tokens_seen": 576994304 + }, + { + "epoch": 1.08, + "learning_rate": 0.000416740220661986, + "loss": 3.0682, + "theoretical_loss": 3.8556130161695688, + "tokens_seen": 577059840 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004167301905717151, + "loss": 2.7337, + "theoretical_loss": 3.8555670150490213, + "tokens_seen": 577125376 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041672016048144436, + "loss": 2.8836, + "theoretical_loss": 3.855521020614324, + "tokens_seen": 577190912 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004167101303911735, + "loss": 2.6244, + "theoretical_loss": 3.8554750328637444, + "tokens_seen": 577256448 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004167001003009027, + "loss": 2.6494, + "theoretical_loss": 3.8554290517955536, + "tokens_seen": 577321984 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004166900702106319, + "loss": 2.845, + "theoretical_loss": 3.855383077408022, + "tokens_seen": 577387520 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004166800401203611, + "loss": 2.8374, + "theoretical_loss": 3.8553371096994207, + "tokens_seen": 577453056 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041667001003009026, + "loss": 2.6816, + "theoretical_loss": 3.8552911486680217, + "tokens_seen": 577518592 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041665997993981944, + "loss": 2.7496, + "theoretical_loss": 3.855245194312097, + "tokens_seen": 577584128 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004166499498495487, + "loss": 2.6757, + "theoretical_loss": 3.8551992466299208, + "tokens_seen": 577649664 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041663991975927786, + "loss": 2.7504, + "theoretical_loss": 3.8551533056197664, + "tokens_seen": 577715200 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041662988966900704, + "loss": 2.8373, + "theoretical_loss": 3.8551073712799075, + "tokens_seen": 577780736 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004166198595787362, + "loss": 2.8251, + "theoretical_loss": 3.85506144360862, + "tokens_seen": 577846272 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004166098294884654, + "loss": 2.8219, + "theoretical_loss": 3.8550155226041802, + "tokens_seen": 577911808 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004165997993981946, + "loss": 2.8431, + "theoretical_loss": 3.854969608264863, + "tokens_seen": 577977344 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004165897693079238, + "loss": 2.9726, + "theoretical_loss": 3.854923700588947, + "tokens_seen": 578042880 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041657973921765295, + "loss": 2.8628, + "theoretical_loss": 3.8548777995747088, + "tokens_seen": 578108416 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004165697091273822, + "loss": 3.1248, + "theoretical_loss": 3.8548319052204265, + "tokens_seen": 578173952 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041655967903711136, + "loss": 2.6876, + "theoretical_loss": 3.8547860175243795, + "tokens_seen": 578239488 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041654964894684055, + "loss": 2.6237, + "theoretical_loss": 3.854740136484848, + "tokens_seen": 578305024 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9665675163269043, + "objective/train/theoretical_loss": 3.854705730072502, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.854705730072502, + "tokens_seen": 578354176 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004165396188565697, + "loss": 2.8729, + "theoretical_loss": 3.854694262100111, + "tokens_seen": 578370560 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004165295887662989, + "loss": 2.616, + "theoretical_loss": 3.85464839436845, + "tokens_seen": 578436096 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004165195586760281, + "loss": 2.9423, + "theoretical_loss": 3.854602533288147, + "tokens_seen": 578501632 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004165095285857573, + "loss": 2.4722, + "theoretical_loss": 3.8545566788574828, + "tokens_seen": 578567168 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041649949849548645, + "loss": 2.7993, + "theoretical_loss": 3.854510831074742, + "tokens_seen": 578632704 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004164894684052157, + "loss": 2.8733, + "theoretical_loss": 3.8544649899382053, + "tokens_seen": 578698240 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004164794383149448, + "loss": 2.8087, + "theoretical_loss": 3.85441915544616, + "tokens_seen": 578763776 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041646940822467405, + "loss": 2.9179, + "theoretical_loss": 3.854373327596888, + "tokens_seen": 578829312 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041645937813440323, + "loss": 2.9232, + "theoretical_loss": 3.854327506388677, + "tokens_seen": 578894848 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004164493480441324, + "loss": 2.8144, + "theoretical_loss": 3.854281691819811, + "tokens_seen": 578960384 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004164393179538616, + "loss": 2.7681, + "theoretical_loss": 3.8542358838885775, + "tokens_seen": 579025920 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004164292878635908, + "loss": 2.7656, + "theoretical_loss": 3.854190082593264, + "tokens_seen": 579091456 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041641925777331995, + "loss": 2.6538, + "theoretical_loss": 3.854144287932158, + "tokens_seen": 579156992 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004164092276830492, + "loss": 2.8346, + "theoretical_loss": 3.8540984999035475, + "tokens_seen": 579222528 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004163991975927783, + "loss": 2.5503, + "theoretical_loss": 3.8540527185057223, + "tokens_seen": 579288064 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041638916750250755, + "loss": 2.8447, + "theoretical_loss": 3.854006943736972, + "tokens_seen": 579353600 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041637913741223673, + "loss": 2.8478, + "theoretical_loss": 3.853961175595587, + "tokens_seen": 579419136 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004163691073219659, + "loss": 2.7447, + "theoretical_loss": 3.8539154140798586, + "tokens_seen": 579484672 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004163590772316951, + "loss": 2.7067, + "theoretical_loss": 3.853869659188078, + "tokens_seen": 579550208 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004163490471414243, + "loss": 2.8598, + "theoretical_loss": 3.8538239109185377, + "tokens_seen": 579615744 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041633901705115346, + "loss": 2.5542, + "theoretical_loss": 3.8537781692695305, + "tokens_seen": 579681280 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004163289869608827, + "loss": 2.9101, + "theoretical_loss": 3.8537324342393506, + "tokens_seen": 579746816 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004163189568706118, + "loss": 2.7138, + "theoretical_loss": 3.8536867058262914, + "tokens_seen": 579812352 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041630892678034105, + "loss": 2.9372, + "theoretical_loss": 3.8536409840286483, + "tokens_seen": 579877888 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004162988966900702, + "loss": 2.6821, + "theoretical_loss": 3.8535952688447166, + "tokens_seen": 579943424 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7990593910217285, + "objective/train/theoretical_loss": 3.8535609867959906, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.8535609867959906, + "tokens_seen": 579992576 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004162888665997994, + "loss": 2.7535, + "theoretical_loss": 3.853549560272792, + "tokens_seen": 580008960 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004162788365095286, + "loss": 2.7862, + "theoretical_loss": 3.8535038583111723, + "tokens_seen": 580074496 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004162688064192578, + "loss": 2.8232, + "theoretical_loss": 3.8534581629581535, + "tokens_seen": 580140032 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041625877632898696, + "loss": 2.8708, + "theoretical_loss": 3.8534124742120346, + "tokens_seen": 580205568 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004162487462387162, + "loss": 2.7073, + "theoretical_loss": 3.853366792071114, + "tokens_seen": 580271104 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004162387161484453, + "loss": 2.4495, + "theoretical_loss": 3.8533211165336905, + "tokens_seen": 580336640 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041622868605817456, + "loss": 2.9621, + "theoretical_loss": 3.8532754475980644, + "tokens_seen": 580402176 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004162186559679037, + "loss": 2.8054, + "theoretical_loss": 3.8532297852625366, + "tokens_seen": 580467712 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004162086258776329, + "loss": 2.8751, + "theoretical_loss": 3.8531841295254075, + "tokens_seen": 580533248 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004161985957873621, + "loss": 2.7955, + "theoretical_loss": 3.8531384803849793, + "tokens_seen": 580598784 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004161885656970913, + "loss": 2.9094, + "theoretical_loss": 3.8530928378395544, + "tokens_seen": 580664320 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041617853560682046, + "loss": 2.7033, + "theoretical_loss": 3.8530472018874358, + "tokens_seen": 580729856 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041616850551654964, + "loss": 2.7345, + "theoretical_loss": 3.8530015725269267, + "tokens_seen": 580795392 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004161584754262788, + "loss": 2.8526, + "theoretical_loss": 3.852955949756332, + "tokens_seen": 580860928 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041614844533600806, + "loss": 2.8981, + "theoretical_loss": 3.8529103335739565, + "tokens_seen": 580926464 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004161384152457372, + "loss": 3.1064, + "theoretical_loss": 3.8528647239781053, + "tokens_seen": 580992000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004161283851554664, + "loss": 2.7506, + "theoretical_loss": 3.852819120967085, + "tokens_seen": 581057536 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041611835506519555, + "loss": 2.8473, + "theoretical_loss": 3.8527735245392023, + "tokens_seen": 581123072 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004161083249749248, + "loss": 2.7136, + "theoretical_loss": 3.852727934692765, + "tokens_seen": 581188608 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041609829488465397, + "loss": 2.5776, + "theoretical_loss": 3.852682351426081, + "tokens_seen": 581254144 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041608826479438315, + "loss": 2.5465, + "theoretical_loss": 3.8526367747374577, + "tokens_seen": 581319680 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041607823470411233, + "loss": 2.8932, + "theoretical_loss": 3.852591204625206, + "tokens_seen": 581385216 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041606820461384156, + "loss": 2.8596, + "theoretical_loss": 3.8525456410876355, + "tokens_seen": 581450752 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004160581745235707, + "loss": 2.5327, + "theoretical_loss": 3.8525000841230566, + "tokens_seen": 581516288 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004160481444332999, + "loss": 2.9714, + "theoretical_loss": 3.85245453372978, + "tokens_seen": 581581824 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1994011402130127, + "objective/train/theoretical_loss": 3.8524203752462247, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.8524203752462247, + "tokens_seen": 581630976 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041603811434302905, + "loss": 2.7316, + "theoretical_loss": 3.8524089899061185, + "tokens_seen": 581647360 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004160280842527583, + "loss": 2.8617, + "theoretical_loss": 3.8523634526503834, + "tokens_seen": 581712896 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041601805416248747, + "loss": 2.7499, + "theoretical_loss": 3.8523179219608883, + "tokens_seen": 581778432 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041600802407221665, + "loss": 2.5885, + "theoretical_loss": 3.8522723978359474, + "tokens_seen": 581843968 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041599799398194583, + "loss": 2.7726, + "theoretical_loss": 3.852226880273874, + "tokens_seen": 581909504 + }, + { + "epoch": 1.09, + "learning_rate": 0.000415987963891675, + "loss": 2.7785, + "theoretical_loss": 3.8521813692729836, + "tokens_seen": 581975040 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004159779338014042, + "loss": 2.6607, + "theoretical_loss": 3.852135864831591, + "tokens_seen": 582040576 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041596790371113343, + "loss": 2.622, + "theoretical_loss": 3.8520903669480138, + "tokens_seen": 582106112 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041595787362086256, + "loss": 2.7967, + "theoretical_loss": 3.8520448756205674, + "tokens_seen": 582171648 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004159478435305918, + "loss": 2.9222, + "theoretical_loss": 3.85199939084757, + "tokens_seen": 582237184 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004159378134403209, + "loss": 2.6956, + "theoretical_loss": 3.8519539126273394, + "tokens_seen": 582302720 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041592778335005015, + "loss": 2.7274, + "theoretical_loss": 3.8519084409581943, + "tokens_seen": 582368256 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041591775325977934, + "loss": 2.6459, + "theoretical_loss": 3.8518629758384537, + "tokens_seen": 582433792 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004159077231695085, + "loss": 2.7572, + "theoretical_loss": 3.8518175172664377, + "tokens_seen": 582499328 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041589769307923775, + "loss": 2.7572, + "theoretical_loss": 3.851772065240467, + "tokens_seen": 582564864 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041588766298896693, + "loss": 2.8565, + "theoretical_loss": 3.851726619758862, + "tokens_seen": 582630400 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004158776328986961, + "loss": 2.7034, + "theoretical_loss": 3.851681180819945, + "tokens_seen": 582695936 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004158676028084253, + "loss": 2.7775, + "theoretical_loss": 3.851635748422039, + "tokens_seen": 582761472 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004158575727181545, + "loss": 2.9109, + "theoretical_loss": 3.8515903225634656, + "tokens_seen": 582827008 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041584754262788366, + "loss": 2.7499, + "theoretical_loss": 3.851544903242549, + "tokens_seen": 582892544 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004158375125376129, + "loss": 2.854, + "theoretical_loss": 3.8514994904576136, + "tokens_seen": 582958080 + }, + { + "epoch": 1.09, + "learning_rate": 0.000415827482447342, + "loss": 3.1417, + "theoretical_loss": 3.851454084206985, + "tokens_seen": 583023616 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041581745235707126, + "loss": 2.7627, + "theoretical_loss": 3.8514086844889865, + "tokens_seen": 583089152 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004158074222668004, + "loss": 2.6031, + "theoretical_loss": 3.851363291301946, + "tokens_seen": 583154688 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004157973921765296, + "loss": 2.9589, + "theoretical_loss": 3.85131790464419, + "tokens_seen": 583220224 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.419975757598877, + "objective/train/theoretical_loss": 3.8512838689347095, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.8512838689347095, + "tokens_seen": 583269376 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004157873620862588, + "loss": 2.8707, + "theoretical_loss": 3.8512725245140453, + "tokens_seen": 583285760 + }, + { + "epoch": 1.09, + "learning_rate": 0.000415777331995988, + "loss": 2.8639, + "theoretical_loss": 3.85122715090984, + "tokens_seen": 583351296 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041576730190571716, + "loss": 2.7196, + "theoretical_loss": 3.8511817838299023, + "tokens_seen": 583416832 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004157572718154464, + "loss": 2.828, + "theoretical_loss": 3.8511364232725622, + "tokens_seen": 583482368 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004157472417251755, + "loss": 2.8964, + "theoretical_loss": 3.8510910692361486, + "tokens_seen": 583547904 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041573721163490476, + "loss": 2.6145, + "theoretical_loss": 3.851045721718992, + "tokens_seen": 583613440 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004157271815446339, + "loss": 2.6236, + "theoretical_loss": 3.851000380719424, + "tokens_seen": 583678976 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004157171514543631, + "loss": 2.8353, + "theoretical_loss": 3.850955046235776, + "tokens_seen": 583744512 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004157071213640923, + "loss": 2.9949, + "theoretical_loss": 3.85090971826638, + "tokens_seen": 583810048 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004156970912738215, + "loss": 2.61, + "theoretical_loss": 3.850864396809569, + "tokens_seen": 583875584 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041568706118355066, + "loss": 2.7655, + "theoretical_loss": 3.8508190818636763, + "tokens_seen": 583941120 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041567703109327984, + "loss": 2.8568, + "theoretical_loss": 3.8507737734270355, + "tokens_seen": 584006656 + }, + { + "epoch": 1.09, + "learning_rate": 0.000415667001003009, + "loss": 2.6818, + "theoretical_loss": 3.850728471497982, + "tokens_seen": 584072192 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041565697091273826, + "loss": 2.8308, + "theoretical_loss": 3.8506831760748517, + "tokens_seen": 584137728 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004156469408224674, + "loss": 2.9406, + "theoretical_loss": 3.8506378871559788, + "tokens_seen": 584203264 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004156369107321966, + "loss": 2.736, + "theoretical_loss": 3.850592604739701, + "tokens_seen": 584268800 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041562688064192575, + "loss": 2.8512, + "theoretical_loss": 3.850547328824356, + "tokens_seen": 584334336 + }, + { + "epoch": 1.09, + "learning_rate": 0.000415616850551655, + "loss": 2.5336, + "theoretical_loss": 3.8505020594082797, + "tokens_seen": 584399872 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041560682046138417, + "loss": 2.9136, + "theoretical_loss": 3.850456796489812, + "tokens_seen": 584465408 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041559679037111335, + "loss": 2.5841, + "theoretical_loss": 3.8504115400672916, + "tokens_seen": 584530944 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041558676028084253, + "loss": 2.7805, + "theoretical_loss": 3.850366290139057, + "tokens_seen": 584596480 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041557673019057176, + "loss": 2.685, + "theoretical_loss": 3.85032104670345, + "tokens_seen": 584662016 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004155667001003009, + "loss": 2.9191, + "theoretical_loss": 3.85027580975881, + "tokens_seen": 584727552 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004155566700100301, + "loss": 3.0622, + "theoretical_loss": 3.8502305793034797, + "tokens_seen": 584793088 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041554663991975925, + "loss": 2.9408, + "theoretical_loss": 3.8501853553357996, + "tokens_seen": 584858624 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.363860607147217, + "objective/train/theoretical_loss": 3.850151441616564, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.850151441616564, + "tokens_seen": 584907776 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004155366098294885, + "loss": 2.6347, + "theoretical_loss": 3.850140137854114, + "tokens_seen": 584924160 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041552657973921767, + "loss": 2.8598, + "theoretical_loss": 3.850094926856765, + "tokens_seen": 584989696 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041551654964894685, + "loss": 2.8034, + "theoretical_loss": 3.8500497223420966, + "tokens_seen": 585055232 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041550651955867603, + "loss": 2.6854, + "theoretical_loss": 3.8500045243084537, + "tokens_seen": 585120768 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004154964894684052, + "loss": 2.7721, + "theoretical_loss": 3.849959332754181, + "tokens_seen": 585186304 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004154864593781344, + "loss": 3.0961, + "theoretical_loss": 3.849914147677624, + "tokens_seen": 585251840 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041547642928786363, + "loss": 2.9701, + "theoretical_loss": 3.849868969077129, + "tokens_seen": 585317376 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041546639919759276, + "loss": 2.7004, + "theoretical_loss": 3.8498237969510436, + "tokens_seen": 585382912 + }, + { + "epoch": 1.09, + "learning_rate": 0.000415456369107322, + "loss": 2.7834, + "theoretical_loss": 3.849778631297715, + "tokens_seen": 585448448 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004154463390170511, + "loss": 2.7596, + "theoretical_loss": 3.8497334721154903, + "tokens_seen": 585513984 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041543630892678035, + "loss": 2.7112, + "theoretical_loss": 3.84968831940272, + "tokens_seen": 585579520 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041542627883650954, + "loss": 2.9759, + "theoretical_loss": 3.8496431731577516, + "tokens_seen": 585645056 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004154162487462387, + "loss": 2.7964, + "theoretical_loss": 3.849598033378936, + "tokens_seen": 585710592 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004154062186559679, + "loss": 2.8015, + "theoretical_loss": 3.849552900064624, + "tokens_seen": 585776128 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041539618856569713, + "loss": 3.0477, + "theoretical_loss": 3.8495077732131655, + "tokens_seen": 585841664 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041538615847542626, + "loss": 2.8499, + "theoretical_loss": 3.8494626528229134, + "tokens_seen": 585907200 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004153761283851555, + "loss": 2.6039, + "theoretical_loss": 3.8494175388922196, + "tokens_seen": 585972736 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004153660982948846, + "loss": 2.5178, + "theoretical_loss": 3.849372431419437, + "tokens_seen": 586038272 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041535606820461386, + "loss": 2.827, + "theoretical_loss": 3.8493273304029194, + "tokens_seen": 586103808 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041534603811434304, + "loss": 2.8356, + "theoretical_loss": 3.8492822358410206, + "tokens_seen": 586169344 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004153360080240722, + "loss": 2.7122, + "theoretical_loss": 3.8492371477320955, + "tokens_seen": 586234880 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004153259779338014, + "loss": 2.8285, + "theoretical_loss": 3.8491920660744996, + "tokens_seen": 586300416 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004153159478435306, + "loss": 2.9381, + "theoretical_loss": 3.849146990866589, + "tokens_seen": 586365952 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041530591775325976, + "loss": 2.871, + "theoretical_loss": 3.8491019221067195, + "tokens_seen": 586431488 + }, + { + "epoch": 1.09, + "learning_rate": 0.000415295887662989, + "loss": 2.9606, + "theoretical_loss": 3.849056859793249, + "tokens_seen": 586497024 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.706367015838623, + "objective/train/theoretical_loss": 3.8490230672876073, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.8490230672876073, + "tokens_seen": 586546176 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004152858575727181, + "loss": 2.7225, + "theoretical_loss": 3.8490118039245353, + "tokens_seen": 586562560 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041527582748244736, + "loss": 2.711, + "theoretical_loss": 3.848966754498936, + "tokens_seen": 586628096 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004152657973921765, + "loss": 2.9355, + "theoretical_loss": 3.848921711514811, + "tokens_seen": 586693632 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004152557673019057, + "loss": 2.9359, + "theoretical_loss": 3.848876674970519, + "tokens_seen": 586759168 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004152457372116349, + "loss": 2.8788, + "theoretical_loss": 3.848831644864421, + "tokens_seen": 586824704 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004152357071213641, + "loss": 2.892, + "theoretical_loss": 3.8487866211948774, + "tokens_seen": 586890240 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041522567703109327, + "loss": 2.8509, + "theoretical_loss": 3.8487416039602493, + "tokens_seen": 586955776 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004152156469408225, + "loss": 2.6335, + "theoretical_loss": 3.8486965931588992, + "tokens_seen": 587021312 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041520561685055163, + "loss": 2.764, + "theoretical_loss": 3.8486515887891892, + "tokens_seen": 587086848 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041519558676028086, + "loss": 2.7661, + "theoretical_loss": 3.8486065908494824, + "tokens_seen": 587152384 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041518555667001, + "loss": 2.8219, + "theoretical_loss": 3.8485615993381432, + "tokens_seen": 587217920 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004151755265797392, + "loss": 2.6805, + "theoretical_loss": 3.8485166142535356, + "tokens_seen": 587283456 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004151654964894684, + "loss": 2.7319, + "theoretical_loss": 3.8484716355940245, + "tokens_seen": 587348992 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004151554663991976, + "loss": 2.8708, + "theoretical_loss": 3.8484266633579756, + "tokens_seen": 587414528 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004151454363089268, + "loss": 2.8686, + "theoretical_loss": 3.848381697543755, + "tokens_seen": 587480064 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041513540621865595, + "loss": 2.865, + "theoretical_loss": 3.8483367381497295, + "tokens_seen": 587545600 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004151253761283852, + "loss": 2.8516, + "theoretical_loss": 3.8482917851742666, + "tokens_seen": 587611136 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041511534603811437, + "loss": 2.8969, + "theoretical_loss": 3.848246838615734, + "tokens_seen": 587676672 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041510531594784355, + "loss": 2.7833, + "theoretical_loss": 3.8482018984724995, + "tokens_seen": 587742208 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041509528585757273, + "loss": 2.8398, + "theoretical_loss": 3.8481569647429343, + "tokens_seen": 587807744 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041508525576730196, + "loss": 2.6893, + "theoretical_loss": 3.8481120374254063, + "tokens_seen": 587873280 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004150752256770311, + "loss": 2.9593, + "theoretical_loss": 3.848067116518287, + "tokens_seen": 587938816 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004150651955867603, + "loss": 2.9808, + "theoretical_loss": 3.848022202019946, + "tokens_seen": 588004352 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041505516549648945, + "loss": 2.7058, + "theoretical_loss": 3.847977293928756, + "tokens_seen": 588069888 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004150451354062187, + "loss": 2.6618, + "theoretical_loss": 3.8479323922430893, + "tokens_seen": 588135424 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.562694787979126, + "objective/train/theoretical_loss": 3.847898720181484, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.847898720181484, + "tokens_seen": 588184576 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041503510531594787, + "loss": 2.3911, + "theoretical_loss": 3.8478874969613175, + "tokens_seen": 588200960 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041502507522567705, + "loss": 2.5121, + "theoretical_loss": 3.847842608081815, + "tokens_seen": 588266496 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041501504513540623, + "loss": 2.7847, + "theoretical_loss": 3.8477977256029554, + "tokens_seen": 588332032 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004150050150451354, + "loss": 2.5692, + "theoretical_loss": 3.847752849523112, + "tokens_seen": 588397568 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004149949849548646, + "loss": 2.977, + "theoretical_loss": 3.847707979840662, + "tokens_seen": 588463104 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041498495486459383, + "loss": 2.8128, + "theoretical_loss": 3.8476631165539796, + "tokens_seen": 588528640 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041497492477432296, + "loss": 2.7995, + "theoretical_loss": 3.8476182596614414, + "tokens_seen": 588594176 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004149648946840522, + "loss": 2.771, + "theoretical_loss": 3.8475734091614253, + "tokens_seen": 588659712 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004149548645937813, + "loss": 2.8067, + "theoretical_loss": 3.847528565052307, + "tokens_seen": 588725248 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041494483450351055, + "loss": 2.6756, + "theoretical_loss": 3.847483727332466, + "tokens_seen": 588790784 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041493480441323974, + "loss": 2.6244, + "theoretical_loss": 3.8474388960002797, + "tokens_seen": 588856320 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004149247743229689, + "loss": 2.6471, + "theoretical_loss": 3.8473940710541283, + "tokens_seen": 588921856 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004149147442326981, + "loss": 2.4718, + "theoretical_loss": 3.847349252492392, + "tokens_seen": 588987392 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041490471414242733, + "loss": 2.9592, + "theoretical_loss": 3.84730444031345, + "tokens_seen": 589052928 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041489468405215646, + "loss": 2.5508, + "theoretical_loss": 3.847259634515684, + "tokens_seen": 589118464 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004148846539618857, + "loss": 3.1104, + "theoretical_loss": 3.847214835097476, + "tokens_seen": 589184000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004148746238716148, + "loss": 2.821, + "theoretical_loss": 3.8471700420572077, + "tokens_seen": 589249536 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041486459378134406, + "loss": 2.8675, + "theoretical_loss": 3.8471252553932618, + "tokens_seen": 589315072 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041485456369107324, + "loss": 2.7168, + "theoretical_loss": 3.847080475104022, + "tokens_seen": 589380608 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004148445336008024, + "loss": 2.5669, + "theoretical_loss": 3.847035701187872, + "tokens_seen": 589446144 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004148345035105316, + "loss": 2.7023, + "theoretical_loss": 3.8469909336431964, + "tokens_seen": 589511680 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004148244734202608, + "loss": 2.8024, + "theoretical_loss": 3.846946172468381, + "tokens_seen": 589577216 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041481444332998996, + "loss": 2.839, + "theoretical_loss": 3.846901417661811, + "tokens_seen": 589642752 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004148044132397192, + "loss": 2.5471, + "theoretical_loss": 3.846856669221872, + "tokens_seen": 589708288 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004147943831494483, + "loss": 2.6021, + "theoretical_loss": 3.846811927146952, + "tokens_seen": 589773824 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.017899990081787, + "objective/train/theoretical_loss": 3.846778374766836, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.846778374766836, + "tokens_seen": 589822976 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041478435305917756, + "loss": 2.9278, + "theoretical_loss": 3.846767191435438, + "tokens_seen": 589839360 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004147743229689067, + "loss": 2.6644, + "theoretical_loss": 3.8467224620857183, + "tokens_seen": 589904896 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004147642928786359, + "loss": 2.7847, + "theoretical_loss": 3.846677739096182, + "tokens_seen": 589970432 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004147542627883651, + "loss": 2.4443, + "theoretical_loss": 3.846633022465218, + "tokens_seen": 590035968 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004147442326980943, + "loss": 2.6099, + "theoretical_loss": 3.846588312191215, + "tokens_seen": 590101504 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041473420260782347, + "loss": 2.8192, + "theoretical_loss": 3.8465436082725653, + "tokens_seen": 590167040 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004147241725175527, + "loss": 2.6361, + "theoretical_loss": 3.846498910707659, + "tokens_seen": 590232576 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041471414242728183, + "loss": 2.7931, + "theoretical_loss": 3.8464542194948876, + "tokens_seen": 590298112 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041470411233701106, + "loss": 2.8275, + "theoretical_loss": 3.8464095346326435, + "tokens_seen": 590363648 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004146940822467402, + "loss": 2.5265, + "theoretical_loss": 3.8463648561193198, + "tokens_seen": 590429184 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004146840521564694, + "loss": 2.4411, + "theoretical_loss": 3.84632018395331, + "tokens_seen": 590494720 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004146740220661986, + "loss": 2.6881, + "theoretical_loss": 3.846275518133007, + "tokens_seen": 590560256 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004146639919759278, + "loss": 2.9586, + "theoretical_loss": 3.846230858656806, + "tokens_seen": 590625792 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041465396188565697, + "loss": 2.8721, + "theoretical_loss": 3.846186205523102, + "tokens_seen": 590691328 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041464393179538615, + "loss": 2.5559, + "theoretical_loss": 3.846141558730291, + "tokens_seen": 590756864 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041463390170511533, + "loss": 2.877, + "theoretical_loss": 3.8460969182767695, + "tokens_seen": 590822400 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041462387161484457, + "loss": 2.8501, + "theoretical_loss": 3.8460522841609333, + "tokens_seen": 590887936 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004146138415245737, + "loss": 2.7009, + "theoretical_loss": 3.8460076563811807, + "tokens_seen": 590953472 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041460381143430293, + "loss": 2.6128, + "theoretical_loss": 3.8459630349359104, + "tokens_seen": 591019008 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004145937813440321, + "loss": 2.4929, + "theoretical_loss": 3.8459184198235192, + "tokens_seen": 591084544 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004145837512537613, + "loss": 2.8232, + "theoretical_loss": 3.8458738110424076, + "tokens_seen": 591150080 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004145737211634905, + "loss": 2.6952, + "theoretical_loss": 3.845829208590975, + "tokens_seen": 591215616 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041456369107321965, + "loss": 2.8109, + "theoretical_loss": 3.8457846124676225, + "tokens_seen": 591281152 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041455366098294884, + "loss": 3.0057, + "theoretical_loss": 3.8457400226707494, + "tokens_seen": 591346688 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041454363089267807, + "loss": 2.8575, + "theoretical_loss": 3.8456954391987592, + "tokens_seen": 591412224 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5051016807556152, + "objective/train/theoretical_loss": 3.845662005744509, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.845662005744509, + "tokens_seen": 591461376 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004145336008024072, + "loss": 2.763, + "theoretical_loss": 3.8456508620500527, + "tokens_seen": 591477760 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041452357071213643, + "loss": 2.7731, + "theoretical_loss": 3.8456062912230324, + "tokens_seen": 591543296 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041451354062186556, + "loss": 2.8782, + "theoretical_loss": 3.8455617267161024, + "tokens_seen": 591608832 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004145035105315948, + "loss": 2.9828, + "theoretical_loss": 3.8455171685276666, + "tokens_seen": 591674368 + }, + { + "epoch": 1.09, + "learning_rate": 0.000414493480441324, + "loss": 2.6916, + "theoretical_loss": 3.845472616656129, + "tokens_seen": 591739904 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041448345035105316, + "loss": 2.3959, + "theoretical_loss": 3.845428071099895, + "tokens_seen": 591805440 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041447342026078234, + "loss": 2.7328, + "theoretical_loss": 3.8453835318573697, + "tokens_seen": 591870976 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004144633901705115, + "loss": 2.8305, + "theoretical_loss": 3.8453389989269593, + "tokens_seen": 591936512 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004144533600802407, + "loss": 2.7196, + "theoretical_loss": 3.8452944723070708, + "tokens_seen": 592002048 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041444332998996994, + "loss": 2.6286, + "theoretical_loss": 3.845249951996111, + "tokens_seen": 592067584 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041443329989969906, + "loss": 2.4818, + "theoretical_loss": 3.8452054379924894, + "tokens_seen": 592133120 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004144232698094283, + "loss": 2.6504, + "theoretical_loss": 3.845160930294613, + "tokens_seen": 592198656 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041441323971915753, + "loss": 2.715, + "theoretical_loss": 3.845116428900891, + "tokens_seen": 592264192 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041440320962888666, + "loss": 2.7967, + "theoretical_loss": 3.8450719338097326, + "tokens_seen": 592329728 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004143931795386159, + "loss": 3.0511, + "theoretical_loss": 3.8450274450195496, + "tokens_seen": 592395264 + }, + { + "epoch": 1.09, + "learning_rate": 0.000414383149448345, + "loss": 2.8589, + "theoretical_loss": 3.8449829625287517, + "tokens_seen": 592460800 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041437311935807426, + "loss": 2.8515, + "theoretical_loss": 3.8449384863357503, + "tokens_seen": 592526336 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041436308926780344, + "loss": 3.0156, + "theoretical_loss": 3.8448940164389573, + "tokens_seen": 592591872 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004143530591775326, + "loss": 2.5779, + "theoretical_loss": 3.844849552836786, + "tokens_seen": 592657408 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004143430290872618, + "loss": 2.5526, + "theoretical_loss": 3.844805095527648, + "tokens_seen": 592722944 + }, + { + "epoch": 1.09, + "learning_rate": 0.000414332998996991, + "loss": 2.5768, + "theoretical_loss": 3.844760644509959, + "tokens_seen": 592788480 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041432296890672016, + "loss": 2.8752, + "theoretical_loss": 3.8447161997821313, + "tokens_seen": 592854016 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004143129388164494, + "loss": 2.502, + "theoretical_loss": 3.844671761342581, + "tokens_seen": 592919552 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004143029087261785, + "loss": 2.7409, + "theoretical_loss": 3.8446273291897226, + "tokens_seen": 592985088 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041429287863590776, + "loss": 2.8724, + "theoretical_loss": 3.844582903321973, + "tokens_seen": 593050624 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 641212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6726303100585938, + "objective/train/theoretical_loss": 3.8445495880448104, + "objective/train/tokens_used": 593504736, + "theoretical_loss": 3.8445495880448104, + "tokens_seen": 593099776 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004142828485456369, + "loss": 2.8083, + "theoretical_loss": 3.844538483737748, + "tokens_seen": 593116160 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004142728184553661, + "loss": 3.0371, + "theoretical_loss": 3.8444940704354655, + "tokens_seen": 593181696 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004142627883650953, + "loss": 2.7278, + "theoretical_loss": 3.844449663413542, + "tokens_seen": 593247232 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004142527582748245, + "loss": 2.7245, + "theoretical_loss": 3.844405262670397, + "tokens_seen": 593312768 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041424272818455367, + "loss": 2.728, + "theoretical_loss": 3.8443608682044488, + "tokens_seen": 593378304 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004142326980942829, + "loss": 2.7423, + "theoretical_loss": 3.8443164800141165, + "tokens_seen": 593443840 + }, + { + "epoch": 1.09, + "learning_rate": 0.00041422266800401203, + "loss": 2.8193, + "theoretical_loss": 3.8442720980978207, + "tokens_seen": 593509376 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041421263791374126, + "loss": 3.4325, + "theoretical_loss": 3.844217322819618, + "tokens_seen": 593590272 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004142026078234704, + "loss": 2.9414, + "theoretical_loss": 3.844172954916165, + "tokens_seen": 593655808 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004141925777331996, + "loss": 2.9461, + "theoretical_loss": 3.844128593281643, + "tokens_seen": 593721344 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004141825476429288, + "loss": 2.8163, + "theoretical_loss": 3.8440842379144717, + "tokens_seen": 593786880 + }, + { + "epoch": 2.0, + "learning_rate": 0.000414172517552658, + "loss": 3.198, + "theoretical_loss": 3.844039888813076, + "tokens_seen": 593852416 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041416248746238717, + "loss": 3.0478, + "theoretical_loss": 3.8439955459758792, + "tokens_seen": 593917952 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041415245737211635, + "loss": 2.9957, + "theoretical_loss": 3.8439512094013057, + "tokens_seen": 593983488 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041414242728184553, + "loss": 3.0462, + "theoretical_loss": 3.843906879087779, + "tokens_seen": 594049024 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041413239719157477, + "loss": 2.8059, + "theoretical_loss": 3.8438625550337253, + "tokens_seen": 594114560 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004141223671013039, + "loss": 2.8339, + "theoretical_loss": 3.84381823723757, + "tokens_seen": 594180096 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041411233701103313, + "loss": 3.0042, + "theoretical_loss": 3.8437739256977403, + "tokens_seen": 594245632 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004141023069207623, + "loss": 2.868, + "theoretical_loss": 3.8437296204126623, + "tokens_seen": 594311168 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004140922768304915, + "loss": 3.0221, + "theoretical_loss": 3.843685321380763, + "tokens_seen": 594376704 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004140822467402207, + "loss": 2.856, + "theoretical_loss": 3.8436410286004725, + "tokens_seen": 594442240 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041407221664994985, + "loss": 2.9139, + "theoretical_loss": 3.8435967420702175, + "tokens_seen": 594507776 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041406218655967904, + "loss": 2.8862, + "theoretical_loss": 3.8435524617884287, + "tokens_seen": 594573312 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041405215646940827, + "loss": 2.977, + "theoretical_loss": 3.8435081877535344, + "tokens_seen": 594638848 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004140421263791374, + "loss": 2.78, + "theoretical_loss": 3.843463919963966, + "tokens_seen": 594704384 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 708692, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.850222587585449, + "objective/train/theoretical_loss": 3.843441788410689, + "objective/train/tokens_used": 615197152, + "theoretical_loss": 3.843441788410689, + "tokens_seen": 594737152 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041403209628886663, + "loss": 2.9877, + "theoretical_loss": 3.8434196584181546, + "tokens_seen": 594769920 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041402206619859576, + "loss": 2.9706, + "theoretical_loss": 3.8433754031145315, + "tokens_seen": 594835456 + }, + { + "epoch": 2.0, + "learning_rate": 0.000414012036108325, + "loss": 2.9933, + "theoretical_loss": 3.8433311540515285, + "tokens_seen": 594900992 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004140020060180542, + "loss": 3.0618, + "theoretical_loss": 3.8432869112275787, + "tokens_seen": 594966528 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041399197592778336, + "loss": 3.0248, + "theoretical_loss": 3.8432426746411146, + "tokens_seen": 595032064 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041398194583751254, + "loss": 2.9282, + "theoretical_loss": 3.8431984442905707, + "tokens_seen": 595097600 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004139719157472417, + "loss": 2.9484, + "theoretical_loss": 3.8431542201743807, + "tokens_seen": 595163136 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004139618856569709, + "loss": 2.8406, + "theoretical_loss": 3.8431100022909797, + "tokens_seen": 595228672 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041395185556670014, + "loss": 2.7116, + "theoretical_loss": 3.8430657906388035, + "tokens_seen": 595294208 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041394182547642926, + "loss": 2.8288, + "theoretical_loss": 3.843021585216288, + "tokens_seen": 595359744 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004139317953861585, + "loss": 2.886, + "theoretical_loss": 3.84297738602187, + "tokens_seen": 595425280 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004139217652958877, + "loss": 2.9879, + "theoretical_loss": 3.8429331930539865, + "tokens_seen": 595490816 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041391173520561686, + "loss": 2.8221, + "theoretical_loss": 3.8428890063110748, + "tokens_seen": 595556352 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041390170511534604, + "loss": 2.8475, + "theoretical_loss": 3.8428448257915733, + "tokens_seen": 595621888 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004138916750250752, + "loss": 3.1417, + "theoretical_loss": 3.8428006514939215, + "tokens_seen": 595687424 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004138816449348044, + "loss": 2.7316, + "theoretical_loss": 3.8427564834165584, + "tokens_seen": 595752960 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041387161484453364, + "loss": 2.8183, + "theoretical_loss": 3.842712321557924, + "tokens_seen": 595818496 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041386158475426277, + "loss": 2.9654, + "theoretical_loss": 3.842668165916459, + "tokens_seen": 595884032 + }, + { + "epoch": 2.0, + "learning_rate": 0.000413851554663992, + "loss": 2.7973, + "theoretical_loss": 3.8426240164906043, + "tokens_seen": 595949568 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041384152457372113, + "loss": 3.0307, + "theoretical_loss": 3.842579873278801, + "tokens_seen": 596015104 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041383149448345036, + "loss": 2.7348, + "theoretical_loss": 3.842535736279493, + "tokens_seen": 596080640 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041382146439317954, + "loss": 2.9633, + "theoretical_loss": 3.842491605491122, + "tokens_seen": 596146176 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004138114343029087, + "loss": 2.9599, + "theoretical_loss": 3.842447480912131, + "tokens_seen": 596211712 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004138014042126379, + "loss": 2.7724, + "theoretical_loss": 3.8424033625409644, + "tokens_seen": 596277248 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004137913741223671, + "loss": 3.0336, + "theoretical_loss": 3.8423592503760666, + "tokens_seen": 596342784 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 713945, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0268349647521973, + "objective/train/theoretical_loss": 3.8423371966204827, + "objective/train/tokens_used": 616835552, + "theoretical_loss": 3.8423371966204827, + "tokens_seen": 596375552 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041378134403209627, + "loss": 3.0419, + "theoretical_loss": 3.842315144415883, + "tokens_seen": 596408320 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004137713139418255, + "loss": 2.9421, + "theoretical_loss": 3.842271044658859, + "tokens_seen": 596473856 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041376128385155463, + "loss": 2.99, + "theoretical_loss": 3.84222695110344, + "tokens_seen": 596539392 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041375125376128387, + "loss": 2.9369, + "theoretical_loss": 3.842182863748074, + "tokens_seen": 596604928 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041374122367101305, + "loss": 3.0614, + "theoretical_loss": 3.8421387825912072, + "tokens_seen": 596670464 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041373119358074223, + "loss": 3.1159, + "theoretical_loss": 3.842094707631288, + "tokens_seen": 596736000 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004137211634904714, + "loss": 2.7884, + "theoretical_loss": 3.8420506388667652, + "tokens_seen": 596801536 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004137111334002006, + "loss": 2.977, + "theoretical_loss": 3.842006576296087, + "tokens_seen": 596867072 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041370110330992977, + "loss": 2.9942, + "theoretical_loss": 3.841962519917703, + "tokens_seen": 596932608 + }, + { + "epoch": 2.0, + "learning_rate": 0.000413691073219659, + "loss": 2.86, + "theoretical_loss": 3.8419184697300635, + "tokens_seen": 596998144 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041368104312938813, + "loss": 2.8966, + "theoretical_loss": 3.841874425731619, + "tokens_seen": 597063680 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041367101303911737, + "loss": 2.7813, + "theoretical_loss": 3.841830387920821, + "tokens_seen": 597129216 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041366098294884655, + "loss": 2.6166, + "theoretical_loss": 3.8417863562961205, + "tokens_seen": 597194752 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041365095285857573, + "loss": 2.9736, + "theoretical_loss": 3.841742330855971, + "tokens_seen": 597260288 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041364092276830497, + "loss": 3.0773, + "theoretical_loss": 3.841698311598824, + "tokens_seen": 597325824 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004136308926780341, + "loss": 3.1391, + "theoretical_loss": 3.8416542985231343, + "tokens_seen": 597391360 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041362086258776333, + "loss": 2.9423, + "theoretical_loss": 3.841610291627355, + "tokens_seen": 597456896 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004136108324974925, + "loss": 2.874, + "theoretical_loss": 3.841566290909941, + "tokens_seen": 597522432 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004136008024072217, + "loss": 2.9495, + "theoretical_loss": 3.8415222963693467, + "tokens_seen": 597587968 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004135907723169509, + "loss": 3.039, + "theoretical_loss": 3.8414783080040285, + "tokens_seen": 597653504 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041358074222668005, + "loss": 2.8841, + "theoretical_loss": 3.8414343258124424, + "tokens_seen": 597719040 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041357071213640924, + "loss": 2.803, + "theoretical_loss": 3.841390349793045, + "tokens_seen": 597784576 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041356068204613847, + "loss": 2.9264, + "theoretical_loss": 3.8413463799442935, + "tokens_seen": 597850112 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004135506519558676, + "loss": 3.0903, + "theoretical_loss": 3.8413024162646465, + "tokens_seen": 597915648 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041354062186559683, + "loss": 2.7137, + "theoretical_loss": 3.8412584587525616, + "tokens_seen": 597981184 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 718818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.732727289199829, + "objective/train/theoretical_loss": 3.841236482308873, + "objective/train/tokens_used": 618473952, + "theoretical_loss": 3.841236482308873, + "tokens_seen": 598013952 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041353059177532596, + "loss": 3.0269, + "theoretical_loss": 3.841214507406498, + "tokens_seen": 598046720 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004135205616850552, + "loss": 2.9763, + "theoretical_loss": 3.8411705622249155, + "tokens_seen": 598112256 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004135105315947844, + "loss": 3.026, + "theoretical_loss": 3.841126623206274, + "tokens_seen": 598177792 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041350050150451356, + "loss": 3.0752, + "theoretical_loss": 3.841082690349034, + "tokens_seen": 598243328 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041349047141424274, + "loss": 2.9446, + "theoretical_loss": 3.8410387636516568, + "tokens_seen": 598308864 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004134804413239719, + "loss": 2.8162, + "theoretical_loss": 3.840994843112604, + "tokens_seen": 598374400 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004134704112337011, + "loss": 2.8233, + "theoretical_loss": 3.8409509287303383, + "tokens_seen": 598439936 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041346038114343034, + "loss": 2.9554, + "theoretical_loss": 3.840907020503322, + "tokens_seen": 598505472 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041345035105315946, + "loss": 2.7912, + "theoretical_loss": 3.840863118430019, + "tokens_seen": 598571008 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004134403209628887, + "loss": 2.9135, + "theoretical_loss": 3.840819222508893, + "tokens_seen": 598636544 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004134302908726179, + "loss": 3.072, + "theoretical_loss": 3.840775332738408, + "tokens_seen": 598702080 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041342026078234706, + "loss": 2.8755, + "theoretical_loss": 3.84073144911703, + "tokens_seen": 598767616 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041341023069207624, + "loss": 2.984, + "theoretical_loss": 3.8406875716432243, + "tokens_seen": 598833152 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004134002006018054, + "loss": 3.0003, + "theoretical_loss": 3.8406437003154568, + "tokens_seen": 598898688 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004133901705115346, + "loss": 2.9602, + "theoretical_loss": 3.840599835132194, + "tokens_seen": 598964224 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041338014042126384, + "loss": 2.85, + "theoretical_loss": 3.840555976091904, + "tokens_seen": 599029760 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041337011033099297, + "loss": 2.9622, + "theoretical_loss": 3.840512123193054, + "tokens_seen": 599095296 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004133600802407222, + "loss": 2.9202, + "theoretical_loss": 3.8404682764341125, + "tokens_seen": 599160832 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041335005015045133, + "loss": 2.9157, + "theoretical_loss": 3.840424435813548, + "tokens_seen": 599226368 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041334002006018056, + "loss": 2.9191, + "theoretical_loss": 3.84038060132983, + "tokens_seen": 599291904 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041332998996990975, + "loss": 2.898, + "theoretical_loss": 3.8403367729814297, + "tokens_seen": 599357440 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004133199598796389, + "loss": 2.9767, + "theoretical_loss": 3.840292950766816, + "tokens_seen": 599422976 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004133099297893681, + "loss": 2.8749, + "theoretical_loss": 3.840249134684461, + "tokens_seen": 599488512 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004132998996990973, + "loss": 2.9314, + "theoretical_loss": 3.8402053247328363, + "tokens_seen": 599554048 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041328986960882647, + "loss": 2.8579, + "theoretical_loss": 3.8401615209104136, + "tokens_seen": 599619584 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 723725, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7808403968811035, + "objective/train/theoretical_loss": 3.8401396212971757, + "objective/train/tokens_used": 620112352, + "theoretical_loss": 3.8401396212971757, + "tokens_seen": 599652352 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004132798395185557, + "loss": 2.7878, + "theoretical_loss": 3.8401177232156654, + "tokens_seen": 599685120 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041326980942828483, + "loss": 2.812, + "theoretical_loss": 3.8400739316470665, + "tokens_seen": 599750656 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041325977933801407, + "loss": 3.0098, + "theoretical_loss": 3.8400301462030892, + "tokens_seen": 599816192 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041324974924774325, + "loss": 2.9269, + "theoretical_loss": 3.8399863668822087, + "tokens_seen": 599881728 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041323971915747243, + "loss": 2.8626, + "theoretical_loss": 3.8399425936828995, + "tokens_seen": 599947264 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004132296890672016, + "loss": 2.9165, + "theoretical_loss": 3.839898826603637, + "tokens_seen": 600012800 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004132196589769308, + "loss": 2.9539, + "theoretical_loss": 3.839855065642898, + "tokens_seen": 600078336 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041320962888665997, + "loss": 2.9778, + "theoretical_loss": 3.839811310799158, + "tokens_seen": 600143872 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004131995987963892, + "loss": 2.8436, + "theoretical_loss": 3.839767562070895, + "tokens_seen": 600209408 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041318956870611833, + "loss": 2.8844, + "theoretical_loss": 3.839723819456586, + "tokens_seen": 600274944 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041317953861584757, + "loss": 3.0573, + "theoretical_loss": 3.8396800829547093, + "tokens_seen": 600340480 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004131695085255767, + "loss": 2.857, + "theoretical_loss": 3.839636352563744, + "tokens_seen": 600406016 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041315947843530593, + "loss": 2.8885, + "theoretical_loss": 3.83959262828217, + "tokens_seen": 600471552 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004131494483450351, + "loss": 2.8889, + "theoretical_loss": 3.8395489101084657, + "tokens_seen": 600537088 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004131394182547643, + "loss": 2.8557, + "theoretical_loss": 3.8395051980411123, + "tokens_seen": 600602624 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004131293881644935, + "loss": 2.9485, + "theoretical_loss": 3.8394614920785903, + "tokens_seen": 600668160 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004131193580742227, + "loss": 2.8408, + "theoretical_loss": 3.8394177922193817, + "tokens_seen": 600733696 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041310932798395184, + "loss": 2.8704, + "theoretical_loss": 3.8393740984619686, + "tokens_seen": 600799232 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004130992978936811, + "loss": 2.8666, + "theoretical_loss": 3.8393304108048323, + "tokens_seen": 600864768 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004130892678034102, + "loss": 2.9289, + "theoretical_loss": 3.839286729246458, + "tokens_seen": 600930304 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041307923771313944, + "loss": 2.8433, + "theoretical_loss": 3.839243053785327, + "tokens_seen": 600995840 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004130692076228686, + "loss": 2.8201, + "theoretical_loss": 3.8391993844199255, + "tokens_seen": 601061376 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004130591775325978, + "loss": 2.9049, + "theoretical_loss": 3.839155721148737, + "tokens_seen": 601126912 + }, + { + "epoch": 2.0, + "learning_rate": 0.000413049147442327, + "loss": 2.8023, + "theoretical_loss": 3.8391120639702474, + "tokens_seen": 601192448 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041303911735205616, + "loss": 2.9873, + "theoretical_loss": 3.839068412882942, + "tokens_seen": 601257984 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 728779, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1727378368377686, + "objective/train/theoretical_loss": 3.8390465896230106, + "objective/train/tokens_used": 621750752, + "theoretical_loss": 3.8390465896230106, + "tokens_seen": 601290752 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041302908726178534, + "loss": 2.9188, + "theoretical_loss": 3.8390247678853076, + "tokens_seen": 601323520 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004130190571715146, + "loss": 2.8634, + "theoretical_loss": 3.8389811289758304, + "tokens_seen": 601389056 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004130090270812437, + "loss": 2.6309, + "theoretical_loss": 3.838937496152999, + "tokens_seen": 601454592 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041299899699097294, + "loss": 2.9133, + "theoretical_loss": 3.8388938694153003, + "tokens_seen": 601520128 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041298896690070207, + "loss": 2.9708, + "theoretical_loss": 3.838850248761223, + "tokens_seen": 601585664 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004129789368104313, + "loss": 2.7942, + "theoretical_loss": 3.8388066341892566, + "tokens_seen": 601651200 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004129689067201605, + "loss": 2.8799, + "theoretical_loss": 3.838763025697891, + "tokens_seen": 601716736 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041295887662988966, + "loss": 2.881, + "theoretical_loss": 3.8387194232856148, + "tokens_seen": 601782272 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041294884653961884, + "loss": 2.9561, + "theoretical_loss": 3.8386758269509205, + "tokens_seen": 601847808 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004129388164493481, + "loss": 2.7078, + "theoretical_loss": 3.8386322366922982, + "tokens_seen": 601913344 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004129287863590772, + "loss": 2.9614, + "theoretical_loss": 3.83858865250824, + "tokens_seen": 601978880 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041291875626880644, + "loss": 2.8647, + "theoretical_loss": 3.8385450743972376, + "tokens_seen": 602044416 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004129087261785356, + "loss": 2.8494, + "theoretical_loss": 3.8385015023577846, + "tokens_seen": 602109952 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004128986960882648, + "loss": 2.8509, + "theoretical_loss": 3.8384579363883744, + "tokens_seen": 602175488 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041288866599799404, + "loss": 2.8159, + "theoretical_loss": 3.8384143764875, + "tokens_seen": 602241024 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041287863590772317, + "loss": 3.0505, + "theoretical_loss": 3.8383708226536575, + "tokens_seen": 602306560 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004128686058174524, + "loss": 2.7001, + "theoretical_loss": 3.83832727488534, + "tokens_seen": 602372096 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041285857572718153, + "loss": 2.7911, + "theoretical_loss": 3.838283733181044, + "tokens_seen": 602437632 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041284854563691076, + "loss": 3.0129, + "theoretical_loss": 3.8382401975392653, + "tokens_seen": 602503168 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041283851554663995, + "loss": 2.7819, + "theoretical_loss": 3.8381966679585005, + "tokens_seen": 602568704 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004128284854563691, + "loss": 2.8773, + "theoretical_loss": 3.838153144437247, + "tokens_seen": 602634240 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004128184553660983, + "loss": 2.819, + "theoretical_loss": 3.838109626974002, + "tokens_seen": 602699776 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004128084252758275, + "loss": 3.0012, + "theoretical_loss": 3.8380661155672646, + "tokens_seen": 602765312 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041279839518555667, + "loss": 2.8227, + "theoretical_loss": 3.838022610215532, + "tokens_seen": 602830848 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004127883650952859, + "loss": 2.6656, + "theoretical_loss": 3.837979110917305, + "tokens_seen": 602896384 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 731087, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8379440307617188, + "objective/train/theoretical_loss": 3.837957363537787, + "objective/train/tokens_used": 623389152, + "theoretical_loss": 3.837957363537787, + "tokens_seen": 602929152 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041277833500501503, + "loss": 2.9037, + "theoretical_loss": 3.837935617671082, + "tokens_seen": 602961920 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041276830491474427, + "loss": 2.9334, + "theoretical_loss": 3.837892130475365, + "tokens_seen": 603027456 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041275827482447345, + "loss": 2.8944, + "theoretical_loss": 3.837848649328653, + "tokens_seen": 603092992 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041274824473420263, + "loss": 2.8149, + "theoretical_loss": 3.8378051742294494, + "tokens_seen": 603158528 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004127382146439318, + "loss": 2.8712, + "theoretical_loss": 3.8377617051762547, + "tokens_seen": 603224064 + }, + { + "epoch": 2.0, + "learning_rate": 0.000412728184553661, + "loss": 2.8573, + "theoretical_loss": 3.8377182421675715, + "tokens_seen": 603289600 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041271815446339017, + "loss": 2.8153, + "theoretical_loss": 3.8376747852019033, + "tokens_seen": 603355136 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004127081243731194, + "loss": 2.7117, + "theoretical_loss": 3.837631334277753, + "tokens_seen": 603420672 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041269809428284854, + "loss": 2.8129, + "theoretical_loss": 3.8375878893936255, + "tokens_seen": 603486208 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041268806419257777, + "loss": 2.9922, + "theoretical_loss": 3.837544450548025, + "tokens_seen": 603551744 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004126780341023069, + "loss": 2.7937, + "theoretical_loss": 3.8375010177394566, + "tokens_seen": 603617280 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041266800401203613, + "loss": 2.6681, + "theoretical_loss": 3.8374575909664257, + "tokens_seen": 603682816 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004126579739217653, + "loss": 2.8324, + "theoretical_loss": 3.837414170227439, + "tokens_seen": 603748352 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004126479438314945, + "loss": 3.0201, + "theoretical_loss": 3.8373707555210035, + "tokens_seen": 603813888 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004126379137412237, + "loss": 2.7794, + "theoretical_loss": 3.8373273468456253, + "tokens_seen": 603879424 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004126278836509529, + "loss": 2.9055, + "theoretical_loss": 3.8372839441998137, + "tokens_seen": 603944960 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041261785356068204, + "loss": 2.978, + "theoretical_loss": 3.8372405475820757, + "tokens_seen": 604010496 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004126078234704113, + "loss": 2.9492, + "theoretical_loss": 3.8371971569909205, + "tokens_seen": 604076032 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004125977933801404, + "loss": 2.8896, + "theoretical_loss": 3.837153772424858, + "tokens_seen": 604141568 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041258776328986964, + "loss": 2.9618, + "theoretical_loss": 3.837110393882398, + "tokens_seen": 604207104 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004125777331995988, + "loss": 2.9102, + "theoretical_loss": 3.83706702136205, + "tokens_seen": 604272640 + }, + { + "epoch": 2.0, + "learning_rate": 0.000412567703109328, + "loss": 3.0573, + "theoretical_loss": 3.8370236548623264, + "tokens_seen": 604338176 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004125576730190572, + "loss": 2.7037, + "theoretical_loss": 3.8369802943817377, + "tokens_seen": 604403712 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041254764292878636, + "loss": 2.8876, + "theoretical_loss": 3.8369369399187963, + "tokens_seen": 604469248 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041253761283851554, + "loss": 2.9452, + "theoretical_loss": 3.836893591472015, + "tokens_seen": 604534784 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 736119, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0697572231292725, + "objective/train/theoretical_loss": 3.83687191950422, + "objective/train/tokens_used": 625027552, + "theoretical_loss": 3.83687191950422, + "tokens_seen": 604567552 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004125275827482448, + "loss": 3.0517, + "theoretical_loss": 3.836850249039906, + "tokens_seen": 604600320 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004125175526579739, + "loss": 2.8523, + "theoretical_loss": 3.8368069126209843, + "tokens_seen": 604665856 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041250752256770314, + "loss": 2.9029, + "theoretical_loss": 3.836763582213763, + "tokens_seen": 604731392 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041249749247743227, + "loss": 2.6393, + "theoretical_loss": 3.8367202578167574, + "tokens_seen": 604796928 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004124874623871615, + "loss": 2.7526, + "theoretical_loss": 3.8366769394284823, + "tokens_seen": 604862464 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004124774322968907, + "loss": 2.9766, + "theoretical_loss": 3.836633627047453, + "tokens_seen": 604928000 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041246740220661986, + "loss": 2.8323, + "theoretical_loss": 3.8365903206721867, + "tokens_seen": 604993536 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041245737211634904, + "loss": 2.823, + "theoretical_loss": 3.8365470203012, + "tokens_seen": 605059072 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004124473420260783, + "loss": 2.9371, + "theoretical_loss": 3.83650372593301, + "tokens_seen": 605124608 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004124373119358074, + "loss": 2.8824, + "theoretical_loss": 3.836460437566134, + "tokens_seen": 605190144 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041242728184553664, + "loss": 2.9946, + "theoretical_loss": 3.8364171551990918, + "tokens_seen": 605255680 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041241725175526577, + "loss": 3.0648, + "theoretical_loss": 3.836373878830401, + "tokens_seen": 605321216 + }, + { + "epoch": 2.0, + "learning_rate": 0.000412407221664995, + "loss": 2.8366, + "theoretical_loss": 3.8363306084585815, + "tokens_seen": 605386752 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004123971915747242, + "loss": 2.9835, + "theoretical_loss": 3.836287344082153, + "tokens_seen": 605452288 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041238716148445337, + "loss": 2.8837, + "theoretical_loss": 3.8362440856996365, + "tokens_seen": 605517824 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041237713139418255, + "loss": 2.726, + "theoretical_loss": 3.8362008333095527, + "tokens_seen": 605583360 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041236710130391173, + "loss": 2.8451, + "theoretical_loss": 3.8361575869104225, + "tokens_seen": 605648896 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004123570712136409, + "loss": 2.7774, + "theoretical_loss": 3.836114346500769, + "tokens_seen": 605714432 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041234704112337015, + "loss": 2.7372, + "theoretical_loss": 3.8360711120791144, + "tokens_seen": 605779968 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041233701103309927, + "loss": 2.8797, + "theoretical_loss": 3.8360278836439816, + "tokens_seen": 605845504 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004123269809428285, + "loss": 3.046, + "theoretical_loss": 3.835984661193895, + "tokens_seen": 605911040 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041231695085255763, + "loss": 2.7871, + "theoretical_loss": 3.8359414447273776, + "tokens_seen": 605976576 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041230692076228687, + "loss": 2.8635, + "theoretical_loss": 3.835898234242954, + "tokens_seen": 606042112 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041229689067201605, + "loss": 2.9083, + "theoretical_loss": 3.8358550297391507, + "tokens_seen": 606107648 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041228686058174523, + "loss": 2.666, + "theoretical_loss": 3.8358118312144924, + "tokens_seen": 606173184 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 740966, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.874371290206909, + "objective/train/theoretical_loss": 3.8357902341938823, + "objective/train/tokens_used": 626665952, + "theoretical_loss": 3.8357902341938823, + "tokens_seen": 606205952 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004122768304914744, + "loss": 2.7719, + "theoretical_loss": 3.835768638667506, + "tokens_seen": 606238720 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041226680040120365, + "loss": 2.8218, + "theoretical_loss": 3.8357254520967174, + "tokens_seen": 606304256 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004122567703109328, + "loss": 2.8802, + "theoretical_loss": 3.835682271500655, + "tokens_seen": 606369792 + }, + { + "epoch": 2.0, + "learning_rate": 0.000412246740220662, + "loss": 3.0319, + "theoretical_loss": 3.8356390968778453, + "tokens_seen": 606435328 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041223671013039114, + "loss": 2.865, + "theoretical_loss": 3.8355959282268177, + "tokens_seen": 606500864 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004122266800401204, + "loss": 2.9013, + "theoretical_loss": 3.8355527655461, + "tokens_seen": 606566400 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041221664994984955, + "loss": 2.9206, + "theoretical_loss": 3.835509608834223, + "tokens_seen": 606631936 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041220661985957874, + "loss": 2.9424, + "theoretical_loss": 3.8354664580897153, + "tokens_seen": 606697472 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004121965897693079, + "loss": 2.8802, + "theoretical_loss": 3.835423313311108, + "tokens_seen": 606763008 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004121865596790371, + "loss": 2.9102, + "theoretical_loss": 3.8353801744969322, + "tokens_seen": 606828544 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004121765295887663, + "loss": 2.8268, + "theoretical_loss": 3.835337041645719, + "tokens_seen": 606894080 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004121664994984955, + "loss": 2.9799, + "theoretical_loss": 3.8352939147559995, + "tokens_seen": 606959616 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004121564694082247, + "loss": 2.7371, + "theoretical_loss": 3.835250793826308, + "tokens_seen": 607025152 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004121464393179539, + "loss": 2.7789, + "theoretical_loss": 3.835207678855176, + "tokens_seen": 607090688 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004121364092276831, + "loss": 2.9266, + "theoretical_loss": 3.835164569841138, + "tokens_seen": 607156224 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041212637913741224, + "loss": 2.6294, + "theoretical_loss": 3.8351214667827276, + "tokens_seen": 607221760 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004121163490471415, + "loss": 2.7727, + "theoretical_loss": 3.835078369678479, + "tokens_seen": 607287296 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004121063189568706, + "loss": 2.9085, + "theoretical_loss": 3.835035278526928, + "tokens_seen": 607352832 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041209628886659984, + "loss": 2.8811, + "theoretical_loss": 3.8349921933266105, + "tokens_seen": 607418368 + }, + { + "epoch": 2.0, + "learning_rate": 0.000412086258776329, + "loss": 2.8147, + "theoretical_loss": 3.8349491140760614, + "tokens_seen": 607483904 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004120762286860582, + "loss": 2.9682, + "theoretical_loss": 3.8349060407738182, + "tokens_seen": 607549440 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004120661985957874, + "loss": 2.9455, + "theoretical_loss": 3.834862973418418, + "tokens_seen": 607614976 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041205616850551656, + "loss": 2.921, + "theoretical_loss": 3.834819912008398, + "tokens_seen": 607680512 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041204613841524574, + "loss": 2.913, + "theoretical_loss": 3.8347768565422973, + "tokens_seen": 607746048 + }, + { + "epoch": 2.0, + "learning_rate": 0.000412036108324975, + "loss": 2.8299, + "theoretical_loss": 3.8347338070186536, + "tokens_seen": 607811584 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 746024, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.745020627975464, + "objective/train/theoretical_loss": 3.834712284484797, + "objective/train/tokens_used": 628304352, + "theoretical_loss": 3.834712284484797, + "tokens_seen": 607844352 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004120260782347041, + "loss": 2.9376, + "theoretical_loss": 3.834690763436007, + "tokens_seen": 607877120 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041201604814443334, + "loss": 2.8025, + "theoretical_loss": 3.8346477257928964, + "tokens_seen": 607942656 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041200601805416247, + "loss": 2.9184, + "theoretical_loss": 3.8346046940878624, + "tokens_seen": 608008192 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004119959879638917, + "loss": 2.8953, + "theoretical_loss": 3.834561668319447, + "tokens_seen": 608073728 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004119859578736209, + "loss": 2.7333, + "theoretical_loss": 3.834518648486189, + "tokens_seen": 608139264 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041197592778335006, + "loss": 2.8579, + "theoretical_loss": 3.8344756345866324, + "tokens_seen": 608204800 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041196589769307924, + "loss": 2.8816, + "theoretical_loss": 3.834432626619318, + "tokens_seen": 608270336 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004119558676028085, + "loss": 2.8811, + "theoretical_loss": 3.8343896245827898, + "tokens_seen": 608335872 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004119458375125376, + "loss": 2.997, + "theoretical_loss": 3.8343466284755907, + "tokens_seen": 608401408 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041193580742226684, + "loss": 2.8126, + "theoretical_loss": 3.8343036382962645, + "tokens_seen": 608466944 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041192577733199597, + "loss": 2.9976, + "theoretical_loss": 3.834260654043356, + "tokens_seen": 608532480 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004119157472417252, + "loss": 2.8142, + "theoretical_loss": 3.8342176757154087, + "tokens_seen": 608598016 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004119057171514544, + "loss": 2.7509, + "theoretical_loss": 3.8341747033109703, + "tokens_seen": 608663552 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041189568706118357, + "loss": 2.6745, + "theoretical_loss": 3.8341317368285845, + "tokens_seen": 608729088 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041188565697091275, + "loss": 2.8161, + "theoretical_loss": 3.8340887762667992, + "tokens_seen": 608794624 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041187562688064193, + "loss": 2.9308, + "theoretical_loss": 3.8340458216241613, + "tokens_seen": 608860160 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004118655967903711, + "loss": 2.8721, + "theoretical_loss": 3.834002872899217, + "tokens_seen": 608925696 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041185556670010035, + "loss": 2.7833, + "theoretical_loss": 3.833959930090516, + "tokens_seen": 608991232 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041184553660982947, + "loss": 2.8491, + "theoretical_loss": 3.8339169931966053, + "tokens_seen": 609056768 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004118355065195587, + "loss": 2.8048, + "theoretical_loss": 3.8338740622160343, + "tokens_seen": 609122304 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041182547642928783, + "loss": 2.8229, + "theoretical_loss": 3.833831137147353, + "tokens_seen": 609187840 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041181544633901707, + "loss": 3.0615, + "theoretical_loss": 3.8337882179891114, + "tokens_seen": 609253376 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041180541624874625, + "loss": 2.6738, + "theoretical_loss": 3.83374530473986, + "tokens_seen": 609318912 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041179538615847543, + "loss": 2.706, + "theoretical_loss": 3.833702397398149, + "tokens_seen": 609384448 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004117853560682046, + "loss": 2.7287, + "theoretical_loss": 3.8336594959625314, + "tokens_seen": 609449984 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 751044, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0871493816375732, + "objective/train/theoretical_loss": 3.833638047459054, + "objective/train/tokens_used": 629942752, + "theoretical_loss": 3.833638047459054, + "tokens_seen": 609482752 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041177532597793385, + "loss": 3.0586, + "theoretical_loss": 3.833616600431558, + "tokens_seen": 609515520 + }, + { + "epoch": 2.0, + "learning_rate": 0.000411765295887663, + "loss": 2.8549, + "theoretical_loss": 3.8335737108037815, + "tokens_seen": 609581056 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004117552657973922, + "loss": 2.872, + "theoretical_loss": 3.833530827077756, + "tokens_seen": 609646592 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041174523570712134, + "loss": 2.8627, + "theoretical_loss": 3.833487949252034, + "tokens_seen": 609712128 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004117352056168506, + "loss": 2.8217, + "theoretical_loss": 3.8334450773251705, + "tokens_seen": 609777664 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041172517552657975, + "loss": 2.99, + "theoretical_loss": 3.833402211295719, + "tokens_seen": 609843200 + }, + { + "epoch": 2.0, + "learning_rate": 0.00041171514543630894, + "loss": 2.7994, + "theoretical_loss": 3.833359351162236, + "tokens_seen": 609908736 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004117051153460381, + "loss": 2.9186, + "theoretical_loss": 3.8333164969232767, + "tokens_seen": 609974272 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004116950852557673, + "loss": 2.9245, + "theoretical_loss": 3.8332736485773964, + "tokens_seen": 610039808 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004116850551654965, + "loss": 2.7503, + "theoretical_loss": 3.8332308061231526, + "tokens_seen": 610105344 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004116750250752257, + "loss": 2.7762, + "theoretical_loss": 3.833187969559102, + "tokens_seen": 610170880 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041166499498495484, + "loss": 3.0806, + "theoretical_loss": 3.833145138883803, + "tokens_seen": 610236416 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004116549648946841, + "loss": 2.5722, + "theoretical_loss": 3.8331023140958127, + "tokens_seen": 610301952 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004116449348044132, + "loss": 2.9739, + "theoretical_loss": 3.833059495193691, + "tokens_seen": 610367488 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041163490471414244, + "loss": 2.908, + "theoretical_loss": 3.833016682175996, + "tokens_seen": 610433024 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004116248746238716, + "loss": 2.9373, + "theoretical_loss": 3.8329738750412883, + "tokens_seen": 610498560 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004116148445336008, + "loss": 2.7929, + "theoretical_loss": 3.832931073788127, + "tokens_seen": 610564096 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041160481444333, + "loss": 2.9314, + "theoretical_loss": 3.8328882784150746, + "tokens_seen": 610629632 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004115947843530592, + "loss": 2.9051, + "theoretical_loss": 3.8328454889206904, + "tokens_seen": 610695168 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041158475426278834, + "loss": 2.8858, + "theoretical_loss": 3.8328027053035374, + "tokens_seen": 610760704 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004115747241725176, + "loss": 3.0172, + "theoretical_loss": 3.8327599275621775, + "tokens_seen": 610826240 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004115646940822467, + "loss": 2.8714, + "theoretical_loss": 3.832717155695173, + "tokens_seen": 610891776 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041155466399197594, + "loss": 2.7776, + "theoretical_loss": 3.832674389701088, + "tokens_seen": 610957312 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004115446339017051, + "loss": 2.892, + "theoretical_loss": 3.8326316295784855, + "tokens_seen": 611022848 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004115346038114343, + "loss": 2.9538, + "theoretical_loss": 3.8325888753259303, + "tokens_seen": 611088384 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 756149, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.102915048599243, + "objective/train/theoretical_loss": 3.8325675004004713, + "objective/train/tokens_used": 631581152, + "theoretical_loss": 3.8325675004004713, + "tokens_seen": 611121152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004115245737211635, + "loss": 2.9014, + "theoretical_loss": 3.8325461269419865, + "tokens_seen": 611153920 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041151454363089267, + "loss": 2.8073, + "theoretical_loss": 3.8325033844252197, + "tokens_seen": 611219456 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041150451354062185, + "loss": 2.8775, + "theoretical_loss": 3.8324606477741963, + "tokens_seen": 611284992 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004114944834503511, + "loss": 2.8357, + "theoretical_loss": 3.832417916987482, + "tokens_seen": 611350528 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004114844533600802, + "loss": 2.7828, + "theoretical_loss": 3.832375192063643, + "tokens_seen": 611416064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041147442326980944, + "loss": 2.8968, + "theoretical_loss": 3.832332473001247, + "tokens_seen": 611481600 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041146439317953857, + "loss": 2.957, + "theoretical_loss": 3.832289759798863, + "tokens_seen": 611547136 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004114543630892678, + "loss": 2.8542, + "theoretical_loss": 3.832247052455058, + "tokens_seen": 611612672 + }, + { + "epoch": 2.01, + "learning_rate": 0.000411444332998997, + "loss": 2.9705, + "theoretical_loss": 3.832204350968401, + "tokens_seen": 611678208 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041143430290872617, + "loss": 2.7964, + "theoretical_loss": 3.832161655337462, + "tokens_seen": 611743744 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041142427281845535, + "loss": 2.6554, + "theoretical_loss": 3.832118965560809, + "tokens_seen": 611809280 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004114142427281846, + "loss": 2.8301, + "theoretical_loss": 3.832076281637014, + "tokens_seen": 611874816 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041140421263791377, + "loss": 2.6453, + "theoretical_loss": 3.8320336035646476, + "tokens_seen": 611940352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041139418254764295, + "loss": 2.846, + "theoretical_loss": 3.831990931342281, + "tokens_seen": 612005888 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041138415245737213, + "loss": 2.8807, + "theoretical_loss": 3.831948264968485, + "tokens_seen": 612071424 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004113741223671013, + "loss": 2.842, + "theoretical_loss": 3.8319056044418334, + "tokens_seen": 612136960 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041136409227683055, + "loss": 2.9488, + "theoretical_loss": 3.8318629497608985, + "tokens_seen": 612202496 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041135406218655967, + "loss": 2.8403, + "theoretical_loss": 3.831820300924253, + "tokens_seen": 612268032 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004113440320962889, + "loss": 2.8262, + "theoretical_loss": 3.8317776579304716, + "tokens_seen": 612333568 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041133400200601803, + "loss": 2.9601, + "theoretical_loss": 3.8317350207781278, + "tokens_seen": 612399104 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041132397191574727, + "loss": 2.8308, + "theoretical_loss": 3.831692389465797, + "tokens_seen": 612464640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041131394182547645, + "loss": 2.7944, + "theoretical_loss": 3.831649763992054, + "tokens_seen": 612530176 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041130391173520563, + "loss": 2.942, + "theoretical_loss": 3.831607144355476, + "tokens_seen": 612595712 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004112938816449348, + "loss": 2.8861, + "theoretical_loss": 3.8315645305546377, + "tokens_seen": 612661248 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041128385155466405, + "loss": 2.8315, + "theoretical_loss": 3.8315219225881165, + "tokens_seen": 612726784 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 761270, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.108196973800659, + "objective/train/theoretical_loss": 3.83150062079228, + "objective/train/tokens_used": 633219552, + "theoretical_loss": 3.83150062079228, + "tokens_seen": 612759552 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004112738214643932, + "loss": 2.84, + "theoretical_loss": 3.8314793204544895, + "tokens_seen": 612792320 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004112637913741224, + "loss": 2.7863, + "theoretical_loss": 3.831436724152335, + "tokens_seen": 612857856 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041125376128385154, + "loss": 2.9543, + "theoretical_loss": 3.831394133680231, + "tokens_seen": 612923392 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004112437311935808, + "loss": 2.9146, + "theoretical_loss": 3.8313515490367562, + "tokens_seen": 612988928 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041123370110330995, + "loss": 2.8663, + "theoretical_loss": 3.8313089702204906, + "tokens_seen": 613054464 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041122367101303914, + "loss": 2.9834, + "theoretical_loss": 3.831266397230013, + "tokens_seen": 613120000 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004112136409227683, + "loss": 2.9004, + "theoretical_loss": 3.8312238300639048, + "tokens_seen": 613185536 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004112036108324975, + "loss": 2.9223, + "theoretical_loss": 3.8311812687207456, + "tokens_seen": 613251072 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004111935807422267, + "loss": 2.9421, + "theoretical_loss": 3.8311387131991173, + "tokens_seen": 613316608 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004111835506519559, + "loss": 2.9294, + "theoretical_loss": 3.831096163497602, + "tokens_seen": 613382144 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041117352056168504, + "loss": 2.7985, + "theoretical_loss": 3.8310536196147815, + "tokens_seen": 613447680 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004111634904714143, + "loss": 2.8329, + "theoretical_loss": 3.831011081549239, + "tokens_seen": 613513216 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004111534603811434, + "loss": 3.1471, + "theoretical_loss": 3.8309685492995573, + "tokens_seen": 613578752 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041114343029087264, + "loss": 2.9241, + "theoretical_loss": 3.830926022864321, + "tokens_seen": 613644288 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004111334002006018, + "loss": 2.7, + "theoretical_loss": 3.8308835022421137, + "tokens_seen": 613709824 + }, + { + "epoch": 2.01, + "learning_rate": 0.000411123370110331, + "loss": 2.9448, + "theoretical_loss": 3.83084098743152, + "tokens_seen": 613775360 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004111133400200602, + "loss": 2.6292, + "theoretical_loss": 3.8307984784311255, + "tokens_seen": 613840896 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004111033099297894, + "loss": 2.8419, + "theoretical_loss": 3.8307559752395166, + "tokens_seen": 613906432 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041109327983951854, + "loss": 3.0578, + "theoretical_loss": 3.8307134778552783, + "tokens_seen": 613971968 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004110832497492478, + "loss": 2.6516, + "theoretical_loss": 3.8306709862769983, + "tokens_seen": 614037504 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004110732196589769, + "loss": 2.8264, + "theoretical_loss": 3.830628500503263, + "tokens_seen": 614103040 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041106318956870614, + "loss": 2.635, + "theoretical_loss": 3.830586020532661, + "tokens_seen": 614168576 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004110531594784353, + "loss": 2.9087, + "theoretical_loss": 3.830543546363781, + "tokens_seen": 614234112 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004110431293881645, + "loss": 2.8193, + "theoretical_loss": 3.8305010779952102, + "tokens_seen": 614299648 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004110330992978937, + "loss": 2.8163, + "theoretical_loss": 3.8304586154255387, + "tokens_seen": 614365184 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 766282, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.869065046310425, + "objective/train/theoretical_loss": 3.8304373863148493, + "objective/train/tokens_used": 634857952, + "theoretical_loss": 3.8304373863148493, + "tokens_seen": 614397952 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041102306920762287, + "loss": 2.9004, + "theoretical_loss": 3.8304161586533563, + "tokens_seen": 614430720 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041101303911735205, + "loss": 2.8674, + "theoretical_loss": 3.830373707677253, + "tokens_seen": 614496256 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004110030090270813, + "loss": 2.8783, + "theoretical_loss": 3.83033126249582, + "tokens_seen": 614561792 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004109929789368104, + "loss": 3.0358, + "theoretical_loss": 3.8302888231076473, + "tokens_seen": 614627328 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041098294884653965, + "loss": 2.8569, + "theoretical_loss": 3.830246389511328, + "tokens_seen": 614692864 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041097291875626877, + "loss": 2.7641, + "theoretical_loss": 3.8302039617054526, + "tokens_seen": 614758400 + }, + { + "epoch": 2.01, + "learning_rate": 0.000410962888665998, + "loss": 2.9641, + "theoretical_loss": 3.830161539688616, + "tokens_seen": 614823936 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004109528585757272, + "loss": 2.8592, + "theoretical_loss": 3.8301191234594096, + "tokens_seen": 614889472 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041094282848545637, + "loss": 2.8512, + "theoretical_loss": 3.830076713016428, + "tokens_seen": 614955008 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041093279839518555, + "loss": 2.6787, + "theoretical_loss": 3.8300343083582646, + "tokens_seen": 615020544 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004109227683049148, + "loss": 2.7377, + "theoretical_loss": 3.829991909483515, + "tokens_seen": 615086080 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004109127382146439, + "loss": 3.0285, + "theoretical_loss": 3.8299495163907733, + "tokens_seen": 615151616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041090270812437315, + "loss": 2.7276, + "theoretical_loss": 3.829907129078636, + "tokens_seen": 615217152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004108926780341023, + "loss": 2.821, + "theoretical_loss": 3.829864747545699, + "tokens_seen": 615282688 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004108826479438315, + "loss": 2.8471, + "theoretical_loss": 3.8298223717905584, + "tokens_seen": 615348224 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004108726178535607, + "loss": 2.6164, + "theoretical_loss": 3.8297800018118116, + "tokens_seen": 615413760 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041086258776328987, + "loss": 2.7788, + "theoretical_loss": 3.8297376376080567, + "tokens_seen": 615479296 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041085255767301905, + "loss": 2.6422, + "theoretical_loss": 3.829695279177891, + "tokens_seen": 615544832 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041084252758274823, + "loss": 2.924, + "theoretical_loss": 3.8296529265199135, + "tokens_seen": 615610368 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004108324974924774, + "loss": 2.9178, + "theoretical_loss": 3.8296105796327233, + "tokens_seen": 615675904 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041082246740220665, + "loss": 2.8156, + "theoretical_loss": 3.82956823851492, + "tokens_seen": 615741440 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004108124373119358, + "loss": 2.8831, + "theoretical_loss": 3.829525903165103, + "tokens_seen": 615806976 + }, + { + "epoch": 2.01, + "learning_rate": 0.000410802407221665, + "loss": 2.7111, + "theoretical_loss": 3.8294835735818733, + "tokens_seen": 615872512 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004107923771313942, + "loss": 2.9392, + "theoretical_loss": 3.8294412497638324, + "tokens_seen": 615938048 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004107823470411234, + "loss": 2.6815, + "theoretical_loss": 3.829398931709581, + "tokens_seen": 616003584 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 771277, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5148863792419434, + "objective/train/theoretical_loss": 3.829377774843439, + "objective/train/tokens_used": 636496352, + "theoretical_loss": 3.829377774843439, + "tokens_seen": 616036352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041077231695085256, + "loss": 2.7967, + "theoretical_loss": 3.8293566194177213, + "tokens_seen": 616069120 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041076228686058174, + "loss": 2.6174, + "theoretical_loss": 3.829314312886856, + "tokens_seen": 616134656 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004107522567703109, + "loss": 2.7027, + "theoretical_loss": 3.829272012115588, + "tokens_seen": 616200192 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041074222668004015, + "loss": 2.5431, + "theoretical_loss": 3.8292297171025202, + "tokens_seen": 616265728 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004107321965897693, + "loss": 2.7535, + "theoretical_loss": 3.8291874278462577, + "tokens_seen": 616331264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004107221664994985, + "loss": 2.8045, + "theoretical_loss": 3.8291451443454037, + "tokens_seen": 616396800 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041071213640922764, + "loss": 2.6862, + "theoretical_loss": 3.8291028665985642, + "tokens_seen": 616462336 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004107021063189569, + "loss": 2.6272, + "theoretical_loss": 3.8290605946043437, + "tokens_seen": 616527872 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041069207622868606, + "loss": 2.736, + "theoretical_loss": 3.829018328361349, + "tokens_seen": 616593408 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041068204613841524, + "loss": 2.9518, + "theoretical_loss": 3.8289760678681852, + "tokens_seen": 616658944 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004106720160481444, + "loss": 2.8972, + "theoretical_loss": 3.8289338131234603, + "tokens_seen": 616724480 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004106619859578736, + "loss": 2.8132, + "theoretical_loss": 3.8288915641257812, + "tokens_seen": 616790016 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041065195586760284, + "loss": 2.9286, + "theoretical_loss": 3.828849320873756, + "tokens_seen": 616855552 + }, + { + "epoch": 2.01, + "learning_rate": 0.000410641925777332, + "loss": 2.6631, + "theoretical_loss": 3.828807083365992, + "tokens_seen": 616921088 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004106318956870612, + "loss": 2.8104, + "theoretical_loss": 3.8287648516010995, + "tokens_seen": 616986624 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004106218655967904, + "loss": 2.8351, + "theoretical_loss": 3.8287226255776865, + "tokens_seen": 617052160 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004106118355065196, + "loss": 2.757, + "theoretical_loss": 3.8286804052943637, + "tokens_seen": 617117696 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041060180541624874, + "loss": 2.9234, + "theoretical_loss": 3.828638190749741, + "tokens_seen": 617183232 + }, + { + "epoch": 2.01, + "learning_rate": 0.000410591775325978, + "loss": 2.6066, + "theoretical_loss": 3.8285959819424296, + "tokens_seen": 617248768 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004105817452357071, + "loss": 2.9455, + "theoretical_loss": 3.828553778871039, + "tokens_seen": 617314304 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041057171514543634, + "loss": 2.924, + "theoretical_loss": 3.8285115815341833, + "tokens_seen": 617379840 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004105616850551655, + "loss": 2.7455, + "theoretical_loss": 3.828469389930473, + "tokens_seen": 617445376 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004105516549648947, + "loss": 2.7622, + "theoretical_loss": 3.8284272040585208, + "tokens_seen": 617510912 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004105416248746239, + "loss": 2.8468, + "theoretical_loss": 3.8283850239169412, + "tokens_seen": 617576448 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041053159478435307, + "loss": 2.74, + "theoretical_loss": 3.8283428495043466, + "tokens_seen": 617641984 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 772734, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1529695987701416, + "objective/train/theoretical_loss": 3.8283217644459855, + "objective/train/tokens_used": 638134752, + "theoretical_loss": 3.8283217644459855, + "tokens_seen": 617674752 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041052156469408225, + "loss": 2.6459, + "theoretical_loss": 3.8283006808193516, + "tokens_seen": 617707520 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004105115346038115, + "loss": 3.0692, + "theoretical_loss": 3.8282585178605704, + "tokens_seen": 617773056 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004105015045135406, + "loss": 2.944, + "theoretical_loss": 3.828216360626619, + "tokens_seen": 617838592 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041049147442326985, + "loss": 2.9468, + "theoretical_loss": 3.8281742091161117, + "tokens_seen": 617904128 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041048144433299897, + "loss": 2.9138, + "theoretical_loss": 3.828132063327666, + "tokens_seen": 617969664 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004104714142427282, + "loss": 2.7093, + "theoretical_loss": 3.8280899232598973, + "tokens_seen": 618035200 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004104613841524574, + "loss": 2.9611, + "theoretical_loss": 3.8280477889114226, + "tokens_seen": 618100736 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041045135406218657, + "loss": 2.8029, + "theoretical_loss": 3.82800566028086, + "tokens_seen": 618166272 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041044132397191575, + "loss": 2.9444, + "theoretical_loss": 3.827963537366827, + "tokens_seen": 618231808 + }, + { + "epoch": 2.01, + "learning_rate": 0.000410431293881645, + "loss": 2.8183, + "theoretical_loss": 3.827921420167942, + "tokens_seen": 618297344 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004104212637913741, + "loss": 2.9094, + "theoretical_loss": 3.827879308682825, + "tokens_seen": 618362880 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041041123370110335, + "loss": 2.7128, + "theoretical_loss": 3.8278372029100938, + "tokens_seen": 618428416 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004104012036108325, + "loss": 2.8372, + "theoretical_loss": 3.8277951028483694, + "tokens_seen": 618493952 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004103911735205617, + "loss": 3.0235, + "theoretical_loss": 3.8277530084962716, + "tokens_seen": 618559488 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004103811434302909, + "loss": 2.7758, + "theoretical_loss": 3.827710919852422, + "tokens_seen": 618625024 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041037111334002007, + "loss": 2.7514, + "theoretical_loss": 3.827668836915441, + "tokens_seen": 618690560 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041036108324974925, + "loss": 2.6761, + "theoretical_loss": 3.8276267596839513, + "tokens_seen": 618756096 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041035105315947844, + "loss": 2.8231, + "theoretical_loss": 3.8275846881565743, + "tokens_seen": 618821632 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004103410230692076, + "loss": 2.8471, + "theoretical_loss": 3.827542622331933, + "tokens_seen": 618887168 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041033099297893685, + "loss": 2.7828, + "theoretical_loss": 3.827500562208651, + "tokens_seen": 618952704 + }, + { + "epoch": 2.01, + "learning_rate": 0.000410320962888666, + "loss": 3.0159, + "theoretical_loss": 3.8274585077853525, + "tokens_seen": 619018240 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004103109327983952, + "loss": 2.8519, + "theoretical_loss": 3.82741645906066, + "tokens_seen": 619083776 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004103009027081244, + "loss": 2.7824, + "theoretical_loss": 3.8273744160331997, + "tokens_seen": 619149312 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004102908726178536, + "loss": 2.8297, + "theoretical_loss": 3.8273323787015965, + "tokens_seen": 619214848 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041028084252758276, + "loss": 2.9612, + "theoretical_loss": 3.8272903470644755, + "tokens_seen": 619280384 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 773070, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.051236152648926, + "objective/train/theoretical_loss": 3.827269333380917, + "objective/train/tokens_used": 639773152, + "theoretical_loss": 3.827269333380917, + "tokens_seen": 619313152 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041027081243731194, + "loss": 2.9915, + "theoretical_loss": 3.8272483211204635, + "tokens_seen": 619345920 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004102607823470411, + "loss": 2.84, + "theoretical_loss": 3.8272063008681867, + "tokens_seen": 619411456 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041025075225677035, + "loss": 2.7795, + "theoretical_loss": 3.827164286306272, + "tokens_seen": 619476992 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004102407221664995, + "loss": 2.9163, + "theoretical_loss": 3.8271222774333467, + "tokens_seen": 619542528 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004102306920762287, + "loss": 2.7408, + "theoretical_loss": 3.8270802742480394, + "tokens_seen": 619608064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041022066198595784, + "loss": 2.7077, + "theoretical_loss": 3.8270382767489783, + "tokens_seen": 619673600 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004102106318956871, + "loss": 2.7924, + "theoretical_loss": 3.8269962849347934, + "tokens_seen": 619739136 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041020060180541626, + "loss": 2.9734, + "theoretical_loss": 3.8269542988041123, + "tokens_seen": 619804672 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041019057171514544, + "loss": 2.8563, + "theoretical_loss": 3.826912318355566, + "tokens_seen": 619870208 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004101805416248746, + "loss": 2.7356, + "theoretical_loss": 3.826870343587785, + "tokens_seen": 619935744 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004101705115346038, + "loss": 2.94, + "theoretical_loss": 3.8268283744993994, + "tokens_seen": 620001280 + }, + { + "epoch": 2.01, + "learning_rate": 0.000410160481444333, + "loss": 3.0616, + "theoretical_loss": 3.8267864110890413, + "tokens_seen": 620066816 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004101504513540622, + "loss": 2.8621, + "theoretical_loss": 3.8267444533553423, + "tokens_seen": 620132352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041014042126379135, + "loss": 2.7912, + "theoretical_loss": 3.8267025012969347, + "tokens_seen": 620197888 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004101303911735206, + "loss": 2.8802, + "theoretical_loss": 3.8266605549124515, + "tokens_seen": 620263424 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041012036108324976, + "loss": 2.7613, + "theoretical_loss": 3.8266186142005254, + "tokens_seen": 620328960 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041011033099297894, + "loss": 2.8051, + "theoretical_loss": 3.82657667915979, + "tokens_seen": 620394496 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004101003009027081, + "loss": 2.8462, + "theoretical_loss": 3.8265347497888804, + "tokens_seen": 620460032 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004100902708124373, + "loss": 2.8898, + "theoretical_loss": 3.8264928260864304, + "tokens_seen": 620525568 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004100802407221665, + "loss": 2.881, + "theoretical_loss": 3.8264509080510756, + "tokens_seen": 620591104 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004100702106318957, + "loss": 2.6456, + "theoretical_loss": 3.8264089956814513, + "tokens_seen": 620656640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041006018054162485, + "loss": 2.9374, + "theoretical_loss": 3.826367088976194, + "tokens_seen": 620722176 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004100501504513541, + "loss": 2.9263, + "theoretical_loss": 3.8263251879339393, + "tokens_seen": 620787712 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004100401203610832, + "loss": 2.9678, + "theoretical_loss": 3.8262832925533257, + "tokens_seen": 620853248 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041003009027081245, + "loss": 2.7197, + "theoretical_loss": 3.826241402832989, + "tokens_seen": 620918784 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 774439, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7738590240478516, + "objective/train/theoretical_loss": 3.8262204600949996, + "objective/train/tokens_used": 641411552, + "theoretical_loss": 3.8262204600949996, + "tokens_seen": 620951552 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041002006018054163, + "loss": 2.8329, + "theoretical_loss": 3.8261995187715687, + "tokens_seen": 620984320 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004100100300902708, + "loss": 2.9793, + "theoretical_loss": 3.8261576403677027, + "tokens_seen": 621049856 + }, + { + "epoch": 2.01, + "learning_rate": 0.00041, + "loss": 2.8844, + "theoretical_loss": 3.8261157676200295, + "tokens_seen": 621115392 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040998996990972917, + "loss": 2.8916, + "theoretical_loss": 3.826073900527189, + "tokens_seen": 621180928 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040997993981945835, + "loss": 2.8177, + "theoretical_loss": 3.8260320390878206, + "tokens_seen": 621246464 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004099699097291876, + "loss": 2.5451, + "theoretical_loss": 3.8259901833005645, + "tokens_seen": 621312000 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004099598796389167, + "loss": 2.862, + "theoretical_loss": 3.825948333164062, + "tokens_seen": 621377536 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040994984954864595, + "loss": 2.7519, + "theoretical_loss": 3.8259064886769547, + "tokens_seen": 621443072 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040993981945837513, + "loss": 2.8865, + "theoretical_loss": 3.825864649837883, + "tokens_seen": 621508608 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004099297893681043, + "loss": 2.8783, + "theoretical_loss": 3.825822816645491, + "tokens_seen": 621574144 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004099197592778335, + "loss": 3.0324, + "theoretical_loss": 3.8257809890984196, + "tokens_seen": 621639680 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004099097291875627, + "loss": 2.9964, + "theoretical_loss": 3.8257391671953127, + "tokens_seen": 621705216 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004098996990972919, + "loss": 2.9455, + "theoretical_loss": 3.825697350934814, + "tokens_seen": 621770752 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004098896690070211, + "loss": 2.9301, + "theoretical_loss": 3.8256555403155668, + "tokens_seen": 621836288 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004098796389167503, + "loss": 2.7822, + "theoretical_loss": 3.8256137353362174, + "tokens_seen": 621901824 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040986960882647945, + "loss": 2.723, + "theoretical_loss": 3.8255719359954083, + "tokens_seen": 621967360 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040985957873620864, + "loss": 2.733, + "theoretical_loss": 3.8255301422917873, + "tokens_seen": 622032896 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004098495486459378, + "loss": 2.935, + "theoretical_loss": 3.8254883542239995, + "tokens_seen": 622098432 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040983951855566705, + "loss": 2.8432, + "theoretical_loss": 3.8254465717906907, + "tokens_seen": 622163968 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004098294884653962, + "loss": 2.7378, + "theoretical_loss": 3.8254047949905083, + "tokens_seen": 622229504 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004098194583751254, + "loss": 2.7812, + "theoretical_loss": 3.8253630238221, + "tokens_seen": 622295040 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004098094282848546, + "loss": 2.9145, + "theoretical_loss": 3.825321258284113, + "tokens_seen": 622360576 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004097993981945838, + "loss": 3.0614, + "theoretical_loss": 3.825279498375196, + "tokens_seen": 622426112 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040978936810431296, + "loss": 2.9549, + "theoretical_loss": 3.8252377440939975, + "tokens_seen": 622491648 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040977933801404214, + "loss": 2.8137, + "theoretical_loss": 3.825195995439167, + "tokens_seen": 622557184 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 775020, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.917367696762085, + "objective/train/theoretical_loss": 3.825175123221218, + "objective/train/tokens_used": 643049952, + "theoretical_loss": 3.825175123221218, + "tokens_seen": 622589952 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004097693079237713, + "loss": 2.9835, + "theoretical_loss": 3.8251542524093542, + "tokens_seen": 622622720 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040975927783350055, + "loss": 2.8027, + "theoretical_loss": 3.825112515003209, + "tokens_seen": 622688256 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004097492477432297, + "loss": 3.0546, + "theoretical_loss": 3.825070783219382, + "tokens_seen": 622753792 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004097392176529589, + "loss": 2.7498, + "theoretical_loss": 3.825029057056524, + "tokens_seen": 622819328 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040972918756268804, + "loss": 2.7878, + "theoretical_loss": 3.8249873365132876, + "tokens_seen": 622884864 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004097191574724173, + "loss": 2.6325, + "theoretical_loss": 3.8249456215883235, + "tokens_seen": 622950400 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040970912738214646, + "loss": 2.837, + "theoretical_loss": 3.824903912280285, + "tokens_seen": 623015936 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040969909729187564, + "loss": 2.8094, + "theoretical_loss": 3.8248622085878257, + "tokens_seen": 623081472 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004096890672016048, + "loss": 3.0344, + "theoretical_loss": 3.824820510509597, + "tokens_seen": 623147008 + }, + { + "epoch": 2.01, + "learning_rate": 0.000409679037111334, + "loss": 3.0147, + "theoretical_loss": 3.824778818044255, + "tokens_seen": 623212544 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004096690070210632, + "loss": 2.8037, + "theoretical_loss": 3.824737131190452, + "tokens_seen": 623278080 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004096589769307924, + "loss": 2.8736, + "theoretical_loss": 3.8246954499468444, + "tokens_seen": 623343616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040964894684052155, + "loss": 2.7638, + "theoretical_loss": 3.824653774312087, + "tokens_seen": 623409152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004096389167502508, + "loss": 3.0273, + "theoretical_loss": 3.824612104284835, + "tokens_seen": 623474688 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040962888665997996, + "loss": 2.8466, + "theoretical_loss": 3.824570439863746, + "tokens_seen": 623540224 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040961885656970914, + "loss": 3.077, + "theoretical_loss": 3.8245287810474746, + "tokens_seen": 623605760 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004096088264794383, + "loss": 2.9727, + "theoretical_loss": 3.8244871278346793, + "tokens_seen": 623671296 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004095987963891675, + "loss": 2.7729, + "theoretical_loss": 3.8244454802240173, + "tokens_seen": 623736832 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004095887662988967, + "loss": 2.7011, + "theoretical_loss": 3.8244038382141463, + "tokens_seen": 623802368 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004095787362086259, + "loss": 3.0728, + "theoretical_loss": 3.824362201803726, + "tokens_seen": 623867904 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040956870611835505, + "loss": 2.7587, + "theoretical_loss": 3.8243205709914143, + "tokens_seen": 623933440 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004095586760280843, + "loss": 2.7799, + "theoretical_loss": 3.8242789457758706, + "tokens_seen": 623998976 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004095486459378134, + "loss": 2.6693, + "theoretical_loss": 3.824237326155756, + "tokens_seen": 624064512 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040953861584754265, + "loss": 2.8832, + "theoretical_loss": 3.824195712129729, + "tokens_seen": 624130048 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040952858575727183, + "loss": 2.7424, + "theoretical_loss": 3.824154103696452, + "tokens_seen": 624195584 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 776436, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9514684677124023, + "objective/train/theoretical_loss": 3.8241333015766763, + "objective/train/tokens_used": 644688352, + "theoretical_loss": 3.8241333015766763, + "tokens_seen": 624228352 + }, + { + "epoch": 2.01, + "learning_rate": 0.000409518555667001, + "loss": 2.7882, + "theoretical_loss": 3.8241125008545853, + "tokens_seen": 624261120 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004095085255767302, + "loss": 2.756, + "theoretical_loss": 3.8240709036027916, + "tokens_seen": 624326656 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040949849548645937, + "loss": 2.9293, + "theoretical_loss": 3.8240293119397317, + "tokens_seen": 624392192 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040948846539618855, + "loss": 2.7802, + "theoretical_loss": 3.823987725864069, + "tokens_seen": 624457728 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004094784353059178, + "loss": 2.8242, + "theoretical_loss": 3.8239461453744674, + "tokens_seen": 624523264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004094684052156469, + "loss": 2.8764, + "theoretical_loss": 3.8239045704695895, + "tokens_seen": 624588800 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040945837512537615, + "loss": 2.6924, + "theoretical_loss": 3.8238630011480996, + "tokens_seen": 624654336 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040944834503510533, + "loss": 3.0381, + "theoretical_loss": 3.823821437408662, + "tokens_seen": 624719872 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004094383149448345, + "loss": 2.821, + "theoretical_loss": 3.8237798792499422, + "tokens_seen": 624785408 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004094282848545637, + "loss": 2.6496, + "theoretical_loss": 3.823738326670605, + "tokens_seen": 624850944 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004094182547642929, + "loss": 2.7156, + "theoretical_loss": 3.8236967796693166, + "tokens_seen": 624916480 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040940822467402206, + "loss": 2.9485, + "theoretical_loss": 3.8236552382447435, + "tokens_seen": 624982016 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004093981945837513, + "loss": 2.8782, + "theoretical_loss": 3.8236137023955523, + "tokens_seen": 625047552 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004093881644934804, + "loss": 2.8737, + "theoretical_loss": 3.8235721721204103, + "tokens_seen": 625113088 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040937813440320965, + "loss": 2.7295, + "theoretical_loss": 3.823530647417985, + "tokens_seen": 625178624 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004093681043129388, + "loss": 2.9309, + "theoretical_loss": 3.823489128286945, + "tokens_seen": 625244160 + }, + { + "epoch": 2.01, + "learning_rate": 0.000409358074222668, + "loss": 2.9655, + "theoretical_loss": 3.8234476147259584, + "tokens_seen": 625309696 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004093480441323972, + "loss": 2.8918, + "theoretical_loss": 3.8234061067336946, + "tokens_seen": 625375232 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004093380140421264, + "loss": 2.7062, + "theoretical_loss": 3.823364604308823, + "tokens_seen": 625440768 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040932798395185556, + "loss": 2.7036, + "theoretical_loss": 3.823323107450014, + "tokens_seen": 625506304 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004093179538615848, + "loss": 2.9356, + "theoretical_loss": 3.8232816161559375, + "tokens_seen": 625571840 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004093079237713139, + "loss": 2.5131, + "theoretical_loss": 3.8232401304252646, + "tokens_seen": 625637376 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040929789368104316, + "loss": 2.9493, + "theoretical_loss": 3.823198650256667, + "tokens_seen": 625702912 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004092878635907723, + "loss": 3.0427, + "theoretical_loss": 3.8231571756488165, + "tokens_seen": 625768448 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004092778335005015, + "loss": 2.8405, + "theoretical_loss": 3.823115706600385, + "tokens_seen": 625833984 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 777260, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8199868202209473, + "objective/train/theoretical_loss": 3.8230949741605365, + "objective/train/tokens_used": 646326752, + "theoretical_loss": 3.8230949741605365, + "tokens_seen": 625866752 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004092678034102307, + "loss": 2.7164, + "theoretical_loss": 3.823074243110045, + "tokens_seen": 625899520 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004092577733199599, + "loss": 2.6414, + "theoretical_loss": 3.82303278517647, + "tokens_seen": 625965056 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040924774322968906, + "loss": 2.8903, + "theoretical_loss": 3.8229913327983343, + "tokens_seen": 626030592 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040923771313941824, + "loss": 2.7473, + "theoretical_loss": 3.822949885974311, + "tokens_seen": 626096128 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004092276830491474, + "loss": 2.7383, + "theoretical_loss": 3.8229084447030757, + "tokens_seen": 626161664 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040921765295887666, + "loss": 3.0065, + "theoretical_loss": 3.8228670089833026, + "tokens_seen": 626227200 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004092076228686058, + "loss": 2.6521, + "theoretical_loss": 3.822825578813667, + "tokens_seen": 626292736 + }, + { + "epoch": 2.01, + "learning_rate": 0.000409197592778335, + "loss": 2.8301, + "theoretical_loss": 3.8227841541928456, + "tokens_seen": 626358272 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040918756268806415, + "loss": 2.7502, + "theoretical_loss": 3.8227427351195145, + "tokens_seen": 626423808 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004091775325977934, + "loss": 2.9458, + "theoretical_loss": 3.82270132159235, + "tokens_seen": 626489344 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040916750250752257, + "loss": 2.6212, + "theoretical_loss": 3.82265991361003, + "tokens_seen": 626554880 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040915747241725175, + "loss": 2.9773, + "theoretical_loss": 3.8226185111712327, + "tokens_seen": 626620416 + }, + { + "epoch": 2.01, + "learning_rate": 0.000409147442326981, + "loss": 2.9269, + "theoretical_loss": 3.822577114274635, + "tokens_seen": 626685952 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040913741223671016, + "loss": 3.0878, + "theoretical_loss": 3.822535722918916, + "tokens_seen": 626751488 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040912738214643934, + "loss": 2.9192, + "theoretical_loss": 3.8224943371027558, + "tokens_seen": 626817024 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004091173520561685, + "loss": 2.4529, + "theoretical_loss": 3.822452956824833, + "tokens_seen": 626882560 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004091073219658977, + "loss": 2.703, + "theoretical_loss": 3.822411582083827, + "tokens_seen": 626948096 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004090972918756269, + "loss": 2.8391, + "theoretical_loss": 3.82237021287842, + "tokens_seen": 627013632 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004090872617853561, + "loss": 2.7474, + "theoretical_loss": 3.8223288492072918, + "tokens_seen": 627079168 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040907723169508525, + "loss": 2.6742, + "theoretical_loss": 3.8222874910691234, + "tokens_seen": 627144704 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004090672016048145, + "loss": 2.6279, + "theoretical_loss": 3.8222461384625976, + "tokens_seen": 627210240 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004090571715145436, + "loss": 2.8529, + "theoretical_loss": 3.822204791386396, + "tokens_seen": 627275776 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040904714142427285, + "loss": 2.8604, + "theoretical_loss": 3.822163449839202, + "tokens_seen": 627341312 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040903711133400203, + "loss": 2.97, + "theoretical_loss": 3.822122113819698, + "tokens_seen": 627406848 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004090270812437312, + "loss": 2.714, + "theoretical_loss": 3.8220807833265678, + "tokens_seen": 627472384 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 778767, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.658456563949585, + "objective/train/theoretical_loss": 3.8220601201519817, + "objective/train/tokens_used": 647965152, + "theoretical_loss": 3.8220601201519817, + "tokens_seen": 627505152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004090170511534604, + "loss": 2.9567, + "theoretical_loss": 3.8220394583584953, + "tokens_seen": 627537920 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040900702106318957, + "loss": 2.6408, + "theoretical_loss": 3.821998138914166, + "tokens_seen": 627603456 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040899699097291875, + "loss": 2.8496, + "theoretical_loss": 3.821956824992264, + "tokens_seen": 627668992 + }, + { + "epoch": 2.01, + "learning_rate": 0.000408986960882648, + "loss": 2.7743, + "theoretical_loss": 3.821915516591475, + "tokens_seen": 627734528 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004089769307923771, + "loss": 2.6965, + "theoretical_loss": 3.8218742137104846, + "tokens_seen": 627800064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040896690070210635, + "loss": 2.8615, + "theoretical_loss": 3.8218329163479794, + "tokens_seen": 627865600 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040895687061183553, + "loss": 2.5942, + "theoretical_loss": 3.821791624502646, + "tokens_seen": 627931136 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004089468405215647, + "loss": 2.8906, + "theoretical_loss": 3.821750338173172, + "tokens_seen": 627996672 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004089368104312939, + "loss": 2.9984, + "theoretical_loss": 3.8217090573582446, + "tokens_seen": 628062208 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004089267803410231, + "loss": 2.8825, + "theoretical_loss": 3.821667782056552, + "tokens_seen": 628127744 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040891675025075226, + "loss": 2.8492, + "theoretical_loss": 3.821626512266784, + "tokens_seen": 628193280 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004089067201604815, + "loss": 2.7001, + "theoretical_loss": 3.821585247987627, + "tokens_seen": 628258816 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004088966900702106, + "loss": 2.8747, + "theoretical_loss": 3.8215439892177727, + "tokens_seen": 628324352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040888665997993985, + "loss": 2.9203, + "theoretical_loss": 3.82150273595591, + "tokens_seen": 628389888 + }, + { + "epoch": 2.01, + "learning_rate": 0.000408876629889669, + "loss": 2.8152, + "theoretical_loss": 3.8214614882007303, + "tokens_seen": 628455424 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004088665997993982, + "loss": 2.818, + "theoretical_loss": 3.8214202459509234, + "tokens_seen": 628520960 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004088565697091274, + "loss": 2.9598, + "theoretical_loss": 3.8213790092051805, + "tokens_seen": 628586496 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004088465396188566, + "loss": 2.8876, + "theoretical_loss": 3.821337777962194, + "tokens_seen": 628652032 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040883650952858576, + "loss": 2.8699, + "theoretical_loss": 3.8212965522206552, + "tokens_seen": 628717568 + }, + { + "epoch": 2.01, + "learning_rate": 0.000408826479438315, + "loss": 2.866, + "theoretical_loss": 3.821255331979258, + "tokens_seen": 628783104 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004088164493480441, + "loss": 3.1374, + "theoretical_loss": 3.8212141172366945, + "tokens_seen": 628848640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040880641925777336, + "loss": 2.736, + "theoretical_loss": 3.8211729079916577, + "tokens_seen": 628914176 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004087963891675025, + "loss": 2.7657, + "theoretical_loss": 3.821131704242843, + "tokens_seen": 628979712 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004087863590772317, + "loss": 2.918, + "theoretical_loss": 3.821090505988944, + "tokens_seen": 629045248 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004087763289869609, + "loss": 2.7973, + "theoretical_loss": 3.821049313228655, + "tokens_seen": 629110784 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 779594, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.082977056503296, + "objective/train/theoretical_loss": 3.821028718908207, + "objective/train/tokens_used": 649603552, + "theoretical_loss": 3.821028718908207, + "tokens_seen": 629143552 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004087662988966901, + "loss": 2.8122, + "theoretical_loss": 3.821008125960672, + "tokens_seen": 629176320 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040875626880641926, + "loss": 2.943, + "theoretical_loss": 3.8209669441836907, + "tokens_seen": 629241856 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040874623871614844, + "loss": 2.8359, + "theoretical_loss": 3.8209257678964073, + "tokens_seen": 629307392 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004087362086258776, + "loss": 2.7174, + "theoretical_loss": 3.8208845970975176, + "tokens_seen": 629372928 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040872617853560686, + "loss": 2.9775, + "theoretical_loss": 3.82084343178572, + "tokens_seen": 629438464 + }, + { + "epoch": 2.01, + "learning_rate": 0.000408716148445336, + "loss": 2.5463, + "theoretical_loss": 3.8208022719597112, + "tokens_seen": 629504000 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004087061183550652, + "loss": 2.9226, + "theoretical_loss": 3.820761117618189, + "tokens_seen": 629569536 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040869608826479435, + "loss": 2.774, + "theoretical_loss": 3.8207199687598523, + "tokens_seen": 629635072 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004086860581745236, + "loss": 2.56, + "theoretical_loss": 3.8206788253834, + "tokens_seen": 629700608 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040867602808425277, + "loss": 2.6942, + "theoretical_loss": 3.8206376874875305, + "tokens_seen": 629766144 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040866599799398195, + "loss": 2.6446, + "theoretical_loss": 3.8205965550709444, + "tokens_seen": 629831680 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040865596790371113, + "loss": 2.6938, + "theoretical_loss": 3.8205554281323417, + "tokens_seen": 629897216 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040864593781344036, + "loss": 2.8075, + "theoretical_loss": 3.8205143066704226, + "tokens_seen": 629962752 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004086359077231695, + "loss": 2.742, + "theoretical_loss": 3.820473190683889, + "tokens_seen": 630028288 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004086258776328987, + "loss": 2.8813, + "theoretical_loss": 3.8204320801714413, + "tokens_seen": 630093824 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040861584754262785, + "loss": 2.9127, + "theoretical_loss": 3.820390975131782, + "tokens_seen": 630159360 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004086058174523571, + "loss": 2.9052, + "theoretical_loss": 3.8203498755636143, + "tokens_seen": 630224896 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040859578736208627, + "loss": 2.7453, + "theoretical_loss": 3.8203087814656396, + "tokens_seen": 630290432 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040858575727181545, + "loss": 2.8833, + "theoretical_loss": 3.820267692836562, + "tokens_seen": 630355968 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040857572718154463, + "loss": 2.9168, + "theoretical_loss": 3.8202266096750854, + "tokens_seen": 630421504 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004085656970912738, + "loss": 2.7973, + "theoretical_loss": 3.8201855319799134, + "tokens_seen": 630487040 + }, + { + "epoch": 2.01, + "learning_rate": 0.000408555667001003, + "loss": 2.6587, + "theoretical_loss": 3.8201444597497507, + "tokens_seen": 630552576 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040854563691073223, + "loss": 3.037, + "theoretical_loss": 3.8201033929833024, + "tokens_seen": 630618112 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040853560682046136, + "loss": 2.8116, + "theoretical_loss": 3.8200623316792743, + "tokens_seen": 630683648 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004085255767301906, + "loss": 2.8274, + "theoretical_loss": 3.820021275836372, + "tokens_seen": 630749184 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 780765, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5284388065338135, + "objective/train/theoretical_loss": 3.820000749962438, + "objective/train/tokens_used": 651241952, + "theoretical_loss": 3.820000749962438, + "tokens_seen": 630781952 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004085155466399197, + "loss": 2.7464, + "theoretical_loss": 3.8199802254533015, + "tokens_seen": 630814720 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040850551654964895, + "loss": 2.7889, + "theoretical_loss": 3.8199391805287704, + "tokens_seen": 630880256 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040849548645937814, + "loss": 2.7819, + "theoretical_loss": 3.8198981410614854, + "tokens_seen": 630945792 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004084854563691073, + "loss": 2.7129, + "theoretical_loss": 3.819857107050155, + "tokens_seen": 631011328 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004084754262788365, + "loss": 2.8668, + "theoretical_loss": 3.819816078493486, + "tokens_seen": 631076864 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040846539618856573, + "loss": 2.7094, + "theoretical_loss": 3.819775055390188, + "tokens_seen": 631142400 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040845536609829486, + "loss": 3.036, + "theoretical_loss": 3.8197340377389697, + "tokens_seen": 631207936 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004084453360080241, + "loss": 2.8579, + "theoretical_loss": 3.81969302553854, + "tokens_seen": 631273472 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004084353059177532, + "loss": 2.7545, + "theoretical_loss": 3.81965201878761, + "tokens_seen": 631339008 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040842527582748246, + "loss": 2.9343, + "theoretical_loss": 3.819611017484889, + "tokens_seen": 631404544 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040841524573721164, + "loss": 2.9302, + "theoretical_loss": 3.8195700216290884, + "tokens_seen": 631470080 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004084052156469408, + "loss": 2.6115, + "theoretical_loss": 3.8195290312189187, + "tokens_seen": 631535616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040839518555667005, + "loss": 2.9017, + "theoretical_loss": 3.8194880462530922, + "tokens_seen": 631601152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004083851554663992, + "loss": 2.8711, + "theoretical_loss": 3.819447066730321, + "tokens_seen": 631666688 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004083751253761284, + "loss": 2.7859, + "theoretical_loss": 3.8194060926493165, + "tokens_seen": 631732224 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004083650952858576, + "loss": 2.9598, + "theoretical_loss": 3.819365124008793, + "tokens_seen": 631797760 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004083550651955868, + "loss": 2.6389, + "theoretical_loss": 3.8193241608074637, + "tokens_seen": 631863296 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040834503510531596, + "loss": 2.9403, + "theoretical_loss": 3.8192832030440416, + "tokens_seen": 631928832 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004083350050150452, + "loss": 2.9505, + "theoretical_loss": 3.819242250717242, + "tokens_seen": 631994368 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004083249749247743, + "loss": 2.7696, + "theoretical_loss": 3.819201303825779, + "tokens_seen": 632059904 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040831494483450356, + "loss": 2.9617, + "theoretical_loss": 3.8191603623683674, + "tokens_seen": 632125440 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004083049147442327, + "loss": 2.7968, + "theoretical_loss": 3.819119426343724, + "tokens_seen": 632190976 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004082948846539619, + "loss": 2.6314, + "theoretical_loss": 3.8190784957505635, + "tokens_seen": 632256512 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004082848545636911, + "loss": 2.9328, + "theoretical_loss": 3.819037570587603, + "tokens_seen": 632322048 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004082748244734203, + "loss": 2.8324, + "theoretical_loss": 3.8189966508535598, + "tokens_seen": 632387584 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 781551, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.776937246322632, + "objective/train/theoretical_loss": 3.8189761930219808, + "objective/train/tokens_used": 652880352, + "theoretical_loss": 3.8189761930219808, + "tokens_seen": 632420352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040826479438314946, + "loss": 2.8414, + "theoretical_loss": 3.8189557365471503, + "tokens_seen": 632453120 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040825476429287864, + "loss": 3.0336, + "theoretical_loss": 3.8189148276670926, + "tokens_seen": 632518656 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004082447342026078, + "loss": 2.9431, + "theoretical_loss": 3.818873924212105, + "tokens_seen": 632584192 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040823470411233706, + "loss": 2.8801, + "theoretical_loss": 3.818833026180906, + "tokens_seen": 632649728 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004082246740220662, + "loss": 2.7998, + "theoretical_loss": 3.8187921335722157, + "tokens_seen": 632715264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004082146439317954, + "loss": 2.8741, + "theoretical_loss": 3.8187512463847515, + "tokens_seen": 632780800 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040820461384152455, + "loss": 2.6709, + "theoretical_loss": 3.8187103646172353, + "tokens_seen": 632846336 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004081945837512538, + "loss": 2.939, + "theoretical_loss": 3.8186694882683865, + "tokens_seen": 632911872 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040818455366098297, + "loss": 2.9783, + "theoretical_loss": 3.818628617336926, + "tokens_seen": 632977408 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040817452357071215, + "loss": 2.9095, + "theoretical_loss": 3.8185877518215756, + "tokens_seen": 633042944 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040816449348044133, + "loss": 2.9708, + "theoretical_loss": 3.818546891721056, + "tokens_seen": 633108480 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040815446339017056, + "loss": 2.9867, + "theoretical_loss": 3.81850603703409, + "tokens_seen": 633174016 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004081444332998997, + "loss": 2.8313, + "theoretical_loss": 3.8184651877594007, + "tokens_seen": 633239552 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004081344032096289, + "loss": 2.9659, + "theoretical_loss": 3.8184243438957095, + "tokens_seen": 633305088 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040812437311935805, + "loss": 2.8979, + "theoretical_loss": 3.8183835054417408, + "tokens_seen": 633370624 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004081143430290873, + "loss": 2.7882, + "theoretical_loss": 3.818342672396219, + "tokens_seen": 633436160 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040810431293881647, + "loss": 2.8737, + "theoretical_loss": 3.818301844757867, + "tokens_seen": 633501696 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040809428284854565, + "loss": 2.8172, + "theoretical_loss": 3.8182610225254106, + "tokens_seen": 633567232 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040808425275827483, + "loss": 2.8925, + "theoretical_loss": 3.8182202056975747, + "tokens_seen": 633632768 + }, + { + "epoch": 2.01, + "learning_rate": 0.000408074222668004, + "loss": 2.6774, + "theoretical_loss": 3.8181793942730846, + "tokens_seen": 633698304 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004080641925777332, + "loss": 2.7061, + "theoretical_loss": 3.8181385882506667, + "tokens_seen": 633763840 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040805416248746243, + "loss": 3.0322, + "theoretical_loss": 3.8180977876290467, + "tokens_seen": 633829376 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040804413239719156, + "loss": 2.8356, + "theoretical_loss": 3.8180569924069525, + "tokens_seen": 633894912 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004080341023069208, + "loss": 2.8216, + "theoretical_loss": 3.8180162025831104, + "tokens_seen": 633960448 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004080240722166499, + "loss": 2.7041, + "theoretical_loss": 3.817975418156249, + "tokens_seen": 634025984 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 783036, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2487502098083496, + "objective/train/theoretical_loss": 3.8179550279662884, + "objective/train/tokens_used": 654518752, + "theoretical_loss": 3.8179550279662884, + "tokens_seen": 634058752 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040801404212637915, + "loss": 2.9766, + "theoretical_loss": 3.817934639125096, + "tokens_seen": 634091520 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040800401203610834, + "loss": 2.7949, + "theoretical_loss": 3.8178938654883803, + "tokens_seen": 634157056 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004079939819458375, + "loss": 2.6729, + "theoretical_loss": 3.8178530972448304, + "tokens_seen": 634222592 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004079839518555667, + "loss": 2.7617, + "theoretical_loss": 3.8178123343931762, + "tokens_seen": 634288128 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040797392176529593, + "loss": 2.6544, + "theoretical_loss": 3.8177715769321474, + "tokens_seen": 634353664 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040796389167502506, + "loss": 2.9967, + "theoretical_loss": 3.817730824860474, + "tokens_seen": 634419200 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004079538615847543, + "loss": 2.7682, + "theoretical_loss": 3.817690078176888, + "tokens_seen": 634484736 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004079438314944834, + "loss": 2.7531, + "theoretical_loss": 3.817649336880119, + "tokens_seen": 634550272 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040793380140421266, + "loss": 2.7709, + "theoretical_loss": 3.8176086009689, + "tokens_seen": 634615808 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040792377131394184, + "loss": 2.7165, + "theoretical_loss": 3.817567870441962, + "tokens_seen": 634681344 + }, + { + "epoch": 2.01, + "learning_rate": 0.000407913741223671, + "loss": 2.8137, + "theoretical_loss": 3.8175271452980377, + "tokens_seen": 634746880 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004079037111334002, + "loss": 2.9933, + "theoretical_loss": 3.8174864255358605, + "tokens_seen": 634812416 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004078936810431294, + "loss": 2.8765, + "theoretical_loss": 3.8174457111541633, + "tokens_seen": 634877952 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040788365095285856, + "loss": 2.8943, + "theoretical_loss": 3.8174050021516797, + "tokens_seen": 634943488 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004078736208625878, + "loss": 2.979, + "theoretical_loss": 3.8173642985271443, + "tokens_seen": 635009024 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004078635907723169, + "loss": 2.7522, + "theoretical_loss": 3.817323600279291, + "tokens_seen": 635074560 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040785356068204616, + "loss": 2.7643, + "theoretical_loss": 3.817282907406857, + "tokens_seen": 635140096 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004078435305917753, + "loss": 2.9688, + "theoretical_loss": 3.8172422199085743, + "tokens_seen": 635205632 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004078335005015045, + "loss": 2.7879, + "theoretical_loss": 3.817201537783182, + "tokens_seen": 635271168 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004078234704112337, + "loss": 2.8632, + "theoretical_loss": 3.8171608610294143, + "tokens_seen": 635336704 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004078134403209629, + "loss": 2.8936, + "theoretical_loss": 3.8171201896460083, + "tokens_seen": 635402240 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040780341023069207, + "loss": 2.8438, + "theoretical_loss": 3.817079523631703, + "tokens_seen": 635467776 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004077933801404213, + "loss": 2.9047, + "theoretical_loss": 3.8170388629852336, + "tokens_seen": 635533312 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040778335005015043, + "loss": 2.8102, + "theoretical_loss": 3.8169982077053395, + "tokens_seen": 635598848 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040777331995987966, + "loss": 2.7813, + "theoretical_loss": 3.816957557790759, + "tokens_seen": 635664384 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 783857, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6842525005340576, + "objective/train/theoretical_loss": 3.8169372348450663, + "objective/train/tokens_used": 656157152, + "theoretical_loss": 3.8169372348450663, + "tokens_seen": 635697152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004077632898696088, + "loss": 2.8004, + "theoretical_loss": 3.81691691324023, + "tokens_seen": 635729920 + }, + { + "epoch": 2.01, + "learning_rate": 0.000407753259779338, + "loss": 2.8361, + "theoretical_loss": 3.8168762740524933, + "tokens_seen": 635795456 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004077432296890672, + "loss": 3.1555, + "theoretical_loss": 3.8168356402262877, + "tokens_seen": 635860992 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004077331995987964, + "loss": 2.8418, + "theoretical_loss": 3.816795011760354, + "tokens_seen": 635926528 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040772316950852557, + "loss": 2.7288, + "theoretical_loss": 3.816754388653432, + "tokens_seen": 635992064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040771313941825475, + "loss": 2.8514, + "theoretical_loss": 3.816713770904263, + "tokens_seen": 636057600 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040770310932798393, + "loss": 2.8342, + "theoretical_loss": 3.8166731585115894, + "tokens_seen": 636123136 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040769307923771317, + "loss": 2.6405, + "theoretical_loss": 3.816632551474152, + "tokens_seen": 636188672 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004076830491474423, + "loss": 2.8957, + "theoretical_loss": 3.816591949790693, + "tokens_seen": 636254208 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040767301905717153, + "loss": 2.6612, + "theoretical_loss": 3.816551353459955, + "tokens_seen": 636319744 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004076629889669007, + "loss": 3.0216, + "theoretical_loss": 3.816510762480683, + "tokens_seen": 636385280 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004076529588766299, + "loss": 2.8275, + "theoretical_loss": 3.8164701768516185, + "tokens_seen": 636450816 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004076429287863591, + "loss": 2.9587, + "theoretical_loss": 3.816429596571506, + "tokens_seen": 636516352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040763289869608825, + "loss": 2.7802, + "theoretical_loss": 3.81638902163909, + "tokens_seen": 636581888 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004076228686058175, + "loss": 2.867, + "theoretical_loss": 3.8163484520531155, + "tokens_seen": 636647424 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040761283851554667, + "loss": 2.5608, + "theoretical_loss": 3.8163078878123278, + "tokens_seen": 636712960 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040760280842527585, + "loss": 2.7673, + "theoretical_loss": 3.8162673289154725, + "tokens_seen": 636778496 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040759277833500503, + "loss": 2.8313, + "theoretical_loss": 3.8162267753612964, + "tokens_seen": 636844032 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004075827482447342, + "loss": 2.911, + "theoretical_loss": 3.8161862271485445, + "tokens_seen": 636909568 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004075727181544634, + "loss": 2.8002, + "theoretical_loss": 3.8161456842759645, + "tokens_seen": 636975104 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040756268806419263, + "loss": 2.8853, + "theoretical_loss": 3.816105146742304, + "tokens_seen": 637040640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040755265797392176, + "loss": 2.6506, + "theoretical_loss": 3.816064614546311, + "tokens_seen": 637106176 + }, + { + "epoch": 2.01, + "learning_rate": 0.000407542627883651, + "loss": 3.0013, + "theoretical_loss": 3.816024087686733, + "tokens_seen": 637171712 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004075325977933801, + "loss": 2.8045, + "theoretical_loss": 3.81598356616232, + "tokens_seen": 637237248 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040752256770310935, + "loss": 2.9861, + "theoretical_loss": 3.8159430499718194, + "tokens_seen": 637302784 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 785359, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.857738494873047, + "objective/train/theoretical_loss": 3.8159227938763953, + "objective/train/tokens_used": 657795552, + "theoretical_loss": 3.8159227938763953, + "tokens_seen": 637335552 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040751253761283854, + "loss": 2.8626, + "theoretical_loss": 3.815902539113981, + "tokens_seen": 637368320 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004075025075225677, + "loss": 2.8335, + "theoretical_loss": 3.815862033587556, + "tokens_seen": 637433856 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004074924774322969, + "loss": 2.9154, + "theoretical_loss": 3.8158215333912935, + "tokens_seen": 637499392 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040748244734202613, + "loss": 2.9701, + "theoretical_loss": 3.8157810385239443, + "tokens_seen": 637564928 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040747241725175526, + "loss": 2.7581, + "theoretical_loss": 3.81574054898426, + "tokens_seen": 637630464 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004074623871614845, + "loss": 3.1263, + "theoretical_loss": 3.8157000647709927, + "tokens_seen": 637696000 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004074523570712136, + "loss": 2.6475, + "theoretical_loss": 3.8156595858828934, + "tokens_seen": 637761536 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040744232698094286, + "loss": 2.9252, + "theoretical_loss": 3.815619112318715, + "tokens_seen": 637827072 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040743229689067204, + "loss": 2.8951, + "theoretical_loss": 3.8155786440772106, + "tokens_seen": 637892608 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004074222668004012, + "loss": 3.0481, + "theoretical_loss": 3.8155381811571325, + "tokens_seen": 637958144 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004074122367101304, + "loss": 2.9415, + "theoretical_loss": 3.8154977235572356, + "tokens_seen": 638023680 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004074022066198596, + "loss": 2.789, + "theoretical_loss": 3.8154572712762733, + "tokens_seen": 638089216 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040739217652958876, + "loss": 2.7131, + "theoretical_loss": 3.815416824313001, + "tokens_seen": 638154752 + }, + { + "epoch": 2.01, + "learning_rate": 0.000407382146439318, + "loss": 2.7745, + "theoretical_loss": 3.815376382666172, + "tokens_seen": 638220288 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004073721163490471, + "loss": 2.9315, + "theoretical_loss": 3.8153359463345433, + "tokens_seen": 638285824 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040736208625877636, + "loss": 2.6523, + "theoretical_loss": 3.8152955153168704, + "tokens_seen": 638351360 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004073520561685055, + "loss": 3.1046, + "theoretical_loss": 3.8152550896119086, + "tokens_seen": 638416896 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004073420260782347, + "loss": 2.9757, + "theoretical_loss": 3.815214669218416, + "tokens_seen": 638482432 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004073319959879639, + "loss": 3.0314, + "theoretical_loss": 3.8151742541351483, + "tokens_seen": 638547968 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004073219658976931, + "loss": 3.0917, + "theoretical_loss": 3.8151338443608633, + "tokens_seen": 638613504 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040731193580742227, + "loss": 2.6262, + "theoretical_loss": 3.8150934398943193, + "tokens_seen": 638679040 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004073019057171515, + "loss": 2.7144, + "theoretical_loss": 3.8150530407342744, + "tokens_seen": 638744576 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040729187562688063, + "loss": 2.8772, + "theoretical_loss": 3.815012646879487, + "tokens_seen": 638810112 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040728184553660986, + "loss": 2.7468, + "theoretical_loss": 3.814972258328717, + "tokens_seen": 638875648 + }, + { + "epoch": 2.01, + "learning_rate": 0.000407271815446339, + "loss": 2.6933, + "theoretical_loss": 3.8149318750807235, + "tokens_seen": 638941184 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 786054, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3852269649505615, + "objective/train/theoretical_loss": 3.814911685444881, + "objective/train/tokens_used": 659433952, + "theoretical_loss": 3.814911685444881, + "tokens_seen": 638973952 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004072617853560682, + "loss": 2.5543, + "theoretical_loss": 3.8148914971342665, + "tokens_seen": 639006720 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004072517552657974, + "loss": 2.8611, + "theoretical_loss": 3.814851124488106, + "tokens_seen": 639072256 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004072417251755266, + "loss": 2.774, + "theoretical_loss": 3.8148107571410037, + "tokens_seen": 639137792 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040723169508525577, + "loss": 2.7506, + "theoretical_loss": 3.8147703950917204, + "tokens_seen": 639203328 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040722166499498495, + "loss": 2.7369, + "theoretical_loss": 3.8147300383390172, + "tokens_seen": 639268864 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040721163490471413, + "loss": 2.4588, + "theoretical_loss": 3.814689686881657, + "tokens_seen": 639334400 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040720160481444337, + "loss": 2.7529, + "theoretical_loss": 3.814649340718402, + "tokens_seen": 639399936 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004071915747241725, + "loss": 2.7639, + "theoretical_loss": 3.814608999848015, + "tokens_seen": 639465472 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040718154463390173, + "loss": 2.9076, + "theoretical_loss": 3.814568664269259, + "tokens_seen": 639531008 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004071715145436309, + "loss": 2.7979, + "theoretical_loss": 3.8145283339808986, + "tokens_seen": 639596544 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004071614844533601, + "loss": 2.7578, + "theoretical_loss": 3.814488008981697, + "tokens_seen": 639662080 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040715145436308927, + "loss": 3.0341, + "theoretical_loss": 3.8144476892704198, + "tokens_seen": 639727616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040714142427281845, + "loss": 2.897, + "theoretical_loss": 3.8144073748458305, + "tokens_seen": 639793152 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040713139418254763, + "loss": 2.9447, + "theoretical_loss": 3.814367065706696, + "tokens_seen": 639858688 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040712136409227687, + "loss": 2.8507, + "theoretical_loss": 3.814326761851781, + "tokens_seen": 639924224 + }, + { + "epoch": 2.01, + "learning_rate": 0.000407111334002006, + "loss": 2.7963, + "theoretical_loss": 3.8142864632798523, + "tokens_seen": 639989760 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040710130391173523, + "loss": 2.8708, + "theoretical_loss": 3.8142461699896764, + "tokens_seen": 640055296 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040709127382146436, + "loss": 2.8233, + "theoretical_loss": 3.8142058819800204, + "tokens_seen": 640120832 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004070812437311936, + "loss": 3.0831, + "theoretical_loss": 3.8141655992496517, + "tokens_seen": 640186368 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004070712136409228, + "loss": 2.7594, + "theoretical_loss": 3.814125321797338, + "tokens_seen": 640251904 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040706118355065196, + "loss": 2.5152, + "theoretical_loss": 3.814085049621848, + "tokens_seen": 640317440 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040705115346038114, + "loss": 2.7952, + "theoretical_loss": 3.81404478272195, + "tokens_seen": 640382976 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004070411233701103, + "loss": 2.7167, + "theoretical_loss": 3.8140045210964133, + "tokens_seen": 640448512 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004070310932798395, + "loss": 2.8056, + "theoretical_loss": 3.8139642647440075, + "tokens_seen": 640514048 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040702106318956874, + "loss": 2.6948, + "theoretical_loss": 3.8139240136635024, + "tokens_seen": 640579584 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 786648, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7099239826202393, + "objective/train/theoretical_loss": 3.8139038900998283, + "objective/train/tokens_used": 661072352, + "theoretical_loss": 3.8139038900998283, + "tokens_seen": 640612352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040701103309929786, + "loss": 2.7378, + "theoretical_loss": 3.8138837678536683, + "tokens_seen": 640645120 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004070010030090271, + "loss": 2.8674, + "theoretical_loss": 3.813843527313276, + "tokens_seen": 640710656 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004069909729187563, + "loss": 2.786, + "theoretical_loss": 3.813803292041097, + "tokens_seen": 640776192 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040698094282848546, + "loss": 2.9161, + "theoretical_loss": 3.8137630620359024, + "tokens_seen": 640841728 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040697091273821464, + "loss": 2.8332, + "theoretical_loss": 3.8137228372964644, + "tokens_seen": 640907264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004069608826479438, + "loss": 2.754, + "theoretical_loss": 3.8136826178215557, + "tokens_seen": 640972800 + }, + { + "epoch": 2.01, + "learning_rate": 0.000406950852557673, + "loss": 2.7697, + "theoretical_loss": 3.8136424036099488, + "tokens_seen": 641038336 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040694082246740224, + "loss": 2.7057, + "theoretical_loss": 3.8136021946604166, + "tokens_seen": 641103872 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040693079237713137, + "loss": 2.8738, + "theoretical_loss": 3.813561990971734, + "tokens_seen": 641169408 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004069207622868606, + "loss": 2.8105, + "theoretical_loss": 3.8135217925426734, + "tokens_seen": 641234944 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040691073219658973, + "loss": 2.8888, + "theoretical_loss": 3.81348159937201, + "tokens_seen": 641300480 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040690070210631896, + "loss": 2.8901, + "theoretical_loss": 3.81344141145852, + "tokens_seen": 641366016 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004068906720160482, + "loss": 2.7657, + "theoretical_loss": 3.813401228800976, + "tokens_seen": 641431552 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004068806419257773, + "loss": 2.6503, + "theoretical_loss": 3.8133610513981555, + "tokens_seen": 641497088 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040687061183550656, + "loss": 2.725, + "theoretical_loss": 3.8133208792488347, + "tokens_seen": 641562624 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004068605817452357, + "loss": 2.6084, + "theoretical_loss": 3.8132807123517893, + "tokens_seen": 641628160 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004068505516549649, + "loss": 2.8921, + "theoretical_loss": 3.813240550705797, + "tokens_seen": 641693696 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004068405215646941, + "loss": 2.7129, + "theoretical_loss": 3.813200394309635, + "tokens_seen": 641759232 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004068304914744233, + "loss": 2.7996, + "theoretical_loss": 3.8131602431620797, + "tokens_seen": 641824768 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040682046138415247, + "loss": 2.9959, + "theoretical_loss": 3.8131200972619115, + "tokens_seen": 641890304 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004068104312938817, + "loss": 2.6639, + "theoretical_loss": 3.8130799566079077, + "tokens_seen": 641955840 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040680040120361083, + "loss": 2.751, + "theoretical_loss": 3.813039821198847, + "tokens_seen": 642021376 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040679037111334006, + "loss": 2.7787, + "theoretical_loss": 3.8129996910335096, + "tokens_seen": 642086912 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004067803410230692, + "loss": 2.8948, + "theoretical_loss": 3.8129595661106745, + "tokens_seen": 642152448 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004067703109327984, + "loss": 2.8215, + "theoretical_loss": 3.8129194464291225, + "tokens_seen": 642217984 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 788043, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.803264856338501, + "objective/train/theoretical_loss": 3.8128993885534466, + "objective/train/tokens_used": 662710752, + "theoretical_loss": 3.8128993885534466, + "tokens_seen": 642250752 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004067602808425276, + "loss": 2.762, + "theoretical_loss": 3.812879331987634, + "tokens_seen": 642283520 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004067502507522568, + "loss": 2.7921, + "theoretical_loss": 3.8128392227849903, + "tokens_seen": 642349056 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040674022066198597, + "loss": 2.869, + "theoretical_loss": 3.8127991188199726, + "tokens_seen": 642414592 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040673019057171515, + "loss": 2.7139, + "theoretical_loss": 3.812759020091362, + "tokens_seen": 642480128 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040672016048144433, + "loss": 2.9997, + "theoretical_loss": 3.812718926597942, + "tokens_seen": 642545664 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040671013039117357, + "loss": 2.9864, + "theoretical_loss": 3.8126788383384946, + "tokens_seen": 642611200 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004067001003009027, + "loss": 2.9218, + "theoretical_loss": 3.812638755311803, + "tokens_seen": 642676736 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040669007021063193, + "loss": 2.8599, + "theoretical_loss": 3.812598677516651, + "tokens_seen": 642742272 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004066800401203611, + "loss": 2.8809, + "theoretical_loss": 3.812558604951822, + "tokens_seen": 642807808 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004066700100300903, + "loss": 2.9374, + "theoretical_loss": 3.8125185376161, + "tokens_seen": 642873344 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040665997993981947, + "loss": 2.9225, + "theoretical_loss": 3.8124784755082706, + "tokens_seen": 642938880 + }, + { + "epoch": 2.01, + "learning_rate": 0.00040664994984954865, + "loss": 2.8897, + "theoretical_loss": 3.8124384186271176, + "tokens_seen": 643004416 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040663991975927783, + "loss": 2.67, + "theoretical_loss": 3.812398366971428, + "tokens_seen": 643069952 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040662988966900707, + "loss": 2.7487, + "theoretical_loss": 3.8123583205399862, + "tokens_seen": 643135488 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004066198595787362, + "loss": 2.8362, + "theoretical_loss": 3.8123182793315804, + "tokens_seen": 643201024 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040660982948846543, + "loss": 3.058, + "theoretical_loss": 3.8122782433449953, + "tokens_seen": 643266560 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040659979939819456, + "loss": 2.9504, + "theoretical_loss": 3.812238212579019, + "tokens_seen": 643332096 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004065897693079238, + "loss": 2.9792, + "theoretical_loss": 3.8121981870324393, + "tokens_seen": 643397632 + }, + { + "epoch": 2.02, + "learning_rate": 0.000406579739217653, + "loss": 2.7157, + "theoretical_loss": 3.8121581667040436, + "tokens_seen": 643463168 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040656970912738216, + "loss": 2.6707, + "theoretical_loss": 3.81211815159262, + "tokens_seen": 643528704 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040655967903711134, + "loss": 2.7079, + "theoretical_loss": 3.8120781416969587, + "tokens_seen": 643594240 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004065496489468405, + "loss": 2.9709, + "theoretical_loss": 3.812038137015847, + "tokens_seen": 643659776 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004065396188565697, + "loss": 2.7942, + "theoretical_loss": 3.811998137548075, + "tokens_seen": 643725312 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040652958876629894, + "loss": 3.1165, + "theoretical_loss": 3.8119581432924337, + "tokens_seen": 643790848 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040651955867602806, + "loss": 2.6032, + "theoretical_loss": 3.811918154247712, + "tokens_seen": 643856384 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 789663, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.804586172103882, + "objective/train/theoretical_loss": 3.8118981616790686, + "objective/train/tokens_used": 664349152, + "theoretical_loss": 3.8118981616790686, + "tokens_seen": 643889152 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004065095285857573, + "loss": 2.8536, + "theoretical_loss": 3.811878170412702, + "tokens_seen": 643921920 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004064994984954865, + "loss": 2.9554, + "theoretical_loss": 3.811838191786193, + "tokens_seen": 643987456 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040648946840521566, + "loss": 2.9039, + "theoretical_loss": 3.8117982183669787, + "tokens_seen": 644052992 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040647943831494484, + "loss": 2.6216, + "theoretical_loss": 3.81175825015385, + "tokens_seen": 644118528 + }, + { + "epoch": 2.02, + "learning_rate": 0.000406469408224674, + "loss": 2.8063, + "theoretical_loss": 3.8117182871455997, + "tokens_seen": 644184064 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004064593781344032, + "loss": 2.8755, + "theoretical_loss": 3.81167832934102, + "tokens_seen": 644249600 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040644934804413244, + "loss": 2.7333, + "theoretical_loss": 3.811638376738904, + "tokens_seen": 644315136 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040643931795386157, + "loss": 2.8455, + "theoretical_loss": 3.811598429338046, + "tokens_seen": 644380672 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004064292878635908, + "loss": 3.0777, + "theoretical_loss": 3.8115584871372397, + "tokens_seen": 644446208 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040641925777331993, + "loss": 2.8559, + "theoretical_loss": 3.811518550135279, + "tokens_seen": 644511744 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040640922768304916, + "loss": 3.0454, + "theoretical_loss": 3.811478618330959, + "tokens_seen": 644577280 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040639919759277834, + "loss": 2.5906, + "theoretical_loss": 3.8114386917230756, + "tokens_seen": 644642816 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004063891675025075, + "loss": 2.681, + "theoretical_loss": 3.8113987703104235, + "tokens_seen": 644708352 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004063891675025075, + "loss": 2.8988, + "theoretical_loss": 3.811358854091799, + "tokens_seen": 644773888 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004063791374122367, + "loss": 2.8432, + "theoretical_loss": 3.8113189430659986, + "tokens_seen": 644839424 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004063691073219659, + "loss": 2.8233, + "theoretical_loss": 3.811279037231819, + "tokens_seen": 644904960 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040635907723169507, + "loss": 2.7678, + "theoretical_loss": 3.8112391365880574, + "tokens_seen": 644970496 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004063490471414243, + "loss": 2.9558, + "theoretical_loss": 3.811199241133511, + "tokens_seen": 645036032 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040633901705115343, + "loss": 2.9838, + "theoretical_loss": 3.811159350866978, + "tokens_seen": 645101568 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040632898696088267, + "loss": 2.69, + "theoretical_loss": 3.8111194657872582, + "tokens_seen": 645167104 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040631895687061185, + "loss": 2.763, + "theoretical_loss": 3.8110795858931485, + "tokens_seen": 645232640 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040630892678034103, + "loss": 2.7282, + "theoretical_loss": 3.811039711183448, + "tokens_seen": 645298176 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004062988966900702, + "loss": 2.7764, + "theoretical_loss": 3.810999841656958, + "tokens_seen": 645363712 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004062888665997994, + "loss": 2.7889, + "theoretical_loss": 3.8109599773124776, + "tokens_seen": 645429248 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040627883650952857, + "loss": 2.6931, + "theoretical_loss": 3.810920118148807, + "tokens_seen": 645494784 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 790384, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9371323585510254, + "objective/train/theoretical_loss": 3.810900190509401, + "objective/train/tokens_used": 665987552, + "theoretical_loss": 3.810900190509401, + "tokens_seen": 645527552 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004062688064192578, + "loss": 2.9117, + "theoretical_loss": 3.810880264164747, + "tokens_seen": 645560320 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040625877632898693, + "loss": 2.9301, + "theoretical_loss": 3.8108404153590993, + "tokens_seen": 645625856 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040624874623871617, + "loss": 2.7162, + "theoretical_loss": 3.810800571730665, + "tokens_seen": 645691392 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004062387161484453, + "loss": 2.9238, + "theoretical_loss": 3.8107607332782463, + "tokens_seen": 645756928 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040622868605817453, + "loss": 3.0003, + "theoretical_loss": 3.8107209000006455, + "tokens_seen": 645822464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004062186559679037, + "loss": 3.068, + "theoretical_loss": 3.8106810718966657, + "tokens_seen": 645888000 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004062086258776329, + "loss": 3.066, + "theoretical_loss": 3.8106412489651103, + "tokens_seen": 645953536 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004061985957873621, + "loss": 2.8349, + "theoretical_loss": 3.810601431204782, + "tokens_seen": 646019072 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040618856569709126, + "loss": 2.8544, + "theoretical_loss": 3.810561618614485, + "tokens_seen": 646084608 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040617853560682044, + "loss": 2.753, + "theoretical_loss": 3.810521811193024, + "tokens_seen": 646150144 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040616850551654967, + "loss": 2.7691, + "theoretical_loss": 3.8104820089392035, + "tokens_seen": 646215680 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004061584754262788, + "loss": 2.8669, + "theoretical_loss": 3.8104422118518295, + "tokens_seen": 646281216 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040614844533600804, + "loss": 2.9435, + "theoretical_loss": 3.810402419929707, + "tokens_seen": 646346752 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040613841524573727, + "loss": 2.8705, + "theoretical_loss": 3.810362633171642, + "tokens_seen": 646412288 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004061283851554664, + "loss": 2.8442, + "theoretical_loss": 3.81032285157644, + "tokens_seen": 646477824 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040611835506519563, + "loss": 2.6825, + "theoretical_loss": 3.8102830751429093, + "tokens_seen": 646543360 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040610832497492476, + "loss": 2.8439, + "theoretical_loss": 3.810243303869856, + "tokens_seen": 646608896 + }, + { + "epoch": 2.02, + "learning_rate": 0.000406098294884654, + "loss": 2.5503, + "theoretical_loss": 3.810203537756088, + "tokens_seen": 646674432 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004060882647943832, + "loss": 2.5302, + "theoretical_loss": 3.8101637768004144, + "tokens_seen": 646739968 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040607823470411236, + "loss": 2.8447, + "theoretical_loss": 3.8101240210016414, + "tokens_seen": 646805504 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040606820461384154, + "loss": 2.9075, + "theoretical_loss": 3.810084270358579, + "tokens_seen": 646871040 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004060581745235707, + "loss": 2.8127, + "theoretical_loss": 3.810044524870036, + "tokens_seen": 646936576 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004060481444332999, + "loss": 2.7271, + "theoretical_loss": 3.810004784534823, + "tokens_seen": 647002112 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040603811434302914, + "loss": 2.7557, + "theoretical_loss": 3.8099650493517476, + "tokens_seen": 647067648 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040602808425275826, + "loss": 2.7964, + "theoretical_loss": 3.809925319319623, + "tokens_seen": 647133184 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 791083, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1197621822357178, + "objective/train/theoretical_loss": 3.8099054562347945, + "objective/train/tokens_used": 667625952, + "theoretical_loss": 3.8099054562347945, + "tokens_seen": 647165952 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004060180541624875, + "loss": 2.8243, + "theoretical_loss": 3.809885594437258, + "tokens_seen": 647198720 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004060080240722167, + "loss": 2.7903, + "theoretical_loss": 3.8098458747034636, + "tokens_seen": 647264256 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040599799398194586, + "loss": 2.6526, + "theoretical_loss": 3.809806160117053, + "tokens_seen": 647329792 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040598796389167504, + "loss": 2.8791, + "theoretical_loss": 3.8097664506768365, + "tokens_seen": 647395328 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004059779338014042, + "loss": 2.6126, + "theoretical_loss": 3.809726746381627, + "tokens_seen": 647460864 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004059679037111334, + "loss": 2.7736, + "theoretical_loss": 3.8096870472302378, + "tokens_seen": 647526400 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040595787362086264, + "loss": 2.7677, + "theoretical_loss": 3.8096473532214805, + "tokens_seen": 647591936 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040594784353059177, + "loss": 2.7009, + "theoretical_loss": 3.8096076643541696, + "tokens_seen": 647657472 + }, + { + "epoch": 2.02, + "learning_rate": 0.000405937813440321, + "loss": 2.745, + "theoretical_loss": 3.8095679806271194, + "tokens_seen": 647723008 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040592778335005013, + "loss": 2.7262, + "theoretical_loss": 3.8095283020391433, + "tokens_seen": 647788544 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040591775325977936, + "loss": 2.8404, + "theoretical_loss": 3.8094886285890563, + "tokens_seen": 647854080 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040590772316950854, + "loss": 2.7365, + "theoretical_loss": 3.8094489602756734, + "tokens_seen": 647919616 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004058976930792377, + "loss": 2.9187, + "theoretical_loss": 3.809409297097811, + "tokens_seen": 647985152 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004058876629889669, + "loss": 2.6985, + "theoretical_loss": 3.809369639054283, + "tokens_seen": 648050688 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004058776328986961, + "loss": 2.5365, + "theoretical_loss": 3.809329986143907, + "tokens_seen": 648116224 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040586760280842527, + "loss": 2.8878, + "theoretical_loss": 3.8092903383654995, + "tokens_seen": 648181760 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004058575727181545, + "loss": 2.6767, + "theoretical_loss": 3.8092506957178776, + "tokens_seen": 648247296 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040584754262788363, + "loss": 2.8827, + "theoretical_loss": 3.809211058199858, + "tokens_seen": 648312832 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040583751253761287, + "loss": 2.7196, + "theoretical_loss": 3.80917142581026, + "tokens_seen": 648378368 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040582748244734205, + "loss": 3.0595, + "theoretical_loss": 3.8091317985478996, + "tokens_seen": 648443904 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040581745235707123, + "loss": 2.8787, + "theoretical_loss": 3.8090921764115975, + "tokens_seen": 648509440 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004058074222668004, + "loss": 2.9666, + "theoretical_loss": 3.809052559400172, + "tokens_seen": 648574976 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004057973921765296, + "loss": 2.8784, + "theoretical_loss": 3.809012947512442, + "tokens_seen": 648640512 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040578736208625877, + "loss": 2.7818, + "theoretical_loss": 3.8089733407472273, + "tokens_seen": 648706048 + }, + { + "epoch": 2.02, + "learning_rate": 0.000405777331995988, + "loss": 3.0789, + "theoretical_loss": 3.8089337391033484, + "tokens_seen": 648771584 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 792264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.787963628768921, + "objective/train/theoretical_loss": 3.8089139402015415, + "objective/train/tokens_used": 669264352, + "theoretical_loss": 3.8089139402015415, + "tokens_seen": 648804352 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040576730190571713, + "loss": 2.8683, + "theoretical_loss": 3.808894142579626, + "tokens_seen": 648837120 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040575727181544637, + "loss": 2.8342, + "theoretical_loss": 3.808854551174881, + "tokens_seen": 648902656 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004057472417251755, + "loss": 2.7637, + "theoretical_loss": 3.8088149648879344, + "tokens_seen": 648968192 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040573721163490473, + "loss": 2.8181, + "theoretical_loss": 3.8087753837176077, + "tokens_seen": 649033728 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004057271815446339, + "loss": 2.8324, + "theoretical_loss": 3.8087358076627242, + "tokens_seen": 649099264 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004057171514543631, + "loss": 2.9637, + "theoretical_loss": 3.8086962367221053, + "tokens_seen": 649164800 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004057071213640923, + "loss": 2.9783, + "theoretical_loss": 3.808656670894574, + "tokens_seen": 649230336 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040569709127382146, + "loss": 3.0008, + "theoretical_loss": 3.8086171101789543, + "tokens_seen": 649295872 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040568706118355064, + "loss": 2.7163, + "theoretical_loss": 3.808577554574069, + "tokens_seen": 649361408 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004056770310932799, + "loss": 2.6197, + "theoretical_loss": 3.808538004078743, + "tokens_seen": 649426944 + }, + { + "epoch": 2.02, + "learning_rate": 0.000405667001003009, + "loss": 2.8005, + "theoretical_loss": 3.8084984586918, + "tokens_seen": 649492480 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040565697091273824, + "loss": 2.9181, + "theoretical_loss": 3.808458918412065, + "tokens_seen": 649558016 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004056469408224674, + "loss": 2.7293, + "theoretical_loss": 3.8084193832383635, + "tokens_seen": 649623552 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004056369107321966, + "loss": 2.7755, + "theoretical_loss": 3.8083798531695208, + "tokens_seen": 649689088 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004056268806419258, + "loss": 2.9192, + "theoretical_loss": 3.8083403282043635, + "tokens_seen": 649754624 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040561685055165496, + "loss": 2.5435, + "theoretical_loss": 3.808300808341717, + "tokens_seen": 649820160 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040560682046138414, + "loss": 2.7715, + "theoretical_loss": 3.8082612935804097, + "tokens_seen": 649885696 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004055967903711134, + "loss": 2.7429, + "theoretical_loss": 3.808221783919267, + "tokens_seen": 649951232 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004055867602808425, + "loss": 2.8494, + "theoretical_loss": 3.8081822793571174, + "tokens_seen": 650016768 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040557673019057174, + "loss": 2.5832, + "theoretical_loss": 3.8081427798927887, + "tokens_seen": 650082304 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040556670010030087, + "loss": 2.797, + "theoretical_loss": 3.808103285525109, + "tokens_seen": 650147840 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004055566700100301, + "loss": 2.9601, + "theoretical_loss": 3.8080637962529074, + "tokens_seen": 650213376 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004055466399197593, + "loss": 3.0431, + "theoretical_loss": 3.808024312075013, + "tokens_seen": 650278912 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040553660982948846, + "loss": 2.8572, + "theoretical_loss": 3.807984832990255, + "tokens_seen": 650344448 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040552657973921764, + "loss": 2.8196, + "theoretical_loss": 3.807945358997463, + "tokens_seen": 650409984 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 793032, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8657760620117188, + "objective/train/theoretical_loss": 3.8079256239101893, + "objective/train/tokens_used": 670902752, + "theoretical_loss": 3.8079256239101893, + "tokens_seen": 650442752 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004055165496489469, + "loss": 2.8388, + "theoretical_loss": 3.8079058900954683, + "tokens_seen": 650475520 + }, + { + "epoch": 2.02, + "learning_rate": 0.000405506519558676, + "loss": 2.9388, + "theoretical_loss": 3.807866426283101, + "tokens_seen": 650541056 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040549648946840524, + "loss": 2.7149, + "theoretical_loss": 3.8078269675591914, + "tokens_seen": 650606592 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040548645937813437, + "loss": 2.6655, + "theoretical_loss": 3.8077875139225723, + "tokens_seen": 650672128 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004054764292878636, + "loss": 2.708, + "theoretical_loss": 3.807748065372075, + "tokens_seen": 650737664 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004054663991975928, + "loss": 3.0241, + "theoretical_loss": 3.807708621906531, + "tokens_seen": 650803200 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040545636910732197, + "loss": 2.9069, + "theoretical_loss": 3.807669183524774, + "tokens_seen": 650868736 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040544633901705115, + "loss": 2.7804, + "theoretical_loss": 3.8076297502256358, + "tokens_seen": 650934272 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040543630892678033, + "loss": 3.0921, + "theoretical_loss": 3.8075903220079503, + "tokens_seen": 650999808 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004054262788365095, + "loss": 2.917, + "theoretical_loss": 3.807550898870552, + "tokens_seen": 651065344 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040541624874623874, + "loss": 2.6225, + "theoretical_loss": 3.8075114808122734, + "tokens_seen": 651130880 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040540621865596787, + "loss": 2.7534, + "theoretical_loss": 3.807472067831951, + "tokens_seen": 651196416 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004053961885656971, + "loss": 2.6766, + "theoretical_loss": 3.807432659928418, + "tokens_seen": 651261952 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004053861584754263, + "loss": 2.7737, + "theoretical_loss": 3.8073932571005105, + "tokens_seen": 651327488 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040537612838515547, + "loss": 2.8894, + "theoretical_loss": 3.807353859347064, + "tokens_seen": 651393024 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004053660982948847, + "loss": 2.7208, + "theoretical_loss": 3.8073144666669148, + "tokens_seen": 651458560 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040535606820461383, + "loss": 2.686, + "theoretical_loss": 3.8072750790588987, + "tokens_seen": 651524096 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040534603811434307, + "loss": 2.7685, + "theoretical_loss": 3.807235696521853, + "tokens_seen": 651589632 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040533600802407225, + "loss": 2.694, + "theoretical_loss": 3.807196319054615, + "tokens_seen": 651655168 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040532597793380143, + "loss": 2.8357, + "theoretical_loss": 3.807156946656022, + "tokens_seen": 651720704 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004053159478435306, + "loss": 2.7878, + "theoretical_loss": 3.8071175793249123, + "tokens_seen": 651786240 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004053059177532598, + "loss": 2.9112, + "theoretical_loss": 3.8070782170601234, + "tokens_seen": 651851776 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040529588766298897, + "loss": 2.8191, + "theoretical_loss": 3.807038859860495, + "tokens_seen": 651917312 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004052858575727182, + "loss": 2.8678, + "theoretical_loss": 3.806999507724866, + "tokens_seen": 651982848 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040527582748244733, + "loss": 2.8658, + "theoretical_loss": 3.8069601606520758, + "tokens_seen": 652048384 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 794457, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.615185499191284, + "objective/train/theoretical_loss": 3.8069404890138827, + "objective/train/tokens_used": 672541152, + "theoretical_loss": 3.8069404890138827, + "tokens_seen": 652081152 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040526579739217657, + "loss": 2.8081, + "theoretical_loss": 3.8069208186409647, + "tokens_seen": 652113920 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004052557673019057, + "loss": 3.0599, + "theoretical_loss": 3.8068814816903718, + "tokens_seen": 652179456 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040524573721163493, + "loss": 2.475, + "theoretical_loss": 3.806842149799139, + "tokens_seen": 652244992 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004052357071213641, + "loss": 2.8371, + "theoretical_loss": 3.806802822966106, + "tokens_seen": 652310528 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004052256770310933, + "loss": 2.9591, + "theoretical_loss": 3.806763501190116, + "tokens_seen": 652376064 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004052156469408225, + "loss": 3.0144, + "theoretical_loss": 3.8067241844700095, + "tokens_seen": 652441600 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040520561685055166, + "loss": 2.9075, + "theoretical_loss": 3.8066848728046287, + "tokens_seen": 652507136 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040519558676028084, + "loss": 2.8039, + "theoretical_loss": 3.8066455661928167, + "tokens_seen": 652572672 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004051855566700101, + "loss": 2.9402, + "theoretical_loss": 3.806606264633416, + "tokens_seen": 652638208 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004051755265797392, + "loss": 2.6916, + "theoretical_loss": 3.8065669681252707, + "tokens_seen": 652703744 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040516549648946844, + "loss": 2.9478, + "theoretical_loss": 3.8065276766672236, + "tokens_seen": 652769280 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004051554663991976, + "loss": 2.8893, + "theoretical_loss": 3.806488390258119, + "tokens_seen": 652834816 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004051454363089268, + "loss": 2.8495, + "theoretical_loss": 3.8064491088968015, + "tokens_seen": 652900352 + }, + { + "epoch": 2.02, + "learning_rate": 0.000405135406218656, + "loss": 2.9459, + "theoretical_loss": 3.8064098325821156, + "tokens_seen": 652965888 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040512537612838516, + "loss": 2.7826, + "theoretical_loss": 3.8063705613129075, + "tokens_seen": 653031424 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040511534603811434, + "loss": 2.928, + "theoretical_loss": 3.8063312950880217, + "tokens_seen": 653096960 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004051053159478436, + "loss": 2.9083, + "theoretical_loss": 3.806292033906305, + "tokens_seen": 653162496 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004050952858575727, + "loss": 2.9771, + "theoretical_loss": 3.806252777766603, + "tokens_seen": 653228032 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040508525576730194, + "loss": 2.8971, + "theoretical_loss": 3.806213526667763, + "tokens_seen": 653293568 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040507522567703107, + "loss": 2.8588, + "theoretical_loss": 3.806174280608632, + "tokens_seen": 653359104 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004050651955867603, + "loss": 2.9839, + "theoretical_loss": 3.8061350395880567, + "tokens_seen": 653424640 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004050551654964895, + "loss": 2.6543, + "theoretical_loss": 3.8060958036048866, + "tokens_seen": 653490176 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040504513540621866, + "loss": 2.8108, + "theoretical_loss": 3.806056572657969, + "tokens_seen": 653555712 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040503510531594784, + "loss": 2.7504, + "theoretical_loss": 3.8060173467461524, + "tokens_seen": 653621248 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004050250752256771, + "loss": 2.9447, + "theoretical_loss": 3.805978125868286, + "tokens_seen": 653686784 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 795253, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0313971042633057, + "objective/train/theoretical_loss": 3.805958517316725, + "objective/train/tokens_used": 674179552, + "theoretical_loss": 3.805958517316725, + "tokens_seen": 653719552 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004050150451354062, + "loss": 2.9876, + "theoretical_loss": 3.8059389100232197, + "tokens_seen": 653752320 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040500501504513544, + "loss": 2.7331, + "theoretical_loss": 3.805899699209802, + "tokens_seen": 653817856 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040499498495486457, + "loss": 2.6985, + "theoretical_loss": 3.8058604934268843, + "tokens_seen": 653883392 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004049849548645938, + "loss": 2.6745, + "theoretical_loss": 3.8058212926733175, + "tokens_seen": 653948928 + }, + { + "epoch": 2.02, + "learning_rate": 0.000404974924774323, + "loss": 2.5625, + "theoretical_loss": 3.8057820969479508, + "tokens_seen": 654014464 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040496489468405217, + "loss": 2.6322, + "theoretical_loss": 3.805742906249636, + "tokens_seen": 654080000 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040495486459378135, + "loss": 2.7656, + "theoretical_loss": 3.8057037205772257, + "tokens_seen": 654145536 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040494483450351053, + "loss": 2.8471, + "theoretical_loss": 3.8056645399295714, + "tokens_seen": 654211072 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004049348044132397, + "loss": 2.807, + "theoretical_loss": 3.805625364305526, + "tokens_seen": 654276608 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040492477432296894, + "loss": 2.897, + "theoretical_loss": 3.805586193703941, + "tokens_seen": 654342144 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040491474423269807, + "loss": 2.5623, + "theoretical_loss": 3.8055470281236707, + "tokens_seen": 654407680 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004049047141424273, + "loss": 2.6036, + "theoretical_loss": 3.805507867563568, + "tokens_seen": 654473216 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040489468405215643, + "loss": 2.7559, + "theoretical_loss": 3.805468712022488, + "tokens_seen": 654538752 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040488465396188567, + "loss": 2.7901, + "theoretical_loss": 3.8054295614992832, + "tokens_seen": 654604288 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040487462387161485, + "loss": 2.9805, + "theoretical_loss": 3.80539041599281, + "tokens_seen": 654669824 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040486459378134403, + "loss": 2.7287, + "theoretical_loss": 3.805351275501922, + "tokens_seen": 654735360 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004048545636910732, + "loss": 2.7279, + "theoretical_loss": 3.8053121400254755, + "tokens_seen": 654800896 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040484453360080245, + "loss": 3.0278, + "theoretical_loss": 3.8052730095623266, + "tokens_seen": 654866432 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004048345035105316, + "loss": 2.7637, + "theoretical_loss": 3.8052338841113302, + "tokens_seen": 654931968 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004048244734202608, + "loss": 2.7719, + "theoretical_loss": 3.805194763671344, + "tokens_seen": 654997504 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040481444332998994, + "loss": 2.857, + "theoretical_loss": 3.8051556482412248, + "tokens_seen": 655063040 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040480441323971917, + "loss": 2.822, + "theoretical_loss": 3.8051165378198295, + "tokens_seen": 655128576 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040479438314944835, + "loss": 2.7325, + "theoretical_loss": 3.8050774324060157, + "tokens_seen": 655194112 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040478435305917753, + "loss": 3.0475, + "theoretical_loss": 3.8050383319986425, + "tokens_seen": 655259648 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004047743229689067, + "loss": 2.7759, + "theoretical_loss": 3.804999236596567, + "tokens_seen": 655325184 + }, + { + "debugging/Self-BLEU-5": 0.4859850452541583, + "debugging/distinct-1-grams": 0.7709278301586879, + "debugging/distinct-2-grams": 0.9620726312090557, + "debugging/entropy-1-grams": 5.7746798218568784, + "debugging/entropy-2-grams": 6.637753508493868, + "debugging/length": 600.7777777777778, + "debugging/num_segments": 9, + "debugging/score": 0.006362688036581238, + "debugging/score_std": 0.004646295055099869, + "epoch": 2.02, + "objective/train/docs_used": 796377, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9904468059539795, + "objective/train/theoretical_loss": 3.8049796907721594, + "objective/train/tokens_used": 675817952, + "theoretical_loss": 3.8049796907721594, + "tokens_seen": 655357952 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004047642928786359, + "loss": 2.8813, + "theoretical_loss": 3.8049601461986486, + "tokens_seen": 655390720 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004047542627883651, + "loss": 2.9686, + "theoretical_loss": 3.8049210608037463, + "tokens_seen": 655456256 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004047442326980943, + "loss": 2.8707, + "theoretical_loss": 3.8048819804107197, + "tokens_seen": 655521792 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040473420260782344, + "loss": 2.6262, + "theoretical_loss": 3.804842905018429, + "tokens_seen": 655587328 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004047241725175527, + "loss": 2.9797, + "theoretical_loss": 3.8048038346257345, + "tokens_seen": 655652864 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004047141424272818, + "loss": 2.845, + "theoretical_loss": 3.8047647692314968, + "tokens_seen": 655718400 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040470411233701104, + "loss": 2.7958, + "theoretical_loss": 3.8047257088345763, + "tokens_seen": 655783936 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004046940822467402, + "loss": 2.9501, + "theoretical_loss": 3.8046866534338353, + "tokens_seen": 655849472 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004046840521564694, + "loss": 2.7035, + "theoretical_loss": 3.804647603028135, + "tokens_seen": 655915008 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004046740220661986, + "loss": 2.6593, + "theoretical_loss": 3.804608557616338, + "tokens_seen": 655980544 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004046639919759278, + "loss": 2.9606, + "theoretical_loss": 3.8045695171973066, + "tokens_seen": 656046080 + }, + { + "epoch": 2.02, + "learning_rate": 0.000404653961885657, + "loss": 2.9506, + "theoretical_loss": 3.8045304817699037, + "tokens_seen": 656111616 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004046439317953862, + "loss": 2.6727, + "theoretical_loss": 3.804491451332993, + "tokens_seen": 656177152 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040463390170511536, + "loss": 2.5689, + "theoretical_loss": 3.8044524258854375, + "tokens_seen": 656242688 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040462387161484454, + "loss": 2.7187, + "theoretical_loss": 3.8044134054261014, + "tokens_seen": 656308224 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004046138415245738, + "loss": 2.7893, + "theoretical_loss": 3.80437438995385, + "tokens_seen": 656373760 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004046038114343029, + "loss": 2.8434, + "theoretical_loss": 3.8043353794675463, + "tokens_seen": 656439296 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040459378134403214, + "loss": 2.9646, + "theoretical_loss": 3.804296373966057, + "tokens_seen": 656504832 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040458375125376127, + "loss": 2.9982, + "theoretical_loss": 3.8042573734482463, + "tokens_seen": 656570368 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004045737211634905, + "loss": 2.9142, + "theoretical_loss": 3.8042183779129823, + "tokens_seen": 656635904 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004045636910732197, + "loss": 2.8449, + "theoretical_loss": 3.8041793873591283, + "tokens_seen": 656701440 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040455366098294886, + "loss": 2.8761, + "theoretical_loss": 3.8041404017855536, + "tokens_seen": 656766976 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040454363089267804, + "loss": 2.7735, + "theoretical_loss": 3.804101421191124, + "tokens_seen": 656832512 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004045336008024073, + "loss": 2.761, + "theoretical_loss": 3.804062445574706, + "tokens_seen": 656898048 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004045235707121364, + "loss": 2.6464, + "theoretical_loss": 3.8040234749351693, + "tokens_seen": 656963584 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 797121, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.958272695541382, + "objective/train/theoretical_loss": 3.804003991481377, + "objective/train/tokens_used": 677456352, + "theoretical_loss": 3.804003991481377, + "tokens_seen": 656996352 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040451354062186564, + "loss": 2.8226, + "theoretical_loss": 3.8039845092713804, + "tokens_seen": 657029120 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040450351053159477, + "loss": 2.9393, + "theoretical_loss": 3.8039455485822087, + "tokens_seen": 657094656 + }, + { + "epoch": 2.02, + "learning_rate": 0.000404493480441324, + "loss": 2.8633, + "theoretical_loss": 3.8039065928665226, + "tokens_seen": 657160192 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004044834503510532, + "loss": 2.9581, + "theoretical_loss": 3.8038676421231914, + "tokens_seen": 657225728 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040447342026078237, + "loss": 2.7917, + "theoretical_loss": 3.8038286963510846, + "tokens_seen": 657291264 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040446339017051155, + "loss": 2.6505, + "theoretical_loss": 3.8037897555490723, + "tokens_seen": 657356800 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040445336008024073, + "loss": 3.0437, + "theoretical_loss": 3.803750819716025, + "tokens_seen": 657422336 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004044433299899699, + "loss": 2.9178, + "theoretical_loss": 3.8037118888508132, + "tokens_seen": 657487872 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040443329989969915, + "loss": 2.7833, + "theoretical_loss": 3.803672962952308, + "tokens_seen": 657553408 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040442326980942827, + "loss": 2.8205, + "theoretical_loss": 3.803634042019381, + "tokens_seen": 657618944 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004044132397191575, + "loss": 2.6532, + "theoretical_loss": 3.8035951260509036, + "tokens_seen": 657684480 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040440320962888663, + "loss": 2.9728, + "theoretical_loss": 3.8035562150457487, + "tokens_seen": 657750016 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040439317953861587, + "loss": 2.8446, + "theoretical_loss": 3.803517309002788, + "tokens_seen": 657815552 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040438314944834505, + "loss": 2.6177, + "theoretical_loss": 3.8034784079208945, + "tokens_seen": 657881088 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040437311935807423, + "loss": 2.8787, + "theoretical_loss": 3.803439511798943, + "tokens_seen": 657946624 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004043630892678034, + "loss": 3.0163, + "theoretical_loss": 3.8034006206358044, + "tokens_seen": 658012160 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040435305917753265, + "loss": 2.7094, + "theoretical_loss": 3.803361734430355, + "tokens_seen": 658077696 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004043430290872618, + "loss": 2.9232, + "theoretical_loss": 3.8033228531814682, + "tokens_seen": 658143232 + }, + { + "epoch": 2.02, + "learning_rate": 0.000404332998996991, + "loss": 2.9224, + "theoretical_loss": 3.803283976888019, + "tokens_seen": 658208768 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040432296890672014, + "loss": 2.6132, + "theoretical_loss": 3.803245105548883, + "tokens_seen": 658274304 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040431293881644937, + "loss": 2.7078, + "theoretical_loss": 3.803206239162935, + "tokens_seen": 658339840 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040430290872617855, + "loss": 2.6425, + "theoretical_loss": 3.803167377729051, + "tokens_seen": 658405376 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040429287863590773, + "loss": 2.7553, + "theoretical_loss": 3.803128521246107, + "tokens_seen": 658470912 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004042828485456369, + "loss": 2.9433, + "theoretical_loss": 3.80308966971298, + "tokens_seen": 658536448 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004042728184553661, + "loss": 2.9494, + "theoretical_loss": 3.8030508231285465, + "tokens_seen": 658601984 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 798701, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.928380250930786, + "objective/train/theoretical_loss": 3.8030314016917397, + "objective/train/tokens_used": 679094752, + "theoretical_loss": 3.8030314016917397, + "tokens_seen": 658634752 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004042627883650953, + "loss": 3.0227, + "theoretical_loss": 3.803011981491685, + "tokens_seen": 658667520 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004042527582748245, + "loss": 2.8351, + "theoretical_loss": 3.802973144801272, + "tokens_seen": 658733056 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040424272818455364, + "loss": 2.837, + "theoretical_loss": 3.802934313056186, + "tokens_seen": 658798592 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004042326980942829, + "loss": 2.9081, + "theoretical_loss": 3.802895486255305, + "tokens_seen": 658864128 + }, + { + "epoch": 2.02, + "learning_rate": 0.000404222668004012, + "loss": 2.8801, + "theoretical_loss": 3.802856664397508, + "tokens_seen": 658929664 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040421263791374124, + "loss": 2.8748, + "theoretical_loss": 3.8028178474816743, + "tokens_seen": 658995200 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004042026078234704, + "loss": 2.9186, + "theoretical_loss": 3.802779035506684, + "tokens_seen": 659060736 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004041925777331996, + "loss": 2.8017, + "theoretical_loss": 3.802740228471416, + "tokens_seen": 659126272 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004041825476429288, + "loss": 2.7335, + "theoretical_loss": 3.8027014263747505, + "tokens_seen": 659191808 + }, + { + "epoch": 2.02, + "learning_rate": 0.000404172517552658, + "loss": 2.7138, + "theoretical_loss": 3.802662629215569, + "tokens_seen": 659257344 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040416248746238714, + "loss": 2.843, + "theoretical_loss": 3.802623836992752, + "tokens_seen": 659322880 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004041524573721164, + "loss": 2.9381, + "theoretical_loss": 3.8025850497051805, + "tokens_seen": 659388416 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004041424272818455, + "loss": 3.0615, + "theoretical_loss": 3.802546267351737, + "tokens_seen": 659453952 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040413239719157474, + "loss": 2.6555, + "theoretical_loss": 3.802507489931303, + "tokens_seen": 659519488 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004041223671013039, + "loss": 2.6922, + "theoretical_loss": 3.802468717442761, + "tokens_seen": 659585024 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004041123370110331, + "loss": 2.8747, + "theoretical_loss": 3.802429949884994, + "tokens_seen": 659650560 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004041023069207623, + "loss": 2.6937, + "theoretical_loss": 3.8023911872568847, + "tokens_seen": 659716096 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040409227683049147, + "loss": 2.9162, + "theoretical_loss": 3.8023524295573177, + "tokens_seen": 659781632 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040408224674022065, + "loss": 2.7625, + "theoretical_loss": 3.8023136767851753, + "tokens_seen": 659847168 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004040722166499499, + "loss": 2.3571, + "theoretical_loss": 3.802274928939343, + "tokens_seen": 659912704 + }, + { + "epoch": 2.02, + "learning_rate": 0.000404062186559679, + "loss": 2.8614, + "theoretical_loss": 3.802236186018705, + "tokens_seen": 659978240 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040405215646940824, + "loss": 2.8778, + "theoretical_loss": 3.802197448022147, + "tokens_seen": 660043776 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040404212637913737, + "loss": 2.7498, + "theoretical_loss": 3.802158714948553, + "tokens_seen": 660109312 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004040320962888666, + "loss": 2.893, + "theoretical_loss": 3.8021199867968094, + "tokens_seen": 660174848 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004040220661985958, + "loss": 2.9762, + "theoretical_loss": 3.8020812635658023, + "tokens_seen": 660240384 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 799454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4462287425994873, + "objective/train/theoretical_loss": 3.8020619037952272, + "objective/train/tokens_used": 680733152, + "theoretical_loss": 3.8020619037952272, + "tokens_seen": 660273152 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040401203610832497, + "loss": 2.8877, + "theoretical_loss": 3.802042545254418, + "tokens_seen": 660305920 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040400200601805415, + "loss": 2.8215, + "theoretical_loss": 3.802003831861544, + "tokens_seen": 660371456 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004039919759277834, + "loss": 2.798, + "theoretical_loss": 3.8019651233860663, + "tokens_seen": 660436992 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004039819458375125, + "loss": 2.9349, + "theoretical_loss": 3.8019264198268736, + "tokens_seen": 660502528 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040397191574724175, + "loss": 2.7946, + "theoretical_loss": 3.801887721182853, + "tokens_seen": 660568064 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004039618856569709, + "loss": 2.9562, + "theoretical_loss": 3.8018490274528918, + "tokens_seen": 660633600 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004039518555667001, + "loss": 2.9122, + "theoretical_loss": 3.801810338635881, + "tokens_seen": 660699136 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004039418254764293, + "loss": 2.9185, + "theoretical_loss": 3.8017716547307074, + "tokens_seen": 660764672 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040393179538615847, + "loss": 2.7965, + "theoretical_loss": 3.801732975736262, + "tokens_seen": 660830208 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040392176529588765, + "loss": 3.0356, + "theoretical_loss": 3.801694301651433, + "tokens_seen": 660895744 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040391173520561683, + "loss": 2.8722, + "theoretical_loss": 3.8016556324751116, + "tokens_seen": 660961280 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040390170511534607, + "loss": 2.9781, + "theoretical_loss": 3.801616968206188, + "tokens_seen": 661026816 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040389167502507525, + "loss": 2.853, + "theoretical_loss": 3.8015783088435526, + "tokens_seen": 661092352 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040388164493480443, + "loss": 3.0213, + "theoretical_loss": 3.8015396543860964, + "tokens_seen": 661157888 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004038716148445336, + "loss": 2.7481, + "theoretical_loss": 3.8015010048327116, + "tokens_seen": 661223424 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040386158475426285, + "loss": 2.7774, + "theoretical_loss": 3.8014623601822892, + "tokens_seen": 661288960 + }, + { + "epoch": 2.02, + "learning_rate": 0.000403851554663992, + "loss": 2.7178, + "theoretical_loss": 3.8014237204337222, + "tokens_seen": 661354496 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004038415245737212, + "loss": 2.7672, + "theoretical_loss": 3.8013850855859026, + "tokens_seen": 661420032 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040383149448345034, + "loss": 2.8802, + "theoretical_loss": 3.801346455637724, + "tokens_seen": 661485568 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040382146439317957, + "loss": 2.8351, + "theoretical_loss": 3.801307830588079, + "tokens_seen": 661551104 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040381143430290875, + "loss": 2.8868, + "theoretical_loss": 3.801269210435862, + "tokens_seen": 661616640 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040380140421263794, + "loss": 2.7866, + "theoretical_loss": 3.801230595179966, + "tokens_seen": 661682176 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004037913741223671, + "loss": 2.7692, + "theoretical_loss": 3.801191984819286, + "tokens_seen": 661747712 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004037813440320963, + "loss": 2.7309, + "theoretical_loss": 3.8011533793527166, + "tokens_seen": 661813248 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004037713139418255, + "loss": 2.7817, + "theoretical_loss": 3.8011147787791533, + "tokens_seen": 661878784 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 800678, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2656052112579346, + "objective/train/theoretical_loss": 3.8010954803269037, + "objective/train/tokens_used": 682371552, + "theoretical_loss": 3.8010954803269037, + "tokens_seen": 661911552 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004037612838515547, + "loss": 2.8415, + "theoretical_loss": 3.801076183097491, + "tokens_seen": 661944320 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040375125376128384, + "loss": 2.8146, + "theoretical_loss": 3.801037592306626, + "tokens_seen": 662009856 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004037412236710131, + "loss": 2.8478, + "theoretical_loss": 3.800999006405454, + "tokens_seen": 662075392 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004037311935807422, + "loss": 2.8708, + "theoretical_loss": 3.8009604253928715, + "tokens_seen": 662140928 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040372116349047144, + "loss": 2.9692, + "theoretical_loss": 3.800921849267776, + "tokens_seen": 662206464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004037111334002006, + "loss": 3.0543, + "theoretical_loss": 3.8008832780290636, + "tokens_seen": 662272000 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004037011033099298, + "loss": 2.8201, + "theoretical_loss": 3.8008447116756336, + "tokens_seen": 662337536 + }, + { + "epoch": 2.02, + "learning_rate": 0.000403691073219659, + "loss": 2.8622, + "theoretical_loss": 3.800806150206382, + "tokens_seen": 662403072 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004036810431293882, + "loss": 2.8962, + "theoretical_loss": 3.800767593620209, + "tokens_seen": 662468608 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040367101303911734, + "loss": 3.0683, + "theoretical_loss": 3.800729041916012, + "tokens_seen": 662534144 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004036609829488466, + "loss": 3.0585, + "theoretical_loss": 3.8006904950926907, + "tokens_seen": 662599680 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004036509528585757, + "loss": 3.0067, + "theoretical_loss": 3.800651953149144, + "tokens_seen": 662665216 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040364092276830494, + "loss": 3.0874, + "theoretical_loss": 3.800613416084272, + "tokens_seen": 662730752 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004036308926780341, + "loss": 2.905, + "theoretical_loss": 3.8005748838969744, + "tokens_seen": 662796288 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004036208625877633, + "loss": 2.9418, + "theoretical_loss": 3.8005363565861527, + "tokens_seen": 662861824 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004036108324974925, + "loss": 2.7549, + "theoretical_loss": 3.800497834150706, + "tokens_seen": 662927360 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040360080240722167, + "loss": 2.8905, + "theoretical_loss": 3.800459316589537, + "tokens_seen": 662992896 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040359077231695085, + "loss": 2.752, + "theoretical_loss": 3.800420803901546, + "tokens_seen": 663058432 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004035807422266801, + "loss": 2.7383, + "theoretical_loss": 3.8003822960856364, + "tokens_seen": 663123968 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004035707121364092, + "loss": 2.9399, + "theoretical_loss": 3.8003437931407094, + "tokens_seen": 663189504 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040356068204613844, + "loss": 2.9639, + "theoretical_loss": 3.8003052950656673, + "tokens_seen": 663255040 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040355065195586757, + "loss": 2.9999, + "theoretical_loss": 3.800266801859414, + "tokens_seen": 663320576 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004035406218655968, + "loss": 2.8012, + "theoretical_loss": 3.8002283135208517, + "tokens_seen": 663386112 + }, + { + "epoch": 2.02, + "learning_rate": 0.000403530591775326, + "loss": 2.7796, + "theoretical_loss": 3.800189830048885, + "tokens_seen": 663451648 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040352056168505517, + "loss": 2.8371, + "theoretical_loss": 3.800151351442418, + "tokens_seen": 663517184 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 801382, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9103448390960693, + "objective/train/theoretical_loss": 3.800132113963404, + "objective/train/tokens_used": 684009952, + "theoretical_loss": 3.800132113963404, + "tokens_seen": 663549952 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040351053159478435, + "loss": 2.8541, + "theoretical_loss": 3.8001128777003546, + "tokens_seen": 663582720 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004035005015045136, + "loss": 2.8874, + "theoretical_loss": 3.800074408821599, + "tokens_seen": 663648256 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004034904714142427, + "loss": 2.8394, + "theoretical_loss": 3.8000359448050576, + "tokens_seen": 663713792 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040348044132397195, + "loss": 2.8841, + "theoretical_loss": 3.7999974856496346, + "tokens_seen": 663779328 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004034704112337011, + "loss": 2.6108, + "theoretical_loss": 3.799959031354237, + "tokens_seen": 663844864 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004034603811434303, + "loss": 2.8604, + "theoretical_loss": 3.7999205819177693, + "tokens_seen": 663910400 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004034503510531595, + "loss": 2.6844, + "theoretical_loss": 3.79988213733914, + "tokens_seen": 663975936 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040344032096288867, + "loss": 2.7968, + "theoretical_loss": 3.7998436976172543, + "tokens_seen": 664041472 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040343029087261785, + "loss": 2.923, + "theoretical_loss": 3.79980526275102, + "tokens_seen": 664107008 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040342026078234703, + "loss": 2.7212, + "theoretical_loss": 3.799766832739345, + "tokens_seen": 664172544 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004034102306920762, + "loss": 2.8013, + "theoretical_loss": 3.799728407581137, + "tokens_seen": 664238080 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040340020060180545, + "loss": 2.8651, + "theoretical_loss": 3.799689987275304, + "tokens_seen": 664303616 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004033901705115346, + "loss": 3.1115, + "theoretical_loss": 3.799651571820755, + "tokens_seen": 664369152 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004033801404212638, + "loss": 2.7985, + "theoretical_loss": 3.7996131612163984, + "tokens_seen": 664434688 + }, + { + "epoch": 2.02, + "learning_rate": 0.000403370110330993, + "loss": 2.779, + "theoretical_loss": 3.799574755461144, + "tokens_seen": 664500224 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004033600802407222, + "loss": 3.0251, + "theoretical_loss": 3.7995363545539016, + "tokens_seen": 664565760 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040335005015045136, + "loss": 2.9, + "theoretical_loss": 3.799497958493581, + "tokens_seen": 664631296 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040334002006018054, + "loss": 2.8359, + "theoretical_loss": 3.7994595672790927, + "tokens_seen": 664696832 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004033299899699097, + "loss": 2.8659, + "theoretical_loss": 3.7994211809093468, + "tokens_seen": 664762368 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040331995987963895, + "loss": 3.0194, + "theoretical_loss": 3.799382799383255, + "tokens_seen": 664827904 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004033099297893681, + "loss": 2.9732, + "theoretical_loss": 3.7993444226997295, + "tokens_seen": 664893440 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004032998996990973, + "loss": 2.7585, + "theoretical_loss": 3.7993060508576804, + "tokens_seen": 664958976 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040328986960882644, + "loss": 2.8927, + "theoretical_loss": 3.799267683856021, + "tokens_seen": 665024512 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004032798395185557, + "loss": 2.9975, + "theoretical_loss": 3.799229321693664, + "tokens_seen": 665090048 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040326980942828486, + "loss": 2.7975, + "theoretical_loss": 3.7991909643695214, + "tokens_seen": 665155584 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 802714, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3324520587921143, + "objective/train/theoretical_loss": 3.7991717875214412, + "objective/train/tokens_used": 685648352, + "theoretical_loss": 3.7991717875214412, + "tokens_seen": 665188352 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040325977933801404, + "loss": 2.7329, + "theoretical_loss": 3.7991526118825067, + "tokens_seen": 665221120 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004032497492477432, + "loss": 2.9358, + "theoretical_loss": 3.799114264231534, + "tokens_seen": 665286656 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004032397191574724, + "loss": 2.9407, + "theoretical_loss": 3.7990759214155156, + "tokens_seen": 665352192 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004032296890672016, + "loss": 2.8405, + "theoretical_loss": 3.799037583433368, + "tokens_seen": 665417728 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004032196589769308, + "loss": 3.0254, + "theoretical_loss": 3.7989992502840044, + "tokens_seen": 665483264 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040320962888665995, + "loss": 2.6699, + "theoretical_loss": 3.79896092196634, + "tokens_seen": 665548800 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004031995987963892, + "loss": 2.8213, + "theoretical_loss": 3.7989225984792903, + "tokens_seen": 665614336 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040318956870611836, + "loss": 2.8812, + "theoretical_loss": 3.7988842798217703, + "tokens_seen": 665679872 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040317953861584754, + "loss": 2.8593, + "theoretical_loss": 3.7988459659926965, + "tokens_seen": 665745408 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004031695085255767, + "loss": 2.8102, + "theoretical_loss": 3.798807656990986, + "tokens_seen": 665810944 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004031594784353059, + "loss": 3.0188, + "theoretical_loss": 3.798769352815554, + "tokens_seen": 665876480 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040314944834503514, + "loss": 2.7791, + "theoretical_loss": 3.7987310534653194, + "tokens_seen": 665942016 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004031394182547643, + "loss": 2.8322, + "theoretical_loss": 3.7986927589391977, + "tokens_seen": 666007552 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004031293881644935, + "loss": 2.9257, + "theoretical_loss": 3.798654469236107, + "tokens_seen": 666073088 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004031193580742227, + "loss": 2.8127, + "theoretical_loss": 3.798616184354967, + "tokens_seen": 666138624 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040310932798395187, + "loss": 2.847, + "theoretical_loss": 3.798577904294694, + "tokens_seen": 666204160 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040309929789368105, + "loss": 3.0944, + "theoretical_loss": 3.7985396290542086, + "tokens_seen": 666269696 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004030892678034103, + "loss": 2.6814, + "theoretical_loss": 3.798501358632429, + "tokens_seen": 666335232 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004030792377131394, + "loss": 2.9891, + "theoretical_loss": 3.798463093028275, + "tokens_seen": 666400768 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040306920762286864, + "loss": 2.7525, + "theoretical_loss": 3.798424832240666, + "tokens_seen": 666466304 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040305917753259777, + "loss": 3.0883, + "theoretical_loss": 3.7983865762685225, + "tokens_seen": 666531840 + }, + { + "epoch": 2.02, + "learning_rate": 0.000403049147442327, + "loss": 2.946, + "theoretical_loss": 3.7983483251107657, + "tokens_seen": 666597376 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004030391173520562, + "loss": 2.8419, + "theoretical_loss": 3.798310078766315, + "tokens_seen": 666662912 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040302908726178537, + "loss": 2.8564, + "theoretical_loss": 3.7982718372340933, + "tokens_seen": 666728448 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040301905717151455, + "loss": 2.7861, + "theoretical_loss": 3.7982336005130213, + "tokens_seen": 666793984 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 803703, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0033750534057617, + "objective/train/theoretical_loss": 3.798214483956329, + "objective/train/tokens_used": 687286752, + "theoretical_loss": 3.798214483956329, + "tokens_seen": 666826752 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004030090270812438, + "loss": 2.9464, + "theoretical_loss": 3.7981953686020207, + "tokens_seen": 666859520 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004029989969909729, + "loss": 3.1007, + "theoretical_loss": 3.798157141500014, + "tokens_seen": 666925056 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040298896690070215, + "loss": 2.8264, + "theoretical_loss": 3.7981189192059244, + "tokens_seen": 666990592 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004029789368104313, + "loss": 2.8476, + "theoretical_loss": 3.7980807017186744, + "tokens_seen": 667056128 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004029689067201605, + "loss": 3.1105, + "theoretical_loss": 3.7980424890371873, + "tokens_seen": 667121664 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004029588766298897, + "loss": 2.6239, + "theoretical_loss": 3.7980042811603867, + "tokens_seen": 667187200 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040294884653961887, + "loss": 2.8954, + "theoretical_loss": 3.797966078087197, + "tokens_seen": 667252736 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040293881644934805, + "loss": 3.0282, + "theoretical_loss": 3.797927879816542, + "tokens_seen": 667318272 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040292878635907723, + "loss": 3.0588, + "theoretical_loss": 3.7978896863473466, + "tokens_seen": 667383808 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004029187562688064, + "loss": 2.8539, + "theoretical_loss": 3.7978514976785362, + "tokens_seen": 667449344 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040290872617853565, + "loss": 2.8734, + "theoretical_loss": 3.797813313809036, + "tokens_seen": 667514880 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004028986960882648, + "loss": 2.8289, + "theoretical_loss": 3.7977751347377717, + "tokens_seen": 667580416 + }, + { + "epoch": 2.02, + "learning_rate": 0.000402888665997994, + "loss": 2.7012, + "theoretical_loss": 3.7977369604636695, + "tokens_seen": 667645952 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004028786359077232, + "loss": 2.9201, + "theoretical_loss": 3.7976987909856557, + "tokens_seen": 667711488 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004028686058174524, + "loss": 2.8353, + "theoretical_loss": 3.7976606263026573, + "tokens_seen": 667777024 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040285857572718156, + "loss": 3.0119, + "theoretical_loss": 3.7976224664136007, + "tokens_seen": 667842560 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040284854563691074, + "loss": 2.9295, + "theoretical_loss": 3.7975843113174146, + "tokens_seen": 667908096 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004028385155466399, + "loss": 2.653, + "theoretical_loss": 3.797546161013026, + "tokens_seen": 667973632 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040282848545636915, + "loss": 2.9208, + "theoretical_loss": 3.7975080154993632, + "tokens_seen": 668039168 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004028184553660983, + "loss": 2.9167, + "theoretical_loss": 3.7974698747753552, + "tokens_seen": 668104704 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004028084252758275, + "loss": 2.961, + "theoretical_loss": 3.7974317388399297, + "tokens_seen": 668170240 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040279839518555664, + "loss": 2.9478, + "theoretical_loss": 3.797393607692017, + "tokens_seen": 668235776 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004027883650952859, + "loss": 2.8548, + "theoretical_loss": 3.7973554813305466, + "tokens_seen": 668301312 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040277833500501506, + "loss": 2.9796, + "theoretical_loss": 3.797317359754448, + "tokens_seen": 668366848 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040276830491474424, + "loss": 2.7677, + "theoretical_loss": 3.797279242962651, + "tokens_seen": 668432384 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 804287, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6115474700927734, + "objective/train/theoretical_loss": 3.7972601863605315, + "objective/train/tokens_used": 688925152, + "theoretical_loss": 3.7972601863605315, + "tokens_seen": 668465152 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004027582748244734, + "loss": 2.9188, + "theoretical_loss": 3.797241130954087, + "tokens_seen": 668497920 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004027482447342026, + "loss": 2.7482, + "theoretical_loss": 3.7972030237276866, + "tokens_seen": 668563456 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004027382146439318, + "loss": 2.6248, + "theoretical_loss": 3.7971649212823806, + "tokens_seen": 668628992 + }, + { + "epoch": 2.02, + "learning_rate": 0.000402728184553661, + "loss": 2.8523, + "theoretical_loss": 3.7971268236171016, + "tokens_seen": 668694528 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040271815446339015, + "loss": 2.7225, + "theoretical_loss": 3.7970887307307812, + "tokens_seen": 668760064 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004027081243731194, + "loss": 2.7711, + "theoretical_loss": 3.7970506426223514, + "tokens_seen": 668825600 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040269809428284856, + "loss": 3.0768, + "theoretical_loss": 3.797012559290745, + "tokens_seen": 668891136 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040268806419257774, + "loss": 3.112, + "theoretical_loss": 3.796974480734894, + "tokens_seen": 668956672 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004026780341023069, + "loss": 2.8936, + "theoretical_loss": 3.7969364069537344, + "tokens_seen": 669022208 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004026680040120361, + "loss": 2.8176, + "theoretical_loss": 3.796898337946197, + "tokens_seen": 669087744 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004026579739217653, + "loss": 2.8419, + "theoretical_loss": 3.796860273711217, + "tokens_seen": 669153280 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004026479438314945, + "loss": 2.9462, + "theoretical_loss": 3.7968222142477295, + "tokens_seen": 669218816 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040263791374122365, + "loss": 2.8592, + "theoretical_loss": 3.796784159554668, + "tokens_seen": 669284352 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004026278836509529, + "loss": 2.7023, + "theoretical_loss": 3.796746109630967, + "tokens_seen": 669349888 + }, + { + "epoch": 2.02, + "learning_rate": 0.000402617853560682, + "loss": 2.7926, + "theoretical_loss": 3.796708064475564, + "tokens_seen": 669415424 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040260782347041125, + "loss": 2.7279, + "theoretical_loss": 3.796670024087393, + "tokens_seen": 669480960 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040259779338014043, + "loss": 3.0721, + "theoretical_loss": 3.796631988465391, + "tokens_seen": 669546496 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004025877632898696, + "loss": 2.7945, + "theoretical_loss": 3.796593957608494, + "tokens_seen": 669612032 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004025777331995988, + "loss": 2.9095, + "theoretical_loss": 3.7965559315156385, + "tokens_seen": 669677568 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040256770310932797, + "loss": 2.977, + "theoretical_loss": 3.796517910185762, + "tokens_seen": 669743104 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040255767301905715, + "loss": 2.8426, + "theoretical_loss": 3.7964798936178017, + "tokens_seen": 669808640 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004025476429287864, + "loss": 2.8808, + "theoretical_loss": 3.7964418818106953, + "tokens_seen": 669874176 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004025376128385155, + "loss": 2.8402, + "theoretical_loss": 3.7964038747633815, + "tokens_seen": 669939712 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040252758274824475, + "loss": 2.9317, + "theoretical_loss": 3.796365872474798, + "tokens_seen": 670005248 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040251755265797393, + "loss": 2.8884, + "theoretical_loss": 3.7963278749438842, + "tokens_seen": 670070784 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 805654, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.389000654220581, + "objective/train/theoretical_loss": 3.796308877962222, + "objective/train/tokens_used": 690563552, + "theoretical_loss": 3.796308877962222, + "tokens_seen": 670103552 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004025075225677031, + "loss": 3.0705, + "theoretical_loss": 3.796289882169579, + "tokens_seen": 670136320 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004024974924774323, + "loss": 2.9853, + "theoretical_loss": 3.7962518941508216, + "tokens_seen": 670201856 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004024874623871615, + "loss": 2.7051, + "theoretical_loss": 3.796213910886552, + "tokens_seen": 670267392 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040247743229689066, + "loss": 2.6793, + "theoretical_loss": 3.79617593237571, + "tokens_seen": 670332928 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004024674022066199, + "loss": 2.7152, + "theoretical_loss": 3.796137958617237, + "tokens_seen": 670398464 + }, + { + "epoch": 2.02, + "learning_rate": 0.000402457372116349, + "loss": 2.7582, + "theoretical_loss": 3.796099989610073, + "tokens_seen": 670464000 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040244734202607825, + "loss": 3.1524, + "theoretical_loss": 3.7960620253531596, + "tokens_seen": 670529536 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004024373119358074, + "loss": 3.0175, + "theoretical_loss": 3.7960240658454385, + "tokens_seen": 670595072 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004024272818455366, + "loss": 2.637, + "theoretical_loss": 3.795986111085851, + "tokens_seen": 670660608 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004024172517552658, + "loss": 2.8622, + "theoretical_loss": 3.7959481610733397, + "tokens_seen": 670726144 + }, + { + "epoch": 2.02, + "learning_rate": 0.000402407221664995, + "loss": 2.5987, + "theoretical_loss": 3.7959102158068463, + "tokens_seen": 670791680 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004023971915747242, + "loss": 3.1051, + "theoretical_loss": 3.795872275285315, + "tokens_seen": 670857216 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004023871614844534, + "loss": 2.5387, + "theoretical_loss": 3.7958343395076883, + "tokens_seen": 670922752 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004023771313941826, + "loss": 2.6301, + "theoretical_loss": 3.7957964084729094, + "tokens_seen": 670988288 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040236710130391176, + "loss": 3.0434, + "theoretical_loss": 3.795758482179923, + "tokens_seen": 671053824 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040235707121364094, + "loss": 2.9399, + "theoretical_loss": 3.7957205606276725, + "tokens_seen": 671119360 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004023470411233701, + "loss": 2.9368, + "theoretical_loss": 3.795682643815103, + "tokens_seen": 671184896 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040233701103309935, + "loss": 3.0092, + "theoretical_loss": 3.7956447317411595, + "tokens_seen": 671250432 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004023269809428285, + "loss": 2.6606, + "theoretical_loss": 3.7956068244047865, + "tokens_seen": 671315968 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004023169508525577, + "loss": 2.905, + "theoretical_loss": 3.79556892180493, + "tokens_seen": 671381504 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040230692076228684, + "loss": 2.768, + "theoretical_loss": 3.795531023940536, + "tokens_seen": 671447040 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004022968906720161, + "loss": 3.1211, + "theoretical_loss": 3.7954931308105513, + "tokens_seen": 671512576 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040228686058174526, + "loss": 3.0504, + "theoretical_loss": 3.7954552424139214, + "tokens_seen": 671578112 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040227683049147444, + "loss": 2.8887, + "theoretical_loss": 3.795417358749594, + "tokens_seen": 671643648 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004022668004012036, + "loss": 2.6697, + "theoretical_loss": 3.795379479816516, + "tokens_seen": 671709184 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 806421, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0061473846435547, + "objective/train/theoretical_loss": 3.7953605421238663, + "objective/train/tokens_used": 692201952, + "theoretical_loss": 3.7953605421238663, + "tokens_seen": 671741952 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004022567703109328, + "loss": 2.9267, + "theoretical_loss": 3.7953416056136344, + "tokens_seen": 671774720 + }, + { + "epoch": 2.02, + "learning_rate": 0.000402246740220662, + "loss": 2.9145, + "theoretical_loss": 3.795303736139899, + "tokens_seen": 671840256 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004022367101303912, + "loss": 2.9675, + "theoretical_loss": 3.795265871394256, + "tokens_seen": 671905792 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040222668004012035, + "loss": 3.1682, + "theoretical_loss": 3.795228011375655, + "tokens_seen": 671971328 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004022166499498496, + "loss": 2.7738, + "theoretical_loss": 3.795190156083045, + "tokens_seen": 672036864 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040220661985957876, + "loss": 2.8969, + "theoretical_loss": 3.7951523055153755, + "tokens_seen": 672102400 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040219658976930794, + "loss": 2.9553, + "theoretical_loss": 3.795114459671595, + "tokens_seen": 672167936 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004021865596790371, + "loss": 2.8685, + "theoretical_loss": 3.795076618550654, + "tokens_seen": 672233472 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004021765295887663, + "loss": 2.8885, + "theoretical_loss": 3.795038782151504, + "tokens_seen": 672299008 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004021664994984955, + "loss": 2.9009, + "theoretical_loss": 3.795000950473094, + "tokens_seen": 672364544 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004021564694082247, + "loss": 3.0639, + "theoretical_loss": 3.7949631235143753, + "tokens_seen": 672430080 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040214643931795385, + "loss": 2.7665, + "theoretical_loss": 3.7949253012743, + "tokens_seen": 672495616 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004021364092276831, + "loss": 2.8488, + "theoretical_loss": 3.7948874837518183, + "tokens_seen": 672561152 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004021263791374122, + "loss": 2.8415, + "theoretical_loss": 3.794849670945884, + "tokens_seen": 672626688 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040211634904714145, + "loss": 2.9815, + "theoretical_loss": 3.7948118628554477, + "tokens_seen": 672692224 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040210631895687063, + "loss": 2.9568, + "theoretical_loss": 3.794774059479463, + "tokens_seen": 672757760 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004020962888665998, + "loss": 2.7326, + "theoretical_loss": 3.7947362608168826, + "tokens_seen": 672823296 + }, + { + "epoch": 2.02, + "learning_rate": 0.000402086258776329, + "loss": 3.0833, + "theoretical_loss": 3.79469846686666, + "tokens_seen": 672888832 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040207622868605817, + "loss": 3.0286, + "theoretical_loss": 3.794660677627748, + "tokens_seen": 672954368 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040206619859578735, + "loss": 2.8386, + "theoretical_loss": 3.7946228930991017, + "tokens_seen": 673019904 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004020561685055166, + "loss": 2.9388, + "theoretical_loss": 3.794585113279675, + "tokens_seen": 673085440 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004020461384152457, + "loss": 3.028, + "theoretical_loss": 3.7945473381684227, + "tokens_seen": 673150976 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040203610832497495, + "loss": 2.7476, + "theoretical_loss": 3.7945095677642993, + "tokens_seen": 673216512 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040202607823470413, + "loss": 2.9742, + "theoretical_loss": 3.7944718020662602, + "tokens_seen": 673282048 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004020160481444333, + "loss": 3.0702, + "theoretical_loss": 3.794434041073261, + "tokens_seen": 673347584 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 807207, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9605650901794434, + "objective/train/theoretical_loss": 3.794415162340826, + "objective/train/tokens_used": 693840352, + "theoretical_loss": 3.794415162340826, + "tokens_seen": 673380352 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004020060180541625, + "loss": 2.7631, + "theoretical_loss": 3.7943962847842587, + "tokens_seen": 673413120 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004019959879638917, + "loss": 3.1873, + "theoretical_loss": 3.794358533198208, + "tokens_seen": 673478656 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040198595787362086, + "loss": 3.0331, + "theoretical_loss": 3.7943207863140667, + "tokens_seen": 673544192 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004019759277833501, + "loss": 2.8757, + "theoretical_loss": 3.7942830441307915, + "tokens_seen": 673609728 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004019658976930792, + "loss": 2.8293, + "theoretical_loss": 3.794245306647339, + "tokens_seen": 673675264 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040195586760280845, + "loss": 2.7213, + "theoretical_loss": 3.7942075738626677, + "tokens_seen": 673740800 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004019458375125376, + "loss": 2.8015, + "theoretical_loss": 3.794169845775736, + "tokens_seen": 673806336 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004019358074222668, + "loss": 2.7856, + "theoretical_loss": 3.7941321223855002, + "tokens_seen": 673871872 + }, + { + "epoch": 2.02, + "learning_rate": 0.000401925777331996, + "loss": 2.8947, + "theoretical_loss": 3.794094403690921, + "tokens_seen": 673937408 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004019157472417252, + "loss": 2.7252, + "theoretical_loss": 3.794056689690956, + "tokens_seen": 674002944 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040190571715145436, + "loss": 2.9241, + "theoretical_loss": 3.794018980384566, + "tokens_seen": 674068480 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004018956870611836, + "loss": 2.9409, + "theoretical_loss": 3.7939812757707094, + "tokens_seen": 674134016 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004018856569709127, + "loss": 2.9654, + "theoretical_loss": 3.7939435758483464, + "tokens_seen": 674199552 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040187562688064196, + "loss": 2.6226, + "theoretical_loss": 3.793905880616437, + "tokens_seen": 674265088 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004018655967903711, + "loss": 2.7045, + "theoretical_loss": 3.793868190073943, + "tokens_seen": 674330624 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004018555667001003, + "loss": 2.8452, + "theoretical_loss": 3.793830504219824, + "tokens_seen": 674396160 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004018455366098295, + "loss": 2.8149, + "theoretical_loss": 3.793792823053042, + "tokens_seen": 674461696 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004018355065195587, + "loss": 2.997, + "theoretical_loss": 3.7937551465725585, + "tokens_seen": 674527232 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040182547642928786, + "loss": 2.7987, + "theoretical_loss": 3.7937174747773357, + "tokens_seen": 674592768 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040181544633901704, + "loss": 2.7863, + "theoretical_loss": 3.7936798076663347, + "tokens_seen": 674658304 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004018054162487462, + "loss": 2.7427, + "theoretical_loss": 3.79364214523852, + "tokens_seen": 674723840 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040179538615847546, + "loss": 2.8898, + "theoretical_loss": 3.7936044874928534, + "tokens_seen": 674789376 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004017853560682046, + "loss": 2.7465, + "theoretical_loss": 3.7935668344282982, + "tokens_seen": 674854912 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004017753259779338, + "loss": 2.7848, + "theoretical_loss": 3.793529186043818, + "tokens_seen": 674920448 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040176529588766295, + "loss": 2.7028, + "theoretical_loss": 3.7934915423383764, + "tokens_seen": 674985984 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 807950, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.584026336669922, + "objective/train/theoretical_loss": 3.7934727222399722, + "objective/train/tokens_used": 695478752, + "theoretical_loss": 3.7934727222399722, + "tokens_seen": 675018752 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004017552657973922, + "loss": 2.759, + "theoretical_loss": 3.793453903310939, + "tokens_seen": 675051520 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040174523570712137, + "loss": 2.9461, + "theoretical_loss": 3.793416268960469, + "tokens_seen": 675117056 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040173520561685055, + "loss": 2.9082, + "theoretical_loss": 3.7933786392859323, + "tokens_seen": 675182592 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040172517552657973, + "loss": 2.8488, + "theoretical_loss": 3.793341014286293, + "tokens_seen": 675248128 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040171514543630896, + "loss": 3.0354, + "theoretical_loss": 3.7933033939605183, + "tokens_seen": 675313664 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004017051153460381, + "loss": 2.7168, + "theoretical_loss": 3.793265778307572, + "tokens_seen": 675379200 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004016950852557673, + "loss": 2.8791, + "theoretical_loss": 3.7932281673264225, + "tokens_seen": 675444736 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040168505516549645, + "loss": 2.5665, + "theoretical_loss": 3.7931905610160355, + "tokens_seen": 675510272 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004016750250752257, + "loss": 3.0928, + "theoretical_loss": 3.793152959375377, + "tokens_seen": 675575808 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040166499498495487, + "loss": 2.8882, + "theoretical_loss": 3.7931153624034155, + "tokens_seen": 675641344 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040165496489468405, + "loss": 2.9232, + "theoretical_loss": 3.793077770099118, + "tokens_seen": 675706880 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004016449348044133, + "loss": 3.0883, + "theoretical_loss": 3.7930401824614526, + "tokens_seen": 675772416 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004016349047141424, + "loss": 2.8628, + "theoretical_loss": 3.7930025994893866, + "tokens_seen": 675837952 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040162487462387165, + "loss": 3.0159, + "theoretical_loss": 3.79296502118189, + "tokens_seen": 675903488 + }, + { + "epoch": 2.02, + "learning_rate": 0.00040161484453360083, + "loss": 2.916, + "theoretical_loss": 3.792927447537931, + "tokens_seen": 675969024 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040160481444333, + "loss": 2.9463, + "theoretical_loss": 3.792889878556479, + "tokens_seen": 676034560 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004015947843530592, + "loss": 2.9628, + "theoretical_loss": 3.7928523142365025, + "tokens_seen": 676100096 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040158475426278837, + "loss": 3.0279, + "theoretical_loss": 3.7928147545769724, + "tokens_seen": 676165632 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040157472417251755, + "loss": 2.4866, + "theoretical_loss": 3.792777199576859, + "tokens_seen": 676231168 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004015646940822468, + "loss": 2.6683, + "theoretical_loss": 3.792739649235132, + "tokens_seen": 676296704 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004015546639919759, + "loss": 2.8666, + "theoretical_loss": 3.792702103550763, + "tokens_seen": 676362240 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040154463390170515, + "loss": 3.0335, + "theoretical_loss": 3.792664562522723, + "tokens_seen": 676427776 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040153460381143433, + "loss": 3.1476, + "theoretical_loss": 3.7926270261499826, + "tokens_seen": 676493312 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004015245737211635, + "loss": 2.7187, + "theoretical_loss": 3.792589494431515, + "tokens_seen": 676558848 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004015145436308927, + "loss": 2.9815, + "theoretical_loss": 3.792551967366291, + "tokens_seen": 676624384 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 809057, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.946103096008301, + "objective/train/theoretical_loss": 3.7925332055783247, + "objective/train/tokens_used": 697117152, + "theoretical_loss": 3.7925332055783247, + "tokens_seen": 676657152 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004015045135406219, + "loss": 3.0284, + "theoretical_loss": 3.7925144449532837, + "tokens_seen": 676689920 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040149448345035106, + "loss": 2.8247, + "theoretical_loss": 3.7924769271914665, + "tokens_seen": 676755456 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004014844533600803, + "loss": 2.9618, + "theoretical_loss": 3.7924394140798117, + "tokens_seen": 676820992 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004014744232698094, + "loss": 2.795, + "theoretical_loss": 3.792401905617293, + "tokens_seen": 676886528 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040146439317953865, + "loss": 3.0326, + "theoretical_loss": 3.792364401802884, + "tokens_seen": 676952064 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004014543630892678, + "loss": 3.1664, + "theoretical_loss": 3.7923269026355593, + "tokens_seen": 677017600 + }, + { + "epoch": 2.03, + "learning_rate": 0.000401444332998997, + "loss": 2.8116, + "theoretical_loss": 3.7922894081142924, + "tokens_seen": 677083136 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004014343029087262, + "loss": 2.6791, + "theoretical_loss": 3.792251918238059, + "tokens_seen": 677148672 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004014242728184554, + "loss": 2.8246, + "theoretical_loss": 3.7922144330058334, + "tokens_seen": 677214208 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040141424272818456, + "loss": 2.7107, + "theoretical_loss": 3.7921769524165923, + "tokens_seen": 677279744 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004014042126379138, + "loss": 2.9204, + "theoretical_loss": 3.7921394764693095, + "tokens_seen": 677345280 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004013941825476429, + "loss": 2.6977, + "theoretical_loss": 3.7921020051629624, + "tokens_seen": 677410816 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040138415245737216, + "loss": 2.658, + "theoretical_loss": 3.7920645384965272, + "tokens_seen": 677476352 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004013741223671013, + "loss": 2.928, + "theoretical_loss": 3.7920270764689805, + "tokens_seen": 677541888 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004013640922768305, + "loss": 2.8152, + "theoretical_loss": 3.791989619079299, + "tokens_seen": 677607424 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004013540621865597, + "loss": 2.8409, + "theoretical_loss": 3.7919521663264604, + "tokens_seen": 677672960 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004013440320962889, + "loss": 3.0376, + "theoretical_loss": 3.7919147182094424, + "tokens_seen": 677738496 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040133400200601806, + "loss": 2.7009, + "theoretical_loss": 3.7918772747272227, + "tokens_seen": 677804032 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040132397191574724, + "loss": 2.8787, + "theoretical_loss": 3.79183983587878, + "tokens_seen": 677869568 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004013139418254764, + "loss": 2.9441, + "theoretical_loss": 3.7918024016630922, + "tokens_seen": 677935104 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040130391173520566, + "loss": 2.6351, + "theoretical_loss": 3.7917649720791395, + "tokens_seen": 678000640 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004012938816449348, + "loss": 2.7128, + "theoretical_loss": 3.7917275471259, + "tokens_seen": 678066176 + }, + { + "epoch": 2.03, + "learning_rate": 0.000401283851554664, + "loss": 2.9271, + "theoretical_loss": 3.791690126802354, + "tokens_seen": 678131712 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040127382146439315, + "loss": 2.9554, + "theoretical_loss": 3.7916527111074814, + "tokens_seen": 678197248 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004012637913741224, + "loss": 2.9009, + "theoretical_loss": 3.791615300040262, + "tokens_seen": 678262784 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 809644, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9782016277313232, + "objective/train/theoretical_loss": 3.791596596241704, + "objective/train/tokens_used": 698755552, + "theoretical_loss": 3.791596596241704, + "tokens_seen": 678295552 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040125376128385157, + "loss": 2.9855, + "theoretical_loss": 3.7915778935996767, + "tokens_seen": 678328320 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040124373119358075, + "loss": 3.0623, + "theoretical_loss": 3.7915404917847066, + "tokens_seen": 678393856 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040123370110330993, + "loss": 2.8576, + "theoretical_loss": 3.791503094594333, + "tokens_seen": 678459392 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040122367101303916, + "loss": 2.8939, + "theoretical_loss": 3.791465702027537, + "tokens_seen": 678524928 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004012136409227683, + "loss": 2.9493, + "theoretical_loss": 3.7914283140833005, + "tokens_seen": 678590464 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004012036108324975, + "loss": 2.5703, + "theoretical_loss": 3.791390930760606, + "tokens_seen": 678656000 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040119358074222665, + "loss": 2.9312, + "theoretical_loss": 3.791353552058436, + "tokens_seen": 678721536 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004011835506519559, + "loss": 2.6408, + "theoretical_loss": 3.791316177975773, + "tokens_seen": 678787072 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040117352056168507, + "loss": 2.7553, + "theoretical_loss": 3.7912788085116005, + "tokens_seen": 678852608 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040116349047141425, + "loss": 2.8303, + "theoretical_loss": 3.7912414436649016, + "tokens_seen": 678918144 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040115346038114343, + "loss": 3.0389, + "theoretical_loss": 3.7912040834346605, + "tokens_seen": 678983680 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004011434302908726, + "loss": 2.7881, + "theoretical_loss": 3.7911667278198617, + "tokens_seen": 679049216 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004011334002006018, + "loss": 2.6978, + "theoretical_loss": 3.791129376819489, + "tokens_seen": 679114752 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040112337011033103, + "loss": 2.9489, + "theoretical_loss": 3.7910920304325275, + "tokens_seen": 679180288 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040111334002006016, + "loss": 2.7048, + "theoretical_loss": 3.7910546886579617, + "tokens_seen": 679245824 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004011033099297894, + "loss": 2.9393, + "theoretical_loss": 3.791017351494778, + "tokens_seen": 679311360 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004010932798395185, + "loss": 2.9715, + "theoretical_loss": 3.790980018941961, + "tokens_seen": 679376896 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040108324974924775, + "loss": 2.8581, + "theoretical_loss": 3.790942690998498, + "tokens_seen": 679442432 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040107321965897693, + "loss": 2.5724, + "theoretical_loss": 3.790905367663374, + "tokens_seen": 679507968 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004010631895687061, + "loss": 2.8063, + "theoretical_loss": 3.790868048935577, + "tokens_seen": 679573504 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004010531594784353, + "loss": 3.0307, + "theoretical_loss": 3.7908307348140933, + "tokens_seen": 679639040 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040104312938816453, + "loss": 2.8895, + "theoretical_loss": 3.7907934252979105, + "tokens_seen": 679704576 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040103309929789366, + "loss": 2.9646, + "theoretical_loss": 3.790756120386016, + "tokens_seen": 679770112 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004010230692076229, + "loss": 2.8548, + "theoretical_loss": 3.7907188200773985, + "tokens_seen": 679835648 + }, + { + "epoch": 2.03, + "learning_rate": 0.000401013039117352, + "loss": 2.9081, + "theoretical_loss": 3.7906815243710454, + "tokens_seen": 679901184 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 811111, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.683133363723755, + "objective/train/theoretical_loss": 3.7906628782434018, + "objective/train/tokens_used": 700393952, + "theoretical_loss": 3.7906628782434018, + "tokens_seen": 679933952 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040100300902708126, + "loss": 2.9855, + "theoretical_loss": 3.7906442332659456, + "tokens_seen": 679966720 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040099297893681044, + "loss": 2.7769, + "theoretical_loss": 3.790606946761088, + "tokens_seen": 680032256 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004009829488465396, + "loss": 2.7631, + "theoretical_loss": 3.790569664855462, + "tokens_seen": 680097792 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004009729187562688, + "loss": 3.0075, + "theoretical_loss": 3.7905323875480574, + "tokens_seen": 680163328 + }, + { + "epoch": 2.03, + "learning_rate": 0.000400962888665998, + "loss": 2.6681, + "theoretical_loss": 3.7904951148378636, + "tokens_seen": 680228864 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040095285857572716, + "loss": 3.1031, + "theoretical_loss": 3.7904578467238714, + "tokens_seen": 680294400 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004009428284854564, + "loss": 2.8202, + "theoretical_loss": 3.790420583205071, + "tokens_seen": 680359936 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004009327983951855, + "loss": 2.8716, + "theoretical_loss": 3.7903833242804534, + "tokens_seen": 680425472 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040092276830491476, + "loss": 2.6202, + "theoretical_loss": 3.790346069949009, + "tokens_seen": 680491008 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004009127382146439, + "loss": 2.8913, + "theoretical_loss": 3.790308820209731, + "tokens_seen": 680556544 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004009027081243731, + "loss": 3.0475, + "theoretical_loss": 3.7902715750616087, + "tokens_seen": 680622080 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040089267803410236, + "loss": 2.9008, + "theoretical_loss": 3.7902343345036367, + "tokens_seen": 680687616 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004008826479438315, + "loss": 2.9714, + "theoretical_loss": 3.790197098534806, + "tokens_seen": 680753152 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004008726178535607, + "loss": 2.8291, + "theoretical_loss": 3.7901598671541104, + "tokens_seen": 680818688 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004008625877632899, + "loss": 3.0035, + "theoretical_loss": 3.7901226403605417, + "tokens_seen": 680884224 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004008525576730191, + "loss": 2.8466, + "theoretical_loss": 3.7900854181530943, + "tokens_seen": 680949760 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040084252758274826, + "loss": 3.0043, + "theoretical_loss": 3.7900482005307614, + "tokens_seen": 681015296 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040083249749247744, + "loss": 2.7097, + "theoretical_loss": 3.7900109874925376, + "tokens_seen": 681080832 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004008224674022066, + "loss": 2.713, + "theoretical_loss": 3.7899737790374166, + "tokens_seen": 681146368 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040081243731193586, + "loss": 3.0035, + "theoretical_loss": 3.7899365751643934, + "tokens_seen": 681211904 + }, + { + "epoch": 2.03, + "learning_rate": 0.000400802407221665, + "loss": 2.9085, + "theoretical_loss": 3.7898993758724635, + "tokens_seen": 681277440 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004007923771313942, + "loss": 2.908, + "theoretical_loss": 3.789862181160621, + "tokens_seen": 681342976 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040078234704112335, + "loss": 2.8204, + "theoretical_loss": 3.7898249910278627, + "tokens_seen": 681408512 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004007723169508526, + "loss": 2.8935, + "theoretical_loss": 3.7897878054731837, + "tokens_seen": 681474048 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040076228686058177, + "loss": 3.0723, + "theoretical_loss": 3.7897506244955808, + "tokens_seen": 681539584 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 811509, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8229663372039795, + "objective/train/theoretical_loss": 3.789732035722869, + "objective/train/tokens_used": 702032352, + "theoretical_loss": 3.789732035722869, + "tokens_seen": 681572352 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040075225677031095, + "loss": 2.7856, + "theoretical_loss": 3.7897134480940506, + "tokens_seen": 681605120 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040074222668004013, + "loss": 2.6028, + "theoretical_loss": 3.7896762762675893, + "tokens_seen": 681670656 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040073219658976936, + "loss": 2.8811, + "theoretical_loss": 3.7896391090151944, + "tokens_seen": 681736192 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004007221664994985, + "loss": 2.9482, + "theoretical_loss": 3.789601946335864, + "tokens_seen": 681801728 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004007121364092277, + "loss": 2.8387, + "theoretical_loss": 3.7895647882285957, + "tokens_seen": 681867264 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040070210631895685, + "loss": 2.8092, + "theoretical_loss": 3.789527634692387, + "tokens_seen": 681932800 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004006920762286861, + "loss": 2.9132, + "theoretical_loss": 3.7894904857262373, + "tokens_seen": 681998336 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040068204613841527, + "loss": 3.1188, + "theoretical_loss": 3.7894533413291445, + "tokens_seen": 682063872 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040067201604814445, + "loss": 3.0301, + "theoretical_loss": 3.7894162015001087, + "tokens_seen": 682129408 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040066198595787363, + "loss": 2.6942, + "theoretical_loss": 3.789379066238128, + "tokens_seen": 682194944 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004006519558676028, + "loss": 3.0303, + "theoretical_loss": 3.789341935542204, + "tokens_seen": 682260480 + }, + { + "epoch": 2.03, + "learning_rate": 0.000400641925777332, + "loss": 2.9624, + "theoretical_loss": 3.789304809411335, + "tokens_seen": 682326016 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040063189568706123, + "loss": 2.893, + "theoretical_loss": 3.789267687844522, + "tokens_seen": 682391552 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040062186559679036, + "loss": 2.8456, + "theoretical_loss": 3.7892305708407656, + "tokens_seen": 682457088 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004006118355065196, + "loss": 2.7705, + "theoretical_loss": 3.789193458399067, + "tokens_seen": 682522624 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004006018054162487, + "loss": 3.0149, + "theoretical_loss": 3.789156350518428, + "tokens_seen": 682588160 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040059177532597795, + "loss": 2.8807, + "theoretical_loss": 3.7891192471978488, + "tokens_seen": 682653696 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040058174523570713, + "loss": 2.9405, + "theoretical_loss": 3.789082148436332, + "tokens_seen": 682719232 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004005717151454363, + "loss": 2.8325, + "theoretical_loss": 3.7890450542328806, + "tokens_seen": 682784768 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004005616850551655, + "loss": 2.9391, + "theoretical_loss": 3.7890079645864967, + "tokens_seen": 682850304 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040055165496489473, + "loss": 3.1444, + "theoretical_loss": 3.7889708794961825, + "tokens_seen": 682915840 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040054162487462386, + "loss": 2.9871, + "theoretical_loss": 3.788933798960942, + "tokens_seen": 682981376 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004005315947843531, + "loss": 2.8059, + "theoretical_loss": 3.7888967229797785, + "tokens_seen": 683046912 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004005215646940822, + "loss": 2.5791, + "theoretical_loss": 3.7888596515516952, + "tokens_seen": 683112448 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040051153460381146, + "loss": 3.0333, + "theoretical_loss": 3.7888225846756978, + "tokens_seen": 683177984 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 812647, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.369586229324341, + "objective/train/theoretical_loss": 3.7888040529444194, + "objective/train/tokens_used": 703670752, + "theoretical_loss": 3.7888040529444194, + "tokens_seen": 683210752 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040050150451354064, + "loss": 2.8012, + "theoretical_loss": 3.788785522350789, + "tokens_seen": 683243520 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004004914744232698, + "loss": 3.2464, + "theoretical_loss": 3.7887484645759746, + "tokens_seen": 683309056 + }, + { + "epoch": 2.03, + "learning_rate": 0.000400481444332999, + "loss": 2.9848, + "theoretical_loss": 3.788711411350259, + "tokens_seen": 683374592 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004004714142427282, + "loss": 2.8967, + "theoretical_loss": 3.7886743626726487, + "tokens_seen": 683440128 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040046138415245736, + "loss": 2.9512, + "theoretical_loss": 3.788637318542148, + "tokens_seen": 683505664 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004004513540621866, + "loss": 2.9431, + "theoretical_loss": 3.7886002789577633, + "tokens_seen": 683571200 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004004413239719157, + "loss": 2.8512, + "theoretical_loss": 3.788563243918502, + "tokens_seen": 683636736 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040043129388164496, + "loss": 2.7822, + "theoretical_loss": 3.7885262134233697, + "tokens_seen": 683702272 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004004212637913741, + "loss": 2.977, + "theoretical_loss": 3.7884891874713738, + "tokens_seen": 683767808 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004004112337011033, + "loss": 2.7165, + "theoretical_loss": 3.7884521660615205, + "tokens_seen": 683833344 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004004012036108325, + "loss": 2.6355, + "theoretical_loss": 3.7884151491928186, + "tokens_seen": 683898880 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004003911735205617, + "loss": 2.8597, + "theoretical_loss": 3.788378136864276, + "tokens_seen": 683964416 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040038114343029087, + "loss": 2.9315, + "theoretical_loss": 3.7883411290749005, + "tokens_seen": 684029952 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004003711133400201, + "loss": 2.7077, + "theoretical_loss": 3.7883041258237, + "tokens_seen": 684095488 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040036108324974923, + "loss": 2.912, + "theoretical_loss": 3.7882671271096844, + "tokens_seen": 684161024 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040035105315947846, + "loss": 2.9169, + "theoretical_loss": 3.7882301329318624, + "tokens_seen": 684226560 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004003410230692076, + "loss": 2.8847, + "theoretical_loss": 3.788193143289243, + "tokens_seen": 684292096 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004003309929789368, + "loss": 2.8739, + "theoretical_loss": 3.7881561581808363, + "tokens_seen": 684357632 + }, + { + "epoch": 2.03, + "learning_rate": 0.000400320962888666, + "loss": 2.8555, + "theoretical_loss": 3.788119177605653, + "tokens_seen": 684423168 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004003109327983952, + "loss": 2.7767, + "theoretical_loss": 3.7880822015627023, + "tokens_seen": 684488704 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040030090270812437, + "loss": 2.8128, + "theoretical_loss": 3.788045230050996, + "tokens_seen": 684554240 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040029087261785355, + "loss": 2.9026, + "theoretical_loss": 3.788008263069544, + "tokens_seen": 684619776 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040028084252758273, + "loss": 2.7174, + "theoretical_loss": 3.7879713006173583, + "tokens_seen": 684685312 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040027081243731197, + "loss": 2.9723, + "theoretical_loss": 3.787934342693451, + "tokens_seen": 684750848 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004002607823470411, + "loss": 3.017, + "theoretical_loss": 3.787897389296833, + "tokens_seen": 684816384 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 812952, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.962841510772705, + "objective/train/theoretical_loss": 3.787878914295949, + "objective/train/tokens_used": 705309152, + "theoretical_loss": 3.787878914295949, + "tokens_seen": 684849152 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040025075225677033, + "loss": 2.9822, + "theoretical_loss": 3.787860440426517, + "tokens_seen": 684881920 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040024072216649946, + "loss": 3.1501, + "theoretical_loss": 3.787823496081515, + "tokens_seen": 684947456 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004002306920762287, + "loss": 3.0707, + "theoretical_loss": 3.787786556260841, + "tokens_seen": 685012992 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040022066198595787, + "loss": 2.8825, + "theoretical_loss": 3.7877496209635075, + "tokens_seen": 685078528 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040021063189568705, + "loss": 3.1118, + "theoretical_loss": 3.7877126901885276, + "tokens_seen": 685144064 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040020060180541623, + "loss": 3.1401, + "theoretical_loss": 3.7876757639349163, + "tokens_seen": 685209600 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040019057171514547, + "loss": 3.1658, + "theoretical_loss": 3.787638842201686, + "tokens_seen": 685275136 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004001805416248746, + "loss": 3.1845, + "theoretical_loss": 3.7876019249878525, + "tokens_seen": 685340672 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040017051153460383, + "loss": 2.8411, + "theoretical_loss": 3.78756501229243, + "tokens_seen": 685406208 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040016048144433296, + "loss": 3.0347, + "theoretical_loss": 3.7875281041144335, + "tokens_seen": 685471744 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004001504513540622, + "loss": 3.0841, + "theoretical_loss": 3.787491200452878, + "tokens_seen": 685537280 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040014042126379143, + "loss": 2.7852, + "theoretical_loss": 3.78745430130678, + "tokens_seen": 685602816 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040013039117352056, + "loss": 2.7576, + "theoretical_loss": 3.7874174066751545, + "tokens_seen": 685668352 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004001203610832498, + "loss": 3.0064, + "theoretical_loss": 3.7873805165570182, + "tokens_seen": 685733888 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004001103309929789, + "loss": 2.9897, + "theoretical_loss": 3.7873436309513884, + "tokens_seen": 685799424 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040010030090270815, + "loss": 3.0036, + "theoretical_loss": 3.7873067498572808, + "tokens_seen": 685864960 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040009027081243733, + "loss": 2.9462, + "theoretical_loss": 3.787269873273713, + "tokens_seen": 685930496 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004000802407221665, + "loss": 3.017, + "theoretical_loss": 3.787233001199703, + "tokens_seen": 685996032 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004000702106318957, + "loss": 3.1799, + "theoretical_loss": 3.7871961336342674, + "tokens_seen": 686061568 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040006018054162493, + "loss": 3.0584, + "theoretical_loss": 3.787159270576425, + "tokens_seen": 686127104 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040005015045135406, + "loss": 2.837, + "theoretical_loss": 3.787122412025195, + "tokens_seen": 686192640 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004000401203610833, + "loss": 3.0278, + "theoretical_loss": 3.7870855579795952, + "tokens_seen": 686258176 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004000300902708124, + "loss": 2.9139, + "theoretical_loss": 3.787048708438644, + "tokens_seen": 686323712 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040002006018054166, + "loss": 3.0253, + "theoretical_loss": 3.7870118634013625, + "tokens_seen": 686389248 + }, + { + "epoch": 2.03, + "learning_rate": 0.00040001003009027084, + "loss": 2.9288, + "theoretical_loss": 3.786975022866769, + "tokens_seen": 686454784 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 812952, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6860439777374268, + "objective/train/theoretical_loss": 3.786956604287674, + "objective/train/tokens_used": 706947552, + "theoretical_loss": 3.786956604287674, + "tokens_seen": 686487552 + }, + { + "epoch": 2.03, + "learning_rate": 0.0004, + "loss": 3.0279, + "theoretical_loss": 3.786938186833884, + "tokens_seen": 686520320 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003999899699097292, + "loss": 3.0499, + "theoretical_loss": 3.7869013553017274, + "tokens_seen": 686585856 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003999799398194584, + "loss": 3.1523, + "theoretical_loss": 3.7868645282693203, + "tokens_seen": 686651392 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039996990972918756, + "loss": 2.8675, + "theoretical_loss": 3.786827705735683, + "tokens_seen": 686716928 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003999598796389168, + "loss": 2.8969, + "theoretical_loss": 3.7867908876998366, + "tokens_seen": 686782464 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003999498495486459, + "loss": 3.0232, + "theoretical_loss": 3.786754074160803, + "tokens_seen": 686848000 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039993981945837516, + "loss": 3.1544, + "theoretical_loss": 3.7867172651176038, + "tokens_seen": 686913536 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003999297893681043, + "loss": 3.3209, + "theoretical_loss": 3.7866804605692614, + "tokens_seen": 686979072 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003999197592778335, + "loss": 2.9485, + "theoretical_loss": 3.786643660514798, + "tokens_seen": 687044608 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003999097291875627, + "loss": 2.9027, + "theoretical_loss": 3.786606864953236, + "tokens_seen": 687110144 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003998996990972919, + "loss": 3.0381, + "theoretical_loss": 3.786570073883599, + "tokens_seen": 687175680 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039988966900702107, + "loss": 3.0156, + "theoretical_loss": 3.78653328730491, + "tokens_seen": 687241216 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003998796389167503, + "loss": 3.2087, + "theoretical_loss": 3.7864965052161925, + "tokens_seen": 687306752 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039986960882647943, + "loss": 3.0047, + "theoretical_loss": 3.7864597276164704, + "tokens_seen": 687372288 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039985957873620866, + "loss": 3.0319, + "theoretical_loss": 3.7864229545047685, + "tokens_seen": 687437824 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003998495486459378, + "loss": 3.2716, + "theoretical_loss": 3.7863861858801107, + "tokens_seen": 687503360 + }, + { + "epoch": 2.03, + "learning_rate": 0.000399839518555667, + "loss": 2.8685, + "theoretical_loss": 3.786349421741522, + "tokens_seen": 687568896 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003998294884653962, + "loss": 2.9713, + "theoretical_loss": 3.7863126620880276, + "tokens_seen": 687634432 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003998194583751254, + "loss": 2.9644, + "theoretical_loss": 3.786275906918653, + "tokens_seen": 687699968 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039980942828485457, + "loss": 2.9327, + "theoretical_loss": 3.7862391562324245, + "tokens_seen": 687765504 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039979939819458375, + "loss": 2.8463, + "theoretical_loss": 3.786202410028367, + "tokens_seen": 687831040 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039978936810431293, + "loss": 3.209, + "theoretical_loss": 3.7861656683055074, + "tokens_seen": 687896576 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039977933801404217, + "loss": 3.247, + "theoretical_loss": 3.7861289310628727, + "tokens_seen": 687962112 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003997693079237713, + "loss": 3.0825, + "theoretical_loss": 3.7860921982994897, + "tokens_seen": 688027648 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039975927783350053, + "loss": 2.9732, + "theoretical_loss": 3.786055470014386, + "tokens_seen": 688093184 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 813708, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2107160091400146, + "objective/train/theoretical_loss": 3.786037107550884, + "objective/train/tokens_used": 708585952, + "theoretical_loss": 3.786037107550884, + "tokens_seen": 688125952 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039974924774322966, + "loss": 3.1854, + "theoretical_loss": 3.7860187462065875, + "tokens_seen": 688158720 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003997392176529589, + "loss": 3.036, + "theoretical_loss": 3.7859820268751245, + "tokens_seen": 688224256 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039972918756268807, + "loss": 2.9536, + "theoretical_loss": 3.785945312019024, + "tokens_seen": 688289792 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039971915747241725, + "loss": 2.7898, + "theoretical_loss": 3.7859086016373134, + "tokens_seen": 688355328 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039970912738214643, + "loss": 2.9671, + "theoretical_loss": 3.785871895729024, + "tokens_seen": 688420864 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039969909729187567, + "loss": 2.9539, + "theoretical_loss": 3.785835194293183, + "tokens_seen": 688486400 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003996890672016048, + "loss": 2.9139, + "theoretical_loss": 3.7857984973288206, + "tokens_seen": 688551936 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039967903711133403, + "loss": 3.1913, + "theoretical_loss": 3.7857618048349657, + "tokens_seen": 688617472 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039966900702106316, + "loss": 3.1764, + "theoretical_loss": 3.7857251168106494, + "tokens_seen": 688683008 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003996589769307924, + "loss": 2.9506, + "theoretical_loss": 3.7856884332549017, + "tokens_seen": 688748544 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003996489468405216, + "loss": 3.1994, + "theoretical_loss": 3.7856517541667523, + "tokens_seen": 688814080 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039963891675025076, + "loss": 3.0963, + "theoretical_loss": 3.7856150795452335, + "tokens_seen": 688879616 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039962888665997994, + "loss": 2.9912, + "theoretical_loss": 3.7855784093893754, + "tokens_seen": 688945152 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003996188565697091, + "loss": 3.1565, + "theoretical_loss": 3.78554174369821, + "tokens_seen": 689010688 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003996088264794383, + "loss": 2.9481, + "theoretical_loss": 3.7855050824707694, + "tokens_seen": 689076224 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039959879638916754, + "loss": 2.9037, + "theoretical_loss": 3.7854684257060853, + "tokens_seen": 689141760 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039958876629889666, + "loss": 3.0422, + "theoretical_loss": 3.7854317734031904, + "tokens_seen": 689207296 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003995787362086259, + "loss": 3.0962, + "theoretical_loss": 3.7853951255611165, + "tokens_seen": 689272832 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003995687061183551, + "loss": 2.9324, + "theoretical_loss": 3.785358482178898, + "tokens_seen": 689338368 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039955867602808426, + "loss": 3.0348, + "theoretical_loss": 3.7853218432555678, + "tokens_seen": 689403904 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039954864593781344, + "loss": 2.9487, + "theoretical_loss": 3.7852852087901594, + "tokens_seen": 689469440 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003995386158475426, + "loss": 3.1092, + "theoretical_loss": 3.7852485787817063, + "tokens_seen": 689534976 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003995285857572718, + "loss": 2.9527, + "theoretical_loss": 3.7852119532292434, + "tokens_seen": 689600512 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039951855566700104, + "loss": 3.163, + "theoretical_loss": 3.785175332131805, + "tokens_seen": 689666048 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039950852557673017, + "loss": 2.9854, + "theoretical_loss": 3.7851387154884257, + "tokens_seen": 689731584 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 815148, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4033315181732178, + "objective/train/theoretical_loss": 3.7851204088367068, + "objective/train/tokens_used": 710224352, + "theoretical_loss": 3.7851204088367068, + "tokens_seen": 689764352 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003994984954864594, + "loss": 3.257, + "theoretical_loss": 3.785102103298141, + "tokens_seen": 689797120 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039948846539618853, + "loss": 3.0081, + "theoretical_loss": 3.7850654955599863, + "tokens_seen": 689862656 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039947843530591776, + "loss": 2.9746, + "theoretical_loss": 3.7850288922729973, + "tokens_seen": 689928192 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039946840521564694, + "loss": 2.9742, + "theoretical_loss": 3.78499229343621, + "tokens_seen": 689993728 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003994583751253761, + "loss": 3.0874, + "theoretical_loss": 3.784955699048661, + "tokens_seen": 690059264 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003994483450351053, + "loss": 3.1328, + "theoretical_loss": 3.784919109109386, + "tokens_seen": 690124800 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003994383149448345, + "loss": 2.89, + "theoretical_loss": 3.784882523617423, + "tokens_seen": 690190336 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039942828485456367, + "loss": 2.852, + "theoretical_loss": 3.784845942571809, + "tokens_seen": 690255872 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003994182547642929, + "loss": 3.2457, + "theoretical_loss": 3.784809365971581, + "tokens_seen": 690321408 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039940822467402203, + "loss": 2.9565, + "theoretical_loss": 3.784772793815777, + "tokens_seen": 690386944 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039939819458375127, + "loss": 2.902, + "theoretical_loss": 3.784736226103436, + "tokens_seen": 690452480 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003993881644934805, + "loss": 3.042, + "theoretical_loss": 3.784699662833596, + "tokens_seen": 690518016 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039937813440320963, + "loss": 3.0018, + "theoretical_loss": 3.784663104005295, + "tokens_seen": 690583552 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039936810431293886, + "loss": 2.8559, + "theoretical_loss": 3.784626549617573, + "tokens_seen": 690649088 + }, + { + "epoch": 2.03, + "learning_rate": 0.000399358074222668, + "loss": 3.0771, + "theoretical_loss": 3.7845899996694685, + "tokens_seen": 690714624 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003993480441323972, + "loss": 3.186, + "theoretical_loss": 3.7845534541600223, + "tokens_seen": 690780160 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003993380140421264, + "loss": 2.9598, + "theoretical_loss": 3.7845169130882734, + "tokens_seen": 690845696 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003993279839518556, + "loss": 3.0254, + "theoretical_loss": 3.7844803764532617, + "tokens_seen": 690911232 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039931795386158477, + "loss": 2.7558, + "theoretical_loss": 3.784443844254029, + "tokens_seen": 690976768 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039930792377131395, + "loss": 3.0946, + "theoretical_loss": 3.784407316489615, + "tokens_seen": 691042304 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039929789368104313, + "loss": 2.9042, + "theoretical_loss": 3.784370793159061, + "tokens_seen": 691107840 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039928786359077237, + "loss": 3.007, + "theoretical_loss": 3.784334274261409, + "tokens_seen": 691173376 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003992778335005015, + "loss": 3.001, + "theoretical_loss": 3.7842977597957006, + "tokens_seen": 691238912 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039926780341023073, + "loss": 3.1521, + "theoretical_loss": 3.784261249760977, + "tokens_seen": 691304448 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039925777331995986, + "loss": 3.0055, + "theoretical_loss": 3.784224744156282, + "tokens_seen": 691369984 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 815781, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.672840118408203, + "objective/train/theoretical_loss": 3.7842064930148953, + "objective/train/tokens_used": 711862752, + "theoretical_loss": 3.7842064930148953, + "tokens_seen": 691402752 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003992477432296891, + "loss": 3.1546, + "theoretical_loss": 3.784188242980657, + "tokens_seen": 691435520 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039923771313941827, + "loss": 2.9453, + "theoretical_loss": 3.784151746233145, + "tokens_seen": 691501056 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039922768304914745, + "loss": 2.872, + "theoretical_loss": 3.78411525391279, + "tokens_seen": 691566592 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039921765295887663, + "loss": 3.0222, + "theoretical_loss": 3.7840787660186344, + "tokens_seen": 691632128 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039920762286860587, + "loss": 2.9783, + "theoretical_loss": 3.7840422825497235, + "tokens_seen": 691697664 + }, + { + "epoch": 2.03, + "learning_rate": 0.000399197592778335, + "loss": 2.9824, + "theoretical_loss": 3.7840058035050994, + "tokens_seen": 691763200 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039918756268806423, + "loss": 3.1181, + "theoretical_loss": 3.7839693288838085, + "tokens_seen": 691828736 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039917753259779336, + "loss": 3.0017, + "theoretical_loss": 3.783932858684895, + "tokens_seen": 691894272 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003991675025075226, + "loss": 2.8385, + "theoretical_loss": 3.783896392907402, + "tokens_seen": 691959808 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003991574724172518, + "loss": 3.1507, + "theoretical_loss": 3.7838599315503774, + "tokens_seen": 692025344 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039914744232698096, + "loss": 2.9381, + "theoretical_loss": 3.7838234746128654, + "tokens_seen": 692090880 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039913741223671014, + "loss": 3.2148, + "theoretical_loss": 3.7837870220939127, + "tokens_seen": 692156416 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003991273821464393, + "loss": 3.0144, + "theoretical_loss": 3.783750573992565, + "tokens_seen": 692221952 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003991173520561685, + "loss": 3.1139, + "theoretical_loss": 3.7837141303078683, + "tokens_seen": 692287488 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039910732196589774, + "loss": 3.0469, + "theoretical_loss": 3.7836776910388696, + "tokens_seen": 692353024 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039909729187562686, + "loss": 2.8595, + "theoretical_loss": 3.783641256184617, + "tokens_seen": 692418560 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003990872617853561, + "loss": 3.0581, + "theoretical_loss": 3.7836048257441566, + "tokens_seen": 692484096 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003990772316950853, + "loss": 3.0968, + "theoretical_loss": 3.7835683997165366, + "tokens_seen": 692549632 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039906720160481446, + "loss": 2.6969, + "theoretical_loss": 3.7835319781008048, + "tokens_seen": 692615168 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039905717151454364, + "loss": 2.973, + "theoretical_loss": 3.78349556089601, + "tokens_seen": 692680704 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003990471414242728, + "loss": 2.8064, + "theoretical_loss": 3.7834591481011994, + "tokens_seen": 692746240 + }, + { + "epoch": 2.03, + "learning_rate": 0.000399037111334002, + "loss": 3.1452, + "theoretical_loss": 3.783422739715424, + "tokens_seen": 692811776 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039902708124373124, + "loss": 3.2073, + "theoretical_loss": 3.7833863357377306, + "tokens_seen": 692877312 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039901705115346037, + "loss": 2.7702, + "theoretical_loss": 3.7833499361671703, + "tokens_seen": 692942848 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003990070210631896, + "loss": 2.927, + "theoretical_loss": 3.7833135410027925, + "tokens_seen": 693008384 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 817009, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7722556591033936, + "objective/train/theoretical_loss": 3.7832953450726245, + "objective/train/tokens_used": 713501152, + "theoretical_loss": 3.7832953450726245, + "tokens_seen": 693041152 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039899699097291873, + "loss": 3.0062, + "theoretical_loss": 3.783277150243646, + "tokens_seen": 693073920 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039898696088264796, + "loss": 3.0559, + "theoretical_loss": 3.7832407638887826, + "tokens_seen": 693139456 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039897693079237714, + "loss": 3.0706, + "theoretical_loss": 3.783204381937253, + "tokens_seen": 693204992 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003989669007021063, + "loss": 2.9877, + "theoretical_loss": 3.7831680043881066, + "tokens_seen": 693270528 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003989568706118355, + "loss": 2.8072, + "theoretical_loss": 3.7831316312403964, + "tokens_seen": 693336064 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003989468405215647, + "loss": 2.7801, + "theoretical_loss": 3.7830952624931724, + "tokens_seen": 693401600 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039893681043129387, + "loss": 3.0159, + "theoretical_loss": 3.7830588981454873, + "tokens_seen": 693467136 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003989267803410231, + "loss": 2.9373, + "theoretical_loss": 3.783022538196393, + "tokens_seen": 693532672 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039891675025075223, + "loss": 2.9548, + "theoretical_loss": 3.782986182644941, + "tokens_seen": 693598208 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039890672016048147, + "loss": 3.2316, + "theoretical_loss": 3.7829498314901855, + "tokens_seen": 693663744 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039889669007021065, + "loss": 3.1027, + "theoretical_loss": 3.7829134847311785, + "tokens_seen": 693729280 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039888665997993983, + "loss": 3.2061, + "theoretical_loss": 3.782877142366974, + "tokens_seen": 693794816 + }, + { + "epoch": 2.03, + "learning_rate": 0.000398876629889669, + "loss": 3.1499, + "theoretical_loss": 3.782840804396624, + "tokens_seen": 693860352 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003988665997993982, + "loss": 3.1008, + "theoretical_loss": 3.782804470819184, + "tokens_seen": 693925888 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039885656970912737, + "loss": 3.0239, + "theoretical_loss": 3.782768141633708, + "tokens_seen": 693991424 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003988465396188566, + "loss": 3.1596, + "theoretical_loss": 3.7827318168392488, + "tokens_seen": 694056960 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039883650952858573, + "loss": 2.9416, + "theoretical_loss": 3.782695496434863, + "tokens_seen": 694122496 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039882647943831497, + "loss": 2.9495, + "theoretical_loss": 3.782659180419605, + "tokens_seen": 694188032 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003988164493480441, + "loss": 2.9938, + "theoretical_loss": 3.7826228687925294, + "tokens_seen": 694253568 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039880641925777333, + "loss": 2.9565, + "theoretical_loss": 3.7825865615526926, + "tokens_seen": 694319104 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003987963891675025, + "loss": 3.0452, + "theoretical_loss": 3.7825502586991506, + "tokens_seen": 694384640 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003987863590772317, + "loss": 2.9098, + "theoretical_loss": 3.782513960230959, + "tokens_seen": 694450176 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003987763289869609, + "loss": 2.9125, + "theoretical_loss": 3.7824776661471744, + "tokens_seen": 694515712 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039876629889669006, + "loss": 2.9881, + "theoretical_loss": 3.7824413764468536, + "tokens_seen": 694581248 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039875626880641924, + "loss": 3.0315, + "theoretical_loss": 3.7824050911290543, + "tokens_seen": 694646784 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 817800, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.937741279602051, + "objective/train/theoretical_loss": 3.782386950113305, + "objective/train/tokens_used": 715139552, + "theoretical_loss": 3.782386950113305, + "tokens_seen": 694679552 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039874623871614847, + "loss": 2.7712, + "theoretical_loss": 3.782368810192833, + "tokens_seen": 694712320 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003987362086258776, + "loss": 3.0905, + "theoretical_loss": 3.782332533637248, + "tokens_seen": 694777856 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039872617853560683, + "loss": 3.1504, + "theoretical_loss": 3.7822962614613567, + "tokens_seen": 694843392 + }, + { + "epoch": 2.03, + "learning_rate": 0.000398716148445336, + "loss": 3.1779, + "theoretical_loss": 3.7822599936642174, + "tokens_seen": 694908928 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003987061183550652, + "loss": 2.9834, + "theoretical_loss": 3.7822237302448887, + "tokens_seen": 694974464 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003986960882647944, + "loss": 2.9804, + "theoretical_loss": 3.78218747120243, + "tokens_seen": 695040000 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039868605817452356, + "loss": 2.9168, + "theoretical_loss": 3.7821512165358993, + "tokens_seen": 695105536 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039867602808425274, + "loss": 3.0235, + "theoretical_loss": 3.782114966244357, + "tokens_seen": 695171072 + }, + { + "epoch": 2.03, + "learning_rate": 0.000398665997993982, + "loss": 2.9155, + "theoretical_loss": 3.7820787203268624, + "tokens_seen": 695236608 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003986559679037111, + "loss": 3.0621, + "theoretical_loss": 3.7820424787824747, + "tokens_seen": 695302144 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039864593781344034, + "loss": 2.9857, + "theoretical_loss": 3.7820062416102553, + "tokens_seen": 695367680 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003986359077231695, + "loss": 2.9637, + "theoretical_loss": 3.7819700088092647, + "tokens_seen": 695433216 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003986258776328987, + "loss": 2.8773, + "theoretical_loss": 3.781933780378563, + "tokens_seen": 695498752 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039861584754262794, + "loss": 2.8766, + "theoretical_loss": 3.781897556317212, + "tokens_seen": 695564288 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039860581745235706, + "loss": 2.9318, + "theoretical_loss": 3.781861336624273, + "tokens_seen": 695629824 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003985957873620863, + "loss": 2.9717, + "theoretical_loss": 3.7818251212988074, + "tokens_seen": 695695360 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003985857572718155, + "loss": 2.9584, + "theoretical_loss": 3.781788910339877, + "tokens_seen": 695760896 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039857572718154466, + "loss": 2.743, + "theoretical_loss": 3.781752703746545, + "tokens_seen": 695826432 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039856569709127384, + "loss": 3.0222, + "theoretical_loss": 3.7817165015178733, + "tokens_seen": 695891968 + }, + { + "epoch": 2.03, + "learning_rate": 0.000398555667001003, + "loss": 3.1952, + "theoretical_loss": 3.7816803036529247, + "tokens_seen": 695957504 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003985456369107322, + "loss": 2.9163, + "theoretical_loss": 3.781644110150763, + "tokens_seen": 696023040 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039853560682046144, + "loss": 3.0045, + "theoretical_loss": 3.781607921010451, + "tokens_seen": 696088576 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039852557673019057, + "loss": 2.875, + "theoretical_loss": 3.781571736231053, + "tokens_seen": 696154112 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003985155466399198, + "loss": 3.1498, + "theoretical_loss": 3.7815355558116326, + "tokens_seen": 696219648 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039850551654964893, + "loss": 3.0813, + "theoretical_loss": 3.7814993797512546, + "tokens_seen": 696285184 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 823060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.890718460083008, + "objective/train/theoretical_loss": 3.7814812933554136, + "objective/train/tokens_used": 716777952, + "theoretical_loss": 3.7814812933554136, + "tokens_seen": 696317952 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039849548645937816, + "loss": 2.9396, + "theoretical_loss": 3.781463208048983, + "tokens_seen": 696350720 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039848545636910734, + "loss": 2.9903, + "theoretical_loss": 3.781427040703883, + "tokens_seen": 696416256 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003984754262788365, + "loss": 2.9655, + "theoretical_loss": 3.781390877715019, + "tokens_seen": 696481792 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003984653961885657, + "loss": 2.9875, + "theoretical_loss": 3.781354719081458, + "tokens_seen": 696547328 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003984553660982949, + "loss": 2.7848, + "theoretical_loss": 3.7813185648022647, + "tokens_seen": 696612864 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039844533600802407, + "loss": 2.7708, + "theoretical_loss": 3.7812824148765056, + "tokens_seen": 696678400 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003984353059177533, + "loss": 2.9602, + "theoretical_loss": 3.781246269303247, + "tokens_seen": 696743936 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039842527582748243, + "loss": 2.9896, + "theoretical_loss": 3.7812101280815553, + "tokens_seen": 696809472 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039841524573721167, + "loss": 2.9217, + "theoretical_loss": 3.7811739912104976, + "tokens_seen": 696875008 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039840521564694085, + "loss": 2.9529, + "theoretical_loss": 3.781137858689141, + "tokens_seen": 696940544 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039839518555667003, + "loss": 3.0855, + "theoretical_loss": 3.781101730516553, + "tokens_seen": 697006080 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003983851554663992, + "loss": 3.0658, + "theoretical_loss": 3.7810656066918016, + "tokens_seen": 697071616 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003983751253761284, + "loss": 2.76, + "theoretical_loss": 3.7810294872139547, + "tokens_seen": 697137152 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039836509528585757, + "loss": 2.8135, + "theoretical_loss": 3.7809933720820803, + "tokens_seen": 697202688 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003983550651955868, + "loss": 3.0346, + "theoretical_loss": 3.7809572612952476, + "tokens_seen": 697268224 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039834503510531593, + "loss": 2.9071, + "theoretical_loss": 3.7809211548525257, + "tokens_seen": 697333760 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039833500501504517, + "loss": 3.0486, + "theoretical_loss": 3.7808850527529825, + "tokens_seen": 697399296 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003983249749247743, + "loss": 2.8335, + "theoretical_loss": 3.7808489549956894, + "tokens_seen": 697464832 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039831494483450353, + "loss": 3.1494, + "theoretical_loss": 3.7808128615797143, + "tokens_seen": 697530368 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003983049147442327, + "loss": 2.9519, + "theoretical_loss": 3.780776772504129, + "tokens_seen": 697595904 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003982948846539619, + "loss": 2.9174, + "theoretical_loss": 3.7807406877680023, + "tokens_seen": 697661440 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003982848545636911, + "loss": 2.8904, + "theoretical_loss": 3.7807046073704065, + "tokens_seen": 697726976 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039827482447342026, + "loss": 2.9245, + "theoretical_loss": 3.780668531310411, + "tokens_seen": 697792512 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039826479438314944, + "loss": 3.0444, + "theoretical_loss": 3.7806324595870877, + "tokens_seen": 697858048 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039825476429287867, + "loss": 2.9878, + "theoretical_loss": 3.7805963921995085, + "tokens_seen": 697923584 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 828245, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0669054985046387, + "objective/train/theoretical_loss": 3.7805783601313325, + "objective/train/tokens_used": 718416352, + "theoretical_loss": 3.7805783601313325, + "tokens_seen": 697956352 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003982447342026078, + "loss": 3.0409, + "theoretical_loss": 3.7805603291467444, + "tokens_seen": 697989120 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039823470411233703, + "loss": 3.0377, + "theoretical_loss": 3.7805242704278683, + "tokens_seen": 698054656 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003982246740220662, + "loss": 2.7453, + "theoretical_loss": 3.7804882160419515, + "tokens_seen": 698120192 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003982146439317954, + "loss": 3.0153, + "theoretical_loss": 3.780452165988067, + "tokens_seen": 698185728 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003982046138415246, + "loss": 2.937, + "theoretical_loss": 3.780416120265289, + "tokens_seen": 698251264 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039819458375125376, + "loss": 2.9108, + "theoretical_loss": 3.780380078872689, + "tokens_seen": 698316800 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039818455366098294, + "loss": 2.7331, + "theoretical_loss": 3.7803440418093417, + "tokens_seen": 698382336 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003981745235707122, + "loss": 3.0641, + "theoretical_loss": 3.78030800907432, + "tokens_seen": 698447872 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003981644934804413, + "loss": 2.9069, + "theoretical_loss": 3.780271980666699, + "tokens_seen": 698513408 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039815446339017054, + "loss": 2.8881, + "theoretical_loss": 3.780235956585552, + "tokens_seen": 698578944 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039814443329989966, + "loss": 3.1442, + "theoretical_loss": 3.7801999368299537, + "tokens_seen": 698644480 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003981344032096289, + "loss": 2.9341, + "theoretical_loss": 3.78016392139898, + "tokens_seen": 698710016 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003981243731193581, + "loss": 2.6563, + "theoretical_loss": 3.7801279102917063, + "tokens_seen": 698775552 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039811434302908726, + "loss": 3.1938, + "theoretical_loss": 3.780091903507206, + "tokens_seen": 698841088 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039810431293881644, + "loss": 3.0582, + "theoretical_loss": 3.7800559010445567, + "tokens_seen": 698906624 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003980942828485457, + "loss": 2.7882, + "theoretical_loss": 3.7800199029028345, + "tokens_seen": 698972160 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003980842527582748, + "loss": 2.9789, + "theoretical_loss": 3.779983909081115, + "tokens_seen": 699037696 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039807422266800404, + "loss": 2.9775, + "theoretical_loss": 3.7799479195784746, + "tokens_seen": 699103232 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039806419257773317, + "loss": 3.027, + "theoretical_loss": 3.779911934393991, + "tokens_seen": 699168768 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003980541624874624, + "loss": 3.0297, + "theoretical_loss": 3.7798759535267417, + "tokens_seen": 699234304 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003980441323971916, + "loss": 2.949, + "theoretical_loss": 3.779839976975803, + "tokens_seen": 699299840 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039803410230692077, + "loss": 2.8488, + "theoretical_loss": 3.779804004740253, + "tokens_seen": 699365376 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039802407221664995, + "loss": 2.9867, + "theoretical_loss": 3.7797680368191706, + "tokens_seen": 699430912 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039801404212637913, + "loss": 2.9055, + "theoretical_loss": 3.7797320732116337, + "tokens_seen": 699496448 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003980040120361083, + "loss": 3.1272, + "theoretical_loss": 3.7796961139167204, + "tokens_seen": 699561984 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 833268, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7306745052337646, + "objective/train/theoretical_loss": 3.7796781358862104, + "objective/train/tokens_used": 720054752, + "theoretical_loss": 3.7796781358862104, + "tokens_seen": 699594752 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039799398194583754, + "loss": 2.9271, + "theoretical_loss": 3.7796601589335106, + "tokens_seen": 699627520 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039798395185556667, + "loss": 2.8494, + "theoretical_loss": 3.7796242082610823, + "tokens_seen": 699693056 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003979739217652959, + "loss": 2.9823, + "theoretical_loss": 3.7795882618985157, + "tokens_seen": 699758592 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039796389167502503, + "loss": 2.9894, + "theoretical_loss": 3.779552319844891, + "tokens_seen": 699824128 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039795386158475427, + "loss": 2.8972, + "theoretical_loss": 3.779516382099287, + "tokens_seen": 699889664 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039794383149448345, + "loss": 2.8741, + "theoretical_loss": 3.7794804486607845, + "tokens_seen": 699955200 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039793380140421263, + "loss": 2.907, + "theoretical_loss": 3.779444519528465, + "tokens_seen": 700020736 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003979237713139418, + "loss": 2.9134, + "theoretical_loss": 3.779408594701408, + "tokens_seen": 700086272 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039791374122367105, + "loss": 2.7774, + "theoretical_loss": 3.779372674178696, + "tokens_seen": 700151808 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003979037111334002, + "loss": 3.0367, + "theoretical_loss": 3.7793367579594097, + "tokens_seen": 700217344 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003978936810431294, + "loss": 2.8712, + "theoretical_loss": 3.7793008460426307, + "tokens_seen": 700282880 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003978836509528586, + "loss": 2.9051, + "theoretical_loss": 3.779264938427441, + "tokens_seen": 700348416 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039787362086258777, + "loss": 3.1493, + "theoretical_loss": 3.7792290351129236, + "tokens_seen": 700413952 + }, + { + "epoch": 2.03, + "learning_rate": 0.000397863590772317, + "loss": 2.8041, + "theoretical_loss": 3.7791931360981605, + "tokens_seen": 700479488 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039785356068204613, + "loss": 3.0465, + "theoretical_loss": 3.7791572413822343, + "tokens_seen": 700545024 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039784353059177537, + "loss": 2.8724, + "theoretical_loss": 3.7791213509642287, + "tokens_seen": 700610560 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003978335005015045, + "loss": 2.8372, + "theoretical_loss": 3.7790854648432273, + "tokens_seen": 700676096 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039782347041123373, + "loss": 3.0213, + "theoretical_loss": 3.779049583018313, + "tokens_seen": 700741632 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003978134403209629, + "loss": 3.0222, + "theoretical_loss": 3.7790137054885697, + "tokens_seen": 700807168 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003978034102306921, + "loss": 2.9475, + "theoretical_loss": 3.7789778322530827, + "tokens_seen": 700872704 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003977933801404213, + "loss": 2.7373, + "theoretical_loss": 3.7789419633109365, + "tokens_seen": 700938240 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039778335005015046, + "loss": 2.9236, + "theoretical_loss": 3.7789060986612144, + "tokens_seen": 701003776 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039777331995987964, + "loss": 2.8003, + "theoretical_loss": 3.7788702383030035, + "tokens_seen": 701069312 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039776328986960887, + "loss": 2.7636, + "theoretical_loss": 3.7788343822353876, + "tokens_seen": 701134848 + }, + { + "epoch": 2.03, + "learning_rate": 0.000397753259779338, + "loss": 2.7646, + "theoretical_loss": 3.778798530457453, + "tokens_seen": 701200384 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 838443, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2617812156677246, + "objective/train/theoretical_loss": 3.7787806061768308, + "objective/train/tokens_used": 721693152, + "theoretical_loss": 3.7787806061768308, + "tokens_seen": 701233152 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039774322968906723, + "loss": 2.84, + "theoretical_loss": 3.7787626829682863, + "tokens_seen": 701265920 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003977331995987964, + "loss": 2.831, + "theoretical_loss": 3.778726839766972, + "tokens_seen": 701331456 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003977231695085256, + "loss": 2.8498, + "theoretical_loss": 3.7786910008525987, + "tokens_seen": 701396992 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003977131394182548, + "loss": 2.8381, + "theoretical_loss": 3.778655166224252, + "tokens_seen": 701462528 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039770310932798396, + "loss": 2.9054, + "theoretical_loss": 3.778619335881019, + "tokens_seen": 701528064 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039769307923771314, + "loss": 3.1357, + "theoretical_loss": 3.7785835098219867, + "tokens_seen": 701593600 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003976830491474424, + "loss": 2.7488, + "theoretical_loss": 3.7785476880462436, + "tokens_seen": 701659136 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003976730190571715, + "loss": 3.1839, + "theoretical_loss": 3.778511870552877, + "tokens_seen": 701724672 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039766298896690074, + "loss": 3.026, + "theoretical_loss": 3.778476057340975, + "tokens_seen": 701790208 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039765295887662987, + "loss": 2.8715, + "theoretical_loss": 3.778440248409627, + "tokens_seen": 701855744 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003976429287863591, + "loss": 2.9057, + "theoretical_loss": 3.7784044437579207, + "tokens_seen": 701921280 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003976328986960883, + "loss": 2.9479, + "theoretical_loss": 3.7783686433849457, + "tokens_seen": 701986816 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039762286860581746, + "loss": 2.8235, + "theoretical_loss": 3.778332847289791, + "tokens_seen": 702052352 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039761283851554664, + "loss": 2.9134, + "theoretical_loss": 3.7782970554715467, + "tokens_seen": 702117888 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003976028084252759, + "loss": 2.9239, + "theoretical_loss": 3.7782612679293015, + "tokens_seen": 702183424 + }, + { + "epoch": 2.03, + "learning_rate": 0.000397592778335005, + "loss": 2.7292, + "theoretical_loss": 3.778225484662147, + "tokens_seen": 702248960 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039758274824473424, + "loss": 2.8728, + "theoretical_loss": 3.7781897056691722, + "tokens_seen": 702314496 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039757271815446337, + "loss": 3.129, + "theoretical_loss": 3.778153930949469, + "tokens_seen": 702380032 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003975626880641926, + "loss": 2.8147, + "theoretical_loss": 3.7781181605021277, + "tokens_seen": 702445568 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003975526579739218, + "loss": 3.1122, + "theoretical_loss": 3.7780823943262396, + "tokens_seen": 702511104 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039754262788365097, + "loss": 2.9351, + "theoretical_loss": 3.7780466324208968, + "tokens_seen": 702576640 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039753259779338015, + "loss": 2.5674, + "theoretical_loss": 3.77801087478519, + "tokens_seen": 702642176 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039752256770310933, + "loss": 3.0548, + "theoretical_loss": 3.7779751214182125, + "tokens_seen": 702707712 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003975125376128385, + "loss": 2.8329, + "theoretical_loss": 3.777939372319056, + "tokens_seen": 702773248 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039750250752256774, + "loss": 2.9937, + "theoretical_loss": 3.777903627486813, + "tokens_seen": 702838784 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 839055, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7638256549835205, + "objective/train/theoretical_loss": 3.777885756670501, + "objective/train/tokens_used": 723331552, + "theoretical_loss": 3.777885756670501, + "tokens_seen": 702871552 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039749247743229687, + "loss": 2.9961, + "theoretical_loss": 3.7778678869205766, + "tokens_seen": 702904320 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003974824473420261, + "loss": 2.8954, + "theoretical_loss": 3.7778321506194406, + "tokens_seen": 702969856 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039747241725175523, + "loss": 2.8431, + "theoretical_loss": 3.7777964185824975, + "tokens_seen": 703035392 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039746238716148447, + "loss": 2.9681, + "theoretical_loss": 3.7777606908088415, + "tokens_seen": 703100928 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039745235707121365, + "loss": 3.0087, + "theoretical_loss": 3.7777249672975666, + "tokens_seen": 703166464 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039744232698094283, + "loss": 3.2902, + "theoretical_loss": 3.7776892480477673, + "tokens_seen": 703232000 + }, + { + "epoch": 2.03, + "learning_rate": 0.000397432296890672, + "loss": 3.011, + "theoretical_loss": 3.7776535330585377, + "tokens_seen": 703297536 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039742226680040125, + "loss": 2.8944, + "theoretical_loss": 3.777617822328973, + "tokens_seen": 703363072 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003974122367101304, + "loss": 2.918, + "theoretical_loss": 3.7775821158581686, + "tokens_seen": 703428608 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003974022066198596, + "loss": 2.7816, + "theoretical_loss": 3.777546413645219, + "tokens_seen": 703494144 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039739217652958874, + "loss": 3.0054, + "theoretical_loss": 3.777510715689221, + "tokens_seen": 703559680 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039738214643931797, + "loss": 2.9605, + "theoretical_loss": 3.77747502198927, + "tokens_seen": 703625216 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039737211634904715, + "loss": 2.8161, + "theoretical_loss": 3.7774393325444615, + "tokens_seen": 703690752 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039736208625877633, + "loss": 2.9456, + "theoretical_loss": 3.7774036473538937, + "tokens_seen": 703756288 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003973520561685055, + "loss": 3.0258, + "theoretical_loss": 3.777367966416662, + "tokens_seen": 703821824 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003973420260782347, + "loss": 2.979, + "theoretical_loss": 3.777332289731864, + "tokens_seen": 703887360 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003973319959879639, + "loss": 2.9257, + "theoretical_loss": 3.777296617298597, + "tokens_seen": 703952896 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003973219658976931, + "loss": 2.7951, + "theoretical_loss": 3.7772609491159588, + "tokens_seen": 704018432 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039731193580742224, + "loss": 2.9469, + "theoretical_loss": 3.7772252851830466, + "tokens_seen": 704083968 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003973019057171515, + "loss": 3.1696, + "theoretical_loss": 3.777189625498959, + "tokens_seen": 704149504 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003972918756268806, + "loss": 2.9136, + "theoretical_loss": 3.7771539700627956, + "tokens_seen": 704215040 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039728184553660984, + "loss": 2.8302, + "theoretical_loss": 3.7771183188736535, + "tokens_seen": 704280576 + }, + { + "epoch": 2.03, + "learning_rate": 0.000397271815446339, + "loss": 2.9683, + "theoretical_loss": 3.777082671930632, + "tokens_seen": 704346112 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003972617853560682, + "loss": 2.8791, + "theoretical_loss": 3.77704702923283, + "tokens_seen": 704411648 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003972517552657974, + "loss": 2.969, + "theoretical_loss": 3.7770113907793483, + "tokens_seen": 704477184 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 840230, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1280670166015625, + "objective/train/theoretical_loss": 3.776993573143946, + "objective/train/tokens_used": 724969952, + "theoretical_loss": 3.776993573143946, + "tokens_seen": 704509952 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003972417251755266, + "loss": 2.7157, + "theoretical_loss": 3.776975756569286, + "tokens_seen": 704542720 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039723169508525574, + "loss": 2.8356, + "theoretical_loss": 3.776940126601743, + "tokens_seen": 704608256 + }, + { + "epoch": 2.03, + "learning_rate": 0.000397221664994985, + "loss": 2.8503, + "theoretical_loss": 3.7769045008758204, + "tokens_seen": 704673792 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003972116349047141, + "loss": 2.9024, + "theoretical_loss": 3.7768688793906184, + "tokens_seen": 704739328 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039720160481444334, + "loss": 3.0062, + "theoretical_loss": 3.776833262145237, + "tokens_seen": 704804864 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003971915747241725, + "loss": 3.1082, + "theoretical_loss": 3.776797649138779, + "tokens_seen": 704870400 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003971815446339017, + "loss": 2.9018, + "theoretical_loss": 3.776762040370345, + "tokens_seen": 704935936 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003971715145436309, + "loss": 2.7978, + "theoretical_loss": 3.776726435839037, + "tokens_seen": 705001472 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039716148445336007, + "loss": 2.8779, + "theoretical_loss": 3.776690835543956, + "tokens_seen": 705067008 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039715145436308925, + "loss": 2.9387, + "theoretical_loss": 3.7766552394842057, + "tokens_seen": 705132544 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003971414242728185, + "loss": 2.6552, + "theoretical_loss": 3.7766196476588885, + "tokens_seen": 705198080 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039713139418254766, + "loss": 3.0808, + "theoretical_loss": 3.7765840600671066, + "tokens_seen": 705263616 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039712136409227684, + "loss": 2.9143, + "theoretical_loss": 3.7765484767079633, + "tokens_seen": 705329152 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003971113340020061, + "loss": 3.0465, + "theoretical_loss": 3.776512897580562, + "tokens_seen": 705394688 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003971013039117352, + "loss": 2.8223, + "theoretical_loss": 3.7764773226840065, + "tokens_seen": 705460224 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039709127382146444, + "loss": 2.892, + "theoretical_loss": 3.7764417520174005, + "tokens_seen": 705525760 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039708124373119357, + "loss": 2.7686, + "theoretical_loss": 3.7764061855798485, + "tokens_seen": 705591296 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003970712136409228, + "loss": 2.8557, + "theoretical_loss": 3.776370623370455, + "tokens_seen": 705656832 + }, + { + "epoch": 2.03, + "learning_rate": 0.000397061183550652, + "loss": 2.9014, + "theoretical_loss": 3.7763350653883236, + "tokens_seen": 705722368 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039705115346038117, + "loss": 2.8048, + "theoretical_loss": 3.776299511632561, + "tokens_seen": 705787904 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039704112337011035, + "loss": 2.9953, + "theoretical_loss": 3.7762639621022718, + "tokens_seen": 705853440 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039703109327983953, + "loss": 3.0841, + "theoretical_loss": 3.7762284167965614, + "tokens_seen": 705918976 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003970210631895687, + "loss": 2.9048, + "theoretical_loss": 3.7761928757145355, + "tokens_seen": 705984512 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039701103309929794, + "loss": 2.7224, + "theoretical_loss": 3.7761573388553007, + "tokens_seen": 706050048 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039700100300902707, + "loss": 2.8874, + "theoretical_loss": 3.776121806217963, + "tokens_seen": 706115584 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 840818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0678188800811768, + "objective/train/theoretical_loss": 3.7761040414822267, + "objective/train/tokens_used": 726608352, + "theoretical_loss": 3.7761040414822267, + "tokens_seen": 706148352 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003969909729187563, + "loss": 2.97, + "theoretical_loss": 3.776086277801629, + "tokens_seen": 706181120 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039698094282848543, + "loss": 2.8323, + "theoretical_loss": 3.776050753605406, + "tokens_seen": 706246656 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039697091273821467, + "loss": 2.9463, + "theoretical_loss": 3.776015233628401, + "tokens_seen": 706312192 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039696088264794385, + "loss": 3.0658, + "theoretical_loss": 3.7759797178697214, + "tokens_seen": 706377728 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039695085255767303, + "loss": 3.0671, + "theoretical_loss": 3.775944206328475, + "tokens_seen": 706443264 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003969408224674022, + "loss": 2.9199, + "theoretical_loss": 3.77590869900377, + "tokens_seen": 706508800 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039693079237713145, + "loss": 2.8948, + "theoretical_loss": 3.7758731958947145, + "tokens_seen": 706574336 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003969207622868606, + "loss": 2.8665, + "theoretical_loss": 3.7758376970004166, + "tokens_seen": 706639872 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003969107321965898, + "loss": 2.8945, + "theoretical_loss": 3.775802202319986, + "tokens_seen": 706705408 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039690070210631894, + "loss": 3.097, + "theoretical_loss": 3.7757667118525315, + "tokens_seen": 706770944 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039689067201604817, + "loss": 2.9045, + "theoretical_loss": 3.7757312255971622, + "tokens_seen": 706836480 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039688064192577735, + "loss": 2.7058, + "theoretical_loss": 3.775695743552988, + "tokens_seen": 706902016 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039687061183550653, + "loss": 3.0764, + "theoretical_loss": 3.775660265719118, + "tokens_seen": 706967552 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003968605817452357, + "loss": 2.9457, + "theoretical_loss": 3.7756247920946633, + "tokens_seen": 707033088 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003968505516549649, + "loss": 2.9319, + "theoretical_loss": 3.7755893226787345, + "tokens_seen": 707098624 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003968405215646941, + "loss": 2.9837, + "theoretical_loss": 3.7755538574704417, + "tokens_seen": 707164160 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003968304914744233, + "loss": 3.0032, + "theoretical_loss": 3.7755183964688968, + "tokens_seen": 707229696 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039682046138415244, + "loss": 2.7542, + "theoretical_loss": 3.7754829396732097, + "tokens_seen": 707295232 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003968104312938817, + "loss": 2.9074, + "theoretical_loss": 3.7754474870824923, + "tokens_seen": 707360768 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003968004012036108, + "loss": 2.716, + "theoretical_loss": 3.7754120386958574, + "tokens_seen": 707426304 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039679037111334004, + "loss": 2.7301, + "theoretical_loss": 3.775376594512416, + "tokens_seen": 707491840 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003967803410230692, + "loss": 3.0049, + "theoretical_loss": 3.7753411545312803, + "tokens_seen": 707557376 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003967703109327984, + "loss": 3.0008, + "theoretical_loss": 3.7753057187515644, + "tokens_seen": 707622912 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003967602808425276, + "loss": 3.1812, + "theoretical_loss": 3.7752702871723796, + "tokens_seen": 707688448 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003967502507522568, + "loss": 2.9954, + "theoretical_loss": 3.77523485979284, + "tokens_seen": 707753984 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 842171, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9438395500183105, + "objective/train/theoretical_loss": 3.77521714767766, + "objective/train/tokens_used": 728246752, + "theoretical_loss": 3.77521714767766, + "tokens_seen": 707786752 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039674022066198594, + "loss": 2.8499, + "theoretical_loss": 3.7751994366120583, + "tokens_seen": 707819520 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003967301905717152, + "loss": 2.9356, + "theoretical_loss": 3.7751640176291485, + "tokens_seen": 707885056 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003967201604814443, + "loss": 3.1708, + "theoretical_loss": 3.7751286028432247, + "tokens_seen": 707950592 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039671013039117354, + "loss": 2.9933, + "theoretical_loss": 3.7750931922534017, + "tokens_seen": 708016128 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003967001003009027, + "loss": 2.985, + "theoretical_loss": 3.7750577858587926, + "tokens_seen": 708081664 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003966900702106319, + "loss": 2.7421, + "theoretical_loss": 3.775022383658513, + "tokens_seen": 708147200 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003966800401203611, + "loss": 3.0504, + "theoretical_loss": 3.774986985651678, + "tokens_seen": 708212736 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039667001003009027, + "loss": 2.8545, + "theoretical_loss": 3.774951591837402, + "tokens_seen": 708278272 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039665997993981945, + "loss": 2.7481, + "theoretical_loss": 3.7749162022148024, + "tokens_seen": 708343808 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003966499498495487, + "loss": 2.7234, + "theoretical_loss": 3.774880816782993, + "tokens_seen": 708409344 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003966399197592778, + "loss": 2.9445, + "theoretical_loss": 3.774845435541091, + "tokens_seen": 708474880 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039662988966900704, + "loss": 2.8104, + "theoretical_loss": 3.7748100584882134, + "tokens_seen": 708540416 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039661985957873617, + "loss": 2.9856, + "theoretical_loss": 3.774774685623475, + "tokens_seen": 708605952 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003966098294884654, + "loss": 2.8665, + "theoretical_loss": 3.774739316945994, + "tokens_seen": 708671488 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003965997993981946, + "loss": 2.8147, + "theoretical_loss": 3.774703952454888, + "tokens_seen": 708737024 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039658976930792377, + "loss": 3.0254, + "theoretical_loss": 3.7746685921492737, + "tokens_seen": 708802560 + }, + { + "epoch": 2.03, + "learning_rate": 0.00039657973921765295, + "loss": 3.0298, + "theoretical_loss": 3.7746332360282677, + "tokens_seen": 708868096 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003965697091273822, + "loss": 3.0365, + "theoretical_loss": 3.77459788409099, + "tokens_seen": 708933632 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003965596790371113, + "loss": 3.0917, + "theoretical_loss": 3.7745625363365582, + "tokens_seen": 708999168 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039654964894684055, + "loss": 3.0597, + "theoretical_loss": 3.7745271927640904, + "tokens_seen": 709064704 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003965396188565697, + "loss": 2.8429, + "theoretical_loss": 3.7744918533727057, + "tokens_seen": 709130240 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003965295887662989, + "loss": 2.9297, + "theoretical_loss": 3.7744565181615233, + "tokens_seen": 709195776 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003965195586760281, + "loss": 2.9209, + "theoretical_loss": 3.7744211871296613, + "tokens_seen": 709261312 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039650952858575727, + "loss": 2.7967, + "theoretical_loss": 3.774385860276241, + "tokens_seen": 709326848 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039649949849548645, + "loss": 3.0973, + "theoretical_loss": 3.7743505376003816, + "tokens_seen": 709392384 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 842767, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9267566204071045, + "objective/train/theoretical_loss": 3.774332877828762, + "objective/train/tokens_used": 729885152, + "theoretical_loss": 3.774332877828762, + "tokens_seen": 709425152 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039648946840521563, + "loss": 2.8282, + "theoretical_loss": 3.7743152191012026, + "tokens_seen": 709457920 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003964794383149448, + "loss": 2.88, + "theoretical_loss": 3.7742799047778255, + "tokens_seen": 709523456 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039646940822467405, + "loss": 2.8465, + "theoretical_loss": 3.77424459462937, + "tokens_seen": 709588992 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003964593781344032, + "loss": 2.949, + "theoretical_loss": 3.774209288654957, + "tokens_seen": 709654528 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003964493480441324, + "loss": 2.9575, + "theoretical_loss": 3.7741739868537088, + "tokens_seen": 709720064 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003964393179538616, + "loss": 2.8422, + "theoretical_loss": 3.7741386892247455, + "tokens_seen": 709785600 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003964292878635908, + "loss": 3.0997, + "theoretical_loss": 3.77410339576719, + "tokens_seen": 709851136 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039641925777331996, + "loss": 2.8329, + "theoretical_loss": 3.774068106480163, + "tokens_seen": 709916672 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039640922768304914, + "loss": 2.9717, + "theoretical_loss": 3.774032821362788, + "tokens_seen": 709982208 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003963991975927783, + "loss": 2.9319, + "theoretical_loss": 3.773997540414187, + "tokens_seen": 710047744 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039638916750250755, + "loss": 2.8197, + "theoretical_loss": 3.773962263633482, + "tokens_seen": 710113280 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039637913741223673, + "loss": 2.7053, + "theoretical_loss": 3.7739269910197972, + "tokens_seen": 710178816 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003963691073219659, + "loss": 2.9991, + "theoretical_loss": 3.7738917225722557, + "tokens_seen": 710244352 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003963590772316951, + "loss": 2.995, + "theoretical_loss": 3.7738564582899805, + "tokens_seen": 710309888 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003963490471414243, + "loss": 3.0643, + "theoretical_loss": 3.7738211981720964, + "tokens_seen": 710375424 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003963390170511535, + "loss": 3.0085, + "theoretical_loss": 3.7737859422177262, + "tokens_seen": 710440960 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039632898696088264, + "loss": 2.9135, + "theoretical_loss": 3.773750690425995, + "tokens_seen": 710506496 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003963189568706119, + "loss": 2.8237, + "theoretical_loss": 3.7737154427960276, + "tokens_seen": 710572032 + }, + { + "epoch": 2.04, + "learning_rate": 0.000396308926780341, + "loss": 2.9719, + "theoretical_loss": 3.7736801993269484, + "tokens_seen": 710637568 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039629889669007024, + "loss": 2.8327, + "theoretical_loss": 3.7736449600178834, + "tokens_seen": 710703104 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003962888665997994, + "loss": 2.8824, + "theoretical_loss": 3.773609724867957, + "tokens_seen": 710768640 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003962788365095286, + "loss": 2.8354, + "theoretical_loss": 3.7735744938762954, + "tokens_seen": 710834176 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003962688064192578, + "loss": 3.2813, + "theoretical_loss": 3.7735392670420254, + "tokens_seen": 710899712 + }, + { + "epoch": 2.04, + "learning_rate": 0.000396258776328987, + "loss": 2.8482, + "theoretical_loss": 3.7735040443642713, + "tokens_seen": 710965248 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039624874623871614, + "loss": 2.9067, + "theoretical_loss": 3.773468825842161, + "tokens_seen": 711030784 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 844300, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3437087535858154, + "objective/train/theoretical_loss": 3.7734512181391993, + "objective/train/tokens_used": 731523552, + "theoretical_loss": 3.7734512181391993, + "tokens_seen": 711063552 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003962387161484454, + "loss": 3.113, + "theoretical_loss": 3.773433611474821, + "tokens_seen": 711096320 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003962286860581745, + "loss": 2.9326, + "theoretical_loss": 3.773398401261378, + "tokens_seen": 711161856 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039621865596790374, + "loss": 3.1375, + "theoretical_loss": 3.7733631952009596, + "tokens_seen": 711227392 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003962086258776329, + "loss": 2.7491, + "theoretical_loss": 3.773327993292693, + "tokens_seen": 711292928 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003961985957873621, + "loss": 2.9599, + "theoretical_loss": 3.7732927955357063, + "tokens_seen": 711358464 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003961885656970913, + "loss": 2.7884, + "theoretical_loss": 3.773257601929128, + "tokens_seen": 711424000 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039617853560682047, + "loss": 2.8457, + "theoretical_loss": 3.7732224124720855, + "tokens_seen": 711489536 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039616850551654965, + "loss": 2.9076, + "theoretical_loss": 3.7731872271637075, + "tokens_seen": 711555072 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003961584754262789, + "loss": 2.8034, + "theoretical_loss": 3.7731520460031236, + "tokens_seen": 711620608 + }, + { + "epoch": 2.04, + "learning_rate": 0.000396148445336008, + "loss": 2.7777, + "theoretical_loss": 3.7731168689894625, + "tokens_seen": 711686144 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039613841524573724, + "loss": 2.9276, + "theoretical_loss": 3.7730816961218534, + "tokens_seen": 711751680 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039612838515546637, + "loss": 2.7608, + "theoretical_loss": 3.7730465273994263, + "tokens_seen": 711817216 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003961183550651956, + "loss": 2.8722, + "theoretical_loss": 3.773011362821311, + "tokens_seen": 711882752 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003961083249749248, + "loss": 2.7365, + "theoretical_loss": 3.772976202386637, + "tokens_seen": 711948288 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039609829488465397, + "loss": 2.8973, + "theoretical_loss": 3.772941046094536, + "tokens_seen": 712013824 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039608826479438315, + "loss": 2.7796, + "theoretical_loss": 3.772905893944138, + "tokens_seen": 712079360 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003960782347041124, + "loss": 2.7532, + "theoretical_loss": 3.7728707459345734, + "tokens_seen": 712144896 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003960682046138415, + "loss": 2.973, + "theoretical_loss": 3.7728356020649745, + "tokens_seen": 712210432 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039605817452357075, + "loss": 3.0538, + "theoretical_loss": 3.772800462334472, + "tokens_seen": 712275968 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003960481444332999, + "loss": 2.759, + "theoretical_loss": 3.7727653267421983, + "tokens_seen": 712341504 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003960381143430291, + "loss": 2.8805, + "theoretical_loss": 3.772730195287285, + "tokens_seen": 712407040 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003960280842527583, + "loss": 2.8023, + "theoretical_loss": 3.772695067968865, + "tokens_seen": 712472576 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039601805416248747, + "loss": 2.9331, + "theoretical_loss": 3.7726599447860694, + "tokens_seen": 712538112 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039600802407221665, + "loss": 2.9116, + "theoretical_loss": 3.772624825738032, + "tokens_seen": 712603648 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039599799398194583, + "loss": 3.1991, + "theoretical_loss": 3.772589710823886, + "tokens_seen": 712669184 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 844668, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.137606143951416, + "objective/train/theoretical_loss": 3.772572154916751, + "objective/train/tokens_used": 733161952, + "theoretical_loss": 3.772572154916751, + "tokens_seen": 712701952 + }, + { + "epoch": 2.04, + "learning_rate": 0.000395987963891675, + "loss": 3.0682, + "theoretical_loss": 3.772554600042764, + "tokens_seen": 712734720 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039597793380140425, + "loss": 2.923, + "theoretical_loss": 3.7725194933938004, + "tokens_seen": 712800256 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003959679037111334, + "loss": 3.009, + "theoretical_loss": 3.7724843908761283, + "tokens_seen": 712865792 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003959578736208626, + "loss": 2.9241, + "theoretical_loss": 3.772449292488883, + "tokens_seen": 712931328 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003959478435305918, + "loss": 2.7148, + "theoretical_loss": 3.772414198231197, + "tokens_seen": 712996864 + }, + { + "epoch": 2.04, + "learning_rate": 0.000395937813440321, + "loss": 2.6896, + "theoretical_loss": 3.7723791081022062, + "tokens_seen": 713062400 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039592778335005016, + "loss": 2.7487, + "theoretical_loss": 3.7723440221010454, + "tokens_seen": 713127936 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039591775325977934, + "loss": 2.9316, + "theoretical_loss": 3.7723089402268495, + "tokens_seen": 713193472 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003959077231695085, + "loss": 2.8535, + "theoretical_loss": 3.772273862478754, + "tokens_seen": 713259008 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039589769307923775, + "loss": 2.7874, + "theoretical_loss": 3.7722387888558946, + "tokens_seen": 713324544 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003958876629889669, + "loss": 2.7757, + "theoretical_loss": 3.7722037193574067, + "tokens_seen": 713390080 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003958776328986961, + "loss": 2.6952, + "theoretical_loss": 3.7721686539824275, + "tokens_seen": 713455616 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039586760280842524, + "loss": 2.8004, + "theoretical_loss": 3.7721335927300927, + "tokens_seen": 713521152 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003958575727181545, + "loss": 2.8383, + "theoretical_loss": 3.772098535599539, + "tokens_seen": 713586688 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039584754262788366, + "loss": 3.1454, + "theoretical_loss": 3.772063482589904, + "tokens_seen": 713652224 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039583751253761284, + "loss": 2.7299, + "theoretical_loss": 3.7720284337003243, + "tokens_seen": 713717760 + }, + { + "epoch": 2.04, + "learning_rate": 0.000395827482447342, + "loss": 2.9994, + "theoretical_loss": 3.7719933889299377, + "tokens_seen": 713783296 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003958174523570712, + "loss": 2.7725, + "theoretical_loss": 3.771958348277882, + "tokens_seen": 713848832 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003958074222668004, + "loss": 2.7891, + "theoretical_loss": 3.7719233117432944, + "tokens_seen": 713914368 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003957973921765296, + "loss": 2.8777, + "theoretical_loss": 3.771888279325314, + "tokens_seen": 713979904 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039578736208625875, + "loss": 2.9531, + "theoretical_loss": 3.771853251023079, + "tokens_seen": 714045440 + }, + { + "epoch": 2.04, + "learning_rate": 0.000395777331995988, + "loss": 2.9526, + "theoretical_loss": 3.7718182268357285, + "tokens_seen": 714110976 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039576730190571716, + "loss": 2.879, + "theoretical_loss": 3.7717832067624015, + "tokens_seen": 714176512 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039575727181544634, + "loss": 2.9837, + "theoretical_loss": 3.771748190802237, + "tokens_seen": 714242048 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003957472417251755, + "loss": 2.878, + "theoretical_loss": 3.771713178954375, + "tokens_seen": 714307584 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 846114, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8042542934417725, + "objective/train/theoretical_loss": 3.771695674572288, + "objective/train/tokens_used": 734800352, + "theoretical_loss": 3.771695674572288, + "tokens_seen": 714340352 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003957372116349047, + "loss": 2.8705, + "theoretical_loss": 3.7716781712179546, + "tokens_seen": 714373120 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003957271815446339, + "loss": 2.8951, + "theoretical_loss": 3.7716431675921167, + "tokens_seen": 714438656 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003957171514543631, + "loss": 3.003, + "theoretical_loss": 3.7716081680760007, + "tokens_seen": 714504192 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039570712136409225, + "loss": 2.734, + "theoretical_loss": 3.7715731726687487, + "tokens_seen": 714569728 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003956970912738215, + "loss": 2.9267, + "theoretical_loss": 3.7715381813694995, + "tokens_seen": 714635264 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003956870611835506, + "loss": 2.9854, + "theoretical_loss": 3.771503194177396, + "tokens_seen": 714700800 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039567703109327985, + "loss": 2.7856, + "theoretical_loss": 3.771468211091579, + "tokens_seen": 714766336 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039566700100300903, + "loss": 2.7446, + "theoretical_loss": 3.77143323211119, + "tokens_seen": 714831872 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003956569709127382, + "loss": 2.6806, + "theoretical_loss": 3.7713982572353704, + "tokens_seen": 714897408 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003956469408224674, + "loss": 2.8043, + "theoretical_loss": 3.771363286463263, + "tokens_seen": 714962944 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039563691073219657, + "loss": 3.0227, + "theoretical_loss": 3.77132831979401, + "tokens_seen": 715028480 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003956268806419258, + "loss": 2.8135, + "theoretical_loss": 3.7712933572267544, + "tokens_seen": 715094016 + }, + { + "epoch": 2.04, + "learning_rate": 0.000395616850551655, + "loss": 2.5729, + "theoretical_loss": 3.7712583987606387, + "tokens_seen": 715159552 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039560682046138417, + "loss": 2.8661, + "theoretical_loss": 3.7712234443948063, + "tokens_seen": 715225088 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039559679037111335, + "loss": 2.8424, + "theoretical_loss": 3.7711884941284004, + "tokens_seen": 715290624 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003955867602808426, + "loss": 2.8453, + "theoretical_loss": 3.771153547960565, + "tokens_seen": 715356160 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003955767301905717, + "loss": 3.137, + "theoretical_loss": 3.771118605890444, + "tokens_seen": 715421696 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039556670010030095, + "loss": 3.0339, + "theoretical_loss": 3.771083667917181, + "tokens_seen": 715487232 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003955566700100301, + "loss": 2.7859, + "theoretical_loss": 3.7710487340399217, + "tokens_seen": 715552768 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003955466399197593, + "loss": 2.7065, + "theoretical_loss": 3.7710138042578096, + "tokens_seen": 715618304 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003955366098294885, + "loss": 2.9122, + "theoretical_loss": 3.7709788785699905, + "tokens_seen": 715683840 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039552657973921767, + "loss": 2.8986, + "theoretical_loss": 3.770943956975609, + "tokens_seen": 715749376 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039551654964894685, + "loss": 2.7145, + "theoretical_loss": 3.770909039473811, + "tokens_seen": 715814912 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039550651955867603, + "loss": 2.7466, + "theoretical_loss": 3.770874126063742, + "tokens_seen": 715880448 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003954964894684052, + "loss": 2.7592, + "theoretical_loss": 3.770839216744548, + "tokens_seen": 715945984 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 846880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.995128631591797, + "objective/train/theoretical_loss": 3.7708217636187626, + "objective/train/tokens_used": 736438752, + "theoretical_loss": 3.7708217636187626, + "tokens_seen": 715978752 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039548645937813445, + "loss": 2.6837, + "theoretical_loss": 3.770804311515376, + "tokens_seen": 716011520 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003954764292878636, + "loss": 3.0525, + "theoretical_loss": 3.7707694103753715, + "tokens_seen": 716077056 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003954663991975928, + "loss": 2.9757, + "theoretical_loss": 3.770734513323682, + "tokens_seen": 716142592 + }, + { + "epoch": 2.04, + "learning_rate": 0.000395456369107322, + "loss": 2.7955, + "theoretical_loss": 3.770699620359454, + "tokens_seen": 716208128 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003954463390170512, + "loss": 2.8557, + "theoretical_loss": 3.770664731481835, + "tokens_seen": 716273664 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039543630892678036, + "loss": 2.8646, + "theoretical_loss": 3.7706298466899724, + "tokens_seen": 716339200 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039542627883650954, + "loss": 2.9426, + "theoretical_loss": 3.7705949659830145, + "tokens_seen": 716404736 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003954162487462387, + "loss": 2.7996, + "theoretical_loss": 3.7705600893601083, + "tokens_seen": 716470272 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039540621865596795, + "loss": 3.0203, + "theoretical_loss": 3.7705252168204035, + "tokens_seen": 716535808 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003953961885656971, + "loss": 2.9499, + "theoretical_loss": 3.770490348363048, + "tokens_seen": 716601344 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003953861584754263, + "loss": 3.0872, + "theoretical_loss": 3.77045548398719, + "tokens_seen": 716666880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039537612838515544, + "loss": 2.6637, + "theoretical_loss": 3.7704206236919795, + "tokens_seen": 716732416 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003953660982948847, + "loss": 2.8815, + "theoretical_loss": 3.770385767476566, + "tokens_seen": 716797952 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039535606820461386, + "loss": 2.943, + "theoretical_loss": 3.7703509153400976, + "tokens_seen": 716863488 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039534603811434304, + "loss": 2.8796, + "theoretical_loss": 3.7703160672817257, + "tokens_seen": 716929024 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003953360080240722, + "loss": 2.9777, + "theoretical_loss": 3.7702812233006, + "tokens_seen": 716994560 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003953259779338014, + "loss": 2.858, + "theoretical_loss": 3.77024638339587, + "tokens_seen": 717060096 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003953159478435306, + "loss": 2.8236, + "theoretical_loss": 3.7702115475666877, + "tokens_seen": 717125632 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003953059177532598, + "loss": 2.9503, + "theoretical_loss": 3.7701767158122026, + "tokens_seen": 717191168 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039529588766298895, + "loss": 2.8212, + "theoretical_loss": 3.7701418881315667, + "tokens_seen": 717256704 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003952858575727182, + "loss": 2.9965, + "theoretical_loss": 3.7701070645239314, + "tokens_seen": 717322240 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039527582748244736, + "loss": 2.8431, + "theoretical_loss": 3.7700722449884476, + "tokens_seen": 717387776 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039526579739217654, + "loss": 2.8866, + "theoretical_loss": 3.7700374295242676, + "tokens_seen": 717453312 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003952557673019057, + "loss": 2.9852, + "theoretical_loss": 3.770002618130544, + "tokens_seen": 717518848 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003952457372116349, + "loss": 2.7195, + "theoretical_loss": 3.769967810806428, + "tokens_seen": 717584384 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 848381, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.755312204360962, + "objective/train/theoretical_loss": 3.7699504086702085, + "objective/train/tokens_used": 738077152, + "theoretical_loss": 3.7699504086702085, + "tokens_seen": 717617152 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003952357071213641, + "loss": 2.9789, + "theoretical_loss": 3.7699330075510735, + "tokens_seen": 717649920 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003952256770310933, + "loss": 2.678, + "theoretical_loss": 3.7698982083636325, + "tokens_seen": 717715456 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039521564694082245, + "loss": 2.9702, + "theoretical_loss": 3.7698634132432587, + "tokens_seen": 717780992 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003952056168505517, + "loss": 2.7129, + "theoretical_loss": 3.7698286221891055, + "tokens_seen": 717846528 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003951955867602808, + "loss": 2.9426, + "theoretical_loss": 3.769793835200326, + "tokens_seen": 717912064 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039518555667001005, + "loss": 2.8837, + "theoretical_loss": 3.7697590522760747, + "tokens_seen": 717977600 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039517552657973923, + "loss": 2.89, + "theoretical_loss": 3.7697242734155054, + "tokens_seen": 718043136 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003951654964894684, + "loss": 2.9438, + "theoretical_loss": 3.7696894986177725, + "tokens_seen": 718108672 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003951554663991976, + "loss": 2.692, + "theoretical_loss": 3.7696547278820307, + "tokens_seen": 718174208 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039514543630892677, + "loss": 3.0629, + "theoretical_loss": 3.769619961207435, + "tokens_seen": 718239744 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039513540621865595, + "loss": 2.6968, + "theoretical_loss": 3.7695851985931412, + "tokens_seen": 718305280 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003951253761283852, + "loss": 2.6478, + "theoretical_loss": 3.7695504400383033, + "tokens_seen": 718370816 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003951153460381143, + "loss": 2.6335, + "theoretical_loss": 3.7695156855420784, + "tokens_seen": 718436352 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039510531594784355, + "loss": 2.9399, + "theoretical_loss": 3.7694809351036214, + "tokens_seen": 718501888 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039509528585757273, + "loss": 2.8495, + "theoretical_loss": 3.769446188722089, + "tokens_seen": 718567424 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003950852557673019, + "loss": 2.9849, + "theoretical_loss": 3.7694114463966377, + "tokens_seen": 718632960 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003950752256770311, + "loss": 2.7779, + "theoretical_loss": 3.7693767081264236, + "tokens_seen": 718698496 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003950651955867603, + "loss": 2.9114, + "theoretical_loss": 3.7693419739106044, + "tokens_seen": 718764032 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039505516549648946, + "loss": 2.8417, + "theoretical_loss": 3.7693072437483366, + "tokens_seen": 718829568 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003950451354062187, + "loss": 2.8272, + "theoretical_loss": 3.7692725176387785, + "tokens_seen": 718895104 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003950351053159478, + "loss": 2.6431, + "theoretical_loss": 3.7692377955810867, + "tokens_seen": 718960640 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039502507522567705, + "loss": 2.7929, + "theoretical_loss": 3.76920307757442, + "tokens_seen": 719026176 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003950150451354062, + "loss": 2.8803, + "theoretical_loss": 3.7691683636179367, + "tokens_seen": 719091712 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003950050150451354, + "loss": 2.8331, + "theoretical_loss": 3.7691336537107945, + "tokens_seen": 719157248 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003949949849548646, + "loss": 2.8321, + "theoretical_loss": 3.769098947852152, + "tokens_seen": 719222784 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 849012, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8266098499298096, + "objective/train/theoretical_loss": 3.7690815964407562, + "objective/train/tokens_used": 739715552, + "theoretical_loss": 3.7690815964407562, + "tokens_seen": 719255552 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003949849548645938, + "loss": 2.8435, + "theoretical_loss": 3.7690642460411694, + "tokens_seen": 719288320 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039497492477432296, + "loss": 2.9216, + "theoretical_loss": 3.769029548277005, + "tokens_seen": 719353856 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003949648946840522, + "loss": 2.851, + "theoretical_loss": 3.768994854558818, + "tokens_seen": 719419392 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003949548645937813, + "loss": 2.8108, + "theoretical_loss": 3.768960164885769, + "tokens_seen": 719484928 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039494483450351056, + "loss": 2.9195, + "theoretical_loss": 3.768925479257017, + "tokens_seen": 719550464 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003949348044132397, + "loss": 2.8193, + "theoretical_loss": 3.768890797671723, + "tokens_seen": 719616000 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003949247743229689, + "loss": 2.7336, + "theoretical_loss": 3.7688561201290467, + "tokens_seen": 719681536 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003949147442326981, + "loss": 2.9406, + "theoretical_loss": 3.7688214466281496, + "tokens_seen": 719747072 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003949047141424273, + "loss": 2.7792, + "theoretical_loss": 3.7687867771681924, + "tokens_seen": 719812608 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003948946840521565, + "loss": 2.9954, + "theoretical_loss": 3.768752111748336, + "tokens_seen": 719878144 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039488465396188564, + "loss": 3.0322, + "theoretical_loss": 3.768717450367742, + "tokens_seen": 719943680 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003948746238716149, + "loss": 2.8552, + "theoretical_loss": 3.7686827930255724, + "tokens_seen": 720009216 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039486459378134406, + "loss": 2.9811, + "theoretical_loss": 3.7686481397209883, + "tokens_seen": 720074752 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039485456369107324, + "loss": 2.9801, + "theoretical_loss": 3.768613490453153, + "tokens_seen": 720140288 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003948445336008024, + "loss": 2.9666, + "theoretical_loss": 3.7685788452212288, + "tokens_seen": 720205824 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003948345035105316, + "loss": 2.6315, + "theoretical_loss": 3.768544204024378, + "tokens_seen": 720271360 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003948244734202608, + "loss": 2.6003, + "theoretical_loss": 3.7685095668617636, + "tokens_seen": 720336896 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039481444332999, + "loss": 2.9072, + "theoretical_loss": 3.7684749337325485, + "tokens_seen": 720402432 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039480441323971915, + "loss": 2.9633, + "theoretical_loss": 3.768440304635897, + "tokens_seen": 720467968 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003947943831494484, + "loss": 2.8997, + "theoretical_loss": 3.7684056795709724, + "tokens_seen": 720533504 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039478435305917756, + "loss": 2.8266, + "theoretical_loss": 3.768371058536939, + "tokens_seen": 720599040 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039477432296890674, + "loss": 2.9913, + "theoretical_loss": 3.76833644153296, + "tokens_seen": 720664576 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003947642928786359, + "loss": 2.8822, + "theoretical_loss": 3.768301828558201, + "tokens_seen": 720730112 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003947542627883651, + "loss": 2.872, + "theoretical_loss": 3.7682672196118254, + "tokens_seen": 720795648 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003947442326980943, + "loss": 2.8944, + "theoretical_loss": 3.768232614693, + "tokens_seen": 720861184 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 850658, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.295666217803955, + "objective/train/theoretical_loss": 3.768215313743657, + "objective/train/tokens_used": 741353952, + "theoretical_loss": 3.768215313743657, + "tokens_seen": 720893952 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003947342026078235, + "loss": 2.4079, + "theoretical_loss": 3.7681980138008884, + "tokens_seen": 720926720 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039472417251755265, + "loss": 3.0534, + "theoretical_loss": 3.768163416934657, + "tokens_seen": 720992256 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003947141424272819, + "loss": 2.7413, + "theoretical_loss": 3.7681288240934707, + "tokens_seen": 721057792 + }, + { + "epoch": 2.04, + "learning_rate": 0.000394704112337011, + "loss": 2.9531, + "theoretical_loss": 3.768094235276496, + "tokens_seen": 721123328 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039469408224674025, + "loss": 2.8847, + "theoretical_loss": 3.7680596504828987, + "tokens_seen": 721188864 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039468405215646943, + "loss": 2.7946, + "theoretical_loss": 3.7680250697118467, + "tokens_seen": 721254400 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003946740220661986, + "loss": 2.9812, + "theoretical_loss": 3.7679904929625043, + "tokens_seen": 721319936 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003946639919759278, + "loss": 2.6334, + "theoretical_loss": 3.7679559202340407, + "tokens_seen": 721385472 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039465396188565697, + "loss": 3.0705, + "theoretical_loss": 3.7679213515256214, + "tokens_seen": 721451008 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039464393179538615, + "loss": 2.9367, + "theoretical_loss": 3.7678867868364145, + "tokens_seen": 721516544 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003946339017051154, + "loss": 2.859, + "theoretical_loss": 3.767852226165588, + "tokens_seen": 721582080 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003946238716148445, + "loss": 2.6695, + "theoretical_loss": 3.7678176695123096, + "tokens_seen": 721647616 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039461384152457375, + "loss": 2.6397, + "theoretical_loss": 3.7677831168757474, + "tokens_seen": 721713152 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039460381143430293, + "loss": 2.8705, + "theoretical_loss": 3.7677485682550698, + "tokens_seen": 721778688 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003945937813440321, + "loss": 2.848, + "theoretical_loss": 3.7677140236494457, + "tokens_seen": 721844224 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003945837512537613, + "loss": 2.8654, + "theoretical_loss": 3.767679483058044, + "tokens_seen": 721909760 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003945737211634905, + "loss": 2.7036, + "theoretical_loss": 3.7676449464800337, + "tokens_seen": 721975296 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039456369107321966, + "loss": 2.7538, + "theoretical_loss": 3.7676104139145843, + "tokens_seen": 722040832 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003945536609829489, + "loss": 2.9453, + "theoretical_loss": 3.7675758853608654, + "tokens_seen": 722106368 + }, + { + "epoch": 2.04, + "learning_rate": 0.000394543630892678, + "loss": 2.9468, + "theoretical_loss": 3.767541360818047, + "tokens_seen": 722171904 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039453360080240725, + "loss": 2.9775, + "theoretical_loss": 3.7675068402852996, + "tokens_seen": 722237440 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003945235707121364, + "loss": 3.0682, + "theoretical_loss": 3.7674723237617926, + "tokens_seen": 722302976 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003945135406218656, + "loss": 2.7909, + "theoretical_loss": 3.767437811246698, + "tokens_seen": 722368512 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003945035105315948, + "loss": 2.8995, + "theoretical_loss": 3.7674033027391856, + "tokens_seen": 722434048 + }, + { + "epoch": 2.04, + "learning_rate": 0.000394493480441324, + "loss": 2.6872, + "theoretical_loss": 3.7673687982384276, + "tokens_seen": 722499584 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 851365, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.842928171157837, + "objective/train/theoretical_loss": 3.767351547490322, + "objective/train/tokens_used": 742992352, + "theoretical_loss": 3.767351547490322, + "tokens_seen": 722532352 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039448345035105316, + "loss": 2.8114, + "theoretical_loss": 3.767334297743594, + "tokens_seen": 722565120 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003944734202607824, + "loss": 2.9769, + "theoretical_loss": 3.767299801253858, + "tokens_seen": 722630656 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003944633901705115, + "loss": 2.7042, + "theoretical_loss": 3.7672653087683905, + "tokens_seen": 722696192 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039445336008024076, + "loss": 2.8248, + "theoretical_loss": 3.767230820286364, + "tokens_seen": 722761728 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003944433299899699, + "loss": 3.0078, + "theoretical_loss": 3.76719633580695, + "tokens_seen": 722827264 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003944332998996991, + "loss": 2.6326, + "theoretical_loss": 3.7671618553293227, + "tokens_seen": 722892800 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003944232698094283, + "loss": 2.8485, + "theoretical_loss": 3.7671273788526536, + "tokens_seen": 722958336 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003944132397191575, + "loss": 2.6747, + "theoretical_loss": 3.767092906376117, + "tokens_seen": 723023872 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039440320962888666, + "loss": 2.8055, + "theoretical_loss": 3.7670584378988856, + "tokens_seen": 723089408 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039439317953861584, + "loss": 2.9246, + "theoretical_loss": 3.767023973420133, + "tokens_seen": 723154944 + }, + { + "epoch": 2.04, + "learning_rate": 0.000394383149448345, + "loss": 2.8103, + "theoretical_loss": 3.766989512939033, + "tokens_seen": 723220480 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039437311935807426, + "loss": 2.8357, + "theoretical_loss": 3.7669550564547594, + "tokens_seen": 723286016 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003943630892678034, + "loss": 2.8102, + "theoretical_loss": 3.7669206039664878, + "tokens_seen": 723351552 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003943530591775326, + "loss": 2.5936, + "theoretical_loss": 3.766886155473392, + "tokens_seen": 723417088 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039434302908726175, + "loss": 3.0184, + "theoretical_loss": 3.766851710974646, + "tokens_seen": 723482624 + }, + { + "epoch": 2.04, + "learning_rate": 0.000394332998996991, + "loss": 2.7742, + "theoretical_loss": 3.766817270469427, + "tokens_seen": 723548160 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039432296890672017, + "loss": 2.7539, + "theoretical_loss": 3.766782833956908, + "tokens_seen": 723613696 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039431293881644935, + "loss": 2.7714, + "theoretical_loss": 3.766748401436266, + "tokens_seen": 723679232 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039430290872617853, + "loss": 2.9656, + "theoretical_loss": 3.7667139729066768, + "tokens_seen": 723744768 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039429287863590776, + "loss": 2.8626, + "theoretical_loss": 3.766679548367316, + "tokens_seen": 723810304 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003942828485456369, + "loss": 2.9391, + "theoretical_loss": 3.76664512781736, + "tokens_seen": 723875840 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003942728184553661, + "loss": 2.8405, + "theoretical_loss": 3.766610711255985, + "tokens_seen": 723941376 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039426278836509525, + "loss": 2.9499, + "theoretical_loss": 3.7665762986823683, + "tokens_seen": 724006912 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003942527582748245, + "loss": 2.621, + "theoretical_loss": 3.766541890095687, + "tokens_seen": 724072448 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039424272818455367, + "loss": 2.8388, + "theoretical_loss": 3.7665074854951186, + "tokens_seen": 724137984 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 852859, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8854544162750244, + "objective/train/theoretical_loss": 3.766490284689369, + "objective/train/tokens_used": 744630752, + "theoretical_loss": 3.766490284689369, + "tokens_seen": 724170752 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039423269809428285, + "loss": 2.9438, + "theoretical_loss": 3.76647308487984, + "tokens_seen": 724203520 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039422266800401203, + "loss": 2.8946, + "theoretical_loss": 3.7664386882490293, + "tokens_seen": 724269056 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003942126379137412, + "loss": 3.0176, + "theoretical_loss": 3.7664042956018644, + "tokens_seen": 724334592 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003942026078234704, + "loss": 2.879, + "theoretical_loss": 3.7663699069375234, + "tokens_seen": 724400128 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039419257773319963, + "loss": 2.8402, + "theoretical_loss": 3.7663355222551855, + "tokens_seen": 724465664 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039418254764292876, + "loss": 2.8686, + "theoretical_loss": 3.766301141554029, + "tokens_seen": 724531200 + }, + { + "epoch": 2.04, + "learning_rate": 0.000394172517552658, + "loss": 2.8314, + "theoretical_loss": 3.7662667648332326, + "tokens_seen": 724596736 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003941624874623871, + "loss": 2.7861, + "theoretical_loss": 3.7662323920919762, + "tokens_seen": 724662272 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039415245737211635, + "loss": 2.9663, + "theoretical_loss": 3.7661980233294385, + "tokens_seen": 724727808 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003941424272818456, + "loss": 2.8261, + "theoretical_loss": 3.7661636585447997, + "tokens_seen": 724793344 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003941323971915747, + "loss": 2.8967, + "theoretical_loss": 3.7661292977372405, + "tokens_seen": 724858880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039412236710130395, + "loss": 2.8327, + "theoretical_loss": 3.7660949409059397, + "tokens_seen": 724924416 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039411233701103313, + "loss": 2.9669, + "theoretical_loss": 3.7660605880500784, + "tokens_seen": 724989952 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003941023069207623, + "loss": 2.6587, + "theoretical_loss": 3.7660262391688373, + "tokens_seen": 725055488 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003940922768304915, + "loss": 2.7309, + "theoretical_loss": 3.7659918942613975, + "tokens_seen": 725121024 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003940822467402207, + "loss": 2.7392, + "theoretical_loss": 3.76595755332694, + "tokens_seen": 725186560 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039407221664994986, + "loss": 2.8362, + "theoretical_loss": 3.765923216364646, + "tokens_seen": 725252096 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003940621865596791, + "loss": 2.746, + "theoretical_loss": 3.765888883373698, + "tokens_seen": 725317632 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003940521564694082, + "loss": 2.7861, + "theoretical_loss": 3.765854554353277, + "tokens_seen": 725383168 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039404212637913745, + "loss": 2.9553, + "theoretical_loss": 3.765820229302565, + "tokens_seen": 725448704 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003940320962888666, + "loss": 2.954, + "theoretical_loss": 3.765785908220746, + "tokens_seen": 725514240 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003940220661985958, + "loss": 2.7014, + "theoretical_loss": 3.7657515911070005, + "tokens_seen": 725579776 + }, + { + "epoch": 2.04, + "learning_rate": 0.000394012036108325, + "loss": 2.7269, + "theoretical_loss": 3.765717277960513, + "tokens_seen": 725645312 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003940020060180542, + "loss": 2.8501, + "theoretical_loss": 3.7656829687804656, + "tokens_seen": 725710848 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039399197592778336, + "loss": 2.6812, + "theoretical_loss": 3.7656486635660427, + "tokens_seen": 725776384 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 853613, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0343728065490723, + "objective/train/theoretical_loss": 3.765631512445685, + "objective/train/tokens_used": 746269152, + "theoretical_loss": 3.765631512445685, + "tokens_seen": 725809152 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003939819458375126, + "loss": 2.9185, + "theoretical_loss": 3.765614362316427, + "tokens_seen": 725841920 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003939719157472417, + "loss": 2.9212, + "theoretical_loss": 3.765580065030803, + "tokens_seen": 725907456 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039396188565697096, + "loss": 2.9167, + "theoretical_loss": 3.7655457717083545, + "tokens_seen": 725972992 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003939518555667001, + "loss": 2.8155, + "theoretical_loss": 3.7655114823482654, + "tokens_seen": 726038528 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003939418254764293, + "loss": 2.7203, + "theoretical_loss": 3.7654771969497203, + "tokens_seen": 726104064 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003939317953861585, + "loss": 2.7505, + "theoretical_loss": 3.765442915511905, + "tokens_seen": 726169600 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003939217652958877, + "loss": 2.9916, + "theoretical_loss": 3.765408638034004, + "tokens_seen": 726235136 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039391173520561686, + "loss": 2.7676, + "theoretical_loss": 3.7653743645152025, + "tokens_seen": 726300672 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039390170511534604, + "loss": 2.635, + "theoretical_loss": 3.765340094954686, + "tokens_seen": 726366208 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003938916750250752, + "loss": 2.8757, + "theoretical_loss": 3.7653058293516404, + "tokens_seen": 726431744 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039388164493480446, + "loss": 2.8451, + "theoretical_loss": 3.7652715677052515, + "tokens_seen": 726497280 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003938716148445336, + "loss": 2.8985, + "theoretical_loss": 3.765237310014706, + "tokens_seen": 726562816 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003938615847542628, + "loss": 2.8408, + "theoretical_loss": 3.76520305627919, + "tokens_seen": 726628352 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039385155466399195, + "loss": 2.9088, + "theoretical_loss": 3.76516880649789, + "tokens_seen": 726693888 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003938415245737212, + "loss": 3.0388, + "theoretical_loss": 3.765134560669994, + "tokens_seen": 726759424 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039383149448345037, + "loss": 2.9202, + "theoretical_loss": 3.7651003187946888, + "tokens_seen": 726824960 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039382146439317955, + "loss": 2.8556, + "theoretical_loss": 3.7650660808711613, + "tokens_seen": 726890496 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039381143430290873, + "loss": 2.7686, + "theoretical_loss": 3.7650318468985993, + "tokens_seen": 726956032 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039380140421263796, + "loss": 2.7862, + "theoretical_loss": 3.764997616876191, + "tokens_seen": 727021568 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003937913741223671, + "loss": 3.0449, + "theoretical_loss": 3.7649633908031257, + "tokens_seen": 727087104 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003937813440320963, + "loss": 2.6458, + "theoretical_loss": 3.7649291686785897, + "tokens_seen": 727152640 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039377131394182545, + "loss": 2.7193, + "theoretical_loss": 3.7648949505017733, + "tokens_seen": 727218176 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003937612838515547, + "loss": 2.8087, + "theoretical_loss": 3.7648607362718645, + "tokens_seen": 727283712 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039375125376128387, + "loss": 2.97, + "theoretical_loss": 3.7648265259880525, + "tokens_seen": 727349248 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039374122367101305, + "loss": 3.2369, + "theoretical_loss": 3.764792319649527, + "tokens_seen": 727414784 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 854233, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.945094108581543, + "objective/train/theoretical_loss": 3.764775217959494, + "objective/train/tokens_used": 747907552, + "theoretical_loss": 3.764775217959494, + "tokens_seen": 727447552 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039373119358074223, + "loss": 2.8022, + "theoretical_loss": 3.7647581172554783, + "tokens_seen": 727480320 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003937211634904714, + "loss": 2.9182, + "theoretical_loss": 3.764723918805095, + "tokens_seen": 727545856 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003937111334002006, + "loss": 2.775, + "theoretical_loss": 3.7646897242975674, + "tokens_seen": 727611392 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039370110330992983, + "loss": 3.0035, + "theoretical_loss": 3.7646555337320873, + "tokens_seen": 727676928 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039369107321965896, + "loss": 2.8621, + "theoretical_loss": 3.764621347107843, + "tokens_seen": 727742464 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003936810431293882, + "loss": 3.0658, + "theoretical_loss": 3.764587164424027, + "tokens_seen": 727808000 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003936710130391173, + "loss": 2.7817, + "theoretical_loss": 3.76455298567983, + "tokens_seen": 727873536 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039366098294884655, + "loss": 2.9779, + "theoretical_loss": 3.764518810874443, + "tokens_seen": 727939072 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039365095285857573, + "loss": 2.8654, + "theoretical_loss": 3.7644846400070575, + "tokens_seen": 728004608 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003936409227683049, + "loss": 2.7282, + "theoretical_loss": 3.7644504730768653, + "tokens_seen": 728070144 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003936308926780341, + "loss": 2.7755, + "theoretical_loss": 3.764416310083059, + "tokens_seen": 728135680 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039362086258776333, + "loss": 2.9102, + "theoretical_loss": 3.7643821510248303, + "tokens_seen": 728201216 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039361083249749246, + "loss": 2.9127, + "theoretical_loss": 3.764347995901372, + "tokens_seen": 728266752 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003936008024072217, + "loss": 2.9826, + "theoretical_loss": 3.764313844711876, + "tokens_seen": 728332288 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003935907723169508, + "loss": 2.7889, + "theoretical_loss": 3.7642796974555366, + "tokens_seen": 728397824 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039358074222668006, + "loss": 2.7005, + "theoretical_loss": 3.764245554131546, + "tokens_seen": 728463360 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039357071213640924, + "loss": 3.0278, + "theoretical_loss": 3.7642114147390977, + "tokens_seen": 728528896 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003935606820461384, + "loss": 2.8777, + "theoretical_loss": 3.764177279277386, + "tokens_seen": 728594432 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003935506519558676, + "loss": 3.0676, + "theoretical_loss": 3.764143147745605, + "tokens_seen": 728659968 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003935406218655968, + "loss": 2.8788, + "theoretical_loss": 3.7641090201429477, + "tokens_seen": 728725504 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039353059177532596, + "loss": 2.7074, + "theoretical_loss": 3.764074896468609, + "tokens_seen": 728791040 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003935205616850552, + "loss": 2.8258, + "theoretical_loss": 3.7640407767217843, + "tokens_seen": 728856576 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003935105315947843, + "loss": 2.9152, + "theoretical_loss": 3.764006660901667, + "tokens_seen": 728922112 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039350050150451356, + "loss": 2.9142, + "theoretical_loss": 3.7639725490074536, + "tokens_seen": 728987648 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003934904714142427, + "loss": 2.8464, + "theoretical_loss": 3.7639384410383387, + "tokens_seen": 729053184 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 855726, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.673104763031006, + "objective/train/theoretical_loss": 3.763921388525442, + "objective/train/tokens_used": 749545952, + "theoretical_loss": 3.763921388525442, + "tokens_seen": 729085952 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003934804413239719, + "loss": 2.9341, + "theoretical_loss": 3.763904336993518, + "tokens_seen": 729118720 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003934704112337011, + "loss": 2.7971, + "theoretical_loss": 3.7638702368721875, + "tokens_seen": 729184256 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003934603811434303, + "loss": 2.9572, + "theoretical_loss": 3.7638361406735434, + "tokens_seen": 729249792 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039345035105315946, + "loss": 2.9707, + "theoretical_loss": 3.7638020483967813, + "tokens_seen": 729315328 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003934403209628887, + "loss": 2.6327, + "theoretical_loss": 3.7637679600410987, + "tokens_seen": 729380864 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039343029087261783, + "loss": 2.9909, + "theoretical_loss": 3.7637338756056913, + "tokens_seen": 729446400 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039342026078234706, + "loss": 2.7681, + "theoretical_loss": 3.7636997950897566, + "tokens_seen": 729511936 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003934102306920762, + "loss": 2.8921, + "theoretical_loss": 3.763665718492492, + "tokens_seen": 729577472 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003934002006018054, + "loss": 2.948, + "theoretical_loss": 3.7636316458130956, + "tokens_seen": 729643008 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039339017051153466, + "loss": 2.8759, + "theoretical_loss": 3.7635975770507635, + "tokens_seen": 729708544 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003933801404212638, + "loss": 2.7037, + "theoretical_loss": 3.763563512204695, + "tokens_seen": 729774080 + }, + { + "epoch": 2.04, + "learning_rate": 0.000393370110330993, + "loss": 2.8733, + "theoretical_loss": 3.7635294512740876, + "tokens_seen": 729839616 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039336008024072215, + "loss": 2.7322, + "theoretical_loss": 3.7634953942581397, + "tokens_seen": 729905152 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003933500501504514, + "loss": 2.8427, + "theoretical_loss": 3.7634613411560505, + "tokens_seen": 729970688 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039334002006018057, + "loss": 2.8633, + "theoretical_loss": 3.7634272919670186, + "tokens_seen": 730036224 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039332998996990975, + "loss": 2.7698, + "theoretical_loss": 3.763393246690243, + "tokens_seen": 730101760 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039331995987963893, + "loss": 2.883, + "theoretical_loss": 3.7633592053249236, + "tokens_seen": 730167296 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039330992978936816, + "loss": 2.8142, + "theoretical_loss": 3.763325167870259, + "tokens_seen": 730232832 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003932998996990973, + "loss": 3.0139, + "theoretical_loss": 3.76329113432545, + "tokens_seen": 730298368 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003932898696088265, + "loss": 2.8159, + "theoretical_loss": 3.7632571046896963, + "tokens_seen": 730363904 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039327983951855565, + "loss": 2.8133, + "theoretical_loss": 3.763223078962198, + "tokens_seen": 730429440 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003932698094282849, + "loss": 2.9151, + "theoretical_loss": 3.763189057142156, + "tokens_seen": 730494976 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039325977933801407, + "loss": 2.6749, + "theoretical_loss": 3.7631550392287707, + "tokens_seen": 730560512 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039324974924774325, + "loss": 2.9721, + "theoretical_loss": 3.7631210252212437, + "tokens_seen": 730626048 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039323971915747243, + "loss": 2.8593, + "theoretical_loss": 3.763087015118776, + "tokens_seen": 730691584 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 856181, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.754777431488037, + "objective/train/theoretical_loss": 3.7630700115316893, + "objective/train/tokens_used": 751184352, + "theoretical_loss": 3.7630700115316893, + "tokens_seen": 730724352 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003932296890672016, + "loss": 2.8694, + "theoretical_loss": 3.7630530089205685, + "tokens_seen": 730757120 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003932196589769308, + "loss": 2.8381, + "theoretical_loss": 3.7630190066258233, + "tokens_seen": 730822656 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039320962888666003, + "loss": 2.8851, + "theoretical_loss": 3.762985008233743, + "tokens_seen": 730888192 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039319959879638916, + "loss": 3.0075, + "theoretical_loss": 3.762951013743529, + "tokens_seen": 730953728 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003931895687061184, + "loss": 3.1236, + "theoretical_loss": 3.762917023154384, + "tokens_seen": 731019264 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003931795386158475, + "loss": 2.8348, + "theoretical_loss": 3.762883036465511, + "tokens_seen": 731084800 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039316950852557675, + "loss": 2.8395, + "theoretical_loss": 3.7628490536761117, + "tokens_seen": 731150336 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039315947843530593, + "loss": 2.8552, + "theoretical_loss": 3.7628150747853906, + "tokens_seen": 731215872 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003931494483450351, + "loss": 2.9167, + "theoretical_loss": 3.7627810997925506, + "tokens_seen": 731281408 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003931394182547643, + "loss": 3.2027, + "theoretical_loss": 3.7627471286967946, + "tokens_seen": 731346944 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039312938816449353, + "loss": 2.9124, + "theoretical_loss": 3.7627131614973277, + "tokens_seen": 731412480 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039311935807422266, + "loss": 2.8646, + "theoretical_loss": 3.762679198193353, + "tokens_seen": 731478016 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003931093279839519, + "loss": 2.9585, + "theoretical_loss": 3.762645238784075, + "tokens_seen": 731543552 + }, + { + "epoch": 2.04, + "learning_rate": 0.000393099297893681, + "loss": 2.7172, + "theoretical_loss": 3.7626112832686984, + "tokens_seen": 731609088 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039308926780341026, + "loss": 2.8292, + "theoretical_loss": 3.762577331646428, + "tokens_seen": 731674624 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039307923771313944, + "loss": 2.8594, + "theoretical_loss": 3.762543383916469, + "tokens_seen": 731740160 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003930692076228686, + "loss": 2.9352, + "theoretical_loss": 3.7625094400780257, + "tokens_seen": 731805696 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003930591775325978, + "loss": 2.8496, + "theoretical_loss": 3.7624755001303045, + "tokens_seen": 731871232 + }, + { + "epoch": 2.04, + "learning_rate": 0.000393049147442327, + "loss": 2.9329, + "theoretical_loss": 3.7624415640725104, + "tokens_seen": 731936768 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039303911735205616, + "loss": 2.7525, + "theoretical_loss": 3.7624076319038506, + "tokens_seen": 732002304 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003930290872617854, + "loss": 2.702, + "theoretical_loss": 3.7623737036235294, + "tokens_seen": 732067840 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003930190571715145, + "loss": 2.7606, + "theoretical_loss": 3.7623397792307545, + "tokens_seen": 732133376 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039300902708124376, + "loss": 2.7868, + "theoretical_loss": 3.762305858724732, + "tokens_seen": 732198912 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003929989969909729, + "loss": 3.0959, + "theoretical_loss": 3.7622719421046695, + "tokens_seen": 732264448 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003929889669007021, + "loss": 3.0838, + "theoretical_loss": 3.7622380293697733, + "tokens_seen": 732329984 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 857755, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.856901168823242, + "objective/train/theoretical_loss": 3.762221074459015, + "objective/train/tokens_used": 752822752, + "theoretical_loss": 3.762221074459015, + "tokens_seen": 732362752 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003929789368104313, + "loss": 2.7062, + "theoretical_loss": 3.762204120519251, + "tokens_seen": 732395520 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003929689067201605, + "loss": 2.7705, + "theoretical_loss": 3.76217021555231, + "tokens_seen": 732461056 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039295887662988967, + "loss": 2.9732, + "theoretical_loss": 3.7621363144681585, + "tokens_seen": 732526592 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003929488465396189, + "loss": 2.9889, + "theoretical_loss": 3.7621024172660045, + "tokens_seen": 732592128 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039293881644934803, + "loss": 2.7959, + "theoretical_loss": 3.7620685239450555, + "tokens_seen": 732657664 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039292878635907726, + "loss": 2.9595, + "theoretical_loss": 3.762034634504521, + "tokens_seen": 732723200 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003929187562688064, + "loss": 2.746, + "theoretical_loss": 3.762000748943609, + "tokens_seen": 732788736 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003929087261785356, + "loss": 2.9863, + "theoretical_loss": 3.7619668672615285, + "tokens_seen": 732854272 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003928986960882648, + "loss": 2.6103, + "theoretical_loss": 3.761932989457489, + "tokens_seen": 732919808 + }, + { + "epoch": 2.04, + "learning_rate": 0.000392888665997994, + "loss": 2.9042, + "theoretical_loss": 3.7618991155307, + "tokens_seen": 732985344 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039287863590772317, + "loss": 2.9509, + "theoretical_loss": 3.761865245480371, + "tokens_seen": 733050880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039286860581745235, + "loss": 2.9416, + "theoretical_loss": 3.7618313793057117, + "tokens_seen": 733116416 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039285857572718153, + "loss": 2.9895, + "theoretical_loss": 3.761797517005933, + "tokens_seen": 733181952 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039284854563691077, + "loss": 2.6667, + "theoretical_loss": 3.7617636585802434, + "tokens_seen": 733247488 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003928385155466399, + "loss": 2.8332, + "theoretical_loss": 3.7617298040278557, + "tokens_seen": 733313024 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039282848545636913, + "loss": 2.8072, + "theoretical_loss": 3.761695953347979, + "tokens_seen": 733378560 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039281845536609825, + "loss": 2.8533, + "theoretical_loss": 3.7616621065398257, + "tokens_seen": 733444096 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003928084252758275, + "loss": 2.8354, + "theoretical_loss": 3.761628263602606, + "tokens_seen": 733509632 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039279839518555667, + "loss": 2.695, + "theoretical_loss": 3.7615944245355317, + "tokens_seen": 733575168 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039278836509528585, + "loss": 2.7853, + "theoretical_loss": 3.7615605893378152, + "tokens_seen": 733640704 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039277833500501503, + "loss": 2.8961, + "theoretical_loss": 3.7615267580086673, + "tokens_seen": 733706240 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039276830491474427, + "loss": 3.0298, + "theoretical_loss": 3.761492930547301, + "tokens_seen": 733771776 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003927582748244734, + "loss": 2.914, + "theoretical_loss": 3.7614591069529286, + "tokens_seen": 733837312 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039274824473420263, + "loss": 2.8187, + "theoretical_loss": 3.761425287224763, + "tokens_seen": 733902848 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039273821464393176, + "loss": 2.7914, + "theoretical_loss": 3.7613914713620167, + "tokens_seen": 733968384 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 858482, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.96612548828125, + "objective/train/theoretical_loss": 3.7613745648799295, + "objective/train/tokens_used": 754461152, + "theoretical_loss": 3.7613745648799295, + "tokens_seen": 734001152 + }, + { + "epoch": 2.04, + "learning_rate": 0.000392728184553661, + "loss": 2.6728, + "theoretical_loss": 3.7613576593639024, + "tokens_seen": 734033920 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003927181544633902, + "loss": 2.9194, + "theoretical_loss": 3.7613238512296348, + "tokens_seen": 734099456 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039270812437311936, + "loss": 2.7902, + "theoretical_loss": 3.7612900469584263, + "tokens_seen": 734164992 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039269809428284854, + "loss": 3.0611, + "theoretical_loss": 3.7612562465494905, + "tokens_seen": 734230528 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003926880641925777, + "loss": 2.6482, + "theoretical_loss": 3.7612224500020424, + "tokens_seen": 734296064 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003926780341023069, + "loss": 2.7174, + "theoretical_loss": 3.7611886573152957, + "tokens_seen": 734361600 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039266800401203613, + "loss": 2.6784, + "theoretical_loss": 3.7611548684884655, + "tokens_seen": 734427136 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039265797392176526, + "loss": 2.886, + "theoretical_loss": 3.7611210835207656, + "tokens_seen": 734492672 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003926479438314945, + "loss": 2.9471, + "theoretical_loss": 3.7610873024114113, + "tokens_seen": 734558208 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039263791374122373, + "loss": 2.8858, + "theoretical_loss": 3.7610535251596184, + "tokens_seen": 734623744 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039262788365095286, + "loss": 2.685, + "theoretical_loss": 3.7610197517646022, + "tokens_seen": 734689280 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003926178535606821, + "loss": 2.6936, + "theoretical_loss": 3.760985982225577, + "tokens_seen": 734754816 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003926078234704112, + "loss": 2.6473, + "theoretical_loss": 3.76095221654176, + "tokens_seen": 734820352 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039259779338014046, + "loss": 2.7335, + "theoretical_loss": 3.7609184547123675, + "tokens_seen": 734885888 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039258776328986964, + "loss": 2.9301, + "theoretical_loss": 3.760884696736615, + "tokens_seen": 734951424 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003925777331995988, + "loss": 2.7825, + "theoretical_loss": 3.7608509426137195, + "tokens_seen": 735016960 + }, + { + "epoch": 2.04, + "learning_rate": 0.000392567703109328, + "loss": 2.8205, + "theoretical_loss": 3.7608171923428975, + "tokens_seen": 735082496 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003925576730190572, + "loss": 2.8997, + "theoretical_loss": 3.7607834459233667, + "tokens_seen": 735148032 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039254764292878636, + "loss": 2.6886, + "theoretical_loss": 3.7607497033543433, + "tokens_seen": 735213568 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003925376128385156, + "loss": 2.9114, + "theoretical_loss": 3.7607159646350454, + "tokens_seen": 735279104 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003925275827482447, + "loss": 2.9142, + "theoretical_loss": 3.760682229764691, + "tokens_seen": 735344640 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039251755265797396, + "loss": 3.1446, + "theoretical_loss": 3.7606484987424973, + "tokens_seen": 735410176 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003925075225677031, + "loss": 2.8202, + "theoretical_loss": 3.760614771567683, + "tokens_seen": 735475712 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003924974924774323, + "loss": 2.784, + "theoretical_loss": 3.760581048239466, + "tokens_seen": 735541248 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003924874623871615, + "loss": 2.8307, + "theoretical_loss": 3.760547328757066, + "tokens_seen": 735606784 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 859784, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6183910369873047, + "objective/train/theoretical_loss": 3.7605304704578026, + "objective/train/tokens_used": 756099552, + "theoretical_loss": 3.7605304704578026, + "tokens_seen": 735639552 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003924774322968907, + "loss": 2.844, + "theoretical_loss": 3.7605136131197003, + "tokens_seen": 735672320 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039246740220661987, + "loss": 3.1915, + "theoretical_loss": 3.7604799013265895, + "tokens_seen": 735737856 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003924573721163491, + "loss": 3.0552, + "theoretical_loss": 3.760446193376952, + "tokens_seen": 735803392 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039244734202607823, + "loss": 2.871, + "theoretical_loss": 3.7604124892700073, + "tokens_seen": 735868928 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039243731193580746, + "loss": 2.6401, + "theoretical_loss": 3.7603787890049762, + "tokens_seen": 735934464 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003924272818455366, + "loss": 2.9156, + "theoretical_loss": 3.760345092581077, + "tokens_seen": 736000000 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003924172517552658, + "loss": 2.7434, + "theoretical_loss": 3.7603113999975317, + "tokens_seen": 736065536 + }, + { + "epoch": 2.04, + "learning_rate": 0.000392407221664995, + "loss": 2.6808, + "theoretical_loss": 3.7602777112535595, + "tokens_seen": 736131072 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003923971915747242, + "loss": 2.8149, + "theoretical_loss": 3.7602440263483814, + "tokens_seen": 736196608 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039238716148445337, + "loss": 2.8053, + "theoretical_loss": 3.7602103452812186, + "tokens_seen": 736262144 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039237713139418255, + "loss": 2.7973, + "theoretical_loss": 3.760176668051291, + "tokens_seen": 736327680 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039236710130391173, + "loss": 2.9008, + "theoretical_loss": 3.760142994657822, + "tokens_seen": 736393216 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039235707121364097, + "loss": 3.034, + "theoretical_loss": 3.760109325100032, + "tokens_seen": 736458752 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003923470411233701, + "loss": 2.8678, + "theoretical_loss": 3.7600756593771427, + "tokens_seen": 736524288 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039233701103309933, + "loss": 2.8121, + "theoretical_loss": 3.7600419974883765, + "tokens_seen": 736589824 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039232698094282846, + "loss": 2.7452, + "theoretical_loss": 3.7600083394329555, + "tokens_seen": 736655360 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003923169508525577, + "loss": 2.7862, + "theoretical_loss": 3.7599746852101026, + "tokens_seen": 736720896 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039230692076228687, + "loss": 2.8344, + "theoretical_loss": 3.7599410348190396, + "tokens_seen": 736786432 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039229689067201605, + "loss": 2.8182, + "theoretical_loss": 3.7599073882589904, + "tokens_seen": 736851968 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039228686058174523, + "loss": 2.7019, + "theoretical_loss": 3.759873745529178, + "tokens_seen": 736917504 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039227683049147447, + "loss": 2.7842, + "theoretical_loss": 3.7598401066288254, + "tokens_seen": 736983040 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003922668004012036, + "loss": 2.854, + "theoretical_loss": 3.759806471557156, + "tokens_seen": 737048576 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039225677031093283, + "loss": 2.7951, + "theoretical_loss": 3.7597728403133948, + "tokens_seen": 737114112 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039224674022066196, + "loss": 3.0258, + "theoretical_loss": 3.759739212896765, + "tokens_seen": 737179648 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003922367101303912, + "loss": 2.6118, + "theoretical_loss": 3.759705589306491, + "tokens_seen": 737245184 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 860487, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.353097438812256, + "objective/train/theoretical_loss": 3.759688778945995, + "objective/train/tokens_used": 757737952, + "theoretical_loss": 3.759688778945995, + "tokens_seen": 737277952 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003922266800401204, + "loss": 2.6294, + "theoretical_loss": 3.759671969541797, + "tokens_seen": 737310720 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039221664994984956, + "loss": 2.9801, + "theoretical_loss": 3.759638353601909, + "tokens_seen": 737376256 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039220661985957874, + "loss": 2.6816, + "theoretical_loss": 3.7596047414860507, + "tokens_seen": 737441792 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003921965897693079, + "loss": 2.841, + "theoretical_loss": 3.7595711331934476, + "tokens_seen": 737507328 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003921865596790371, + "loss": 2.8772, + "theoretical_loss": 3.7595375287233255, + "tokens_seen": 737572864 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039217652958876633, + "loss": 2.7096, + "theoretical_loss": 3.75950392807491, + "tokens_seen": 737638400 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039216649949849546, + "loss": 2.7568, + "theoretical_loss": 3.7594703312474262, + "tokens_seen": 737703936 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003921564694082247, + "loss": 2.7315, + "theoretical_loss": 3.7594367382401015, + "tokens_seen": 737769472 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003921464393179539, + "loss": 3.0837, + "theoretical_loss": 3.759403149052161, + "tokens_seen": 737835008 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039213640922768306, + "loss": 2.7529, + "theoretical_loss": 3.7593695636828324, + "tokens_seen": 737900544 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039212637913741224, + "loss": 2.9824, + "theoretical_loss": 3.7593359821313417, + "tokens_seen": 737966080 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003921163490471414, + "loss": 2.8387, + "theoretical_loss": 3.7593024043969163, + "tokens_seen": 738031616 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003921063189568706, + "loss": 2.9527, + "theoretical_loss": 3.759268830478783, + "tokens_seen": 738097152 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039209628886659984, + "loss": 2.7311, + "theoretical_loss": 3.7592352603761703, + "tokens_seen": 738162688 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039208625877632896, + "loss": 2.7515, + "theoretical_loss": 3.7592016940883046, + "tokens_seen": 738228224 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003920762286860582, + "loss": 3.0343, + "theoretical_loss": 3.7591681316144143, + "tokens_seen": 738293760 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003920661985957873, + "loss": 2.7702, + "theoretical_loss": 3.759134572953728, + "tokens_seen": 738359296 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039205616850551656, + "loss": 2.8235, + "theoretical_loss": 3.7591010181054734, + "tokens_seen": 738424832 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039204613841524574, + "loss": 2.6255, + "theoretical_loss": 3.7590674670688795, + "tokens_seen": 738490368 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003920361083249749, + "loss": 2.8931, + "theoretical_loss": 3.759033919843175, + "tokens_seen": 738555904 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003920260782347041, + "loss": 2.9703, + "theoretical_loss": 3.759000376427589, + "tokens_seen": 738621440 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003920160481444333, + "loss": 2.9791, + "theoretical_loss": 3.75896683682135, + "tokens_seen": 738686976 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039200601805416247, + "loss": 2.8601, + "theoretical_loss": 3.758933301023689, + "tokens_seen": 738752512 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003919959879638917, + "loss": 2.8717, + "theoretical_loss": 3.758899769033835, + "tokens_seen": 738818048 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039198595787362083, + "loss": 3.0159, + "theoretical_loss": 3.7588662408510167, + "tokens_seen": 738883584 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 861921, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8210134506225586, + "objective/train/theoretical_loss": 3.758849478187006, + "objective/train/tokens_used": 759376352, + "theoretical_loss": 3.758849478187006, + "tokens_seen": 738916352 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039197592778335007, + "loss": 2.9699, + "theoretical_loss": 3.7588327164744664, + "tokens_seen": 738949120 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039196589769307925, + "loss": 2.7979, + "theoretical_loss": 3.7587991959034133, + "tokens_seen": 739014656 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039195586760280843, + "loss": 2.8731, + "theoretical_loss": 3.758765679137088, + "tokens_seen": 739080192 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003919458375125376, + "loss": 2.8623, + "theoretical_loss": 3.7587321661747213, + "tokens_seen": 739145728 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003919358074222668, + "loss": 2.7755, + "theoretical_loss": 3.758698657015545, + "tokens_seen": 739211264 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039192577733199597, + "loss": 2.9633, + "theoretical_loss": 3.7586651516587892, + "tokens_seen": 739276800 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003919157472417252, + "loss": 2.9176, + "theoretical_loss": 3.758631650103686, + "tokens_seen": 739342336 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039190571715145433, + "loss": 2.8127, + "theoretical_loss": 3.7585981523494674, + "tokens_seen": 739407872 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039189568706118357, + "loss": 2.8019, + "theoretical_loss": 3.7585646583953656, + "tokens_seen": 739473408 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039188565697091275, + "loss": 2.7282, + "theoretical_loss": 3.7585311682406113, + "tokens_seen": 739538944 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039187562688064193, + "loss": 2.8132, + "theoretical_loss": 3.758497681884439, + "tokens_seen": 739604480 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039186559679037117, + "loss": 3.1263, + "theoretical_loss": 3.7584641993260792, + "tokens_seen": 739670016 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003918555667001003, + "loss": 2.7582, + "theoretical_loss": 3.758430720564766, + "tokens_seen": 739735552 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039184553660982953, + "loss": 2.845, + "theoretical_loss": 3.758397245599732, + "tokens_seen": 739801088 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039183550651955866, + "loss": 2.7409, + "theoretical_loss": 3.7583637744302107, + "tokens_seen": 739866624 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003918254764292879, + "loss": 2.9599, + "theoretical_loss": 3.758330307055436, + "tokens_seen": 739932160 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039181544633901707, + "loss": 2.9472, + "theoretical_loss": 3.758296843474641, + "tokens_seen": 739997696 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039180541624874625, + "loss": 2.7435, + "theoretical_loss": 3.7582633836870603, + "tokens_seen": 740063232 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039179538615847543, + "loss": 2.6432, + "theoretical_loss": 3.758229927691927, + "tokens_seen": 740128768 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039178535606820467, + "loss": 3.0198, + "theoretical_loss": 3.758196475488476, + "tokens_seen": 740194304 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003917753259779338, + "loss": 2.8823, + "theoretical_loss": 3.758163027075943, + "tokens_seen": 740259840 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039176529588766303, + "loss": 2.7262, + "theoretical_loss": 3.758129582453561, + "tokens_seen": 740325376 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039175526579739216, + "loss": 2.9131, + "theoretical_loss": 3.758096141620566, + "tokens_seen": 740390912 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003917452357071214, + "loss": 2.8228, + "theoretical_loss": 3.7580627045761936, + "tokens_seen": 740456448 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003917352056168506, + "loss": 2.7725, + "theoretical_loss": 3.7580292713196792, + "tokens_seen": 740521984 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 862761, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8765251636505127, + "objective/train/theoretical_loss": 3.7580125561116295, + "objective/train/tokens_used": 761014752, + "theoretical_loss": 3.7580125561116295, + "tokens_seen": 740554752 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039172517552657976, + "loss": 2.8848, + "theoretical_loss": 3.757995841850258, + "tokens_seen": 740587520 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039171514543630894, + "loss": 2.6698, + "theoretical_loss": 3.7579624161671665, + "tokens_seen": 740653056 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003917051153460381, + "loss": 2.8602, + "theoretical_loss": 3.75792899426964, + "tokens_seen": 740718592 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003916950852557673, + "loss": 2.8744, + "theoretical_loss": 3.757895576156916, + "tokens_seen": 740784128 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039168505516549653, + "loss": 2.9496, + "theoretical_loss": 3.75786216182823, + "tokens_seen": 740849664 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039167502507522566, + "loss": 2.9308, + "theoretical_loss": 3.757828751282821, + "tokens_seen": 740915200 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003916649949849549, + "loss": 3.0178, + "theoretical_loss": 3.757795344519923, + "tokens_seen": 740980736 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003916549648946841, + "loss": 2.8622, + "theoretical_loss": 3.7577619415387753, + "tokens_seen": 741046272 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039164493480441326, + "loss": 2.8771, + "theoretical_loss": 3.757728542338615, + "tokens_seen": 741111808 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039163490471414244, + "loss": 2.652, + "theoretical_loss": 3.75769514691868, + "tokens_seen": 741177344 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003916248746238716, + "loss": 2.9017, + "theoretical_loss": 3.7576617552782077, + "tokens_seen": 741242880 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003916148445336008, + "loss": 2.9632, + "theoretical_loss": 3.757628367416437, + "tokens_seen": 741308416 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039160481444333004, + "loss": 2.6733, + "theoretical_loss": 3.757594983332605, + "tokens_seen": 741373952 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039159478435305916, + "loss": 2.6975, + "theoretical_loss": 3.7575616030259518, + "tokens_seen": 741439488 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003915847542627884, + "loss": 2.9127, + "theoretical_loss": 3.7575282264957153, + "tokens_seen": 741505024 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003915747241725175, + "loss": 2.8389, + "theoretical_loss": 3.7574948537411355, + "tokens_seen": 741570560 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039156469408224676, + "loss": 3.0937, + "theoretical_loss": 3.7574614847614503, + "tokens_seen": 741636096 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039155466399197594, + "loss": 2.979, + "theoretical_loss": 3.7574281195559003, + "tokens_seen": 741701632 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003915446339017051, + "loss": 2.5656, + "theoretical_loss": 3.7573947581237244, + "tokens_seen": 741767168 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003915346038114343, + "loss": 2.6176, + "theoretical_loss": 3.757361400464163, + "tokens_seen": 741832704 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003915245737211635, + "loss": 2.7122, + "theoretical_loss": 3.7573280465764567, + "tokens_seen": 741898240 + }, + { + "epoch": 2.04, + "learning_rate": 0.00039151454363089267, + "loss": 3.0246, + "theoretical_loss": 3.7572946964598453, + "tokens_seen": 741963776 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003915045135406219, + "loss": 2.7568, + "theoretical_loss": 3.757261350113569, + "tokens_seen": 742029312 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039149448345035103, + "loss": 2.8737, + "theoretical_loss": 3.7572280075368694, + "tokens_seen": 742094848 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039148445336008027, + "loss": 2.8704, + "theoretical_loss": 3.757194668728987, + "tokens_seen": 742160384 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 864185, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.501842498779297, + "objective/train/theoretical_loss": 3.757178000738115, + "objective/train/tokens_used": 762653152, + "theoretical_loss": 3.757178000738115, + "tokens_seen": 742193152 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039147442326980945, + "loss": 3.1039, + "theoretical_loss": 3.7571613336891634, + "tokens_seen": 742225920 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039146439317953863, + "loss": 2.7548, + "theoretical_loss": 3.75712800241664, + "tokens_seen": 742291456 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003914543630892678, + "loss": 2.8036, + "theoretical_loss": 3.7570946749106584, + "tokens_seen": 742356992 + }, + { + "epoch": 2.05, + "learning_rate": 0.000391444332998997, + "loss": 2.584, + "theoretical_loss": 3.7570613511704605, + "tokens_seen": 742422528 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039143430290872617, + "loss": 2.9916, + "theoretical_loss": 3.7570280311952886, + "tokens_seen": 742488064 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003914242728184554, + "loss": 2.9727, + "theoretical_loss": 3.7569947149843843, + "tokens_seen": 742553600 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039141424272818453, + "loss": 2.9331, + "theoretical_loss": 3.7569614025369917, + "tokens_seen": 742619136 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039140421263791377, + "loss": 2.5328, + "theoretical_loss": 3.756928093852352, + "tokens_seen": 742684672 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003913941825476429, + "loss": 2.8034, + "theoretical_loss": 3.756894788929709, + "tokens_seen": 742750208 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039138415245737213, + "loss": 2.7063, + "theoretical_loss": 3.7568614877683064, + "tokens_seen": 742815744 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003913741223671013, + "loss": 2.8524, + "theoretical_loss": 3.7568281903673864, + "tokens_seen": 742881280 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003913640922768305, + "loss": 2.6954, + "theoretical_loss": 3.756794896726194, + "tokens_seen": 742946816 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003913540621865597, + "loss": 2.489, + "theoretical_loss": 3.7567616068439715, + "tokens_seen": 743012352 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039134403209628886, + "loss": 2.9102, + "theoretical_loss": 3.756728320719964, + "tokens_seen": 743077888 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039133400200601804, + "loss": 2.6978, + "theoretical_loss": 3.756695038353416, + "tokens_seen": 743143424 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039132397191574727, + "loss": 2.5701, + "theoretical_loss": 3.7566617597435723, + "tokens_seen": 743208960 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003913139418254764, + "loss": 2.5741, + "theoretical_loss": 3.7566284848896765, + "tokens_seen": 743274496 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039130391173520563, + "loss": 2.8461, + "theoretical_loss": 3.756595213790974, + "tokens_seen": 743340032 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003912938816449348, + "loss": 2.6907, + "theoretical_loss": 3.756561946446711, + "tokens_seen": 743405568 + }, + { + "epoch": 2.05, + "learning_rate": 0.000391283851554664, + "loss": 2.9518, + "theoretical_loss": 3.756528682856131, + "tokens_seen": 743471104 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003912738214643932, + "loss": 2.6059, + "theoretical_loss": 3.7564954230184817, + "tokens_seen": 743536640 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039126379137412236, + "loss": 2.8419, + "theoretical_loss": 3.756462166933008, + "tokens_seen": 743602176 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039125376128385154, + "loss": 2.7538, + "theoretical_loss": 3.7564289145989553, + "tokens_seen": 743667712 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003912437311935808, + "loss": 2.7312, + "theoretical_loss": 3.756395666015571, + "tokens_seen": 743733248 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003912337011033099, + "loss": 3.0143, + "theoretical_loss": 3.756362421182101, + "tokens_seen": 743798784 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 864916, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2605478763580322, + "objective/train/theoretical_loss": 3.7563458001713483, + "objective/train/tokens_used": 764291552, + "theoretical_loss": 3.7563458001713483, + "tokens_seen": 743831552 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039122367101303914, + "loss": 2.9468, + "theoretical_loss": 3.756329180097792, + "tokens_seen": 743864320 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039121364092276826, + "loss": 2.9104, + "theoretical_loss": 3.756295942761891, + "tokens_seen": 743929856 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003912036108324975, + "loss": 2.8218, + "theoretical_loss": 3.7562627091736456, + "tokens_seen": 743995392 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003911935807422267, + "loss": 2.7912, + "theoretical_loss": 3.7562294793323026, + "tokens_seen": 744060928 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039118355065195586, + "loss": 2.9189, + "theoretical_loss": 3.75619625323711, + "tokens_seen": 744126464 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039117352056168504, + "loss": 2.9026, + "theoretical_loss": 3.7561630308873153, + "tokens_seen": 744192000 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003911634904714143, + "loss": 2.9202, + "theoretical_loss": 3.756129812282167, + "tokens_seen": 744257536 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003911534603811434, + "loss": 2.7454, + "theoretical_loss": 3.7560965974209126, + "tokens_seen": 744323072 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039114343029087264, + "loss": 3.0462, + "theoretical_loss": 3.756063386302801, + "tokens_seen": 744388608 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003911334002006018, + "loss": 2.9229, + "theoretical_loss": 3.7560301789270807, + "tokens_seen": 744454144 + }, + { + "epoch": 2.05, + "learning_rate": 0.000391123370110331, + "loss": 2.6486, + "theoretical_loss": 3.755996975293001, + "tokens_seen": 744519680 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039111334002006024, + "loss": 2.7592, + "theoretical_loss": 3.7559637753998105, + "tokens_seen": 744585216 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039110330992978936, + "loss": 2.579, + "theoretical_loss": 3.755930579246759, + "tokens_seen": 744650752 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003910932798395186, + "loss": 2.6565, + "theoretical_loss": 3.7558973868330954, + "tokens_seen": 744716288 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039108324974924773, + "loss": 2.6632, + "theoretical_loss": 3.7558641981580703, + "tokens_seen": 744781824 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039107321965897696, + "loss": 2.5718, + "theoretical_loss": 3.7558310132209334, + "tokens_seen": 744847360 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039106318956870614, + "loss": 2.6674, + "theoretical_loss": 3.755797832020934, + "tokens_seen": 744912896 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003910531594784353, + "loss": 2.8298, + "theoretical_loss": 3.755764654557324, + "tokens_seen": 744978432 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003910431293881645, + "loss": 3.0806, + "theoretical_loss": 3.755731480829353, + "tokens_seen": 745043968 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003910330992978937, + "loss": 2.6907, + "theoretical_loss": 3.755698310836272, + "tokens_seen": 745109504 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039102306920762287, + "loss": 2.843, + "theoretical_loss": 3.7556651445773324, + "tokens_seen": 745175040 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003910130391173521, + "loss": 2.8829, + "theoretical_loss": 3.755631982051785, + "tokens_seen": 745240576 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039100300902708123, + "loss": 2.7557, + "theoretical_loss": 3.7555988232588815, + "tokens_seen": 745306112 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039099297893681047, + "loss": 2.8817, + "theoretical_loss": 3.7555656681978737, + "tokens_seen": 745371648 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039098294884653965, + "loss": 2.8561, + "theoretical_loss": 3.755532516868014, + "tokens_seen": 745437184 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 865470, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9077301025390625, + "objective/train/theoretical_loss": 3.75551594260203, + "objective/train/tokens_used": 765929952, + "theoretical_loss": 3.75551594260203, + "tokens_seen": 745469952 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039097291875626883, + "loss": 2.9101, + "theoretical_loss": 3.755499369268553, + "tokens_seen": 745502720 + }, + { + "epoch": 2.05, + "learning_rate": 0.000390962888665998, + "loss": 2.9018, + "theoretical_loss": 3.7554662253987443, + "tokens_seen": 745568256 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003909528585757272, + "loss": 2.7832, + "theoretical_loss": 3.755433085257841, + "tokens_seen": 745633792 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039094282848545637, + "loss": 2.9756, + "theoretical_loss": 3.755399948845094, + "tokens_seen": 745699328 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003909327983951856, + "loss": 2.7584, + "theoretical_loss": 3.7553668161597584, + "tokens_seen": 745764864 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039092276830491473, + "loss": 2.886, + "theoretical_loss": 3.7553336872010856, + "tokens_seen": 745830400 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039091273821464397, + "loss": 3.0352, + "theoretical_loss": 3.7553005619683306, + "tokens_seen": 745895936 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003909027081243731, + "loss": 2.733, + "theoretical_loss": 3.755267440460746, + "tokens_seen": 745961472 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039089267803410233, + "loss": 2.7691, + "theoretical_loss": 3.7552343226775857, + "tokens_seen": 746027008 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003908826479438315, + "loss": 2.886, + "theoretical_loss": 3.7552012086181046, + "tokens_seen": 746092544 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003908726178535607, + "loss": 2.7298, + "theoretical_loss": 3.755168098281556, + "tokens_seen": 746158080 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003908625877632899, + "loss": 2.7824, + "theoretical_loss": 3.755134991667195, + "tokens_seen": 746223616 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039085255767301906, + "loss": 2.8606, + "theoretical_loss": 3.755101888774276, + "tokens_seen": 746289152 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039084252758274824, + "loss": 2.481, + "theoretical_loss": 3.755068789602054, + "tokens_seen": 746354688 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039083249749247747, + "loss": 2.5479, + "theoretical_loss": 3.755035694149784, + "tokens_seen": 746420224 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003908224674022066, + "loss": 2.9372, + "theoretical_loss": 3.7550026024167225, + "tokens_seen": 746485760 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039081243731193583, + "loss": 2.9928, + "theoretical_loss": 3.7549695144021236, + "tokens_seen": 746551296 + }, + { + "epoch": 2.05, + "learning_rate": 0.000390802407221665, + "loss": 2.8547, + "theoretical_loss": 3.7549364301052437, + "tokens_seen": 746616832 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003907923771313942, + "loss": 2.6554, + "theoretical_loss": 3.7549033495253386, + "tokens_seen": 746682368 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003907823470411234, + "loss": 2.6698, + "theoretical_loss": 3.754870272661665, + "tokens_seen": 746747904 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039077231695085256, + "loss": 2.8375, + "theoretical_loss": 3.754837199513479, + "tokens_seen": 746813440 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039076228686058174, + "loss": 2.6853, + "theoretical_loss": 3.7548041300800366, + "tokens_seen": 746878976 + }, + { + "epoch": 2.05, + "learning_rate": 0.000390752256770311, + "loss": 2.8241, + "theoretical_loss": 3.7547710643605963, + "tokens_seen": 746944512 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003907422266800401, + "loss": 2.5937, + "theoretical_loss": 3.7547380023544132, + "tokens_seen": 747010048 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039073219658976934, + "loss": 2.6594, + "theoretical_loss": 3.7547049440607467, + "tokens_seen": 747075584 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 865470, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7921359539031982, + "objective/train/theoretical_loss": 3.7546884163058745, + "objective/train/tokens_used": 767568352, + "theoretical_loss": 3.7546884163058745, + "tokens_seen": 747108352 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039072216649949846, + "loss": 2.9316, + "theoretical_loss": 3.7546718894788524, + "tokens_seen": 747141120 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003907121364092277, + "loss": 2.9271, + "theoretical_loss": 3.754638838607989, + "tokens_seen": 747206656 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003907021063189569, + "loss": 2.662, + "theoretical_loss": 3.754605791447415, + "tokens_seen": 747272192 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039069207622868606, + "loss": 2.846, + "theoretical_loss": 3.754572747996387, + "tokens_seen": 747337728 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039068204613841524, + "loss": 2.6688, + "theoretical_loss": 3.754539708254164, + "tokens_seen": 747403264 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003906720160481445, + "loss": 3.0716, + "theoretical_loss": 3.754506672220005, + "tokens_seen": 747468800 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003906619859578736, + "loss": 2.5733, + "theoretical_loss": 3.7544736398931686, + "tokens_seen": 747534336 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039065195586760284, + "loss": 2.8922, + "theoretical_loss": 3.754440611272914, + "tokens_seen": 747599872 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039064192577733197, + "loss": 2.7972, + "theoretical_loss": 3.7544075863585, + "tokens_seen": 747665408 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003906318956870612, + "loss": 2.6071, + "theoretical_loss": 3.7543745651491855, + "tokens_seen": 747730944 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003906218655967904, + "loss": 2.8335, + "theoretical_loss": 3.7543415476442314, + "tokens_seen": 747796480 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039061183550651957, + "loss": 3.1237, + "theoretical_loss": 3.754308533842897, + "tokens_seen": 747862016 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039060180541624875, + "loss": 2.6559, + "theoretical_loss": 3.7542755237444423, + "tokens_seen": 747927552 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039059177532597793, + "loss": 2.877, + "theoretical_loss": 3.754242517348127, + "tokens_seen": 747993088 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003905817452357071, + "loss": 2.6894, + "theoretical_loss": 3.754209514653213, + "tokens_seen": 748058624 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039057171514543634, + "loss": 3.0517, + "theoretical_loss": 3.7541765156589593, + "tokens_seen": 748124160 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039056168505516547, + "loss": 2.8098, + "theoretical_loss": 3.754143520364628, + "tokens_seen": 748189696 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003905516549648947, + "loss": 2.7217, + "theoretical_loss": 3.7541105287694796, + "tokens_seen": 748255232 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039054162487462383, + "loss": 2.8167, + "theoretical_loss": 3.7540775408727765, + "tokens_seen": 748320768 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039053159478435307, + "loss": 2.766, + "theoretical_loss": 3.7540445566737786, + "tokens_seen": 748386304 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039052156469408225, + "loss": 2.7154, + "theoretical_loss": 3.754011576171749, + "tokens_seen": 748451840 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039051153460381143, + "loss": 2.7892, + "theoretical_loss": 3.7539785993659494, + "tokens_seen": 748517376 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003905015045135406, + "loss": 2.6884, + "theoretical_loss": 3.7539456262556414, + "tokens_seen": 748582912 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039049147442326985, + "loss": 3.0133, + "theoretical_loss": 3.753912656840088, + "tokens_seen": 748648448 + }, + { + "epoch": 2.05, + "learning_rate": 0.000390481444332999, + "loss": 2.7864, + "theoretical_loss": 3.7538796911185512, + "tokens_seen": 748713984 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 866234, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.680988311767578, + "objective/train/theoretical_loss": 3.7538632096428093, + "objective/train/tokens_used": 769206752, + "theoretical_loss": 3.7538632096428093, + "tokens_seen": 748746752 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003904714142427282, + "loss": 2.649, + "theoretical_loss": 3.753846729090295, + "tokens_seen": 748779520 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039046138415245734, + "loss": 2.9356, + "theoretical_loss": 3.753813770754581, + "tokens_seen": 748845056 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039045135406218657, + "loss": 2.8491, + "theoretical_loss": 3.753780816110673, + "tokens_seen": 748910592 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039044132397191575, + "loss": 2.9241, + "theoretical_loss": 3.7537478651578358, + "tokens_seen": 748976128 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039043129388164493, + "loss": 2.7688, + "theoretical_loss": 3.7537149178953304, + "tokens_seen": 749041664 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003904212637913741, + "loss": 2.8314, + "theoretical_loss": 3.7536819743224226, + "tokens_seen": 749107200 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003904112337011033, + "loss": 2.9138, + "theoretical_loss": 3.753649034438376, + "tokens_seen": 749172736 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003904012036108325, + "loss": 2.713, + "theoretical_loss": 3.753616098242455, + "tokens_seen": 749238272 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003903911735205617, + "loss": 2.8089, + "theoretical_loss": 3.753583165733924, + "tokens_seen": 749303808 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003903811434302909, + "loss": 2.9077, + "theoretical_loss": 3.7535502369120475, + "tokens_seen": 749369344 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003903711133400201, + "loss": 2.924, + "theoretical_loss": 3.7535173117760907, + "tokens_seen": 749434880 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039036108324974926, + "loss": 2.7313, + "theoretical_loss": 3.753484390325319, + "tokens_seen": 749500416 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039035105315947844, + "loss": 2.7019, + "theoretical_loss": 3.753451472558997, + "tokens_seen": 749565952 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039034102306920767, + "loss": 2.9285, + "theoretical_loss": 3.7534185584763904, + "tokens_seen": 749631488 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003903309929789368, + "loss": 2.5482, + "theoretical_loss": 3.7533856480767662, + "tokens_seen": 749697024 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039032096288866603, + "loss": 2.7619, + "theoretical_loss": 3.753352741359389, + "tokens_seen": 749762560 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003903109327983952, + "loss": 2.9135, + "theoretical_loss": 3.7533198383235256, + "tokens_seen": 749828096 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003903009027081244, + "loss": 2.7497, + "theoretical_loss": 3.753286938968442, + "tokens_seen": 749893632 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003902908726178536, + "loss": 2.8409, + "theoretical_loss": 3.7532540432934054, + "tokens_seen": 749959168 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039028084252758276, + "loss": 2.9118, + "theoretical_loss": 3.7532211512976823, + "tokens_seen": 750024704 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039027081243731194, + "loss": 2.8197, + "theoretical_loss": 3.7531882629805393, + "tokens_seen": 750090240 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003902607823470412, + "loss": 2.7701, + "theoretical_loss": 3.753155378341245, + "tokens_seen": 750155776 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003902507522567703, + "loss": 2.9495, + "theoretical_loss": 3.7531224973790653, + "tokens_seen": 750221312 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039024072216649954, + "loss": 2.7622, + "theoretical_loss": 3.7530896200932684, + "tokens_seen": 750286848 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039023069207622866, + "loss": 2.7241, + "theoretical_loss": 3.7530567464831233, + "tokens_seen": 750352384 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 867529, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8023226261138916, + "objective/train/theoretical_loss": 3.75304031105619, + "objective/train/tokens_used": 770845152, + "theoretical_loss": 3.75304031105619, + "tokens_seen": 750385152 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003902206619859579, + "loss": 2.8509, + "theoretical_loss": 3.7530238765478963, + "tokens_seen": 750417920 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003902106318956871, + "loss": 2.8771, + "theoretical_loss": 3.7529910102868564, + "tokens_seen": 750483456 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039020060180541626, + "loss": 2.7444, + "theoretical_loss": 3.752958147699272, + "tokens_seen": 750548992 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039019057171514544, + "loss": 2.7908, + "theoretical_loss": 3.7529252887844127, + "tokens_seen": 750614528 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003901805416248747, + "loss": 2.9501, + "theoretical_loss": 3.7528924335415463, + "tokens_seen": 750680064 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003901705115346038, + "loss": 2.8367, + "theoretical_loss": 3.7528595819699424, + "tokens_seen": 750745600 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039016048144433304, + "loss": 2.6838, + "theoretical_loss": 3.752826734068871, + "tokens_seen": 750811136 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039015045135406217, + "loss": 2.7132, + "theoretical_loss": 3.7527938898376, + "tokens_seen": 750876672 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003901404212637914, + "loss": 2.7584, + "theoretical_loss": 3.752761049275401, + "tokens_seen": 750942208 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003901303911735206, + "loss": 2.8268, + "theoretical_loss": 3.7527282123815424, + "tokens_seen": 751007744 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039012036108324977, + "loss": 3.2001, + "theoretical_loss": 3.7526953791552953, + "tokens_seen": 751073280 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039011033099297895, + "loss": 2.7938, + "theoretical_loss": 3.7526625495959296, + "tokens_seen": 751138816 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039010030090270813, + "loss": 2.8428, + "theoretical_loss": 3.752629723702717, + "tokens_seen": 751204352 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003900902708124373, + "loss": 2.7589, + "theoretical_loss": 3.752596901474927, + "tokens_seen": 751269888 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039008024072216654, + "loss": 2.8637, + "theoretical_loss": 3.7525640829118307, + "tokens_seen": 751335424 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039007021063189567, + "loss": 2.7897, + "theoretical_loss": 3.7525312680127003, + "tokens_seen": 751400960 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003900601805416249, + "loss": 2.9186, + "theoretical_loss": 3.7524984567768063, + "tokens_seen": 751466496 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039005015045135403, + "loss": 2.819, + "theoretical_loss": 3.7524656492034207, + "tokens_seen": 751532032 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039004012036108327, + "loss": 2.9226, + "theoretical_loss": 3.752432845291816, + "tokens_seen": 751597568 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039003009027081245, + "loss": 2.8002, + "theoretical_loss": 3.752400045041263, + "tokens_seen": 751663104 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039002006018054163, + "loss": 2.6275, + "theoretical_loss": 3.7523672484510344, + "tokens_seen": 751728640 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003900100300902708, + "loss": 3.0589, + "theoretical_loss": 3.752334455520403, + "tokens_seen": 751794176 + }, + { + "epoch": 2.05, + "learning_rate": 0.00039000000000000005, + "loss": 3.0157, + "theoretical_loss": 3.7523016662486413, + "tokens_seen": 751859712 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003899899699097292, + "loss": 2.6684, + "theoretical_loss": 3.752268880635022, + "tokens_seen": 751925248 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003899799398194584, + "loss": 3.0901, + "theoretical_loss": 3.752236098678819, + "tokens_seen": 751990784 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 868214, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.602017402648926, + "objective/train/theoretical_loss": 3.752219709072021, + "objective/train/tokens_used": 772483552, + "theoretical_loss": 3.752219709072021, + "tokens_seen": 752023552 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038996990972918754, + "loss": 2.9366, + "theoretical_loss": 3.7522033203793046, + "tokens_seen": 752056320 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038995987963891677, + "loss": 2.9654, + "theoretical_loss": 3.752170545735753, + "tokens_seen": 752121856 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038994984954864595, + "loss": 2.8639, + "theoretical_loss": 3.752137774747437, + "tokens_seen": 752187392 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038993981945837513, + "loss": 2.8474, + "theoretical_loss": 3.7521050074136317, + "tokens_seen": 752252928 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003899297893681043, + "loss": 2.9317, + "theoretical_loss": 3.7520722437336107, + "tokens_seen": 752318464 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003899197592778335, + "loss": 2.9675, + "theoretical_loss": 3.752039483706648, + "tokens_seen": 752384000 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003899097291875627, + "loss": 3.0981, + "theoretical_loss": 3.752006727332019, + "tokens_seen": 752449536 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003898996990972919, + "loss": 2.8965, + "theoretical_loss": 3.7519739746089975, + "tokens_seen": 752515072 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038988966900702104, + "loss": 2.7717, + "theoretical_loss": 3.751941225536859, + "tokens_seen": 752580608 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003898796389167503, + "loss": 2.8899, + "theoretical_loss": 3.7519084801148788, + "tokens_seen": 752646144 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003898696088264794, + "loss": 2.9638, + "theoretical_loss": 3.751875738342332, + "tokens_seen": 752711680 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038985957873620864, + "loss": 2.8868, + "theoretical_loss": 3.7518430002184937, + "tokens_seen": 752777216 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003898495486459378, + "loss": 2.9566, + "theoretical_loss": 3.751810265742641, + "tokens_seen": 752842752 + }, + { + "epoch": 2.05, + "learning_rate": 0.000389839518555667, + "loss": 2.8835, + "theoretical_loss": 3.7517775349140488, + "tokens_seen": 752908288 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003898294884653962, + "loss": 2.7896, + "theoretical_loss": 3.7517448077319937, + "tokens_seen": 752973824 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003898194583751254, + "loss": 2.6797, + "theoretical_loss": 3.751712084195752, + "tokens_seen": 753039360 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038980942828485454, + "loss": 2.8957, + "theoretical_loss": 3.7516793643046005, + "tokens_seen": 753104896 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003897993981945838, + "loss": 2.8789, + "theoretical_loss": 3.7516466480578154, + "tokens_seen": 753170432 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003897893681043129, + "loss": 2.9751, + "theoretical_loss": 3.751613935454675, + "tokens_seen": 753235968 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038977933801404214, + "loss": 3.1071, + "theoretical_loss": 3.7515812264944555, + "tokens_seen": 753301504 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003897693079237713, + "loss": 2.8043, + "theoretical_loss": 3.751548521176434, + "tokens_seen": 753367040 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003897592778335005, + "loss": 2.8944, + "theoretical_loss": 3.7515158194998897, + "tokens_seen": 753432576 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003897492477432297, + "loss": 2.8687, + "theoretical_loss": 3.7514831214640987, + "tokens_seen": 753498112 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038973921765295886, + "loss": 2.8928, + "theoretical_loss": 3.751450427068341, + "tokens_seen": 753563648 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038972918756268805, + "loss": 3.0386, + "theoretical_loss": 3.7514177363118923, + "tokens_seen": 753629184 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 869556, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.754945755004883, + "objective/train/theoretical_loss": 3.7514013922981846, + "objective/train/tokens_used": 774121952, + "theoretical_loss": 3.7514013922981846, + "tokens_seen": 753661952 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003897191574724173, + "loss": 2.9253, + "theoretical_loss": 3.7513850491940337, + "tokens_seen": 753694720 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003897091273821464, + "loss": 2.8255, + "theoretical_loss": 3.751352365714042, + "tokens_seen": 753760256 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038969909729187564, + "loss": 2.8426, + "theoretical_loss": 3.7513196858711972, + "tokens_seen": 753825792 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038968906720160477, + "loss": 2.9487, + "theoretical_loss": 3.751287009664778, + "tokens_seen": 753891328 + }, + { + "epoch": 2.05, + "learning_rate": 0.000389679037111334, + "loss": 2.7612, + "theoretical_loss": 3.751254337094063, + "tokens_seen": 753956864 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003896690070210632, + "loss": 3.0445, + "theoretical_loss": 3.7512216681583332, + "tokens_seen": 754022400 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038965897693079237, + "loss": 2.9938, + "theoretical_loss": 3.751189002856867, + "tokens_seen": 754087936 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038964894684052155, + "loss": 2.8911, + "theoretical_loss": 3.751156341188945, + "tokens_seen": 754153472 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003896389167502508, + "loss": 2.8629, + "theoretical_loss": 3.751123683153847, + "tokens_seen": 754219008 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038962888665997997, + "loss": 2.8401, + "theoretical_loss": 3.7510910287508534, + "tokens_seen": 754284544 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038961885656970915, + "loss": 2.8568, + "theoretical_loss": 3.751058377979245, + "tokens_seen": 754350080 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038960882647943833, + "loss": 2.8119, + "theoretical_loss": 3.751025730838302, + "tokens_seen": 754415616 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003895987963891675, + "loss": 2.9211, + "theoretical_loss": 3.7509930873273056, + "tokens_seen": 754481152 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038958876629889674, + "loss": 3.0088, + "theoretical_loss": 3.7509604474455376, + "tokens_seen": 754546688 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038957873620862587, + "loss": 2.8507, + "theoretical_loss": 3.750927811192278, + "tokens_seen": 754612224 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003895687061183551, + "loss": 2.8428, + "theoretical_loss": 3.750895178566809, + "tokens_seen": 754677760 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038955867602808423, + "loss": 2.857, + "theoretical_loss": 3.7508625495684136, + "tokens_seen": 754743296 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038954864593781347, + "loss": 2.9037, + "theoretical_loss": 3.7508299241963714, + "tokens_seen": 754808832 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038953861584754265, + "loss": 2.688, + "theoretical_loss": 3.7507973024499663, + "tokens_seen": 754874368 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038952858575727183, + "loss": 2.856, + "theoretical_loss": 3.7507646843284803, + "tokens_seen": 754939904 + }, + { + "epoch": 2.05, + "learning_rate": 0.000389518555667001, + "loss": 2.7452, + "theoretical_loss": 3.7507320698311952, + "tokens_seen": 755005440 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038950852557673025, + "loss": 2.6562, + "theoretical_loss": 3.7506994589573948, + "tokens_seen": 755070976 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003894984954864594, + "loss": 2.8558, + "theoretical_loss": 3.7506668517063613, + "tokens_seen": 755136512 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003894884653961886, + "loss": 3.0562, + "theoretical_loss": 3.750634248077379, + "tokens_seen": 755202048 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038947843530591774, + "loss": 2.8171, + "theoretical_loss": 3.75060164806973, + "tokens_seen": 755267584 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 870017, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.080669403076172, + "objective/train/theoretical_loss": 3.750585349423682, + "objective/train/tokens_used": 775760352, + "theoretical_loss": 3.750585349423682, + "tokens_seen": 755300352 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038946840521564697, + "loss": 2.8293, + "theoretical_loss": 3.7505690516826986, + "tokens_seen": 755333120 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038945837512537615, + "loss": 2.8135, + "theoretical_loss": 3.7505364589155685, + "tokens_seen": 755398656 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038944834503510533, + "loss": 2.8743, + "theoretical_loss": 3.7505038697676234, + "tokens_seen": 755464192 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003894383149448345, + "loss": 2.8677, + "theoretical_loss": 3.750471284238148, + "tokens_seen": 755529728 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003894282848545637, + "loss": 2.7691, + "theoretical_loss": 3.7504387023264263, + "tokens_seen": 755595264 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003894182547642929, + "loss": 2.9833, + "theoretical_loss": 3.750406124031743, + "tokens_seen": 755660800 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003894082246740221, + "loss": 2.9658, + "theoretical_loss": 3.750373549353383, + "tokens_seen": 755726336 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038939819458375124, + "loss": 2.7576, + "theoretical_loss": 3.750340978290632, + "tokens_seen": 755791872 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003893881644934805, + "loss": 2.7332, + "theoretical_loss": 3.7503084108427736, + "tokens_seen": 755857408 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003893781344032096, + "loss": 2.7703, + "theoretical_loss": 3.750275847009095, + "tokens_seen": 755922944 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038936810431293884, + "loss": 2.7189, + "theoretical_loss": 3.7502432867888804, + "tokens_seen": 755988480 + }, + { + "epoch": 2.05, + "learning_rate": 0.000389358074222668, + "loss": 2.7728, + "theoretical_loss": 3.7502107301814167, + "tokens_seen": 756054016 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003893480441323972, + "loss": 2.6577, + "theoretical_loss": 3.750178177185989, + "tokens_seen": 756119552 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003893380140421264, + "loss": 2.7496, + "theoretical_loss": 3.750145627801884, + "tokens_seen": 756185088 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003893279839518556, + "loss": 2.9146, + "theoretical_loss": 3.7501130820283883, + "tokens_seen": 756250624 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038931795386158474, + "loss": 2.7928, + "theoretical_loss": 3.750080539864789, + "tokens_seen": 756316160 + }, + { + "epoch": 2.05, + "learning_rate": 0.000389307923771314, + "loss": 2.8623, + "theoretical_loss": 3.750048001310372, + "tokens_seen": 756381696 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003892978936810431, + "loss": 2.5961, + "theoretical_loss": 3.750015466364424, + "tokens_seen": 756447232 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038928786359077234, + "loss": 2.6931, + "theoretical_loss": 3.7499829350262335, + "tokens_seen": 756512768 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003892778335005015, + "loss": 2.8814, + "theoretical_loss": 3.7499504072950876, + "tokens_seen": 756578304 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003892678034102307, + "loss": 2.7536, + "theoretical_loss": 3.7499178831702737, + "tokens_seen": 756643840 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003892577733199599, + "loss": 2.8347, + "theoretical_loss": 3.7498853626510797, + "tokens_seen": 756709376 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038924774322968906, + "loss": 2.7921, + "theoretical_loss": 3.7498528457367932, + "tokens_seen": 756774912 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038923771313941825, + "loss": 2.9477, + "theoretical_loss": 3.7498203324267037, + "tokens_seen": 756840448 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003892276830491475, + "loss": 3.0231, + "theoretical_loss": 3.7497878227200983, + "tokens_seen": 756905984 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 871516, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.127725601196289, + "objective/train/theoretical_loss": 3.74977156921788, + "objective/train/tokens_used": 777398752, + "theoretical_loss": 3.74977156921788, + "tokens_seen": 756938752 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003892176529588766, + "loss": 2.9308, + "theoretical_loss": 3.749755316616267, + "tokens_seen": 756971520 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038920762286860584, + "loss": 2.8961, + "theoretical_loss": 3.749722814114497, + "tokens_seen": 757037056 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038919759277833497, + "loss": 2.7532, + "theoretical_loss": 3.749690315214079, + "tokens_seen": 757102592 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003891875626880642, + "loss": 3.0619, + "theoretical_loss": 3.7496578199143014, + "tokens_seen": 757168128 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003891775325977934, + "loss": 2.6638, + "theoretical_loss": 3.749625328214454, + "tokens_seen": 757233664 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038916750250752257, + "loss": 2.9055, + "theoretical_loss": 3.7495928401138263, + "tokens_seen": 757299200 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038915747241725175, + "loss": 2.7871, + "theoretical_loss": 3.749560355611708, + "tokens_seen": 757364736 + }, + { + "epoch": 2.05, + "learning_rate": 0.000389147442326981, + "loss": 2.674, + "theoretical_loss": 3.74952787470739, + "tokens_seen": 757430272 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003891374122367101, + "loss": 2.8836, + "theoretical_loss": 3.7494953974001617, + "tokens_seen": 757495808 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038912738214643935, + "loss": 2.6771, + "theoretical_loss": 3.7494629236893138, + "tokens_seen": 757561344 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003891173520561685, + "loss": 2.981, + "theoretical_loss": 3.7494304535741376, + "tokens_seen": 757626880 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003891073219658977, + "loss": 3.0572, + "theoretical_loss": 3.7493979870539227, + "tokens_seen": 757692416 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003890972918756269, + "loss": 3.0317, + "theoretical_loss": 3.7493655241279615, + "tokens_seen": 757757952 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038908726178535607, + "loss": 2.9948, + "theoretical_loss": 3.7493330647955445, + "tokens_seen": 757823488 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038907723169508525, + "loss": 2.7901, + "theoretical_loss": 3.7493006090559637, + "tokens_seen": 757889024 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038906720160481443, + "loss": 2.6746, + "theoretical_loss": 3.74926815690851, + "tokens_seen": 757954560 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003890571715145436, + "loss": 3.0293, + "theoretical_loss": 3.7492357083524768, + "tokens_seen": 758020096 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038904714142427285, + "loss": 2.7772, + "theoretical_loss": 3.7492032633871544, + "tokens_seen": 758085632 + }, + { + "epoch": 2.05, + "learning_rate": 0.000389037111334002, + "loss": 2.8311, + "theoretical_loss": 3.749170822011836, + "tokens_seen": 758151168 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003890270812437312, + "loss": 2.8405, + "theoretical_loss": 3.749138384225814, + "tokens_seen": 758216704 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003890170511534604, + "loss": 2.9285, + "theoretical_loss": 3.7491059500283814, + "tokens_seen": 758282240 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003890070210631896, + "loss": 2.8327, + "theoretical_loss": 3.7490735194188307, + "tokens_seen": 758347776 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038899699097291876, + "loss": 2.7235, + "theoretical_loss": 3.749041092396455, + "tokens_seen": 758413312 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038898696088264794, + "loss": 2.8355, + "theoretical_loss": 3.7490086689605473, + "tokens_seen": 758478848 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003889769307923771, + "loss": 2.8129, + "theoretical_loss": 3.748976249110402, + "tokens_seen": 758544384 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 872211, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.623302936553955, + "objective/train/theoretical_loss": 3.748960040529769, + "objective/train/tokens_used": 779037152, + "theoretical_loss": 3.748960040529769, + "tokens_seen": 758577152 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038896690070210635, + "loss": 2.7462, + "theoretical_loss": 3.7489438328453124, + "tokens_seen": 758609920 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003889568706118355, + "loss": 2.9954, + "theoretical_loss": 3.748911420164572, + "tokens_seen": 758675456 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003889468405215647, + "loss": 2.8864, + "theoretical_loss": 3.7488790110674755, + "tokens_seen": 758740992 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038893681043129384, + "loss": 3.003, + "theoretical_loss": 3.7488466055533163, + "tokens_seen": 758806528 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003889267803410231, + "loss": 2.9352, + "theoretical_loss": 3.74881420362139, + "tokens_seen": 758872064 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038891675025075226, + "loss": 2.7863, + "theoretical_loss": 3.7487818052709905, + "tokens_seen": 758937600 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038890672016048144, + "loss": 2.9018, + "theoretical_loss": 3.7487494105014125, + "tokens_seen": 759003136 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003888966900702106, + "loss": 2.5538, + "theoretical_loss": 3.748717019311952, + "tokens_seen": 759068672 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003888866599799398, + "loss": 2.7822, + "theoretical_loss": 3.748684631701904, + "tokens_seen": 759134208 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038887662988966904, + "loss": 2.7246, + "theoretical_loss": 3.748652247670564, + "tokens_seen": 759199744 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003888665997993982, + "loss": 2.9017, + "theoretical_loss": 3.748619867217227, + "tokens_seen": 759265280 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003888565697091274, + "loss": 2.9178, + "theoretical_loss": 3.7485874903411895, + "tokens_seen": 759330816 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003888465396188566, + "loss": 2.8642, + "theoretical_loss": 3.7485551170417484, + "tokens_seen": 759396352 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003888365095285858, + "loss": 2.8042, + "theoretical_loss": 3.748522747318198, + "tokens_seen": 759461888 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038882647943831494, + "loss": 2.9502, + "theoretical_loss": 3.7484903811698365, + "tokens_seen": 759527424 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003888164493480442, + "loss": 2.9731, + "theoretical_loss": 3.7484580185959597, + "tokens_seen": 759592960 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003888064192577733, + "loss": 2.9822, + "theoretical_loss": 3.748425659595865, + "tokens_seen": 759658496 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038879638916750254, + "loss": 3.0593, + "theoretical_loss": 3.7483933041688493, + "tokens_seen": 759724032 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003887863590772317, + "loss": 2.6031, + "theoretical_loss": 3.74836095231421, + "tokens_seen": 759789568 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003887763289869609, + "loss": 2.9151, + "theoretical_loss": 3.7483286040312445, + "tokens_seen": 759855104 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003887662988966901, + "loss": 2.9807, + "theoretical_loss": 3.7482962593192504, + "tokens_seen": 759920640 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038875626880641927, + "loss": 2.795, + "theoretical_loss": 3.7482639181775252, + "tokens_seen": 759986176 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038874623871614845, + "loss": 2.9018, + "theoretical_loss": 3.748231580605368, + "tokens_seen": 760051712 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003887362086258777, + "loss": 2.9098, + "theoretical_loss": 3.748199246602076, + "tokens_seen": 760117248 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003887261785356068, + "loss": 2.8907, + "theoretical_loss": 3.7481669161669484, + "tokens_seen": 760182784 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 873451, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.31072998046875, + "objective/train/theoretical_loss": 3.748150752287227, + "objective/train/tokens_used": 780675552, + "theoretical_loss": 3.748150752287227, + "tokens_seen": 760215552 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038871614844533604, + "loss": 2.6467, + "theoretical_loss": 3.7481345892992834, + "tokens_seen": 760248320 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038870611835506517, + "loss": 2.7994, + "theoretical_loss": 3.7481022659983805, + "tokens_seen": 760313856 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003886960882647944, + "loss": 2.735, + "theoretical_loss": 3.7480699462635383, + "tokens_seen": 760379392 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003886860581745236, + "loss": 2.9806, + "theoretical_loss": 3.748037630094056, + "tokens_seen": 760444928 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038867602808425277, + "loss": 2.746, + "theoretical_loss": 3.748005317489233, + "tokens_seen": 760510464 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038866599799398195, + "loss": 2.6938, + "theoretical_loss": 3.74797300844837, + "tokens_seen": 760576000 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003886559679037112, + "loss": 2.9767, + "theoretical_loss": 3.747940702970766, + "tokens_seen": 760641536 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003886459378134403, + "loss": 2.9013, + "theoretical_loss": 3.7479084010557204, + "tokens_seen": 760707072 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038863590772316955, + "loss": 2.8877, + "theoretical_loss": 3.7478761027025347, + "tokens_seen": 760772608 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003886258776328987, + "loss": 2.7881, + "theoretical_loss": 3.7478438079105088, + "tokens_seen": 760838144 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003886158475426279, + "loss": 2.8087, + "theoretical_loss": 3.747811516678943, + "tokens_seen": 760903680 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003886058174523571, + "loss": 3.0713, + "theoretical_loss": 3.747779229007139, + "tokens_seen": 760969216 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038859578736208627, + "loss": 2.8158, + "theoretical_loss": 3.7477469448943976, + "tokens_seen": 761034752 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038858575727181545, + "loss": 2.8475, + "theoretical_loss": 3.7477146643400197, + "tokens_seen": 761100288 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038857572718154463, + "loss": 2.8359, + "theoretical_loss": 3.747682387343307, + "tokens_seen": 761165824 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003885656970912738, + "loss": 2.5743, + "theoretical_loss": 3.7476501139035605, + "tokens_seen": 761231360 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038855566700100305, + "loss": 2.9382, + "theoretical_loss": 3.747617844020083, + "tokens_seen": 761296896 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003885456369107322, + "loss": 2.8919, + "theoretical_loss": 3.7475855776921763, + "tokens_seen": 761362432 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003885356068204614, + "loss": 2.9157, + "theoretical_loss": 3.7475533149191422, + "tokens_seen": 761427968 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003885255767301906, + "loss": 2.7488, + "theoretical_loss": 3.7475210557002834, + "tokens_seen": 761493504 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003885155466399198, + "loss": 2.7937, + "theoretical_loss": 3.747488800034903, + "tokens_seen": 761559040 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038850551654964896, + "loss": 2.6836, + "theoretical_loss": 3.747456547922303, + "tokens_seen": 761624576 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038849548645937814, + "loss": 2.8512, + "theoretical_loss": 3.747424299361787, + "tokens_seen": 761690112 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003884854563691073, + "loss": 2.7041, + "theoretical_loss": 3.747392054352658, + "tokens_seen": 761755648 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038847542627883655, + "loss": 2.8625, + "theoretical_loss": 3.7473598128942194, + "tokens_seen": 761821184 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 874119, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.929797649383545, + "objective/train/theoretical_loss": 3.7473436934962914, + "objective/train/tokens_used": 782313952, + "theoretical_loss": 3.7473436934962914, + "tokens_seen": 761853952 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003884653961885657, + "loss": 2.8039, + "theoretical_loss": 3.747327574985775, + "tokens_seen": 761886720 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003884553660982949, + "loss": 2.8516, + "theoretical_loss": 3.7472953406266276, + "tokens_seen": 761952256 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038844533600802404, + "loss": 2.7222, + "theoretical_loss": 3.747263109816083, + "tokens_seen": 762017792 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003884353059177533, + "loss": 2.9182, + "theoretical_loss": 3.747230882553444, + "tokens_seen": 762083328 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038842527582748246, + "loss": 2.6432, + "theoretical_loss": 3.747198658838016, + "tokens_seen": 762148864 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038841524573721164, + "loss": 2.661, + "theoretical_loss": 3.747166438669103, + "tokens_seen": 762214400 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003884052156469408, + "loss": 2.8589, + "theoretical_loss": 3.747134222046009, + "tokens_seen": 762279936 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038839518555667, + "loss": 2.9799, + "theoretical_loss": 3.7471020089680405, + "tokens_seen": 762345472 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003883851554663992, + "loss": 2.8731, + "theoretical_loss": 3.7470697994345024, + "tokens_seen": 762411008 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003883751253761284, + "loss": 2.8761, + "theoretical_loss": 3.7470375934446984, + "tokens_seen": 762476544 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038836509528585755, + "loss": 3.1062, + "theoretical_loss": 3.7470053909979364, + "tokens_seen": 762542080 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003883550651955868, + "loss": 2.8829, + "theoretical_loss": 3.746973192093521, + "tokens_seen": 762607616 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038834503510531596, + "loss": 2.6719, + "theoretical_loss": 3.746940996730758, + "tokens_seen": 762673152 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038833500501504514, + "loss": 2.8273, + "theoretical_loss": 3.7469088049089545, + "tokens_seen": 762738688 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003883249749247743, + "loss": 2.8338, + "theoretical_loss": 3.7468766166274157, + "tokens_seen": 762804224 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003883149448345035, + "loss": 2.8078, + "theoretical_loss": 3.7468444318854486, + "tokens_seen": 762869760 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003883049147442327, + "loss": 2.7179, + "theoretical_loss": 3.7468122506823605, + "tokens_seen": 762935296 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003882948846539619, + "loss": 2.871, + "theoretical_loss": 3.746780073017457, + "tokens_seen": 763000832 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038828485456369105, + "loss": 2.8244, + "theoretical_loss": 3.746747898890047, + "tokens_seen": 763066368 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003882748244734203, + "loss": 2.6443, + "theoretical_loss": 3.7467157282994368, + "tokens_seen": 763131904 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003882647943831494, + "loss": 2.8023, + "theoretical_loss": 3.7466835612449336, + "tokens_seen": 763197440 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038825476429287865, + "loss": 2.7588, + "theoretical_loss": 3.746651397725846, + "tokens_seen": 763262976 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038824473420260783, + "loss": 2.9649, + "theoretical_loss": 3.7466192377414815, + "tokens_seen": 763328512 + }, + { + "epoch": 2.05, + "learning_rate": 0.000388234704112337, + "loss": 3.0581, + "theoretical_loss": 3.746587081291148, + "tokens_seen": 763394048 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003882246740220662, + "loss": 2.9048, + "theoretical_loss": 3.746554928374154, + "tokens_seen": 763459584 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 875620, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.904193878173828, + "objective/train/theoretical_loss": 3.7465388532404433, + "objective/train/tokens_used": 783952352, + "theoretical_loss": 3.7465388532404433, + "tokens_seen": 763492352 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038821464393179537, + "loss": 2.9749, + "theoretical_loss": 3.7465227789898083, + "tokens_seen": 763525120 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038820461384152455, + "loss": 2.8214, + "theoretical_loss": 3.7464906331374195, + "tokens_seen": 763590656 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003881945837512538, + "loss": 2.7125, + "theoretical_loss": 3.7464584908162957, + "tokens_seen": 763656192 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003881845536609829, + "loss": 2.9804, + "theoretical_loss": 3.746426352025747, + "tokens_seen": 763721728 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038817452357071215, + "loss": 2.7881, + "theoretical_loss": 3.7463942167650823, + "tokens_seen": 763787264 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038816449348044133, + "loss": 2.871, + "theoretical_loss": 3.746362085033611, + "tokens_seen": 763852800 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003881544633901705, + "loss": 3.2335, + "theoretical_loss": 3.746329956830643, + "tokens_seen": 763918336 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003881444332998997, + "loss": 2.8939, + "theoretical_loss": 3.746297832155488, + "tokens_seen": 763983872 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003881344032096289, + "loss": 2.8657, + "theoretical_loss": 3.7462657110074558, + "tokens_seen": 764049408 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003881243731193581, + "loss": 2.9157, + "theoretical_loss": 3.746233593385857, + "tokens_seen": 764114944 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003881143430290873, + "loss": 2.9297, + "theoretical_loss": 3.7462014792900016, + "tokens_seen": 764180480 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038810431293881647, + "loss": 2.9193, + "theoretical_loss": 3.746169368719201, + "tokens_seen": 764246016 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038809428284854565, + "loss": 2.6569, + "theoretical_loss": 3.7461372616727657, + "tokens_seen": 764311552 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038808425275827483, + "loss": 2.8671, + "theoretical_loss": 3.746105158150007, + "tokens_seen": 764377088 + }, + { + "epoch": 2.05, + "learning_rate": 0.000388074222668004, + "loss": 2.8244, + "theoretical_loss": 3.746073058150235, + "tokens_seen": 764442624 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038806419257773325, + "loss": 2.7417, + "theoretical_loss": 3.7460409616727626, + "tokens_seen": 764508160 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003880541624874624, + "loss": 2.8929, + "theoretical_loss": 3.7460088687169, + "tokens_seen": 764573696 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003880441323971916, + "loss": 2.9315, + "theoretical_loss": 3.7459767792819605, + "tokens_seen": 764639232 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003880341023069208, + "loss": 2.8129, + "theoretical_loss": 3.745944693367255, + "tokens_seen": 764704768 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038802407221665, + "loss": 2.9542, + "theoretical_loss": 3.7459126109720957, + "tokens_seen": 764770304 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038801404212637916, + "loss": 2.9054, + "theoretical_loss": 3.7458805320957955, + "tokens_seen": 764835840 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038800401203610834, + "loss": 2.7932, + "theoretical_loss": 3.745848456737667, + "tokens_seen": 764901376 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003879939819458375, + "loss": 2.8687, + "theoretical_loss": 3.7458163848970223, + "tokens_seen": 764966912 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038798395185556675, + "loss": 2.8049, + "theoretical_loss": 3.7457843165731752, + "tokens_seen": 765032448 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003879739217652959, + "loss": 2.9075, + "theoretical_loss": 3.7457522517654382, + "tokens_seen": 765097984 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 876301, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8112714290618896, + "objective/train/theoretical_loss": 3.7457362206798965, + "objective/train/tokens_used": 785590752, + "theoretical_loss": 3.7457362206798965, + "tokens_seen": 765130752 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003879638916750251, + "loss": 3.0486, + "theoretical_loss": 3.7457201904731248, + "tokens_seen": 765163520 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038795386158475424, + "loss": 2.8881, + "theoretical_loss": 3.745688132695549, + "tokens_seen": 765229056 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003879438314944835, + "loss": 3.0385, + "theoretical_loss": 3.745656078432024, + "tokens_seen": 765294592 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038793380140421266, + "loss": 2.8354, + "theoretical_loss": 3.7456240276818633, + "tokens_seen": 765360128 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038792377131394184, + "loss": 2.731, + "theoretical_loss": 3.7455919804443827, + "tokens_seen": 765425664 + }, + { + "epoch": 2.05, + "learning_rate": 0.000387913741223671, + "loss": 2.9255, + "theoretical_loss": 3.7455599367188945, + "tokens_seen": 765491200 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003879037111334002, + "loss": 2.842, + "theoretical_loss": 3.745527896504715, + "tokens_seen": 765556736 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003878936810431294, + "loss": 3.034, + "theoretical_loss": 3.7454958598011574, + "tokens_seen": 765622272 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003878836509528586, + "loss": 2.7549, + "theoretical_loss": 3.745463826607537, + "tokens_seen": 765687808 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038787362086258775, + "loss": 2.9639, + "theoretical_loss": 3.7454317969231696, + "tokens_seen": 765753344 + }, + { + "epoch": 2.05, + "learning_rate": 0.000387863590772317, + "loss": 2.8361, + "theoretical_loss": 3.74539977074737, + "tokens_seen": 765818880 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038785356068204616, + "loss": 2.8789, + "theoretical_loss": 3.7453677480794534, + "tokens_seen": 765884416 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038784353059177534, + "loss": 2.9602, + "theoretical_loss": 3.745335728918736, + "tokens_seen": 765949952 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003878335005015045, + "loss": 2.8919, + "theoretical_loss": 3.745303713264533, + "tokens_seen": 766015488 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003878234704112337, + "loss": 2.8526, + "theoretical_loss": 3.745271701116161, + "tokens_seen": 766081024 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003878134403209629, + "loss": 3.0369, + "theoretical_loss": 3.7452396924729356, + "tokens_seen": 766146560 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003878034102306921, + "loss": 2.8223, + "theoretical_loss": 3.745207687334174, + "tokens_seen": 766212096 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038779338014042125, + "loss": 2.8681, + "theoretical_loss": 3.745175685699193, + "tokens_seen": 766277632 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003877833500501505, + "loss": 2.7303, + "theoretical_loss": 3.745143687567308, + "tokens_seen": 766343168 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003877733199598796, + "loss": 2.9516, + "theoretical_loss": 3.7451116929378374, + "tokens_seen": 766408704 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038776328986960885, + "loss": 2.7378, + "theoretical_loss": 3.7450797018100976, + "tokens_seen": 766474240 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038775325977933803, + "loss": 2.6424, + "theoretical_loss": 3.745047714183406, + "tokens_seen": 766539776 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003877432296890672, + "loss": 2.5332, + "theoretical_loss": 3.7450157300570814, + "tokens_seen": 766605312 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003877331995987964, + "loss": 2.8007, + "theoretical_loss": 3.74498374943044, + "tokens_seen": 766670848 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038772316950852557, + "loss": 2.8851, + "theoretical_loss": 3.7449517723028007, + "tokens_seen": 766736384 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 877352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.128386974334717, + "objective/train/theoretical_loss": 3.7449357850508935, + "objective/train/tokens_used": 787229152, + "theoretical_loss": 3.7449357850508935, + "tokens_seen": 766769152 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038771313941825475, + "loss": 2.7447, + "theoretical_loss": 3.7449197986734806, + "tokens_seen": 766801920 + }, + { + "epoch": 2.05, + "learning_rate": 0.000387703109327984, + "loss": 2.6875, + "theoretical_loss": 3.744887828541799, + "tokens_seen": 766867456 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003876930792377131, + "loss": 2.916, + "theoretical_loss": 3.744855861907075, + "tokens_seen": 766932992 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038768304914744235, + "loss": 2.8973, + "theoretical_loss": 3.744823898768626, + "tokens_seen": 766998528 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038767301905717153, + "loss": 2.7641, + "theoretical_loss": 3.744791939125771, + "tokens_seen": 767064064 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003876629889669007, + "loss": 2.8003, + "theoretical_loss": 3.74475998297783, + "tokens_seen": 767129600 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003876529588766299, + "loss": 2.837, + "theoretical_loss": 3.7447280303241213, + "tokens_seen": 767195136 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003876429287863591, + "loss": 2.6501, + "theoretical_loss": 3.7446960811639656, + "tokens_seen": 767260672 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038763289869608826, + "loss": 3.0486, + "theoretical_loss": 3.7446641354966816, + "tokens_seen": 767326208 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003876228686058175, + "loss": 2.7624, + "theoretical_loss": 3.744632193321589, + "tokens_seen": 767391744 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003876128385155466, + "loss": 2.5624, + "theoretical_loss": 3.7446002546380086, + "tokens_seen": 767457280 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038760280842527585, + "loss": 2.7834, + "theoretical_loss": 3.7445683194452606, + "tokens_seen": 767522816 + }, + { + "epoch": 2.05, + "learning_rate": 0.000387592778335005, + "loss": 2.562, + "theoretical_loss": 3.744536387742665, + "tokens_seen": 767588352 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003875827482447342, + "loss": 2.966, + "theoretical_loss": 3.7445044595295425, + "tokens_seen": 767653888 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003875727181544634, + "loss": 2.8074, + "theoretical_loss": 3.7444725348052144, + "tokens_seen": 767719424 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003875626880641926, + "loss": 2.9211, + "theoretical_loss": 3.744440613569001, + "tokens_seen": 767784960 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038755265797392176, + "loss": 2.9349, + "theoretical_loss": 3.7444086958202245, + "tokens_seen": 767850496 + }, + { + "epoch": 2.05, + "learning_rate": 0.000387542627883651, + "loss": 2.8785, + "theoretical_loss": 3.7443767815582047, + "tokens_seen": 767916032 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003875325977933801, + "loss": 2.9355, + "theoretical_loss": 3.744344870782265, + "tokens_seen": 767981568 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038752256770310936, + "loss": 3.0958, + "theoretical_loss": 3.7443129634917254, + "tokens_seen": 768047104 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003875125376128385, + "loss": 3.0222, + "theoretical_loss": 3.7442810596859095, + "tokens_seen": 768112640 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003875025075225677, + "loss": 2.7181, + "theoretical_loss": 3.7442491593641387, + "tokens_seen": 768178176 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003874924774322969, + "loss": 2.8857, + "theoretical_loss": 3.744217262525735, + "tokens_seen": 768243712 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003874824473420261, + "loss": 2.7694, + "theoretical_loss": 3.744185369170021, + "tokens_seen": 768309248 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038747241725175526, + "loss": 2.9329, + "theoretical_loss": 3.7441534792963203, + "tokens_seen": 768374784 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 878097, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6164374351501465, + "objective/train/theoretical_loss": 3.744137535665013, + "objective/train/tokens_used": 788867552, + "theoretical_loss": 3.744137535665013, + "tokens_seen": 768407552 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038746238716148444, + "loss": 2.7017, + "theoretical_loss": 3.744121592903955, + "tokens_seen": 768440320 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003874523570712136, + "loss": 2.9756, + "theoretical_loss": 3.744089709992248, + "tokens_seen": 768505856 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038744232698094286, + "loss": 2.8532, + "theoretical_loss": 3.744057830560523, + "tokens_seen": 768571392 + }, + { + "epoch": 2.05, + "learning_rate": 0.000387432296890672, + "loss": 2.8827, + "theoretical_loss": 3.7440259546081043, + "tokens_seen": 768636928 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003874222668004012, + "loss": 2.9375, + "theoretical_loss": 3.7439940821343134, + "tokens_seen": 768702464 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038741223671013035, + "loss": 2.7816, + "theoretical_loss": 3.7439622131384764, + "tokens_seen": 768768000 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003874022066198596, + "loss": 2.5517, + "theoretical_loss": 3.7439303476199166, + "tokens_seen": 768833536 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038739217652958876, + "loss": 2.8064, + "theoretical_loss": 3.743898485577957, + "tokens_seen": 768899072 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038738214643931795, + "loss": 2.8739, + "theoretical_loss": 3.743866627011924, + "tokens_seen": 768964608 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003873721163490472, + "loss": 3.0709, + "theoretical_loss": 3.743834771921141, + "tokens_seen": 769030144 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038736208625877636, + "loss": 2.7886, + "theoretical_loss": 3.7438029203049332, + "tokens_seen": 769095680 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038735205616850554, + "loss": 2.9046, + "theoretical_loss": 3.743771072162625, + "tokens_seen": 769161216 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003873420260782347, + "loss": 2.8764, + "theoretical_loss": 3.7437392274935424, + "tokens_seen": 769226752 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003873319959879639, + "loss": 3.125, + "theoretical_loss": 3.74370738629701, + "tokens_seen": 769292288 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003873219658976931, + "loss": 2.9003, + "theoretical_loss": 3.7436755485723543, + "tokens_seen": 769357824 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003873119358074223, + "loss": 2.7719, + "theoretical_loss": 3.7436437143189005, + "tokens_seen": 769423360 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038730190571715145, + "loss": 2.894, + "theoretical_loss": 3.7436118835359737, + "tokens_seen": 769488896 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003872918756268807, + "loss": 2.7787, + "theoretical_loss": 3.7435800562229016, + "tokens_seen": 769554432 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003872818455366098, + "loss": 2.7862, + "theoretical_loss": 3.7435482323790095, + "tokens_seen": 769619968 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038727181544633905, + "loss": 2.8708, + "theoretical_loss": 3.743516412003624, + "tokens_seen": 769685504 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038726178535606823, + "loss": 2.8517, + "theoretical_loss": 3.7434845950960725, + "tokens_seen": 769751040 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003872517552657974, + "loss": 2.9194, + "theoretical_loss": 3.74345278165568, + "tokens_seen": 769816576 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003872417251755266, + "loss": 2.7662, + "theoretical_loss": 3.743420971681776, + "tokens_seen": 769882112 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038723169508525577, + "loss": 2.7826, + "theoretical_loss": 3.743389165173686, + "tokens_seen": 769947648 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038722166499498495, + "loss": 3.1014, + "theoretical_loss": 3.7433573621307383, + "tokens_seen": 770013184 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 879358, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.580174684524536, + "objective/train/theoretical_loss": 3.7433414619084826, + "objective/train/tokens_used": 790505952, + "theoretical_loss": 3.7433414619084826, + "tokens_seen": 770045952 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003872116349047142, + "loss": 2.6705, + "theoretical_loss": 3.7433255625522603, + "tokens_seen": 770078720 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003872016048144433, + "loss": 2.8358, + "theoretical_loss": 3.7432937664375796, + "tokens_seen": 770144256 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038719157472417255, + "loss": 2.8843, + "theoretical_loss": 3.7432619737860247, + "tokens_seen": 770209792 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038718154463390173, + "loss": 2.9577, + "theoretical_loss": 3.7432301845969227, + "tokens_seen": 770275328 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003871715145436309, + "loss": 2.9062, + "theoretical_loss": 3.743198398869603, + "tokens_seen": 770340864 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003871614844533601, + "loss": 2.9596, + "theoretical_loss": 3.743166616603394, + "tokens_seen": 770406400 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003871514543630893, + "loss": 2.9192, + "theoretical_loss": 3.7431348377976246, + "tokens_seen": 770471936 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038714142427281846, + "loss": 3.0561, + "theoretical_loss": 3.743103062451623, + "tokens_seen": 770537472 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003871313941825477, + "loss": 2.8688, + "theoretical_loss": 3.7430712905647185, + "tokens_seen": 770603008 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003871213640922768, + "loss": 2.7494, + "theoretical_loss": 3.7430395221362414, + "tokens_seen": 770668544 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038711133400200605, + "loss": 2.8565, + "theoretical_loss": 3.74300775716552, + "tokens_seen": 770734080 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003871013039117352, + "loss": 2.9329, + "theoretical_loss": 3.7429759956518844, + "tokens_seen": 770799616 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003870912738214644, + "loss": 2.809, + "theoretical_loss": 3.7429442375946644, + "tokens_seen": 770865152 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003870812437311936, + "loss": 2.8026, + "theoretical_loss": 3.742912482993191, + "tokens_seen": 770930688 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003870712136409228, + "loss": 2.8708, + "theoretical_loss": 3.7428807318467925, + "tokens_seen": 770996224 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038706118355065196, + "loss": 2.9398, + "theoretical_loss": 3.742848984154801, + "tokens_seen": 771061760 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003870511534603812, + "loss": 2.8979, + "theoretical_loss": 3.7428172399165467, + "tokens_seen": 771127296 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003870411233701103, + "loss": 2.8455, + "theoretical_loss": 3.74278549913136, + "tokens_seen": 771192832 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038703109327983956, + "loss": 2.998, + "theoretical_loss": 3.742753761798572, + "tokens_seen": 771258368 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003870210631895687, + "loss": 2.836, + "theoretical_loss": 3.742722027917514, + "tokens_seen": 771323904 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003870110330992979, + "loss": 2.9174, + "theoretical_loss": 3.7426902974875182, + "tokens_seen": 771389440 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003870010030090271, + "loss": 2.7942, + "theoretical_loss": 3.7426585705079147, + "tokens_seen": 771454976 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003869909729187563, + "loss": 3.0534, + "theoretical_loss": 3.7426268469780357, + "tokens_seen": 771520512 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038698094282848546, + "loss": 2.9767, + "theoretical_loss": 3.7425951268972133, + "tokens_seen": 771586048 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038697091273821464, + "loss": 2.7632, + "theoretical_loss": 3.74256341026478, + "tokens_seen": 771651584 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 880093, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.807681083679199, + "objective/train/theoretical_loss": 3.7425475532415007, + "objective/train/tokens_used": 792144352, + "theoretical_loss": 3.7425475532415007, + "tokens_seen": 771684352 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003869608826479438, + "loss": 2.8143, + "theoretical_loss": 3.742531697080068, + "tokens_seen": 771717120 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038695085255767306, + "loss": 2.8962, + "theoretical_loss": 3.7424999873424087, + "tokens_seen": 771782656 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003869408224674022, + "loss": 2.7382, + "theoretical_loss": 3.742468281051136, + "tokens_seen": 771848192 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003869307923771314, + "loss": 3.0543, + "theoretical_loss": 3.7424365782055817, + "tokens_seen": 771913728 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038692076228686055, + "loss": 2.814, + "theoretical_loss": 3.74240487880508, + "tokens_seen": 771979264 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003869107321965898, + "loss": 2.8605, + "theoretical_loss": 3.7423731828489633, + "tokens_seen": 772044800 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038690070210631896, + "loss": 2.9045, + "theoretical_loss": 3.7423414903365653, + "tokens_seen": 772110336 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038689067201604815, + "loss": 2.7878, + "theoretical_loss": 3.7423098012672193, + "tokens_seen": 772175872 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038688064192577733, + "loss": 2.7309, + "theoretical_loss": 3.74227811564026, + "tokens_seen": 772241408 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038687061183550656, + "loss": 2.947, + "theoretical_loss": 3.7422464334550196, + "tokens_seen": 772306944 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003868605817452357, + "loss": 2.7292, + "theoretical_loss": 3.742214754710834, + "tokens_seen": 772372480 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003868505516549649, + "loss": 2.8713, + "theoretical_loss": 3.742183079407037, + "tokens_seen": 772438016 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038684052156469405, + "loss": 2.9634, + "theoretical_loss": 3.7421514075429627, + "tokens_seen": 772503552 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003868304914744233, + "loss": 2.7913, + "theoretical_loss": 3.742119739117946, + "tokens_seen": 772569088 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038682046138415247, + "loss": 3.1463, + "theoretical_loss": 3.7420880741313223, + "tokens_seen": 772634624 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038681043129388165, + "loss": 2.6388, + "theoretical_loss": 3.742056412582426, + "tokens_seen": 772700160 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038680040120361083, + "loss": 2.694, + "theoretical_loss": 3.7420247544705925, + "tokens_seen": 772765696 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038679037111334, + "loss": 2.935, + "theoretical_loss": 3.741993099795158, + "tokens_seen": 772831232 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003867803410230692, + "loss": 3.0614, + "theoretical_loss": 3.7419614485554566, + "tokens_seen": 772896768 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038677031093279843, + "loss": 2.7906, + "theoretical_loss": 3.7419298007508255, + "tokens_seen": 772962304 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038676028084252755, + "loss": 3.0528, + "theoretical_loss": 3.7418981563806, + "tokens_seen": 773027840 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003867502507522568, + "loss": 2.9, + "theoretical_loss": 3.7418665154441166, + "tokens_seen": 773093376 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003867402206619859, + "loss": 2.9703, + "theoretical_loss": 3.741834877940711, + "tokens_seen": 773158912 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038673019057171515, + "loss": 3.0419, + "theoretical_loss": 3.7418032438697213, + "tokens_seen": 773224448 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038672016048144433, + "loss": 2.969, + "theoretical_loss": 3.7417716132304824, + "tokens_seen": 773289984 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 881313, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6865322589874268, + "objective/train/theoretical_loss": 3.741755799197563, + "objective/train/tokens_used": 793782752, + "theoretical_loss": 3.741755799197563, + "tokens_seen": 773322752 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003867101303911735, + "loss": 2.8573, + "theoretical_loss": 3.741739986022333, + "tokens_seen": 773355520 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003867001003009027, + "loss": 2.991, + "theoretical_loss": 3.7417083622446086, + "tokens_seen": 773421056 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038669007021063193, + "loss": 2.9242, + "theoretical_loss": 3.7416767418966472, + "tokens_seen": 773486592 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038668004012036106, + "loss": 2.8793, + "theoretical_loss": 3.7416451249777865, + "tokens_seen": 773552128 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003866700100300903, + "loss": 3.0631, + "theoretical_loss": 3.7416135114873637, + "tokens_seen": 773617664 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003866599799398194, + "loss": 2.7995, + "theoretical_loss": 3.741581901424717, + "tokens_seen": 773683200 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038664994984954866, + "loss": 2.7504, + "theoretical_loss": 3.7415502947891843, + "tokens_seen": 773748736 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038663991975927784, + "loss": 2.9901, + "theoretical_loss": 3.7415186915801035, + "tokens_seen": 773814272 + }, + { + "epoch": 2.05, + "learning_rate": 0.000386629889669007, + "loss": 2.9467, + "theoretical_loss": 3.7414870917968135, + "tokens_seen": 773879808 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038661985957873625, + "loss": 2.7866, + "theoretical_loss": 3.7414554954386525, + "tokens_seen": 773945344 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003866098294884654, + "loss": 2.6642, + "theoretical_loss": 3.7414239025049594, + "tokens_seen": 774010880 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003865997993981946, + "loss": 2.7879, + "theoretical_loss": 3.7413923129950737, + "tokens_seen": 774076416 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003865897693079238, + "loss": 3.0073, + "theoretical_loss": 3.741360726908333, + "tokens_seen": 774141952 + }, + { + "epoch": 2.05, + "learning_rate": 0.000386579739217653, + "loss": 2.7793, + "theoretical_loss": 3.7413291442440784, + "tokens_seen": 774207488 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038656970912738216, + "loss": 3.0301, + "theoretical_loss": 3.7412975650016485, + "tokens_seen": 774273024 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003865596790371114, + "loss": 2.8888, + "theoretical_loss": 3.741265989180383, + "tokens_seen": 774338560 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003865496489468405, + "loss": 2.9951, + "theoretical_loss": 3.7412344167796214, + "tokens_seen": 774404096 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038653961885656976, + "loss": 2.8143, + "theoretical_loss": 3.7412028477987045, + "tokens_seen": 774469632 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003865295887662989, + "loss": 2.9018, + "theoretical_loss": 3.7411712822369725, + "tokens_seen": 774535168 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003865195586760281, + "loss": 3.0448, + "theoretical_loss": 3.741139720093765, + "tokens_seen": 774600704 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003865095285857573, + "loss": 2.9451, + "theoretical_loss": 3.741108161368423, + "tokens_seen": 774666240 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003864994984954865, + "loss": 2.8807, + "theoretical_loss": 3.7410766060602887, + "tokens_seen": 774731776 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038648946840521566, + "loss": 3.0032, + "theoretical_loss": 3.7410450541687004, + "tokens_seen": 774797312 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038647943831494484, + "loss": 2.6414, + "theoretical_loss": 3.741013505693001, + "tokens_seen": 774862848 + }, + { + "epoch": 2.05, + "learning_rate": 0.000386469408224674, + "loss": 2.8847, + "theoretical_loss": 3.740981960632532, + "tokens_seen": 774928384 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 882028, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3022451400756836, + "objective/train/theoretical_loss": 3.740966189382802, + "objective/train/tokens_used": 795421152, + "theoretical_loss": 3.740966189382802, + "tokens_seen": 774961152 + }, + { + "epoch": 2.05, + "learning_rate": 0.00038645937813440326, + "loss": 2.8122, + "theoretical_loss": 3.7409504189866336, + "tokens_seen": 774993920 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003864493480441324, + "loss": 2.8759, + "theoretical_loss": 3.7409188807546485, + "tokens_seen": 775059456 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003864393179538616, + "loss": 2.8984, + "theoretical_loss": 3.740887345935918, + "tokens_seen": 775124992 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038642928786359075, + "loss": 2.902, + "theoretical_loss": 3.7408558145297848, + "tokens_seen": 775190528 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038641925777332, + "loss": 2.8342, + "theoretical_loss": 3.7408242865355907, + "tokens_seen": 775256064 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038640922768304917, + "loss": 2.9901, + "theoretical_loss": 3.7407927619526786, + "tokens_seen": 775321600 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038639919759277835, + "loss": 2.8286, + "theoretical_loss": 3.7407612407803903, + "tokens_seen": 775387136 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038638916750250753, + "loss": 2.8755, + "theoretical_loss": 3.7407297230180694, + "tokens_seen": 775452672 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038637913741223676, + "loss": 2.7858, + "theoretical_loss": 3.7406982086650578, + "tokens_seen": 775518208 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003863691073219659, + "loss": 2.7104, + "theoretical_loss": 3.7406666977207, + "tokens_seen": 775583744 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003863590772316951, + "loss": 2.8863, + "theoretical_loss": 3.740635190184338, + "tokens_seen": 775649280 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038634904714142425, + "loss": 2.9168, + "theoretical_loss": 3.7406036860553167, + "tokens_seen": 775714816 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003863390170511535, + "loss": 2.9828, + "theoretical_loss": 3.7405721853329785, + "tokens_seen": 775780352 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038632898696088267, + "loss": 2.6336, + "theoretical_loss": 3.7405406880166687, + "tokens_seen": 775845888 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038631895687061185, + "loss": 2.7923, + "theoretical_loss": 3.74050919410573, + "tokens_seen": 775911424 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038630892678034103, + "loss": 2.8397, + "theoretical_loss": 3.740477703599507, + "tokens_seen": 775976960 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003862988966900702, + "loss": 2.8475, + "theoretical_loss": 3.740446216497344, + "tokens_seen": 776042496 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003862888665997994, + "loss": 2.7932, + "theoretical_loss": 3.7404147327985866, + "tokens_seen": 776108032 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038627883650952863, + "loss": 2.8546, + "theoretical_loss": 3.7403832525025784, + "tokens_seen": 776173568 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038626880641925775, + "loss": 2.9357, + "theoretical_loss": 3.740351775608665, + "tokens_seen": 776239104 + }, + { + "epoch": 2.06, + "learning_rate": 0.000386258776328987, + "loss": 2.8414, + "theoretical_loss": 3.7403203021161913, + "tokens_seen": 776304640 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003862487462387161, + "loss": 2.916, + "theoretical_loss": 3.740288832024503, + "tokens_seen": 776370176 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038623871614844535, + "loss": 2.9987, + "theoretical_loss": 3.740257365332945, + "tokens_seen": 776435712 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038622868605817453, + "loss": 2.8645, + "theoretical_loss": 3.740225902040864, + "tokens_seen": 776501248 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003862186559679037, + "loss": 2.8856, + "theoretical_loss": 3.740194442147604, + "tokens_seen": 776566784 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 883113, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.753779411315918, + "objective/train/theoretical_loss": 3.740178713475329, + "objective/train/tokens_used": 797059552, + "theoretical_loss": 3.740178713475329, + "tokens_seen": 776599552 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003862086258776329, + "loss": 2.8486, + "theoretical_loss": 3.740162985652513, + "tokens_seen": 776632320 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038619859578736213, + "loss": 2.783, + "theoretical_loss": 3.740131532554937, + "tokens_seen": 776697856 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038618856569709126, + "loss": 2.7231, + "theoretical_loss": 3.7401000828542212, + "tokens_seen": 776763392 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003861785356068205, + "loss": 2.8215, + "theoretical_loss": 3.740068636549713, + "tokens_seen": 776828928 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003861685055165496, + "loss": 2.8173, + "theoretical_loss": 3.740037193640759, + "tokens_seen": 776894464 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038615847542627886, + "loss": 2.7441, + "theoretical_loss": 3.7400057541267064, + "tokens_seen": 776960000 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038614844533600804, + "loss": 2.7466, + "theoretical_loss": 3.739974318006902, + "tokens_seen": 777025536 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003861384152457372, + "loss": 2.9235, + "theoretical_loss": 3.7399428852806937, + "tokens_seen": 777091072 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003861283851554664, + "loss": 2.9365, + "theoretical_loss": 3.7399114559474285, + "tokens_seen": 777156608 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003861183550651956, + "loss": 2.8048, + "theoretical_loss": 3.739880030006454, + "tokens_seen": 777222144 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038610832497492476, + "loss": 2.7476, + "theoretical_loss": 3.7398486074571182, + "tokens_seen": 777287680 + }, + { + "epoch": 2.06, + "learning_rate": 0.000386098294884654, + "loss": 2.7782, + "theoretical_loss": 3.73981718829877, + "tokens_seen": 777353216 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003860882647943831, + "loss": 2.8754, + "theoretical_loss": 3.7397857725307553, + "tokens_seen": 777418752 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038607823470411236, + "loss": 2.8833, + "theoretical_loss": 3.7397543601524252, + "tokens_seen": 777484288 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003860682046138415, + "loss": 2.569, + "theoretical_loss": 3.739722951163127, + "tokens_seen": 777549824 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003860581745235707, + "loss": 2.8597, + "theoretical_loss": 3.7396915455622093, + "tokens_seen": 777615360 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003860481444332999, + "loss": 2.9052, + "theoretical_loss": 3.7396601433490213, + "tokens_seen": 777680896 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003860381143430291, + "loss": 2.8995, + "theoretical_loss": 3.7396287445229124, + "tokens_seen": 777746432 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038602808425275826, + "loss": 2.6981, + "theoretical_loss": 3.7395973490832315, + "tokens_seen": 777811968 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003860180541624875, + "loss": 2.683, + "theoretical_loss": 3.739565957029328, + "tokens_seen": 777877504 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003860080240722166, + "loss": 2.7552, + "theoretical_loss": 3.7395345683605523, + "tokens_seen": 777943040 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038599799398194586, + "loss": 2.8937, + "theoretical_loss": 3.739503183076253, + "tokens_seen": 778008576 + }, + { + "epoch": 2.06, + "learning_rate": 0.000385987963891675, + "loss": 2.646, + "theoretical_loss": 3.7394718011757817, + "tokens_seen": 778074112 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003859779338014042, + "loss": 2.8996, + "theoretical_loss": 3.739440422658487, + "tokens_seen": 778139648 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003859679037111334, + "loss": 2.947, + "theoretical_loss": 3.7394090475237203, + "tokens_seen": 778205184 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 883877, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5676019191741943, + "objective/train/theoretical_loss": 3.7393933612245824, + "objective/train/tokens_used": 798697952, + "theoretical_loss": 3.7393933612245824, + "tokens_seen": 778237952 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003859578736208626, + "loss": 2.6764, + "theoretical_loss": 3.7393776757708324, + "tokens_seen": 778270720 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038594784353059177, + "loss": 2.8117, + "theoretical_loss": 3.7393463073991726, + "tokens_seen": 778336256 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038593781344032095, + "loss": 2.9232, + "theoretical_loss": 3.7393149424080936, + "tokens_seen": 778401792 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038592778335005013, + "loss": 2.7402, + "theoretical_loss": 3.739283580796945, + "tokens_seen": 778467328 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038591775325977937, + "loss": 2.8319, + "theoretical_loss": 3.7392522225650793, + "tokens_seen": 778532864 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003859077231695085, + "loss": 2.9142, + "theoretical_loss": 3.739220867711847, + "tokens_seen": 778598400 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038589769307923773, + "loss": 2.896, + "theoretical_loss": 3.7391895162366002, + "tokens_seen": 778663936 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038588766298896685, + "loss": 2.5929, + "theoretical_loss": 3.7391581681386907, + "tokens_seen": 778729472 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003858776328986961, + "loss": 2.9375, + "theoretical_loss": 3.7391268234174704, + "tokens_seen": 778795008 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003858676028084253, + "loss": 2.8272, + "theoretical_loss": 3.7390954820722917, + "tokens_seen": 778860544 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038585757271815445, + "loss": 2.8612, + "theoretical_loss": 3.7390641441025068, + "tokens_seen": 778926080 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003858475426278837, + "loss": 3.0562, + "theoretical_loss": 3.7390328095074676, + "tokens_seen": 778991616 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038583751253761287, + "loss": 2.9644, + "theoretical_loss": 3.7390014782865277, + "tokens_seen": 779057152 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038582748244734205, + "loss": 2.7878, + "theoretical_loss": 3.73897015043904, + "tokens_seen": 779122688 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038581745235707123, + "loss": 2.7606, + "theoretical_loss": 3.7389388259643566, + "tokens_seen": 779188224 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003858074222668004, + "loss": 2.8087, + "theoretical_loss": 3.7389075048618317, + "tokens_seen": 779253760 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003857973921765296, + "loss": 2.8811, + "theoretical_loss": 3.7388761871308187, + "tokens_seen": 779319296 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038578736208625883, + "loss": 3.0463, + "theoretical_loss": 3.7388448727706707, + "tokens_seen": 779384832 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038577733199598796, + "loss": 2.8511, + "theoretical_loss": 3.738813561780742, + "tokens_seen": 779450368 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003857673019057172, + "loss": 2.763, + "theoretical_loss": 3.7387822541603857, + "tokens_seen": 779515904 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003857572718154463, + "loss": 2.9689, + "theoretical_loss": 3.738750949908957, + "tokens_seen": 779581440 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038574724172517555, + "loss": 2.7182, + "theoretical_loss": 3.738719649025809, + "tokens_seen": 779646976 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038573721163490473, + "loss": 2.8999, + "theoretical_loss": 3.7386883515102975, + "tokens_seen": 779712512 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003857271815446339, + "loss": 2.9385, + "theoretical_loss": 3.7386570573617766, + "tokens_seen": 779778048 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003857171514543631, + "loss": 2.7616, + "theoretical_loss": 3.738625766579601, + "tokens_seen": 779843584 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 885041, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8538613319396973, + "objective/train/theoretical_loss": 3.738610122450691, + "objective/train/tokens_used": 800336352, + "theoretical_loss": 3.738610122450691, + "tokens_seen": 779876352 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038570712136409233, + "loss": 2.8281, + "theoretical_loss": 3.7385944791631256, + "tokens_seen": 779909120 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038569709127382146, + "loss": 2.8344, + "theoretical_loss": 3.738563195111706, + "tokens_seen": 779974656 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003856870611835507, + "loss": 3.0381, + "theoretical_loss": 3.7385319144246973, + "tokens_seen": 780040192 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003856770310932798, + "loss": 2.6559, + "theoretical_loss": 3.7385006371014553, + "tokens_seen": 780105728 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038566700100300906, + "loss": 3.129, + "theoretical_loss": 3.738469363141336, + "tokens_seen": 780171264 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038565697091273824, + "loss": 2.6653, + "theoretical_loss": 3.738438092543694, + "tokens_seen": 780236800 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003856469408224674, + "loss": 2.7597, + "theoretical_loss": 3.7384068253078873, + "tokens_seen": 780302336 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003856369107321966, + "loss": 2.9662, + "theoretical_loss": 3.7383755614332705, + "tokens_seen": 780367872 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003856268806419258, + "loss": 3.0753, + "theoretical_loss": 3.7383443009192012, + "tokens_seen": 780433408 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038561685055165496, + "loss": 2.7267, + "theoretical_loss": 3.738313043765035, + "tokens_seen": 780498944 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003856068204613842, + "loss": 3.0858, + "theoretical_loss": 3.73828178997013, + "tokens_seen": 780564480 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003855967903711133, + "loss": 2.8874, + "theoretical_loss": 3.738250539533842, + "tokens_seen": 780630016 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038558676028084256, + "loss": 2.8781, + "theoretical_loss": 3.738219292455528, + "tokens_seen": 780695552 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003855767301905717, + "loss": 2.8912, + "theoretical_loss": 3.738188048734547, + "tokens_seen": 780761088 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003855667001003009, + "loss": 2.9454, + "theoretical_loss": 3.738156808370255, + "tokens_seen": 780826624 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003855566700100301, + "loss": 2.9832, + "theoretical_loss": 3.7381255713620103, + "tokens_seen": 780892160 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003855466399197593, + "loss": 2.8575, + "theoretical_loss": 3.73809433770917, + "tokens_seen": 780957696 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038553660982948846, + "loss": 2.751, + "theoretical_loss": 3.7380631074110933, + "tokens_seen": 781023232 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003855265797392177, + "loss": 2.7593, + "theoretical_loss": 3.7380318804671377, + "tokens_seen": 781088768 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003855165496489468, + "loss": 2.6554, + "theoretical_loss": 3.7380006568766615, + "tokens_seen": 781154304 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038550651955867606, + "loss": 2.8923, + "theoretical_loss": 3.7379694366390237, + "tokens_seen": 781219840 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003854964894684052, + "loss": 2.8653, + "theoretical_loss": 3.737938219753583, + "tokens_seen": 781285376 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003854864593781344, + "loss": 2.7881, + "theoretical_loss": 3.7379070062196975, + "tokens_seen": 781350912 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003854764292878636, + "loss": 2.9967, + "theoretical_loss": 3.7378757960367275, + "tokens_seen": 781416448 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003854663991975928, + "loss": 2.7699, + "theoretical_loss": 3.7378445892040317, + "tokens_seen": 781481984 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 890004, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.93015456199646, + "objective/train/theoretical_loss": 3.7378289870438364, + "objective/train/tokens_used": 801974752, + "theoretical_loss": 3.7378289870438364, + "tokens_seen": 781514752 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038545636910732197, + "loss": 2.823, + "theoretical_loss": 3.737813385720969, + "tokens_seen": 781547520 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038544633901705115, + "loss": 2.6779, + "theoretical_loss": 3.7377821855869007, + "tokens_seen": 781613056 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038543630892678033, + "loss": 2.9013, + "theoretical_loss": 3.7377509888011846, + "tokens_seen": 781678592 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038542627883650957, + "loss": 2.8486, + "theoretical_loss": 3.7377197953631818, + "tokens_seen": 781744128 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003854162487462387, + "loss": 2.8487, + "theoretical_loss": 3.737688605272252, + "tokens_seen": 781809664 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038540621865596793, + "loss": 2.7669, + "theoretical_loss": 3.7376574185277565, + "tokens_seen": 781875200 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038539618856569705, + "loss": 2.6736, + "theoretical_loss": 3.7376262351290546, + "tokens_seen": 781940736 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003853861584754263, + "loss": 3.0875, + "theoretical_loss": 3.7375950550755075, + "tokens_seen": 782006272 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038537612838515547, + "loss": 2.9593, + "theoretical_loss": 3.737563878366476, + "tokens_seen": 782071808 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038536609829488465, + "loss": 2.6679, + "theoretical_loss": 3.737532705001321, + "tokens_seen": 782137344 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038535606820461383, + "loss": 3.0975, + "theoretical_loss": 3.7375015349794047, + "tokens_seen": 782202880 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038534603811434307, + "loss": 2.8719, + "theoretical_loss": 3.7374703683000865, + "tokens_seen": 782268416 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003853360080240722, + "loss": 3.0239, + "theoretical_loss": 3.7374392049627296, + "tokens_seen": 782333952 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038532597793380143, + "loss": 2.6991, + "theoretical_loss": 3.7374080449666955, + "tokens_seen": 782399488 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038531594784353056, + "loss": 3.0373, + "theoretical_loss": 3.7373768883113456, + "tokens_seen": 782465024 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003853059177532598, + "loss": 2.7967, + "theoretical_loss": 3.737345734996042, + "tokens_seen": 782530560 + }, + { + "epoch": 2.06, + "learning_rate": 0.000385295887662989, + "loss": 2.7177, + "theoretical_loss": 3.7373145850201475, + "tokens_seen": 782596096 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038528585757271816, + "loss": 2.8801, + "theoretical_loss": 3.7372834383830247, + "tokens_seen": 782661632 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038527582748244734, + "loss": 2.6214, + "theoretical_loss": 3.7372522950840352, + "tokens_seen": 782727168 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003852657973921765, + "loss": 2.8109, + "theoretical_loss": 3.7372211551225423, + "tokens_seen": 782792704 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003852557673019057, + "loss": 3.0358, + "theoretical_loss": 3.737190018497909, + "tokens_seen": 782858240 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038524573721163493, + "loss": 2.8844, + "theoretical_loss": 3.7371588852094986, + "tokens_seen": 782923776 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038523570712136406, + "loss": 2.9968, + "theoretical_loss": 3.7371277552566737, + "tokens_seen": 782989312 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003852256770310933, + "loss": 2.758, + "theoretical_loss": 3.737096628638799, + "tokens_seen": 783054848 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003852156469408225, + "loss": 2.8207, + "theoretical_loss": 3.737065505355237, + "tokens_seen": 783120384 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 897763, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7294697761535645, + "objective/train/theoretical_loss": 3.7370499449636245, + "objective/train/tokens_used": 803613152, + "theoretical_loss": 3.7370499449636245, + "tokens_seen": 783153152 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038520561685055166, + "loss": 2.6323, + "theoretical_loss": 3.7370343854053525, + "tokens_seen": 783185920 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038519558676028084, + "loss": 2.9722, + "theoretical_loss": 3.7370032687885084, + "tokens_seen": 783251456 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038518555667001, + "loss": 2.8461, + "theoretical_loss": 3.7369721555040702, + "tokens_seen": 783316992 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003851755265797392, + "loss": 2.7199, + "theoretical_loss": 3.736941045551401, + "tokens_seen": 783382528 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038516549648946844, + "loss": 2.8761, + "theoretical_loss": 3.736909938929866, + "tokens_seen": 783448064 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038515546639919756, + "loss": 3.1976, + "theoretical_loss": 3.7368788356388305, + "tokens_seen": 783513600 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003851454363089268, + "loss": 2.904, + "theoretical_loss": 3.7368477356776584, + "tokens_seen": 783579136 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003851354062186559, + "loss": 2.8346, + "theoretical_loss": 3.7368166390457147, + "tokens_seen": 783644672 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038512537612838516, + "loss": 2.7427, + "theoretical_loss": 3.7367855457423653, + "tokens_seen": 783710208 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003851153460381144, + "loss": 2.7978, + "theoretical_loss": 3.736754455766975, + "tokens_seen": 783775744 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003851053159478435, + "loss": 2.9312, + "theoretical_loss": 3.73672336911891, + "tokens_seen": 783841280 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038509528585757276, + "loss": 2.7718, + "theoretical_loss": 3.7366922857975355, + "tokens_seen": 783906816 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003850852557673019, + "loss": 2.7926, + "theoretical_loss": 3.7366612058022177, + "tokens_seen": 783972352 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003850752256770311, + "loss": 2.9423, + "theoretical_loss": 3.736630129132323, + "tokens_seen": 784037888 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003850651955867603, + "loss": 2.71, + "theoretical_loss": 3.7365990557872166, + "tokens_seen": 784103424 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003850551654964895, + "loss": 2.9139, + "theoretical_loss": 3.7365679857662664, + "tokens_seen": 784168960 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038504513540621866, + "loss": 2.8589, + "theoretical_loss": 3.736536919068838, + "tokens_seen": 784234496 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003850351053159479, + "loss": 2.8182, + "theoretical_loss": 3.7365058556942987, + "tokens_seen": 784300032 + }, + { + "epoch": 2.06, + "learning_rate": 0.000385025075225677, + "loss": 2.9608, + "theoretical_loss": 3.736474795642015, + "tokens_seen": 784365568 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038501504513540626, + "loss": 2.8076, + "theoretical_loss": 3.736443738911354, + "tokens_seen": 784431104 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003850050150451354, + "loss": 2.8234, + "theoretical_loss": 3.7364126855016835, + "tokens_seen": 784496640 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003849949849548646, + "loss": 2.6401, + "theoretical_loss": 3.7363816354123713, + "tokens_seen": 784562176 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003849849548645938, + "loss": 2.8178, + "theoretical_loss": 3.736350588642784, + "tokens_seen": 784627712 + }, + { + "epoch": 2.06, + "learning_rate": 0.000384974924774323, + "loss": 2.9572, + "theoretical_loss": 3.7363195451922904, + "tokens_seen": 784693248 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038496489468405217, + "loss": 2.9677, + "theoretical_loss": 3.7362885050602577, + "tokens_seen": 784758784 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 900073, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1294591426849365, + "objective/train/theoretical_loss": 3.7362729862384674, + "objective/train/tokens_used": 805251552, + "theoretical_loss": 3.7362729862384674, + "tokens_seen": 784791552 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038495486459378135, + "loss": 2.8987, + "theoretical_loss": 3.736257468246055, + "tokens_seen": 784824320 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038494483450351053, + "loss": 2.8781, + "theoretical_loss": 3.7362264347490504, + "tokens_seen": 784889856 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038493480441323977, + "loss": 2.9147, + "theoretical_loss": 3.7361954045686114, + "tokens_seen": 784955392 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003849247743229689, + "loss": 2.7068, + "theoretical_loss": 3.7361643777041076, + "tokens_seen": 785020928 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038491474423269813, + "loss": 2.716, + "theoretical_loss": 3.7361333541549078, + "tokens_seen": 785086464 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038490471414242725, + "loss": 2.923, + "theoretical_loss": 3.7361023339203814, + "tokens_seen": 785152000 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003848946840521565, + "loss": 2.7817, + "theoretical_loss": 3.736071316999897, + "tokens_seen": 785217536 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038488465396188567, + "loss": 2.8024, + "theoretical_loss": 3.7360403033928242, + "tokens_seen": 785283072 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038487462387161485, + "loss": 2.9971, + "theoretical_loss": 3.7360092930985322, + "tokens_seen": 785348608 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038486459378134403, + "loss": 3.0687, + "theoretical_loss": 3.7359782861163917, + "tokens_seen": 785414144 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038485456369107327, + "loss": 2.8731, + "theoretical_loss": 3.735947282445772, + "tokens_seen": 785479680 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003848445336008024, + "loss": 2.8119, + "theoretical_loss": 3.735916282086043, + "tokens_seen": 785545216 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038483450351053163, + "loss": 2.7403, + "theoretical_loss": 3.735885285036575, + "tokens_seen": 785610752 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038482447342026076, + "loss": 2.7575, + "theoretical_loss": 3.7358542912967385, + "tokens_seen": 785676288 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038481444332999, + "loss": 2.912, + "theoretical_loss": 3.7358233008659045, + "tokens_seen": 785741824 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003848044132397192, + "loss": 2.9174, + "theoretical_loss": 3.7357923137434437, + "tokens_seen": 785807360 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038479438314944836, + "loss": 2.7221, + "theoretical_loss": 3.7357613299287262, + "tokens_seen": 785872896 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038478435305917754, + "loss": 2.7952, + "theoretical_loss": 3.7357303494211243, + "tokens_seen": 785938432 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003847743229689067, + "loss": 2.7357, + "theoretical_loss": 3.735699372220008, + "tokens_seen": 786003968 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003847642928786359, + "loss": 2.6275, + "theoretical_loss": 3.7356683983247496, + "tokens_seen": 786069504 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038475426278836513, + "loss": 2.6422, + "theoretical_loss": 3.735637427734721, + "tokens_seen": 786135040 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038474423269809426, + "loss": 2.8061, + "theoretical_loss": 3.7356064604492936, + "tokens_seen": 786200576 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003847342026078235, + "loss": 2.8188, + "theoretical_loss": 3.735575496467839, + "tokens_seen": 786266112 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003847241725175527, + "loss": 3.1006, + "theoretical_loss": 3.73554453578973, + "tokens_seen": 786331648 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038471414242728186, + "loss": 2.8199, + "theoretical_loss": 3.7355135784143387, + "tokens_seen": 786397184 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 905150, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.815542459487915, + "objective/train/theoretical_loss": 3.735498100964966, + "objective/train/tokens_used": 806889952, + "theoretical_loss": 3.735498100964966, + "tokens_seen": 786429952 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038470411233701104, + "loss": 2.7891, + "theoretical_loss": 3.735482624341037, + "tokens_seen": 786462720 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003846940822467402, + "loss": 2.8774, + "theoretical_loss": 3.735451673569199, + "tokens_seen": 786528256 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003846840521564694, + "loss": 2.9415, + "theoretical_loss": 3.7354207260981953, + "tokens_seen": 786593792 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038467402206619864, + "loss": 2.9055, + "theoretical_loss": 3.7353897819274016, + "tokens_seen": 786659328 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038466399197592776, + "loss": 2.7626, + "theoretical_loss": 3.735358841056189, + "tokens_seen": 786724864 + }, + { + "epoch": 2.06, + "learning_rate": 0.000384653961885657, + "loss": 2.9134, + "theoretical_loss": 3.735327903483931, + "tokens_seen": 786790400 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003846439317953861, + "loss": 2.9937, + "theoretical_loss": 3.7352969692100024, + "tokens_seen": 786855936 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038463390170511536, + "loss": 2.9249, + "theoretical_loss": 3.7352660382337763, + "tokens_seen": 786921472 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038462387161484454, + "loss": 2.8463, + "theoretical_loss": 3.735235110554626, + "tokens_seen": 786987008 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003846138415245737, + "loss": 2.9076, + "theoretical_loss": 3.735204186171926, + "tokens_seen": 787052544 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003846038114343029, + "loss": 2.7049, + "theoretical_loss": 3.7351732650850504, + "tokens_seen": 787118080 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003845937813440321, + "loss": 2.7692, + "theoretical_loss": 3.7351423472933734, + "tokens_seen": 787183616 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038458375125376127, + "loss": 2.8375, + "theoretical_loss": 3.7351114327962702, + "tokens_seen": 787249152 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003845737211634905, + "loss": 2.9005, + "theoretical_loss": 3.735080521593115, + "tokens_seen": 787314688 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038456369107321963, + "loss": 2.7006, + "theoretical_loss": 3.7350496136832825, + "tokens_seen": 787380224 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038455366098294886, + "loss": 2.8368, + "theoretical_loss": 3.735018709066148, + "tokens_seen": 787445760 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038454363089267805, + "loss": 2.8671, + "theoretical_loss": 3.7349878077410867, + "tokens_seen": 787511296 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038453360080240723, + "loss": 2.8951, + "theoretical_loss": 3.734956909707474, + "tokens_seen": 787576832 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003845235707121364, + "loss": 2.8302, + "theoretical_loss": 3.734926014964686, + "tokens_seen": 787642368 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003845135406218656, + "loss": 2.7686, + "theoretical_loss": 3.7348951235120977, + "tokens_seen": 787707904 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038450351053159477, + "loss": 2.7914, + "theoretical_loss": 3.734864235349085, + "tokens_seen": 787773440 + }, + { + "epoch": 2.06, + "learning_rate": 0.000384493480441324, + "loss": 2.7126, + "theoretical_loss": 3.7348333504750246, + "tokens_seen": 787838976 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038448345035105313, + "loss": 3.0019, + "theoretical_loss": 3.734802468889292, + "tokens_seen": 787904512 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038447342026078237, + "loss": 3.1167, + "theoretical_loss": 3.734771590591264, + "tokens_seen": 787970048 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003844633901705115, + "loss": 2.921, + "theoretical_loss": 3.7347407155803176, + "tokens_seen": 788035584 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 910242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.993212938308716, + "objective/train/theoretical_loss": 3.734725279307305, + "objective/train/tokens_used": 808528352, + "theoretical_loss": 3.734725279307305, + "tokens_seen": 788068352 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038445336008024073, + "loss": 3.0711, + "theoretical_loss": 3.734709843855829, + "tokens_seen": 788101120 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003844433299899699, + "loss": 2.9976, + "theoretical_loss": 3.734678975417175, + "tokens_seen": 788166656 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003844332998996991, + "loss": 2.8315, + "theoretical_loss": 3.7346481102637332, + "tokens_seen": 788232192 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003844232698094283, + "loss": 2.8954, + "theoretical_loss": 3.7346172483948803, + "tokens_seen": 788297728 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038441323971915745, + "loss": 2.7735, + "theoretical_loss": 3.7345863898099942, + "tokens_seen": 788363264 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038440320962888664, + "loss": 2.8864, + "theoretical_loss": 3.7345555345084525, + "tokens_seen": 788428800 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038439317953861587, + "loss": 3.0191, + "theoretical_loss": 3.7345246824896328, + "tokens_seen": 788494336 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038438314944834505, + "loss": 2.9265, + "theoretical_loss": 3.7344938337529134, + "tokens_seen": 788559872 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038437311935807423, + "loss": 2.7427, + "theoretical_loss": 3.7344629882976714, + "tokens_seen": 788625408 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038436308926780347, + "loss": 2.8258, + "theoretical_loss": 3.7344321461232863, + "tokens_seen": 788690944 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003843530591775326, + "loss": 3.0298, + "theoretical_loss": 3.734401307229136, + "tokens_seen": 788756480 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038434302908726183, + "loss": 2.9562, + "theoretical_loss": 3.734370471614599, + "tokens_seen": 788822016 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038433299899699096, + "loss": 2.8713, + "theoretical_loss": 3.734339639279055, + "tokens_seen": 788887552 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003843229689067202, + "loss": 2.6562, + "theoretical_loss": 3.734308810221881, + "tokens_seen": 788953088 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003843129388164494, + "loss": 3.0193, + "theoretical_loss": 3.7342779844424583, + "tokens_seen": 789018624 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038430290872617856, + "loss": 2.7786, + "theoretical_loss": 3.734247161940165, + "tokens_seen": 789084160 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038429287863590774, + "loss": 2.7898, + "theoretical_loss": 3.734216342714381, + "tokens_seen": 789149696 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003842828485456369, + "loss": 2.8075, + "theoretical_loss": 3.7341855267644855, + "tokens_seen": 789215232 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003842728184553661, + "loss": 2.9642, + "theoretical_loss": 3.7341547140898586, + "tokens_seen": 789280768 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038426278836509533, + "loss": 2.824, + "theoretical_loss": 3.7341239046898806, + "tokens_seen": 789346304 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038425275827482446, + "loss": 2.7216, + "theoretical_loss": 3.734093098563931, + "tokens_seen": 789411840 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003842427281845537, + "loss": 2.6687, + "theoretical_loss": 3.7340622957113907, + "tokens_seen": 789477376 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003842326980942829, + "loss": 2.8243, + "theoretical_loss": 3.7340314961316396, + "tokens_seen": 789542912 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038422266800401206, + "loss": 2.8471, + "theoretical_loss": 3.734000699824059, + "tokens_seen": 789608448 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038421263791374124, + "loss": 2.8609, + "theoretical_loss": 3.7339699067880296, + "tokens_seen": 789673984 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 911523, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8604464530944824, + "objective/train/theoretical_loss": 3.733954511496653, + "objective/train/tokens_used": 810166752, + "theoretical_loss": 3.733954511496653, + "tokens_seen": 789706752 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003842026078234704, + "loss": 2.9005, + "theoretical_loss": 3.7339391170229317, + "tokens_seen": 789739520 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003841925777331996, + "loss": 2.9414, + "theoretical_loss": 3.733908330528147, + "tokens_seen": 789805056 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038418254764292884, + "loss": 2.8559, + "theoretical_loss": 3.7338775473030568, + "tokens_seen": 789870592 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038417251755265796, + "loss": 3.1541, + "theoretical_loss": 3.733846767347043, + "tokens_seen": 789936128 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003841624874623872, + "loss": 2.7023, + "theoretical_loss": 3.7338159906594863, + "tokens_seen": 790001664 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003841524573721163, + "loss": 2.8034, + "theoretical_loss": 3.7337852172397694, + "tokens_seen": 790067200 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038414242728184556, + "loss": 2.7727, + "theoretical_loss": 3.7337544470872737, + "tokens_seen": 790132736 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038413239719157474, + "loss": 2.7565, + "theoretical_loss": 3.733723680201382, + "tokens_seen": 790198272 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003841223671013039, + "loss": 2.8344, + "theoretical_loss": 3.733692916581476, + "tokens_seen": 790263808 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003841123370110331, + "loss": 2.8147, + "theoretical_loss": 3.7336621562269383, + "tokens_seen": 790329344 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003841023069207623, + "loss": 2.8566, + "theoretical_loss": 3.7336313991371517, + "tokens_seen": 790394880 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038409227683049147, + "loss": 2.9304, + "theoretical_loss": 3.7336006453114994, + "tokens_seen": 790460416 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003840822467402207, + "loss": 2.9737, + "theoretical_loss": 3.7335698947493645, + "tokens_seen": 790525952 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038407221664994983, + "loss": 2.7628, + "theoretical_loss": 3.733539147450129, + "tokens_seen": 790591488 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038406218655967907, + "loss": 2.6498, + "theoretical_loss": 3.733508403413177, + "tokens_seen": 790657024 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038405215646940825, + "loss": 2.9293, + "theoretical_loss": 3.7334776626378927, + "tokens_seen": 790722560 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038404212637913743, + "loss": 2.8448, + "theoretical_loss": 3.7334469251236584, + "tokens_seen": 790788096 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003840320962888666, + "loss": 2.8674, + "theoretical_loss": 3.733416190869859, + "tokens_seen": 790853632 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003840220661985958, + "loss": 2.9442, + "theoretical_loss": 3.733385459875878, + "tokens_seen": 790919168 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038401203610832497, + "loss": 2.7676, + "theoretical_loss": 3.7333547321411, + "tokens_seen": 790984704 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003840020060180542, + "loss": 2.5382, + "theoretical_loss": 3.7333240076649092, + "tokens_seen": 791050240 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038399197592778333, + "loss": 2.9655, + "theoretical_loss": 3.7332932864466897, + "tokens_seen": 791115776 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038398194583751257, + "loss": 2.9938, + "theoretical_loss": 3.7332625684858263, + "tokens_seen": 791181312 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003839719157472417, + "loss": 2.8769, + "theoretical_loss": 3.733231853781705, + "tokens_seen": 791246848 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038396188565697093, + "loss": 2.8183, + "theoretical_loss": 3.7332011423337086, + "tokens_seen": 791312384 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 912176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.054990291595459, + "objective/train/theoretical_loss": 3.733185787830566, + "objective/train/tokens_used": 811805152, + "theoretical_loss": 3.733185787830566, + "tokens_seen": 791345152 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003839518555667001, + "loss": 2.8538, + "theoretical_loss": 3.733170434141224, + "tokens_seen": 791377920 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003839418254764293, + "loss": 2.9927, + "theoretical_loss": 3.7331397292036366, + "tokens_seen": 791443456 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003839317953861585, + "loss": 2.9339, + "theoretical_loss": 3.7331090275203307, + "tokens_seen": 791508992 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038392176529588765, + "loss": 2.7836, + "theoretical_loss": 3.733078329090693, + "tokens_seen": 791574528 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038391173520561684, + "loss": 2.5344, + "theoretical_loss": 3.733047633914109, + "tokens_seen": 791640064 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038390170511534607, + "loss": 3.0145, + "theoretical_loss": 3.7330169419899653, + "tokens_seen": 791705600 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003838916750250752, + "loss": 2.9672, + "theoretical_loss": 3.7329862533176463, + "tokens_seen": 791771136 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038388164493480443, + "loss": 2.8305, + "theoretical_loss": 3.732955567896541, + "tokens_seen": 791836672 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003838716148445336, + "loss": 2.9355, + "theoretical_loss": 3.7329248857260335, + "tokens_seen": 791902208 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003838615847542628, + "loss": 2.7786, + "theoretical_loss": 3.7328942068055118, + "tokens_seen": 791967744 + }, + { + "epoch": 2.06, + "learning_rate": 0.000383851554663992, + "loss": 2.8358, + "theoretical_loss": 3.7328635311343623, + "tokens_seen": 792033280 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038384152457372116, + "loss": 2.6601, + "theoretical_loss": 3.732832858711973, + "tokens_seen": 792098816 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038383149448345034, + "loss": 2.618, + "theoretical_loss": 3.7328021895377295, + "tokens_seen": 792164352 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003838214643931796, + "loss": 2.9023, + "theoretical_loss": 3.73277152361102, + "tokens_seen": 792229888 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003838114343029087, + "loss": 2.7241, + "theoretical_loss": 3.732740860931232, + "tokens_seen": 792295424 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038380140421263794, + "loss": 2.7667, + "theoretical_loss": 3.7327102014977527, + "tokens_seen": 792360960 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038379137412236706, + "loss": 2.6392, + "theoretical_loss": 3.732679545309971, + "tokens_seen": 792426496 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003837813440320963, + "loss": 2.877, + "theoretical_loss": 3.732648892367274, + "tokens_seen": 792492032 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003837713139418255, + "loss": 2.8724, + "theoretical_loss": 3.7326182426690506, + "tokens_seen": 792557568 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038376128385155466, + "loss": 2.7809, + "theoretical_loss": 3.732587596214688, + "tokens_seen": 792623104 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038375125376128384, + "loss": 2.858, + "theoretical_loss": 3.732556953003576, + "tokens_seen": 792688640 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003837412236710131, + "loss": 2.7547, + "theoretical_loss": 3.732526313035102, + "tokens_seen": 792754176 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003837311935807422, + "loss": 2.8265, + "theoretical_loss": 3.7324956763086563, + "tokens_seen": 792819712 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038372116349047144, + "loss": 3.1426, + "theoretical_loss": 3.732465042823627, + "tokens_seen": 792885248 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038371113340020057, + "loss": 2.7527, + "theoretical_loss": 3.732434412579403, + "tokens_seen": 792950784 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 913766, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.490590810775757, + "objective/train/theoretical_loss": 3.732419098672402, + "objective/train/tokens_used": 813443552, + "theoretical_loss": 3.732419098672402, + "tokens_seen": 792983552 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003837011033099298, + "loss": 2.7067, + "theoretical_loss": 3.7324037855753742, + "tokens_seen": 793016320 + }, + { + "epoch": 2.06, + "learning_rate": 0.000383691073219659, + "loss": 2.7344, + "theoretical_loss": 3.73237316181093, + "tokens_seen": 793081856 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038368104312938816, + "loss": 2.8914, + "theoretical_loss": 3.73234254128546, + "tokens_seen": 793147392 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038367101303911735, + "loss": 2.8415, + "theoretical_loss": 3.7323119239983544, + "tokens_seen": 793212928 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003836609829488465, + "loss": 2.8043, + "theoretical_loss": 3.732281309949002, + "tokens_seen": 793278464 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003836509528585757, + "loss": 2.803, + "theoretical_loss": 3.732250699136795, + "tokens_seen": 793344000 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038364092276830494, + "loss": 2.7812, + "theoretical_loss": 3.732220091561122, + "tokens_seen": 793409536 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003836308926780341, + "loss": 2.6567, + "theoretical_loss": 3.7321894872213743, + "tokens_seen": 793475072 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003836208625877633, + "loss": 2.6744, + "theoretical_loss": 3.7321588861169417, + "tokens_seen": 793540608 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003836108324974925, + "loss": 2.8148, + "theoretical_loss": 3.7321282882472167, + "tokens_seen": 793606144 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038360080240722167, + "loss": 2.8026, + "theoretical_loss": 3.7320976936115886, + "tokens_seen": 793671680 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003835907723169509, + "loss": 2.8677, + "theoretical_loss": 3.7320671022094496, + "tokens_seen": 793737216 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038358074222668003, + "loss": 2.9286, + "theoretical_loss": 3.7320365140401903, + "tokens_seen": 793802752 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038357071213640927, + "loss": 2.9299, + "theoretical_loss": 3.7320059291032024, + "tokens_seen": 793868288 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038356068204613845, + "loss": 2.8521, + "theoretical_loss": 3.7319753473978783, + "tokens_seen": 793933824 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038355065195586763, + "loss": 2.7029, + "theoretical_loss": 3.7319447689236087, + "tokens_seen": 793999360 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003835406218655968, + "loss": 2.8717, + "theoretical_loss": 3.7319141936797866, + "tokens_seen": 794064896 + }, + { + "epoch": 2.06, + "learning_rate": 0.000383530591775326, + "loss": 2.5418, + "theoretical_loss": 3.731883621665803, + "tokens_seen": 794130432 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038352056168505517, + "loss": 2.831, + "theoretical_loss": 3.731853052881051, + "tokens_seen": 794195968 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003835105315947844, + "loss": 2.8505, + "theoretical_loss": 3.7318224873249237, + "tokens_seen": 794261504 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038350050150451353, + "loss": 2.7467, + "theoretical_loss": 3.731791924996812, + "tokens_seen": 794327040 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038349047141424277, + "loss": 2.6475, + "theoretical_loss": 3.7317613658961104, + "tokens_seen": 794392576 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003834804413239719, + "loss": 2.829, + "theoretical_loss": 3.731730810022211, + "tokens_seen": 794458112 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038347041123370113, + "loss": 2.832, + "theoretical_loss": 3.7317002573745066, + "tokens_seen": 794523648 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003834603811434303, + "loss": 2.8858, + "theoretical_loss": 3.731669707952391, + "tokens_seen": 794589184 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 914513, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7244105339050293, + "objective/train/theoretical_loss": 3.7316544344507396, + "objective/train/tokens_used": 815081952, + "theoretical_loss": 3.7316544344507396, + "tokens_seen": 794621952 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003834503510531595, + "loss": 2.8052, + "theoretical_loss": 3.731639161755258, + "tokens_seen": 794654720 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003834403209628887, + "loss": 2.9226, + "theoretical_loss": 3.731608618782501, + "tokens_seen": 794720256 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038343029087261786, + "loss": 2.8461, + "theoretical_loss": 3.731578079033513, + "tokens_seen": 794785792 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038342026078234704, + "loss": 2.7635, + "theoretical_loss": 3.7315475425076894, + "tokens_seen": 794851328 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038341023069207627, + "loss": 2.7834, + "theoretical_loss": 3.731517009204423, + "tokens_seen": 794916864 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003834002006018054, + "loss": 2.6976, + "theoretical_loss": 3.7314864791231086, + "tokens_seen": 794982400 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038339017051153463, + "loss": 2.7814, + "theoretical_loss": 3.7314559522631408, + "tokens_seen": 795047936 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003833801404212638, + "loss": 2.6001, + "theoretical_loss": 3.7314254286239144, + "tokens_seen": 795113472 + }, + { + "epoch": 2.06, + "learning_rate": 0.000383370110330993, + "loss": 2.9477, + "theoretical_loss": 3.7313949082048232, + "tokens_seen": 795179008 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003833600802407222, + "loss": 2.9413, + "theoretical_loss": 3.731364391005263, + "tokens_seen": 795244544 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038335005015045136, + "loss": 2.7242, + "theoretical_loss": 3.731333877024629, + "tokens_seen": 795310080 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038334002006018054, + "loss": 2.7499, + "theoretical_loss": 3.7313033662623156, + "tokens_seen": 795375616 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003833299899699098, + "loss": 2.7898, + "theoretical_loss": 3.7312728587177193, + "tokens_seen": 795441152 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003833199598796389, + "loss": 2.7396, + "theoretical_loss": 3.7312423543902344, + "tokens_seen": 795506688 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038330992978936814, + "loss": 2.7025, + "theoretical_loss": 3.731211853279258, + "tokens_seen": 795572224 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038329989969909726, + "loss": 2.7248, + "theoretical_loss": 3.7311813553841855, + "tokens_seen": 795637760 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003832898696088265, + "loss": 2.7345, + "theoretical_loss": 3.731150860704413, + "tokens_seen": 795703296 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003832798395185557, + "loss": 2.8407, + "theoretical_loss": 3.7311203692393358, + "tokens_seen": 795768832 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038326980942828486, + "loss": 2.9334, + "theoretical_loss": 3.731089880988352, + "tokens_seen": 795834368 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038325977933801404, + "loss": 2.7804, + "theoretical_loss": 3.731059395950857, + "tokens_seen": 795899904 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003832497492477433, + "loss": 2.7048, + "theoretical_loss": 3.731028914126248, + "tokens_seen": 795965440 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003832397191574724, + "loss": 2.8948, + "theoretical_loss": 3.7309984355139214, + "tokens_seen": 796030976 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038322968906720164, + "loss": 2.953, + "theoretical_loss": 3.730967960113275, + "tokens_seen": 796096512 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038321965897693077, + "loss": 2.8011, + "theoretical_loss": 3.730937487923705, + "tokens_seen": 796162048 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038320962888666, + "loss": 2.8264, + "theoretical_loss": 3.7309070189446096, + "tokens_seen": 796227584 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 915896, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.109905481338501, + "objective/train/theoretical_loss": 3.7308917856588018, + "objective/train/tokens_used": 816720352, + "theoretical_loss": 3.7308917856588018, + "tokens_seen": 796260352 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003831995987963892, + "loss": 2.8979, + "theoretical_loss": 3.7308765531753867, + "tokens_seen": 796293120 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038318956870611836, + "loss": 2.908, + "theoretical_loss": 3.730846090615433, + "tokens_seen": 796358656 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038317953861584755, + "loss": 2.7226, + "theoretical_loss": 3.730815631264147, + "tokens_seen": 796424192 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003831695085255767, + "loss": 2.5663, + "theoretical_loss": 3.7307851751209267, + "tokens_seen": 796489728 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003831594784353059, + "loss": 2.7183, + "theoretical_loss": 3.7307547221851696, + "tokens_seen": 796555264 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038314944834503514, + "loss": 2.8574, + "theoretical_loss": 3.730724272456275, + "tokens_seen": 796620800 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038313941825476427, + "loss": 2.8418, + "theoretical_loss": 3.730693825933641, + "tokens_seen": 796686336 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003831293881644935, + "loss": 2.5915, + "theoretical_loss": 3.7306633826166666, + "tokens_seen": 796751872 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038311935807422263, + "loss": 2.9564, + "theoretical_loss": 3.73063294250475, + "tokens_seen": 796817408 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038310932798395187, + "loss": 2.8469, + "theoretical_loss": 3.730602505597291, + "tokens_seen": 796882944 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038309929789368105, + "loss": 2.887, + "theoretical_loss": 3.730572071893688, + "tokens_seen": 796948480 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038308926780341023, + "loss": 2.8746, + "theoretical_loss": 3.7305416413933408, + "tokens_seen": 797014016 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003830792377131394, + "loss": 2.9179, + "theoretical_loss": 3.7305112140956487, + "tokens_seen": 797079552 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038306920762286865, + "loss": 2.9636, + "theoretical_loss": 3.730480790000011, + "tokens_seen": 797145088 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003830591775325978, + "loss": 2.7864, + "theoretical_loss": 3.730450369105829, + "tokens_seen": 797210624 + }, + { + "epoch": 2.06, + "learning_rate": 0.000383049147442327, + "loss": 3.0204, + "theoretical_loss": 3.7304199514125007, + "tokens_seen": 797276160 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038303911735205614, + "loss": 2.8526, + "theoretical_loss": 3.7303895369194273, + "tokens_seen": 797341696 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038302908726178537, + "loss": 2.7846, + "theoretical_loss": 3.730359125626009, + "tokens_seen": 797407232 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038301905717151455, + "loss": 2.941, + "theoretical_loss": 3.7303287175316466, + "tokens_seen": 797472768 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038300902708124373, + "loss": 2.7052, + "theoretical_loss": 3.73029831263574, + "tokens_seen": 797538304 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003829989969909729, + "loss": 2.7305, + "theoretical_loss": 3.73026791093769, + "tokens_seen": 797603840 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003829889669007021, + "loss": 2.7913, + "theoretical_loss": 3.730237512436899, + "tokens_seen": 797669376 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003829789368104313, + "loss": 2.7654, + "theoretical_loss": 3.730207117132766, + "tokens_seen": 797734912 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003829689067201605, + "loss": 2.7138, + "theoretical_loss": 3.7301767250246938, + "tokens_seen": 797800448 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038295887662988964, + "loss": 2.9547, + "theoretical_loss": 3.7301463361120835, + "tokens_seen": 797865984 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 916418, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8996529579162598, + "objective/train/theoretical_loss": 3.730131142853889, + "objective/train/tokens_used": 818358752, + "theoretical_loss": 3.730131142853889, + "tokens_seen": 797898752 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003829488465396189, + "loss": 3.0152, + "theoretical_loss": 3.7301159503943357, + "tokens_seen": 797931520 + }, + { + "epoch": 2.06, + "learning_rate": 0.000382938816449348, + "loss": 3.0106, + "theoretical_loss": 3.7300855678708533, + "tokens_seen": 797997056 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038292878635907724, + "loss": 2.8793, + "theoretical_loss": 3.7300551885410385, + "tokens_seen": 798062592 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003829187562688064, + "loss": 2.7218, + "theoretical_loss": 3.730024812404292, + "tokens_seen": 798128128 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003829087261785356, + "loss": 2.8625, + "theoretical_loss": 3.7299944394600173, + "tokens_seen": 798193664 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003828986960882648, + "loss": 2.5771, + "theoretical_loss": 3.729964069707616, + "tokens_seen": 798259200 + }, + { + "epoch": 2.06, + "learning_rate": 0.000382888665997994, + "loss": 2.8149, + "theoretical_loss": 3.729933703146491, + "tokens_seen": 798324736 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003828786359077232, + "loss": 2.8857, + "theoretical_loss": 3.729903339776045, + "tokens_seen": 798390272 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003828686058174524, + "loss": 2.9546, + "theoretical_loss": 3.7298729795956804, + "tokens_seen": 798455808 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038285857572718156, + "loss": 2.9196, + "theoretical_loss": 3.729842622604801, + "tokens_seen": 798521344 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038284854563691074, + "loss": 2.921, + "theoretical_loss": 3.72981226880281, + "tokens_seen": 798586880 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038283851554664, + "loss": 3.0057, + "theoretical_loss": 3.7297819181891105, + "tokens_seen": 798652416 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003828284854563691, + "loss": 3.0165, + "theoretical_loss": 3.7297515707631055, + "tokens_seen": 798717952 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038281845536609834, + "loss": 3.0633, + "theoretical_loss": 3.7297212265242, + "tokens_seen": 798783488 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038280842527582746, + "loss": 2.8644, + "theoretical_loss": 3.729690885471796, + "tokens_seen": 798849024 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003827983951855567, + "loss": 2.935, + "theoretical_loss": 3.7296605476052993, + "tokens_seen": 798914560 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003827883650952859, + "loss": 2.9549, + "theoretical_loss": 3.7296302129241132, + "tokens_seen": 798980096 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038277833500501506, + "loss": 2.798, + "theoretical_loss": 3.729599881427642, + "tokens_seen": 799045632 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038276830491474424, + "loss": 2.9321, + "theoretical_loss": 3.72956955311529, + "tokens_seen": 799111168 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003827582748244735, + "loss": 2.8404, + "theoretical_loss": 3.7295392279864625, + "tokens_seen": 799176704 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003827482447342026, + "loss": 2.5635, + "theoretical_loss": 3.729508906040564, + "tokens_seen": 799242240 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038273821464393184, + "loss": 2.7403, + "theoretical_loss": 3.7294785872769993, + "tokens_seen": 799307776 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038272818455366097, + "loss": 2.9023, + "theoretical_loss": 3.7294482716951736, + "tokens_seen": 799373312 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003827181544633902, + "loss": 2.5704, + "theoretical_loss": 3.7294179592944925, + "tokens_seen": 799438848 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003827081243731194, + "loss": 2.8978, + "theoretical_loss": 3.729387650074361, + "tokens_seen": 799504384 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 917678, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9319193363189697, + "objective/train/theoretical_loss": 3.729372496656816, + "objective/train/tokens_used": 819997152, + "theoretical_loss": 3.729372496656816, + "tokens_seen": 799537152 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038269809428284856, + "loss": 2.9678, + "theoretical_loss": 3.729357344034185, + "tokens_seen": 799569920 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038268806419257775, + "loss": 2.7522, + "theoretical_loss": 3.72932704117337, + "tokens_seen": 799635456 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003826780341023069, + "loss": 2.9152, + "theoretical_loss": 3.7292967414913223, + "tokens_seen": 799700992 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003826680040120361, + "loss": 2.5198, + "theoretical_loss": 3.7292664449874477, + "tokens_seen": 799766528 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038265797392176534, + "loss": 2.8087, + "theoretical_loss": 3.729236151661153, + "tokens_seen": 799832064 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038264794383149447, + "loss": 2.8923, + "theoretical_loss": 3.7292058615118435, + "tokens_seen": 799897600 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003826379137412237, + "loss": 2.7683, + "theoretical_loss": 3.729175574538927, + "tokens_seen": 799963136 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038262788365095283, + "loss": 2.7146, + "theoretical_loss": 3.7291452907418092, + "tokens_seen": 800028672 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038261785356068207, + "loss": 2.9047, + "theoretical_loss": 3.729115010119897, + "tokens_seen": 800094208 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038260782347041125, + "loss": 2.712, + "theoretical_loss": 3.7290847326725984, + "tokens_seen": 800159744 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038259779338014043, + "loss": 2.7399, + "theoretical_loss": 3.72905445839932, + "tokens_seen": 800225280 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003825877632898696, + "loss": 2.6716, + "theoretical_loss": 3.729024187299469, + "tokens_seen": 800290816 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038257773319959885, + "loss": 2.878, + "theoretical_loss": 3.7289939193724537, + "tokens_seen": 800356352 + }, + { + "epoch": 2.06, + "learning_rate": 0.000382567703109328, + "loss": 2.7631, + "theoretical_loss": 3.728963654617681, + "tokens_seen": 800421888 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003825576730190572, + "loss": 2.8419, + "theoretical_loss": 3.7289333930345587, + "tokens_seen": 800487424 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038254764292878634, + "loss": 2.5932, + "theoretical_loss": 3.7289031346224952, + "tokens_seen": 800552960 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038253761283851557, + "loss": 2.8904, + "theoretical_loss": 3.7288728793808987, + "tokens_seen": 800618496 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038252758274824475, + "loss": 3.065, + "theoretical_loss": 3.7288426273091773, + "tokens_seen": 800684032 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038251755265797393, + "loss": 2.7377, + "theoretical_loss": 3.7288123784067393, + "tokens_seen": 800749568 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003825075225677031, + "loss": 2.8729, + "theoretical_loss": 3.7287821326729933, + "tokens_seen": 800815104 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003824974924774323, + "loss": 2.7359, + "theoretical_loss": 3.728751890107349, + "tokens_seen": 800880640 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003824874623871615, + "loss": 2.8713, + "theoretical_loss": 3.7287216507092142, + "tokens_seen": 800946176 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003824774322968907, + "loss": 2.5688, + "theoretical_loss": 3.728691414477999, + "tokens_seen": 801011712 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038246740220661984, + "loss": 2.7509, + "theoretical_loss": 3.728661181413112, + "tokens_seen": 801077248 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003824573721163491, + "loss": 2.6961, + "theoretical_loss": 3.7286309515139626, + "tokens_seen": 801142784 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 918390, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.938758134841919, + "objective/train/theoretical_loss": 3.728615837751355, + "objective/train/tokens_used": 821635552, + "theoretical_loss": 3.728615837751355, + "tokens_seen": 801175552 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003824473420260782, + "loss": 3.0026, + "theoretical_loss": 3.7286007247799606, + "tokens_seen": 801208320 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038243731193580744, + "loss": 2.6613, + "theoretical_loss": 3.7285705012105157, + "tokens_seen": 801273856 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003824272818455366, + "loss": 2.668, + "theoretical_loss": 3.7285402808050376, + "tokens_seen": 801339392 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003824172517552658, + "loss": 2.5081, + "theoretical_loss": 3.728510063562937, + "tokens_seen": 801404928 + }, + { + "epoch": 2.06, + "learning_rate": 0.000382407221664995, + "loss": 2.5455, + "theoretical_loss": 3.7284798494836235, + "tokens_seen": 801470464 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003823971915747242, + "loss": 2.7998, + "theoretical_loss": 3.7284496385665076, + "tokens_seen": 801536000 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038238716148445334, + "loss": 2.8151, + "theoretical_loss": 3.728419430811, + "tokens_seen": 801601536 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003823771313941826, + "loss": 2.7684, + "theoretical_loss": 3.7283892262165117, + "tokens_seen": 801667072 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003823671013039117, + "loss": 3.0378, + "theoretical_loss": 3.7283590247824527, + "tokens_seen": 801732608 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038235707121364094, + "loss": 2.9177, + "theoretical_loss": 3.7283288265082346, + "tokens_seen": 801798144 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003823470411233701, + "loss": 3.1085, + "theoretical_loss": 3.7282986313932684, + "tokens_seen": 801863680 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003823370110330993, + "loss": 2.8762, + "theoretical_loss": 3.728268439436966, + "tokens_seen": 801929216 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003823269809428285, + "loss": 2.7137, + "theoretical_loss": 3.7282382506387375, + "tokens_seen": 801994752 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038231695085255766, + "loss": 2.524, + "theoretical_loss": 3.728208064997996, + "tokens_seen": 802060288 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038230692076228685, + "loss": 2.8362, + "theoretical_loss": 3.728177882514152, + "tokens_seen": 802125824 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003822968906720161, + "loss": 2.9109, + "theoretical_loss": 3.728147703186619, + "tokens_seen": 802191360 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003822868605817452, + "loss": 2.7985, + "theoretical_loss": 3.728117527014808, + "tokens_seen": 802256896 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038227683049147444, + "loss": 2.8394, + "theoretical_loss": 3.7280873539981307, + "tokens_seen": 802322432 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038226680040120357, + "loss": 2.7245, + "theoretical_loss": 3.7280571841360013, + "tokens_seen": 802387968 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003822567703109328, + "loss": 2.8553, + "theoretical_loss": 3.7280270174278316, + "tokens_seen": 802453504 + }, + { + "epoch": 2.06, + "learning_rate": 0.000382246740220662, + "loss": 2.6918, + "theoretical_loss": 3.7279968538730333, + "tokens_seen": 802519040 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038223671013039117, + "loss": 2.6755, + "theoretical_loss": 3.7279666934710205, + "tokens_seen": 802584576 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038222668004012035, + "loss": 2.7522, + "theoretical_loss": 3.727936536221206, + "tokens_seen": 802650112 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003822166499498496, + "loss": 2.6767, + "theoretical_loss": 3.7279063821230034, + "tokens_seen": 802715648 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003822066198595787, + "loss": 2.8416, + "theoretical_loss": 3.727876231175825, + "tokens_seen": 802781184 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 919804, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2665350437164307, + "objective/train/theoretical_loss": 3.727861156883687, + "objective/train/tokens_used": 823273952, + "theoretical_loss": 3.727861156883687, + "tokens_seen": 802813952 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038219658976930795, + "loss": 2.9272, + "theoretical_loss": 3.7278460833790854, + "tokens_seen": 802846720 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003821865596790371, + "loss": 2.8213, + "theoretical_loss": 3.7278159387321974, + "tokens_seen": 802912256 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003821765295887663, + "loss": 2.7012, + "theoretical_loss": 3.727785797234575, + "tokens_seen": 802977792 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003821664994984955, + "loss": 2.6509, + "theoretical_loss": 3.7277556588856333, + "tokens_seen": 803043328 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038215646940822467, + "loss": 2.8218, + "theoretical_loss": 3.7277255236847853, + "tokens_seen": 803108864 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038214643931795385, + "loss": 2.9506, + "theoretical_loss": 3.7276953916314453, + "tokens_seen": 803174400 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038213640922768303, + "loss": 2.8434, + "theoretical_loss": 3.727665262725029, + "tokens_seen": 803239936 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038212637913741227, + "loss": 2.9449, + "theoretical_loss": 3.7276351369649494, + "tokens_seen": 803305472 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038211634904714145, + "loss": 2.6784, + "theoretical_loss": 3.7276050143506225, + "tokens_seen": 803371008 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038210631895687063, + "loss": 2.9453, + "theoretical_loss": 3.727574894881462, + "tokens_seen": 803436544 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003820962888665998, + "loss": 2.7106, + "theoretical_loss": 3.7275447785568847, + "tokens_seen": 803502080 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038208625877632905, + "loss": 2.8271, + "theoretical_loss": 3.7275146653763045, + "tokens_seen": 803567616 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003820762286860582, + "loss": 2.9294, + "theoretical_loss": 3.727484555339137, + "tokens_seen": 803633152 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003820661985957874, + "loss": 2.5488, + "theoretical_loss": 3.7274544484447985, + "tokens_seen": 803698688 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038205616850551654, + "loss": 2.9373, + "theoretical_loss": 3.727424344692704, + "tokens_seen": 803764224 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038204613841524577, + "loss": 2.9778, + "theoretical_loss": 3.7273942440822694, + "tokens_seen": 803829760 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038203610832497495, + "loss": 2.9941, + "theoretical_loss": 3.727364146612911, + "tokens_seen": 803895296 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038202607823470413, + "loss": 2.6439, + "theoretical_loss": 3.727334052284045, + "tokens_seen": 803960832 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003820160481444333, + "loss": 2.7207, + "theoretical_loss": 3.7273039610950875, + "tokens_seen": 804026368 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003820060180541625, + "loss": 2.9206, + "theoretical_loss": 3.7272738730454558, + "tokens_seen": 804091904 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003819959879638917, + "loss": 2.8644, + "theoretical_loss": 3.7272437881345652, + "tokens_seen": 804157440 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003819859578736209, + "loss": 2.964, + "theoretical_loss": 3.7272137063618334, + "tokens_seen": 804222976 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038197592778335004, + "loss": 2.8123, + "theoretical_loss": 3.727183627726677, + "tokens_seen": 804288512 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003819658976930793, + "loss": 2.7005, + "theoretical_loss": 3.727153552228514, + "tokens_seen": 804354048 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003819558676028084, + "loss": 3.1517, + "theoretical_loss": 3.7271234798667603, + "tokens_seen": 804419584 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 920531, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.036098003387451, + "objective/train/theoretical_loss": 3.727108444861855, + "objective/train/tokens_used": 824912352, + "theoretical_loss": 3.727108444861855, + "tokens_seen": 804452352 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038194583751253764, + "loss": 2.6692, + "theoretical_loss": 3.727093410640835, + "tokens_seen": 804485120 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003819358074222668, + "loss": 2.885, + "theoretical_loss": 3.727063344550154, + "tokens_seen": 804550656 + }, + { + "epoch": 2.06, + "learning_rate": 0.000381925777331996, + "loss": 3.1506, + "theoretical_loss": 3.727033281594136, + "tokens_seen": 804616192 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003819157472417252, + "loss": 2.9126, + "theoretical_loss": 3.7270032217721987, + "tokens_seen": 804681728 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003819057171514544, + "loss": 2.7855, + "theoretical_loss": 3.72697316508376, + "tokens_seen": 804747264 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038189568706118354, + "loss": 2.5874, + "theoretical_loss": 3.7269431115282385, + "tokens_seen": 804812800 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003818856569709128, + "loss": 2.7009, + "theoretical_loss": 3.7269130611050523, + "tokens_seen": 804878336 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003818756268806419, + "loss": 2.4843, + "theoretical_loss": 3.72688301381362, + "tokens_seen": 804943872 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038186559679037114, + "loss": 2.7415, + "theoretical_loss": 3.72685296965336, + "tokens_seen": 805009408 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003818555667001003, + "loss": 2.5227, + "theoretical_loss": 3.7268229286236916, + "tokens_seen": 805074944 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003818455366098295, + "loss": 2.7426, + "theoretical_loss": 3.7267928907240337, + "tokens_seen": 805140480 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003818355065195587, + "loss": 2.7953, + "theoretical_loss": 3.7267628559538055, + "tokens_seen": 805206016 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038182547642928786, + "loss": 2.7316, + "theoretical_loss": 3.7267328243124256, + "tokens_seen": 805271552 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038181544633901705, + "loss": 2.7126, + "theoretical_loss": 3.726702795799314, + "tokens_seen": 805337088 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003818054162487463, + "loss": 2.8651, + "theoretical_loss": 3.7266727704138907, + "tokens_seen": 805402624 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003817953861584754, + "loss": 2.7344, + "theoretical_loss": 3.726642748155575, + "tokens_seen": 805468160 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038178535606820464, + "loss": 2.8418, + "theoretical_loss": 3.7266127290237865, + "tokens_seen": 805533696 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038177532597793377, + "loss": 2.887, + "theoretical_loss": 3.7265827130179465, + "tokens_seen": 805599232 + }, + { + "epoch": 2.06, + "learning_rate": 0.000381765295887663, + "loss": 2.8043, + "theoretical_loss": 3.7265527001374736, + "tokens_seen": 805664768 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003817552657973922, + "loss": 2.8358, + "theoretical_loss": 3.7265226903817896, + "tokens_seen": 805730304 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038174523570712137, + "loss": 2.6254, + "theoretical_loss": 3.726492683750314, + "tokens_seen": 805795840 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038173520561685055, + "loss": 2.6947, + "theoretical_loss": 3.7264626802424683, + "tokens_seen": 805861376 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003817251755265798, + "loss": 2.6898, + "theoretical_loss": 3.7264326798576723, + "tokens_seen": 805926912 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003817151454363089, + "loss": 2.7434, + "theoretical_loss": 3.7264026825953485, + "tokens_seen": 805992448 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038170511534603815, + "loss": 2.557, + "theoretical_loss": 3.7263726884549166, + "tokens_seen": 806057984 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 921184, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.061922788619995, + "objective/train/theoretical_loss": 3.7263576925552298, + "objective/train/tokens_used": 826550752, + "theoretical_loss": 3.7263576925552298, + "tokens_seen": 806090752 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003816950852557673, + "loss": 2.9332, + "theoretical_loss": 3.7263426974357987, + "tokens_seen": 806123520 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003816850551654965, + "loss": 2.8546, + "theoretical_loss": 3.7263127095374164, + "tokens_seen": 806189056 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003816750250752257, + "loss": 2.9827, + "theoretical_loss": 3.7262827247591908, + "tokens_seen": 806254592 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038166499498495487, + "loss": 2.7669, + "theoretical_loss": 3.7262527431005443, + "tokens_seen": 806320128 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038165496489468405, + "loss": 2.9307, + "theoretical_loss": 3.726222764560898, + "tokens_seen": 806385664 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038164493480441323, + "loss": 2.855, + "theoretical_loss": 3.7261927891396747, + "tokens_seen": 806451200 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003816349047141424, + "loss": 2.6853, + "theoretical_loss": 3.726162816836296, + "tokens_seen": 806516736 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038162487462387165, + "loss": 2.6587, + "theoretical_loss": 3.7261328476501845, + "tokens_seen": 806582272 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003816148445336008, + "loss": 2.7978, + "theoretical_loss": 3.7261028815807635, + "tokens_seen": 806647808 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038160481444333, + "loss": 2.8737, + "theoretical_loss": 3.726072918627455, + "tokens_seen": 806713344 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038159478435305914, + "loss": 2.6942, + "theoretical_loss": 3.7260429587896815, + "tokens_seen": 806778880 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003815847542627884, + "loss": 2.8407, + "theoretical_loss": 3.7260130020668667, + "tokens_seen": 806844416 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038157472417251756, + "loss": 2.8455, + "theoretical_loss": 3.725983048458434, + "tokens_seen": 806909952 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038156469408224674, + "loss": 2.7394, + "theoretical_loss": 3.7259530979638056, + "tokens_seen": 806975488 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003815546639919759, + "loss": 2.8217, + "theoretical_loss": 3.725923150582406, + "tokens_seen": 807041024 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038154463390170515, + "loss": 2.7258, + "theoretical_loss": 3.7258932063136583, + "tokens_seen": 807106560 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003815346038114343, + "loss": 2.4661, + "theoretical_loss": 3.7258632651569865, + "tokens_seen": 807172096 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003815245737211635, + "loss": 2.8683, + "theoretical_loss": 3.725833327111814, + "tokens_seen": 807237632 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038151454363089264, + "loss": 2.3263, + "theoretical_loss": 3.725803392177566, + "tokens_seen": 807303168 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003815045135406219, + "loss": 2.5954, + "theoretical_loss": 3.7257734603536656, + "tokens_seen": 807368704 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038149448345035106, + "loss": 2.9207, + "theoretical_loss": 3.7257435316395373, + "tokens_seen": 807434240 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038148445336008024, + "loss": 3.1429, + "theoretical_loss": 3.7257136060346063, + "tokens_seen": 807499776 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003814744232698094, + "loss": 2.795, + "theoretical_loss": 3.725683683538297, + "tokens_seen": 807565312 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003814643931795386, + "loss": 2.5561, + "theoretical_loss": 3.7256537641500347, + "tokens_seen": 807630848 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003814543630892678, + "loss": 2.9045, + "theoretical_loss": 3.725623847869244, + "tokens_seen": 807696384 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 922293, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.627511501312256, + "objective/train/theoretical_loss": 3.72560889089397, + "objective/train/tokens_used": 828189152, + "theoretical_loss": 3.72560889089397, + "tokens_seen": 807729152 + }, + { + "epoch": 2.06, + "learning_rate": 0.000381444332998997, + "loss": 2.818, + "theoretical_loss": 3.725593934695349, + "tokens_seen": 807761920 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038143430290872614, + "loss": 2.9807, + "theoretical_loss": 3.7255640246277766, + "tokens_seen": 807827456 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003814242728184554, + "loss": 2.9548, + "theoretical_loss": 3.7255341176659513, + "tokens_seen": 807892992 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038141424272818456, + "loss": 2.9676, + "theoretical_loss": 3.725504213809299, + "tokens_seen": 807958528 + }, + { + "epoch": 2.06, + "learning_rate": 0.00038140421263791374, + "loss": 2.7428, + "theoretical_loss": 3.725474313057246, + "tokens_seen": 808024064 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003813941825476429, + "loss": 2.8616, + "theoretical_loss": 3.7254444154092177, + "tokens_seen": 808089600 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003813841524573721, + "loss": 2.9295, + "theoretical_loss": 3.72541452086464, + "tokens_seen": 808155136 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038137412236710134, + "loss": 2.6852, + "theoretical_loss": 3.7253846294229396, + "tokens_seen": 808220672 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003813640922768305, + "loss": 2.6936, + "theoretical_loss": 3.7253547410835424, + "tokens_seen": 808286208 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003813540621865597, + "loss": 2.5662, + "theoretical_loss": 3.725324855845875, + "tokens_seen": 808351744 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003813440320962889, + "loss": 2.8743, + "theoretical_loss": 3.7252949737093646, + "tokens_seen": 808417280 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038133400200601806, + "loss": 2.6674, + "theoretical_loss": 3.7252650946734374, + "tokens_seen": 808482816 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038132397191574725, + "loss": 2.8592, + "theoretical_loss": 3.7252352187375206, + "tokens_seen": 808548352 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003813139418254765, + "loss": 2.9754, + "theoretical_loss": 3.725205345901042, + "tokens_seen": 808613888 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003813039117352056, + "loss": 2.6084, + "theoretical_loss": 3.725175476163427, + "tokens_seen": 808679424 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038129388164493484, + "loss": 2.7187, + "theoretical_loss": 3.7251456095241053, + "tokens_seen": 808744960 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038128385155466397, + "loss": 2.8086, + "theoretical_loss": 3.725115745982503, + "tokens_seen": 808810496 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003812738214643932, + "loss": 2.8193, + "theoretical_loss": 3.725085885538049, + "tokens_seen": 808876032 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003812637913741224, + "loss": 2.8032, + "theoretical_loss": 3.7250560281901706, + "tokens_seen": 808941568 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038125376128385157, + "loss": 2.766, + "theoretical_loss": 3.7250261739382955, + "tokens_seen": 809007104 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038124373119358075, + "loss": 2.82, + "theoretical_loss": 3.724996322781852, + "tokens_seen": 809072640 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038123370110331, + "loss": 2.5854, + "theoretical_loss": 3.7249664747202686, + "tokens_seen": 809138176 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003812236710130391, + "loss": 2.4659, + "theoretical_loss": 3.7249366297529742, + "tokens_seen": 809203712 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038121364092276835, + "loss": 2.7821, + "theoretical_loss": 3.724906787879397, + "tokens_seen": 809269248 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003812036108324975, + "loss": 2.5499, + "theoretical_loss": 3.7248769490989666, + "tokens_seen": 809334784 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 923007, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.85902738571167, + "objective/train/theoretical_loss": 3.7248620308685023, + "objective/train/tokens_used": 829827552, + "theoretical_loss": 3.7248620308685023, + "tokens_seen": 809367552 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003811935807422267, + "loss": 2.8503, + "theoretical_loss": 3.7248471134111103, + "tokens_seen": 809400320 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003811835506519559, + "loss": 2.7954, + "theoretical_loss": 3.724817280815259, + "tokens_seen": 809465856 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038117352056168507, + "loss": 2.7549, + "theoretical_loss": 3.7247874513108403, + "tokens_seen": 809531392 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038116349047141425, + "loss": 2.5741, + "theoretical_loss": 3.7247576248972853, + "tokens_seen": 809596928 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038115346038114343, + "loss": 2.7751, + "theoretical_loss": 3.724727801574023, + "tokens_seen": 809662464 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003811434302908726, + "loss": 3.0212, + "theoretical_loss": 3.724697981340482, + "tokens_seen": 809728000 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038113340020060185, + "loss": 2.8603, + "theoretical_loss": 3.7246681641960935, + "tokens_seen": 809793536 + }, + { + "epoch": 2.07, + "learning_rate": 0.000381123370110331, + "loss": 2.8533, + "theoretical_loss": 3.724638350140287, + "tokens_seen": 809859072 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003811133400200602, + "loss": 2.8063, + "theoretical_loss": 3.7246085391724932, + "tokens_seen": 809924608 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038110330992978934, + "loss": 2.7438, + "theoretical_loss": 3.7245787312921417, + "tokens_seen": 809990144 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003810932798395186, + "loss": 2.7521, + "theoretical_loss": 3.724548926498663, + "tokens_seen": 810055680 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038108324974924776, + "loss": 2.7021, + "theoretical_loss": 3.7245191247914886, + "tokens_seen": 810121216 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038107321965897694, + "loss": 2.6705, + "theoretical_loss": 3.7244893261700485, + "tokens_seen": 810186752 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003810631895687061, + "loss": 2.8608, + "theoretical_loss": 3.7244595306337733, + "tokens_seen": 810252288 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038105315947843535, + "loss": 2.8353, + "theoretical_loss": 3.7244297381820948, + "tokens_seen": 810317824 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003810431293881645, + "loss": 2.9204, + "theoretical_loss": 3.7243999488144444, + "tokens_seen": 810383360 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003810330992978937, + "loss": 3.0504, + "theoretical_loss": 3.7243701625302528, + "tokens_seen": 810448896 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038102306920762284, + "loss": 2.8244, + "theoretical_loss": 3.724340379328952, + "tokens_seen": 810514432 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003810130391173521, + "loss": 2.8698, + "theoretical_loss": 3.724310599209973, + "tokens_seen": 810579968 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038100300902708126, + "loss": 2.9134, + "theoretical_loss": 3.724280822172749, + "tokens_seen": 810645504 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038099297893681044, + "loss": 2.5862, + "theoretical_loss": 3.724251048216711, + "tokens_seen": 810711040 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003809829488465396, + "loss": 2.6886, + "theoretical_loss": 3.7242212773412913, + "tokens_seen": 810776576 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003809729187562688, + "loss": 2.815, + "theoretical_loss": 3.7241915095459213, + "tokens_seen": 810842112 + }, + { + "epoch": 2.07, + "learning_rate": 0.000380962888665998, + "loss": 2.7922, + "theoretical_loss": 3.724161744830035, + "tokens_seen": 810907648 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003809528585757272, + "loss": 2.9728, + "theoretical_loss": 3.7241319831930646, + "tokens_seen": 810973184 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 924257, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.479335069656372, + "objective/train/theoretical_loss": 3.724117103528995, + "objective/train/tokens_used": 831465952, + "theoretical_loss": 3.724117103528995, + "tokens_seen": 811005952 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038094282848545635, + "loss": 2.5841, + "theoretical_loss": 3.724102224634442, + "tokens_seen": 811038720 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003809327983951856, + "loss": 2.7625, + "theoretical_loss": 3.7240724691536005, + "tokens_seen": 811104256 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038092276830491476, + "loss": 2.6284, + "theoretical_loss": 3.7240427167499734, + "tokens_seen": 811169792 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038091273821464394, + "loss": 2.8429, + "theoretical_loss": 3.724012967422994, + "tokens_seen": 811235328 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003809027081243731, + "loss": 2.6949, + "theoretical_loss": 3.7239832211720953, + "tokens_seen": 811300864 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003808926780341023, + "loss": 2.8882, + "theoretical_loss": 3.7239534779967105, + "tokens_seen": 811366400 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003808826479438315, + "loss": 2.8984, + "theoretical_loss": 3.723923737896274, + "tokens_seen": 811431936 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003808726178535607, + "loss": 2.851, + "theoretical_loss": 3.723894000870219, + "tokens_seen": 811497472 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038086258776328985, + "loss": 2.8951, + "theoretical_loss": 3.72386426691798, + "tokens_seen": 811563008 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003808525576730191, + "loss": 2.6595, + "theoretical_loss": 3.72383453603899, + "tokens_seen": 811628544 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003808425275827482, + "loss": 2.7869, + "theoretical_loss": 3.7238048082326847, + "tokens_seen": 811694080 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038083249749247745, + "loss": 2.7446, + "theoretical_loss": 3.7237750834984977, + "tokens_seen": 811759616 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003808224674022066, + "loss": 2.9705, + "theoretical_loss": 3.7237453618358636, + "tokens_seen": 811825152 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003808124373119358, + "loss": 2.7296, + "theoretical_loss": 3.723715643244217, + "tokens_seen": 811890688 + }, + { + "epoch": 2.07, + "learning_rate": 0.000380802407221665, + "loss": 2.7897, + "theoretical_loss": 3.7236859277229923, + "tokens_seen": 811956224 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038079237713139417, + "loss": 2.8115, + "theoretical_loss": 3.7236562152716255, + "tokens_seen": 812021760 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038078234704112335, + "loss": 2.9943, + "theoretical_loss": 3.723626505889551, + "tokens_seen": 812087296 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003807723169508526, + "loss": 2.7147, + "theoretical_loss": 3.723596799576205, + "tokens_seen": 812152832 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003807622868605817, + "loss": 2.8725, + "theoretical_loss": 3.7235670963310215, + "tokens_seen": 812218368 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038075225677031095, + "loss": 2.9766, + "theoretical_loss": 3.7235373961534375, + "tokens_seen": 812283904 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038074222668004013, + "loss": 2.7214, + "theoretical_loss": 3.7235076990428877, + "tokens_seen": 812349440 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003807321965897693, + "loss": 2.7914, + "theoretical_loss": 3.7234780049988085, + "tokens_seen": 812414976 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003807221664994985, + "loss": 2.7851, + "theoretical_loss": 3.7234483140206356, + "tokens_seen": 812480512 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003807121364092277, + "loss": 2.92, + "theoretical_loss": 3.7234186261078057, + "tokens_seen": 812546048 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038070210631895685, + "loss": 2.7906, + "theoretical_loss": 3.7233889412597545, + "tokens_seen": 812611584 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 924620, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1071670055389404, + "objective/train/theoretical_loss": 3.723374099984845, + "objective/train/tokens_used": 833104352, + "theoretical_loss": 3.723374099984845, + "tokens_seen": 812644352 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003806920762286861, + "loss": 2.8908, + "theoretical_loss": 3.7233592594759193, + "tokens_seen": 812677120 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003806820461384152, + "loss": 2.8797, + "theoretical_loss": 3.7233295807557356, + "tokens_seen": 812742656 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038067201604814445, + "loss": 2.65, + "theoretical_loss": 3.7232999050986413, + "tokens_seen": 812808192 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003806619859578736, + "loss": 2.9167, + "theoretical_loss": 3.7232702325040727, + "tokens_seen": 812873728 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003806519558676028, + "loss": 2.9298, + "theoretical_loss": 3.7232405629714673, + "tokens_seen": 812939264 + }, + { + "epoch": 2.07, + "learning_rate": 0.000380641925777332, + "loss": 2.8462, + "theoretical_loss": 3.7232108965002615, + "tokens_seen": 813004800 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003806318956870612, + "loss": 2.7202, + "theoretical_loss": 3.7231812330898935, + "tokens_seen": 813070336 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003806218655967904, + "loss": 2.8769, + "theoretical_loss": 3.7231515727398006, + "tokens_seen": 813135872 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038061183550651954, + "loss": 2.5064, + "theoretical_loss": 3.72312191544942, + "tokens_seen": 813201408 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003806018054162488, + "loss": 2.8703, + "theoretical_loss": 3.7230922612181905, + "tokens_seen": 813266944 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038059177532597796, + "loss": 2.68, + "theoretical_loss": 3.7230626100455497, + "tokens_seen": 813332480 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038058174523570714, + "loss": 2.6174, + "theoretical_loss": 3.723032961930935, + "tokens_seen": 813398016 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003805717151454363, + "loss": 2.8521, + "theoretical_loss": 3.7230033168737853, + "tokens_seen": 813463552 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038056168505516555, + "loss": 2.9735, + "theoretical_loss": 3.722973674873539, + "tokens_seen": 813529088 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003805516549648947, + "loss": 2.9433, + "theoretical_loss": 3.7229440359296344, + "tokens_seen": 813594624 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003805416248746239, + "loss": 2.9525, + "theoretical_loss": 3.7229144000415104, + "tokens_seen": 813660160 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038053159478435304, + "loss": 3.0126, + "theoretical_loss": 3.722884767208606, + "tokens_seen": 813725696 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003805215646940823, + "loss": 2.7167, + "theoretical_loss": 3.7228551374303596, + "tokens_seen": 813791232 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038051153460381146, + "loss": 2.8659, + "theoretical_loss": 3.7228255107062114, + "tokens_seen": 813856768 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038050150451354064, + "loss": 2.7003, + "theoretical_loss": 3.7227958870356, + "tokens_seen": 813922304 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003804914744232698, + "loss": 2.6967, + "theoretical_loss": 3.722766266417965, + "tokens_seen": 813987840 + }, + { + "epoch": 2.07, + "learning_rate": 0.000380481444332999, + "loss": 2.6027, + "theoretical_loss": 3.7227366488527456, + "tokens_seen": 814053376 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003804714142427282, + "loss": 2.7122, + "theoretical_loss": 3.722707034339382, + "tokens_seen": 814118912 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003804613841524574, + "loss": 2.9424, + "theoretical_loss": 3.7226774228773145, + "tokens_seen": 814184448 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038045135406218655, + "loss": 2.5289, + "theoretical_loss": 3.722647814465982, + "tokens_seen": 814249984 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 925886, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.111829996109009, + "objective/train/theoretical_loss": 3.7226330114041666, + "objective/train/tokens_used": 834742752, + "theoretical_loss": 3.7226330114041666, + "tokens_seen": 814282752 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003804413239719158, + "loss": 2.7899, + "theoretical_loss": 3.7226182091048257, + "tokens_seen": 814315520 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038043129388164496, + "loss": 2.6081, + "theoretical_loss": 3.7225886067932854, + "tokens_seen": 814381056 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038042126379137414, + "loss": 2.8281, + "theoretical_loss": 3.722559007530802, + "tokens_seen": 814446592 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003804112337011033, + "loss": 2.8378, + "theoretical_loss": 3.722529411316816, + "tokens_seen": 814512128 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003804012036108325, + "loss": 2.812, + "theoretical_loss": 3.7224998181507676, + "tokens_seen": 814577664 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003803911735205617, + "loss": 3.027, + "theoretical_loss": 3.7224702280320985, + "tokens_seen": 814643200 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003803811434302909, + "loss": 2.8126, + "theoretical_loss": 3.722440640960249, + "tokens_seen": 814708736 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038037111334002005, + "loss": 2.8249, + "theoretical_loss": 3.7224110569346616, + "tokens_seen": 814774272 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003803610832497493, + "loss": 2.6414, + "theoretical_loss": 3.7223814759547764, + "tokens_seen": 814839808 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003803510531594784, + "loss": 2.712, + "theoretical_loss": 3.722351898020036, + "tokens_seen": 814905344 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038034102306920765, + "loss": 2.7911, + "theoretical_loss": 3.7223223231298808, + "tokens_seen": 814970880 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038033099297893683, + "loss": 2.5521, + "theoretical_loss": 3.722292751283754, + "tokens_seen": 815036416 + }, + { + "epoch": 2.07, + "learning_rate": 0.000380320962888666, + "loss": 2.783, + "theoretical_loss": 3.722263182481096, + "tokens_seen": 815101952 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003803109327983952, + "loss": 2.7707, + "theoretical_loss": 3.72223361672135, + "tokens_seen": 815167488 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038030090270812437, + "loss": 2.722, + "theoretical_loss": 3.7222040540039583, + "tokens_seen": 815233024 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038029087261785355, + "loss": 2.8075, + "theoretical_loss": 3.722174494328363, + "tokens_seen": 815298560 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003802808425275828, + "loss": 2.805, + "theoretical_loss": 3.7221449376940066, + "tokens_seen": 815364096 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003802708124373119, + "loss": 2.7188, + "theoretical_loss": 3.7221153841003316, + "tokens_seen": 815429632 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038026078234704115, + "loss": 2.6442, + "theoretical_loss": 3.7220858335467817, + "tokens_seen": 815495168 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038025075225677033, + "loss": 2.8672, + "theoretical_loss": 3.722056286032799, + "tokens_seen": 815560704 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003802407221664995, + "loss": 2.8103, + "theoretical_loss": 3.7220267415578268, + "tokens_seen": 815626240 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003802306920762287, + "loss": 2.626, + "theoretical_loss": 3.721997200121309, + "tokens_seen": 815691776 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003802206619859579, + "loss": 2.7419, + "theoretical_loss": 3.7219676617226876, + "tokens_seen": 815757312 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038021063189568705, + "loss": 2.554, + "theoretical_loss": 3.721938126361408, + "tokens_seen": 815822848 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003802006018054163, + "loss": 2.4705, + "theoretical_loss": 3.721908594036913, + "tokens_seen": 815888384 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 927338, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0634372234344482, + "objective/train/theoretical_loss": 3.7218938290132857, + "objective/train/tokens_used": 836381152, + "theoretical_loss": 3.7218938290132857, + "tokens_seen": 815921152 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003801905717151454, + "loss": 2.8934, + "theoretical_loss": 3.721879064748646, + "tokens_seen": 815953920 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038018054162487465, + "loss": 2.9032, + "theoretical_loss": 3.721849538496052, + "tokens_seen": 816019456 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003801705115346038, + "loss": 2.8216, + "theoretical_loss": 3.721820015278575, + "tokens_seen": 816084992 + }, + { + "epoch": 2.07, + "learning_rate": 0.000380160481444333, + "loss": 2.7976, + "theoretical_loss": 3.721790495095658, + "tokens_seen": 816150528 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003801504513540622, + "loss": 2.4449, + "theoretical_loss": 3.7217609779467473, + "tokens_seen": 816216064 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003801404212637914, + "loss": 2.8176, + "theoretical_loss": 3.7217314638312864, + "tokens_seen": 816281600 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038013039117352056, + "loss": 2.4909, + "theoretical_loss": 3.7217019527487203, + "tokens_seen": 816347136 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038012036108324974, + "loss": 2.7071, + "theoretical_loss": 3.7216724446984943, + "tokens_seen": 816412672 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003801103309929789, + "loss": 2.6279, + "theoretical_loss": 3.7216429396800526, + "tokens_seen": 816478208 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038010030090270816, + "loss": 2.7475, + "theoretical_loss": 3.721613437692841, + "tokens_seen": 816543744 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003800902708124373, + "loss": 2.8536, + "theoretical_loss": 3.7215839387363046, + "tokens_seen": 816609280 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003800802407221665, + "loss": 2.5333, + "theoretical_loss": 3.7215544428098895, + "tokens_seen": 816674816 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003800702106318957, + "loss": 2.5012, + "theoretical_loss": 3.7215249499130403, + "tokens_seen": 816740352 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003800601805416249, + "loss": 2.8211, + "theoretical_loss": 3.721495460045203, + "tokens_seen": 816805888 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038005015045135406, + "loss": 2.6342, + "theoretical_loss": 3.7214659732058246, + "tokens_seen": 816871424 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038004012036108324, + "loss": 2.7294, + "theoretical_loss": 3.7214364893943497, + "tokens_seen": 816936960 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003800300902708124, + "loss": 2.7032, + "theoretical_loss": 3.721407008610225, + "tokens_seen": 817002496 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038002006018054166, + "loss": 2.947, + "theoretical_loss": 3.7213775308528976, + "tokens_seen": 817068032 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003800100300902708, + "loss": 2.8428, + "theoretical_loss": 3.721348056121813, + "tokens_seen": 817133568 + }, + { + "epoch": 2.07, + "learning_rate": 0.00038, + "loss": 2.6575, + "theoretical_loss": 3.7213185844164185, + "tokens_seen": 817199104 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037998996990972915, + "loss": 2.7767, + "theoretical_loss": 3.7212891157361607, + "tokens_seen": 817264640 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003799799398194584, + "loss": 2.8218, + "theoretical_loss": 3.721259650080486, + "tokens_seen": 817330176 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037996990972918756, + "loss": 2.652, + "theoretical_loss": 3.7212301874488425, + "tokens_seen": 817395712 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037995987963891675, + "loss": 2.7455, + "theoretical_loss": 3.7212007278406767, + "tokens_seen": 817461248 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003799498495486459, + "loss": 2.5604, + "theoretical_loss": 3.721171271255436, + "tokens_seen": 817526784 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 928048, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.899970769882202, + "objective/train/theoretical_loss": 3.7211565440962397, + "objective/train/tokens_used": 838019552, + "theoretical_loss": 3.7211565440962397, + "tokens_seen": 817559552 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037993981945837516, + "loss": 2.7355, + "theoretical_loss": 3.721141817692568, + "tokens_seen": 817592320 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003799297893681043, + "loss": 2.7331, + "theoretical_loss": 3.7211123671515214, + "tokens_seen": 817657856 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003799197592778335, + "loss": 2.8934, + "theoretical_loss": 3.7210829196317423, + "tokens_seen": 817723392 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037990972918756265, + "loss": 2.9739, + "theoretical_loss": 3.721053475132679, + "tokens_seen": 817788928 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003798996990972919, + "loss": 2.5838, + "theoretical_loss": 3.7210240336537805, + "tokens_seen": 817854464 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037988966900702107, + "loss": 2.8182, + "theoretical_loss": 3.7209945951944943, + "tokens_seen": 817920000 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037987963891675025, + "loss": 2.7168, + "theoretical_loss": 3.7209651597542694, + "tokens_seen": 817985536 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003798696088264795, + "loss": 2.8628, + "theoretical_loss": 3.7209357273325536, + "tokens_seen": 818051072 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003798595787362086, + "loss": 2.9665, + "theoretical_loss": 3.7209062979287966, + "tokens_seen": 818116608 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037984954864593785, + "loss": 2.6236, + "theoretical_loss": 3.720876871542446, + "tokens_seen": 818182144 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037983951855566703, + "loss": 2.7174, + "theoretical_loss": 3.720847448172951, + "tokens_seen": 818247680 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003798294884653962, + "loss": 2.6124, + "theoretical_loss": 3.720818027819762, + "tokens_seen": 818313216 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003798194583751254, + "loss": 3.0636, + "theoretical_loss": 3.7207886104823267, + "tokens_seen": 818378752 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037980942828485457, + "loss": 2.8202, + "theoretical_loss": 3.7207591961600954, + "tokens_seen": 818444288 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037979939819458375, + "loss": 2.7901, + "theoretical_loss": 3.720729784852517, + "tokens_seen": 818509824 + }, + { + "epoch": 2.07, + "learning_rate": 0.000379789368104313, + "loss": 2.6593, + "theoretical_loss": 3.720700376559042, + "tokens_seen": 818575360 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003797793380140421, + "loss": 2.7329, + "theoretical_loss": 3.7206709712791195, + "tokens_seen": 818640896 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037976930792377135, + "loss": 2.5465, + "theoretical_loss": 3.7206415690121997, + "tokens_seen": 818706432 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037975927783350053, + "loss": 2.8261, + "theoretical_loss": 3.7206121697577332, + "tokens_seen": 818771968 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003797492477432297, + "loss": 2.6284, + "theoretical_loss": 3.7205827735151695, + "tokens_seen": 818837504 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003797392176529589, + "loss": 2.6674, + "theoretical_loss": 3.7205533802839597, + "tokens_seen": 818903040 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003797291875626881, + "loss": 2.696, + "theoretical_loss": 3.720523990063554, + "tokens_seen": 818968576 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037971915747241725, + "loss": 2.8248, + "theoretical_loss": 3.720494602853403, + "tokens_seen": 819034112 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003797091273821465, + "loss": 2.8485, + "theoretical_loss": 3.7204652186529574, + "tokens_seen": 819099648 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003796990972918756, + "loss": 2.8094, + "theoretical_loss": 3.720435837461669, + "tokens_seen": 819165184 + }, + { + "debugging/Self-BLEU-5": 0.5761146013011956, + "debugging/distinct-1-grams": 0.7462239175050402, + "debugging/distinct-2-grams": 0.9529290102306198, + "debugging/entropy-1-grams": 6.250000994807308, + "debugging/entropy-2-grams": 7.4283795811873965, + "debugging/length": 523.9565217391304, + "debugging/num_segments": 23, + "debugging/score": 0.003311065605696948, + "debugging/score_std": 0.0050287850561047285, + "epoch": 2.07, + "objective/train/docs_used": 929355, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8574650287628174, + "objective/train/theoretical_loss": 3.720421147994287, + "objective/train/tokens_used": 839657952, + "theoretical_loss": 3.720421147994287, + "tokens_seen": 819197952 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037968906720160485, + "loss": 2.6181, + "theoretical_loss": 3.720406459278988, + "tokens_seen": 819230720 + }, + { + "epoch": 2.07, + "learning_rate": 0.000379679037111334, + "loss": 2.6622, + "theoretical_loss": 3.720377084104366, + "tokens_seen": 819296256 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003796690070210632, + "loss": 2.7908, + "theoretical_loss": 3.720347711937255, + "tokens_seen": 819361792 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003796589769307924, + "loss": 2.7659, + "theoretical_loss": 3.720318342777106, + "tokens_seen": 819427328 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003796489468405216, + "loss": 2.5301, + "theoretical_loss": 3.7202889766233707, + "tokens_seen": 819492864 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037963891675025076, + "loss": 2.7743, + "theoretical_loss": 3.720259613475501, + "tokens_seen": 819558400 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037962888665997994, + "loss": 2.6859, + "theoretical_loss": 3.720230253332949, + "tokens_seen": 819623936 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003796188565697091, + "loss": 2.9187, + "theoretical_loss": 3.7202008961951667, + "tokens_seen": 819689472 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037960882647943836, + "loss": 2.6287, + "theoretical_loss": 3.720171542061607, + "tokens_seen": 819755008 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003795987963891675, + "loss": 2.8665, + "theoretical_loss": 3.720142190931721, + "tokens_seen": 819820544 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003795887662988967, + "loss": 2.4396, + "theoretical_loss": 3.720112842804962, + "tokens_seen": 819886080 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003795787362086259, + "loss": 2.5603, + "theoretical_loss": 3.7200834976807835, + "tokens_seen": 819951616 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003795687061183551, + "loss": 2.7133, + "theoretical_loss": 3.7200541555586373, + "tokens_seen": 820017152 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037955867602808426, + "loss": 2.7159, + "theoretical_loss": 3.720024816437977, + "tokens_seen": 820082688 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037954864593781344, + "loss": 2.5554, + "theoretical_loss": 3.7199954803182553, + "tokens_seen": 820148224 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003795386158475426, + "loss": 2.7215, + "theoretical_loss": 3.7199661471989254, + "tokens_seen": 820213760 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037952858575727186, + "loss": 2.7678, + "theoretical_loss": 3.7199368170794416, + "tokens_seen": 820279296 + }, + { + "epoch": 2.07, + "learning_rate": 0.000379518555667001, + "loss": 2.7882, + "theoretical_loss": 3.7199074899592564, + "tokens_seen": 820344832 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003795085255767302, + "loss": 2.8856, + "theoretical_loss": 3.7198781658378244, + "tokens_seen": 820410368 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037949849548645935, + "loss": 2.8051, + "theoretical_loss": 3.719848844714599, + "tokens_seen": 820475904 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003794884653961886, + "loss": 2.5591, + "theoretical_loss": 3.719819526589034, + "tokens_seen": 820541440 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037947843530591776, + "loss": 2.6928, + "theoretical_loss": 3.719790211460584, + "tokens_seen": 820606976 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037946840521564695, + "loss": 2.7898, + "theoretical_loss": 3.719760899328703, + "tokens_seen": 820672512 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003794583751253761, + "loss": 2.6418, + "theoretical_loss": 3.719731590192846, + "tokens_seen": 820738048 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037944834503510536, + "loss": 2.6323, + "theoretical_loss": 3.7197022840524667, + "tokens_seen": 820803584 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 929749, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.016310930252075, + "objective/train/theoretical_loss": 3.719687632105411, + "objective/train/tokens_used": 841296352, + "theoretical_loss": 3.719687632105411, + "tokens_seen": 820836352 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003794383149448345, + "loss": 2.5313, + "theoretical_loss": 3.7196729809070206, + "tokens_seen": 820869120 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003794282848545637, + "loss": 2.6753, + "theoretical_loss": 3.7196436807559623, + "tokens_seen": 820934656 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037941825476429285, + "loss": 2.6919, + "theoretical_loss": 3.719614383598746, + "tokens_seen": 821000192 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003794082246740221, + "loss": 3.0039, + "theoretical_loss": 3.719585089434828, + "tokens_seen": 821065728 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037939819458375127, + "loss": 2.8825, + "theoretical_loss": 3.719555798263663, + "tokens_seen": 821131264 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037938816449348045, + "loss": 2.9654, + "theoretical_loss": 3.719526510084707, + "tokens_seen": 821196800 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037937813440320963, + "loss": 2.8574, + "theoretical_loss": 3.719497224897415, + "tokens_seen": 821262336 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003793681043129388, + "loss": 2.7252, + "theoretical_loss": 3.719467942701243, + "tokens_seen": 821327872 + }, + { + "epoch": 2.07, + "learning_rate": 0.000379358074222668, + "loss": 2.876, + "theoretical_loss": 3.7194386634956462, + "tokens_seen": 821393408 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037934804413239723, + "loss": 2.9689, + "theoretical_loss": 3.7194093872800815, + "tokens_seen": 821458944 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037933801404212635, + "loss": 2.6865, + "theoretical_loss": 3.719380114054005, + "tokens_seen": 821524480 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003793279839518556, + "loss": 2.6635, + "theoretical_loss": 3.7193508438168728, + "tokens_seen": 821590016 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003793179538615847, + "loss": 2.9359, + "theoretical_loss": 3.7193215765681407, + "tokens_seen": 821655552 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037930792377131395, + "loss": 2.6683, + "theoretical_loss": 3.7192923123072656, + "tokens_seen": 821721088 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037929789368104313, + "loss": 2.7919, + "theoretical_loss": 3.719263051033705, + "tokens_seen": 821786624 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003792878635907723, + "loss": 2.549, + "theoretical_loss": 3.7192337927469143, + "tokens_seen": 821852160 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003792778335005015, + "loss": 2.543, + "theoretical_loss": 3.7192045374463527, + "tokens_seen": 821917696 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037926780341023073, + "loss": 2.7604, + "theoretical_loss": 3.7191752851314748, + "tokens_seen": 821983232 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037925777331995986, + "loss": 2.9516, + "theoretical_loss": 3.7191460358017396, + "tokens_seen": 822048768 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003792477432296891, + "loss": 2.7581, + "theoretical_loss": 3.719116789456604, + "tokens_seen": 822114304 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003792377131394182, + "loss": 2.814, + "theoretical_loss": 3.7190875460955257, + "tokens_seen": 822179840 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037922768304914746, + "loss": 2.5764, + "theoretical_loss": 3.719058305717962, + "tokens_seen": 822245376 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037921765295887664, + "loss": 2.7264, + "theoretical_loss": 3.7190290683233713, + "tokens_seen": 822310912 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003792076228686058, + "loss": 2.7514, + "theoretical_loss": 3.7189998339112114, + "tokens_seen": 822376448 + }, + { + "epoch": 2.07, + "learning_rate": 0.000379197592778335, + "loss": 2.6486, + "theoretical_loss": 3.7189706024809404, + "tokens_seen": 822441984 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 930378, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.722386360168457, + "objective/train/theoretical_loss": 3.7189559878838434, + "objective/train/tokens_used": 842934752, + "theoretical_loss": 3.7189559878838434, + "tokens_seen": 822474752 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003791875626880642, + "loss": 2.8354, + "theoretical_loss": 3.718941374032016, + "tokens_seen": 822507520 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037917753259779336, + "loss": 3.0609, + "theoretical_loss": 3.7189121485638976, + "tokens_seen": 822573056 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003791675025075226, + "loss": 2.7041, + "theoretical_loss": 3.718882926076043, + "tokens_seen": 822638592 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003791574724172517, + "loss": 2.865, + "theoretical_loss": 3.7188537065679115, + "tokens_seen": 822704128 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037914744232698096, + "loss": 3.0404, + "theoretical_loss": 3.7188244900389615, + "tokens_seen": 822769664 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003791374122367101, + "loss": 2.7809, + "theoretical_loss": 3.718795276488652, + "tokens_seen": 822835200 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003791273821464393, + "loss": 2.6324, + "theoretical_loss": 3.7187660659164425, + "tokens_seen": 822900736 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037911735205616856, + "loss": 3.104, + "theoretical_loss": 3.718736858321792, + "tokens_seen": 822966272 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003791073219658977, + "loss": 2.9184, + "theoretical_loss": 3.7187076537041595, + "tokens_seen": 823031808 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003790972918756269, + "loss": 2.9075, + "theoretical_loss": 3.7186784520630054, + "tokens_seen": 823097344 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003790872617853561, + "loss": 2.9344, + "theoretical_loss": 3.718649253397789, + "tokens_seen": 823162880 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003790772316950853, + "loss": 2.6255, + "theoretical_loss": 3.7186200577079704, + "tokens_seen": 823228416 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037906720160481446, + "loss": 2.7619, + "theoretical_loss": 3.7185908649930086, + "tokens_seen": 823293952 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037905717151454364, + "loss": 2.7724, + "theoretical_loss": 3.7185616752523645, + "tokens_seen": 823359488 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003790471414242728, + "loss": 2.7481, + "theoretical_loss": 3.7185324884854984, + "tokens_seen": 823425024 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037903711133400206, + "loss": 2.8423, + "theoretical_loss": 3.718503304691871, + "tokens_seen": 823490560 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003790270812437312, + "loss": 2.6441, + "theoretical_loss": 3.7184741238709416, + "tokens_seen": 823556096 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003790170511534604, + "loss": 2.6508, + "theoretical_loss": 3.718444946022172, + "tokens_seen": 823621632 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037900702106318955, + "loss": 2.8063, + "theoretical_loss": 3.7184157711450228, + "tokens_seen": 823687168 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003789969909729188, + "loss": 2.9624, + "theoretical_loss": 3.718386599238954, + "tokens_seen": 823752704 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037898696088264796, + "loss": 2.7201, + "theoretical_loss": 3.7183574303034286, + "tokens_seen": 823818240 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037897693079237715, + "loss": 2.7259, + "theoretical_loss": 3.718328264337906, + "tokens_seen": 823883776 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003789669007021063, + "loss": 2.824, + "theoretical_loss": 3.7182991013418483, + "tokens_seen": 823949312 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037895687061183556, + "loss": 2.9522, + "theoretical_loss": 3.718269941314717, + "tokens_seen": 824014848 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003789468405215647, + "loss": 2.7737, + "theoretical_loss": 3.7182407842559746, + "tokens_seen": 824080384 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 931616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7472262382507324, + "objective/train/theoretical_loss": 3.71822620683958, + "objective/train/tokens_used": 844573152, + "theoretical_loss": 3.71822620683958, + "tokens_seen": 824113152 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003789368104312939, + "loss": 2.5734, + "theoretical_loss": 3.718211630165081, + "tokens_seen": 824145920 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037892678034102305, + "loss": 2.7369, + "theoretical_loss": 3.7181824790415003, + "tokens_seen": 824211456 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003789167502507523, + "loss": 2.887, + "theoretical_loss": 3.718153330884693, + "tokens_seen": 824276992 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037890672016048147, + "loss": 2.9061, + "theoretical_loss": 3.718124185694122, + "tokens_seen": 824342528 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037889669007021065, + "loss": 3.0517, + "theoretical_loss": 3.7180950434692495, + "tokens_seen": 824408064 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037888665997993983, + "loss": 2.7274, + "theoretical_loss": 3.7180659042095376, + "tokens_seen": 824473600 + }, + { + "epoch": 2.07, + "learning_rate": 0.000378876629889669, + "loss": 2.8292, + "theoretical_loss": 3.7180367679144495, + "tokens_seen": 824539136 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003788665997993982, + "loss": 2.8566, + "theoretical_loss": 3.718007634583448, + "tokens_seen": 824604672 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037885656970912743, + "loss": 2.6371, + "theoretical_loss": 3.7179785042159956, + "tokens_seen": 824670208 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037884653961885655, + "loss": 2.8622, + "theoretical_loss": 3.717949376811556, + "tokens_seen": 824735744 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003788365095285858, + "loss": 2.69, + "theoretical_loss": 3.7179202523695913, + "tokens_seen": 824801280 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003788264794383149, + "loss": 2.7501, + "theoretical_loss": 3.7178911308895666, + "tokens_seen": 824866816 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037881644934804415, + "loss": 2.8124, + "theoretical_loss": 3.7178620123709436, + "tokens_seen": 824932352 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037880641925777333, + "loss": 2.9955, + "theoretical_loss": 3.717832896813187, + "tokens_seen": 824997888 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003787963891675025, + "loss": 2.8538, + "theoretical_loss": 3.71780378421576, + "tokens_seen": 825063424 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003787863590772317, + "loss": 2.6721, + "theoretical_loss": 3.7177746745781266, + "tokens_seen": 825128960 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037877632898696093, + "loss": 3.0316, + "theoretical_loss": 3.717745567899751, + "tokens_seen": 825194496 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037876629889669006, + "loss": 2.7912, + "theoretical_loss": 3.7177164641800973, + "tokens_seen": 825260032 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003787562688064193, + "loss": 2.8639, + "theoretical_loss": 3.71768736341863, + "tokens_seen": 825325568 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003787462387161484, + "loss": 2.6579, + "theoretical_loss": 3.717658265614814, + "tokens_seen": 825391104 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037873620862587766, + "loss": 2.9474, + "theoretical_loss": 3.717629170768112, + "tokens_seen": 825456640 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037872617853560684, + "loss": 2.6777, + "theoretical_loss": 3.7176000788779913, + "tokens_seen": 825522176 + }, + { + "epoch": 2.07, + "learning_rate": 0.000378716148445336, + "loss": 2.766, + "theoretical_loss": 3.7175709899439147, + "tokens_seen": 825587712 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003787061183550652, + "loss": 2.6364, + "theoretical_loss": 3.7175419039653486, + "tokens_seen": 825653248 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003786960882647944, + "loss": 3.0823, + "theoretical_loss": 3.7175128209417574, + "tokens_seen": 825718784 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 932123, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9297609329223633, + "objective/train/theoretical_loss": 3.71749828053791, + "objective/train/tokens_used": 846211552, + "theoretical_loss": 3.71749828053791, + "tokens_seen": 825751552 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037868605817452356, + "loss": 3.0315, + "theoretical_loss": 3.7174837408726065, + "tokens_seen": 825784320 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003786760280842528, + "loss": 2.9329, + "theoretical_loss": 3.717454663757361, + "tokens_seen": 825849856 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003786659979939819, + "loss": 2.8325, + "theoretical_loss": 3.7174255895954875, + "tokens_seen": 825915392 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037865596790371116, + "loss": 2.9202, + "theoretical_loss": 3.717396518386451, + "tokens_seen": 825980928 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003786459378134403, + "loss": 2.709, + "theoretical_loss": 3.717367450129718, + "tokens_seen": 826046464 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003786359077231695, + "loss": 2.7433, + "theoretical_loss": 3.7173383848247528, + "tokens_seen": 826112000 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003786258776328987, + "loss": 2.7836, + "theoretical_loss": 3.7173093224710234, + "tokens_seen": 826177536 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003786158475426279, + "loss": 3.0468, + "theoretical_loss": 3.717280263067995, + "tokens_seen": 826243072 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037860581745235706, + "loss": 3.0001, + "theoretical_loss": 3.7172512066151344, + "tokens_seen": 826308608 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003785957873620863, + "loss": 2.9357, + "theoretical_loss": 3.7172221531119085, + "tokens_seen": 826374144 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003785857572718154, + "loss": 2.5982, + "theoretical_loss": 3.7171931025577836, + "tokens_seen": 826439680 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037857572718154466, + "loss": 2.8929, + "theoretical_loss": 3.7171640549522262, + "tokens_seen": 826505216 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003785656970912738, + "loss": 2.8073, + "theoretical_loss": 3.717135010294703, + "tokens_seen": 826570752 + }, + { + "epoch": 2.07, + "learning_rate": 0.000378555667001003, + "loss": 2.7072, + "theoretical_loss": 3.717105968584683, + "tokens_seen": 826636288 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003785456369107322, + "loss": 2.9043, + "theoretical_loss": 3.717076929821631, + "tokens_seen": 826701824 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003785356068204614, + "loss": 3.0974, + "theoretical_loss": 3.717047894005016, + "tokens_seen": 826767360 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037852557673019057, + "loss": 2.5399, + "theoretical_loss": 3.7170188611343047, + "tokens_seen": 826832896 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037851554663991975, + "loss": 3.0706, + "theoretical_loss": 3.7169898312089655, + "tokens_seen": 826898432 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037850551654964893, + "loss": 3.1045, + "theoretical_loss": 3.7169608042284654, + "tokens_seen": 826963968 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037849548645937816, + "loss": 2.8803, + "theoretical_loss": 3.716931780192273, + "tokens_seen": 827029504 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003784854563691073, + "loss": 2.7085, + "theoretical_loss": 3.7169027590998556, + "tokens_seen": 827095040 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003784754262788365, + "loss": 2.8001, + "theoretical_loss": 3.7168737409506822, + "tokens_seen": 827160576 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037846539618856565, + "loss": 3.1734, + "theoretical_loss": 3.7168447257442208, + "tokens_seen": 827226112 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003784553660982949, + "loss": 2.9276, + "theoretical_loss": 3.7168157134799396, + "tokens_seen": 827291648 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037844533600802407, + "loss": 2.7264, + "theoretical_loss": 3.716786704157308, + "tokens_seen": 827357184 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 933422, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9304966926574707, + "objective/train/theoretical_loss": 3.7167722005989443, + "objective/train/tokens_used": 847849952, + "theoretical_loss": 3.7167722005989443, + "tokens_seen": 827389952 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037843530591775325, + "loss": 2.806, + "theoretical_loss": 3.716757697775794, + "tokens_seen": 827422720 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037842527582748243, + "loss": 2.8655, + "theoretical_loss": 3.7167286943348667, + "tokens_seen": 827488256 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037841524573721167, + "loss": 2.9854, + "theoretical_loss": 3.7166996938339953, + "tokens_seen": 827553792 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003784052156469408, + "loss": 2.8557, + "theoretical_loss": 3.716670696272649, + "tokens_seen": 827619328 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037839518555667003, + "loss": 2.8762, + "theoretical_loss": 3.7166417016502966, + "tokens_seen": 827684864 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037838515546639916, + "loss": 2.9352, + "theoretical_loss": 3.7166127099664084, + "tokens_seen": 827750400 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003783751253761284, + "loss": 2.6815, + "theoretical_loss": 3.716583721220453, + "tokens_seen": 827815936 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037836509528585763, + "loss": 2.6254, + "theoretical_loss": 3.7165547354119015, + "tokens_seen": 827881472 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037835506519558675, + "loss": 2.8917, + "theoretical_loss": 3.716525752540222, + "tokens_seen": 827947008 + }, + { + "epoch": 2.07, + "learning_rate": 0.000378345035105316, + "loss": 3.027, + "theoretical_loss": 3.716496772604886, + "tokens_seen": 828012544 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003783350050150451, + "loss": 2.8621, + "theoretical_loss": 3.7164677956053627, + "tokens_seen": 828078080 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037832497492477435, + "loss": 3.0463, + "theoretical_loss": 3.7164388215411233, + "tokens_seen": 828143616 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037831494483450353, + "loss": 2.7848, + "theoretical_loss": 3.7164098504116367, + "tokens_seen": 828209152 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003783049147442327, + "loss": 2.8613, + "theoretical_loss": 3.7163808822163755, + "tokens_seen": 828274688 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003782948846539619, + "loss": 2.9271, + "theoretical_loss": 3.7163519169548085, + "tokens_seen": 828340224 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037828485456369113, + "loss": 2.6843, + "theoretical_loss": 3.7163229546264076, + "tokens_seen": 828405760 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037827482447342026, + "loss": 2.8605, + "theoretical_loss": 3.7162939952306435, + "tokens_seen": 828471296 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003782647943831495, + "loss": 2.9175, + "theoretical_loss": 3.7162650387669873, + "tokens_seen": 828536832 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003782547642928786, + "loss": 2.7707, + "theoretical_loss": 3.71623608523491, + "tokens_seen": 828602368 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037824473420260786, + "loss": 2.676, + "theoretical_loss": 3.7162071346338834, + "tokens_seen": 828667904 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037823470411233704, + "loss": 2.845, + "theoretical_loss": 3.7161781869633788, + "tokens_seen": 828733440 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003782246740220662, + "loss": 2.9766, + "theoretical_loss": 3.7161492422228677, + "tokens_seen": 828798976 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003782146439317954, + "loss": 2.9741, + "theoretical_loss": 3.7161203004118217, + "tokens_seen": 828864512 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003782046138415246, + "loss": 2.9232, + "theoretical_loss": 3.716091361529714, + "tokens_seen": 828930048 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037819458375125376, + "loss": 2.7427, + "theoretical_loss": 3.716062425576015, + "tokens_seen": 828995584 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 934816, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1100308895111084, + "objective/train/theoretical_loss": 3.716047958697154, + "objective/train/tokens_used": 849488352, + "theoretical_loss": 3.716047958697154, + "tokens_seen": 829028352 + }, + { + "epoch": 2.07, + "learning_rate": 0.000378184553660983, + "loss": 3.0321, + "theoretical_loss": 3.7160334925501974, + "tokens_seen": 829061120 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003781745235707121, + "loss": 2.8202, + "theoretical_loss": 3.7160045624517344, + "tokens_seen": 829126656 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037816449348044136, + "loss": 2.9412, + "theoretical_loss": 3.7159756352800977, + "tokens_seen": 829192192 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003781544633901705, + "loss": 2.7031, + "theoretical_loss": 3.7159467110347597, + "tokens_seen": 829257728 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003781444332998997, + "loss": 3.0871, + "theoretical_loss": 3.715917789715194, + "tokens_seen": 829323264 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003781344032096289, + "loss": 2.8528, + "theoretical_loss": 3.715888871320872, + "tokens_seen": 829388800 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003781243731193581, + "loss": 2.8641, + "theoretical_loss": 3.7158599558512684, + "tokens_seen": 829454336 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037811434302908726, + "loss": 3.02, + "theoretical_loss": 3.7158310433058555, + "tokens_seen": 829519872 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003781043129388165, + "loss": 2.925, + "theoretical_loss": 3.7158021336841065, + "tokens_seen": 829585408 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003780942828485456, + "loss": 2.9142, + "theoretical_loss": 3.715773226985495, + "tokens_seen": 829650944 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037808425275827486, + "loss": 2.7949, + "theoretical_loss": 3.7157443232094947, + "tokens_seen": 829716480 + }, + { + "epoch": 2.07, + "learning_rate": 0.000378074222668004, + "loss": 2.8045, + "theoretical_loss": 3.7157154223555793, + "tokens_seen": 829782016 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003780641925777332, + "loss": 3.2325, + "theoretical_loss": 3.7156865244232224, + "tokens_seen": 829847552 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003780541624874624, + "loss": 2.7547, + "theoretical_loss": 3.7156576294118984, + "tokens_seen": 829913088 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003780441323971916, + "loss": 2.9053, + "theoretical_loss": 3.715628737321081, + "tokens_seen": 829978624 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037803410230692077, + "loss": 2.928, + "theoretical_loss": 3.7155998481502444, + "tokens_seen": 830044160 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037802407221664995, + "loss": 2.8699, + "theoretical_loss": 3.715570961898863, + "tokens_seen": 830109696 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037801404212637913, + "loss": 2.8143, + "theoretical_loss": 3.7155420785664113, + "tokens_seen": 830175232 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037800401203610836, + "loss": 2.7866, + "theoretical_loss": 3.7155131981523644, + "tokens_seen": 830240768 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003779939819458375, + "loss": 2.6816, + "theoretical_loss": 3.7154843206561967, + "tokens_seen": 830306304 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037798395185556673, + "loss": 2.9595, + "theoretical_loss": 3.7154554460773834, + "tokens_seen": 830371840 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037797392176529585, + "loss": 2.9089, + "theoretical_loss": 3.7154265744153996, + "tokens_seen": 830437376 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003779638916750251, + "loss": 2.8497, + "theoretical_loss": 3.71539770566972, + "tokens_seen": 830502912 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037795386158475427, + "loss": 2.9463, + "theoretical_loss": 3.7153688398398197, + "tokens_seen": 830568448 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037794383149448345, + "loss": 2.688, + "theoretical_loss": 3.715339976925175, + "tokens_seen": 830633984 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 935526, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.734133005142212, + "objective/train/theoretical_loss": 3.71532554656091, + "objective/train/tokens_used": 851126752, + "theoretical_loss": 3.71532554656091, + "tokens_seen": 830666752 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037793380140421263, + "loss": 2.9231, + "theoretical_loss": 3.7153111169252613, + "tokens_seen": 830699520 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037792377131394187, + "loss": 2.7234, + "theoretical_loss": 3.7152822598395545, + "tokens_seen": 830765056 + }, + { + "epoch": 2.07, + "learning_rate": 0.000377913741223671, + "loss": 2.9191, + "theoretical_loss": 3.7152534056675295, + "tokens_seen": 830830592 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037790371113340023, + "loss": 2.7125, + "theoretical_loss": 3.715224554408664, + "tokens_seen": 830896128 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037789368104312936, + "loss": 2.7919, + "theoretical_loss": 3.715195706062432, + "tokens_seen": 830961664 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003778836509528586, + "loss": 2.8418, + "theoretical_loss": 3.7151668606283117, + "tokens_seen": 831027200 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003778736208625878, + "loss": 2.7902, + "theoretical_loss": 3.7151380181057783, + "tokens_seen": 831092736 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037786359077231695, + "loss": 2.7545, + "theoretical_loss": 3.715109178494309, + "tokens_seen": 831158272 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037785356068204614, + "loss": 2.7125, + "theoretical_loss": 3.7150803417933798, + "tokens_seen": 831223808 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003778435305917753, + "loss": 2.9552, + "theoretical_loss": 3.7150515080024684, + "tokens_seen": 831289344 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003778335005015045, + "loss": 2.764, + "theoretical_loss": 3.715022677121052, + "tokens_seen": 831354880 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037782347041123373, + "loss": 3.0012, + "theoretical_loss": 3.7149938491486063, + "tokens_seen": 831420416 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037781344032096286, + "loss": 2.675, + "theoretical_loss": 3.7149650240846093, + "tokens_seen": 831485952 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003778034102306921, + "loss": 2.7746, + "theoretical_loss": 3.7149362019285386, + "tokens_seen": 831551488 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003777933801404213, + "loss": 2.5407, + "theoretical_loss": 3.714907382679871, + "tokens_seen": 831617024 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037778335005015046, + "loss": 2.8533, + "theoretical_loss": 3.7148785663380854, + "tokens_seen": 831682560 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037777331995987964, + "loss": 2.7901, + "theoretical_loss": 3.7148497529026585, + "tokens_seen": 831748096 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003777632898696088, + "loss": 2.8641, + "theoretical_loss": 3.714820942373068, + "tokens_seen": 831813632 + }, + { + "epoch": 2.07, + "learning_rate": 0.000377753259779338, + "loss": 2.8166, + "theoretical_loss": 3.7147921347487927, + "tokens_seen": 831879168 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037774322968906724, + "loss": 2.9293, + "theoretical_loss": 3.71476333002931, + "tokens_seen": 831944704 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037773319959879636, + "loss": 2.9647, + "theoretical_loss": 3.714734528214099, + "tokens_seen": 832010240 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003777231695085256, + "loss": 2.8858, + "theoretical_loss": 3.714705729302638, + "tokens_seen": 832075776 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003777131394182547, + "loss": 2.9028, + "theoretical_loss": 3.714676933294405, + "tokens_seen": 832141312 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037770310932798396, + "loss": 2.7915, + "theoretical_loss": 3.71464814018888, + "tokens_seen": 832206848 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037769307923771314, + "loss": 2.9318, + "theoretical_loss": 3.71461934998554, + "tokens_seen": 832272384 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 936975, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9853835105895996, + "objective/train/theoretical_loss": 3.7146049559720273, + "objective/train/tokens_used": 852765152, + "theoretical_loss": 3.7146049559720273, + "tokens_seen": 832305152 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003776830491474423, + "loss": 2.9336, + "theoretical_loss": 3.7145905626838656, + "tokens_seen": 832337920 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003776730190571715, + "loss": 2.6135, + "theoretical_loss": 3.714561778283335, + "tokens_seen": 832403456 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003776629889669007, + "loss": 2.6917, + "theoretical_loss": 3.7145329967834275, + "tokens_seen": 832468992 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037765295887662987, + "loss": 2.6006, + "theoretical_loss": 3.714504218183623, + "tokens_seen": 832534528 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003776429287863591, + "loss": 2.6902, + "theoretical_loss": 3.7144754424834003, + "tokens_seen": 832600064 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037763289869608823, + "loss": 2.7044, + "theoretical_loss": 3.71444666968224, + "tokens_seen": 832665600 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037762286860581746, + "loss": 2.6894, + "theoretical_loss": 3.714417899779621, + "tokens_seen": 832731136 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003776128385155467, + "loss": 2.7718, + "theoretical_loss": 3.7143891327750236, + "tokens_seen": 832796672 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003776028084252758, + "loss": 2.8134, + "theoretical_loss": 3.7143603686679283, + "tokens_seen": 832862208 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037759277833500506, + "loss": 2.5345, + "theoretical_loss": 3.7143316074578143, + "tokens_seen": 832927744 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003775827482447342, + "loss": 2.7188, + "theoretical_loss": 3.7143028491441625, + "tokens_seen": 832993280 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003775727181544634, + "loss": 2.8647, + "theoretical_loss": 3.7142740937264533, + "tokens_seen": 833058816 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003775626880641926, + "loss": 2.874, + "theoretical_loss": 3.7142453412041676, + "tokens_seen": 833124352 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003775526579739218, + "loss": 2.9832, + "theoretical_loss": 3.7142165915767853, + "tokens_seen": 833189888 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037754262788365097, + "loss": 2.7881, + "theoretical_loss": 3.7141878448437886, + "tokens_seen": 833255424 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037753259779338015, + "loss": 2.7939, + "theoretical_loss": 3.7141591010046575, + "tokens_seen": 833320960 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037752256770310933, + "loss": 2.9099, + "theoretical_loss": 3.714130360058873, + "tokens_seen": 833386496 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037751253761283857, + "loss": 2.9208, + "theoretical_loss": 3.7141016220059164, + "tokens_seen": 833452032 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003775025075225677, + "loss": 2.8116, + "theoretical_loss": 3.71407288684527, + "tokens_seen": 833517568 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037749247743229693, + "loss": 2.8814, + "theoretical_loss": 3.7140441545764142, + "tokens_seen": 833583104 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037748244734202605, + "loss": 2.5721, + "theoretical_loss": 3.7140154251988315, + "tokens_seen": 833648640 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003774724172517553, + "loss": 2.5953, + "theoretical_loss": 3.713986698712003, + "tokens_seen": 833714176 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037746238716148447, + "loss": 2.6858, + "theoretical_loss": 3.7139579751154117, + "tokens_seen": 833779712 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037745235707121365, + "loss": 2.8436, + "theoretical_loss": 3.7139292544085385, + "tokens_seen": 833845248 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037744232698094283, + "loss": 2.9229, + "theoretical_loss": 3.713900536590866, + "tokens_seen": 833910784 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 937728, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8705952167510986, + "objective/train/theoretical_loss": 3.7138861787653186, + "objective/train/tokens_used": 854403552, + "theoretical_loss": 3.7138861787653186, + "tokens_seen": 833943552 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037743229689067207, + "loss": 2.8177, + "theoretical_loss": 3.713871821661877, + "tokens_seen": 833976320 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003774222668004012, + "loss": 2.7473, + "theoretical_loss": 3.7138431096210534, + "tokens_seen": 834041856 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037741223671013043, + "loss": 2.7667, + "theoretical_loss": 3.7138144004678777, + "tokens_seen": 834107392 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037740220661985956, + "loss": 2.8914, + "theoretical_loss": 3.7137856942018335, + "tokens_seen": 834172928 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003773921765295888, + "loss": 2.9923, + "theoretical_loss": 3.7137569908224024, + "tokens_seen": 834238464 + }, + { + "epoch": 2.07, + "learning_rate": 0.000377382146439318, + "loss": 2.697, + "theoretical_loss": 3.713728290329068, + "tokens_seen": 834304000 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037737211634904715, + "loss": 2.9992, + "theoretical_loss": 3.7136995927213143, + "tokens_seen": 834369536 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037736208625877634, + "loss": 2.7865, + "theoretical_loss": 3.7136708979986235, + "tokens_seen": 834435072 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003773520561685055, + "loss": 2.7856, + "theoretical_loss": 3.713642206160479, + "tokens_seen": 834500608 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003773420260782347, + "loss": 2.9712, + "theoretical_loss": 3.7136135172063645, + "tokens_seen": 834566144 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037733199598796393, + "loss": 2.6482, + "theoretical_loss": 3.7135848311357638, + "tokens_seen": 834631680 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037732196589769306, + "loss": 2.7742, + "theoretical_loss": 3.713556147948161, + "tokens_seen": 834697216 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003773119358074223, + "loss": 2.7187, + "theoretical_loss": 3.7135274676430394, + "tokens_seen": 834762752 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003773019057171515, + "loss": 2.7803, + "theoretical_loss": 3.7134987902198833, + "tokens_seen": 834828288 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037729187562688066, + "loss": 2.8031, + "theoretical_loss": 3.7134701156781773, + "tokens_seen": 834893824 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037728184553660984, + "loss": 2.7147, + "theoretical_loss": 3.713441444017405, + "tokens_seen": 834959360 + }, + { + "epoch": 2.07, + "learning_rate": 0.000377271815446339, + "loss": 2.5068, + "theoretical_loss": 3.7134127752370514, + "tokens_seen": 835024896 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003772617853560682, + "loss": 2.7243, + "theoretical_loss": 3.713384109336601, + "tokens_seen": 835090432 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037725175526579744, + "loss": 2.7841, + "theoretical_loss": 3.7133554463155383, + "tokens_seen": 835155968 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037724172517552656, + "loss": 2.8078, + "theoretical_loss": 3.713326786173348, + "tokens_seen": 835221504 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003772316950852558, + "loss": 2.5191, + "theoretical_loss": 3.713298128909516, + "tokens_seen": 835287040 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003772216649949849, + "loss": 2.6828, + "theoretical_loss": 3.7132694745235266, + "tokens_seen": 835352576 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037721163490471416, + "loss": 2.8356, + "theoretical_loss": 3.713240823014866, + "tokens_seen": 835418112 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037720160481444334, + "loss": 2.8376, + "theoretical_loss": 3.7132121743830178, + "tokens_seen": 835483648 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003771915747241725, + "loss": 2.9586, + "theoretical_loss": 3.713183528627469, + "tokens_seen": 835549184 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 939195, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6481375694274902, + "objective/train/theoretical_loss": 3.713169206828146, + "objective/train/tokens_used": 856041952, + "theoretical_loss": 3.713169206828146, + "tokens_seen": 835581952 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003771815446339017, + "loss": 2.8213, + "theoretical_loss": 3.713154885747705, + "tokens_seen": 835614720 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003771715145436309, + "loss": 2.7886, + "theoretical_loss": 3.713126245743211, + "tokens_seen": 835680256 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037716148445336007, + "loss": 2.6125, + "theoretical_loss": 3.7130976086134737, + "tokens_seen": 835745792 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003771514543630893, + "loss": 2.9789, + "theoretical_loss": 3.713068974357979, + "tokens_seen": 835811328 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037714142427281843, + "loss": 2.8882, + "theoretical_loss": 3.7130403429762127, + "tokens_seen": 835876864 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037713139418254766, + "loss": 2.7156, + "theoretical_loss": 3.713011714467661, + "tokens_seen": 835942400 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037712136409227685, + "loss": 2.8133, + "theoretical_loss": 3.7129830888318107, + "tokens_seen": 836007936 + }, + { + "epoch": 2.07, + "learning_rate": 0.000377111334002006, + "loss": 2.8748, + "theoretical_loss": 3.7129544660681493, + "tokens_seen": 836073472 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003771013039117352, + "loss": 2.7447, + "theoretical_loss": 3.7129258461761614, + "tokens_seen": 836139008 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003770912738214644, + "loss": 2.4627, + "theoretical_loss": 3.7128972291553355, + "tokens_seen": 836204544 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037708124373119357, + "loss": 2.8791, + "theoretical_loss": 3.712868615005158, + "tokens_seen": 836270080 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003770712136409228, + "loss": 2.9345, + "theoretical_loss": 3.712840003725116, + "tokens_seen": 836335616 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037706118355065193, + "loss": 2.6005, + "theoretical_loss": 3.7128113953146973, + "tokens_seen": 836401152 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037705115346038117, + "loss": 2.7668, + "theoretical_loss": 3.7127827897733887, + "tokens_seen": 836466688 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003770411233701103, + "loss": 2.6267, + "theoretical_loss": 3.712754187100677, + "tokens_seen": 836532224 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037703109327983953, + "loss": 2.8772, + "theoretical_loss": 3.712725587296051, + "tokens_seen": 836597760 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003770210631895687, + "loss": 2.9275, + "theoretical_loss": 3.7126969903589986, + "tokens_seen": 836663296 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003770110330992979, + "loss": 2.8701, + "theoretical_loss": 3.712668396289007, + "tokens_seen": 836728832 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003770010030090271, + "loss": 2.8012, + "theoretical_loss": 3.712639805085564, + "tokens_seen": 836794368 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037699097291875625, + "loss": 2.7351, + "theoretical_loss": 3.7126112167481584, + "tokens_seen": 836859904 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037698094282848544, + "loss": 2.7632, + "theoretical_loss": 3.712582631276278, + "tokens_seen": 836925440 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037697091273821467, + "loss": 2.9011, + "theoretical_loss": 3.712554048669412, + "tokens_seen": 836990976 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003769608826479438, + "loss": 2.8888, + "theoretical_loss": 3.7125254689270486, + "tokens_seen": 837056512 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037695085255767303, + "loss": 2.9281, + "theoretical_loss": 3.7124968920486756, + "tokens_seen": 837122048 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003769408224674022, + "loss": 2.8598, + "theoretical_loss": 3.7124683180337827, + "tokens_seen": 837187584 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 939764, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9354538917541504, + "objective/train/theoretical_loss": 3.712454032099982, + "objective/train/tokens_used": 857680352, + "theoretical_loss": 3.712454032099982, + "tokens_seen": 837220352 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003769307923771314, + "loss": 2.8442, + "theoretical_loss": 3.7124397468818593, + "tokens_seen": 837253120 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003769207622868606, + "loss": 2.6892, + "theoretical_loss": 3.7124111785923937, + "tokens_seen": 837318656 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037691073219658976, + "loss": 2.6348, + "theoretical_loss": 3.712382613164875, + "tokens_seen": 837384192 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037690070210631894, + "loss": 2.7621, + "theoretical_loss": 3.7123540505987926, + "tokens_seen": 837449728 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003768906720160482, + "loss": 2.8155, + "theoretical_loss": 3.7123254908936367, + "tokens_seen": 837515264 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003768806419257773, + "loss": 2.8675, + "theoretical_loss": 3.7122969340488963, + "tokens_seen": 837580800 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037687061183550654, + "loss": 2.7471, + "theoretical_loss": 3.712268380064061, + "tokens_seen": 837646336 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003768605817452357, + "loss": 2.804, + "theoretical_loss": 3.712239828938621, + "tokens_seen": 837711872 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003768505516549649, + "loss": 2.8195, + "theoretical_loss": 3.712211280672067, + "tokens_seen": 837777408 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037684052156469413, + "loss": 2.9182, + "theoretical_loss": 3.7121827352638874, + "tokens_seen": 837842944 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037683049147442326, + "loss": 2.6297, + "theoretical_loss": 3.712154192713574, + "tokens_seen": 837908480 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003768204613841525, + "loss": 2.5391, + "theoretical_loss": 3.7121256530206157, + "tokens_seen": 837974016 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003768104312938817, + "loss": 2.7109, + "theoretical_loss": 3.7120971161845047, + "tokens_seen": 838039552 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037680040120361086, + "loss": 2.8297, + "theoretical_loss": 3.712068582204731, + "tokens_seen": 838105088 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037679037111334004, + "loss": 3.0213, + "theoretical_loss": 3.712040051080785, + "tokens_seen": 838170624 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003767803410230692, + "loss": 2.8937, + "theoretical_loss": 3.7120115228121575, + "tokens_seen": 838236160 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003767703109327984, + "loss": 2.82, + "theoretical_loss": 3.71198299739834, + "tokens_seen": 838301696 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037676028084252764, + "loss": 2.6802, + "theoretical_loss": 3.711954474838824, + "tokens_seen": 838367232 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037675025075225676, + "loss": 2.8553, + "theoretical_loss": 3.7119259551330996, + "tokens_seen": 838432768 + }, + { + "epoch": 2.07, + "learning_rate": 0.000376740220661986, + "loss": 2.6405, + "theoretical_loss": 3.7118974382806598, + "tokens_seen": 838498304 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003767301905717151, + "loss": 2.9445, + "theoretical_loss": 3.711868924280995, + "tokens_seen": 838563840 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037672016048144436, + "loss": 2.845, + "theoretical_loss": 3.7118404131335976, + "tokens_seen": 838629376 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037671013039117354, + "loss": 2.9059, + "theoretical_loss": 3.7118119048379588, + "tokens_seen": 838694912 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003767001003009027, + "loss": 2.8623, + "theoretical_loss": 3.711783399393571, + "tokens_seen": 838760448 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003766900702106319, + "loss": 3.0017, + "theoretical_loss": 3.711754896799926, + "tokens_seen": 838825984 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 940402, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.893378257751465, + "objective/train/theoretical_loss": 3.7117406465719736, + "objective/train/tokens_used": 859318752, + "theoretical_loss": 3.7117406465719736, + "tokens_seen": 838858752 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003766800401203611, + "loss": 2.7411, + "theoretical_loss": 3.711726397056516, + "tokens_seen": 838891520 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037667001003009027, + "loss": 2.524, + "theoretical_loss": 3.711697900162834, + "tokens_seen": 838957056 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003766599799398195, + "loss": 2.8174, + "theoretical_loss": 3.7116694061183715, + "tokens_seen": 839022592 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037664994984954863, + "loss": 2.7206, + "theoretical_loss": 3.7116409149226213, + "tokens_seen": 839088128 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037663991975927786, + "loss": 2.7177, + "theoretical_loss": 3.711612426575077, + "tokens_seen": 839153664 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037662988966900705, + "loss": 2.7343, + "theoretical_loss": 3.711583941075231, + "tokens_seen": 839219200 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003766198595787362, + "loss": 3.0564, + "theoretical_loss": 3.711555458422576, + "tokens_seen": 839284736 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003766098294884654, + "loss": 2.5675, + "theoretical_loss": 3.711526978616605, + "tokens_seen": 839350272 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003765997993981946, + "loss": 2.6686, + "theoretical_loss": 3.711498501656812, + "tokens_seen": 839415808 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037658976930792377, + "loss": 2.8999, + "theoretical_loss": 3.7114700275426893, + "tokens_seen": 839481344 + }, + { + "epoch": 2.07, + "learning_rate": 0.000376579739217653, + "loss": 2.5858, + "theoretical_loss": 3.7114415562737317, + "tokens_seen": 839546880 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037656970912738213, + "loss": 2.8555, + "theoretical_loss": 3.711413087849432, + "tokens_seen": 839612416 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037655967903711137, + "loss": 2.7198, + "theoretical_loss": 3.711384622269284, + "tokens_seen": 839677952 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003765496489468405, + "loss": 2.9233, + "theoretical_loss": 3.711356159532782, + "tokens_seen": 839743488 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037653961885656973, + "loss": 2.7966, + "theoretical_loss": 3.71132769963942, + "tokens_seen": 839809024 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003765295887662989, + "loss": 2.702, + "theoretical_loss": 3.7112992425886917, + "tokens_seen": 839874560 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003765195586760281, + "loss": 2.7411, + "theoretical_loss": 3.7112707883800917, + "tokens_seen": 839940096 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003765095285857573, + "loss": 2.5153, + "theoretical_loss": 3.7112423370131142, + "tokens_seen": 840005632 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037649949849548645, + "loss": 2.5897, + "theoretical_loss": 3.7112138884872543, + "tokens_seen": 840071168 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037648946840521564, + "loss": 2.5425, + "theoretical_loss": 3.711185442802006, + "tokens_seen": 840136704 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037647943831494487, + "loss": 2.8974, + "theoretical_loss": 3.7111569999568648, + "tokens_seen": 840202240 + }, + { + "epoch": 2.07, + "learning_rate": 0.000376469408224674, + "loss": 2.8294, + "theoretical_loss": 3.711128559951325, + "tokens_seen": 840267776 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037645937813440323, + "loss": 2.9157, + "theoretical_loss": 3.711100122784882, + "tokens_seen": 840333312 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003764493480441324, + "loss": 3.2182, + "theoretical_loss": 3.7110716884570305, + "tokens_seen": 840398848 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003764393179538616, + "loss": 2.7897, + "theoretical_loss": 3.711043256967267, + "tokens_seen": 840464384 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 941411, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.780778169631958, + "objective/train/theoretical_loss": 3.71102904228651, + "objective/train/tokens_used": 860957152, + "theoretical_loss": 3.71102904228651, + "tokens_seen": 840497152 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003764292878635908, + "loss": 2.8305, + "theoretical_loss": 3.7110148283150854, + "tokens_seen": 840529920 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037641925777331996, + "loss": 2.7123, + "theoretical_loss": 3.710986402499983, + "tokens_seen": 840595456 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037640922768304914, + "loss": 3.029, + "theoretical_loss": 3.710957979521454, + "tokens_seen": 840660992 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003763991975927784, + "loss": 2.761, + "theoretical_loss": 3.7109295593789953, + "tokens_seen": 840726528 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003763891675025075, + "loss": 2.6939, + "theoretical_loss": 3.7109011420721023, + "tokens_seen": 840792064 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037637913741223674, + "loss": 2.6892, + "theoretical_loss": 3.710872727600271, + "tokens_seen": 840857600 + }, + { + "epoch": 2.07, + "learning_rate": 0.00037636910732196586, + "loss": 2.8136, + "theoretical_loss": 3.710844315962998, + "tokens_seen": 840923136 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003763590772316951, + "loss": 2.6677, + "theoretical_loss": 3.7108159071597795, + "tokens_seen": 840988672 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003763490471414243, + "loss": 2.7426, + "theoretical_loss": 3.710787501190112, + "tokens_seen": 841054208 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037633901705115346, + "loss": 2.7228, + "theoretical_loss": 3.710759098053492, + "tokens_seen": 841119744 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037632898696088264, + "loss": 2.8451, + "theoretical_loss": 3.7107306977494163, + "tokens_seen": 841185280 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003763189568706119, + "loss": 2.7124, + "theoretical_loss": 3.710702300277382, + "tokens_seen": 841250816 + }, + { + "epoch": 2.08, + "learning_rate": 0.000376308926780341, + "loss": 2.8676, + "theoretical_loss": 3.7106739056368863, + "tokens_seen": 841316352 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037629889669007024, + "loss": 2.812, + "theoretical_loss": 3.7106455138274255, + "tokens_seen": 841381888 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037628886659979937, + "loss": 2.8864, + "theoretical_loss": 3.710617124848497, + "tokens_seen": 841447424 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003762788365095286, + "loss": 2.5823, + "theoretical_loss": 3.710588738699599, + "tokens_seen": 841512960 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003762688064192578, + "loss": 2.6779, + "theoretical_loss": 3.7105603553802284, + "tokens_seen": 841578496 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037625877632898696, + "loss": 2.6513, + "theoretical_loss": 3.710531974889883, + "tokens_seen": 841644032 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037624874623871615, + "loss": 2.9036, + "theoretical_loss": 3.710503597228061, + "tokens_seen": 841709568 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003762387161484453, + "loss": 2.9422, + "theoretical_loss": 3.710475222394259, + "tokens_seen": 841775104 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003762286860581745, + "loss": 2.722, + "theoretical_loss": 3.710446850387976, + "tokens_seen": 841840640 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037621865596790374, + "loss": 2.8682, + "theoretical_loss": 3.7104184812087104, + "tokens_seen": 841906176 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037620862587763287, + "loss": 2.5776, + "theoretical_loss": 3.71039011485596, + "tokens_seen": 841971712 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003761985957873621, + "loss": 2.5632, + "theoretical_loss": 3.7103617513292235, + "tokens_seen": 842037248 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037618856569709123, + "loss": 2.5813, + "theoretical_loss": 3.7103333906279987, + "tokens_seen": 842102784 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 942076, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.738717794418335, + "objective/train/theoretical_loss": 3.710319211336797, + "objective/train/tokens_used": 862595552, + "theoretical_loss": 3.710319211336797, + "tokens_seen": 842135552 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037617853560682047, + "loss": 2.8054, + "theoretical_loss": 3.7103050327517852, + "tokens_seen": 842168320 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037616850551654965, + "loss": 2.7416, + "theoretical_loss": 3.7102766777000813, + "tokens_seen": 842233856 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037615847542627883, + "loss": 3.0039, + "theoretical_loss": 3.7102483254723864, + "tokens_seen": 842299392 + }, + { + "epoch": 2.08, + "learning_rate": 0.000376148445336008, + "loss": 2.9361, + "theoretical_loss": 3.710219976068199, + "tokens_seen": 842364928 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037613841524573725, + "loss": 2.7569, + "theoretical_loss": 3.710191629487018, + "tokens_seen": 842430464 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037612838515546637, + "loss": 2.5037, + "theoretical_loss": 3.7101632857283438, + "tokens_seen": 842496000 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003761183550651956, + "loss": 2.8678, + "theoretical_loss": 3.710134944791675, + "tokens_seen": 842561536 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003761083249749248, + "loss": 2.7399, + "theoretical_loss": 3.7101066066765114, + "tokens_seen": 842627072 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037609829488465397, + "loss": 3.0493, + "theoretical_loss": 3.710078271382353, + "tokens_seen": 842692608 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003760882647943832, + "loss": 2.9969, + "theoretical_loss": 3.7100499389086985, + "tokens_seen": 842758144 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037607823470411233, + "loss": 2.8912, + "theoretical_loss": 3.7100216092550493, + "tokens_seen": 842823680 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037606820461384157, + "loss": 2.9114, + "theoretical_loss": 3.7099932824209043, + "tokens_seen": 842889216 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003760581745235707, + "loss": 2.6763, + "theoretical_loss": 3.709964958405765, + "tokens_seen": 842954752 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037604814443329993, + "loss": 2.7973, + "theoretical_loss": 3.7099366372091303, + "tokens_seen": 843020288 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003760381143430291, + "loss": 3.0025, + "theoretical_loss": 3.709908318830501, + "tokens_seen": 843085824 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003760280842527583, + "loss": 3.03, + "theoretical_loss": 3.7098800032693786, + "tokens_seen": 843151360 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003760180541624875, + "loss": 2.791, + "theoretical_loss": 3.709851690525263, + "tokens_seen": 843216896 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037600802407221665, + "loss": 2.9545, + "theoretical_loss": 3.7098233805976557, + "tokens_seen": 843282432 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037599799398194584, + "loss": 2.5329, + "theoretical_loss": 3.7097950734860565, + "tokens_seen": 843347968 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037598796389167507, + "loss": 2.914, + "theoretical_loss": 3.7097667691899674, + "tokens_seen": 843413504 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003759779338014042, + "loss": 2.8874, + "theoretical_loss": 3.709738467708889, + "tokens_seen": 843479040 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037596790371113343, + "loss": 2.6411, + "theoretical_loss": 3.7097101690423235, + "tokens_seen": 843544576 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003759578736208626, + "loss": 2.6696, + "theoretical_loss": 3.7096818731897723, + "tokens_seen": 843610112 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003759478435305918, + "loss": 2.7156, + "theoretical_loss": 3.709653580150736, + "tokens_seen": 843675648 + }, + { + "epoch": 2.08, + "learning_rate": 0.000375937813440321, + "loss": 2.7071, + "theoretical_loss": 3.7096252899247175, + "tokens_seen": 843741184 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 943510, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8786866664886475, + "objective/train/theoretical_loss": 3.709611145866434, + "objective/train/tokens_used": 864233952, + "theoretical_loss": 3.709611145866434, + "tokens_seen": 843773952 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037592778335005016, + "loss": 2.8376, + "theoretical_loss": 3.7095970025112175, + "tokens_seen": 843806720 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037591775325977934, + "loss": 2.8534, + "theoretical_loss": 3.7095687179097387, + "tokens_seen": 843872256 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003759077231695086, + "loss": 2.9706, + "theoretical_loss": 3.7095404361197835, + "tokens_seen": 843937792 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003758976930792377, + "loss": 2.6948, + "theoretical_loss": 3.7095121571408534, + "tokens_seen": 844003328 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037588766298896694, + "loss": 2.7853, + "theoretical_loss": 3.709483880972451, + "tokens_seen": 844068864 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037587763289869606, + "loss": 2.8065, + "theoretical_loss": 3.709455607614079, + "tokens_seen": 844134400 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003758676028084253, + "loss": 2.7958, + "theoretical_loss": 3.7094273370652404, + "tokens_seen": 844199936 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003758575727181545, + "loss": 2.8232, + "theoretical_loss": 3.709399069325437, + "tokens_seen": 844265472 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037584754262788366, + "loss": 3.139, + "theoretical_loss": 3.7093708043941716, + "tokens_seen": 844331008 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037583751253761284, + "loss": 2.8477, + "theoretical_loss": 3.7093425422709485, + "tokens_seen": 844396544 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003758274824473421, + "loss": 2.8708, + "theoretical_loss": 3.70931428295527, + "tokens_seen": 844462080 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003758174523570712, + "loss": 2.5674, + "theoretical_loss": 3.7092860264466387, + "tokens_seen": 844527616 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037580742226680044, + "loss": 2.7413, + "theoretical_loss": 3.709257772744559, + "tokens_seen": 844593152 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037579739217652957, + "loss": 2.7255, + "theoretical_loss": 3.709229521848534, + "tokens_seen": 844658688 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003757873620862588, + "loss": 2.78, + "theoretical_loss": 3.7092012737580675, + "tokens_seen": 844724224 + }, + { + "epoch": 2.08, + "learning_rate": 0.000375777331995988, + "loss": 2.5741, + "theoretical_loss": 3.709173028472663, + "tokens_seen": 844789760 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037576730190571716, + "loss": 2.8994, + "theoretical_loss": 3.7091447859918247, + "tokens_seen": 844855296 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037575727181544635, + "loss": 2.7549, + "theoretical_loss": 3.709116546315056, + "tokens_seen": 844920832 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003757472417251755, + "loss": 2.8349, + "theoretical_loss": 3.709088309441862, + "tokens_seen": 844986368 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003757372116349047, + "loss": 2.9799, + "theoretical_loss": 3.7090600753717458, + "tokens_seen": 845051904 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037572718154463394, + "loss": 2.9412, + "theoretical_loss": 3.7090318441042127, + "tokens_seen": 845117440 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037571715145436307, + "loss": 2.6342, + "theoretical_loss": 3.7090036156387667, + "tokens_seen": 845182976 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003757071213640923, + "loss": 3.0338, + "theoretical_loss": 3.7089753899749134, + "tokens_seen": 845248512 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037569709127382143, + "loss": 2.7319, + "theoretical_loss": 3.708947167112156, + "tokens_seen": 845314048 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037568706118355067, + "loss": 2.6745, + "theoretical_loss": 3.70891894705, + "tokens_seen": 845379584 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 944052, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.632538318634033, + "objective/train/theoretical_loss": 3.7089048380689933, + "objective/train/tokens_used": 865872352, + "theoretical_loss": 3.7089048380689933, + "tokens_seen": 845412352 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037567703109327985, + "loss": 2.8133, + "theoretical_loss": 3.708890729787951, + "tokens_seen": 845445120 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037566700100300903, + "loss": 2.7545, + "theoretical_loss": 3.7088625153255137, + "tokens_seen": 845510656 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003756569709127382, + "loss": 2.8038, + "theoretical_loss": 3.7088343036621936, + "tokens_seen": 845576192 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037564694082246745, + "loss": 2.8285, + "theoretical_loss": 3.7088060947974957, + "tokens_seen": 845641728 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003756369107321966, + "loss": 2.6281, + "theoretical_loss": 3.7087778887309257, + "tokens_seen": 845707264 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003756268806419258, + "loss": 2.9724, + "theoretical_loss": 3.7087496854619895, + "tokens_seen": 845772800 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037561685055165494, + "loss": 2.9071, + "theoretical_loss": 3.708721484990192, + "tokens_seen": 845838336 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037560682046138417, + "loss": 2.8209, + "theoretical_loss": 3.7086932873150404, + "tokens_seen": 845903872 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037559679037111335, + "loss": 2.725, + "theoretical_loss": 3.7086650924360396, + "tokens_seen": 845969408 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037558676028084253, + "loss": 2.7387, + "theoretical_loss": 3.7086369003526967, + "tokens_seen": 846034944 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003755767301905717, + "loss": 2.6394, + "theoretical_loss": 3.7086087110645165, + "tokens_seen": 846100480 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003755667001003009, + "loss": 2.9535, + "theoretical_loss": 3.708580524571007, + "tokens_seen": 846166016 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003755566700100301, + "loss": 2.7447, + "theoretical_loss": 3.708552340871674, + "tokens_seen": 846231552 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003755466399197593, + "loss": 2.691, + "theoretical_loss": 3.708524159966024, + "tokens_seen": 846297088 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037553660982948844, + "loss": 2.8282, + "theoretical_loss": 3.7084959818535648, + "tokens_seen": 846362624 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003755265797392177, + "loss": 2.7942, + "theoretical_loss": 3.7084678065338013, + "tokens_seen": 846428160 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003755165496489468, + "loss": 2.687, + "theoretical_loss": 3.7084396340062424, + "tokens_seen": 846493696 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037550651955867604, + "loss": 2.9069, + "theoretical_loss": 3.7084114642703945, + "tokens_seen": 846559232 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003754964894684052, + "loss": 2.7304, + "theoretical_loss": 3.7083832973257653, + "tokens_seen": 846624768 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003754864593781344, + "loss": 2.9416, + "theoretical_loss": 3.708355133171861, + "tokens_seen": 846690304 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003754764292878636, + "loss": 2.7376, + "theoretical_loss": 3.708326971808191, + "tokens_seen": 846755840 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003754663991975928, + "loss": 2.8387, + "theoretical_loss": 3.7082988132342605, + "tokens_seen": 846821376 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037545636910732194, + "loss": 2.7832, + "theoretical_loss": 3.70827065744958, + "tokens_seen": 846886912 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003754463390170512, + "loss": 2.7474, + "theoretical_loss": 3.708242504453655, + "tokens_seen": 846952448 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003754363089267803, + "loss": 2.9122, + "theoretical_loss": 3.708214354245995, + "tokens_seen": 847017984 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 945481, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.310624122619629, + "objective/train/theoretical_loss": 3.7082002801876106, + "objective/train/tokens_used": 867510752, + "theoretical_loss": 3.7082002801876106, + "tokens_seen": 847050752 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037542627883650954, + "loss": 2.6795, + "theoretical_loss": 3.708186206826108, + "tokens_seen": 847083520 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003754162487462387, + "loss": 2.6511, + "theoretical_loss": 3.708158062193502, + "tokens_seen": 847149056 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003754062186559679, + "loss": 2.9887, + "theoretical_loss": 3.7081299203476847, + "tokens_seen": 847214592 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003753961885656971, + "loss": 2.9141, + "theoretical_loss": 3.708101781288166, + "tokens_seen": 847280128 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037538615847542626, + "loss": 3.0222, + "theoretical_loss": 3.708073645014454, + "tokens_seen": 847345664 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037537612838515544, + "loss": 2.6518, + "theoretical_loss": 3.708045511526057, + "tokens_seen": 847411200 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003753660982948847, + "loss": 2.5696, + "theoretical_loss": 3.7080173808224846, + "tokens_seen": 847476736 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037535606820461386, + "loss": 2.8286, + "theoretical_loss": 3.707989252903245, + "tokens_seen": 847542272 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037534603811434304, + "loss": 2.7683, + "theoretical_loss": 3.707961127767848, + "tokens_seen": 847607808 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003753360080240723, + "loss": 2.7759, + "theoretical_loss": 3.7079330054158026, + "tokens_seen": 847673344 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003753259779338014, + "loss": 3.1806, + "theoretical_loss": 3.7079048858466184, + "tokens_seen": 847738880 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037531594784353064, + "loss": 2.7589, + "theoretical_loss": 3.707876769059805, + "tokens_seen": 847804416 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037530591775325977, + "loss": 2.6025, + "theoretical_loss": 3.7078486550548715, + "tokens_seen": 847869952 + }, + { + "epoch": 2.08, + "learning_rate": 0.000375295887662989, + "loss": 2.7559, + "theoretical_loss": 3.707820543831328, + "tokens_seen": 847935488 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003752858575727182, + "loss": 2.5894, + "theoretical_loss": 3.7077924353886846, + "tokens_seen": 848001024 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037527582748244736, + "loss": 2.745, + "theoretical_loss": 3.707764329726451, + "tokens_seen": 848066560 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037526579739217655, + "loss": 2.6036, + "theoretical_loss": 3.7077362268441374, + "tokens_seen": 848132096 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003752557673019057, + "loss": 2.531, + "theoretical_loss": 3.707708126741254, + "tokens_seen": 848197632 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003752457372116349, + "loss": 2.9765, + "theoretical_loss": 3.7076800294173116, + "tokens_seen": 848263168 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037523570712136414, + "loss": 2.7127, + "theoretical_loss": 3.70765193487182, + "tokens_seen": 848328704 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037522567703109327, + "loss": 2.587, + "theoretical_loss": 3.707623843104291, + "tokens_seen": 848394240 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003752156469408225, + "loss": 2.6971, + "theoretical_loss": 3.707595754114234, + "tokens_seen": 848459776 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037520561685055163, + "loss": 2.6927, + "theoretical_loss": 3.707567667901161, + "tokens_seen": 848525312 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037519558676028087, + "loss": 2.6176, + "theoretical_loss": 3.707539584464582, + "tokens_seen": 848590848 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037518555667001005, + "loss": 2.6565, + "theoretical_loss": 3.7075115038040085, + "tokens_seen": 848656384 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 946246, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.916497230529785, + "objective/train/theoretical_loss": 3.7074974645145717, + "objective/train/tokens_used": 869149152, + "theoretical_loss": 3.7074974645145717, + "tokens_seen": 848689152 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037517552657973923, + "loss": 2.8534, + "theoretical_loss": 3.7074834259189524, + "tokens_seen": 848721920 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003751654964894684, + "loss": 2.5884, + "theoretical_loss": 3.7074553508089245, + "tokens_seen": 848787456 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037515546639919765, + "loss": 2.5969, + "theoretical_loss": 3.7074272784734363, + "tokens_seen": 848852992 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003751454363089268, + "loss": 2.7696, + "theoretical_loss": 3.7073992089119994, + "tokens_seen": 848918528 + }, + { + "epoch": 2.08, + "learning_rate": 0.000375135406218656, + "loss": 2.6844, + "theoretical_loss": 3.7073711421241256, + "tokens_seen": 848984064 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037512537612838514, + "loss": 2.6811, + "theoretical_loss": 3.707343078109327, + "tokens_seen": 849049600 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037511534603811437, + "loss": 2.9713, + "theoretical_loss": 3.707315016867115, + "tokens_seen": 849115136 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037510531594784355, + "loss": 2.8274, + "theoretical_loss": 3.7072869583970025, + "tokens_seen": 849180672 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037509528585757273, + "loss": 2.7507, + "theoretical_loss": 3.7072589026985012, + "tokens_seen": 849246208 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003750852557673019, + "loss": 3.0336, + "theoretical_loss": 3.707230849771124, + "tokens_seen": 849311744 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003750752256770311, + "loss": 2.9991, + "theoretical_loss": 3.707202799614383, + "tokens_seen": 849377280 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003750651955867603, + "loss": 2.6436, + "theoretical_loss": 3.7071747522277905, + "tokens_seen": 849442816 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003750551654964895, + "loss": 2.7344, + "theoretical_loss": 3.70714670761086, + "tokens_seen": 849508352 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037504513540621864, + "loss": 2.7969, + "theoretical_loss": 3.7071186657631037, + "tokens_seen": 849573888 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003750351053159479, + "loss": 2.8591, + "theoretical_loss": 3.7070906266840353, + "tokens_seen": 849639424 + }, + { + "epoch": 2.08, + "learning_rate": 0.000375025075225677, + "loss": 3.018, + "theoretical_loss": 3.707062590373167, + "tokens_seen": 849704960 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037501504513540624, + "loss": 2.6358, + "theoretical_loss": 3.7070345568300125, + "tokens_seen": 849770496 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003750050150451354, + "loss": 2.9031, + "theoretical_loss": 3.707006526054085, + "tokens_seen": 849836032 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003749949849548646, + "loss": 2.6426, + "theoretical_loss": 3.706978498044898, + "tokens_seen": 849901568 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003749849548645938, + "loss": 2.6169, + "theoretical_loss": 3.7069504728019655, + "tokens_seen": 849967104 + }, + { + "epoch": 2.08, + "learning_rate": 0.000374974924774323, + "loss": 2.8302, + "theoretical_loss": 3.706922450324801, + "tokens_seen": 850032640 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037496489468405214, + "loss": 2.7208, + "theoretical_loss": 3.706894430612919, + "tokens_seen": 850098176 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003749548645937814, + "loss": 2.7541, + "theoretical_loss": 3.7068664136658316, + "tokens_seen": 850163712 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003749448345035105, + "loss": 2.7727, + "theoretical_loss": 3.7068383994830545, + "tokens_seen": 850229248 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037493480441323974, + "loss": 2.6672, + "theoretical_loss": 3.7068103880641017, + "tokens_seen": 850294784 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 947576, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4943060874938965, + "objective/train/theoretical_loss": 3.7067963833909072, + "objective/train/tokens_used": 870787552, + "theoretical_loss": 3.7067963833909072, + "tokens_seen": 850327552 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003749247743229689, + "loss": 2.6507, + "theoretical_loss": 3.706782379408487, + "tokens_seen": 850360320 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003749147442326981, + "loss": 2.79, + "theoretical_loss": 3.706754373515725, + "tokens_seen": 850425856 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003749047141424273, + "loss": 2.9062, + "theoretical_loss": 3.7067263703853306, + "tokens_seen": 850491392 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037489468405215646, + "loss": 2.8495, + "theoretical_loss": 3.7066983700168183, + "tokens_seen": 850556928 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037488465396188564, + "loss": 2.6916, + "theoretical_loss": 3.706670372409703, + "tokens_seen": 850622464 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003748746238716149, + "loss": 2.7941, + "theoretical_loss": 3.7066423775635, + "tokens_seen": 850688000 + }, + { + "epoch": 2.08, + "learning_rate": 0.000374864593781344, + "loss": 2.8913, + "theoretical_loss": 3.7066143854777236, + "tokens_seen": 850753536 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037485456369107324, + "loss": 3.0094, + "theoretical_loss": 3.7065863961518897, + "tokens_seen": 850819072 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037484453360080237, + "loss": 2.9763, + "theoretical_loss": 3.7065584095855133, + "tokens_seen": 850884608 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003748345035105316, + "loss": 2.9927, + "theoretical_loss": 3.7065304257781095, + "tokens_seen": 850950144 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003748244734202608, + "loss": 2.7568, + "theoretical_loss": 3.7065024447291943, + "tokens_seen": 851015680 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037481444332998997, + "loss": 2.721, + "theoretical_loss": 3.7064744664382836, + "tokens_seen": 851081216 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037480441323971915, + "loss": 2.5999, + "theoretical_loss": 3.706446490904893, + "tokens_seen": 851146752 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003747943831494484, + "loss": 2.7467, + "theoretical_loss": 3.7064185181285376, + "tokens_seen": 851212288 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003747843530591775, + "loss": 2.6778, + "theoretical_loss": 3.706390548108735, + "tokens_seen": 851277824 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037477432296890675, + "loss": 2.6205, + "theoretical_loss": 3.706362580845, + "tokens_seen": 851343360 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037476429287863587, + "loss": 2.888, + "theoretical_loss": 3.70633461633685, + "tokens_seen": 851408896 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003747542627883651, + "loss": 2.4779, + "theoretical_loss": 3.706306654583801, + "tokens_seen": 851474432 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003747442326980943, + "loss": 2.6989, + "theoretical_loss": 3.706278695585369, + "tokens_seen": 851539968 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037473420260782347, + "loss": 2.9415, + "theoretical_loss": 3.706250739341071, + "tokens_seen": 851605504 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037472417251755265, + "loss": 2.7205, + "theoretical_loss": 3.706222785850424, + "tokens_seen": 851671040 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037471414242728183, + "loss": 2.6934, + "theoretical_loss": 3.706194835112945, + "tokens_seen": 851736576 + }, + { + "epoch": 2.08, + "learning_rate": 0.000374704112337011, + "loss": 2.6926, + "theoretical_loss": 3.706166887128151, + "tokens_seen": 851802112 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037469408224674025, + "loss": 2.8434, + "theoretical_loss": 3.7061389418955586, + "tokens_seen": 851867648 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003746840521564694, + "loss": 2.5986, + "theoretical_loss": 3.706110999414686, + "tokens_seen": 851933184 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 948304, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5480265617370605, + "objective/train/theoretical_loss": 3.706097029205993, + "objective/train/tokens_used": 872425952, + "theoretical_loss": 3.706097029205993, + "tokens_seen": 851965952 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003746740220661986, + "loss": 2.8042, + "theoretical_loss": 3.7060830596850494, + "tokens_seen": 851998720 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037466399197592774, + "loss": 2.5988, + "theoretical_loss": 3.706055122706167, + "tokens_seen": 852064256 + }, + { + "epoch": 2.08, + "learning_rate": 0.000374653961885657, + "loss": 2.5353, + "theoretical_loss": 3.706027188477557, + "tokens_seen": 852129792 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037464393179538615, + "loss": 2.8022, + "theoretical_loss": 3.7059992569987354, + "tokens_seen": 852195328 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037463390170511534, + "loss": 2.8748, + "theoretical_loss": 3.7059713282692224, + "tokens_seen": 852260864 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037462387161484457, + "loss": 2.779, + "theoretical_loss": 3.7059434022885345, + "tokens_seen": 852326400 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037461384152457375, + "loss": 2.7233, + "theoretical_loss": 3.7059154790561895, + "tokens_seen": 852391936 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037460381143430293, + "loss": 2.7193, + "theoretical_loss": 3.705887558571707, + "tokens_seen": 852457472 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003745937813440321, + "loss": 2.6575, + "theoretical_loss": 3.7058596408346043, + "tokens_seen": 852523008 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003745837512537613, + "loss": 2.738, + "theoretical_loss": 3.705831725844401, + "tokens_seen": 852588544 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003745737211634905, + "loss": 2.6115, + "theoretical_loss": 3.7058038136006144, + "tokens_seen": 852654080 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003745636910732197, + "loss": 2.5825, + "theoretical_loss": 3.705775904102764, + "tokens_seen": 852719616 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037455366098294884, + "loss": 2.6179, + "theoretical_loss": 3.705747997350368, + "tokens_seen": 852785152 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003745436308926781, + "loss": 2.7327, + "theoretical_loss": 3.7057200933429466, + "tokens_seen": 852850688 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003745336008024072, + "loss": 2.8464, + "theoretical_loss": 3.7056921920800177, + "tokens_seen": 852916224 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037452357071213644, + "loss": 2.8869, + "theoretical_loss": 3.7056642935611013, + "tokens_seen": 852981760 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003745135406218656, + "loss": 2.6987, + "theoretical_loss": 3.7056363977857156, + "tokens_seen": 853047296 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003745035105315948, + "loss": 2.8166, + "theoretical_loss": 3.7056085047533815, + "tokens_seen": 853112832 + }, + { + "epoch": 2.08, + "learning_rate": 0.000374493480441324, + "loss": 2.4584, + "theoretical_loss": 3.705580614463618, + "tokens_seen": 853178368 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003744834503510532, + "loss": 2.803, + "theoretical_loss": 3.7055527269159443, + "tokens_seen": 853243904 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037447342026078234, + "loss": 2.7901, + "theoretical_loss": 3.705524842109881, + "tokens_seen": 853309440 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003744633901705116, + "loss": 2.9767, + "theoretical_loss": 3.7054969600449477, + "tokens_seen": 853374976 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003744533600802407, + "loss": 2.7746, + "theoretical_loss": 3.705469080720664, + "tokens_seen": 853440512 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037444332998996994, + "loss": 2.8606, + "theoretical_loss": 3.7054412041365508, + "tokens_seen": 853506048 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003744332998996991, + "loss": 2.6336, + "theoretical_loss": 3.705413330292128, + "tokens_seen": 853571584 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 949423, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8431296348571777, + "objective/train/theoretical_loss": 3.7053993943971513, + "objective/train/tokens_used": 874064352, + "theoretical_loss": 3.7053993943971513, + "tokens_seen": 853604352 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003744232698094283, + "loss": 2.6529, + "theoretical_loss": 3.7053854591869166, + "tokens_seen": 853637120 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003744132397191575, + "loss": 3.0092, + "theoretical_loss": 3.7053575908204364, + "tokens_seen": 853702656 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037440320962888666, + "loss": 2.6759, + "theoretical_loss": 3.705329725192209, + "tokens_seen": 853768192 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037439317953861585, + "loss": 2.5965, + "theoretical_loss": 3.705301862301754, + "tokens_seen": 853833728 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003743831494483451, + "loss": 2.7665, + "theoretical_loss": 3.7052740021485926, + "tokens_seen": 853899264 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003743731193580742, + "loss": 3.0544, + "theoretical_loss": 3.7052461447322464, + "tokens_seen": 853964800 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037436308926780344, + "loss": 2.8876, + "theoretical_loss": 3.705218290052237, + "tokens_seen": 854030336 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037435305917753257, + "loss": 2.7202, + "theoretical_loss": 3.705190438108084, + "tokens_seen": 854095872 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003743430290872618, + "loss": 2.7702, + "theoretical_loss": 3.7051625888993103, + "tokens_seen": 854161408 + }, + { + "epoch": 2.08, + "learning_rate": 0.000374332998996991, + "loss": 2.811, + "theoretical_loss": 3.7051347424254364, + "tokens_seen": 854226944 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037432296890672017, + "loss": 3.0093, + "theoretical_loss": 3.705106898685985, + "tokens_seen": 854292480 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037431293881644935, + "loss": 2.5793, + "theoretical_loss": 3.7050790576804764, + "tokens_seen": 854358016 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003743029087261786, + "loss": 2.5169, + "theoretical_loss": 3.7050512194084337, + "tokens_seen": 854423552 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003742928786359077, + "loss": 2.7798, + "theoretical_loss": 3.7050233838693787, + "tokens_seen": 854489088 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037428284854563695, + "loss": 2.5137, + "theoretical_loss": 3.704995551062833, + "tokens_seen": 854554624 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037427281845536607, + "loss": 2.6821, + "theoretical_loss": 3.7049677209883196, + "tokens_seen": 854620160 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003742627883650953, + "loss": 2.8419, + "theoretical_loss": 3.70493989364536, + "tokens_seen": 854685696 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003742527582748245, + "loss": 2.8814, + "theoretical_loss": 3.704912069033477, + "tokens_seen": 854751232 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037424272818455367, + "loss": 2.6939, + "theoretical_loss": 3.7048842471521937, + "tokens_seen": 854816768 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037423269809428285, + "loss": 2.8317, + "theoretical_loss": 3.7048564280010323, + "tokens_seen": 854882304 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037422266800401203, + "loss": 2.9259, + "theoretical_loss": 3.704828611579515, + "tokens_seen": 854947840 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003742126379137412, + "loss": 2.9123, + "theoretical_loss": 3.704800797887166, + "tokens_seen": 855013376 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037420260782347045, + "loss": 2.8707, + "theoretical_loss": 3.7047729869235084, + "tokens_seen": 855078912 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003741925777331996, + "loss": 2.7544, + "theoretical_loss": 3.7047451786880643, + "tokens_seen": 855144448 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003741825476429288, + "loss": 2.8984, + "theoretical_loss": 3.7047173731803573, + "tokens_seen": 855209984 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 949852, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5512642860412598, + "objective/train/theoretical_loss": 3.704703471449257, + "objective/train/tokens_used": 875702752, + "theoretical_loss": 3.704703471449257, + "tokens_seen": 855242752 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037417251755265794, + "loss": 2.6332, + "theoretical_loss": 3.7046895703999114, + "tokens_seen": 855275520 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003741624874623872, + "loss": 2.7775, + "theoretical_loss": 3.7046617703462497, + "tokens_seen": 855341056 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037415245737211635, + "loss": 2.828, + "theoretical_loss": 3.7046339730188964, + "tokens_seen": 855406592 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037414242728184554, + "loss": 2.6766, + "theoretical_loss": 3.7046061784173743, + "tokens_seen": 855472128 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003741323971915747, + "loss": 2.531, + "theoretical_loss": 3.7045783865412085, + "tokens_seen": 855537664 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037412236710130395, + "loss": 2.6946, + "theoretical_loss": 3.704550597389922, + "tokens_seen": 855603200 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003741123370110331, + "loss": 2.7786, + "theoretical_loss": 3.7045228109630397, + "tokens_seen": 855668736 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003741023069207623, + "loss": 2.8376, + "theoretical_loss": 3.7044950272600854, + "tokens_seen": 855734272 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037409227683049144, + "loss": 2.8049, + "theoretical_loss": 3.704467246280584, + "tokens_seen": 855799808 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003740822467402207, + "loss": 2.8418, + "theoretical_loss": 3.7044394680240593, + "tokens_seen": 855865344 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037407221664994986, + "loss": 2.5376, + "theoretical_loss": 3.7044116924900363, + "tokens_seen": 855930880 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037406218655967904, + "loss": 2.565, + "theoretical_loss": 3.7043839196780404, + "tokens_seen": 855996416 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003740521564694082, + "loss": 2.7114, + "theoretical_loss": 3.7043561495875954, + "tokens_seen": 856061952 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003740421263791374, + "loss": 2.8716, + "theoretical_loss": 3.7043283822182262, + "tokens_seen": 856127488 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003740320962888666, + "loss": 2.7345, + "theoretical_loss": 3.704300617569459, + "tokens_seen": 856193024 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003740220661985958, + "loss": 2.7921, + "theoretical_loss": 3.704272855640818, + "tokens_seen": 856258560 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037401203610832494, + "loss": 2.8099, + "theoretical_loss": 3.70424509643183, + "tokens_seen": 856324096 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003740020060180542, + "loss": 2.6542, + "theoretical_loss": 3.7042173399420184, + "tokens_seen": 856389632 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037399197592778336, + "loss": 2.5126, + "theoretical_loss": 3.7041895861709104, + "tokens_seen": 856455168 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037398194583751254, + "loss": 2.8828, + "theoretical_loss": 3.7041618351180308, + "tokens_seen": 856520704 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003739719157472417, + "loss": 2.5162, + "theoretical_loss": 3.704134086782906, + "tokens_seen": 856586240 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003739618856569709, + "loss": 2.7155, + "theoretical_loss": 3.704106341165062, + "tokens_seen": 856651776 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003739518555667001, + "loss": 2.8485, + "theoretical_loss": 3.7040785982640236, + "tokens_seen": 856717312 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003739418254764293, + "loss": 2.7267, + "theoretical_loss": 3.7040508580793188, + "tokens_seen": 856782848 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037393179538615845, + "loss": 2.8417, + "theoretical_loss": 3.704023120610473, + "tokens_seen": 856848384 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 951087, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8513641357421875, + "objective/train/theoretical_loss": 3.7040092528943487, + "objective/train/tokens_used": 877341152, + "theoretical_loss": 3.7040092528943487, + "tokens_seen": 856881152 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003739217652958877, + "loss": 2.7939, + "theoretical_loss": 3.703995385857012, + "tokens_seen": 856913920 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003739117352056168, + "loss": 2.8448, + "theoretical_loss": 3.703967653818464, + "tokens_seen": 856979456 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037390170511534605, + "loss": 2.7197, + "theoretical_loss": 3.703939924494353, + "tokens_seen": 857044992 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003738916750250752, + "loss": 2.5392, + "theoretical_loss": 3.703912197884209, + "tokens_seen": 857110528 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003738816449348044, + "loss": 2.761, + "theoretical_loss": 3.7038844739875567, + "tokens_seen": 857176064 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037387161484453364, + "loss": 2.6546, + "theoretical_loss": 3.703856752803924, + "tokens_seen": 857241600 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037386158475426277, + "loss": 2.7587, + "theoretical_loss": 3.7038290343328377, + "tokens_seen": 857307136 + }, + { + "epoch": 2.08, + "learning_rate": 0.000373851554663992, + "loss": 2.7639, + "theoretical_loss": 3.7038013185738246, + "tokens_seen": 857372672 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003738415245737212, + "loss": 2.6566, + "theoretical_loss": 3.703773605526413, + "tokens_seen": 857438208 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037383149448345037, + "loss": 2.8388, + "theoretical_loss": 3.7037458951901296, + "tokens_seen": 857503744 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037382146439317955, + "loss": 2.6906, + "theoretical_loss": 3.703718187564503, + "tokens_seen": 857569280 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003738114343029088, + "loss": 2.6582, + "theoretical_loss": 3.7036904826490598, + "tokens_seen": 857634816 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003738014042126379, + "loss": 2.8538, + "theoretical_loss": 3.703662780443328, + "tokens_seen": 857700352 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037379137412236715, + "loss": 2.6117, + "theoretical_loss": 3.7036350809468366, + "tokens_seen": 857765888 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037378134403209627, + "loss": 2.6217, + "theoretical_loss": 3.7036073841591124, + "tokens_seen": 857831424 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003737713139418255, + "loss": 2.6182, + "theoretical_loss": 3.7035796900796845, + "tokens_seen": 857896960 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003737612838515547, + "loss": 2.8175, + "theoretical_loss": 3.7035519987080807, + "tokens_seen": 857962496 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037375125376128387, + "loss": 2.9811, + "theoretical_loss": 3.7035243100438295, + "tokens_seen": 858028032 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037374122367101305, + "loss": 2.9455, + "theoretical_loss": 3.7034966240864593, + "tokens_seen": 858093568 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037373119358074223, + "loss": 2.9907, + "theoretical_loss": 3.7034689408354993, + "tokens_seen": 858159104 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003737211634904714, + "loss": 2.851, + "theoretical_loss": 3.7034412602904783, + "tokens_seen": 858224640 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037371113340020065, + "loss": 2.6784, + "theoretical_loss": 3.7034135824509242, + "tokens_seen": 858290176 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003737011033099298, + "loss": 2.8578, + "theoretical_loss": 3.703385907316367, + "tokens_seen": 858355712 + }, + { + "epoch": 2.08, + "learning_rate": 0.000373691073219659, + "loss": 2.8543, + "theoretical_loss": 3.703358234886336, + "tokens_seen": 858421248 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037368104312938814, + "loss": 2.9281, + "theoretical_loss": 3.703330565160359, + "tokens_seen": 858486784 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 951523, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.799268960952759, + "objective/train/theoretical_loss": 3.7033167313112445, + "objective/train/tokens_used": 878979552, + "theoretical_loss": 3.7033167313112445, + "tokens_seen": 858519552 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003736710130391174, + "loss": 2.8842, + "theoretical_loss": 3.7033028981379674, + "tokens_seen": 858552320 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037366098294884655, + "loss": 2.5017, + "theoretical_loss": 3.703275233818689, + "tokens_seen": 858617856 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037365095285857574, + "loss": 2.7812, + "theoretical_loss": 3.7032475722020544, + "tokens_seen": 858683392 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003736409227683049, + "loss": 3.0125, + "theoretical_loss": 3.703219913287593, + "tokens_seen": 858748928 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037363089267803415, + "loss": 2.855, + "theoretical_loss": 3.7031922570748343, + "tokens_seen": 858814464 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003736208625877633, + "loss": 2.7978, + "theoretical_loss": 3.703164603563309, + "tokens_seen": 858880000 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003736108324974925, + "loss": 2.3875, + "theoretical_loss": 3.7031369527525464, + "tokens_seen": 858945536 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037360080240722164, + "loss": 2.9526, + "theoretical_loss": 3.7031093046420773, + "tokens_seen": 859011072 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003735907723169509, + "loss": 2.5221, + "theoretical_loss": 3.703081659231432, + "tokens_seen": 859076608 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037358074222668006, + "loss": 2.9902, + "theoretical_loss": 3.7030540165201407, + "tokens_seen": 859142144 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037357071213640924, + "loss": 2.5429, + "theoretical_loss": 3.7030263765077334, + "tokens_seen": 859207680 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003735606820461384, + "loss": 2.673, + "theoretical_loss": 3.702998739193742, + "tokens_seen": 859273216 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003735506519558676, + "loss": 3.012, + "theoretical_loss": 3.7029711045776965, + "tokens_seen": 859338752 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003735406218655968, + "loss": 2.6181, + "theoretical_loss": 3.7029434726591277, + "tokens_seen": 859404288 + }, + { + "epoch": 2.08, + "learning_rate": 0.000373530591775326, + "loss": 2.7048, + "theoretical_loss": 3.702915843437567, + "tokens_seen": 859469824 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037352056168505514, + "loss": 2.5463, + "theoretical_loss": 3.7028882169125454, + "tokens_seen": 859535360 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003735105315947844, + "loss": 2.394, + "theoretical_loss": 3.7028605930835945, + "tokens_seen": 859600896 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037350050150451356, + "loss": 2.7804, + "theoretical_loss": 3.702832971950245, + "tokens_seen": 859666432 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037349047141424274, + "loss": 2.6185, + "theoretical_loss": 3.702805353512029, + "tokens_seen": 859731968 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003734804413239719, + "loss": 2.8201, + "theoretical_loss": 3.7027777377684776, + "tokens_seen": 859797504 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003734704112337011, + "loss": 3.0059, + "theoretical_loss": 3.7027501247191226, + "tokens_seen": 859863040 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003734603811434303, + "loss": 2.8458, + "theoretical_loss": 3.7027225143634963, + "tokens_seen": 859928576 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003734503510531595, + "loss": 2.6558, + "theoretical_loss": 3.7026949067011303, + "tokens_seen": 859994112 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037344032096288865, + "loss": 2.7805, + "theoretical_loss": 3.7026673017315566, + "tokens_seen": 860059648 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003734302908726179, + "loss": 2.6283, + "theoretical_loss": 3.7026396994543074, + "tokens_seen": 860125184 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 952944, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6841440200805664, + "objective/train/theoretical_loss": 3.7026258993251586, + "objective/train/tokens_used": 880617952, + "theoretical_loss": 3.7026258993251586, + "tokens_seen": 860157952 + }, + { + "epoch": 2.08, + "learning_rate": 0.000373420260782347, + "loss": 2.8276, + "theoretical_loss": 3.702612099868915, + "tokens_seen": 860190720 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037341023069207625, + "loss": 2.9939, + "theoretical_loss": 3.7025845029749123, + "tokens_seen": 860256256 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003734002006018054, + "loss": 2.696, + "theoretical_loss": 3.7025569087718315, + "tokens_seen": 860321792 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003733901705115346, + "loss": 2.8568, + "theoretical_loss": 3.7025293172592053, + "tokens_seen": 860387328 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003733801404212638, + "loss": 2.6686, + "theoretical_loss": 3.702501728436566, + "tokens_seen": 860452864 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037337011033099297, + "loss": 2.752, + "theoretical_loss": 3.7024741423034477, + "tokens_seen": 860518400 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037336008024072215, + "loss": 2.6242, + "theoretical_loss": 3.702446558859382, + "tokens_seen": 860583936 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003733500501504514, + "loss": 2.7184, + "theoretical_loss": 3.702418978103903, + "tokens_seen": 860649472 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003733400200601805, + "loss": 2.8764, + "theoretical_loss": 3.702391400036543, + "tokens_seen": 860715008 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037332998996990975, + "loss": 2.5713, + "theoretical_loss": 3.7023638246568367, + "tokens_seen": 860780544 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037331995987963893, + "loss": 2.7039, + "theoretical_loss": 3.702336251964317, + "tokens_seen": 860846080 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003733099297893681, + "loss": 2.7103, + "theoretical_loss": 3.7023086819585167, + "tokens_seen": 860911616 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003732998996990973, + "loss": 2.808, + "theoretical_loss": 3.7022811146389696, + "tokens_seen": 860977152 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003732898696088265, + "loss": 2.9242, + "theoretical_loss": 3.702253550005211, + "tokens_seen": 861042688 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037327983951855565, + "loss": 2.7875, + "theoretical_loss": 3.7022259880567736, + "tokens_seen": 861108224 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003732698094282849, + "loss": 2.8359, + "theoretical_loss": 3.702198428793192, + "tokens_seen": 861173760 + }, + { + "epoch": 2.08, + "learning_rate": 0.000373259779338014, + "loss": 2.7244, + "theoretical_loss": 3.702170872214, + "tokens_seen": 861239296 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037324974924774325, + "loss": 2.8139, + "theoretical_loss": 3.702143318318732, + "tokens_seen": 861304832 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003732397191574724, + "loss": 2.6474, + "theoretical_loss": 3.702115767106922, + "tokens_seen": 861370368 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003732296890672016, + "loss": 2.7847, + "theoretical_loss": 3.7020882185781057, + "tokens_seen": 861435904 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003732196589769308, + "loss": 2.8831, + "theoretical_loss": 3.7020606727318164, + "tokens_seen": 861501440 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037320962888666, + "loss": 2.6749, + "theoretical_loss": 3.7020331295675897, + "tokens_seen": 861566976 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037319959879638916, + "loss": 2.5583, + "theoretical_loss": 3.70200558908496, + "tokens_seen": 861632512 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037318956870611834, + "loss": 2.8254, + "theoretical_loss": 3.7019780512834632, + "tokens_seen": 861698048 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003731795386158475, + "loss": 2.7722, + "theoretical_loss": 3.7019505161626327, + "tokens_seen": 861763584 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 953723, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7122349739074707, + "objective/train/theoretical_loss": 3.7019367496073228, + "objective/train/tokens_used": 882256352, + "theoretical_loss": 3.7019367496073228, + "tokens_seen": 861796352 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037316950852557675, + "loss": 2.8407, + "theoretical_loss": 3.701922983722006, + "tokens_seen": 861829120 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003731594784353059, + "loss": 2.7291, + "theoretical_loss": 3.701895453961116, + "tokens_seen": 861894656 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003731494483450351, + "loss": 2.7609, + "theoretical_loss": 3.7018679268794994, + "tokens_seen": 861960192 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003731394182547643, + "loss": 2.7356, + "theoretical_loss": 3.701840402476692, + "tokens_seen": 862025728 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003731293881644935, + "loss": 2.8384, + "theoretical_loss": 3.7018128807522297, + "tokens_seen": 862091264 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003731193580742227, + "loss": 2.7187, + "theoretical_loss": 3.701785361705647, + "tokens_seen": 862156800 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037310932798395184, + "loss": 2.7171, + "theoretical_loss": 3.7017578453364806, + "tokens_seen": 862222336 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003730992978936811, + "loss": 2.8718, + "theoretical_loss": 3.701730331644267, + "tokens_seen": 862287872 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037308926780341026, + "loss": 2.8957, + "theoretical_loss": 3.701702820628541, + "tokens_seen": 862353408 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037307923771313944, + "loss": 2.9734, + "theoretical_loss": 3.701675312288841, + "tokens_seen": 862418944 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003730692076228686, + "loss": 2.8563, + "theoretical_loss": 3.701647806624701, + "tokens_seen": 862484480 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003730591775325978, + "loss": 2.724, + "theoretical_loss": 3.701620303635659, + "tokens_seen": 862550016 + }, + { + "epoch": 2.08, + "learning_rate": 0.000373049147442327, + "loss": 2.676, + "theoretical_loss": 3.701592803321251, + "tokens_seen": 862615552 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003730391173520562, + "loss": 2.7422, + "theoretical_loss": 3.7015653056810143, + "tokens_seen": 862681088 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037302908726178534, + "loss": 2.8457, + "theoretical_loss": 3.701537810714485, + "tokens_seen": 862746624 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003730190571715146, + "loss": 2.6289, + "theoretical_loss": 3.7015103184212004, + "tokens_seen": 862812160 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037300902708124376, + "loss": 2.7261, + "theoretical_loss": 3.7014828288006973, + "tokens_seen": 862877696 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037299899699097294, + "loss": 2.7542, + "theoretical_loss": 3.7014553418525136, + "tokens_seen": 862943232 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003729889669007021, + "loss": 2.5174, + "theoretical_loss": 3.701427857576186, + "tokens_seen": 863008768 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003729789368104313, + "loss": 2.8556, + "theoretical_loss": 3.701400375971252, + "tokens_seen": 863074304 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003729689067201605, + "loss": 2.8079, + "theoretical_loss": 3.701372897037249, + "tokens_seen": 863139840 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003729588766298897, + "loss": 2.7774, + "theoretical_loss": 3.7013454207737153, + "tokens_seen": 863205376 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037294884653961885, + "loss": 2.8166, + "theoretical_loss": 3.701317947180188, + "tokens_seen": 863270912 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003729388164493481, + "loss": 2.518, + "theoretical_loss": 3.7012904762562053, + "tokens_seen": 863336448 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003729287863590772, + "loss": 2.8338, + "theoretical_loss": 3.701263008001305, + "tokens_seen": 863401984 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 954800, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.510504961013794, + "objective/train/theoretical_loss": 3.701249274874616, + "objective/train/tokens_used": 883894752, + "theoretical_loss": 3.701249274874616, + "tokens_seen": 863434752 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037291875626880645, + "loss": 2.7716, + "theoretical_loss": 3.7012355424150254, + "tokens_seen": 863467520 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003729087261785356, + "loss": 2.7719, + "theoretical_loss": 3.7012080794969044, + "tokens_seen": 863533056 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003728986960882648, + "loss": 2.7007, + "theoretical_loss": 3.7011806192464807, + "tokens_seen": 863598592 + }, + { + "epoch": 2.08, + "learning_rate": 0.000372888665997994, + "loss": 2.5711, + "theoretical_loss": 3.7011531616632922, + "tokens_seen": 863664128 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037287863590772317, + "loss": 2.7918, + "theoretical_loss": 3.701125706746878, + "tokens_seen": 863729664 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037286860581745235, + "loss": 2.8889, + "theoretical_loss": 3.701098254496777, + "tokens_seen": 863795200 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003728585757271816, + "loss": 2.9669, + "theoretical_loss": 3.7010708049125274, + "tokens_seen": 863860736 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003728485456369107, + "loss": 2.6103, + "theoretical_loss": 3.701043357993668, + "tokens_seen": 863926272 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037283851554663995, + "loss": 2.7198, + "theoretical_loss": 3.701015913739739, + "tokens_seen": 863991808 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037282848545636913, + "loss": 2.7146, + "theoretical_loss": 3.7009884721502777, + "tokens_seen": 864057344 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003728184553660983, + "loss": 2.8361, + "theoretical_loss": 3.700961033224825, + "tokens_seen": 864122880 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003728084252758275, + "loss": 2.7008, + "theoretical_loss": 3.7009335969629196, + "tokens_seen": 864188416 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003727983951855567, + "loss": 2.7614, + "theoretical_loss": 3.7009061633641007, + "tokens_seen": 864253952 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037278836509528585, + "loss": 2.7302, + "theoretical_loss": 3.700878732427908, + "tokens_seen": 864319488 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003727783350050151, + "loss": 2.6105, + "theoretical_loss": 3.7008513041538817, + "tokens_seen": 864385024 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003727683049147442, + "loss": 2.9334, + "theoretical_loss": 3.700823878541562, + "tokens_seen": 864450560 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037275827482447345, + "loss": 2.7052, + "theoretical_loss": 3.7007964555904875, + "tokens_seen": 864516096 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003727482447342026, + "loss": 2.8405, + "theoretical_loss": 3.7007690353001985, + "tokens_seen": 864581632 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003727382146439318, + "loss": 2.6395, + "theoretical_loss": 3.700741617670236, + "tokens_seen": 864647168 + }, + { + "epoch": 2.08, + "learning_rate": 0.000372728184553661, + "loss": 2.6473, + "theoretical_loss": 3.70071420270014, + "tokens_seen": 864712704 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003727181544633902, + "loss": 2.8752, + "theoretical_loss": 3.700686790389451, + "tokens_seen": 864778240 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037270812437311936, + "loss": 2.852, + "theoretical_loss": 3.700659380737709, + "tokens_seen": 864843776 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037269809428284854, + "loss": 2.9933, + "theoretical_loss": 3.7006319737444553, + "tokens_seen": 864909312 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003726880641925777, + "loss": 2.5759, + "theoretical_loss": 3.7006045694092298, + "tokens_seen": 864974848 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037267803410230696, + "loss": 2.9994, + "theoretical_loss": 3.700577167731574, + "tokens_seen": 865040384 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 955501, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.222694158554077, + "objective/train/theoretical_loss": 3.700563467889191, + "objective/train/tokens_used": 885533152, + "theoretical_loss": 3.700563467889191, + "tokens_seen": 865073152 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003726680040120361, + "loss": 2.7528, + "theoretical_loss": 3.7005497687110287, + "tokens_seen": 865105920 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003726579739217653, + "loss": 2.7434, + "theoretical_loss": 3.7005223723471348, + "tokens_seen": 865171456 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003726479438314945, + "loss": 2.5054, + "theoretical_loss": 3.700494978639434, + "tokens_seen": 865236992 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003726379137412237, + "loss": 2.7522, + "theoretical_loss": 3.700467587587467, + "tokens_seen": 865302528 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037262788365095286, + "loss": 2.593, + "theoretical_loss": 3.7004401991907754, + "tokens_seen": 865368064 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037261785356068204, + "loss": 2.4979, + "theoretical_loss": 3.700412813448901, + "tokens_seen": 865433600 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003726078234704112, + "loss": 2.7374, + "theoretical_loss": 3.7003854303613855, + "tokens_seen": 865499136 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037259779338014046, + "loss": 2.7745, + "theoretical_loss": 3.70035804992777, + "tokens_seen": 865564672 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003725877632898696, + "loss": 2.8962, + "theoretical_loss": 3.7003306721475973, + "tokens_seen": 865630208 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003725777331995988, + "loss": 2.8755, + "theoretical_loss": 3.700303297020409, + "tokens_seen": 865695744 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037256770310932795, + "loss": 2.744, + "theoretical_loss": 3.700275924545747, + "tokens_seen": 865761280 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003725576730190572, + "loss": 2.8406, + "theoretical_loss": 3.7002485547231534, + "tokens_seen": 865826816 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037254764292878636, + "loss": 2.3279, + "theoretical_loss": 3.700221187552171, + "tokens_seen": 865892352 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037253761283851554, + "loss": 2.7378, + "theoretical_loss": 3.7001938230323423, + "tokens_seen": 865957888 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003725275827482447, + "loss": 2.1945, + "theoretical_loss": 3.7001664611632092, + "tokens_seen": 866023424 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037251755265797396, + "loss": 2.6143, + "theoretical_loss": 3.700139101944315, + "tokens_seen": 866088960 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003725075225677031, + "loss": 2.816, + "theoretical_loss": 3.700111745375202, + "tokens_seen": 866154496 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003724974924774323, + "loss": 2.4319, + "theoretical_loss": 3.700084391455414, + "tokens_seen": 866220032 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037248746238716145, + "loss": 2.5547, + "theoretical_loss": 3.7000570401844928, + "tokens_seen": 866285568 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003724774322968907, + "loss": 2.7006, + "theoretical_loss": 3.7000296915619826, + "tokens_seen": 866351104 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037246740220661987, + "loss": 2.6894, + "theoretical_loss": 3.700002345587426, + "tokens_seen": 866416640 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037245737211634905, + "loss": 2.7271, + "theoretical_loss": 3.6999750022603664, + "tokens_seen": 866482176 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037244734202607823, + "loss": 2.7118, + "theoretical_loss": 3.6999476615803473, + "tokens_seen": 866547712 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003724373119358074, + "loss": 2.7494, + "theoretical_loss": 3.6999203235469125, + "tokens_seen": 866613248 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003724272818455366, + "loss": 2.8457, + "theoretical_loss": 3.699892988159606, + "tokens_seen": 866678784 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 956034, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.010556221008301, + "objective/train/theoretical_loss": 3.6998793214581074, + "objective/train/tokens_used": 887171552, + "theoretical_loss": 3.6998793214581074, + "tokens_seen": 866711552 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003724172517552658, + "loss": 2.8745, + "theoretical_loss": 3.6998656554179705, + "tokens_seen": 866744320 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037240722166499495, + "loss": 2.7951, + "theoretical_loss": 3.6998383253215508, + "tokens_seen": 866809856 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003723971915747242, + "loss": 2.5935, + "theoretical_loss": 3.699810997869891, + "tokens_seen": 866875392 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003723871614844533, + "loss": 2.6498, + "theoretical_loss": 3.6997836730625346, + "tokens_seen": 866940928 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037237713139418255, + "loss": 2.6788, + "theoretical_loss": 3.6997563508990265, + "tokens_seen": 867006464 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003723671013039118, + "loss": 2.5869, + "theoretical_loss": 3.6997290313789106, + "tokens_seen": 867072000 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003723570712136409, + "loss": 2.6665, + "theoretical_loss": 3.6997017145017326, + "tokens_seen": 867137536 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037234704112337015, + "loss": 2.621, + "theoretical_loss": 3.699674400267035, + "tokens_seen": 867203072 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037233701103309933, + "loss": 2.9424, + "theoretical_loss": 3.699647088674364, + "tokens_seen": 867268608 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003723269809428285, + "loss": 2.6496, + "theoretical_loss": 3.699619779723264, + "tokens_seen": 867334144 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003723169508525577, + "loss": 2.519, + "theoretical_loss": 3.69959247341328, + "tokens_seen": 867399680 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003723069207622869, + "loss": 2.5122, + "theoretical_loss": 3.6995651697439573, + "tokens_seen": 867465216 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037229689067201605, + "loss": 2.5631, + "theoretical_loss": 3.699537868714841, + "tokens_seen": 867530752 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003722868605817453, + "loss": 2.5265, + "theoretical_loss": 3.699510570325475, + "tokens_seen": 867596288 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003722768304914744, + "loss": 2.7926, + "theoretical_loss": 3.699483274575407, + "tokens_seen": 867661824 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037226680040120365, + "loss": 2.9379, + "theoretical_loss": 3.6994559814641814, + "tokens_seen": 867727360 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003722567703109328, + "loss": 2.8544, + "theoretical_loss": 3.6994286909913434, + "tokens_seen": 867792896 + }, + { + "epoch": 2.08, + "learning_rate": 0.000372246740220662, + "loss": 2.4664, + "theoretical_loss": 3.699401403156439, + "tokens_seen": 867858432 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003722367101303912, + "loss": 2.628, + "theoretical_loss": 3.6993741179590143, + "tokens_seen": 867923968 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003722266800401204, + "loss": 2.5972, + "theoretical_loss": 3.6993468353986154, + "tokens_seen": 867989504 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037221664994984956, + "loss": 2.7708, + "theoretical_loss": 3.699319555474788, + "tokens_seen": 868055040 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037220661985957874, + "loss": 2.5467, + "theoretical_loss": 3.6992922781870776, + "tokens_seen": 868120576 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003721965897693079, + "loss": 2.7708, + "theoretical_loss": 3.6992650035350323, + "tokens_seen": 868186112 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037218655967903716, + "loss": 2.4194, + "theoretical_loss": 3.699237731518197, + "tokens_seen": 868251648 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003721765295887663, + "loss": 2.4483, + "theoretical_loss": 3.6992104621361186, + "tokens_seen": 868317184 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 959586, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.304150342941284, + "objective/train/theoretical_loss": 3.6991968284329717, + "objective/train/tokens_used": 888809952, + "theoretical_loss": 3.6991968284329717, + "tokens_seen": 868349952 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003721664994984955, + "loss": 2.646, + "theoretical_loss": 3.699183195388344, + "tokens_seen": 868382720 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003721564694082247, + "loss": 2.6881, + "theoretical_loss": 3.6991559312744196, + "tokens_seen": 868448256 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003721464393179539, + "loss": 2.7205, + "theoretical_loss": 3.6991286697938923, + "tokens_seen": 868513792 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037213640922768306, + "loss": 2.9575, + "theoretical_loss": 3.6991014109463096, + "tokens_seen": 868579328 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037212637913741224, + "loss": 2.7497, + "theoretical_loss": 3.6990741547312176, + "tokens_seen": 868644864 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003721163490471414, + "loss": 2.6964, + "theoretical_loss": 3.699046901148164, + "tokens_seen": 868710400 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037210631895687066, + "loss": 2.8692, + "theoretical_loss": 3.6990196501966963, + "tokens_seen": 868775936 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003720962888665998, + "loss": 2.6819, + "theoretical_loss": 3.6989924018763616, + "tokens_seen": 868841472 + }, + { + "epoch": 2.08, + "learning_rate": 0.000372086258776329, + "loss": 2.6184, + "theoretical_loss": 3.6989651561867074, + "tokens_seen": 868907008 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037207622868605815, + "loss": 2.6148, + "theoretical_loss": 3.698937913127282, + "tokens_seen": 868972544 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003720661985957874, + "loss": 2.5731, + "theoretical_loss": 3.6989106726976324, + "tokens_seen": 869038080 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037205616850551656, + "loss": 2.8067, + "theoretical_loss": 3.6988834348973065, + "tokens_seen": 869103616 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037204613841524575, + "loss": 2.7593, + "theoretical_loss": 3.698856199725853, + "tokens_seen": 869169152 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003720361083249749, + "loss": 2.609, + "theoretical_loss": 3.698828967182819, + "tokens_seen": 869234688 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037202607823470416, + "loss": 2.7118, + "theoretical_loss": 3.698801737267753, + "tokens_seen": 869300224 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003720160481444333, + "loss": 2.7569, + "theoretical_loss": 3.698774509980203, + "tokens_seen": 869365760 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003720060180541625, + "loss": 2.7569, + "theoretical_loss": 3.698747285319719, + "tokens_seen": 869431296 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037199598796389165, + "loss": 2.7691, + "theoretical_loss": 3.6987200632858475, + "tokens_seen": 869496832 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003719859578736209, + "loss": 2.5376, + "theoretical_loss": 3.698692843878139, + "tokens_seen": 869562368 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037197592778335007, + "loss": 2.8628, + "theoretical_loss": 3.6986656270961404, + "tokens_seen": 869627904 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037196589769307925, + "loss": 2.5259, + "theoretical_loss": 3.6986384129394017, + "tokens_seen": 869693440 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037195586760280843, + "loss": 2.6141, + "theoretical_loss": 3.6986112014074717, + "tokens_seen": 869758976 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003719458375125376, + "loss": 2.789, + "theoretical_loss": 3.6985839924998993, + "tokens_seen": 869824512 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003719358074222668, + "loss": 2.7556, + "theoretical_loss": 3.698556786216234, + "tokens_seen": 869890048 + }, + { + "epoch": 2.08, + "learning_rate": 0.000371925777331996, + "loss": 2.6361, + "theoretical_loss": 3.6985295825560245, + "tokens_seen": 869955584 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.836449146270752, + "objective/train/theoretical_loss": 3.6985159817095754, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.6985159817095754, + "tokens_seen": 869988352 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037191574724172515, + "loss": 2.8748, + "theoretical_loss": 3.698502381518821, + "tokens_seen": 870021120 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003719057171514544, + "loss": 2.8179, + "theoretical_loss": 3.6984751831041724, + "tokens_seen": 870086656 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003718956870611835, + "loss": 2.6425, + "theoretical_loss": 3.6984479873116287, + "tokens_seen": 870152192 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037188565697091275, + "loss": 2.6881, + "theoretical_loss": 3.6984207941407394, + "tokens_seen": 870217728 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037187562688064193, + "loss": 2.9222, + "theoretical_loss": 3.698393603591055, + "tokens_seen": 870283264 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003718655967903711, + "loss": 2.7699, + "theoretical_loss": 3.698366415662125, + "tokens_seen": 870348800 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003718555667001003, + "loss": 2.8366, + "theoretical_loss": 3.698339230353499, + "tokens_seen": 870414336 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037184553660982953, + "loss": 2.8265, + "theoretical_loss": 3.698312047664728, + "tokens_seen": 870479872 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037183550651955866, + "loss": 2.7096, + "theoretical_loss": 3.6982848675953623, + "tokens_seen": 870545408 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003718254764292879, + "loss": 2.8957, + "theoretical_loss": 3.6982576901449518, + "tokens_seen": 870610944 + }, + { + "epoch": 2.08, + "learning_rate": 0.000371815446339017, + "loss": 2.8406, + "theoretical_loss": 3.698230515313047, + "tokens_seen": 870676480 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037180541624874625, + "loss": 2.5983, + "theoretical_loss": 3.698203343099199, + "tokens_seen": 870742016 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037179538615847544, + "loss": 2.5483, + "theoretical_loss": 3.698176173502959, + "tokens_seen": 870807552 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003717853560682046, + "loss": 2.755, + "theoretical_loss": 3.698149006523876, + "tokens_seen": 870873088 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003717753259779338, + "loss": 2.6726, + "theoretical_loss": 3.6981218421615027, + "tokens_seen": 870938624 + }, + { + "epoch": 2.08, + "learning_rate": 0.000371765295887663, + "loss": 2.6759, + "theoretical_loss": 3.69809468041539, + "tokens_seen": 871004160 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037175526579739216, + "loss": 2.8533, + "theoretical_loss": 3.698067521285089, + "tokens_seen": 871069696 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003717452357071214, + "loss": 2.6255, + "theoretical_loss": 3.6980403647701503, + "tokens_seen": 871135232 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003717352056168505, + "loss": 2.589, + "theoretical_loss": 3.698013210870126, + "tokens_seen": 871200768 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037172517552657976, + "loss": 2.7553, + "theoretical_loss": 3.6979860595845673, + "tokens_seen": 871266304 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003717151454363089, + "loss": 2.3292, + "theoretical_loss": 3.6979589109130258, + "tokens_seen": 871331840 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003717051153460381, + "loss": 2.6952, + "theoretical_loss": 3.697931764855054, + "tokens_seen": 871397376 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003716950852557673, + "loss": 3.0049, + "theoretical_loss": 3.697904621410203, + "tokens_seen": 871462912 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003716850551654965, + "loss": 2.7278, + "theoretical_loss": 3.697877480578025, + "tokens_seen": 871528448 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037167502507522566, + "loss": 2.7773, + "theoretical_loss": 3.6978503423580715, + "tokens_seen": 871593984 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4541244506835938, + "objective/train/theoretical_loss": 3.69783677422754, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.69783677422754, + "tokens_seen": 871626752 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003716649949849549, + "loss": 2.7079, + "theoretical_loss": 3.697823206749896, + "tokens_seen": 871659520 + }, + { + "epoch": 2.08, + "learning_rate": 0.000371654964894684, + "loss": 2.6983, + "theoretical_loss": 3.69779607375305, + "tokens_seen": 871725056 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037164493480441326, + "loss": 2.6245, + "theoretical_loss": 3.6977689433670857, + "tokens_seen": 871790592 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003716349047141424, + "loss": 2.4994, + "theoretical_loss": 3.6977418155915562, + "tokens_seen": 871856128 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003716248746238716, + "loss": 2.4224, + "theoretical_loss": 3.6977146904260136, + "tokens_seen": 871921664 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037161484453360086, + "loss": 2.647, + "theoretical_loss": 3.6976875678700107, + "tokens_seen": 871987200 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037160481444333, + "loss": 2.5049, + "theoretical_loss": 3.697660447923101, + "tokens_seen": 872052736 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003715947843530592, + "loss": 3.0161, + "theoretical_loss": 3.6976333305848366, + "tokens_seen": 872118272 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037158475426278835, + "loss": 2.8686, + "theoretical_loss": 3.6976062158547713, + "tokens_seen": 872183808 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003715747241725176, + "loss": 2.3929, + "theoretical_loss": 3.697579103732458, + "tokens_seen": 872249344 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037156469408224676, + "loss": 2.6487, + "theoretical_loss": 3.6975519942174495, + "tokens_seen": 872314880 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037155466399197595, + "loss": 2.5502, + "theoretical_loss": 3.6975248873093003, + "tokens_seen": 872380416 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003715446339017051, + "loss": 2.8222, + "theoretical_loss": 3.6974977830075626, + "tokens_seen": 872445952 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037153460381143436, + "loss": 2.4791, + "theoretical_loss": 3.697470681311791, + "tokens_seen": 872511488 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003715245737211635, + "loss": 2.5693, + "theoretical_loss": 3.6974435822215392, + "tokens_seen": 872577024 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003715145436308927, + "loss": 2.6591, + "theoretical_loss": 3.6974164857363605, + "tokens_seen": 872642560 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037150451354062185, + "loss": 2.5865, + "theoretical_loss": 3.6973893918558094, + "tokens_seen": 872708096 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003714944834503511, + "loss": 2.414, + "theoretical_loss": 3.6973623005794396, + "tokens_seen": 872773632 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037148445336008027, + "loss": 2.6693, + "theoretical_loss": 3.6973352119068053, + "tokens_seen": 872839168 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037147442326980945, + "loss": 2.7309, + "theoretical_loss": 3.697308125837461, + "tokens_seen": 872904704 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037146439317953863, + "loss": 2.6199, + "theoretical_loss": 3.6972810423709603, + "tokens_seen": 872970240 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003714543630892678, + "loss": 2.5813, + "theoretical_loss": 3.6972539615068594, + "tokens_seen": 873035776 + }, + { + "epoch": 2.08, + "learning_rate": 0.000371444332998997, + "loss": 2.5225, + "theoretical_loss": 3.6972268832447117, + "tokens_seen": 873101312 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037143430290872623, + "loss": 2.7103, + "theoretical_loss": 3.6971998075840715, + "tokens_seen": 873166848 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037142427281845535, + "loss": 2.4854, + "theoretical_loss": 3.6971727345244947, + "tokens_seen": 873232384 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5809988975524902, + "objective/train/theoretical_loss": 3.697159198969966, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.697159198969966, + "tokens_seen": 873265152 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003714142427281846, + "loss": 2.58, + "theoretical_loss": 3.697145664065536, + "tokens_seen": 873297920 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003714042126379137, + "loss": 2.6467, + "theoretical_loss": 3.69711859620675, + "tokens_seen": 873363456 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037139418254764295, + "loss": 2.4065, + "theoretical_loss": 3.6970915309476915, + "tokens_seen": 873428992 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037138415245737213, + "loss": 2.497, + "theoretical_loss": 3.6970644682879175, + "tokens_seen": 873494528 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003713741223671013, + "loss": 2.7433, + "theoretical_loss": 3.6970374082269815, + "tokens_seen": 873560064 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003713640922768305, + "loss": 2.782, + "theoretical_loss": 3.6970103507644403, + "tokens_seen": 873625600 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037135406218655973, + "loss": 2.5894, + "theoretical_loss": 3.6969832958998485, + "tokens_seen": 873691136 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037134403209628886, + "loss": 2.7581, + "theoretical_loss": 3.6969562436327625, + "tokens_seen": 873756672 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003713340020060181, + "loss": 2.5189, + "theoretical_loss": 3.696929193962738, + "tokens_seen": 873822208 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003713239719157472, + "loss": 2.8104, + "theoretical_loss": 3.6969021468893306, + "tokens_seen": 873887744 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037131394182547645, + "loss": 2.5091, + "theoretical_loss": 3.6968751024120965, + "tokens_seen": 873953280 + }, + { + "epoch": 2.08, + "learning_rate": 0.00037130391173520564, + "loss": 2.8142, + "theoretical_loss": 3.696848060530592, + "tokens_seen": 874018816 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003712938816449348, + "loss": 2.9372, + "theoretical_loss": 3.6968210212443733, + "tokens_seen": 874084352 + }, + { + "epoch": 2.09, + "learning_rate": 0.000371283851554664, + "loss": 2.5234, + "theoretical_loss": 3.6967939845529965, + "tokens_seen": 874149888 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003712738214643932, + "loss": 2.489, + "theoretical_loss": 3.696766950456018, + "tokens_seen": 874215424 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037126379137412236, + "loss": 2.6471, + "theoretical_loss": 3.6967399189529955, + "tokens_seen": 874280960 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003712537612838516, + "loss": 2.6633, + "theoretical_loss": 3.696712890043485, + "tokens_seen": 874346496 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003712437311935807, + "loss": 2.7678, + "theoretical_loss": 3.6966858637270423, + "tokens_seen": 874412032 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037123370110330996, + "loss": 2.727, + "theoretical_loss": 3.6966588400032254, + "tokens_seen": 874477568 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003712236710130391, + "loss": 2.9024, + "theoretical_loss": 3.6966318188715914, + "tokens_seen": 874543104 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003712136409227683, + "loss": 2.6938, + "theoretical_loss": 3.696604800331697, + "tokens_seen": 874608640 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003712036108324975, + "loss": 2.7446, + "theoretical_loss": 3.6965777843830994, + "tokens_seen": 874674176 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003711935807422267, + "loss": 2.8905, + "theoretical_loss": 3.6965507710253562, + "tokens_seen": 874739712 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037118355065195586, + "loss": 2.575, + "theoretical_loss": 3.696523760258025, + "tokens_seen": 874805248 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003711735205616851, + "loss": 2.6711, + "theoretical_loss": 3.696496752080663, + "tokens_seen": 874870784 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8190174102783203, + "objective/train/theoretical_loss": 3.696483248963082, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.696483248963082, + "tokens_seen": 874903552 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003711634904714142, + "loss": 2.9269, + "theoretical_loss": 3.696469746492828, + "tokens_seen": 874936320 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037115346038114346, + "loss": 2.7968, + "theoretical_loss": 3.6964427434940776, + "tokens_seen": 875001856 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003711434302908726, + "loss": 2.6184, + "theoretical_loss": 3.6964157430839704, + "tokens_seen": 875067392 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003711334002006018, + "loss": 2.5877, + "theoretical_loss": 3.6963887452620634, + "tokens_seen": 875132928 + }, + { + "epoch": 2.09, + "learning_rate": 0.000371123370110331, + "loss": 2.4639, + "theoretical_loss": 3.696361750027915, + "tokens_seen": 875198464 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003711133400200602, + "loss": 2.9735, + "theoretical_loss": 3.696334757381084, + "tokens_seen": 875264000 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037110330992978937, + "loss": 2.6998, + "theoretical_loss": 3.696307767321128, + "tokens_seen": 875329536 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037109327983951855, + "loss": 2.6655, + "theoretical_loss": 3.696280779847606, + "tokens_seen": 875395072 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037108324974924773, + "loss": 2.713, + "theoretical_loss": 3.696253794960076, + "tokens_seen": 875460608 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037107321965897696, + "loss": 2.5791, + "theoretical_loss": 3.696226812658097, + "tokens_seen": 875526144 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003710631895687061, + "loss": 2.7312, + "theoretical_loss": 3.696199832941228, + "tokens_seen": 875591680 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003710531594784353, + "loss": 2.9171, + "theoretical_loss": 3.696172855809027, + "tokens_seen": 875657216 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037104312938816445, + "loss": 2.7087, + "theoretical_loss": 3.696145881261054, + "tokens_seen": 875722752 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003710330992978937, + "loss": 2.726, + "theoretical_loss": 3.6961189092968674, + "tokens_seen": 875788288 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037102306920762287, + "loss": 2.5598, + "theoretical_loss": 3.696091939916027, + "tokens_seen": 875853824 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037101303911735205, + "loss": 2.6229, + "theoretical_loss": 3.696064973118091, + "tokens_seen": 875919360 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037100300902708123, + "loss": 2.7326, + "theoretical_loss": 3.6960380089026197, + "tokens_seen": 875984896 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037099297893681047, + "loss": 2.5333, + "theoretical_loss": 3.696011047269173, + "tokens_seen": 876050432 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003709829488465396, + "loss": 2.4307, + "theoretical_loss": 3.695984088217309, + "tokens_seen": 876115968 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037097291875626883, + "loss": 2.5423, + "theoretical_loss": 3.695957131746589, + "tokens_seen": 876181504 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037096288866599796, + "loss": 2.8121, + "theoretical_loss": 3.695930177856572, + "tokens_seen": 876247040 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003709528585757272, + "loss": 2.4569, + "theoretical_loss": 3.695903226546818, + "tokens_seen": 876312576 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003709428284854564, + "loss": 2.7681, + "theoretical_loss": 3.695876277816888, + "tokens_seen": 876378112 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037093279839518555, + "loss": 2.6246, + "theoretical_loss": 3.6958493316663406, + "tokens_seen": 876443648 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037092276830491474, + "loss": 2.5519, + "theoretical_loss": 3.6958223880947374, + "tokens_seen": 876509184 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.962693691253662, + "objective/train/theoretical_loss": 3.695808917275902, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.695808917275902, + "tokens_seen": 876541952 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003709127382146439, + "loss": 2.7102, + "theoretical_loss": 3.695795447101638, + "tokens_seen": 876574720 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003709027081243731, + "loss": 2.8035, + "theoretical_loss": 3.695768508686603, + "tokens_seen": 876640256 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037089267803410233, + "loss": 2.7445, + "theoretical_loss": 3.695741572849193, + "tokens_seen": 876705792 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037088264794383146, + "loss": 2.8925, + "theoretical_loss": 3.695714639588969, + "tokens_seen": 876771328 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003708726178535607, + "loss": 2.7924, + "theoretical_loss": 3.695687708905491, + "tokens_seen": 876836864 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037086258776328993, + "loss": 2.5923, + "theoretical_loss": 3.6956607807983213, + "tokens_seen": 876902400 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037085255767301906, + "loss": 2.6051, + "theoretical_loss": 3.69563385526702, + "tokens_seen": 876967936 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003708425275827483, + "loss": 2.7864, + "theoretical_loss": 3.695606932311148, + "tokens_seen": 877033472 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003708324974924774, + "loss": 2.9097, + "theoretical_loss": 3.695580011930267, + "tokens_seen": 877099008 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037082246740220665, + "loss": 2.8712, + "theoretical_loss": 3.6955530941239387, + "tokens_seen": 877164544 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037081243731193584, + "loss": 2.7235, + "theoretical_loss": 3.6955261788917237, + "tokens_seen": 877230080 + }, + { + "epoch": 2.09, + "learning_rate": 0.000370802407221665, + "loss": 2.7495, + "theoretical_loss": 3.6954992662331847, + "tokens_seen": 877295616 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003707923771313942, + "loss": 2.7919, + "theoretical_loss": 3.695472356147882, + "tokens_seen": 877361152 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003707823470411234, + "loss": 2.5835, + "theoretical_loss": 3.695445448635378, + "tokens_seen": 877426688 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037077231695085256, + "loss": 2.6822, + "theoretical_loss": 3.6954185436952347, + "tokens_seen": 877492224 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003707622868605818, + "loss": 2.658, + "theoretical_loss": 3.6953916413270145, + "tokens_seen": 877557760 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003707522567703109, + "loss": 2.4346, + "theoretical_loss": 3.6953647415302786, + "tokens_seen": 877623296 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037074222668004016, + "loss": 2.8774, + "theoretical_loss": 3.6953378443045892, + "tokens_seen": 877688832 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003707321965897693, + "loss": 2.6141, + "theoretical_loss": 3.6953109496495093, + "tokens_seen": 877754368 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003707221664994985, + "loss": 2.683, + "theoretical_loss": 3.695284057564601, + "tokens_seen": 877819904 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003707121364092277, + "loss": 2.7027, + "theoretical_loss": 3.6952571680494275, + "tokens_seen": 877885440 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003707021063189569, + "loss": 2.6527, + "theoretical_loss": 3.6952302811035502, + "tokens_seen": 877950976 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037069207622868606, + "loss": 2.593, + "theoretical_loss": 3.6952033967265328, + "tokens_seen": 878016512 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003706820461384153, + "loss": 2.7754, + "theoretical_loss": 3.6951765149179376, + "tokens_seen": 878082048 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003706720160481444, + "loss": 2.7959, + "theoretical_loss": 3.6951496356773275, + "tokens_seen": 878147584 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8952300548553467, + "objective/train/theoretical_loss": 3.6951361970198806, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.6951361970198806, + "tokens_seen": 878180352 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037066198595787366, + "loss": 2.6569, + "theoretical_loss": 3.695122759004266, + "tokens_seen": 878213120 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003706519558676028, + "loss": 2.885, + "theoretical_loss": 3.695095884898316, + "tokens_seen": 878278656 + }, + { + "epoch": 2.09, + "learning_rate": 0.000370641925777332, + "loss": 2.7, + "theoretical_loss": 3.695069013359041, + "tokens_seen": 878344192 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003706318956870612, + "loss": 2.8532, + "theoretical_loss": 3.6950421443860044, + "tokens_seen": 878409728 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003706218655967904, + "loss": 2.7147, + "theoretical_loss": 3.6950152779787695, + "tokens_seen": 878475264 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037061183550651957, + "loss": 2.5623, + "theoretical_loss": 3.6949884141369003, + "tokens_seen": 878540800 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037060180541624875, + "loss": 2.584, + "theoretical_loss": 3.6949615528599593, + "tokens_seen": 878606336 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037059177532597793, + "loss": 2.778, + "theoretical_loss": 3.694934694147512, + "tokens_seen": 878671872 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037058174523570716, + "loss": 2.8953, + "theoretical_loss": 3.6949078379991205, + "tokens_seen": 878737408 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003705717151454363, + "loss": 2.7651, + "theoretical_loss": 3.6948809844143504, + "tokens_seen": 878802944 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003705616850551655, + "loss": 2.6819, + "theoretical_loss": 3.694854133392765, + "tokens_seen": 878868480 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037055165496489465, + "loss": 2.6763, + "theoretical_loss": 3.694827284933929, + "tokens_seen": 878934016 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003705416248746239, + "loss": 2.4986, + "theoretical_loss": 3.6948004390374063, + "tokens_seen": 878999552 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037053159478435307, + "loss": 2.6861, + "theoretical_loss": 3.6947735957027614, + "tokens_seen": 879065088 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037052156469408225, + "loss": 2.6115, + "theoretical_loss": 3.694746754929559, + "tokens_seen": 879130624 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037051153460381143, + "loss": 2.5333, + "theoretical_loss": 3.6947199167173643, + "tokens_seen": 879196160 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037050150451354067, + "loss": 2.8628, + "theoretical_loss": 3.6946930810657412, + "tokens_seen": 879261696 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003704914744232698, + "loss": 2.8541, + "theoretical_loss": 3.694666247974255, + "tokens_seen": 879327232 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037048144433299903, + "loss": 2.6677, + "theoretical_loss": 3.6946394174424704, + "tokens_seen": 879392768 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037047141424272816, + "loss": 2.5643, + "theoretical_loss": 3.694612589469953, + "tokens_seen": 879458304 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003704613841524574, + "loss": 2.6471, + "theoretical_loss": 3.6945857640562676, + "tokens_seen": 879523840 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003704513540621866, + "loss": 2.5586, + "theoretical_loss": 3.6945589412009796, + "tokens_seen": 879589376 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037044132397191575, + "loss": 2.6428, + "theoretical_loss": 3.694532120903654, + "tokens_seen": 879654912 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037043129388164494, + "loss": 2.6342, + "theoretical_loss": 3.694505303163857, + "tokens_seen": 879720448 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003704212637913741, + "loss": 2.6444, + "theoretical_loss": 3.6944784879811543, + "tokens_seen": 879785984 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.845834732055664, + "objective/train/theoretical_loss": 3.6944650813485773, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.6944650813485773, + "tokens_seen": 879818752 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003704112337011033, + "loss": 2.6323, + "theoretical_loss": 3.694451675355111, + "tokens_seen": 879851520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037040120361083253, + "loss": 2.6846, + "theoretical_loss": 3.694424865285293, + "tokens_seen": 879917056 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037039117352056166, + "loss": 2.2876, + "theoretical_loss": 3.6943980577712665, + "tokens_seen": 879982592 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003703811434302909, + "loss": 2.8253, + "theoretical_loss": 3.694371252812597, + "tokens_seen": 880048128 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003703711133400201, + "loss": 2.8466, + "theoretical_loss": 3.694344450408852, + "tokens_seen": 880113664 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037036108324974926, + "loss": 2.9858, + "theoretical_loss": 3.6943176505595963, + "tokens_seen": 880179200 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037035105315947844, + "loss": 2.9263, + "theoretical_loss": 3.6942908532643974, + "tokens_seen": 880244736 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003703410230692076, + "loss": 2.6523, + "theoretical_loss": 3.6942640585228204, + "tokens_seen": 880310272 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003703309929789368, + "loss": 2.6214, + "theoretical_loss": 3.694237266334433, + "tokens_seen": 880375808 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037032096288866604, + "loss": 2.4765, + "theoretical_loss": 3.694210476698802, + "tokens_seen": 880441344 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037031093279839516, + "loss": 2.6534, + "theoretical_loss": 3.6941836896154934, + "tokens_seen": 880506880 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003703009027081244, + "loss": 2.638, + "theoretical_loss": 3.6941569050840743, + "tokens_seen": 880572416 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003702908726178535, + "loss": 2.7322, + "theoretical_loss": 3.6941301231041117, + "tokens_seen": 880637952 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037028084252758276, + "loss": 2.7069, + "theoretical_loss": 3.6941033436751733, + "tokens_seen": 880703488 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037027081243731194, + "loss": 2.6372, + "theoretical_loss": 3.6940765667968254, + "tokens_seen": 880769024 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003702607823470411, + "loss": 2.6195, + "theoretical_loss": 3.694049792468636, + "tokens_seen": 880834560 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003702507522567703, + "loss": 2.7658, + "theoretical_loss": 3.6940230206901727, + "tokens_seen": 880900096 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003702407221664995, + "loss": 2.5632, + "theoretical_loss": 3.693996251461002, + "tokens_seen": 880965632 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037023069207622867, + "loss": 2.4424, + "theoretical_loss": 3.693969484780692, + "tokens_seen": 881031168 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003702206619859579, + "loss": 2.6033, + "theoretical_loss": 3.693942720648811, + "tokens_seen": 881096704 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037021063189568703, + "loss": 2.6652, + "theoretical_loss": 3.6939159590649266, + "tokens_seen": 881162240 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037020060180541626, + "loss": 2.773, + "theoretical_loss": 3.693889200028606, + "tokens_seen": 881227776 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037019057171514544, + "loss": 2.6614, + "theoretical_loss": 3.6938624435394183, + "tokens_seen": 881293312 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003701805416248746, + "loss": 2.4945, + "theoretical_loss": 3.6938356895969306, + "tokens_seen": 881358848 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003701705115346038, + "loss": 2.6585, + "theoretical_loss": 3.6938089382007124, + "tokens_seen": 881424384 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6805853843688965, + "objective/train/theoretical_loss": 3.6937955634573187, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.6937955634573187, + "tokens_seen": 881457152 + }, + { + "epoch": 2.09, + "learning_rate": 0.000370160481444333, + "loss": 2.7097, + "theoretical_loss": 3.6937821893503306, + "tokens_seen": 881489920 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037015045135406217, + "loss": 2.8098, + "theoretical_loss": 3.6937554430453554, + "tokens_seen": 881555456 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003701404212637914, + "loss": 2.4764, + "theoretical_loss": 3.6937286992853537, + "tokens_seen": 881620992 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037013039117352053, + "loss": 2.767, + "theoretical_loss": 3.6937019580698953, + "tokens_seen": 881686528 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037012036108324977, + "loss": 2.6104, + "theoretical_loss": 3.6936752193985485, + "tokens_seen": 881752064 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037011033099297895, + "loss": 2.6776, + "theoretical_loss": 3.6936484832708825, + "tokens_seen": 881817600 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037010030090270813, + "loss": 2.8724, + "theoretical_loss": 3.693621749686466, + "tokens_seen": 881883136 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037009027081243736, + "loss": 2.5104, + "theoretical_loss": 3.693595018644868, + "tokens_seen": 881948672 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003700802407221665, + "loss": 2.6103, + "theoretical_loss": 3.6935682901456586, + "tokens_seen": 882014208 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003700702106318957, + "loss": 2.8333, + "theoretical_loss": 3.6935415641884055, + "tokens_seen": 882079744 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037006018054162485, + "loss": 2.4745, + "theoretical_loss": 3.6935148407726794, + "tokens_seen": 882145280 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003700501504513541, + "loss": 2.6831, + "theoretical_loss": 3.69348811989805, + "tokens_seen": 882210816 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037004012036108327, + "loss": 2.5069, + "theoretical_loss": 3.693461401564086, + "tokens_seen": 882276352 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037003009027081245, + "loss": 2.5407, + "theoretical_loss": 3.6934346857703577, + "tokens_seen": 882341888 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037002006018054163, + "loss": 2.9309, + "theoretical_loss": 3.693407972516434, + "tokens_seen": 882407424 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037001003009027087, + "loss": 2.5706, + "theoretical_loss": 3.6933812618018864, + "tokens_seen": 882472960 + }, + { + "epoch": 2.09, + "learning_rate": 0.00037, + "loss": 2.6596, + "theoretical_loss": 3.6933545536262846, + "tokens_seen": 882538496 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036998996990972923, + "loss": 2.8532, + "theoretical_loss": 3.693327847989198, + "tokens_seen": 882604032 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036997993981945836, + "loss": 2.6359, + "theoretical_loss": 3.6933011448901967, + "tokens_seen": 882669568 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003699699097291876, + "loss": 2.7255, + "theoretical_loss": 3.693274444328852, + "tokens_seen": 882735104 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003699598796389168, + "loss": 2.6571, + "theoretical_loss": 3.693247746304734, + "tokens_seen": 882800640 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036994984954864595, + "loss": 2.5259, + "theoretical_loss": 3.693221050817413, + "tokens_seen": 882866176 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036993981945837514, + "loss": 2.8501, + "theoretical_loss": 3.69319435786646, + "tokens_seen": 882931712 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003699297893681043, + "loss": 2.466, + "theoretical_loss": 3.693167667451445, + "tokens_seen": 882997248 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003699197592778335, + "loss": 2.8891, + "theoretical_loss": 3.6931409795719405, + "tokens_seen": 883062784 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3581273555755615, + "objective/train/theoretical_loss": 3.6931276365828696, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.6931276365828696, + "tokens_seen": 883095552 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036990972918756273, + "loss": 2.6237, + "theoretical_loss": 3.6931142942275157, + "tokens_seen": 883128320 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036989969909729186, + "loss": 2.4756, + "theoretical_loss": 3.6930876114177433, + "tokens_seen": 883193856 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003698896690070211, + "loss": 2.5828, + "theoretical_loss": 3.6930609311421936, + "tokens_seen": 883259392 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003698796389167503, + "loss": 2.7051, + "theoretical_loss": 3.6930342534004374, + "tokens_seen": 883324928 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036986960882647946, + "loss": 2.4825, + "theoretical_loss": 3.693007578192047, + "tokens_seen": 883390464 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036985957873620864, + "loss": 2.6811, + "theoretical_loss": 3.6929809055165936, + "tokens_seen": 883456000 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003698495486459378, + "loss": 2.8016, + "theoretical_loss": 3.6929542353736493, + "tokens_seen": 883521536 + }, + { + "epoch": 2.09, + "learning_rate": 0.000369839518555667, + "loss": 2.6199, + "theoretical_loss": 3.6929275677627853, + "tokens_seen": 883587072 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036982948846539624, + "loss": 2.6573, + "theoretical_loss": 3.692900902683573, + "tokens_seen": 883652608 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036981945837512536, + "loss": 2.7305, + "theoretical_loss": 3.692874240135585, + "tokens_seen": 883718144 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003698094282848546, + "loss": 2.7184, + "theoretical_loss": 3.6928475801183933, + "tokens_seen": 883783680 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003697993981945837, + "loss": 2.5299, + "theoretical_loss": 3.6928209226315696, + "tokens_seen": 883849216 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036978936810431296, + "loss": 2.5705, + "theoretical_loss": 3.692794267674687, + "tokens_seen": 883914752 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036977933801404214, + "loss": 2.6542, + "theoretical_loss": 3.692767615247317, + "tokens_seen": 883980288 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003697693079237713, + "loss": 2.7557, + "theoretical_loss": 3.6927409653490324, + "tokens_seen": 884045824 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003697592778335005, + "loss": 2.5676, + "theoretical_loss": 3.6927143179794055, + "tokens_seen": 884111360 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003697492477432297, + "loss": 2.7962, + "theoretical_loss": 3.692687673138009, + "tokens_seen": 884176896 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036973921765295887, + "loss": 2.475, + "theoretical_loss": 3.692661030824416, + "tokens_seen": 884242432 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003697291875626881, + "loss": 2.6934, + "theoretical_loss": 3.6926343910381996, + "tokens_seen": 884307968 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036971915747241723, + "loss": 2.5649, + "theoretical_loss": 3.692607753778932, + "tokens_seen": 884373504 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036970912738214646, + "loss": 2.825, + "theoretical_loss": 3.692581119046186, + "tokens_seen": 884439040 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036969909729187565, + "loss": 2.7717, + "theoretical_loss": 3.692554486839536, + "tokens_seen": 884504576 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003696890672016048, + "loss": 2.6135, + "theoretical_loss": 3.692527857158555, + "tokens_seen": 884570112 + }, + { + "epoch": 2.09, + "learning_rate": 0.000369679037111334, + "loss": 2.6465, + "theoretical_loss": 3.692501230002815, + "tokens_seen": 884635648 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003696690070210632, + "loss": 2.7335, + "theoretical_loss": 3.6924746053718915, + "tokens_seen": 884701184 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.489288568496704, + "objective/train/theoretical_loss": 3.692461294003102, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.692461294003102, + "tokens_seen": 884733952 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036965897693079237, + "loss": 2.7261, + "theoretical_loss": 3.6924479832653567, + "tokens_seen": 884766720 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003696489468405216, + "loss": 2.7215, + "theoretical_loss": 3.6924213636827847, + "tokens_seen": 884832256 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036963891675025073, + "loss": 2.4819, + "theoretical_loss": 3.6923947466237497, + "tokens_seen": 884897792 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036962888665997997, + "loss": 2.7193, + "theoretical_loss": 3.6923681320878243, + "tokens_seen": 884963328 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003696188565697091, + "loss": 2.7264, + "theoretical_loss": 3.692341520074584, + "tokens_seen": 885028864 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036960882647943833, + "loss": 2.6642, + "theoretical_loss": 3.692314910583603, + "tokens_seen": 885094400 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003695987963891675, + "loss": 2.7689, + "theoretical_loss": 3.692288303614454, + "tokens_seen": 885159936 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003695887662988967, + "loss": 2.7526, + "theoretical_loss": 3.692261699166712, + "tokens_seen": 885225472 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036957873620862587, + "loss": 2.6084, + "theoretical_loss": 3.692235097239952, + "tokens_seen": 885291008 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036956870611835505, + "loss": 2.6363, + "theoretical_loss": 3.6922084978337475, + "tokens_seen": 885356544 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036955867602808424, + "loss": 2.6097, + "theoretical_loss": 3.6921819009476744, + "tokens_seen": 885422080 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036954864593781347, + "loss": 2.4996, + "theoretical_loss": 3.6921553065813066, + "tokens_seen": 885487616 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003695386158475426, + "loss": 2.6923, + "theoretical_loss": 3.6921287147342188, + "tokens_seen": 885553152 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036952858575727183, + "loss": 2.6882, + "theoretical_loss": 3.6921021254059863, + "tokens_seen": 885618688 + }, + { + "epoch": 2.09, + "learning_rate": 0.000369518555667001, + "loss": 2.5709, + "theoretical_loss": 3.6920755385961845, + "tokens_seen": 885684224 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003695085255767302, + "loss": 2.6799, + "theoretical_loss": 3.6920489543043873, + "tokens_seen": 885749760 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003694984954864594, + "loss": 2.7231, + "theoretical_loss": 3.692022372530171, + "tokens_seen": 885815296 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036948846539618856, + "loss": 2.5862, + "theoretical_loss": 3.691995793273111, + "tokens_seen": 885880832 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036947843530591774, + "loss": 2.7793, + "theoretical_loss": 3.691969216532782, + "tokens_seen": 885946368 + }, + { + "epoch": 2.09, + "learning_rate": 0.000369468405215647, + "loss": 2.566, + "theoretical_loss": 3.6919426423087605, + "tokens_seen": 886011904 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003694583751253761, + "loss": 2.577, + "theoretical_loss": 3.6919160706006213, + "tokens_seen": 886077440 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036944834503510534, + "loss": 2.7, + "theoretical_loss": 3.6918895014079407, + "tokens_seen": 886142976 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036943831494483446, + "loss": 2.5987, + "theoretical_loss": 3.691862934730294, + "tokens_seen": 886208512 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003694282848545637, + "loss": 2.4723, + "theoretical_loss": 3.691836370567257, + "tokens_seen": 886274048 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003694182547642929, + "loss": 2.4132, + "theoretical_loss": 3.6918098089184075, + "tokens_seen": 886339584 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.893305540084839, + "objective/train/theoretical_loss": 3.6917965290366697, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.6917965290366697, + "tokens_seen": 886372352 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036940822467402206, + "loss": 2.7739, + "theoretical_loss": 3.6917832497833203, + "tokens_seen": 886405120 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036939819458375124, + "loss": 2.7017, + "theoretical_loss": 3.691756693161571, + "tokens_seen": 886470656 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003693881644934805, + "loss": 2.8098, + "theoretical_loss": 3.6917301390527375, + "tokens_seen": 886536192 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003693781344032096, + "loss": 2.6014, + "theoretical_loss": 3.691703587456395, + "tokens_seen": 886601728 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036936810431293884, + "loss": 2.7644, + "theoretical_loss": 3.691677038372121, + "tokens_seen": 886667264 + }, + { + "epoch": 2.09, + "learning_rate": 0.000369358074222668, + "loss": 2.6346, + "theoretical_loss": 3.691650491799492, + "tokens_seen": 886732800 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003693480441323972, + "loss": 2.534, + "theoretical_loss": 3.6916239477380843, + "tokens_seen": 886798336 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036933801404212644, + "loss": 2.6581, + "theoretical_loss": 3.6915974061874754, + "tokens_seen": 886863872 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036932798395185556, + "loss": 2.7481, + "theoretical_loss": 3.6915708671472416, + "tokens_seen": 886929408 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003693179538615848, + "loss": 2.6258, + "theoretical_loss": 3.691544330616961, + "tokens_seen": 886994944 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003693079237713139, + "loss": 2.6422, + "theoretical_loss": 3.6915177965962096, + "tokens_seen": 887060480 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036929789368104316, + "loss": 2.7657, + "theoretical_loss": 3.691491265084566, + "tokens_seen": 887126016 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036928786359077234, + "loss": 2.5757, + "theoretical_loss": 3.691464736081606, + "tokens_seen": 887191552 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003692778335005015, + "loss": 2.7411, + "theoretical_loss": 3.6914382095869085, + "tokens_seen": 887257088 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003692678034102307, + "loss": 2.9059, + "theoretical_loss": 3.691411685600051, + "tokens_seen": 887322624 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003692577733199599, + "loss": 2.767, + "theoretical_loss": 3.691385164120611, + "tokens_seen": 887388160 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036924774322968907, + "loss": 2.6977, + "theoretical_loss": 3.6913586451481653, + "tokens_seen": 887453696 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003692377131394183, + "loss": 2.6306, + "theoretical_loss": 3.6913321286822924, + "tokens_seen": 887519232 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036922768304914743, + "loss": 2.7088, + "theoretical_loss": 3.6913056147225713, + "tokens_seen": 887584768 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036921765295887666, + "loss": 2.4745, + "theoretical_loss": 3.691279103268579, + "tokens_seen": 887650304 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036920762286860585, + "loss": 2.7917, + "theoretical_loss": 3.691252594319894, + "tokens_seen": 887715840 + }, + { + "epoch": 2.09, + "learning_rate": 0.000369197592778335, + "loss": 2.7978, + "theoretical_loss": 3.691226087876095, + "tokens_seen": 887781376 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003691875626880642, + "loss": 2.7152, + "theoretical_loss": 3.6911995839367604, + "tokens_seen": 887846912 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003691775325977934, + "loss": 2.6977, + "theoretical_loss": 3.691173082501468, + "tokens_seen": 887912448 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036916750250752257, + "loss": 2.769, + "theoretical_loss": 3.6911465835697976, + "tokens_seen": 887977984 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6037163734436035, + "objective/train/theoretical_loss": 3.691133335042688, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.691133335042688, + "tokens_seen": 888010752 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003691574724172518, + "loss": 2.5662, + "theoretical_loss": 3.6911200871413263, + "tokens_seen": 888043520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036914744232698093, + "loss": 2.7313, + "theoretical_loss": 3.6910935932156343, + "tokens_seen": 888109056 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036913741223671017, + "loss": 2.8887, + "theoretical_loss": 3.6910671017923002, + "tokens_seen": 888174592 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003691273821464393, + "loss": 2.7674, + "theoretical_loss": 3.6910406128709026, + "tokens_seen": 888240128 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036911735205616853, + "loss": 2.5497, + "theoretical_loss": 3.6910141264510212, + "tokens_seen": 888305664 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003691073219658977, + "loss": 2.6463, + "theoretical_loss": 3.690987642532235, + "tokens_seen": 888371200 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003690972918756269, + "loss": 2.8052, + "theoretical_loss": 3.690961161114123, + "tokens_seen": 888436736 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003690872617853561, + "loss": 2.906, + "theoretical_loss": 3.690934682196265, + "tokens_seen": 888502272 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036907723169508525, + "loss": 2.7999, + "theoretical_loss": 3.690908205778241, + "tokens_seen": 888567808 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036906720160481444, + "loss": 2.7024, + "theoretical_loss": 3.6908817318596303, + "tokens_seen": 888633344 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036905717151454367, + "loss": 2.6924, + "theoretical_loss": 3.6908552604400118, + "tokens_seen": 888698880 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003690471414242728, + "loss": 2.6747, + "theoretical_loss": 3.6908287915189666, + "tokens_seen": 888764416 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036903711133400203, + "loss": 2.6567, + "theoretical_loss": 3.690802325096074, + "tokens_seen": 888829952 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003690270812437312, + "loss": 2.4936, + "theoretical_loss": 3.6907758611709145, + "tokens_seen": 888895488 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003690170511534604, + "loss": 2.4739, + "theoretical_loss": 3.6907493997430674, + "tokens_seen": 888961024 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003690070210631896, + "loss": 2.5213, + "theoretical_loss": 3.6907229408121136, + "tokens_seen": 889026560 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036899699097291876, + "loss": 2.549, + "theoretical_loss": 3.690696484377634, + "tokens_seen": 889092096 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036898696088264794, + "loss": 2.7987, + "theoretical_loss": 3.6906700304392075, + "tokens_seen": 889157632 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003689769307923772, + "loss": 2.4143, + "theoretical_loss": 3.690643578996416, + "tokens_seen": 889223168 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003689669007021063, + "loss": 2.5284, + "theoretical_loss": 3.6906171300488397, + "tokens_seen": 889288704 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036895687061183554, + "loss": 2.5884, + "theoretical_loss": 3.690590683596059, + "tokens_seen": 889354240 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036894684052156466, + "loss": 2.5301, + "theoretical_loss": 3.6905642396376557, + "tokens_seen": 889419776 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003689368104312939, + "loss": 2.7106, + "theoretical_loss": 3.69053779817321, + "tokens_seen": 889485312 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003689267803410231, + "loss": 2.6738, + "theoretical_loss": 3.6905113592023033, + "tokens_seen": 889550848 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036891675025075226, + "loss": 2.9077, + "theoretical_loss": 3.690484922724517, + "tokens_seen": 889616384 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 961818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.595710515975952, + "objective/train/theoretical_loss": 3.6904717054204124, + "objective/train/tokens_used": 890267104, + "theoretical_loss": 3.6904717054204124, + "tokens_seen": 889649152 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036890672016048144, + "loss": 2.6372, + "theoretical_loss": 3.690458488739431, + "tokens_seen": 889681920 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003688966900702107, + "loss": 2.6164, + "theoretical_loss": 3.6904320572466283, + "tokens_seen": 889747456 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003688866599799398, + "loss": 2.6341, + "theoretical_loss": 3.6904056282456903, + "tokens_seen": 889812992 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036887662988966904, + "loss": 2.7133, + "theoretical_loss": 3.690379201736197, + "tokens_seen": 889878528 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036886659979939817, + "loss": 2.7007, + "theoretical_loss": 3.690352777717732, + "tokens_seen": 889944064 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003688565697091274, + "loss": 2.6807, + "theoretical_loss": 3.6903263561898756, + "tokens_seen": 890009600 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003688465396188566, + "loss": 2.5019, + "theoretical_loss": 3.6902999371522105, + "tokens_seen": 890075136 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036883650952858576, + "loss": 2.8941, + "theoretical_loss": 3.6902735206043182, + "tokens_seen": 890140672 + }, + { + "epoch": 2.09, + "learning_rate": 0.00036882647943831494, + "loss": 2.4319, + "theoretical_loss": 3.6902471065457814, + "tokens_seen": 890206208 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003688164493480441, + "loss": 2.775, + "theoretical_loss": 3.6902206949761815, + "tokens_seen": 890271744 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003688064192577733, + "loss": 3.5933, + "theoretical_loss": 3.690188096626651, + "tokens_seen": 890352640 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036879638916750254, + "loss": 2.7673, + "theoretical_loss": 3.690161690616762, + "tokens_seen": 890418176 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036878635907723167, + "loss": 2.9802, + "theoretical_loss": 3.690135287094459, + "tokens_seen": 890483712 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003687763289869609, + "loss": 2.9272, + "theoretical_loss": 3.690108886059325, + "tokens_seen": 890549248 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036876629889669003, + "loss": 2.8218, + "theoretical_loss": 3.690082487510943, + "tokens_seen": 890614784 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036875626880641927, + "loss": 2.9439, + "theoretical_loss": 3.690056091448896, + "tokens_seen": 890680320 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036874623871614845, + "loss": 2.8595, + "theoretical_loss": 3.690029697872766, + "tokens_seen": 890745856 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036873620862587763, + "loss": 2.8154, + "theoretical_loss": 3.6900033067821374, + "tokens_seen": 890811392 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003687261785356068, + "loss": 2.8048, + "theoretical_loss": 3.6899769181765922, + "tokens_seen": 890876928 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036871614844533605, + "loss": 2.8396, + "theoretical_loss": 3.6899505320557138, + "tokens_seen": 890942464 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036870611835506517, + "loss": 2.7181, + "theoretical_loss": 3.689924148419086, + "tokens_seen": 891008000 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003686960882647944, + "loss": 2.9879, + "theoretical_loss": 3.6898977672662916, + "tokens_seen": 891073536 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036868605817452353, + "loss": 2.7716, + "theoretical_loss": 3.6898713885969148, + "tokens_seen": 891139072 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036867602808425277, + "loss": 2.7015, + "theoretical_loss": 3.6898450124105384, + "tokens_seen": 891204608 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036866599799398195, + "loss": 2.7423, + "theoretical_loss": 3.689818638706747, + "tokens_seen": 891270144 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1026627, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.057016134262085, + "objective/train/theoretical_loss": 3.6898120456686536, + "objective/train/tokens_used": 911746528, + "theoretical_loss": 3.6898120456686536, + "tokens_seen": 891286528 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036865596790371113, + "loss": 2.8914, + "theoretical_loss": 3.689792267485123, + "tokens_seen": 891335680 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003686459378134403, + "loss": 2.7807, + "theoretical_loss": 3.689765898745252, + "tokens_seen": 891401216 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003686359077231695, + "loss": 2.8683, + "theoretical_loss": 3.689739532486717, + "tokens_seen": 891466752 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003686258776328987, + "loss": 2.6409, + "theoretical_loss": 3.689713168709102, + "tokens_seen": 891532288 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003686158475426279, + "loss": 2.8472, + "theoretical_loss": 3.6896868074119924, + "tokens_seen": 891597824 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003686058174523571, + "loss": 2.619, + "theoretical_loss": 3.689660448594971, + "tokens_seen": 891663360 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003685957873620863, + "loss": 2.788, + "theoretical_loss": 3.689634092257623, + "tokens_seen": 891728896 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036858575727181545, + "loss": 2.9168, + "theoretical_loss": 3.689607738399533, + "tokens_seen": 891794432 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036857572718154464, + "loss": 2.8188, + "theoretical_loss": 3.6895813870202856, + "tokens_seen": 891859968 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036856569709127387, + "loss": 2.744, + "theoretical_loss": 3.6895550381194653, + "tokens_seen": 891925504 + }, + { + "epoch": 3.0, + "learning_rate": 0.000368555667001003, + "loss": 2.8304, + "theoretical_loss": 3.6895286916966565, + "tokens_seen": 891991040 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036854563691073223, + "loss": 2.7051, + "theoretical_loss": 3.689502347751445, + "tokens_seen": 892056576 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003685356068204614, + "loss": 2.781, + "theoretical_loss": 3.6894760062834155, + "tokens_seen": 892122112 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003685255767301906, + "loss": 2.7491, + "theoretical_loss": 3.689449667292153, + "tokens_seen": 892187648 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003685155466399198, + "loss": 2.8092, + "theoretical_loss": 3.689423330777242, + "tokens_seen": 892253184 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036850551654964896, + "loss": 2.8625, + "theoretical_loss": 3.689396996738269, + "tokens_seen": 892318720 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036849548645937814, + "loss": 2.672, + "theoretical_loss": 3.6893706651748195, + "tokens_seen": 892384256 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003684854563691074, + "loss": 2.8162, + "theoretical_loss": 3.6893443360864775, + "tokens_seen": 892449792 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003684754262788365, + "loss": 2.8069, + "theoretical_loss": 3.68931800947283, + "tokens_seen": 892515328 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036846539618856574, + "loss": 2.78, + "theoretical_loss": 3.6892916853334627, + "tokens_seen": 892580864 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036845536609829486, + "loss": 2.7692, + "theoretical_loss": 3.6892653636679604, + "tokens_seen": 892646400 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003684453360080241, + "loss": 2.6777, + "theoretical_loss": 3.68923904447591, + "tokens_seen": 892711936 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003684353059177533, + "loss": 2.7668, + "theoretical_loss": 3.689212727756897, + "tokens_seen": 892777472 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036842527582748246, + "loss": 2.8518, + "theoretical_loss": 3.689186413510508, + "tokens_seen": 892843008 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036841524573721164, + "loss": 2.8783, + "theoretical_loss": 3.689160101736328, + "tokens_seen": 892908544 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1031633, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0238943099975586, + "objective/train/theoretical_loss": 3.6891535241790177, + "objective/train/tokens_used": 913384928, + "theoretical_loss": 3.6891535241790177, + "tokens_seen": 892924928 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003684052156469409, + "loss": 2.858, + "theoretical_loss": 3.689133792433945, + "tokens_seen": 892974080 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036839518555667, + "loss": 2.9869, + "theoretical_loss": 3.689107485602944, + "tokens_seen": 893039616 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036838515546639924, + "loss": 2.8007, + "theoretical_loss": 3.689081181242912, + "tokens_seen": 893105152 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036837512537612837, + "loss": 2.9061, + "theoretical_loss": 3.689054879353437, + "tokens_seen": 893170688 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003683650952858576, + "loss": 2.8327, + "theoretical_loss": 3.689028579934103, + "tokens_seen": 893236224 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003683550651955868, + "loss": 2.8884, + "theoretical_loss": 3.6890022829844984, + "tokens_seen": 893301760 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036834503510531596, + "loss": 2.538, + "theoretical_loss": 3.6889759885042106, + "tokens_seen": 893367296 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036833500501504514, + "loss": 2.8365, + "theoretical_loss": 3.6889496964928252, + "tokens_seen": 893432832 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003683249749247743, + "loss": 2.7163, + "theoretical_loss": 3.6889234069499306, + "tokens_seen": 893498368 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003683149448345035, + "loss": 2.7978, + "theoretical_loss": 3.6888971198751133, + "tokens_seen": 893563904 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036830491474423274, + "loss": 2.7502, + "theoretical_loss": 3.688870835267961, + "tokens_seen": 893629440 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036829488465396187, + "loss": 2.7726, + "theoretical_loss": 3.6888445531280603, + "tokens_seen": 893694976 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003682848545636911, + "loss": 2.9963, + "theoretical_loss": 3.688818273455, + "tokens_seen": 893760512 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036827482447342023, + "loss": 2.7254, + "theoretical_loss": 3.688791996248366, + "tokens_seen": 893826048 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036826479438314947, + "loss": 2.8003, + "theoretical_loss": 3.688765721507748, + "tokens_seen": 893891584 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036825476429287865, + "loss": 2.8246, + "theoretical_loss": 3.6887394492327323, + "tokens_seen": 893957120 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036824473420260783, + "loss": 2.5958, + "theoretical_loss": 3.6887131794229076, + "tokens_seen": 894022656 + }, + { + "epoch": 3.0, + "learning_rate": 0.000368234704112337, + "loss": 2.6973, + "theoretical_loss": 3.688686912077861, + "tokens_seen": 894088192 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036822467402206625, + "loss": 2.7454, + "theoretical_loss": 3.6886606471971817, + "tokens_seen": 894153728 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036821464393179537, + "loss": 2.6587, + "theoretical_loss": 3.688634384780457, + "tokens_seen": 894219264 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003682046138415246, + "loss": 2.6925, + "theoretical_loss": 3.688608124827276, + "tokens_seen": 894284800 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036819458375125373, + "loss": 2.8973, + "theoretical_loss": 3.688581867337226, + "tokens_seen": 894350336 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036818455366098297, + "loss": 2.6081, + "theoretical_loss": 3.688555612309897, + "tokens_seen": 894415872 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036817452357071215, + "loss": 2.8145, + "theoretical_loss": 3.6885293597448765, + "tokens_seen": 894481408 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036816449348044133, + "loss": 2.7993, + "theoretical_loss": 3.6885031096417533, + "tokens_seen": 894546944 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1036652, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8129405975341797, + "objective/train/theoretical_loss": 3.6884965475005957, + "objective/train/tokens_used": 915023328, + "theoretical_loss": 3.6884965475005957, + "tokens_seen": 894563328 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003681544633901705, + "loss": 2.7861, + "theoretical_loss": 3.6884768620001163, + "tokens_seen": 894612480 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003681444332998997, + "loss": 2.7175, + "theoretical_loss": 3.688450616819555, + "tokens_seen": 894678016 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003681344032096289, + "loss": 2.9492, + "theoretical_loss": 3.6884243740996574, + "tokens_seen": 894743552 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003681243731193581, + "loss": 2.6274, + "theoretical_loss": 3.688398133840013, + "tokens_seen": 894809088 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036811434302908724, + "loss": 2.7315, + "theoretical_loss": 3.688371896040211, + "tokens_seen": 894874624 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003681043129388165, + "loss": 2.9269, + "theoretical_loss": 3.6883456606998406, + "tokens_seen": 894940160 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003680942828485456, + "loss": 2.8358, + "theoretical_loss": 3.6883194278184916, + "tokens_seen": 895005696 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036808425275827484, + "loss": 2.8829, + "theoretical_loss": 3.688293197395753, + "tokens_seen": 895071232 + }, + { + "epoch": 3.0, + "learning_rate": 0.000368074222668004, + "loss": 2.7785, + "theoretical_loss": 3.6882669694312145, + "tokens_seen": 895136768 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003680641925777332, + "loss": 2.9305, + "theoretical_loss": 3.688240743924466, + "tokens_seen": 895202304 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003680541624874624, + "loss": 2.7584, + "theoretical_loss": 3.688214520875097, + "tokens_seen": 895267840 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003680441323971916, + "loss": 2.859, + "theoretical_loss": 3.6881883002826976, + "tokens_seen": 895333376 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003680441323971916, + "loss": 2.8224, + "theoretical_loss": 3.6881620821468575, + "tokens_seen": 895398912 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036803410230692074, + "loss": 2.8127, + "theoretical_loss": 3.6881358664671673, + "tokens_seen": 895464448 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036802407221665, + "loss": 2.8278, + "theoretical_loss": 3.6881096532432163, + "tokens_seen": 895529984 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003680140421263791, + "loss": 2.8223, + "theoretical_loss": 3.6880834424745954, + "tokens_seen": 895595520 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036800401203610834, + "loss": 2.8507, + "theoretical_loss": 3.688057234160895, + "tokens_seen": 895661056 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003679939819458375, + "loss": 2.9332, + "theoretical_loss": 3.6880310283017055, + "tokens_seen": 895726592 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003679839518555667, + "loss": 2.8083, + "theoretical_loss": 3.688004824896617, + "tokens_seen": 895792128 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003679739217652959, + "loss": 2.887, + "theoretical_loss": 3.687978623945221, + "tokens_seen": 895857664 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036796389167502506, + "loss": 2.8633, + "theoretical_loss": 3.687952425447108, + "tokens_seen": 895923200 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036795386158475424, + "loss": 2.7929, + "theoretical_loss": 3.687926229401868, + "tokens_seen": 895988736 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003679438314944835, + "loss": 2.8538, + "theoretical_loss": 3.687900035809093, + "tokens_seen": 896054272 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003679338014042126, + "loss": 2.8214, + "theoretical_loss": 3.687873844668373, + "tokens_seen": 896119808 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036792377131394184, + "loss": 2.7678, + "theoretical_loss": 3.6878476559793008, + "tokens_seen": 896185344 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1041586, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.545072555541992, + "objective/train/theoretical_loss": 3.687841109190055, + "objective/train/tokens_used": 916661728, + "theoretical_loss": 3.687841109190055, + "tokens_seen": 896201728 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036791374122367097, + "loss": 2.575, + "theoretical_loss": 3.6878214697414666, + "tokens_seen": 896250880 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003679037111334002, + "loss": 2.7571, + "theoretical_loss": 3.6877952859544614, + "tokens_seen": 896316416 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003678936810431294, + "loss": 2.8336, + "theoretical_loss": 3.6877691046178773, + "tokens_seen": 896381952 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036788365095285857, + "loss": 2.8234, + "theoretical_loss": 3.687742925731306, + "tokens_seen": 896447488 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036787362086258775, + "loss": 2.7132, + "theoretical_loss": 3.687716749294338, + "tokens_seen": 896513024 + }, + { + "epoch": 3.0, + "learning_rate": 0.000367863590772317, + "loss": 2.751, + "theoretical_loss": 3.6876905753065667, + "tokens_seen": 896578560 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036785356068204616, + "loss": 2.8567, + "theoretical_loss": 3.6876644037675823, + "tokens_seen": 896644096 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036784353059177535, + "loss": 2.8155, + "theoretical_loss": 3.687638234676978, + "tokens_seen": 896709632 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003678335005015045, + "loss": 2.9743, + "theoretical_loss": 3.687612068034345, + "tokens_seen": 896775168 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003678234704112337, + "loss": 2.7782, + "theoretical_loss": 3.6875859038392758, + "tokens_seen": 896840704 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036781344032096294, + "loss": 2.618, + "theoretical_loss": 3.687559742091363, + "tokens_seen": 896906240 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036780341023069207, + "loss": 2.8275, + "theoretical_loss": 3.6875335827901985, + "tokens_seen": 896971776 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003677933801404213, + "loss": 2.7824, + "theoretical_loss": 3.687507425935374, + "tokens_seen": 897037312 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003677933801404213, + "loss": 2.8379, + "theoretical_loss": 3.687481271526484, + "tokens_seen": 897102848 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036778335005015043, + "loss": 2.8418, + "theoretical_loss": 3.6874551195631193, + "tokens_seen": 897168384 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036777331995987967, + "loss": 2.8881, + "theoretical_loss": 3.687428970044873, + "tokens_seen": 897233920 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036776328986960885, + "loss": 2.8368, + "theoretical_loss": 3.6874028229713387, + "tokens_seen": 897299456 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036775325977933803, + "loss": 2.7317, + "theoretical_loss": 3.6873766783421082, + "tokens_seen": 897364992 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003677432296890672, + "loss": 2.9576, + "theoretical_loss": 3.6873505361567753, + "tokens_seen": 897430528 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036773319959879645, + "loss": 2.8593, + "theoretical_loss": 3.687324396414933, + "tokens_seen": 897496064 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036772316950852557, + "loss": 2.8501, + "theoretical_loss": 3.6872982591161745, + "tokens_seen": 897561600 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003677131394182548, + "loss": 2.9521, + "theoretical_loss": 3.6872721242600925, + "tokens_seen": 897627136 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036770310932798393, + "loss": 2.6791, + "theoretical_loss": 3.6872459918462814, + "tokens_seen": 897692672 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036769307923771317, + "loss": 2.6738, + "theoretical_loss": 3.6872198618743344, + "tokens_seen": 897758208 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036768304914744235, + "loss": 3.0484, + "theoretical_loss": 3.687193734343844, + "tokens_seen": 897823744 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1046594, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.713426113128662, + "objective/train/theoretical_loss": 3.687187202842652, + "objective/train/tokens_used": 918300128, + "theoretical_loss": 3.687187202842652, + "tokens_seen": 897840128 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036767301905717153, + "loss": 2.8236, + "theoretical_loss": 3.6871676092544057, + "tokens_seen": 897889280 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003676629889669007, + "loss": 2.856, + "theoretical_loss": 3.6871414866056123, + "tokens_seen": 897954816 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003676529588766299, + "loss": 2.9411, + "theoretical_loss": 3.687115366397057, + "tokens_seen": 898020352 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003676429287863591, + "loss": 2.622, + "theoretical_loss": 3.6870892486283355, + "tokens_seen": 898085888 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003676328986960883, + "loss": 2.8903, + "theoretical_loss": 3.6870631332990405, + "tokens_seen": 898151424 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036762286860581744, + "loss": 2.7094, + "theoretical_loss": 3.6870370204087664, + "tokens_seen": 898216960 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003676128385155467, + "loss": 2.5918, + "theoretical_loss": 3.6870109099571082, + "tokens_seen": 898282496 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003676028084252758, + "loss": 3.0356, + "theoretical_loss": 3.6869848019436597, + "tokens_seen": 898348032 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036759277833500504, + "loss": 2.8615, + "theoretical_loss": 3.6869586963680154, + "tokens_seen": 898413568 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003675827482447342, + "loss": 2.7986, + "theoretical_loss": 3.68693259322977, + "tokens_seen": 898479104 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003675727181544634, + "loss": 2.7854, + "theoretical_loss": 3.686906492528518, + "tokens_seen": 898544640 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003675626880641926, + "loss": 2.7457, + "theoretical_loss": 3.686880394263854, + "tokens_seen": 898610176 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003675526579739218, + "loss": 2.731, + "theoretical_loss": 3.6868542984353736, + "tokens_seen": 898675712 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036754262788365094, + "loss": 2.8772, + "theoretical_loss": 3.686828205042671, + "tokens_seen": 898741248 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003675325977933802, + "loss": 2.8269, + "theoretical_loss": 3.6868021140853413, + "tokens_seen": 898806784 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003675225677031093, + "loss": 2.8921, + "theoretical_loss": 3.6867760255629802, + "tokens_seen": 898872320 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036751253761283854, + "loss": 2.7749, + "theoretical_loss": 3.686749939475183, + "tokens_seen": 898937856 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003675025075225677, + "loss": 2.6101, + "theoretical_loss": 3.686723855821544, + "tokens_seen": 899003392 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003674924774322969, + "loss": 2.8056, + "theoretical_loss": 3.68669777460166, + "tokens_seen": 899068928 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003674824473420261, + "loss": 2.8025, + "theoretical_loss": 3.686671695815125, + "tokens_seen": 899134464 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036747241725175526, + "loss": 2.7852, + "theoretical_loss": 3.686645619461536, + "tokens_seen": 899200000 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036746238716148444, + "loss": 2.5889, + "theoretical_loss": 3.6866195455404878, + "tokens_seen": 899265536 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003674523570712137, + "loss": 2.9105, + "theoretical_loss": 3.686593474051577, + "tokens_seen": 899331072 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003674423269809428, + "loss": 2.5764, + "theoretical_loss": 3.686567404994399, + "tokens_seen": 899396608 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036743229689067204, + "loss": 2.6419, + "theoretical_loss": 3.68654133836855, + "tokens_seen": 899462144 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1051693, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.662870407104492, + "objective/train/theoretical_loss": 3.686534822091936, + "objective/train/tokens_used": 919938528, + "theoretical_loss": 3.686534822091936, + "tokens_seen": 899478528 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036742226680040117, + "loss": 2.8781, + "theoretical_loss": 3.686515274173626, + "tokens_seen": 899527680 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003674122367101304, + "loss": 2.8185, + "theoretical_loss": 3.6864892124092234, + "tokens_seen": 899593216 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003674022066198596, + "loss": 2.8196, + "theoretical_loss": 3.6864631530749388, + "tokens_seen": 899658752 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036739217652958877, + "loss": 2.69, + "theoretical_loss": 3.686437096170368, + "tokens_seen": 899724288 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036738214643931795, + "loss": 2.7704, + "theoretical_loss": 3.6864110416951075, + "tokens_seen": 899789824 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003673721163490472, + "loss": 2.8191, + "theoretical_loss": 3.6863849896487544, + "tokens_seen": 899855360 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003673620862587763, + "loss": 2.8529, + "theoretical_loss": 3.686358940030905, + "tokens_seen": 899920896 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036735205616850555, + "loss": 2.7385, + "theoretical_loss": 3.686332892841156, + "tokens_seen": 899986432 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036734202607823467, + "loss": 2.9449, + "theoretical_loss": 3.686306848079105, + "tokens_seen": 900051968 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003673319959879639, + "loss": 2.9013, + "theoretical_loss": 3.686280805744348, + "tokens_seen": 900117504 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003673219658976931, + "loss": 2.87, + "theoretical_loss": 3.686254765836483, + "tokens_seen": 900183040 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036731193580742227, + "loss": 2.7614, + "theoretical_loss": 3.6862287283551067, + "tokens_seen": 900248576 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036730190571715145, + "loss": 2.7338, + "theoretical_loss": 3.686202693299816, + "tokens_seen": 900314112 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036729187562688063, + "loss": 2.5981, + "theoretical_loss": 3.686176660670209, + "tokens_seen": 900379648 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003672818455366098, + "loss": 2.7967, + "theoretical_loss": 3.6861506304658826, + "tokens_seen": 900445184 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036727181544633905, + "loss": 2.9024, + "theoretical_loss": 3.6861246026864354, + "tokens_seen": 900510720 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003672617853560682, + "loss": 2.8047, + "theoretical_loss": 3.6860985773314634, + "tokens_seen": 900576256 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003672517552657974, + "loss": 2.6793, + "theoretical_loss": 3.6860725544005657, + "tokens_seen": 900641792 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036724172517552654, + "loss": 2.9473, + "theoretical_loss": 3.6860465338933395, + "tokens_seen": 900707328 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036723169508525577, + "loss": 2.6197, + "theoretical_loss": 3.686020515809383, + "tokens_seen": 900772864 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036722166499498495, + "loss": 2.8145, + "theoretical_loss": 3.6859945001482943, + "tokens_seen": 900838400 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036721163490471414, + "loss": 2.7004, + "theoretical_loss": 3.685968486909671, + "tokens_seen": 900903936 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003672016048144433, + "loss": 2.9374, + "theoretical_loss": 3.6859424760931123, + "tokens_seen": 900969472 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036719157472417255, + "loss": 2.7168, + "theoretical_loss": 3.6859164676982155, + "tokens_seen": 901035008 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003671815446339017, + "loss": 2.7888, + "theoretical_loss": 3.68589046172458, + "tokens_seen": 901100544 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1056725, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.974947214126587, + "objective/train/theoretical_loss": 3.685883960609446, + "objective/train/tokens_used": 921576928, + "theoretical_loss": 3.685883960609446, + "tokens_seen": 901116928 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003671715145436309, + "loss": 2.7201, + "theoretical_loss": 3.685864458171803, + "tokens_seen": 901166080 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036716148445336004, + "loss": 2.7771, + "theoretical_loss": 3.685838457039485, + "tokens_seen": 901231616 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003671514543630893, + "loss": 2.7884, + "theoretical_loss": 3.6858124583272227, + "tokens_seen": 901297152 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036714142427281846, + "loss": 2.9321, + "theoretical_loss": 3.685786462034616, + "tokens_seen": 901362688 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036713139418254764, + "loss": 2.7786, + "theoretical_loss": 3.6857604681612646, + "tokens_seen": 901428224 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003671213640922768, + "loss": 2.7034, + "theoretical_loss": 3.6857344767067657, + "tokens_seen": 901493760 + }, + { + "epoch": 3.0, + "learning_rate": 0.000367111334002006, + "loss": 2.8064, + "theoretical_loss": 3.6857084876707193, + "tokens_seen": 901559296 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036710130391173524, + "loss": 2.7424, + "theoretical_loss": 3.6856825010527245, + "tokens_seen": 901624832 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003670912738214644, + "loss": 2.7533, + "theoretical_loss": 3.685656516852381, + "tokens_seen": 901690368 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003670812437311936, + "loss": 2.9162, + "theoretical_loss": 3.6856305350692873, + "tokens_seen": 901755904 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003670712136409228, + "loss": 2.7772, + "theoretical_loss": 3.685604555703044, + "tokens_seen": 901821440 + }, + { + "epoch": 3.0, + "learning_rate": 0.000367061183550652, + "loss": 2.8262, + "theoretical_loss": 3.68557857875325, + "tokens_seen": 901886976 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036705115346038114, + "loss": 2.5888, + "theoretical_loss": 3.6855526042195046, + "tokens_seen": 901952512 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003670411233701104, + "loss": 2.4701, + "theoretical_loss": 3.685526632101408, + "tokens_seen": 902018048 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003670310932798395, + "loss": 3.0106, + "theoretical_loss": 3.6855006623985602, + "tokens_seen": 902083584 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036702106318956874, + "loss": 2.5845, + "theoretical_loss": 3.6854746951105612, + "tokens_seen": 902149120 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003670110330992979, + "loss": 2.738, + "theoretical_loss": 3.6854487302370105, + "tokens_seen": 902214656 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003670010030090271, + "loss": 2.7965, + "theoretical_loss": 3.685422767777509, + "tokens_seen": 902280192 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003669909729187563, + "loss": 2.5952, + "theoretical_loss": 3.6853968077316566, + "tokens_seen": 902345728 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036698094282848546, + "loss": 2.7099, + "theoretical_loss": 3.685370850099053, + "tokens_seen": 902411264 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036697091273821464, + "loss": 2.9008, + "theoretical_loss": 3.6853448948792993, + "tokens_seen": 902476800 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003669608826479439, + "loss": 2.5409, + "theoretical_loss": 3.685318942071996, + "tokens_seen": 902542336 + }, + { + "epoch": 3.0, + "learning_rate": 0.000366950852557673, + "loss": 2.6951, + "theoretical_loss": 3.685292991676744, + "tokens_seen": 902607872 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036694082246740224, + "loss": 2.9099, + "theoretical_loss": 3.685267043693144, + "tokens_seen": 902673408 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036693079237713137, + "loss": 2.5979, + "theoretical_loss": 3.6852410981207955, + "tokens_seen": 902738944 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1061572, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.726848840713501, + "objective/train/theoretical_loss": 3.68523461210442, + "objective/train/tokens_used": 923215328, + "theoretical_loss": 3.68523461210442, + "tokens_seen": 902755328 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003669207622868606, + "loss": 2.8534, + "theoretical_loss": 3.685215154959301, + "tokens_seen": 902804480 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003669107321965898, + "loss": 2.8183, + "theoretical_loss": 3.6851892142082607, + "tokens_seen": 902870016 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036690070210631897, + "loss": 2.7522, + "theoretical_loss": 3.685163275867276, + "tokens_seen": 902935552 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036689067201604815, + "loss": 2.7613, + "theoretical_loss": 3.685137339935948, + "tokens_seen": 903001088 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003668806419257774, + "loss": 2.7894, + "theoretical_loss": 3.6851114064138777, + "tokens_seen": 903066624 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003668706118355065, + "loss": 2.8125, + "theoretical_loss": 3.685085475300667, + "tokens_seen": 903132160 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036686058174523575, + "loss": 2.7282, + "theoretical_loss": 3.685059546595917, + "tokens_seen": 903197696 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036685055165496487, + "loss": 2.8206, + "theoretical_loss": 3.6850336202992295, + "tokens_seen": 903263232 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003668405215646941, + "loss": 2.7565, + "theoretical_loss": 3.6850076964102056, + "tokens_seen": 903328768 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003668304914744233, + "loss": 2.7826, + "theoretical_loss": 3.684981774928448, + "tokens_seen": 903394304 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036682046138415247, + "loss": 2.8357, + "theoretical_loss": 3.684955855853558, + "tokens_seen": 903459840 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036681043129388165, + "loss": 2.8705, + "theoretical_loss": 3.684929939185137, + "tokens_seen": 903525376 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036680040120361083, + "loss": 2.6921, + "theoretical_loss": 3.6849040249227887, + "tokens_seen": 903590912 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036679037111334, + "loss": 2.8528, + "theoretical_loss": 3.6848781130661137, + "tokens_seen": 903656448 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036678034102306925, + "loss": 2.7383, + "theoretical_loss": 3.6848522036147147, + "tokens_seen": 903721984 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003667703109327984, + "loss": 2.7307, + "theoretical_loss": 3.684826296568194, + "tokens_seen": 903787520 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003667602808425276, + "loss": 2.6868, + "theoretical_loss": 3.6848003919261547, + "tokens_seen": 903853056 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036675025075225674, + "loss": 2.8841, + "theoretical_loss": 3.6847744896881975, + "tokens_seen": 903918592 + }, + { + "epoch": 3.0, + "learning_rate": 0.000366740220661986, + "loss": 2.8175, + "theoretical_loss": 3.6847485898539274, + "tokens_seen": 903984128 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036673019057171515, + "loss": 2.577, + "theoretical_loss": 3.6847226924229455, + "tokens_seen": 904049664 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036672016048144434, + "loss": 2.6837, + "theoretical_loss": 3.684696797394855, + "tokens_seen": 904115200 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003667101303911735, + "loss": 2.7804, + "theoretical_loss": 3.6846709047692587, + "tokens_seen": 904180736 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036670010030090275, + "loss": 2.847, + "theoretical_loss": 3.68464501454576, + "tokens_seen": 904246272 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003666900702106319, + "loss": 2.7734, + "theoretical_loss": 3.6846191267239616, + "tokens_seen": 904311808 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003666800401203611, + "loss": 2.7455, + "theoretical_loss": 3.6845932413034665, + "tokens_seen": 904377344 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1066630, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.873715877532959, + "objective/train/theoretical_loss": 3.6845867703234996, + "objective/train/tokens_used": 924853728, + "theoretical_loss": 3.6845867703234996, + "tokens_seen": 904393728 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036667001003009024, + "loss": 2.8156, + "theoretical_loss": 3.6845673582838785, + "tokens_seen": 904442880 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003666599799398195, + "loss": 2.7413, + "theoretical_loss": 3.6845414776648004, + "tokens_seen": 904508416 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036664994984954866, + "loss": 2.7902, + "theoretical_loss": 3.6845155994458363, + "tokens_seen": 904573952 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036663991975927784, + "loss": 2.6701, + "theoretical_loss": 3.684489723626589, + "tokens_seen": 904639488 + }, + { + "epoch": 3.0, + "learning_rate": 0.000366629889669007, + "loss": 2.6021, + "theoretical_loss": 3.6844638502066633, + "tokens_seen": 904705024 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003666198595787362, + "loss": 2.6243, + "theoretical_loss": 3.6844379791856614, + "tokens_seen": 904770560 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003666098294884654, + "loss": 2.8246, + "theoretical_loss": 3.684412110563189, + "tokens_seen": 904836096 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003665997993981946, + "loss": 2.5495, + "theoretical_loss": 3.6843862443388478, + "tokens_seen": 904901632 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036658976930792374, + "loss": 2.7142, + "theoretical_loss": 3.684360380512244, + "tokens_seen": 904967168 + }, + { + "epoch": 3.0, + "learning_rate": 0.000366579739217653, + "loss": 2.7124, + "theoretical_loss": 3.68433451908298, + "tokens_seen": 905032704 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036656970912738216, + "loss": 2.8297, + "theoretical_loss": 3.684308660050661, + "tokens_seen": 905098240 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036655967903711134, + "loss": 2.625, + "theoretical_loss": 3.6842828034148916, + "tokens_seen": 905163776 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003665496489468405, + "loss": 2.7145, + "theoretical_loss": 3.684256949175275, + "tokens_seen": 905229312 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003665396188565697, + "loss": 2.7347, + "theoretical_loss": 3.6842310973314163, + "tokens_seen": 905294848 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003665295887662989, + "loss": 2.7359, + "theoretical_loss": 3.684205247882921, + "tokens_seen": 905360384 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003665195586760281, + "loss": 2.8178, + "theoretical_loss": 3.684179400829392, + "tokens_seen": 905425920 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036650952858575725, + "loss": 2.8469, + "theoretical_loss": 3.6841535561704353, + "tokens_seen": 905491456 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003664994984954865, + "loss": 2.7027, + "theoretical_loss": 3.6841277139056556, + "tokens_seen": 905556992 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003664894684052156, + "loss": 2.7502, + "theoretical_loss": 3.6841018740346576, + "tokens_seen": 905622528 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036647943831494484, + "loss": 2.7623, + "theoretical_loss": 3.6840760365570464, + "tokens_seen": 905688064 + }, + { + "epoch": 3.0, + "learning_rate": 0.000366469408224674, + "loss": 2.6832, + "theoretical_loss": 3.684050201472428, + "tokens_seen": 905753600 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003664593781344032, + "loss": 2.9027, + "theoretical_loss": 3.684024368780406, + "tokens_seen": 905819136 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003664493480441324, + "loss": 2.8143, + "theoretical_loss": 3.683998538480587, + "tokens_seen": 905884672 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036643931795386157, + "loss": 2.7001, + "theoretical_loss": 3.6839727105725766, + "tokens_seen": 905950208 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036642928786359075, + "loss": 2.7919, + "theoretical_loss": 3.683946885055979, + "tokens_seen": 906015744 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1071650, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7729568481445312, + "objective/train/theoretical_loss": 3.683940429050442, + "objective/train/tokens_used": 926492128, + "theoretical_loss": 3.683940429050442, + "tokens_seen": 906032128 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036641925777332, + "loss": 2.7335, + "theoretical_loss": 3.683921061930401, + "tokens_seen": 906081280 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003664092276830491, + "loss": 2.7948, + "theoretical_loss": 3.6838952411954473, + "tokens_seen": 906146816 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036639919759277835, + "loss": 2.8415, + "theoretical_loss": 3.6838694228507256, + "tokens_seen": 906212352 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036638916750250753, + "loss": 2.8463, + "theoretical_loss": 3.68384360689584, + "tokens_seen": 906277888 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003663791374122367, + "loss": 2.9069, + "theoretical_loss": 3.6838177933303964, + "tokens_seen": 906343424 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003663691073219659, + "loss": 2.8556, + "theoretical_loss": 3.683791982154002, + "tokens_seen": 906408960 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036635907723169507, + "loss": 2.725, + "theoretical_loss": 3.683766173366263, + "tokens_seen": 906474496 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003663490471414243, + "loss": 2.5566, + "theoretical_loss": 3.6837403669667843, + "tokens_seen": 906540032 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003663390170511535, + "loss": 2.7766, + "theoretical_loss": 3.683714562955174, + "tokens_seen": 906605568 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036632898696088267, + "loss": 2.9063, + "theoretical_loss": 3.6836887613310374, + "tokens_seen": 906671104 + }, + { + "epoch": 3.0, + "learning_rate": 0.00036631895687061185, + "loss": 2.8221, + "theoretical_loss": 3.6836629620939814, + "tokens_seen": 906736640 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036630892678034103, + "loss": 2.9086, + "theoretical_loss": 3.6836371652436126, + "tokens_seen": 906802176 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003662988966900702, + "loss": 2.8452, + "theoretical_loss": 3.6836113707795377, + "tokens_seen": 906867712 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036628886659979945, + "loss": 2.7909, + "theoretical_loss": 3.6835855787013636, + "tokens_seen": 906933248 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003662788365095286, + "loss": 2.8312, + "theoretical_loss": 3.6835597890086973, + "tokens_seen": 906998784 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003662688064192578, + "loss": 2.5495, + "theoretical_loss": 3.6835340017011458, + "tokens_seen": 907064320 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036625877632898694, + "loss": 2.7974, + "theoretical_loss": 3.683508216778316, + "tokens_seen": 907129856 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003662487462387162, + "loss": 2.7805, + "theoretical_loss": 3.6834824342398154, + "tokens_seen": 907195392 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036623871614844535, + "loss": 2.8686, + "theoretical_loss": 3.6834566540852514, + "tokens_seen": 907260928 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036622868605817454, + "loss": 2.9814, + "theoretical_loss": 3.683430876314231, + "tokens_seen": 907326464 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003662186559679037, + "loss": 2.874, + "theoretical_loss": 3.683405100926362, + "tokens_seen": 907392000 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036620862587763295, + "loss": 2.522, + "theoretical_loss": 3.6833793279212514, + "tokens_seen": 907457536 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003661985957873621, + "loss": 2.6894, + "theoretical_loss": 3.683353557298507, + "tokens_seen": 907523072 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003661885656970913, + "loss": 2.8295, + "theoretical_loss": 3.683327789057738, + "tokens_seen": 907588608 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036617853560682044, + "loss": 2.799, + "theoretical_loss": 3.683302023198551, + "tokens_seen": 907654144 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1076755, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.107651472091675, + "objective/train/theoretical_loss": 3.68329558210583, + "objective/train/tokens_used": 928130528, + "theoretical_loss": 3.68329558210583, + "tokens_seen": 907670528 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003661685055165497, + "loss": 2.9, + "theoretical_loss": 3.6832762597205533, + "tokens_seen": 907719680 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036615847542627886, + "loss": 2.9029, + "theoretical_loss": 3.6832504986233543, + "tokens_seen": 907785216 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036614844533600804, + "loss": 2.8958, + "theoretical_loss": 3.683224739906561, + "tokens_seen": 907850752 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003661384152457372, + "loss": 2.709, + "theoretical_loss": 3.683198983569783, + "tokens_seen": 907916288 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003661283851554664, + "loss": 2.6734, + "theoretical_loss": 3.6831732296126276, + "tokens_seen": 907981824 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003661183550651956, + "loss": 2.746, + "theoretical_loss": 3.683147478034703, + "tokens_seen": 908047360 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003661083249749248, + "loss": 2.7898, + "theoretical_loss": 3.6831217288356184, + "tokens_seen": 908112896 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036609829488465394, + "loss": 2.7533, + "theoretical_loss": 3.683095982014982, + "tokens_seen": 908178432 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003660882647943832, + "loss": 2.5579, + "theoretical_loss": 3.6830702375724025, + "tokens_seen": 908243968 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036607823470411236, + "loss": 2.7278, + "theoretical_loss": 3.6830444955074895, + "tokens_seen": 908309504 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036606820461384154, + "loss": 2.7775, + "theoretical_loss": 3.683018755819851, + "tokens_seen": 908375040 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003660581745235707, + "loss": 2.8285, + "theoretical_loss": 3.6829930185090958, + "tokens_seen": 908440576 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003660481444332999, + "loss": 2.7422, + "theoretical_loss": 3.6829672835748335, + "tokens_seen": 908506112 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003660381143430291, + "loss": 2.8331, + "theoretical_loss": 3.682941551016673, + "tokens_seen": 908571648 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003660280842527583, + "loss": 2.7105, + "theoretical_loss": 3.682915820834224, + "tokens_seen": 908637184 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036601805416248745, + "loss": 2.7307, + "theoretical_loss": 3.682890093027095, + "tokens_seen": 908702720 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003660080240722167, + "loss": 2.8438, + "theoretical_loss": 3.682864367594896, + "tokens_seen": 908768256 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003659979939819458, + "loss": 2.9085, + "theoretical_loss": 3.682838644537237, + "tokens_seen": 908833792 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036598796389167504, + "loss": 2.607, + "theoretical_loss": 3.6828129238537266, + "tokens_seen": 908899328 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003659779338014042, + "loss": 2.7233, + "theoretical_loss": 3.682787205543975, + "tokens_seen": 908964864 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003659679037111334, + "loss": 2.5585, + "theoretical_loss": 3.682761489607592, + "tokens_seen": 909030400 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003659578736208626, + "loss": 2.6669, + "theoretical_loss": 3.6827357760441877, + "tokens_seen": 909095936 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036594784353059177, + "loss": 2.7123, + "theoretical_loss": 3.6827100648533717, + "tokens_seen": 909161472 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036593781344032095, + "loss": 2.8216, + "theoretical_loss": 3.682684356034754, + "tokens_seen": 909227008 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003659277833500502, + "loss": 2.5725, + "theoretical_loss": 3.682658649587945, + "tokens_seen": 909292544 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1081876, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.961531162261963, + "objective/train/theoretical_loss": 3.6826522233467927, + "objective/train/tokens_used": 929768928, + "theoretical_loss": 3.6826522233467927, + "tokens_seen": 909308928 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003659177532597793, + "loss": 2.6922, + "theoretical_loss": 3.6826329455125553, + "tokens_seen": 909358080 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036590772316950855, + "loss": 2.8059, + "theoretical_loss": 3.682607243808195, + "tokens_seen": 909423616 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036589769307923773, + "loss": 2.6864, + "theoretical_loss": 3.6825815444744743, + "tokens_seen": 909489152 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003658876629889669, + "loss": 2.6581, + "theoretical_loss": 3.682555847511004, + "tokens_seen": 909554688 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003658776328986961, + "loss": 2.6658, + "theoretical_loss": 3.6825301529173946, + "tokens_seen": 909620224 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036586760280842527, + "loss": 2.9523, + "theoretical_loss": 3.6825044606932567, + "tokens_seen": 909685760 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036585757271815445, + "loss": 2.8693, + "theoretical_loss": 3.682478770838202, + "tokens_seen": 909751296 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003658475426278837, + "loss": 2.8812, + "theoretical_loss": 3.6824530833518394, + "tokens_seen": 909816832 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003658375125376128, + "loss": 2.7843, + "theoretical_loss": 3.682427398233782, + "tokens_seen": 909882368 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036582748244734205, + "loss": 2.7056, + "theoretical_loss": 3.68240171548364, + "tokens_seen": 909947904 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003658174523570712, + "loss": 2.8099, + "theoretical_loss": 3.682376035101025, + "tokens_seen": 910013440 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003658074222668004, + "loss": 2.5475, + "theoretical_loss": 3.6823503570855474, + "tokens_seen": 910078976 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003657973921765296, + "loss": 2.8597, + "theoretical_loss": 3.6823246814368193, + "tokens_seen": 910144512 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003657873620862588, + "loss": 2.8058, + "theoretical_loss": 3.6822990081544518, + "tokens_seen": 910210048 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036577733199598796, + "loss": 2.8276, + "theoretical_loss": 3.6822733372380574, + "tokens_seen": 910275584 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036576730190571714, + "loss": 2.5292, + "theoretical_loss": 3.6822476686872463, + "tokens_seen": 910341120 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003657572718154463, + "loss": 2.7811, + "theoretical_loss": 3.682222002501631, + "tokens_seen": 910406656 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036574724172517555, + "loss": 2.9896, + "theoretical_loss": 3.682196338680823, + "tokens_seen": 910472192 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003657372116349047, + "loss": 2.6515, + "theoretical_loss": 3.682170677224435, + "tokens_seen": 910537728 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003657271815446339, + "loss": 2.842, + "theoretical_loss": 3.682145018132078, + "tokens_seen": 910603264 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003657171514543631, + "loss": 2.5961, + "theoretical_loss": 3.6821193614033643, + "tokens_seen": 910668800 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003657071213640923, + "loss": 2.8136, + "theoretical_loss": 3.682093707037907, + "tokens_seen": 910734336 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036569709127382146, + "loss": 2.8072, + "theoretical_loss": 3.6820680550353178, + "tokens_seen": 910799872 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036568706118355064, + "loss": 2.8705, + "theoretical_loss": 3.682042405395208, + "tokens_seen": 910865408 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003656770310932798, + "loss": 2.6743, + "theoretical_loss": 3.682016758117192, + "tokens_seen": 910930944 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1086888, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.870638132095337, + "objective/train/theoretical_loss": 3.6820103466667193, + "objective/train/tokens_used": 931407328, + "theoretical_loss": 3.6820103466667193, + "tokens_seen": 910947328 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036566700100300906, + "loss": 2.7901, + "theoretical_loss": 3.681991113200881, + "tokens_seen": 910996480 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003656569709127382, + "loss": 2.7127, + "theoretical_loss": 3.6819654706458884, + "tokens_seen": 911062016 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003656469408224674, + "loss": 2.7062, + "theoretical_loss": 3.6819398304518267, + "tokens_seen": 911127552 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036563691073219655, + "loss": 2.7506, + "theoretical_loss": 3.6819141926183083, + "tokens_seen": 911193088 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003656268806419258, + "loss": 2.6625, + "theoretical_loss": 3.681888557144947, + "tokens_seen": 911258624 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036561685055165496, + "loss": 2.6967, + "theoretical_loss": 3.6818629240313543, + "tokens_seen": 911324160 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036560682046138414, + "loss": 2.8149, + "theoretical_loss": 3.6818372932771455, + "tokens_seen": 911389696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003655967903711134, + "loss": 2.6254, + "theoretical_loss": 3.681811664881932, + "tokens_seen": 911455232 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036558676028084256, + "loss": 2.7563, + "theoretical_loss": 3.6817860388453276, + "tokens_seen": 911520768 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036557673019057174, + "loss": 2.7822, + "theoretical_loss": 3.681760415166946, + "tokens_seen": 911586304 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003655667001003009, + "loss": 2.8973, + "theoretical_loss": 3.6817347938464002, + "tokens_seen": 911651840 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003655566700100301, + "loss": 2.607, + "theoretical_loss": 3.681709174883304, + "tokens_seen": 911717376 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003655466399197593, + "loss": 2.6586, + "theoretical_loss": 3.681683558277271, + "tokens_seen": 911782912 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003655366098294885, + "loss": 2.852, + "theoretical_loss": 3.6816579440279154, + "tokens_seen": 911848448 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036552657973921765, + "loss": 2.7226, + "theoretical_loss": 3.6816323321348507, + "tokens_seen": 911913984 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003655165496489469, + "loss": 2.6382, + "theoretical_loss": 3.68160672259769, + "tokens_seen": 911979520 + }, + { + "epoch": 3.01, + "learning_rate": 0.000365506519558676, + "loss": 2.8484, + "theoretical_loss": 3.6815811154160487, + "tokens_seen": 912045056 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036549648946840525, + "loss": 2.6745, + "theoretical_loss": 3.68155551058954, + "tokens_seen": 912110592 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003654864593781344, + "loss": 2.8237, + "theoretical_loss": 3.6815299081177786, + "tokens_seen": 912176128 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003654764292878636, + "loss": 3.1232, + "theoretical_loss": 3.6815043080003784, + "tokens_seen": 912241664 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003654663991975928, + "loss": 2.6531, + "theoretical_loss": 3.6814787102369544, + "tokens_seen": 912307200 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036545636910732197, + "loss": 2.7191, + "theoretical_loss": 3.6814531148271206, + "tokens_seen": 912372736 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036544633901705115, + "loss": 2.5767, + "theoretical_loss": 3.681427521770491, + "tokens_seen": 912438272 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003654363089267804, + "loss": 2.8236, + "theoretical_loss": 3.6814019310666812, + "tokens_seen": 912503808 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003654262788365095, + "loss": 2.7541, + "theoretical_loss": 3.6813763427153052, + "tokens_seen": 912569344 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1091883, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6327216625213623, + "objective/train/theoretical_loss": 3.6813699459949842, + "objective/train/tokens_used": 933045728, + "theoretical_loss": 3.6813699459949842, + "tokens_seen": 912585728 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036541624874623875, + "loss": 2.6855, + "theoretical_loss": 3.681350756715979, + "tokens_seen": 912634880 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036540621865596793, + "loss": 2.6951, + "theoretical_loss": 3.6813251730683163, + "tokens_seen": 912700416 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003653961885656971, + "loss": 2.8262, + "theoretical_loss": 3.681299591771933, + "tokens_seen": 912765952 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003653861584754263, + "loss": 2.7102, + "theoretical_loss": 3.681274012826443, + "tokens_seen": 912831488 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036537612838515547, + "loss": 2.6939, + "theoretical_loss": 3.6812484362314626, + "tokens_seen": 912897024 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036536609829488465, + "loss": 2.8567, + "theoretical_loss": 3.681222861986607, + "tokens_seen": 912962560 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003653560682046139, + "loss": 2.7321, + "theoretical_loss": 3.6811972900914913, + "tokens_seen": 913028096 + }, + { + "epoch": 3.01, + "learning_rate": 0.000365346038114343, + "loss": 2.6525, + "theoretical_loss": 3.6811717205457306, + "tokens_seen": 913093632 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036533600802407225, + "loss": 2.6161, + "theoretical_loss": 3.6811461533489416, + "tokens_seen": 913159168 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003653259779338014, + "loss": 2.7166, + "theoretical_loss": 3.6811205885007388, + "tokens_seen": 913224704 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003653159478435306, + "loss": 2.5017, + "theoretical_loss": 3.6810950260007385, + "tokens_seen": 913290240 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003653059177532598, + "loss": 2.8044, + "theoretical_loss": 3.681069465848556, + "tokens_seen": 913355776 + }, + { + "epoch": 3.01, + "learning_rate": 0.000365295887662989, + "loss": 2.7958, + "theoretical_loss": 3.681043908043808, + "tokens_seen": 913421312 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036528585757271816, + "loss": 2.7558, + "theoretical_loss": 3.6810183525861095, + "tokens_seen": 913486848 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036527582748244734, + "loss": 2.6713, + "theoretical_loss": 3.680992799475078, + "tokens_seen": 913552384 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003652657973921765, + "loss": 2.7104, + "theoretical_loss": 3.6809672487103287, + "tokens_seen": 913617920 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036525576730190575, + "loss": 2.8065, + "theoretical_loss": 3.680941700291478, + "tokens_seen": 913683456 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003652457372116349, + "loss": 2.6175, + "theoretical_loss": 3.680916154218143, + "tokens_seen": 913748992 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003652357071213641, + "loss": 2.7811, + "theoretical_loss": 3.680890610489939, + "tokens_seen": 913814528 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003652256770310933, + "loss": 2.7338, + "theoretical_loss": 3.680865069106483, + "tokens_seen": 913880064 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003652156469408225, + "loss": 2.7676, + "theoretical_loss": 3.680839530067392, + "tokens_seen": 913945600 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036520561685055166, + "loss": 2.6906, + "theoretical_loss": 3.680813993372282, + "tokens_seen": 914011136 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036519558676028084, + "loss": 2.9039, + "theoretical_loss": 3.680788459020771, + "tokens_seen": 914076672 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036518555667001, + "loss": 2.7388, + "theoretical_loss": 3.680762927012475, + "tokens_seen": 914142208 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036517552657973926, + "loss": 2.8223, + "theoretical_loss": 3.680737397347011, + "tokens_seen": 914207744 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1093340, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.954491138458252, + "objective/train/theoretical_loss": 3.6807310152966677, + "objective/train/tokens_used": 934684128, + "theoretical_loss": 3.6807310152966677, + "tokens_seen": 914224128 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003651654964894684, + "loss": 3.0444, + "theoretical_loss": 3.6807118700239965, + "tokens_seen": 914273280 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003651554663991976, + "loss": 2.813, + "theoretical_loss": 3.6806863450430485, + "tokens_seen": 914338816 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036514543630892675, + "loss": 2.7931, + "theoretical_loss": 3.680660822403784, + "tokens_seen": 914404352 + }, + { + "epoch": 3.01, + "learning_rate": 0.000365135406218656, + "loss": 2.7916, + "theoretical_loss": 3.680635302105821, + "tokens_seen": 914469888 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036512537612838516, + "loss": 2.5788, + "theoretical_loss": 3.6806097841487766, + "tokens_seen": 914535424 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036511534603811434, + "loss": 2.7142, + "theoretical_loss": 3.680584268532268, + "tokens_seen": 914600960 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003651053159478435, + "loss": 2.8717, + "theoretical_loss": 3.680558755255914, + "tokens_seen": 914666496 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036509528585757276, + "loss": 2.8554, + "theoretical_loss": 3.6805332443193306, + "tokens_seen": 914732032 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003650852557673019, + "loss": 2.6752, + "theoretical_loss": 3.680507735722137, + "tokens_seen": 914797568 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003650752256770311, + "loss": 2.6707, + "theoretical_loss": 3.6804822294639505, + "tokens_seen": 914863104 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036506519558676025, + "loss": 2.7038, + "theoretical_loss": 3.6804567255443894, + "tokens_seen": 914928640 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003650551654964895, + "loss": 2.8116, + "theoretical_loss": 3.680431223963071, + "tokens_seen": 914994176 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036504513540621867, + "loss": 2.8997, + "theoretical_loss": 3.6804057247196145, + "tokens_seen": 915059712 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036503510531594785, + "loss": 2.6417, + "theoretical_loss": 3.6803802278136377, + "tokens_seen": 915125248 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036502507522567703, + "loss": 2.8395, + "theoretical_loss": 3.6803547332447595, + "tokens_seen": 915190784 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003650150451354062, + "loss": 2.854, + "theoretical_loss": 3.680329241012597, + "tokens_seen": 915256320 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003650050150451354, + "loss": 2.9282, + "theoretical_loss": 3.68030375111677, + "tokens_seen": 915321856 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003649949849548646, + "loss": 2.7012, + "theoretical_loss": 3.6802782635568967, + "tokens_seen": 915387392 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036498495486459375, + "loss": 2.7663, + "theoretical_loss": 3.6802527783325956, + "tokens_seen": 915452928 + }, + { + "epoch": 3.01, + "learning_rate": 0.000364974924774323, + "loss": 2.8494, + "theoretical_loss": 3.680227295443486, + "tokens_seen": 915518464 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003649648946840521, + "loss": 2.595, + "theoretical_loss": 3.680201814889186, + "tokens_seen": 915584000 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036495486459378135, + "loss": 2.8819, + "theoretical_loss": 3.6801763366693154, + "tokens_seen": 915649536 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036494483450351053, + "loss": 2.5959, + "theoretical_loss": 3.6801508607834927, + "tokens_seen": 915715072 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003649348044132397, + "loss": 2.773, + "theoretical_loss": 3.680125387231338, + "tokens_seen": 915780608 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003649247743229689, + "loss": 2.7749, + "theoretical_loss": 3.680099916012469, + "tokens_seen": 915846144 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1093676, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2436115741729736, + "objective/train/theoretical_loss": 3.6800935485722834, + "objective/train/tokens_used": 936322528, + "theoretical_loss": 3.6800935485722834, + "tokens_seen": 915862528 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036491474423269813, + "loss": 2.8122, + "theoretical_loss": 3.6800744471265063, + "tokens_seen": 915911680 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036490471414242726, + "loss": 2.7161, + "theoretical_loss": 3.680048980573069, + "tokens_seen": 915977216 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003648946840521565, + "loss": 2.7632, + "theoretical_loss": 3.6800235163517767, + "tokens_seen": 916042752 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003648846539618856, + "loss": 2.7306, + "theoretical_loss": 3.6799980544622484, + "tokens_seen": 916108288 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036487462387161485, + "loss": 2.8455, + "theoretical_loss": 3.679972594904104, + "tokens_seen": 916173824 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003648645937813441, + "loss": 2.7776, + "theoretical_loss": 3.679947137676964, + "tokens_seen": 916239360 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003648545636910732, + "loss": 2.7945, + "theoretical_loss": 3.6799216827804475, + "tokens_seen": 916304896 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036484453360080245, + "loss": 2.7072, + "theoretical_loss": 3.679896230214175, + "tokens_seen": 916370432 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003648345035105316, + "loss": 2.6346, + "theoretical_loss": 3.679870779977766, + "tokens_seen": 916435968 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003648244734202608, + "loss": 2.9591, + "theoretical_loss": 3.6798453320708413, + "tokens_seen": 916501504 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036481444332999, + "loss": 2.687, + "theoretical_loss": 3.6798198864930205, + "tokens_seen": 916567040 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003648044132397192, + "loss": 2.5467, + "theoretical_loss": 3.679794443243924, + "tokens_seen": 916632576 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036479438314944836, + "loss": 2.6591, + "theoretical_loss": 3.6797690023231726, + "tokens_seen": 916698112 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036478435305917754, + "loss": 2.8308, + "theoretical_loss": 3.6797435637303866, + "tokens_seen": 916763648 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003647743229689067, + "loss": 2.8794, + "theoretical_loss": 3.6797181274651867, + "tokens_seen": 916829184 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036476429287863595, + "loss": 2.9907, + "theoretical_loss": 3.679692693527193, + "tokens_seen": 916894720 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003647542627883651, + "loss": 2.7011, + "theoretical_loss": 3.679667261916027, + "tokens_seen": 916960256 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003647442326980943, + "loss": 2.816, + "theoretical_loss": 3.6796418326313094, + "tokens_seen": 917025792 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003647342026078235, + "loss": 2.8047, + "theoretical_loss": 3.679616405672661, + "tokens_seen": 917091328 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003647241725175527, + "loss": 2.5946, + "theoretical_loss": 3.679590981039703, + "tokens_seen": 917156864 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036471414242728186, + "loss": 2.7901, + "theoretical_loss": 3.6795655587320555, + "tokens_seen": 917222400 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036470411233701104, + "loss": 2.6762, + "theoretical_loss": 3.679540138749341, + "tokens_seen": 917287936 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003646940822467402, + "loss": 2.7547, + "theoretical_loss": 3.6795147210911807, + "tokens_seen": 917353472 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036468405215646946, + "loss": 2.8656, + "theoretical_loss": 3.679489305757196, + "tokens_seen": 917419008 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003646740220661986, + "loss": 2.8932, + "theoretical_loss": 3.679463892747007, + "tokens_seen": 917484544 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1095045, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7306671142578125, + "objective/train/theoretical_loss": 3.6794575398575087, + "objective/train/tokens_used": 937960928, + "theoretical_loss": 3.6794575398575087, + "tokens_seen": 917500928 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003646639919759278, + "loss": 2.831, + "theoretical_loss": 3.6794384820602364, + "tokens_seen": 917550080 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036465396188565695, + "loss": 2.8848, + "theoretical_loss": 3.679413073696506, + "tokens_seen": 917615616 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003646439317953862, + "loss": 2.7306, + "theoretical_loss": 3.679387667655438, + "tokens_seen": 917681152 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036463390170511536, + "loss": 2.708, + "theoretical_loss": 3.6793622639366523, + "tokens_seen": 917746688 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036462387161484454, + "loss": 2.6047, + "theoretical_loss": 3.679336862539772, + "tokens_seen": 917812224 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003646138415245737, + "loss": 2.7643, + "theoretical_loss": 3.67931146346442, + "tokens_seen": 917877760 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036460381143430296, + "loss": 2.715, + "theoretical_loss": 3.679286066710217, + "tokens_seen": 917943296 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003645937813440321, + "loss": 2.6738, + "theoretical_loss": 3.679260672276786, + "tokens_seen": 918008832 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003645837512537613, + "loss": 2.8271, + "theoretical_loss": 3.679235280163749, + "tokens_seen": 918074368 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036457372116349045, + "loss": 2.6745, + "theoretical_loss": 3.679209890370728, + "tokens_seen": 918139904 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003645636910732197, + "loss": 2.9066, + "theoretical_loss": 3.6791845028973453, + "tokens_seen": 918205440 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036455366098294887, + "loss": 2.7727, + "theoretical_loss": 3.6791591177432252, + "tokens_seen": 918270976 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036454363089267805, + "loss": 2.7415, + "theoretical_loss": 3.679133734907988, + "tokens_seen": 918336512 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036453360080240723, + "loss": 2.9084, + "theoretical_loss": 3.679108354391258, + "tokens_seen": 918402048 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003645235707121364, + "loss": 2.7322, + "theoretical_loss": 3.6790829761926567, + "tokens_seen": 918467584 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003645135406218656, + "loss": 2.6849, + "theoretical_loss": 3.6790576003118085, + "tokens_seen": 918533120 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003645035105315948, + "loss": 2.623, + "theoretical_loss": 3.679032226748335, + "tokens_seen": 918598656 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036449348044132395, + "loss": 2.8732, + "theoretical_loss": 3.6790068555018607, + "tokens_seen": 918664192 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003644834503510532, + "loss": 2.6817, + "theoretical_loss": 3.678981486572007, + "tokens_seen": 918729728 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003644734202607823, + "loss": 2.8337, + "theoretical_loss": 3.6789561199583987, + "tokens_seen": 918795264 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036446339017051155, + "loss": 2.8237, + "theoretical_loss": 3.6789307556606583, + "tokens_seen": 918860800 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036445336008024073, + "loss": 2.7562, + "theoretical_loss": 3.6789053936784093, + "tokens_seen": 918926336 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003644433299899699, + "loss": 2.6789, + "theoretical_loss": 3.6788800340112755, + "tokens_seen": 918991872 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003644332998996991, + "loss": 2.8857, + "theoretical_loss": 3.6788546766588803, + "tokens_seen": 919057408 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036442326980942833, + "loss": 2.9261, + "theoretical_loss": 3.6788293216208476, + "tokens_seen": 919122944 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1095626, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8860342502593994, + "objective/train/theoretical_loss": 3.678822983222914, + "objective/train/tokens_used": 939599328, + "theoretical_loss": 3.678822983222914, + "tokens_seen": 919139328 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036441323971915746, + "loss": 2.7617, + "theoretical_loss": 3.6788039688968004, + "tokens_seen": 919188480 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003644032096288867, + "loss": 2.6231, + "theoretical_loss": 3.678778618486364, + "tokens_seen": 919254016 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003643931795386158, + "loss": 2.918, + "theoretical_loss": 3.678753270389161, + "tokens_seen": 919319552 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036438314944834505, + "loss": 2.6964, + "theoretical_loss": 3.6787279246048157, + "tokens_seen": 919385088 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036437311935807424, + "loss": 2.5647, + "theoretical_loss": 3.6787025811329523, + "tokens_seen": 919450624 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003643630892678034, + "loss": 2.8001, + "theoretical_loss": 3.6786772399731955, + "tokens_seen": 919516160 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003643530591775326, + "loss": 2.8182, + "theoretical_loss": 3.6786519011251695, + "tokens_seen": 919581696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003643430290872618, + "loss": 2.8766, + "theoretical_loss": 3.678626564588498, + "tokens_seen": 919647232 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036433299899699096, + "loss": 2.6051, + "theoretical_loss": 3.678601230362806, + "tokens_seen": 919712768 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003643229689067202, + "loss": 2.6735, + "theoretical_loss": 3.678575898447718, + "tokens_seen": 919778304 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003643129388164493, + "loss": 2.7497, + "theoretical_loss": 3.6785505688428586, + "tokens_seen": 919843840 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036430290872617856, + "loss": 2.7237, + "theoretical_loss": 3.6785252415478524, + "tokens_seen": 919909376 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003642928786359077, + "loss": 2.676, + "theoretical_loss": 3.678499916562325, + "tokens_seen": 919974912 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003642828485456369, + "loss": 2.7326, + "theoretical_loss": 3.6784745938859, + "tokens_seen": 920040448 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003642728184553661, + "loss": 2.7578, + "theoretical_loss": 3.6784492735182033, + "tokens_seen": 920105984 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003642627883650953, + "loss": 2.8639, + "theoretical_loss": 3.67842395545886, + "tokens_seen": 920171520 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036425275827482446, + "loss": 2.8007, + "theoretical_loss": 3.678398639707495, + "tokens_seen": 920237056 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003642427281845537, + "loss": 2.8858, + "theoretical_loss": 3.6783733262637335, + "tokens_seen": 920302592 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003642326980942828, + "loss": 2.7203, + "theoretical_loss": 3.678348015127201, + "tokens_seen": 920368128 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036422266800401206, + "loss": 2.795, + "theoretical_loss": 3.678322706297523, + "tokens_seen": 920433664 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003642126379137412, + "loss": 2.7759, + "theoretical_loss": 3.678297399774325, + "tokens_seen": 920499200 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003642026078234704, + "loss": 2.5123, + "theoretical_loss": 3.678272095557232, + "tokens_seen": 920564736 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003641925777331996, + "loss": 2.8862, + "theoretical_loss": 3.6782467936458705, + "tokens_seen": 920630272 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003641825476429288, + "loss": 2.7438, + "theoretical_loss": 3.6782214940398665, + "tokens_seen": 920695808 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036417251755265797, + "loss": 2.6299, + "theoretical_loss": 3.6781961967388446, + "tokens_seen": 920761344 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1097042, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9992458820343018, + "objective/train/theoretical_loss": 3.6781898727736992, + "objective/train/tokens_used": 941237728, + "theoretical_loss": 3.6781898727736992, + "tokens_seen": 920777728 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036416248746238715, + "loss": 2.9299, + "theoretical_loss": 3.6781709017424324, + "tokens_seen": 920826880 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036415245737211633, + "loss": 2.8001, + "theoretical_loss": 3.6781456090502544, + "tokens_seen": 920892416 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036414242728184556, + "loss": 2.7548, + "theoretical_loss": 3.678120318661938, + "tokens_seen": 920957952 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003641323971915747, + "loss": 2.8482, + "theoretical_loss": 3.6780950305771087, + "tokens_seen": 921023488 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003641223671013039, + "loss": 2.9371, + "theoretical_loss": 3.6780697447953927, + "tokens_seen": 921089024 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036411233701103316, + "loss": 2.6601, + "theoretical_loss": 3.678044461316417, + "tokens_seen": 921154560 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003641023069207623, + "loss": 2.464, + "theoretical_loss": 3.678019180139808, + "tokens_seen": 921220096 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003640922768304915, + "loss": 2.8123, + "theoretical_loss": 3.677993901265191, + "tokens_seen": 921285632 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036408224674022065, + "loss": 2.6211, + "theoretical_loss": 3.6779686246921948, + "tokens_seen": 921351168 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003640722166499499, + "loss": 2.7441, + "theoretical_loss": 3.6779433504204446, + "tokens_seen": 921416704 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036406218655967907, + "loss": 2.6296, + "theoretical_loss": 3.6779180784495678, + "tokens_seen": 921482240 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036405215646940825, + "loss": 2.7529, + "theoretical_loss": 3.677892808779191, + "tokens_seen": 921547776 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036404212637913743, + "loss": 2.6744, + "theoretical_loss": 3.6778675414089417, + "tokens_seen": 921613312 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003640320962888666, + "loss": 2.568, + "theoretical_loss": 3.6778422763384464, + "tokens_seen": 921678848 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003640220661985958, + "loss": 2.6847, + "theoretical_loss": 3.677817013567333, + "tokens_seen": 921744384 + }, + { + "epoch": 3.01, + "learning_rate": 0.000364012036108325, + "loss": 2.672, + "theoretical_loss": 3.677791753095228, + "tokens_seen": 921809920 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036400200601805415, + "loss": 2.6619, + "theoretical_loss": 3.677766494921759, + "tokens_seen": 921875456 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003639919759277834, + "loss": 2.6684, + "theoretical_loss": 3.6777412390465543, + "tokens_seen": 921940992 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003639819458375125, + "loss": 2.7847, + "theoretical_loss": 3.6777159854692396, + "tokens_seen": 922006528 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036397191574724175, + "loss": 2.6895, + "theoretical_loss": 3.6776907341894445, + "tokens_seen": 922072064 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036396188565697093, + "loss": 2.7031, + "theoretical_loss": 3.677665485206796, + "tokens_seen": 922137600 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003639518555667001, + "loss": 2.5351, + "theoretical_loss": 3.6776402385209206, + "tokens_seen": 922203136 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003639418254764293, + "loss": 2.9291, + "theoretical_loss": 3.6776149941314484, + "tokens_seen": 922268672 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036393179538615853, + "loss": 2.5709, + "theoretical_loss": 3.6775897520380054, + "tokens_seen": 922334208 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036392176529588766, + "loss": 2.4427, + "theoretical_loss": 3.6775645122402207, + "tokens_seen": 922399744 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1097866, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2819089889526367, + "objective/train/theoretical_loss": 3.6775582026494273, + "objective/train/tokens_used": 942876128, + "theoretical_loss": 3.6775582026494273, + "tokens_seen": 922416128 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003639117352056169, + "loss": 2.811, + "theoretical_loss": 3.6775392747377227, + "tokens_seen": 922465280 + }, + { + "epoch": 3.01, + "learning_rate": 0.000363901705115346, + "loss": 2.6732, + "theoretical_loss": 3.6775140395301387, + "tokens_seen": 922530816 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036389167502507525, + "loss": 2.7929, + "theoretical_loss": 3.677488806617097, + "tokens_seen": 922596352 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036388164493480444, + "loss": 2.6178, + "theoretical_loss": 3.6774635759982273, + "tokens_seen": 922661888 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003638716148445336, + "loss": 2.6879, + "theoretical_loss": 3.677438347673157, + "tokens_seen": 922727424 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003638615847542628, + "loss": 2.5878, + "theoretical_loss": 3.677413121641515, + "tokens_seen": 922792960 + }, + { + "epoch": 3.01, + "learning_rate": 0.000363851554663992, + "loss": 2.6521, + "theoretical_loss": 3.6773878979029293, + "tokens_seen": 922858496 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036384152457372116, + "loss": 2.6997, + "theoretical_loss": 3.67736267645703, + "tokens_seen": 922924032 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003638314944834504, + "loss": 2.7684, + "theoretical_loss": 3.6773374573034445, + "tokens_seen": 922989568 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003638214643931795, + "loss": 2.5416, + "theoretical_loss": 3.6773122404418026, + "tokens_seen": 923055104 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036381143430290876, + "loss": 2.6587, + "theoretical_loss": 3.677287025871733, + "tokens_seen": 923120640 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003638014042126379, + "loss": 2.8315, + "theoretical_loss": 3.6772618135928647, + "tokens_seen": 923186176 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003637913741223671, + "loss": 2.598, + "theoretical_loss": 3.6772366036048276, + "tokens_seen": 923251712 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003637813440320963, + "loss": 2.8329, + "theoretical_loss": 3.67721139590725, + "tokens_seen": 923317248 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003637713139418255, + "loss": 2.7133, + "theoretical_loss": 3.6771861904997616, + "tokens_seen": 923382784 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036376128385155466, + "loss": 2.5512, + "theoretical_loss": 3.677160987381992, + "tokens_seen": 923448320 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003637512537612839, + "loss": 2.7946, + "theoretical_loss": 3.6771357865535705, + "tokens_seen": 923513856 + }, + { + "epoch": 3.01, + "learning_rate": 0.000363741223671013, + "loss": 2.6753, + "theoretical_loss": 3.6771105880141266, + "tokens_seen": 923579392 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036373119358074226, + "loss": 2.7061, + "theoretical_loss": 3.67708539176329, + "tokens_seen": 923644928 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003637211634904714, + "loss": 2.7782, + "theoretical_loss": 3.6770601978006914, + "tokens_seen": 923710464 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003637111334002006, + "loss": 2.6796, + "theoretical_loss": 3.6770350061259593, + "tokens_seen": 923776000 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003637011033099298, + "loss": 2.9432, + "theoretical_loss": 3.6770098167387246, + "tokens_seen": 923841536 + }, + { + "epoch": 3.01, + "learning_rate": 0.000363691073219659, + "loss": 2.8094, + "theoretical_loss": 3.676984629638617, + "tokens_seen": 923907072 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036368104312938817, + "loss": 2.6513, + "theoretical_loss": 3.6769594448252665, + "tokens_seen": 923972608 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036367101303911735, + "loss": 2.7836, + "theoretical_loss": 3.6769342622983032, + "tokens_seen": 924038144 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1099373, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.109006881713867, + "objective/train/theoretical_loss": 3.676927967023768, + "objective/train/tokens_used": 944514528, + "theoretical_loss": 3.676927967023768, + "tokens_seen": 924054528 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036366098294884653, + "loss": 2.7884, + "theoretical_loss": 3.6769090820573584, + "tokens_seen": 924103680 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036365095285857576, + "loss": 2.8461, + "theoretical_loss": 3.6768839041020613, + "tokens_seen": 924169216 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003636409227683049, + "loss": 2.6621, + "theoretical_loss": 3.676858728432043, + "tokens_seen": 924234752 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003636308926780341, + "loss": 2.8803, + "theoretical_loss": 3.676833555046933, + "tokens_seen": 924300288 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036362086258776325, + "loss": 2.8337, + "theoretical_loss": 3.6768083839463634, + "tokens_seen": 924365824 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003636108324974925, + "loss": 2.8779, + "theoretical_loss": 3.6767832151299644, + "tokens_seen": 924431360 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036360080240722167, + "loss": 2.5029, + "theoretical_loss": 3.6767580485973665, + "tokens_seen": 924496896 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036359077231695085, + "loss": 2.586, + "theoretical_loss": 3.6767328843482012, + "tokens_seen": 924562432 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036358074222668003, + "loss": 2.6846, + "theoretical_loss": 3.6767077223820985, + "tokens_seen": 924627968 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036357071213640927, + "loss": 2.8209, + "theoretical_loss": 3.6766825626986908, + "tokens_seen": 924693504 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003635606820461384, + "loss": 2.9991, + "theoretical_loss": 3.6766574052976075, + "tokens_seen": 924759040 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036355065195586763, + "loss": 2.7583, + "theoretical_loss": 3.676632250178482, + "tokens_seen": 924824576 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036354062186559676, + "loss": 2.6988, + "theoretical_loss": 3.676607097340944, + "tokens_seen": 924890112 + }, + { + "epoch": 3.01, + "learning_rate": 0.000363530591775326, + "loss": 2.7356, + "theoretical_loss": 3.676581946784625, + "tokens_seen": 924955648 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036352056168505517, + "loss": 2.8004, + "theoretical_loss": 3.676556798509157, + "tokens_seen": 925021184 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036351053159478435, + "loss": 2.6135, + "theoretical_loss": 3.676531652514172, + "tokens_seen": 925086720 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036350050150451353, + "loss": 2.7654, + "theoretical_loss": 3.6765065087993007, + "tokens_seen": 925152256 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003634904714142427, + "loss": 2.7685, + "theoretical_loss": 3.6764813673641754, + "tokens_seen": 925217792 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003634804413239719, + "loss": 2.6842, + "theoretical_loss": 3.676456228208428, + "tokens_seen": 925283328 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036347041123370113, + "loss": 2.8569, + "theoretical_loss": 3.67643109133169, + "tokens_seen": 925348864 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036346038114343026, + "loss": 2.7765, + "theoretical_loss": 3.6764059567335936, + "tokens_seen": 925414400 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003634503510531595, + "loss": 2.703, + "theoretical_loss": 3.6763808244137715, + "tokens_seen": 925479936 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003634403209628886, + "loss": 2.7041, + "theoretical_loss": 3.676355694371855, + "tokens_seen": 925545472 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036343029087261786, + "loss": 2.7526, + "theoretical_loss": 3.6763305666074766, + "tokens_seen": 925611008 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036342026078234704, + "loss": 2.7518, + "theoretical_loss": 3.676305441120269, + "tokens_seen": 925676544 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1100200, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.052302360534668, + "objective/train/theoretical_loss": 3.6762991601042323, + "objective/train/tokens_used": 946152928, + "theoretical_loss": 3.6762991601042323, + "tokens_seen": 925692928 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003634102306920762, + "loss": 2.7231, + "theoretical_loss": 3.6762803179098644, + "tokens_seen": 925742080 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003634002006018054, + "loss": 2.6954, + "theoretical_loss": 3.6762551969758954, + "tokens_seen": 925807616 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036339017051153464, + "loss": 2.6857, + "theoretical_loss": 3.676230078317994, + "tokens_seen": 925873152 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036338014042126376, + "loss": 2.7367, + "theoretical_loss": 3.676204961935794, + "tokens_seen": 925938688 + }, + { + "epoch": 3.01, + "learning_rate": 0.000363370110330993, + "loss": 2.6934, + "theoretical_loss": 3.6761798478289274, + "tokens_seen": 926004224 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003633600802407222, + "loss": 2.6842, + "theoretical_loss": 3.6761547359970272, + "tokens_seen": 926069760 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036335005015045136, + "loss": 2.7722, + "theoretical_loss": 3.6761296264397263, + "tokens_seen": 926135296 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003633400200601806, + "loss": 2.6782, + "theoretical_loss": 3.6761045191566586, + "tokens_seen": 926200832 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003633299899699097, + "loss": 2.9231, + "theoretical_loss": 3.6760794141474555, + "tokens_seen": 926266368 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036331995987963896, + "loss": 2.7677, + "theoretical_loss": 3.6760543114117517, + "tokens_seen": 926331904 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003633099297893681, + "loss": 2.7595, + "theoretical_loss": 3.6760292109491797, + "tokens_seen": 926397440 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003632998996990973, + "loss": 2.8369, + "theoretical_loss": 3.676004112759373, + "tokens_seen": 926462976 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003632898696088265, + "loss": 2.7427, + "theoretical_loss": 3.675979016841966, + "tokens_seen": 926528512 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003632798395185557, + "loss": 2.7186, + "theoretical_loss": 3.675953923196591, + "tokens_seen": 926594048 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036326980942828486, + "loss": 2.7138, + "theoretical_loss": 3.6759288318228815, + "tokens_seen": 926659584 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003632597793380141, + "loss": 2.7673, + "theoretical_loss": 3.6759037427204726, + "tokens_seen": 926725120 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003632497492477432, + "loss": 2.6312, + "theoretical_loss": 3.675878655888997, + "tokens_seen": 926790656 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036323971915747246, + "loss": 2.6802, + "theoretical_loss": 3.675853571328089, + "tokens_seen": 926856192 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003632296890672016, + "loss": 2.9694, + "theoretical_loss": 3.6758284890373822, + "tokens_seen": 926921728 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003632196589769308, + "loss": 2.7553, + "theoretical_loss": 3.675803409016511, + "tokens_seen": 926987264 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036320962888666, + "loss": 2.7427, + "theoretical_loss": 3.6757783312651093, + "tokens_seen": 927052800 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003631995987963892, + "loss": 2.7938, + "theoretical_loss": 3.6757532557828116, + "tokens_seen": 927118336 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036318956870611837, + "loss": 2.8564, + "theoretical_loss": 3.6757281825692525, + "tokens_seen": 927183872 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036317953861584755, + "loss": 2.817, + "theoretical_loss": 3.675703111624065, + "tokens_seen": 927249408 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036316950852557673, + "loss": 2.6925, + "theoretical_loss": 3.6756780429468856, + "tokens_seen": 927314944 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1101371, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7425971031188965, + "objective/train/theoretical_loss": 3.6756717761319235, + "objective/train/tokens_used": 947791328, + "theoretical_loss": 3.6756717761319235, + "tokens_seen": 927331328 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036315947843530596, + "loss": 2.738, + "theoretical_loss": 3.6756529765373473, + "tokens_seen": 927380480 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003631494483450351, + "loss": 2.9168, + "theoretical_loss": 3.6756279123950852, + "tokens_seen": 927446016 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003631394182547643, + "loss": 2.8124, + "theoretical_loss": 3.675602850519734, + "tokens_seen": 927511552 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036312938816449345, + "loss": 2.9051, + "theoretical_loss": 3.6755777909109284, + "tokens_seen": 927577088 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003631193580742227, + "loss": 2.7232, + "theoretical_loss": 3.675552733568304, + "tokens_seen": 927642624 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036310932798395187, + "loss": 2.5916, + "theoretical_loss": 3.675527678491495, + "tokens_seen": 927708160 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036309929789368105, + "loss": 2.794, + "theoretical_loss": 3.675502625680137, + "tokens_seen": 927773696 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036308926780341023, + "loss": 2.7731, + "theoretical_loss": 3.675477575133865, + "tokens_seen": 927839232 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036307923771313947, + "loss": 2.7715, + "theoretical_loss": 3.6754525268523137, + "tokens_seen": 927904768 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003630692076228686, + "loss": 2.779, + "theoretical_loss": 3.6754274808351193, + "tokens_seen": 927970304 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036305917753259783, + "loss": 2.7027, + "theoretical_loss": 3.6754024370819165, + "tokens_seen": 928035840 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036304914744232696, + "loss": 2.8426, + "theoretical_loss": 3.6753773955923412, + "tokens_seen": 928101376 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003630391173520562, + "loss": 2.7696, + "theoretical_loss": 3.6753523563660293, + "tokens_seen": 928166912 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036302908726178537, + "loss": 2.7511, + "theoretical_loss": 3.6753273194026157, + "tokens_seen": 928232448 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036301905717151455, + "loss": 2.5918, + "theoretical_loss": 3.6753022847017363, + "tokens_seen": 928297984 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036300902708124374, + "loss": 2.5532, + "theoretical_loss": 3.675277252263027, + "tokens_seen": 928363520 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003629989969909729, + "loss": 2.7777, + "theoretical_loss": 3.675252222086124, + "tokens_seen": 928429056 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003629889669007021, + "loss": 2.6777, + "theoretical_loss": 3.675227194170663, + "tokens_seen": 928494592 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036297893681043133, + "loss": 2.8747, + "theoretical_loss": 3.6752021685162806, + "tokens_seen": 928560128 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036296890672016046, + "loss": 2.7622, + "theoretical_loss": 3.6751771451226123, + "tokens_seen": 928625664 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003629588766298897, + "loss": 2.5471, + "theoretical_loss": 3.6751521239892946, + "tokens_seen": 928691200 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003629488465396188, + "loss": 2.5757, + "theoretical_loss": 3.6751271051159637, + "tokens_seen": 928756736 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036293881644934806, + "loss": 2.7508, + "theoretical_loss": 3.675102088502257, + "tokens_seen": 928822272 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036292878635907724, + "loss": 2.8222, + "theoretical_loss": 3.675077074147809, + "tokens_seen": 928887808 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003629187562688064, + "loss": 2.7448, + "theoretical_loss": 3.6750520620522584, + "tokens_seen": 928953344 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1102157, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.89856219291687, + "objective/train/theoretical_loss": 3.67504580938128, + "objective/train/tokens_used": 949429728, + "theoretical_loss": 3.67504580938128, + "tokens_seen": 928969728 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003629087261785356, + "loss": 2.7577, + "theoretical_loss": 3.67502705221524, + "tokens_seen": 929018880 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036289869608826484, + "loss": 2.764, + "theoretical_loss": 3.6750020446363925, + "tokens_seen": 929084416 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036288866599799396, + "loss": 2.5949, + "theoretical_loss": 3.674977039315351, + "tokens_seen": 929149952 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003628786359077232, + "loss": 2.7679, + "theoretical_loss": 3.674952036251754, + "tokens_seen": 929215488 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003628686058174523, + "loss": 2.8058, + "theoretical_loss": 3.6749270354452372, + "tokens_seen": 929281024 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036285857572718156, + "loss": 2.5555, + "theoretical_loss": 3.674902036895438, + "tokens_seen": 929346560 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036284854563691074, + "loss": 2.6937, + "theoretical_loss": 3.6748770406019937, + "tokens_seen": 929412096 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003628385155466399, + "loss": 2.813, + "theoretical_loss": 3.674852046564542, + "tokens_seen": 929477632 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003628284854563691, + "loss": 2.573, + "theoretical_loss": 3.6748270547827198, + "tokens_seen": 929543168 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003628184553660983, + "loss": 2.8186, + "theoretical_loss": 3.6748020652561646, + "tokens_seen": 929608704 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036280842527582747, + "loss": 2.6764, + "theoretical_loss": 3.6747770779845137, + "tokens_seen": 929674240 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003627983951855567, + "loss": 2.6099, + "theoretical_loss": 3.674752092967405, + "tokens_seen": 929739776 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036278836509528583, + "loss": 2.979, + "theoretical_loss": 3.6747271102044756, + "tokens_seen": 929805312 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036277833500501506, + "loss": 2.8215, + "theoretical_loss": 3.6747021296953646, + "tokens_seen": 929870848 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036276830491474424, + "loss": 2.8194, + "theoretical_loss": 3.6746771514397083, + "tokens_seen": 929936384 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003627582748244734, + "loss": 2.6955, + "theoretical_loss": 3.6746521754371457, + "tokens_seen": 930001920 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003627482447342026, + "loss": 2.886, + "theoretical_loss": 3.674627201687314, + "tokens_seen": 930067456 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003627382146439318, + "loss": 2.6595, + "theoretical_loss": 3.674602230189852, + "tokens_seen": 930132992 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036272818455366097, + "loss": 2.8569, + "theoretical_loss": 3.674577260944398, + "tokens_seen": 930198528 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003627181544633902, + "loss": 2.7004, + "theoretical_loss": 3.6745522939505895, + "tokens_seen": 930264064 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036270812437311933, + "loss": 2.537, + "theoretical_loss": 3.674527329208065, + "tokens_seen": 930329600 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036269809428284857, + "loss": 2.6057, + "theoretical_loss": 3.674502366716464, + "tokens_seen": 930395136 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003626880641925777, + "loss": 2.6501, + "theoretical_loss": 3.6744774064754235, + "tokens_seen": 930460672 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036267803410230693, + "loss": 2.7078, + "theoretical_loss": 3.6744524484845824, + "tokens_seen": 930526208 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003626680040120361, + "loss": 2.6438, + "theoretical_loss": 3.6744274927435807, + "tokens_seen": 930591744 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1103642, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0793814659118652, + "objective/train/theoretical_loss": 3.6744212541598245, + "objective/train/tokens_used": 951068128, + "theoretical_loss": 3.6744212541598245, + "tokens_seen": 930608128 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003626579739217653, + "loss": 2.5507, + "theoretical_loss": 3.6744025392520556, + "tokens_seen": 930657280 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036264794383149447, + "loss": 2.7309, + "theoretical_loss": 3.674377588009647, + "tokens_seen": 930722816 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036263791374122365, + "loss": 2.6374, + "theoretical_loss": 3.674352639015993, + "tokens_seen": 930788352 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036262788365095283, + "loss": 2.8798, + "theoretical_loss": 3.674327692270733, + "tokens_seen": 930853888 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036261785356068207, + "loss": 2.7256, + "theoretical_loss": 3.674302747773506, + "tokens_seen": 930919424 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036260782347041125, + "loss": 2.8085, + "theoretical_loss": 3.674277805523952, + "tokens_seen": 930984960 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036259779338014043, + "loss": 2.7442, + "theoretical_loss": 3.6742528655217095, + "tokens_seen": 931050496 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036258776328986967, + "loss": 2.6867, + "theoretical_loss": 3.6742279277664176, + "tokens_seen": 931116032 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003625777331995988, + "loss": 2.8555, + "theoretical_loss": 3.6742029922577157, + "tokens_seen": 931181568 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036256770310932803, + "loss": 2.6521, + "theoretical_loss": 3.674178058995244, + "tokens_seen": 931247104 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036255767301905716, + "loss": 2.7883, + "theoretical_loss": 3.6741531279786424, + "tokens_seen": 931312640 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003625476429287864, + "loss": 2.8195, + "theoretical_loss": 3.674128199207549, + "tokens_seen": 931378176 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003625376128385156, + "loss": 2.6329, + "theoretical_loss": 3.674103272681605, + "tokens_seen": 931443712 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036252758274824475, + "loss": 2.8075, + "theoretical_loss": 3.67407834840045, + "tokens_seen": 931509248 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036251755265797394, + "loss": 2.5847, + "theoretical_loss": 3.6740534263637237, + "tokens_seen": 931574784 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003625075225677031, + "loss": 2.7421, + "theoretical_loss": 3.674028506571066, + "tokens_seen": 931640320 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003624974924774323, + "loss": 2.9755, + "theoretical_loss": 3.6740035890221168, + "tokens_seen": 931705856 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036248746238716153, + "loss": 2.8285, + "theoretical_loss": 3.673978673716517, + "tokens_seen": 931771392 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036247743229689066, + "loss": 2.9064, + "theoretical_loss": 3.673953760653906, + "tokens_seen": 931836928 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003624674022066199, + "loss": 2.7024, + "theoretical_loss": 3.673928849833925, + "tokens_seen": 931902464 + }, + { + "epoch": 3.01, + "learning_rate": 0.000362457372116349, + "loss": 2.83, + "theoretical_loss": 3.6739039412562144, + "tokens_seen": 931968000 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036244734202607826, + "loss": 2.6702, + "theoretical_loss": 3.6738790349204136, + "tokens_seen": 932033536 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036243731193580744, + "loss": 2.7042, + "theoretical_loss": 3.6738541308261645, + "tokens_seen": 932099072 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003624272818455366, + "loss": 2.8335, + "theoretical_loss": 3.673829228973107, + "tokens_seen": 932164608 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003624172517552658, + "loss": 2.7473, + "theoretical_loss": 3.673804329360882, + "tokens_seen": 932230144 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1104463, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2996299266815186, + "objective/train/theoretical_loss": 3.673798104807914, + "objective/train/tokens_used": 952706528, + "theoretical_loss": 3.673798104807914, + "tokens_seen": 932246528 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036240722166499504, + "loss": 2.9647, + "theoretical_loss": 3.6737794319891313, + "tokens_seen": 932295680 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036239719157472416, + "loss": 2.7751, + "theoretical_loss": 3.673754536857494, + "tokens_seen": 932361216 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003623871614844534, + "loss": 2.8086, + "theoretical_loss": 3.673729643965612, + "tokens_seen": 932426752 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003623771313941825, + "loss": 2.5253, + "theoretical_loss": 3.6737047533131273, + "tokens_seen": 932492288 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036236710130391176, + "loss": 2.8928, + "theoretical_loss": 3.67367986489968, + "tokens_seen": 932557824 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036235707121364094, + "loss": 2.7164, + "theoretical_loss": 3.6736549787249118, + "tokens_seen": 932623360 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003623470411233701, + "loss": 2.8051, + "theoretical_loss": 3.6736300947884635, + "tokens_seen": 932688896 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003623370110330993, + "loss": 2.6297, + "theoretical_loss": 3.673605213089977, + "tokens_seen": 932754432 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003623269809428285, + "loss": 2.6382, + "theoretical_loss": 3.6735803336290935, + "tokens_seen": 932819968 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036231695085255767, + "loss": 2.8193, + "theoretical_loss": 3.673555456405455, + "tokens_seen": 932885504 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003623069207622869, + "loss": 2.6228, + "theoretical_loss": 3.673530581418703, + "tokens_seen": 932951040 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036229689067201603, + "loss": 2.6079, + "theoretical_loss": 3.673505708668479, + "tokens_seen": 933016576 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036228686058174526, + "loss": 2.7311, + "theoretical_loss": 3.6734808381544255, + "tokens_seen": 933082112 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036227683049147444, + "loss": 2.6853, + "theoretical_loss": 3.6734559698761835, + "tokens_seen": 933147648 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003622668004012036, + "loss": 2.6714, + "theoretical_loss": 3.673431103833396, + "tokens_seen": 933213184 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003622567703109328, + "loss": 2.8825, + "theoretical_loss": 3.673406240025704, + "tokens_seen": 933278720 + }, + { + "epoch": 3.01, + "learning_rate": 0.000362246740220662, + "loss": 2.5792, + "theoretical_loss": 3.6733813784527505, + "tokens_seen": 933344256 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036223671013039117, + "loss": 2.8156, + "theoretical_loss": 3.6733565191141775, + "tokens_seen": 933409792 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003622266800401204, + "loss": 2.9049, + "theoretical_loss": 3.673331662009627, + "tokens_seen": 933475328 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036221664994984953, + "loss": 2.9021, + "theoretical_loss": 3.673306807138742, + "tokens_seen": 933540864 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036220661985957877, + "loss": 2.8239, + "theoretical_loss": 3.6732819545011646, + "tokens_seen": 933606400 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003621965897693079, + "loss": 2.6474, + "theoretical_loss": 3.6732571040965376, + "tokens_seen": 933671936 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036218655967903713, + "loss": 2.8546, + "theoretical_loss": 3.6732322559245034, + "tokens_seen": 933737472 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003621765295887663, + "loss": 2.6471, + "theoretical_loss": 3.6732074099847054, + "tokens_seen": 933803008 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003621664994984955, + "loss": 2.714, + "theoretical_loss": 3.673182566276785, + "tokens_seen": 933868544 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1105263, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4603049755096436, + "objective/train/theoretical_loss": 3.6731763556984944, + "objective/train/tokens_used": 954344928, + "theoretical_loss": 3.6731763556984944, + "tokens_seen": 933884928 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036215646940822467, + "loss": 2.6152, + "theoretical_loss": 3.6731577248003866, + "tokens_seen": 933934080 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036214643931795385, + "loss": 2.7085, + "theoretical_loss": 3.6731328855551526, + "tokens_seen": 933999616 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036213640922768303, + "loss": 2.6834, + "theoretical_loss": 3.6731080485407266, + "tokens_seen": 934065152 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036212637913741227, + "loss": 2.7269, + "theoretical_loss": 3.6730832137567506, + "tokens_seen": 934130688 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003621163490471414, + "loss": 2.749, + "theoretical_loss": 3.6730583812028685, + "tokens_seen": 934196224 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036210631895687063, + "loss": 2.7608, + "theoretical_loss": 3.673033550878724, + "tokens_seen": 934261760 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003620962888665998, + "loss": 2.9078, + "theoretical_loss": 3.67300872278396, + "tokens_seen": 934327296 + }, + { + "epoch": 3.01, + "learning_rate": 0.000362086258776329, + "loss": 2.8538, + "theoretical_loss": 3.67298389691822, + "tokens_seen": 934392832 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003620762286860582, + "loss": 2.6955, + "theoretical_loss": 3.6729590732811475, + "tokens_seen": 934458368 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036206619859578736, + "loss": 2.7588, + "theoretical_loss": 3.6729342518723866, + "tokens_seen": 934523904 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036205616850551654, + "loss": 2.8656, + "theoretical_loss": 3.67290943269158, + "tokens_seen": 934589440 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003620461384152458, + "loss": 2.8229, + "theoretical_loss": 3.672884615738373, + "tokens_seen": 934654976 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003620361083249749, + "loss": 2.8307, + "theoretical_loss": 3.6728598010124083, + "tokens_seen": 934720512 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036202607823470414, + "loss": 2.7492, + "theoretical_loss": 3.6728349885133307, + "tokens_seen": 934786048 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036201604814443326, + "loss": 2.6536, + "theoretical_loss": 3.672810178240784, + "tokens_seen": 934851584 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003620060180541625, + "loss": 2.8426, + "theoretical_loss": 3.6727853701944118, + "tokens_seen": 934917120 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003619959879638917, + "loss": 2.745, + "theoretical_loss": 3.672760564373859, + "tokens_seen": 934982656 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036198595787362086, + "loss": 2.6526, + "theoretical_loss": 3.672735760778769, + "tokens_seen": 935048192 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036197592778335004, + "loss": 2.4051, + "theoretical_loss": 3.6727109594087874, + "tokens_seen": 935113728 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003619658976930792, + "loss": 2.4586, + "theoretical_loss": 3.672686160263558, + "tokens_seen": 935179264 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003619558676028084, + "loss": 2.8034, + "theoretical_loss": 3.6726613633427254, + "tokens_seen": 935244800 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036194583751253764, + "loss": 2.8574, + "theoretical_loss": 3.6726365686459346, + "tokens_seen": 935310336 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036193580742226677, + "loss": 3.0104, + "theoretical_loss": 3.6726117761728294, + "tokens_seen": 935375872 + }, + { + "epoch": 3.01, + "learning_rate": 0.000361925777331996, + "loss": 2.7326, + "theoretical_loss": 3.672586985923055, + "tokens_seen": 935441408 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003619157472417252, + "loss": 2.5144, + "theoretical_loss": 3.672562197896257, + "tokens_seen": 935506944 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1106660, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0614864826202393, + "objective/train/theoretical_loss": 3.6725560012368557, + "objective/train/tokens_used": 955983328, + "theoretical_loss": 3.6725560012368557, + "tokens_seen": 935523328 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036190571715145436, + "loss": 2.7475, + "theoretical_loss": 3.6725374120920797, + "tokens_seen": 935572480 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036189568706118354, + "loss": 2.5965, + "theoretical_loss": 3.672512628510168, + "tokens_seen": 935638016 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003618856569709127, + "loss": 2.6269, + "theoretical_loss": 3.672487847150167, + "tokens_seen": 935703552 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003618756268806419, + "loss": 2.6792, + "theoretical_loss": 3.672463068011723, + "tokens_seen": 935769088 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036186559679037114, + "loss": 2.7316, + "theoretical_loss": 3.6724382910944797, + "tokens_seen": 935834624 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003618555667001003, + "loss": 2.7486, + "theoretical_loss": 3.672413516398083, + "tokens_seen": 935900160 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003618455366098295, + "loss": 2.6199, + "theoretical_loss": 3.672388743922179, + "tokens_seen": 935965696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003618355065195587, + "loss": 2.6766, + "theoretical_loss": 3.672363973666413, + "tokens_seen": 936031232 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036182547642928787, + "loss": 2.7611, + "theoretical_loss": 3.67233920563043, + "tokens_seen": 936096768 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003618154463390171, + "loss": 2.4793, + "theoretical_loss": 3.672314439813876, + "tokens_seen": 936162304 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036180541624874623, + "loss": 2.5873, + "theoretical_loss": 3.672289676216397, + "tokens_seen": 936227840 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036179538615847546, + "loss": 2.6239, + "theoretical_loss": 3.6722649148376383, + "tokens_seen": 936293376 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036178535606820464, + "loss": 2.7087, + "theoretical_loss": 3.6722401556772466, + "tokens_seen": 936358912 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003617753259779338, + "loss": 2.7876, + "theoretical_loss": 3.672215398734867, + "tokens_seen": 936424448 + }, + { + "epoch": 3.01, + "learning_rate": 0.000361765295887663, + "loss": 2.6702, + "theoretical_loss": 3.672190644010147, + "tokens_seen": 936489984 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003617552657973922, + "loss": 2.788, + "theoretical_loss": 3.672165891502731, + "tokens_seen": 936555520 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036174523570712137, + "loss": 2.7138, + "theoretical_loss": 3.6721411412122666, + "tokens_seen": 936621056 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003617352056168506, + "loss": 2.7349, + "theoretical_loss": 3.6721163931383995, + "tokens_seen": 936686592 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036172517552657973, + "loss": 2.7485, + "theoretical_loss": 3.6720916472807765, + "tokens_seen": 936752128 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036171514543630897, + "loss": 2.8504, + "theoretical_loss": 3.6720669036390436, + "tokens_seen": 936817664 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003617051153460381, + "loss": 2.7481, + "theoretical_loss": 3.672042162212848, + "tokens_seen": 936883200 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036169508525576733, + "loss": 2.9067, + "theoretical_loss": 3.672017423001836, + "tokens_seen": 936948736 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003616850551654965, + "loss": 2.4963, + "theoretical_loss": 3.6719926860056544, + "tokens_seen": 937014272 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003616750250752257, + "loss": 2.6033, + "theoretical_loss": 3.67196795122395, + "tokens_seen": 937079808 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036166499498495487, + "loss": 2.7483, + "theoretical_loss": 3.6719432186563696, + "tokens_seen": 937145344 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1107254, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6422669887542725, + "objective/train/theoretical_loss": 3.67193703586039, + "objective/train/tokens_used": 957621728, + "theoretical_loss": 3.67193703586039, + "tokens_seen": 937161728 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036165496489468405, + "loss": 2.6973, + "theoretical_loss": 3.67191848830256, + "tokens_seen": 937210880 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036164493480441323, + "loss": 2.6796, + "theoretical_loss": 3.6718937601621695, + "tokens_seen": 937276416 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036163490471414247, + "loss": 2.6766, + "theoretical_loss": 3.6718690342348435, + "tokens_seen": 937341952 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003616248746238716, + "loss": 2.8178, + "theoretical_loss": 3.6718443105202305, + "tokens_seen": 937407488 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036161484453360083, + "loss": 2.6341, + "theoretical_loss": 3.6718195890179772, + "tokens_seen": 937473024 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036160481444333, + "loss": 2.8366, + "theoretical_loss": 3.671794869727731, + "tokens_seen": 937538560 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003615947843530592, + "loss": 2.6977, + "theoretical_loss": 3.67177015264914, + "tokens_seen": 937604096 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003615847542627884, + "loss": 2.6461, + "theoretical_loss": 3.6717454377818513, + "tokens_seen": 937669632 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036157472417251756, + "loss": 2.6455, + "theoretical_loss": 3.671720725125512, + "tokens_seen": 937735168 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036156469408224674, + "loss": 2.7254, + "theoretical_loss": 3.671696014679771, + "tokens_seen": 937800704 + }, + { + "epoch": 3.01, + "learning_rate": 0.000361554663991976, + "loss": 2.6941, + "theoretical_loss": 3.6716713064442743, + "tokens_seen": 937866240 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003615446339017051, + "loss": 2.7385, + "theoretical_loss": 3.671646600418672, + "tokens_seen": 937931776 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036153460381143434, + "loss": 2.5716, + "theoretical_loss": 3.671621896602611, + "tokens_seen": 937997312 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036152457372116346, + "loss": 2.5538, + "theoretical_loss": 3.671597194995739, + "tokens_seen": 938062848 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003615145436308927, + "loss": 2.7646, + "theoretical_loss": 3.671572495597704, + "tokens_seen": 938128384 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003615045135406219, + "loss": 2.5636, + "theoretical_loss": 3.671547798408155, + "tokens_seen": 938193920 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036149448345035106, + "loss": 2.6606, + "theoretical_loss": 3.6715231034267397, + "tokens_seen": 938259456 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036148445336008024, + "loss": 2.7052, + "theoretical_loss": 3.671498410653107, + "tokens_seen": 938324992 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003614744232698094, + "loss": 2.7681, + "theoretical_loss": 3.6714737200869045, + "tokens_seen": 938390528 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003614643931795386, + "loss": 2.3929, + "theoretical_loss": 3.6714490317277817, + "tokens_seen": 938456064 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036145436308926784, + "loss": 2.6983, + "theoretical_loss": 3.6714243455753865, + "tokens_seen": 938521600 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036144433299899697, + "loss": 2.6932, + "theoretical_loss": 3.6713996616293683, + "tokens_seen": 938587136 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003614343029087262, + "loss": 2.5358, + "theoretical_loss": 3.671374979889375, + "tokens_seen": 938652672 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003614242728184554, + "loss": 2.583, + "theoretical_loss": 3.671350300355056, + "tokens_seen": 938718208 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036141424272818456, + "loss": 2.7081, + "theoretical_loss": 3.6713256230260596, + "tokens_seen": 938783744 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1108649, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.576571464538574, + "objective/train/theoretical_loss": 3.671319454038351, + "objective/train/tokens_used": 959260128, + "theoretical_loss": 3.671319454038351, + "tokens_seen": 938800128 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036140421263791374, + "loss": 2.7355, + "theoretical_loss": 3.6713009479020355, + "tokens_seen": 938849280 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003613941825476429, + "loss": 2.7932, + "theoretical_loss": 3.6712762749826324, + "tokens_seen": 938914816 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003613841524573721, + "loss": 2.659, + "theoretical_loss": 3.6712516042675, + "tokens_seen": 938980352 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036137412236710134, + "loss": 2.4067, + "theoretical_loss": 3.6712269357562866, + "tokens_seen": 939045888 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036136409227683047, + "loss": 2.6325, + "theoretical_loss": 3.671202269448642, + "tokens_seen": 939111424 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003613540621865597, + "loss": 2.7483, + "theoretical_loss": 3.671177605344216, + "tokens_seen": 939176960 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036134403209628883, + "loss": 2.6851, + "theoretical_loss": 3.671152943442658, + "tokens_seen": 939242496 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036133400200601807, + "loss": 2.7459, + "theoretical_loss": 3.6711282837436174, + "tokens_seen": 939308032 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036132397191574725, + "loss": 2.6561, + "theoretical_loss": 3.671103626246743, + "tokens_seen": 939373568 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036131394182547643, + "loss": 2.8424, + "theoretical_loss": 3.671078970951686, + "tokens_seen": 939439104 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003613039117352056, + "loss": 2.7681, + "theoretical_loss": 3.671054317858095, + "tokens_seen": 939504640 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036129388164493484, + "loss": 2.7278, + "theoretical_loss": 3.6710296669656204, + "tokens_seen": 939570176 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036128385155466397, + "loss": 2.9318, + "theoretical_loss": 3.6710050182739127, + "tokens_seen": 939635712 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003612738214643932, + "loss": 2.843, + "theoretical_loss": 3.6709803717826213, + "tokens_seen": 939701248 + }, + { + "epoch": 3.01, + "learning_rate": 0.00036126379137412233, + "loss": 2.7645, + "theoretical_loss": 3.670955727491396, + "tokens_seen": 939766784 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036125376128385157, + "loss": 2.7355, + "theoretical_loss": 3.6709310853998876, + "tokens_seen": 939832320 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036124373119358075, + "loss": 2.8614, + "theoretical_loss": 3.6709064455077467, + "tokens_seen": 939897856 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036123370110330993, + "loss": 2.78, + "theoretical_loss": 3.670881807814623, + "tokens_seen": 939963392 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003612236710130391, + "loss": 2.6633, + "theoretical_loss": 3.6708571723201664, + "tokens_seen": 940028928 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003612136409227683, + "loss": 2.7481, + "theoretical_loss": 3.670832539024029, + "tokens_seen": 940094464 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003612036108324975, + "loss": 2.7512, + "theoretical_loss": 3.6708079079258598, + "tokens_seen": 940160000 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003611935807422267, + "loss": 2.5802, + "theoretical_loss": 3.6707832790253105, + "tokens_seen": 940225536 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036118355065195584, + "loss": 2.8009, + "theoretical_loss": 3.670758652322032, + "tokens_seen": 940291072 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036117352056168507, + "loss": 2.8288, + "theoretical_loss": 3.6707340278156746, + "tokens_seen": 940356608 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003611634904714142, + "loss": 2.9308, + "theoretical_loss": 3.6707094055058898, + "tokens_seen": 940422144 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1109497, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9072909355163574, + "objective/train/theoretical_loss": 3.670703250271617, + "objective/train/tokens_used": 960898528, + "theoretical_loss": 3.670703250271617, + "tokens_seen": 940438528 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036115346038114343, + "loss": 2.8859, + "theoretical_loss": 3.6706847853923277, + "tokens_seen": 940487680 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003611434302908726, + "loss": 2.8009, + "theoretical_loss": 3.6706601674746397, + "tokens_seen": 940553216 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003611334002006018, + "loss": 2.7237, + "theoretical_loss": 3.6706355517524774, + "tokens_seen": 940618752 + }, + { + "epoch": 3.02, + "learning_rate": 0.000361123370110331, + "loss": 2.5759, + "theoretical_loss": 3.670610938225492, + "tokens_seen": 940684288 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003611133400200602, + "loss": 2.6999, + "theoretical_loss": 3.6705863268933347, + "tokens_seen": 940749824 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003611033099297894, + "loss": 2.6335, + "theoretical_loss": 3.6705617177556564, + "tokens_seen": 940815360 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003610932798395186, + "loss": 2.9326, + "theoretical_loss": 3.670537110812109, + "tokens_seen": 940880896 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036108324974924776, + "loss": 2.7079, + "theoretical_loss": 3.6705125060623445, + "tokens_seen": 940946432 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036107321965897694, + "loss": 2.8174, + "theoretical_loss": 3.670487903506014, + "tokens_seen": 941011968 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003610631895687062, + "loss": 2.6992, + "theoretical_loss": 3.6704633031427694, + "tokens_seen": 941077504 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003610531594784353, + "loss": 2.5846, + "theoretical_loss": 3.6704387049722618, + "tokens_seen": 941143040 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036104312938816454, + "loss": 2.5115, + "theoretical_loss": 3.6704141089941444, + "tokens_seen": 941208576 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036103309929789366, + "loss": 2.8338, + "theoretical_loss": 3.6703895152080683, + "tokens_seen": 941274112 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003610230692076229, + "loss": 2.7403, + "theoretical_loss": 3.670364923613686, + "tokens_seen": 941339648 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003610130391173521, + "loss": 2.7208, + "theoretical_loss": 3.670340334210649, + "tokens_seen": 941405184 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036100300902708126, + "loss": 2.8817, + "theoretical_loss": 3.67031574699861, + "tokens_seen": 941470720 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036099297893681044, + "loss": 2.4179, + "theoretical_loss": 3.6702911619772207, + "tokens_seen": 941536256 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003609829488465396, + "loss": 2.5508, + "theoretical_loss": 3.6702665791461344, + "tokens_seen": 941601792 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003609729187562688, + "loss": 2.8291, + "theoretical_loss": 3.670241998505003, + "tokens_seen": 941667328 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036096288866599804, + "loss": 2.8138, + "theoretical_loss": 3.670217420053479, + "tokens_seen": 941732864 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036095285857572717, + "loss": 2.5917, + "theoretical_loss": 3.670192843791215, + "tokens_seen": 941798400 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003609428284854564, + "loss": 2.6289, + "theoretical_loss": 3.6701682697178635, + "tokens_seen": 941863936 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003609327983951856, + "loss": 2.6654, + "theoretical_loss": 3.670143697833078, + "tokens_seen": 941929472 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036092276830491476, + "loss": 2.7178, + "theoretical_loss": 3.67011912813651, + "tokens_seen": 941995008 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036091273821464394, + "loss": 2.6515, + "theoretical_loss": 3.670094560627814, + "tokens_seen": 942060544 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1110990, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0038723945617676, + "objective/train/theoretical_loss": 3.6700884190924543, + "objective/train/tokens_used": 962536928, + "theoretical_loss": 3.6700884190924543, + "tokens_seen": 942076928 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003609027081243731, + "loss": 2.5359, + "theoretical_loss": 3.6700699953066422, + "tokens_seen": 942126080 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003608926780341023, + "loss": 2.7436, + "theoretical_loss": 3.670045432172647, + "tokens_seen": 942191616 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036088264794383154, + "loss": 2.8468, + "theoretical_loss": 3.670020871225483, + "tokens_seen": 942257152 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036087261785356067, + "loss": 2.7068, + "theoretical_loss": 3.669996312464802, + "tokens_seen": 942322688 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003608625877632899, + "loss": 2.7132, + "theoretical_loss": 3.6699717558902583, + "tokens_seen": 942388224 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036085255767301903, + "loss": 2.9056, + "theoretical_loss": 3.669947201501505, + "tokens_seen": 942453760 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036084252758274827, + "loss": 2.6841, + "theoretical_loss": 3.6699226492981953, + "tokens_seen": 942519296 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036083249749247745, + "loss": 2.6888, + "theoretical_loss": 3.6698980992799832, + "tokens_seen": 942584832 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036082246740220663, + "loss": 2.6369, + "theoretical_loss": 3.669873551446522, + "tokens_seen": 942650368 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003608124373119358, + "loss": 2.7913, + "theoretical_loss": 3.6698490057974658, + "tokens_seen": 942715904 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036080240722166505, + "loss": 2.7292, + "theoretical_loss": 3.6698244623324676, + "tokens_seen": 942781440 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036079237713139417, + "loss": 2.8, + "theoretical_loss": 3.6697999210511822, + "tokens_seen": 942846976 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003607823470411234, + "loss": 2.492, + "theoretical_loss": 3.6697753819532624, + "tokens_seen": 942912512 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036077231695085253, + "loss": 2.8689, + "theoretical_loss": 3.6697508450383634, + "tokens_seen": 942978048 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036076228686058177, + "loss": 2.7812, + "theoretical_loss": 3.6697263103061393, + "tokens_seen": 943043584 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036075225677031095, + "loss": 2.5538, + "theoretical_loss": 3.6697017777562433, + "tokens_seen": 943109120 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036074222668004013, + "loss": 2.7327, + "theoretical_loss": 3.6696772473883303, + "tokens_seen": 943174656 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003607321965897693, + "loss": 2.7159, + "theoretical_loss": 3.6696527192020545, + "tokens_seen": 943240192 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003607221664994985, + "loss": 2.5015, + "theoretical_loss": 3.66962819319707, + "tokens_seen": 943305728 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003607121364092277, + "loss": 2.6532, + "theoretical_loss": 3.6696036693730316, + "tokens_seen": 943371264 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003607021063189569, + "loss": 2.7995, + "theoretical_loss": 3.669579147729594, + "tokens_seen": 943436800 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036069207622868604, + "loss": 2.6887, + "theoretical_loss": 3.6695546282664115, + "tokens_seen": 943502336 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036068204613841527, + "loss": 2.9105, + "theoretical_loss": 3.669530110983139, + "tokens_seen": 943567872 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003606720160481444, + "loss": 2.4251, + "theoretical_loss": 3.669505595879432, + "tokens_seen": 943633408 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036066198595787364, + "loss": 2.7481, + "theoretical_loss": 3.6694810829549445, + "tokens_seen": 943698944 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1111689, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0604324340820312, + "objective/train/theoretical_loss": 3.669474955064285, + "objective/train/tokens_used": 964175328, + "theoretical_loss": 3.669474955064285, + "tokens_seen": 943715328 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003606519558676028, + "loss": 2.7923, + "theoretical_loss": 3.6694565722093317, + "tokens_seen": 943764480 + }, + { + "epoch": 3.02, + "learning_rate": 0.000360641925777332, + "loss": 2.8104, + "theoretical_loss": 3.6694320636422484, + "tokens_seen": 943830016 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003606318956870612, + "loss": 2.9466, + "theoretical_loss": 3.66940755725335, + "tokens_seen": 943895552 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003606218655967904, + "loss": 2.6342, + "theoretical_loss": 3.669383053042292, + "tokens_seen": 943961088 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036061183550651954, + "loss": 2.73, + "theoretical_loss": 3.6693585510087288, + "tokens_seen": 944026624 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003606018054162488, + "loss": 2.8563, + "theoretical_loss": 3.6693340511523167, + "tokens_seen": 944092160 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003605917753259779, + "loss": 2.6699, + "theoretical_loss": 3.6693095534727105, + "tokens_seen": 944157696 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036058174523570714, + "loss": 2.7349, + "theoretical_loss": 3.6692850579695664, + "tokens_seen": 944223232 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003605717151454363, + "loss": 2.8572, + "theoretical_loss": 3.6692605646425394, + "tokens_seen": 944288768 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003605616850551655, + "loss": 2.6415, + "theoretical_loss": 3.6692360734912848, + "tokens_seen": 944354304 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003605516549648947, + "loss": 2.7184, + "theoretical_loss": 3.6692115845154594, + "tokens_seen": 944419840 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036054162487462386, + "loss": 2.7668, + "theoretical_loss": 3.669187097714718, + "tokens_seen": 944485376 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036053159478435304, + "loss": 2.4856, + "theoretical_loss": 3.6691626130887176, + "tokens_seen": 944550912 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003605215646940823, + "loss": 2.7285, + "theoretical_loss": 3.669138130637114, + "tokens_seen": 944616448 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003605115346038114, + "loss": 2.8301, + "theoretical_loss": 3.669113650359562, + "tokens_seen": 944681984 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036050150451354064, + "loss": 2.9229, + "theoretical_loss": 3.6690891722557186, + "tokens_seen": 944747520 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036049147442326977, + "loss": 2.8787, + "theoretical_loss": 3.6690646963252402, + "tokens_seen": 944813056 + }, + { + "epoch": 3.02, + "learning_rate": 0.000360481444332999, + "loss": 2.8984, + "theoretical_loss": 3.6690402225677827, + "tokens_seen": 944878592 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003604714142427282, + "loss": 2.8173, + "theoretical_loss": 3.6690157509830033, + "tokens_seen": 944944128 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036046138415245737, + "loss": 2.3796, + "theoretical_loss": 3.668991281570557, + "tokens_seen": 945009664 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036045135406218655, + "loss": 2.8952, + "theoretical_loss": 3.668966814330102, + "tokens_seen": 945075200 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003604413239719158, + "loss": 2.8054, + "theoretical_loss": 3.668942349261294, + "tokens_seen": 945140736 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003604312938816449, + "loss": 2.7728, + "theoretical_loss": 3.6689178863637895, + "tokens_seen": 945206272 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036042126379137414, + "loss": 2.8421, + "theoretical_loss": 3.668893425637245, + "tokens_seen": 945271808 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036041123370110327, + "loss": 2.74, + "theoretical_loss": 3.668868967081319, + "tokens_seen": 945337344 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1112870, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6617534160614014, + "objective/train/theoretical_loss": 3.668862852781456, + "objective/train/tokens_used": 965813728, + "theoretical_loss": 3.668862852781456, + "tokens_seen": 945353728 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003604012036108325, + "loss": 2.8314, + "theoretical_loss": 3.6688445106956666, + "tokens_seen": 945402880 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003603911735205617, + "loss": 2.5809, + "theoretical_loss": 3.668820056479946, + "tokens_seen": 945468416 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036038114343029087, + "loss": 2.5361, + "theoretical_loss": 3.6687956044338135, + "tokens_seen": 945533952 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036037111334002005, + "loss": 2.7613, + "theoretical_loss": 3.6687711545569264, + "tokens_seen": 945599488 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036036108324974923, + "loss": 2.7731, + "theoretical_loss": 3.668746706848942, + "tokens_seen": 945665024 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036035105315947847, + "loss": 2.8019, + "theoretical_loss": 3.668722261309518, + "tokens_seen": 945730560 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036034102306920765, + "loss": 2.8439, + "theoretical_loss": 3.6686978179383116, + "tokens_seen": 945796096 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036033099297893683, + "loss": 2.8839, + "theoretical_loss": 3.6686733767349797, + "tokens_seen": 945861632 + }, + { + "epoch": 3.02, + "learning_rate": 0.000360320962888666, + "loss": 2.5422, + "theoretical_loss": 3.6686489376991807, + "tokens_seen": 945927168 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036031093279839525, + "loss": 2.8942, + "theoretical_loss": 3.668624500830571, + "tokens_seen": 945992704 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036030090270812437, + "loss": 2.5538, + "theoretical_loss": 3.6686000661288105, + "tokens_seen": 946058240 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003602908726178536, + "loss": 2.7451, + "theoretical_loss": 3.6685756335935547, + "tokens_seen": 946123776 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036028084252758273, + "loss": 2.7525, + "theoretical_loss": 3.668551203224463, + "tokens_seen": 946189312 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036027081243731197, + "loss": 2.7951, + "theoretical_loss": 3.668526775021192, + "tokens_seen": 946254848 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036026078234704115, + "loss": 2.6365, + "theoretical_loss": 3.6685023489834006, + "tokens_seen": 946320384 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036025075225677033, + "loss": 2.812, + "theoretical_loss": 3.6684779251107464, + "tokens_seen": 946385920 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003602407221664995, + "loss": 2.5312, + "theoretical_loss": 3.6684535034028887, + "tokens_seen": 946451456 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003602306920762287, + "loss": 2.7091, + "theoretical_loss": 3.668429083859484, + "tokens_seen": 946516992 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003602206619859579, + "loss": 2.9199, + "theoretical_loss": 3.668404666480192, + "tokens_seen": 946582528 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003602106318956871, + "loss": 2.807, + "theoretical_loss": 3.6683802512646704, + "tokens_seen": 946648064 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036020060180541624, + "loss": 2.5709, + "theoretical_loss": 3.6683558382125785, + "tokens_seen": 946713600 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003601905717151455, + "loss": 2.6406, + "theoretical_loss": 3.6683314273235736, + "tokens_seen": 946779136 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003601805416248746, + "loss": 2.895, + "theoretical_loss": 3.6683070185973152, + "tokens_seen": 946844672 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036017051153460384, + "loss": 2.7009, + "theoretical_loss": 3.668282612033462, + "tokens_seen": 946910208 + }, + { + "epoch": 3.02, + "learning_rate": 0.000360160481444333, + "loss": 2.796, + "theoretical_loss": 3.6682582076316725, + "tokens_seen": 946975744 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1113638, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.09346079826355, + "objective/train/theoretical_loss": 3.6682521068690077, + "objective/train/tokens_used": 967452128, + "theoretical_loss": 3.6682521068690077, + "tokens_seen": 946992128 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003601504513540622, + "loss": 2.9157, + "theoretical_loss": 3.6682338053916057, + "tokens_seen": 947041280 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003601404212637914, + "loss": 2.5278, + "theoretical_loss": 3.6682094053129206, + "tokens_seen": 947106816 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003601303911735206, + "loss": 2.7553, + "theoretical_loss": 3.6681850073952758, + "tokens_seen": 947172352 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036012036108324974, + "loss": 2.7387, + "theoretical_loss": 3.6681606116383314, + "tokens_seen": 947237888 + }, + { + "epoch": 3.02, + "learning_rate": 0.000360110330992979, + "loss": 2.6852, + "theoretical_loss": 3.668136218041745, + "tokens_seen": 947303424 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003601003009027081, + "loss": 2.4123, + "theoretical_loss": 3.6681118266051778, + "tokens_seen": 947368960 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036009027081243734, + "loss": 2.7635, + "theoretical_loss": 3.6680874373282872, + "tokens_seen": 947434496 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003600802407221665, + "loss": 2.7806, + "theoretical_loss": 3.6680630502107334, + "tokens_seen": 947500032 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003600702106318957, + "loss": 2.7379, + "theoretical_loss": 3.668038665252177, + "tokens_seen": 947565568 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003600601805416249, + "loss": 2.7847, + "theoretical_loss": 3.668014282452276, + "tokens_seen": 947631104 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036005015045135406, + "loss": 2.6898, + "theoretical_loss": 3.6679899018106905, + "tokens_seen": 947696640 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036004012036108324, + "loss": 2.7477, + "theoretical_loss": 3.66796552332708, + "tokens_seen": 947762176 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003600300902708125, + "loss": 2.9667, + "theoretical_loss": 3.667941147001105, + "tokens_seen": 947827712 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003600200601805416, + "loss": 2.6944, + "theoretical_loss": 3.6679167728324247, + "tokens_seen": 947893248 + }, + { + "epoch": 3.02, + "learning_rate": 0.00036001003009027084, + "loss": 2.8867, + "theoretical_loss": 3.6678924008206995, + "tokens_seen": 947958784 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035999999999999997, + "loss": 2.6801, + "theoretical_loss": 3.6678680309655896, + "tokens_seen": 948024320 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003599899699097292, + "loss": 2.6097, + "theoretical_loss": 3.667843663266754, + "tokens_seen": 948089856 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003599799398194584, + "loss": 2.7772, + "theoretical_loss": 3.667819297723854, + "tokens_seen": 948155392 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035996990972918757, + "loss": 2.722, + "theoretical_loss": 3.6677949343365492, + "tokens_seen": 948220928 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035995987963891675, + "loss": 2.6455, + "theoretical_loss": 3.6677705731045007, + "tokens_seen": 948286464 + }, + { + "epoch": 3.02, + "learning_rate": 0.000359949849548646, + "loss": 2.7656, + "theoretical_loss": 3.6677462140273684, + "tokens_seen": 948352000 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003599398194583751, + "loss": 2.8047, + "theoretical_loss": 3.6677218571048122, + "tokens_seen": 948417536 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035992978936810434, + "loss": 2.7955, + "theoretical_loss": 3.6676975023364937, + "tokens_seen": 948483072 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035991975927783347, + "loss": 2.789, + "theoretical_loss": 3.667673149722073, + "tokens_seen": 948548608 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003599097291875627, + "loss": 2.9306, + "theoretical_loss": 3.667648799261211, + "tokens_seen": 948614144 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1115063, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7750394344329834, + "objective/train/theoretical_loss": 3.6676427119824493, + "objective/train/tokens_used": 969090528, + "theoretical_loss": 3.6676427119824493, + "tokens_seen": 948630528 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003598996990972919, + "loss": 2.7399, + "theoretical_loss": 3.6676244509535687, + "tokens_seen": 948679680 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035988966900702107, + "loss": 2.7975, + "theoretical_loss": 3.6676001047988063, + "tokens_seen": 948745216 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035987963891675025, + "loss": 2.7424, + "theoretical_loss": 3.667575760796585, + "tokens_seen": 948810752 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035986960882647943, + "loss": 2.9323, + "theoretical_loss": 3.667551418946566, + "tokens_seen": 948876288 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003598595787362086, + "loss": 2.8068, + "theoretical_loss": 3.6675270792484107, + "tokens_seen": 948941824 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035984954864593785, + "loss": 2.9573, + "theoretical_loss": 3.66750274170178, + "tokens_seen": 949007360 + }, + { + "epoch": 3.02, + "learning_rate": 0.000359839518555667, + "loss": 2.6737, + "theoretical_loss": 3.6674784063063344, + "tokens_seen": 949072896 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003598294884653962, + "loss": 2.7313, + "theoretical_loss": 3.6674540730617364, + "tokens_seen": 949138432 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035981945837512534, + "loss": 2.6642, + "theoretical_loss": 3.6674297419676476, + "tokens_seen": 949203968 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035980942828485457, + "loss": 2.8812, + "theoretical_loss": 3.6674054130237277, + "tokens_seen": 949269504 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035979939819458375, + "loss": 2.6248, + "theoretical_loss": 3.66738108622964, + "tokens_seen": 949335040 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035978936810431293, + "loss": 2.7558, + "theoretical_loss": 3.6673567615850455, + "tokens_seen": 949400576 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003597793380140421, + "loss": 2.6226, + "theoretical_loss": 3.6673324390896056, + "tokens_seen": 949466112 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035976930792377135, + "loss": 2.5855, + "theoretical_loss": 3.667308118742983, + "tokens_seen": 949531648 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003597592778335005, + "loss": 2.7705, + "theoretical_loss": 3.6672838005448387, + "tokens_seen": 949597184 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003597492477432297, + "loss": 2.692, + "theoretical_loss": 3.667259484494835, + "tokens_seen": 949662720 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035973921765295884, + "loss": 2.8313, + "theoretical_loss": 3.667235170592634, + "tokens_seen": 949728256 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003597291875626881, + "loss": 2.8125, + "theoretical_loss": 3.667210858837897, + "tokens_seen": 949793792 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035971915747241726, + "loss": 2.6625, + "theoretical_loss": 3.6671865492302875, + "tokens_seen": 949859328 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035970912738214644, + "loss": 2.6021, + "theoretical_loss": 3.667162241769467, + "tokens_seen": 949924864 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003596990972918756, + "loss": 2.829, + "theoretical_loss": 3.667137936455098, + "tokens_seen": 949990400 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003596890672016048, + "loss": 2.7101, + "theoretical_loss": 3.667113633286842, + "tokens_seen": 950055936 + }, + { + "epoch": 3.02, + "learning_rate": 0.000359679037111334, + "loss": 2.6948, + "theoretical_loss": 3.6670893322643634, + "tokens_seen": 950121472 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003596690070210632, + "loss": 2.716, + "theoretical_loss": 3.667065033387323, + "tokens_seen": 950187008 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035965897693079234, + "loss": 2.7286, + "theoretical_loss": 3.6670407366553848, + "tokens_seen": 950252544 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1115859, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0115370750427246, + "objective/train/theoretical_loss": 3.6670346628075325, + "objective/train/tokens_used": 970728928, + "theoretical_loss": 3.6670346628075325, + "tokens_seen": 950268928 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003596489468405216, + "loss": 2.8904, + "theoretical_loss": 3.66701644206821, + "tokens_seen": 950318080 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035963891675025076, + "loss": 2.7355, + "theoretical_loss": 3.6669921496254627, + "tokens_seen": 950383616 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035962888665997994, + "loss": 2.6775, + "theoretical_loss": 3.666967859326805, + "tokens_seen": 950449152 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003596188565697091, + "loss": 2.7278, + "theoretical_loss": 3.6669435711719, + "tokens_seen": 950514688 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003596088264794383, + "loss": 2.8868, + "theoretical_loss": 3.6669192851604113, + "tokens_seen": 950580224 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035959879638916754, + "loss": 2.7949, + "theoretical_loss": 3.666895001292001, + "tokens_seen": 950645760 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003595887662988967, + "loss": 2.8372, + "theoretical_loss": 3.666870719566333, + "tokens_seen": 950711296 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003595787362086259, + "loss": 2.6296, + "theoretical_loss": 3.66684643998307, + "tokens_seen": 950776832 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003595687061183551, + "loss": 2.5486, + "theoretical_loss": 3.6668221625418758, + "tokens_seen": 950842368 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035955867602808426, + "loss": 2.7116, + "theoretical_loss": 3.666797887242414, + "tokens_seen": 950907904 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035954864593781344, + "loss": 2.6305, + "theoretical_loss": 3.6667736140843474, + "tokens_seen": 950973440 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003595386158475427, + "loss": 2.7846, + "theoretical_loss": 3.6667493430673397, + "tokens_seen": 951038976 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003595285857572718, + "loss": 2.7952, + "theoretical_loss": 3.6667250741910546, + "tokens_seen": 951104512 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035951855566700104, + "loss": 2.8643, + "theoretical_loss": 3.666700807455156, + "tokens_seen": 951170048 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035950852557673017, + "loss": 2.6962, + "theoretical_loss": 3.6666765428593076, + "tokens_seen": 951235584 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003594984954864594, + "loss": 2.7865, + "theoretical_loss": 3.6666522804031736, + "tokens_seen": 951301120 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003594884653961886, + "loss": 2.5407, + "theoretical_loss": 3.666628020086417, + "tokens_seen": 951366656 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035947843530591777, + "loss": 2.3828, + "theoretical_loss": 3.6666037619087026, + "tokens_seen": 951432192 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035946840521564695, + "loss": 2.8197, + "theoretical_loss": 3.6665795058696937, + "tokens_seen": 951497728 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003594583751253762, + "loss": 2.7182, + "theoretical_loss": 3.666555251969055, + "tokens_seen": 951563264 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003594483450351053, + "loss": 2.729, + "theoretical_loss": 3.6665310002064513, + "tokens_seen": 951628800 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035943831494483454, + "loss": 2.6999, + "theoretical_loss": 3.6665067505815454, + "tokens_seen": 951694336 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035942828485456367, + "loss": 2.7045, + "theoretical_loss": 3.6664825030940036, + "tokens_seen": 951759872 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003594182547642929, + "loss": 2.5441, + "theoretical_loss": 3.6664582577434888, + "tokens_seen": 951825408 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003594082246740221, + "loss": 2.7172, + "theoretical_loss": 3.666434014529666, + "tokens_seen": 951890944 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1116983, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5958456993103027, + "objective/train/theoretical_loss": 3.666427954060029, + "objective/train/tokens_used": 972367328, + "theoretical_loss": 3.666427954060029, + "tokens_seen": 951907328 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035939819458375127, + "loss": 2.7405, + "theoretical_loss": 3.6664097734522, + "tokens_seen": 951956480 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035938816449348045, + "loss": 2.8947, + "theoretical_loss": 3.666385534510755, + "tokens_seen": 952022016 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035937813440320963, + "loss": 2.7593, + "theoretical_loss": 3.666361297704997, + "tokens_seen": 952087552 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003593681043129388, + "loss": 2.8236, + "theoretical_loss": 3.666337063034589, + "tokens_seen": 952153088 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035935807422266805, + "loss": 2.4366, + "theoretical_loss": 3.666312830499197, + "tokens_seen": 952218624 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003593480441323972, + "loss": 2.8637, + "theoretical_loss": 3.6662886000984862, + "tokens_seen": 952284160 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003593380140421264, + "loss": 2.8358, + "theoretical_loss": 3.6662643718321215, + "tokens_seen": 952349696 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035932798395185554, + "loss": 2.7572, + "theoretical_loss": 3.6662401456997675, + "tokens_seen": 952415232 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035931795386158477, + "loss": 2.8309, + "theoretical_loss": 3.66621592170109, + "tokens_seen": 952480768 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035930792377131395, + "loss": 3.0672, + "theoretical_loss": 3.6661916998357538, + "tokens_seen": 952546304 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035929789368104313, + "loss": 2.6583, + "theoretical_loss": 3.6661674801034243, + "tokens_seen": 952611840 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003592878635907723, + "loss": 2.6992, + "theoretical_loss": 3.666143262503768, + "tokens_seen": 952677376 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035927783350050155, + "loss": 2.7778, + "theoretical_loss": 3.666119047036449, + "tokens_seen": 952742912 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003592678034102307, + "loss": 2.7916, + "theoretical_loss": 3.6660948337011336, + "tokens_seen": 952808448 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003592577733199599, + "loss": 2.6527, + "theoretical_loss": 3.6660706224974877, + "tokens_seen": 952873984 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035924774322968904, + "loss": 2.7234, + "theoretical_loss": 3.666046413425176, + "tokens_seen": 952939520 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003592377131394183, + "loss": 2.7744, + "theoretical_loss": 3.6660222064838655, + "tokens_seen": 953005056 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035922768304914746, + "loss": 2.608, + "theoretical_loss": 3.665998001673221, + "tokens_seen": 953070592 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035921765295887664, + "loss": 2.6749, + "theoretical_loss": 3.66597379899291, + "tokens_seen": 953136128 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003592076228686058, + "loss": 2.5654, + "theoretical_loss": 3.665949598442597, + "tokens_seen": 953201664 + }, + { + "epoch": 3.02, + "learning_rate": 0.000359197592778335, + "loss": 2.7041, + "theoretical_loss": 3.665925400021949, + "tokens_seen": 953267200 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003591875626880642, + "loss": 2.906, + "theoretical_loss": 3.6659012037306318, + "tokens_seen": 953332736 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003591775325977934, + "loss": 2.7529, + "theoretical_loss": 3.665877009568312, + "tokens_seen": 953398272 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035916750250752254, + "loss": 2.8305, + "theoretical_loss": 3.665852817534655, + "tokens_seen": 953463808 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003591574724172518, + "loss": 2.9454, + "theoretical_loss": 3.6658286276293293, + "tokens_seen": 953529344 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1117727, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.588460683822632, + "objective/train/theoretical_loss": 3.66582258048551, + "objective/train/tokens_used": 974005728, + "theoretical_loss": 3.66582258048551, + "tokens_seen": 953545728 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035914744232698096, + "loss": 2.6301, + "theoretical_loss": 3.6658044398519993, + "tokens_seen": 953594880 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035913741223671014, + "loss": 2.8647, + "theoretical_loss": 3.6657802542023323, + "tokens_seen": 953660416 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003591273821464393, + "loss": 2.7181, + "theoretical_loss": 3.665756070679995, + "tokens_seen": 953725952 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003591173520561685, + "loss": 2.856, + "theoretical_loss": 3.6657318892846544, + "tokens_seen": 953791488 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003591073219658977, + "loss": 2.7305, + "theoretical_loss": 3.665707710015977, + "tokens_seen": 953857024 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003590972918756269, + "loss": 2.9282, + "theoretical_loss": 3.6656835328736292, + "tokens_seen": 953922560 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035908726178535605, + "loss": 2.6648, + "theoretical_loss": 3.6656593578572787, + "tokens_seen": 953988096 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003590772316950853, + "loss": 2.7987, + "theoretical_loss": 3.6656351849665922, + "tokens_seen": 954053632 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003590672016048144, + "loss": 2.4934, + "theoretical_loss": 3.665611014201237, + "tokens_seen": 954119168 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035905717151454364, + "loss": 2.6069, + "theoretical_loss": 3.66558684556088, + "tokens_seen": 954184704 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003590471414242728, + "loss": 2.8056, + "theoretical_loss": 3.665562679045189, + "tokens_seen": 954250240 + }, + { + "epoch": 3.02, + "learning_rate": 0.000359037111334002, + "loss": 2.9904, + "theoretical_loss": 3.6655385146538304, + "tokens_seen": 954315776 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003590270812437312, + "loss": 2.6449, + "theoretical_loss": 3.665514352386472, + "tokens_seen": 954381312 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035901705115346037, + "loss": 2.7939, + "theoretical_loss": 3.665490192242782, + "tokens_seen": 954446848 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035900702106318955, + "loss": 2.7474, + "theoretical_loss": 3.6654660342224266, + "tokens_seen": 954512384 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003589969909729188, + "loss": 2.6304, + "theoretical_loss": 3.6654418783250744, + "tokens_seen": 954577920 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003589869608826479, + "loss": 2.6741, + "theoretical_loss": 3.6654177245503927, + "tokens_seen": 954643456 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035897693079237715, + "loss": 2.6727, + "theoretical_loss": 3.66539357289805, + "tokens_seen": 954708992 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035896690070210633, + "loss": 2.6506, + "theoretical_loss": 3.6653694233677125, + "tokens_seen": 954774528 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003589568706118355, + "loss": 2.7298, + "theoretical_loss": 3.6653452759590497, + "tokens_seen": 954840064 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003589468405215647, + "loss": 2.7313, + "theoretical_loss": 3.665321130671729, + "tokens_seen": 954905600 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035893681043129387, + "loss": 2.9473, + "theoretical_loss": 3.665296987505419, + "tokens_seen": 954971136 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035892678034102305, + "loss": 2.7933, + "theoretical_loss": 3.6652728464597866, + "tokens_seen": 955036672 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003589167502507523, + "loss": 2.6645, + "theoretical_loss": 3.6652487075345013, + "tokens_seen": 955102208 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003589067201604814, + "loss": 2.605, + "theoretical_loss": 3.6652245707292304, + "tokens_seen": 955167744 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1119307, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7554943561553955, + "objective/train/theoretical_loss": 3.6652185368591264, + "objective/train/tokens_used": 975644128, + "theoretical_loss": 3.6652185368591264, + "tokens_seen": 955184128 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035889669007021065, + "loss": 2.7885, + "theoretical_loss": 3.6652004360436434, + "tokens_seen": 955233280 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003588866599799398, + "loss": 2.7021, + "theoretical_loss": 3.6651763034774074, + "tokens_seen": 955298816 + }, + { + "epoch": 3.02, + "learning_rate": 0.000358876629889669, + "loss": 2.7756, + "theoretical_loss": 3.665152173030192, + "tokens_seen": 955364352 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003588665997993982, + "loss": 2.6781, + "theoretical_loss": 3.6651280447016656, + "tokens_seen": 955429888 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003588565697091274, + "loss": 2.8244, + "theoretical_loss": 3.6651039184914964, + "tokens_seen": 955495424 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003588465396188566, + "loss": 2.6043, + "theoretical_loss": 3.665079794399354, + "tokens_seen": 955560960 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035883650952858574, + "loss": 2.5664, + "theoretical_loss": 3.6650556724249057, + "tokens_seen": 955626496 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035882647943831497, + "loss": 2.6075, + "theoretical_loss": 3.665031552567822, + "tokens_seen": 955692032 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035881644934804415, + "loss": 2.5505, + "theoretical_loss": 3.665007434827771, + "tokens_seen": 955757568 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035880641925777333, + "loss": 2.5502, + "theoretical_loss": 3.664983319204422, + "tokens_seen": 955823104 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003587963891675025, + "loss": 2.5772, + "theoretical_loss": 3.664959205697444, + "tokens_seen": 955888640 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035878635907723175, + "loss": 2.7139, + "theoretical_loss": 3.664935094306506, + "tokens_seen": 955954176 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003587763289869609, + "loss": 2.8381, + "theoretical_loss": 3.6649109850312778, + "tokens_seen": 956019712 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003587662988966901, + "loss": 2.5263, + "theoretical_loss": 3.664886877871428, + "tokens_seen": 956085248 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035875626880641924, + "loss": 2.7676, + "theoretical_loss": 3.664862772826627, + "tokens_seen": 956150784 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003587462387161485, + "loss": 2.7778, + "theoretical_loss": 3.6648386698965436, + "tokens_seen": 956216320 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035873620862587766, + "loss": 2.637, + "theoretical_loss": 3.664814569080847, + "tokens_seen": 956281856 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035872617853560684, + "loss": 2.7548, + "theoretical_loss": 3.664790470379208, + "tokens_seen": 956347392 + }, + { + "epoch": 3.02, + "learning_rate": 0.000358716148445336, + "loss": 2.8223, + "theoretical_loss": 3.6647663737912954, + "tokens_seen": 956412928 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003587061183550652, + "loss": 2.8319, + "theoretical_loss": 3.664742279316779, + "tokens_seen": 956478464 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003586960882647944, + "loss": 2.6764, + "theoretical_loss": 3.664718186955329, + "tokens_seen": 956544000 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003586860581745236, + "loss": 2.7721, + "theoretical_loss": 3.6646940967066155, + "tokens_seen": 956609536 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035867602808425274, + "loss": 2.5923, + "theoretical_loss": 3.664670008570308, + "tokens_seen": 956675072 + }, + { + "epoch": 3.02, + "learning_rate": 0.000358665997993982, + "loss": 2.7841, + "theoretical_loss": 3.664645922546077, + "tokens_seen": 956740608 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035865596790371116, + "loss": 2.5776, + "theoretical_loss": 3.6646218386335923, + "tokens_seen": 956806144 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1120060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.80765438079834, + "objective/train/theoretical_loss": 3.6646158179853927, + "objective/train/tokens_used": 977282528, + "theoretical_loss": 3.6646158179853927, + "tokens_seen": 956822528 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035864593781344034, + "loss": 2.6522, + "theoretical_loss": 3.6645977568325243, + "tokens_seen": 956871680 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003586359077231695, + "loss": 2.8831, + "theoretical_loss": 3.664573677142543, + "tokens_seen": 956937216 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003586258776328987, + "loss": 2.4891, + "theoretical_loss": 3.664549599563319, + "tokens_seen": 957002752 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003586158475426279, + "loss": 2.6942, + "theoretical_loss": 3.6645255240945236, + "tokens_seen": 957068288 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003586058174523571, + "loss": 2.6859, + "theoretical_loss": 3.664501450735826, + "tokens_seen": 957133824 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035859578736208625, + "loss": 2.7707, + "theoretical_loss": 3.6644773794868972, + "tokens_seen": 957199360 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003585857572718155, + "loss": 2.6198, + "theoretical_loss": 3.664453310347408, + "tokens_seen": 957264896 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003585757271815446, + "loss": 2.8187, + "theoretical_loss": 3.6644292433170293, + "tokens_seen": 957330432 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035856569709127384, + "loss": 2.5686, + "theoretical_loss": 3.664405178395432, + "tokens_seen": 957395968 + }, + { + "epoch": 3.02, + "learning_rate": 0.000358555667001003, + "loss": 2.922, + "theoretical_loss": 3.6643811155822865, + "tokens_seen": 957461504 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003585456369107322, + "loss": 2.66, + "theoretical_loss": 3.6643570548772644, + "tokens_seen": 957527040 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003585356068204614, + "loss": 2.8352, + "theoretical_loss": 3.6643329962800353, + "tokens_seen": 957592576 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035852557673019057, + "loss": 2.6895, + "theoretical_loss": 3.6643089397902724, + "tokens_seen": 957658112 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035851554663991975, + "loss": 2.9951, + "theoretical_loss": 3.6642848854076453, + "tokens_seen": 957723648 + }, + { + "epoch": 3.02, + "learning_rate": 0.000358505516549649, + "loss": 2.7052, + "theoretical_loss": 3.6642608331318263, + "tokens_seen": 957789184 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003584954864593781, + "loss": 2.7671, + "theoretical_loss": 3.6642367829624862, + "tokens_seen": 957854720 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035848545636910735, + "loss": 2.7657, + "theoretical_loss": 3.664212734899296, + "tokens_seen": 957920256 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035847542627883653, + "loss": 2.7118, + "theoretical_loss": 3.6641886889419286, + "tokens_seen": 957985792 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003584653961885657, + "loss": 2.7238, + "theoretical_loss": 3.6641646450900542, + "tokens_seen": 958051328 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003584553660982949, + "loss": 2.8321, + "theoretical_loss": 3.664140603343345, + "tokens_seen": 958116864 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035844533600802407, + "loss": 2.7182, + "theoretical_loss": 3.664116563701472, + "tokens_seen": 958182400 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035843530591775325, + "loss": 2.5963, + "theoretical_loss": 3.664092526164108, + "tokens_seen": 958247936 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003584252758274825, + "loss": 2.9829, + "theoretical_loss": 3.6640684907309247, + "tokens_seen": 958313472 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003584152457372116, + "loss": 2.8543, + "theoretical_loss": 3.6640444574015927, + "tokens_seen": 958379008 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035840521564694085, + "loss": 2.6679, + "theoretical_loss": 3.664020426175786, + "tokens_seen": 958444544 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1121284, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.857239246368408, + "objective/train/theoretical_loss": 3.6640144186979713, + "objective/train/tokens_used": 978920928, + "theoretical_loss": 3.6640144186979713, + "tokens_seen": 958460928 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035839518555667, + "loss": 2.6887, + "theoretical_loss": 3.6639963970531753, + "tokens_seen": 958510080 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003583851554663992, + "loss": 2.8986, + "theoretical_loss": 3.6639723700334335, + "tokens_seen": 958575616 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003583751253761284, + "loss": 2.7256, + "theoretical_loss": 3.663948345116232, + "tokens_seen": 958641152 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003583650952858576, + "loss": 2.7297, + "theoretical_loss": 3.663924322301244, + "tokens_seen": 958706688 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035835506519558676, + "loss": 2.6622, + "theoretical_loss": 3.663900301588141, + "tokens_seen": 958772224 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035834503510531594, + "loss": 2.5843, + "theoretical_loss": 3.663876282976596, + "tokens_seen": 958837760 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003583350050150451, + "loss": 2.6251, + "theoretical_loss": 3.6638522664662814, + "tokens_seen": 958903296 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035832497492477435, + "loss": 2.8493, + "theoretical_loss": 3.66382825205687, + "tokens_seen": 958968832 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003583149448345035, + "loss": 2.9119, + "theoretical_loss": 3.6638042397480337, + "tokens_seen": 959034368 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003583049147442327, + "loss": 2.6118, + "theoretical_loss": 3.663780229539446, + "tokens_seen": 959099904 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003582948846539619, + "loss": 2.6434, + "theoretical_loss": 3.6637562214307797, + "tokens_seen": 959165440 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003582848545636911, + "loss": 2.779, + "theoretical_loss": 3.6637322154217076, + "tokens_seen": 959230976 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035827482447342026, + "loss": 2.773, + "theoretical_loss": 3.6637082115119024, + "tokens_seen": 959296512 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035826479438314944, + "loss": 2.7536, + "theoretical_loss": 3.663684209701037, + "tokens_seen": 959362048 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003582547642928786, + "loss": 2.7197, + "theoretical_loss": 3.6636602099887847, + "tokens_seen": 959427584 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035824473420260786, + "loss": 2.8215, + "theoretical_loss": 3.6636362123748194, + "tokens_seen": 959493120 + }, + { + "epoch": 3.02, + "learning_rate": 0.000358234704112337, + "loss": 2.921, + "theoretical_loss": 3.663612216858813, + "tokens_seen": 959558656 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003582246740220662, + "loss": 2.8807, + "theoretical_loss": 3.6635882234404393, + "tokens_seen": 959624192 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035821464393179535, + "loss": 2.7939, + "theoretical_loss": 3.6635642321193727, + "tokens_seen": 959689728 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003582046138415246, + "loss": 2.8857, + "theoretical_loss": 3.6635402428952855, + "tokens_seen": 959755264 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035819458375125376, + "loss": 2.5746, + "theoretical_loss": 3.6635162557678513, + "tokens_seen": 959820800 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035818455366098294, + "loss": 2.6412, + "theoretical_loss": 3.6634922707367443, + "tokens_seen": 959886336 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003581745235707121, + "loss": 2.4484, + "theoretical_loss": 3.6634682878016376, + "tokens_seen": 959951872 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035816449348044136, + "loss": 2.7905, + "theoretical_loss": 3.6634443069622056, + "tokens_seen": 960017408 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003581544633901705, + "loss": 2.6442, + "theoretical_loss": 3.6634203282181215, + "tokens_seen": 960082944 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1121988, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3468472957611084, + "objective/train/theoretical_loss": 3.6634143338594605, + "objective/train/tokens_used": 980559328, + "theoretical_loss": 3.6634143338594605, + "tokens_seen": 960099328 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003581444332998997, + "loss": 2.4599, + "theoretical_loss": 3.6633963515690597, + "tokens_seen": 960148480 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035813440320962885, + "loss": 2.7676, + "theoretical_loss": 3.663372377014694, + "tokens_seen": 960214016 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003581243731193581, + "loss": 2.9168, + "theoretical_loss": 3.6633484045546982, + "tokens_seen": 960279552 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035811434302908727, + "loss": 2.7659, + "theoretical_loss": 3.6633244341887465, + "tokens_seen": 960345088 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035810431293881645, + "loss": 2.8983, + "theoretical_loss": 3.663300465916514, + "tokens_seen": 960410624 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003580942828485457, + "loss": 2.7638, + "theoretical_loss": 3.663276499737673, + "tokens_seen": 960476160 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003580842527582748, + "loss": 2.673, + "theoretical_loss": 3.6632525356518997, + "tokens_seen": 960541696 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035807422266800404, + "loss": 2.8277, + "theoretical_loss": 3.663228573658868, + "tokens_seen": 960607232 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003580641925777332, + "loss": 2.9438, + "theoretical_loss": 3.6632046137582526, + "tokens_seen": 960672768 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003580541624874624, + "loss": 2.6973, + "theoretical_loss": 3.663180655949727, + "tokens_seen": 960738304 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003580441323971916, + "loss": 2.8067, + "theoretical_loss": 3.663156700232967, + "tokens_seen": 960803840 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035803410230692077, + "loss": 2.8689, + "theoretical_loss": 3.663132746607647, + "tokens_seen": 960869376 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035802407221664995, + "loss": 2.9861, + "theoretical_loss": 3.663108795073441, + "tokens_seen": 960934912 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003580140421263792, + "loss": 2.9207, + "theoretical_loss": 3.663084845630025, + "tokens_seen": 961000448 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003580040120361083, + "loss": 2.8269, + "theoretical_loss": 3.663060898277073, + "tokens_seen": 961065984 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035799398194583755, + "loss": 2.7846, + "theoretical_loss": 3.663036953014261, + "tokens_seen": 961131520 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035798395185556673, + "loss": 2.8141, + "theoretical_loss": 3.663013009841263, + "tokens_seen": 961197056 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003579739217652959, + "loss": 2.8821, + "theoretical_loss": 3.6629890687577547, + "tokens_seen": 961262592 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003579638916750251, + "loss": 2.8052, + "theoretical_loss": 3.662965129763412, + "tokens_seen": 961328128 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035795386158475427, + "loss": 2.7982, + "theoretical_loss": 3.6629411928579083, + "tokens_seen": 961393664 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035794383149448345, + "loss": 2.7195, + "theoretical_loss": 3.662917258040921, + "tokens_seen": 961459200 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003579338014042127, + "loss": 2.8697, + "theoretical_loss": 3.6628933253121243, + "tokens_seen": 961524736 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003579237713139418, + "loss": 2.8098, + "theoretical_loss": 3.6628693946711937, + "tokens_seen": 961590272 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035791374122367105, + "loss": 2.8547, + "theoretical_loss": 3.6628454661178056, + "tokens_seen": 961655808 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003579037111334002, + "loss": 2.541, + "theoretical_loss": 3.662821539651635, + "tokens_seen": 961721344 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1123320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.837873697280884, + "objective/train/theoretical_loss": 3.6628155583611823, + "objective/train/tokens_used": 982197728, + "theoretical_loss": 3.6628155583611823, + "tokens_seen": 961737728 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003578936810431294, + "loss": 2.8232, + "theoretical_loss": 3.662797615272358, + "tokens_seen": 961786880 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003578836509528586, + "loss": 2.8893, + "theoretical_loss": 3.6627736929796497, + "tokens_seen": 961852416 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003578736208625878, + "loss": 2.537, + "theoretical_loss": 3.6627497727731866, + "tokens_seen": 961917952 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035786359077231696, + "loss": 2.8157, + "theoretical_loss": 3.6627258546526447, + "tokens_seen": 961983488 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035785356068204614, + "loss": 3.0576, + "theoretical_loss": 3.6627019386177, + "tokens_seen": 962049024 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003578435305917753, + "loss": 2.7024, + "theoretical_loss": 3.662678024668028, + "tokens_seen": 962114560 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035783350050150455, + "loss": 2.7436, + "theoretical_loss": 3.6626541128033057, + "tokens_seen": 962180096 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003578234704112337, + "loss": 2.741, + "theoretical_loss": 3.662630203023209, + "tokens_seen": 962245632 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003578134403209629, + "loss": 2.8045, + "theoretical_loss": 3.6626062953274134, + "tokens_seen": 962311168 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003578034102306921, + "loss": 2.6184, + "theoretical_loss": 3.6625823897155962, + "tokens_seen": 962376704 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003577933801404213, + "loss": 2.9925, + "theoretical_loss": 3.662558486187434, + "tokens_seen": 962442240 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035778335005015046, + "loss": 3.0443, + "theoretical_loss": 3.6625345847426027, + "tokens_seen": 962507776 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035777331995987964, + "loss": 2.714, + "theoretical_loss": 3.662510685380779, + "tokens_seen": 962573312 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003577632898696088, + "loss": 2.6968, + "theoretical_loss": 3.6624867881016403, + "tokens_seen": 962638848 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035775325977933806, + "loss": 2.931, + "theoretical_loss": 3.662462892904862, + "tokens_seen": 962704384 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003577432296890672, + "loss": 2.6591, + "theoretical_loss": 3.662438999790122, + "tokens_seen": 962769920 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003577331995987964, + "loss": 3.067, + "theoretical_loss": 3.662415108757097, + "tokens_seen": 962835456 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035772316950852555, + "loss": 2.8951, + "theoretical_loss": 3.6623912198054636, + "tokens_seen": 962900992 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003577131394182548, + "loss": 2.7869, + "theoretical_loss": 3.662367332934899, + "tokens_seen": 962966528 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035770310932798396, + "loss": 3.0066, + "theoretical_loss": 3.66234344814508, + "tokens_seen": 963032064 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035769307923771314, + "loss": 2.7544, + "theoretical_loss": 3.6623195654356846, + "tokens_seen": 963097600 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003576830491474423, + "loss": 2.7924, + "theoretical_loss": 3.6622956848063897, + "tokens_seen": 963163136 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035767301905717156, + "loss": 2.6071, + "theoretical_loss": 3.6622718062568715, + "tokens_seen": 963228672 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003576629889669007, + "loss": 2.8556, + "theoretical_loss": 3.662247929786809, + "tokens_seen": 963294208 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003576529588766299, + "loss": 2.9046, + "theoretical_loss": 3.662224055395879, + "tokens_seen": 963359744 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1123898, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7481706142425537, + "objective/train/theoretical_loss": 3.662218087122973, + "objective/train/tokens_used": 983836128, + "theoretical_loss": 3.662218087122973, + "tokens_seen": 963376128 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035764292878635905, + "loss": 2.9226, + "theoretical_loss": 3.662200183083759, + "tokens_seen": 963425280 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003576328986960883, + "loss": 2.8547, + "theoretical_loss": 3.6621763128501263, + "tokens_seen": 963490816 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035762286860581747, + "loss": 2.7839, + "theoretical_loss": 3.6621524446946587, + "tokens_seen": 963556352 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035761283851554665, + "loss": 2.7891, + "theoretical_loss": 3.662128578617035, + "tokens_seen": 963621888 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035760280842527583, + "loss": 2.7659, + "theoretical_loss": 3.662104714616931, + "tokens_seen": 963687424 + }, + { + "epoch": 3.02, + "learning_rate": 0.000357592778335005, + "loss": 2.864, + "theoretical_loss": 3.662080852694027, + "tokens_seen": 963752960 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003575827482447342, + "loss": 2.6925, + "theoretical_loss": 3.662056992847999, + "tokens_seen": 963818496 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003575727181544634, + "loss": 2.9939, + "theoretical_loss": 3.6620331350785253, + "tokens_seen": 963884032 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035756268806419255, + "loss": 2.5997, + "theoretical_loss": 3.662009279385285, + "tokens_seen": 963949568 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003575526579739218, + "loss": 2.5903, + "theoretical_loss": 3.661985425767956, + "tokens_seen": 964015104 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003575426278836509, + "loss": 2.9253, + "theoretical_loss": 3.6619615742262157, + "tokens_seen": 964080640 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035753259779338015, + "loss": 3.0226, + "theoretical_loss": 3.661937724759743, + "tokens_seen": 964146176 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035752256770310933, + "loss": 2.7521, + "theoretical_loss": 3.6619138773682165, + "tokens_seen": 964211712 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003575125376128385, + "loss": 2.8833, + "theoretical_loss": 3.6618900320513146, + "tokens_seen": 964277248 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003575025075225677, + "loss": 2.9383, + "theoretical_loss": 3.661866188808715, + "tokens_seen": 964342784 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035749247743229693, + "loss": 2.7874, + "theoretical_loss": 3.661842347640098, + "tokens_seen": 964408320 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035748244734202606, + "loss": 2.7917, + "theoretical_loss": 3.6618185085451405, + "tokens_seen": 964473856 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003574724172517553, + "loss": 2.7022, + "theoretical_loss": 3.661794671523522, + "tokens_seen": 964539392 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003574623871614844, + "loss": 2.6121, + "theoretical_loss": 3.661770836574922, + "tokens_seen": 964604928 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035745235707121365, + "loss": 2.8655, + "theoretical_loss": 3.661747003699018, + "tokens_seen": 964670464 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035744232698094283, + "loss": 2.7242, + "theoretical_loss": 3.66172317289549, + "tokens_seen": 964736000 + }, + { + "epoch": 3.02, + "learning_rate": 0.000357432296890672, + "loss": 2.9418, + "theoretical_loss": 3.661699344164017, + "tokens_seen": 964801536 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003574222668004012, + "loss": 2.9072, + "theoretical_loss": 3.661675517504277, + "tokens_seen": 964867072 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003574122367101304, + "loss": 2.6928, + "theoretical_loss": 3.6616516929159504, + "tokens_seen": 964932608 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035740220661985956, + "loss": 2.5438, + "theoretical_loss": 3.661627870398716, + "tokens_seen": 964998144 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1124893, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8355164527893066, + "objective/train/theoretical_loss": 3.6616219150929785, + "objective/train/tokens_used": 985474528, + "theoretical_loss": 3.6616219150929785, + "tokens_seen": 965014528 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003573921765295888, + "loss": 2.7413, + "theoretical_loss": 3.6616040499522535, + "tokens_seen": 965063680 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003573821464393179, + "loss": 2.8473, + "theoretical_loss": 3.661580231576242, + "tokens_seen": 965129216 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035737211634904716, + "loss": 2.5775, + "theoretical_loss": 3.6615564152703604, + "tokens_seen": 965194752 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003573620862587763, + "loss": 2.9372, + "theoretical_loss": 3.6615326010342892, + "tokens_seen": 965260288 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003573520561685055, + "loss": 2.6619, + "theoretical_loss": 3.6615087888677076, + "tokens_seen": 965325824 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035734202607823475, + "loss": 2.8003, + "theoretical_loss": 3.661484978770295, + "tokens_seen": 965391360 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003573319959879639, + "loss": 2.6679, + "theoretical_loss": 3.661461170741732, + "tokens_seen": 965456896 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003573219658976931, + "loss": 2.7185, + "theoretical_loss": 3.6614373647816976, + "tokens_seen": 965522432 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003573119358074223, + "loss": 2.7959, + "theoretical_loss": 3.661413560889871, + "tokens_seen": 965587968 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003573019057171515, + "loss": 2.6025, + "theoretical_loss": 3.6613897590659343, + "tokens_seen": 965653504 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035729187562688066, + "loss": 2.8268, + "theoretical_loss": 3.661365959309566, + "tokens_seen": 965719040 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035728184553660984, + "loss": 2.8639, + "theoretical_loss": 3.6613421616204462, + "tokens_seen": 965784576 + }, + { + "epoch": 3.02, + "learning_rate": 0.000357271815446339, + "loss": 2.7539, + "theoretical_loss": 3.6613183659982553, + "tokens_seen": 965850112 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035726178535606826, + "loss": 2.7141, + "theoretical_loss": 3.6612945724426744, + "tokens_seen": 965915648 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003572517552657974, + "loss": 2.6672, + "theoretical_loss": 3.661270780953383, + "tokens_seen": 965981184 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003572417251755266, + "loss": 2.8184, + "theoretical_loss": 3.661246991530061, + "tokens_seen": 966046720 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035723169508525575, + "loss": 2.7297, + "theoretical_loss": 3.66122320417239, + "tokens_seen": 966112256 + }, + { + "epoch": 3.02, + "learning_rate": 0.000357221664994985, + "loss": 2.7294, + "theoretical_loss": 3.661199418880049, + "tokens_seen": 966177792 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035721163490471416, + "loss": 2.9766, + "theoretical_loss": 3.6611756356527203, + "tokens_seen": 966243328 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035720160481444334, + "loss": 2.7843, + "theoretical_loss": 3.661151854490084, + "tokens_seen": 966308864 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003571915747241725, + "loss": 2.684, + "theoretical_loss": 3.6611280753918205, + "tokens_seen": 966374400 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035718154463390176, + "loss": 2.5815, + "theoretical_loss": 3.6611042983576105, + "tokens_seen": 966439936 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003571715145436309, + "loss": 2.6649, + "theoretical_loss": 3.6610805233871355, + "tokens_seen": 966505472 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003571614844533601, + "loss": 2.8501, + "theoretical_loss": 3.6610567504800766, + "tokens_seen": 966571008 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035715145436308925, + "loss": 2.8742, + "theoretical_loss": 3.6610329796361136, + "tokens_seen": 966636544 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1125576, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.660078525543213, + "objective/train/theoretical_loss": 3.6610270372474445, + "objective/train/tokens_used": 987112928, + "theoretical_loss": 3.6610270372474445, + "tokens_seen": 966652928 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003571414242728185, + "loss": 2.6385, + "theoretical_loss": 3.661009210854929, + "tokens_seen": 966702080 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035713139418254767, + "loss": 2.7278, + "theoretical_loss": 3.660985444136203, + "tokens_seen": 966767616 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035712136409227685, + "loss": 2.8441, + "theoretical_loss": 3.660961679479617, + "tokens_seen": 966833152 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035711133400200603, + "loss": 2.8036, + "theoretical_loss": 3.660937916884853, + "tokens_seen": 966898688 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003571013039117352, + "loss": 2.7275, + "theoretical_loss": 3.660914156351592, + "tokens_seen": 966964224 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003570912738214644, + "loss": 2.7555, + "theoretical_loss": 3.6608903978795153, + "tokens_seen": 967029760 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003570812437311936, + "loss": 3.0237, + "theoretical_loss": 3.6608666414683038, + "tokens_seen": 967095296 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035707121364092275, + "loss": 2.7074, + "theoretical_loss": 3.6608428871176404, + "tokens_seen": 967160832 + }, + { + "epoch": 3.02, + "learning_rate": 0.000357061183550652, + "loss": 2.7985, + "theoretical_loss": 3.6608191348272063, + "tokens_seen": 967226368 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003570511534603811, + "loss": 2.5992, + "theoretical_loss": 3.6607953845966827, + "tokens_seen": 967291904 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035704112337011035, + "loss": 2.8545, + "theoretical_loss": 3.6607716364257525, + "tokens_seen": 967357440 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035703109327983953, + "loss": 2.8628, + "theoretical_loss": 3.660747890314096, + "tokens_seen": 967422976 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003570210631895687, + "loss": 2.6079, + "theoretical_loss": 3.6607241462613964, + "tokens_seen": 967488512 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003570110330992979, + "loss": 2.7346, + "theoretical_loss": 3.6607004042673355, + "tokens_seen": 967554048 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035700100300902713, + "loss": 2.6818, + "theoretical_loss": 3.660676664331595, + "tokens_seen": 967619584 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035699097291875626, + "loss": 2.7036, + "theoretical_loss": 3.6606529264538574, + "tokens_seen": 967685120 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003569809428284855, + "loss": 2.7349, + "theoretical_loss": 3.660629190633805, + "tokens_seen": 967750656 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003569709127382146, + "loss": 3.0038, + "theoretical_loss": 3.66060545687112, + "tokens_seen": 967816192 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035696088264794385, + "loss": 2.7256, + "theoretical_loss": 3.6605817251654846, + "tokens_seen": 967881728 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035695085255767303, + "loss": 2.7116, + "theoretical_loss": 3.6605579955165815, + "tokens_seen": 967947264 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003569408224674022, + "loss": 2.833, + "theoretical_loss": 3.660534267924093, + "tokens_seen": 968012800 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003569307923771314, + "loss": 2.8843, + "theoretical_loss": 3.6605105423877013, + "tokens_seen": 968078336 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003569207622868606, + "loss": 2.5918, + "theoretical_loss": 3.6604868189070903, + "tokens_seen": 968143872 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035691073219658976, + "loss": 2.6069, + "theoretical_loss": 3.6604630974819417, + "tokens_seen": 968209408 + }, + { + "epoch": 3.02, + "learning_rate": 0.000356900702106319, + "loss": 3.0027, + "theoretical_loss": 3.6604393781119384, + "tokens_seen": 968274944 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1126260, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6405274868011475, + "objective/train/theoretical_loss": 3.660433448590517, + "objective/train/tokens_used": 988751328, + "theoretical_loss": 3.660433448590517, + "tokens_seen": 968291328 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003568906720160481, + "loss": 2.7458, + "theoretical_loss": 3.6604156607967635, + "tokens_seen": 968340480 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035688064192577736, + "loss": 2.8098, + "theoretical_loss": 3.6603919455361, + "tokens_seen": 968406016 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003568706118355065, + "loss": 2.9405, + "theoretical_loss": 3.6603682323296307, + "tokens_seen": 968471552 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003568605817452357, + "loss": 2.5768, + "theoretical_loss": 3.6603445211770387, + "tokens_seen": 968537088 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003568505516549649, + "loss": 2.9667, + "theoretical_loss": 3.660320812078007, + "tokens_seen": 968602624 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003568405215646941, + "loss": 2.7004, + "theoretical_loss": 3.6602971050322197, + "tokens_seen": 968668160 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035683049147442326, + "loss": 2.6829, + "theoretical_loss": 3.660273400039359, + "tokens_seen": 968733696 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003568204613841525, + "loss": 2.6353, + "theoretical_loss": 3.6602496970991085, + "tokens_seen": 968799232 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003568104312938816, + "loss": 2.5665, + "theoretical_loss": 3.660225996211152, + "tokens_seen": 968864768 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035680040120361086, + "loss": 2.9588, + "theoretical_loss": 3.6602022973751724, + "tokens_seen": 968930304 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035679037111334, + "loss": 2.7872, + "theoretical_loss": 3.6601786005908545, + "tokens_seen": 968995840 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003567803410230692, + "loss": 2.8798, + "theoretical_loss": 3.6601549058578806, + "tokens_seen": 969061376 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003567703109327984, + "loss": 2.9378, + "theoretical_loss": 3.660131213175935, + "tokens_seen": 969126912 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003567602808425276, + "loss": 2.8404, + "theoretical_loss": 3.6601075225447017, + "tokens_seen": 969192448 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035675025075225677, + "loss": 2.7157, + "theoretical_loss": 3.6600838339638635, + "tokens_seen": 969257984 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035674022066198595, + "loss": 2.8261, + "theoretical_loss": 3.660060147433106, + "tokens_seen": 969323520 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035673019057171513, + "loss": 2.8776, + "theoretical_loss": 3.660036462952112, + "tokens_seen": 969389056 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035672016048144436, + "loss": 2.857, + "theoretical_loss": 3.6600127805205656, + "tokens_seen": 969454592 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003567101303911735, + "loss": 2.8433, + "theoretical_loss": 3.659989100138151, + "tokens_seen": 969520128 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003567001003009027, + "loss": 2.7035, + "theoretical_loss": 3.6599654218045528, + "tokens_seen": 969585664 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035669007021063185, + "loss": 2.7555, + "theoretical_loss": 3.659941745519455, + "tokens_seen": 969651200 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003566800401203611, + "loss": 2.4502, + "theoretical_loss": 3.659918071282542, + "tokens_seen": 969716736 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035667001003009027, + "loss": 2.8731, + "theoretical_loss": 3.6598943990934987, + "tokens_seen": 969782272 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035665997993981945, + "loss": 2.4678, + "theoretical_loss": 3.6598707289520087, + "tokens_seen": 969847808 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035664994984954863, + "loss": 2.9327, + "theoretical_loss": 3.659847060857757, + "tokens_seen": 969913344 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1127813, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4290738105773926, + "objective/train/theoretical_loss": 3.659841144154038, + "objective/train/tokens_used": 990389728, + "theoretical_loss": 3.659841144154038, + "tokens_seen": 969929728 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035663991975927787, + "loss": 2.5046, + "theoretical_loss": 3.6598233948104277, + "tokens_seen": 969978880 + }, + { + "epoch": 3.02, + "learning_rate": 0.000356629889669007, + "loss": 2.6093, + "theoretical_loss": 3.6597997308097066, + "tokens_seen": 970044416 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035661985957873623, + "loss": 2.8465, + "theoretical_loss": 3.6597760688552774, + "tokens_seen": 970109952 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035660982948846536, + "loss": 2.9324, + "theoretical_loss": 3.6597524089468254, + "tokens_seen": 970175488 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003565997993981946, + "loss": 2.6557, + "theoretical_loss": 3.6597287510840353, + "tokens_seen": 970241024 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003565897693079238, + "loss": 2.8453, + "theoretical_loss": 3.6597050952665926, + "tokens_seen": 970306560 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035657973921765295, + "loss": 2.8741, + "theoretical_loss": 3.6596814414941816, + "tokens_seen": 970372096 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003565697091273822, + "loss": 2.7244, + "theoretical_loss": 3.6596577897664884, + "tokens_seen": 970437632 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003565596790371113, + "loss": 2.7886, + "theoretical_loss": 3.659634140083197, + "tokens_seen": 970503168 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035654964894684055, + "loss": 2.6121, + "theoretical_loss": 3.659610492443993, + "tokens_seen": 970568704 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035653961885656973, + "loss": 2.6921, + "theoretical_loss": 3.659586846848563, + "tokens_seen": 970634240 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003565295887662989, + "loss": 2.7066, + "theoretical_loss": 3.6595632032965897, + "tokens_seen": 970699776 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003565195586760281, + "loss": 2.7141, + "theoretical_loss": 3.6595395617877613, + "tokens_seen": 970765312 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035650952858575733, + "loss": 2.8058, + "theoretical_loss": 3.659515922321762, + "tokens_seen": 970830848 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035649949849548646, + "loss": 2.8712, + "theoretical_loss": 3.6594922848982776, + "tokens_seen": 970896384 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003564894684052157, + "loss": 2.8611, + "theoretical_loss": 3.6594686495169935, + "tokens_seen": 970961920 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003564794383149448, + "loss": 3.0298, + "theoretical_loss": 3.659445016177596, + "tokens_seen": 971027456 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035646940822467405, + "loss": 2.7334, + "theoretical_loss": 3.6594213848797703, + "tokens_seen": 971092992 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035645937813440323, + "loss": 2.6687, + "theoretical_loss": 3.6593977556232025, + "tokens_seen": 971158528 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003564493480441324, + "loss": 2.9812, + "theoretical_loss": 3.6593741284075794, + "tokens_seen": 971224064 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003564393179538616, + "loss": 2.7254, + "theoretical_loss": 3.6593505032325853, + "tokens_seen": 971289600 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003564292878635908, + "loss": 2.6469, + "theoretical_loss": 3.659326880097908, + "tokens_seen": 971355136 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035641925777331996, + "loss": 2.77, + "theoretical_loss": 3.6593032590032326, + "tokens_seen": 971420672 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003564092276830492, + "loss": 2.8566, + "theoretical_loss": 3.6592796399482452, + "tokens_seen": 971486208 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003563991975927783, + "loss": 2.9147, + "theoretical_loss": 3.659256022932633, + "tokens_seen": 971551744 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1128556, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.751553773880005, + "objective/train/theoretical_loss": 3.6592501189973454, + "objective/train/tokens_used": 992028128, + "theoretical_loss": 3.6592501189973454, + "tokens_seen": 971568128 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035638916750250756, + "loss": 3.0277, + "theoretical_loss": 3.6592324079560816, + "tokens_seen": 971617280 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003563791374122367, + "loss": 2.8446, + "theoretical_loss": 3.6592087950182783, + "tokens_seen": 971682816 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003563691073219659, + "loss": 2.7536, + "theoretical_loss": 3.6591851841189085, + "tokens_seen": 971748352 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003563590772316951, + "loss": 2.8624, + "theoretical_loss": 3.6591615752576594, + "tokens_seen": 971813888 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003563490471414243, + "loss": 2.779, + "theoretical_loss": 3.659137968434217, + "tokens_seen": 971879424 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035633901705115346, + "loss": 2.836, + "theoretical_loss": 3.6591143636482695, + "tokens_seen": 971944960 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003563289869608827, + "loss": 2.9205, + "theoretical_loss": 3.659090760899502, + "tokens_seen": 972010496 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003563189568706118, + "loss": 2.655, + "theoretical_loss": 3.659067160187602, + "tokens_seen": 972076032 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035630892678034106, + "loss": 2.7509, + "theoretical_loss": 3.6590435615122567, + "tokens_seen": 972141568 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003562988966900702, + "loss": 2.7927, + "theoretical_loss": 3.6590199648731527, + "tokens_seen": 972207104 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003562888665997994, + "loss": 2.8891, + "theoretical_loss": 3.6589963702699775, + "tokens_seen": 972272640 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003562788365095286, + "loss": 2.9476, + "theoretical_loss": 3.658972777702418, + "tokens_seen": 972338176 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003562688064192578, + "loss": 2.8627, + "theoretical_loss": 3.658949187170161, + "tokens_seen": 972403712 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035625877632898697, + "loss": 2.7483, + "theoretical_loss": 3.6589255986728944, + "tokens_seen": 972469248 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035624874623871615, + "loss": 2.5822, + "theoretical_loss": 3.658902012210305, + "tokens_seen": 972534784 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035623871614844533, + "loss": 2.962, + "theoretical_loss": 3.6588784277820805, + "tokens_seen": 972600320 + }, + { + "epoch": 3.02, + "learning_rate": 0.00035622868605817456, + "loss": 2.7324, + "theoretical_loss": 3.658854845387908, + "tokens_seen": 972665856 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003562186559679037, + "loss": 2.7687, + "theoretical_loss": 3.6588312650274757, + "tokens_seen": 972731392 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003562086258776329, + "loss": 2.4566, + "theoretical_loss": 3.658807686700471, + "tokens_seen": 972796928 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035619859578736205, + "loss": 2.6661, + "theoretical_loss": 3.6587841104065815, + "tokens_seen": 972862464 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003561885656970913, + "loss": 2.6573, + "theoretical_loss": 3.6587605361454942, + "tokens_seen": 972928000 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035617853560682047, + "loss": 2.6695, + "theoretical_loss": 3.658736963916898, + "tokens_seen": 972993536 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035616850551654965, + "loss": 2.8766, + "theoretical_loss": 3.6587133937204808, + "tokens_seen": 973059072 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035615847542627883, + "loss": 2.6962, + "theoretical_loss": 3.6586898255559293, + "tokens_seen": 973124608 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035614844533600807, + "loss": 2.7565, + "theoretical_loss": 3.6586662594229327, + "tokens_seen": 973190144 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1129663, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.72713303565979, + "objective/train/theoretical_loss": 3.6586603682070775, + "objective/train/tokens_used": 993666528, + "theoretical_loss": 3.6586603682070775, + "tokens_seen": 973206528 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003561384152457372, + "loss": 2.8701, + "theoretical_loss": 3.658642695321179, + "tokens_seen": 973255680 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035612838515546643, + "loss": 2.7955, + "theoretical_loss": 3.6586191332503555, + "tokens_seen": 973321216 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035611835506519556, + "loss": 2.8356, + "theoretical_loss": 3.6585955732101514, + "tokens_seen": 973386752 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003561083249749248, + "loss": 2.7941, + "theoretical_loss": 3.6585720152002548, + "tokens_seen": 973452288 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035609829488465397, + "loss": 2.9232, + "theoretical_loss": 3.658548459220354, + "tokens_seen": 973517824 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035608826479438315, + "loss": 2.6519, + "theoretical_loss": 3.658524905270137, + "tokens_seen": 973583360 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035607823470411233, + "loss": 2.7978, + "theoretical_loss": 3.6585013533492927, + "tokens_seen": 973648896 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003560682046138415, + "loss": 2.7026, + "theoretical_loss": 3.65847780345751, + "tokens_seen": 973714432 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003560581745235707, + "loss": 2.689, + "theoretical_loss": 3.6584542555944766, + "tokens_seen": 973779968 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035604814443329993, + "loss": 3.099, + "theoretical_loss": 3.658430709759882, + "tokens_seen": 973845504 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035603811434302906, + "loss": 2.7942, + "theoretical_loss": 3.658407165953415, + "tokens_seen": 973911040 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003560280842527583, + "loss": 2.629, + "theoretical_loss": 3.6583836241747636, + "tokens_seen": 973976576 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003560180541624874, + "loss": 2.7733, + "theoretical_loss": 3.6583600844236175, + "tokens_seen": 974042112 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035600802407221666, + "loss": 2.8486, + "theoretical_loss": 3.658336546699666, + "tokens_seen": 974107648 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035599799398194584, + "loss": 2.648, + "theoretical_loss": 3.658313011002597, + "tokens_seen": 974173184 + }, + { + "epoch": 3.03, + "learning_rate": 0.000355987963891675, + "loss": 2.8866, + "theoretical_loss": 3.658289477332101, + "tokens_seen": 974238720 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003559779338014042, + "loss": 2.6934, + "theoretical_loss": 3.658265945687866, + "tokens_seen": 974304256 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035596790371113344, + "loss": 2.653, + "theoretical_loss": 3.6582424160695814, + "tokens_seen": 974369792 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035595787362086256, + "loss": 2.7027, + "theoretical_loss": 3.658218888476937, + "tokens_seen": 974435328 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003559478435305918, + "loss": 2.7906, + "theoretical_loss": 3.6581953629096224, + "tokens_seen": 974500864 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003559378134403209, + "loss": 2.6374, + "theoretical_loss": 3.658171839367327, + "tokens_seen": 974566400 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035592778335005016, + "loss": 2.64, + "theoretical_loss": 3.6581483178497396, + "tokens_seen": 974631936 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035591775325977934, + "loss": 2.6977, + "theoretical_loss": 3.6581247983565506, + "tokens_seen": 974697472 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003559077231695085, + "loss": 2.9227, + "theoretical_loss": 3.6581012808874487, + "tokens_seen": 974763008 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003558976930792377, + "loss": 2.7544, + "theoretical_loss": 3.6580777654421244, + "tokens_seen": 974828544 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1130250, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.891390562057495, + "objective/train/theoretical_loss": 3.6580718868969724, + "objective/train/tokens_used": 995304928, + "theoretical_loss": 3.6580718868969724, + "tokens_seen": 974844928 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003558876629889669, + "loss": 2.7537, + "theoretical_loss": 3.6580542520202677, + "tokens_seen": 974894080 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035587763289869607, + "loss": 2.8645, + "theoretical_loss": 3.658030740621568, + "tokens_seen": 974959616 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003558676028084253, + "loss": 2.6031, + "theoretical_loss": 3.6580072312457155, + "tokens_seen": 975025152 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035585757271815443, + "loss": 2.9164, + "theoretical_loss": 3.6579837238924, + "tokens_seen": 975090688 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035584754262788366, + "loss": 2.5411, + "theoretical_loss": 3.657960218561312, + "tokens_seen": 975156224 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003558375125376129, + "loss": 2.7479, + "theoretical_loss": 3.6579367152521414, + "tokens_seen": 975221760 + }, + { + "epoch": 3.03, + "learning_rate": 0.000355827482447342, + "loss": 2.8431, + "theoretical_loss": 3.6579132139645774, + "tokens_seen": 975287296 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035581745235707126, + "loss": 2.8175, + "theoretical_loss": 3.657889714698312, + "tokens_seen": 975352832 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003558074222668004, + "loss": 2.8773, + "theoretical_loss": 3.6578662174530345, + "tokens_seen": 975418368 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003557973921765296, + "loss": 2.6768, + "theoretical_loss": 3.657842722228436, + "tokens_seen": 975483904 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003557873620862588, + "loss": 2.6195, + "theoretical_loss": 3.6578192290242066, + "tokens_seen": 975549440 + }, + { + "epoch": 3.03, + "learning_rate": 0.000355777331995988, + "loss": 2.7406, + "theoretical_loss": 3.6577957378400368, + "tokens_seen": 975614976 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035576730190571717, + "loss": 2.7412, + "theoretical_loss": 3.657772248675617, + "tokens_seen": 975680512 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035575727181544635, + "loss": 2.6126, + "theoretical_loss": 3.6577487615306383, + "tokens_seen": 975746048 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035574724172517553, + "loss": 2.6626, + "theoretical_loss": 3.6577252764047916, + "tokens_seen": 975811584 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035573721163490476, + "loss": 3.0388, + "theoretical_loss": 3.6577017932977673, + "tokens_seen": 975877120 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003557271815446339, + "loss": 2.8096, + "theoretical_loss": 3.6576783122092564, + "tokens_seen": 975942656 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003557171514543631, + "loss": 2.8943, + "theoretical_loss": 3.6576548331389507, + "tokens_seen": 976008192 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035570712136409225, + "loss": 2.8692, + "theoretical_loss": 3.6576313560865397, + "tokens_seen": 976073728 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003556970912738215, + "loss": 2.913, + "theoretical_loss": 3.6576078810517156, + "tokens_seen": 976139264 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035568706118355067, + "loss": 2.7489, + "theoretical_loss": 3.657584408034169, + "tokens_seen": 976204800 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035567703109327985, + "loss": 2.5959, + "theoretical_loss": 3.6575609370335913, + "tokens_seen": 976270336 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035566700100300903, + "loss": 2.962, + "theoretical_loss": 3.657537468049674, + "tokens_seen": 976335872 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035565697091273827, + "loss": 2.7383, + "theoretical_loss": 3.657514001082109, + "tokens_seen": 976401408 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003556469408224674, + "loss": 3.0656, + "theoretical_loss": 3.6574905361305867, + "tokens_seen": 976466944 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1131717, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6520185470581055, + "objective/train/theoretical_loss": 3.657484670207677, + "objective/train/tokens_used": 996943328, + "theoretical_loss": 3.657484670207677, + "tokens_seen": 976483328 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035563691073219663, + "loss": 2.8189, + "theoretical_loss": 3.657467073194799, + "tokens_seen": 976532480 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035562688064192576, + "loss": 2.6232, + "theoretical_loss": 3.657443612274438, + "tokens_seen": 976598016 + }, + { + "epoch": 3.03, + "learning_rate": 0.000355616850551655, + "loss": 2.6579, + "theoretical_loss": 3.657420153369194, + "tokens_seen": 976663552 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035560682046138417, + "loss": 2.7898, + "theoretical_loss": 3.6573966964787603, + "tokens_seen": 976729088 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035559679037111335, + "loss": 2.707, + "theoretical_loss": 3.6573732416028277, + "tokens_seen": 976794624 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035558676028084253, + "loss": 3.012, + "theoretical_loss": 3.6573497887410884, + "tokens_seen": 976860160 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003555767301905717, + "loss": 2.7479, + "theoretical_loss": 3.6573263378932346, + "tokens_seen": 976925696 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003555667001003009, + "loss": 2.8411, + "theoretical_loss": 3.6573028890589576, + "tokens_seen": 976991232 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035555667001003013, + "loss": 2.8103, + "theoretical_loss": 3.6572794422379493, + "tokens_seen": 977056768 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035554663991975926, + "loss": 2.8471, + "theoretical_loss": 3.6572559974299033, + "tokens_seen": 977122304 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003555366098294885, + "loss": 2.6952, + "theoretical_loss": 3.6572325546345104, + "tokens_seen": 977187840 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003555265797392176, + "loss": 2.4342, + "theoretical_loss": 3.657209113851463, + "tokens_seen": 977253376 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035551654964894686, + "loss": 2.7549, + "theoretical_loss": 3.657185675080454, + "tokens_seen": 977318912 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035550651955867604, + "loss": 2.7508, + "theoretical_loss": 3.657162238321175, + "tokens_seen": 977384448 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003554964894684052, + "loss": 2.8389, + "theoretical_loss": 3.6571388035733197, + "tokens_seen": 977449984 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003554864593781344, + "loss": 3.0033, + "theoretical_loss": 3.657115370836579, + "tokens_seen": 977515520 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035547642928786364, + "loss": 2.782, + "theoretical_loss": 3.657091940110647, + "tokens_seen": 977581056 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035546639919759276, + "loss": 2.7965, + "theoretical_loss": 3.6570685113952153, + "tokens_seen": 977646592 + }, + { + "epoch": 3.03, + "learning_rate": 0.000355456369107322, + "loss": 2.6375, + "theoretical_loss": 3.6570450846899774, + "tokens_seen": 977712128 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003554463390170511, + "loss": 2.6154, + "theoretical_loss": 3.6570216599946255, + "tokens_seen": 977777664 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035543630892678036, + "loss": 2.8124, + "theoretical_loss": 3.656998237308853, + "tokens_seen": 977843200 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035542627883650954, + "loss": 2.8307, + "theoretical_loss": 3.6569748166323524, + "tokens_seen": 977908736 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003554162487462387, + "loss": 2.6667, + "theoretical_loss": 3.656951397964817, + "tokens_seen": 977974272 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003554062186559679, + "loss": 2.8426, + "theoretical_loss": 3.6569279813059397, + "tokens_seen": 978039808 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003553961885656971, + "loss": 2.6248, + "theoretical_loss": 3.656904566655413, + "tokens_seen": 978105344 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1132115, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2902779579162598, + "objective/train/theoretical_loss": 3.6568987133065507, + "objective/train/tokens_used": 998581728, + "theoretical_loss": 3.6568987133065507, + "tokens_seen": 978121728 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035538615847542627, + "loss": 2.984, + "theoretical_loss": 3.6568811540129316, + "tokens_seen": 978170880 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003553761283851555, + "loss": 2.8201, + "theoretical_loss": 3.656857743378188, + "tokens_seen": 978236416 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035536609829488463, + "loss": 2.8302, + "theoretical_loss": 3.656834334750875, + "tokens_seen": 978301952 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035535606820461386, + "loss": 2.6622, + "theoretical_loss": 3.6568109281306866, + "tokens_seen": 978367488 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035534603811434304, + "loss": 2.8905, + "theoretical_loss": 3.656787523517316, + "tokens_seen": 978433024 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003553360080240722, + "loss": 2.8052, + "theoretical_loss": 3.656764120910457, + "tokens_seen": 978498560 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003553259779338014, + "loss": 2.9525, + "theoretical_loss": 3.6567407203098035, + "tokens_seen": 978564096 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003553159478435306, + "loss": 2.6697, + "theoretical_loss": 3.6567173217150484, + "tokens_seen": 978629632 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035530591775325977, + "loss": 2.8138, + "theoretical_loss": 3.656693925125886, + "tokens_seen": 978695168 + }, + { + "epoch": 3.03, + "learning_rate": 0.000355295887662989, + "loss": 2.7755, + "theoretical_loss": 3.6566705305420095, + "tokens_seen": 978760704 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035528585757271813, + "loss": 3.0818, + "theoretical_loss": 3.656647137963114, + "tokens_seen": 978826240 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035527582748244737, + "loss": 2.766, + "theoretical_loss": 3.656623747388892, + "tokens_seen": 978891776 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003552657973921765, + "loss": 3.0058, + "theoretical_loss": 3.6566003588190386, + "tokens_seen": 978957312 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035525576730190573, + "loss": 2.8212, + "theoretical_loss": 3.656576972253247, + "tokens_seen": 979022848 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003552457372116349, + "loss": 2.9931, + "theoretical_loss": 3.656553587691212, + "tokens_seen": 979088384 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003552357071213641, + "loss": 2.9403, + "theoretical_loss": 3.6565302051326274, + "tokens_seen": 979153920 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035522567703109327, + "loss": 2.7488, + "theoretical_loss": 3.656506824577188, + "tokens_seen": 979219456 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035521564694082245, + "loss": 2.9203, + "theoretical_loss": 3.6564834460245876, + "tokens_seen": 979284992 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035520561685055163, + "loss": 2.9973, + "theoretical_loss": 3.6564600694745213, + "tokens_seen": 979350528 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035519558676028087, + "loss": 2.8523, + "theoretical_loss": 3.6564366949266827, + "tokens_seen": 979416064 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035518555667001, + "loss": 2.8387, + "theoretical_loss": 3.6564133223807667, + "tokens_seen": 979481600 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035517552657973923, + "loss": 2.8515, + "theoretical_loss": 3.656389951836468, + "tokens_seen": 979547136 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003551654964894684, + "loss": 2.7526, + "theoretical_loss": 3.6563665832934813, + "tokens_seen": 979612672 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003551554663991976, + "loss": 2.8189, + "theoretical_loss": 3.6563432167515018, + "tokens_seen": 979678208 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003551454363089268, + "loss": 2.7372, + "theoretical_loss": 3.656319852210223, + "tokens_seen": 979743744 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1133253, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4291090965270996, + "objective/train/theoretical_loss": 3.6563140113874777, + "objective/train/tokens_used": 1000220128, + "theoretical_loss": 3.6563140113874777, + "tokens_seen": 979760128 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035513540621865596, + "loss": 2.6382, + "theoretical_loss": 3.6562964896693417, + "tokens_seen": 979809280 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035512537612838514, + "loss": 2.7368, + "theoretical_loss": 3.656273129128551, + "tokens_seen": 979874816 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035511534603811437, + "loss": 2.7358, + "theoretical_loss": 3.6562497705875465, + "tokens_seen": 979940352 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003551053159478435, + "loss": 2.9211, + "theoretical_loss": 3.6562264140460243, + "tokens_seen": 980005888 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035509528585757273, + "loss": 2.8374, + "theoretical_loss": 3.656203059503678, + "tokens_seen": 980071424 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003550852557673019, + "loss": 2.5918, + "theoretical_loss": 3.6561797069602036, + "tokens_seen": 980136960 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003550752256770311, + "loss": 2.7636, + "theoretical_loss": 3.6561563564152966, + "tokens_seen": 980202496 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035506519558676033, + "loss": 2.8558, + "theoretical_loss": 3.656133007868652, + "tokens_seen": 980268032 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035505516549648946, + "loss": 2.6737, + "theoretical_loss": 3.6561096613199653, + "tokens_seen": 980333568 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003550451354062187, + "loss": 2.7461, + "theoretical_loss": 3.656086316768932, + "tokens_seen": 980399104 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003550351053159478, + "loss": 2.9711, + "theoretical_loss": 3.6560629742152475, + "tokens_seen": 980464640 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035502507522567706, + "loss": 2.765, + "theoretical_loss": 3.656039633658607, + "tokens_seen": 980530176 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035501504513540624, + "loss": 2.7839, + "theoretical_loss": 3.6560162950987083, + "tokens_seen": 980595712 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003550050150451354, + "loss": 2.7088, + "theoretical_loss": 3.6559929585352444, + "tokens_seen": 980661248 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003549949849548646, + "loss": 2.9667, + "theoretical_loss": 3.6559696239679127, + "tokens_seen": 980726784 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035498495486459384, + "loss": 2.9507, + "theoretical_loss": 3.6559462913964085, + "tokens_seen": 980792320 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035497492477432296, + "loss": 2.906, + "theoretical_loss": 3.655922960820428, + "tokens_seen": 980857856 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003549648946840522, + "loss": 2.8641, + "theoretical_loss": 3.6558996322396666, + "tokens_seen": 980923392 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003549548645937813, + "loss": 2.6324, + "theoretical_loss": 3.655876305653822, + "tokens_seen": 980988928 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035494483450351056, + "loss": 2.8169, + "theoretical_loss": 3.655852981062589, + "tokens_seen": 981054464 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035493480441323974, + "loss": 2.8943, + "theoretical_loss": 3.6558296584656635, + "tokens_seen": 981120000 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003549247743229689, + "loss": 2.7909, + "theoretical_loss": 3.655806337862743, + "tokens_seen": 981185536 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003549147442326981, + "loss": 2.7996, + "theoretical_loss": 3.655783019253523, + "tokens_seen": 981251072 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003549047141424273, + "loss": 2.625, + "theoretical_loss": 3.6557597026377, + "tokens_seen": 981316608 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035489468405215647, + "loss": 2.9213, + "theoretical_loss": 3.655736388014971, + "tokens_seen": 981382144 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1133558, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7345311641693115, + "objective/train/theoretical_loss": 3.655730559670674, + "objective/train/tokens_used": 1001858528, + "theoretical_loss": 3.655730559670674, + "tokens_seen": 981398528 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003548846539618857, + "loss": 2.7113, + "theoretical_loss": 3.655713075385032, + "tokens_seen": 981447680 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035487462387161483, + "loss": 2.9233, + "theoretical_loss": 3.6556897647475797, + "tokens_seen": 981513216 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035486459378134406, + "loss": 2.635, + "theoretical_loss": 3.6556664561023116, + "tokens_seen": 981578752 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035485456369107324, + "loss": 2.9342, + "theoretical_loss": 3.655643149448923, + "tokens_seen": 981644288 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003548445336008024, + "loss": 2.739, + "theoretical_loss": 3.655619844787112, + "tokens_seen": 981709824 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003548345035105316, + "loss": 2.8912, + "theoretical_loss": 3.6555965421165744, + "tokens_seen": 981775360 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003548244734202608, + "loss": 3.0486, + "theoretical_loss": 3.6555732414370077, + "tokens_seen": 981840896 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035481444332998997, + "loss": 3.1694, + "theoretical_loss": 3.6555499427481095, + "tokens_seen": 981906432 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003548044132397192, + "loss": 3.1176, + "theoretical_loss": 3.655526646049576, + "tokens_seen": 981971968 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035479438314944833, + "loss": 2.8801, + "theoretical_loss": 3.655503351341105, + "tokens_seen": 982037504 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035478435305917757, + "loss": 2.9109, + "theoretical_loss": 3.655480058622393, + "tokens_seen": 982103040 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003547743229689067, + "loss": 2.9983, + "theoretical_loss": 3.6554567678931376, + "tokens_seen": 982168576 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035476429287863593, + "loss": 2.7778, + "theoretical_loss": 3.6554334791530363, + "tokens_seen": 982234112 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003547542627883651, + "loss": 2.7951, + "theoretical_loss": 3.6554101924017863, + "tokens_seen": 982299648 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003547442326980943, + "loss": 2.9124, + "theoretical_loss": 3.6553869076390857, + "tokens_seen": 982365184 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035473420260782347, + "loss": 2.6658, + "theoretical_loss": 3.6553636248646306, + "tokens_seen": 982430720 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035472417251755265, + "loss": 3.0599, + "theoretical_loss": 3.6553403440781205, + "tokens_seen": 982496256 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035471414242728183, + "loss": 2.8763, + "theoretical_loss": 3.6553170652792515, + "tokens_seen": 982561792 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035470411233701107, + "loss": 2.8212, + "theoretical_loss": 3.655293788467722, + "tokens_seen": 982627328 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003546940822467402, + "loss": 2.677, + "theoretical_loss": 3.65527051364323, + "tokens_seen": 982692864 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035468405215646943, + "loss": 2.8286, + "theoretical_loss": 3.655247240805473, + "tokens_seen": 982758400 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003546740220661986, + "loss": 2.8833, + "theoretical_loss": 3.6552239699541493, + "tokens_seen": 982823936 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003546639919759278, + "loss": 2.8848, + "theoretical_loss": 3.6552007010889564, + "tokens_seen": 982889472 + }, + { + "epoch": 3.03, + "learning_rate": 0.000354653961885657, + "loss": 2.9692, + "theoretical_loss": 3.6551774342095933, + "tokens_seen": 982955008 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035464393179538616, + "loss": 2.8621, + "theoretical_loss": 3.6551541693157574, + "tokens_seen": 983020544 + }, + { + "debugging/Self-BLEU-5": 0.39763851797645977, + "debugging/distinct-1-grams": 0.7825305380824156, + "debugging/distinct-2-grams": 0.9587387596654564, + "debugging/entropy-1-grams": 5.676356416295845, + "debugging/entropy-2-grams": 6.320223384568614, + "debugging/length": 540.7142857142857, + "debugging/num_segments": 7, + "debugging/score": 0.006025538542352743, + "debugging/score_std": 0.005169474872870312, + "epoch": 3.03, + "objective/train/docs_used": 1133558, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.444734573364258, + "objective/train/theoretical_loss": 3.6551483534025015, + "objective/train/tokens_used": 1003496928, + "theoretical_loss": 3.6551483534025015, + "tokens_seen": 983036928 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035463390170511534, + "loss": 2.7331, + "theoretical_loss": 3.6551309064071464, + "tokens_seen": 983086080 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035462387161484457, + "loss": 2.7984, + "theoretical_loss": 3.6551076454834597, + "tokens_seen": 983151616 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003546138415245737, + "loss": 2.6689, + "theoretical_loss": 3.655084386544395, + "tokens_seen": 983217152 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035460381143430293, + "loss": 2.9601, + "theoretical_loss": 3.655061129589651, + "tokens_seen": 983282688 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035459378134403206, + "loss": 2.8545, + "theoretical_loss": 3.655037874618926, + "tokens_seen": 983348224 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003545837512537613, + "loss": 2.8236, + "theoretical_loss": 3.655014621631918, + "tokens_seen": 983413760 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003545737211634905, + "loss": 2.8477, + "theoretical_loss": 3.654991370628327, + "tokens_seen": 983479296 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035456369107321966, + "loss": 3.001, + "theoretical_loss": 3.6549681216078502, + "tokens_seen": 983544832 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035455366098294884, + "loss": 2.8405, + "theoretical_loss": 3.654944874570187, + "tokens_seen": 983610368 + }, + { + "epoch": 3.03, + "learning_rate": 0.000354543630892678, + "loss": 2.7946, + "theoretical_loss": 3.6549216295150364, + "tokens_seen": 983675904 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003545336008024072, + "loss": 2.7789, + "theoretical_loss": 3.6548983864420967, + "tokens_seen": 983741440 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035452357071213644, + "loss": 2.9531, + "theoretical_loss": 3.6548751453510677, + "tokens_seen": 983806976 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035451354062186556, + "loss": 2.8242, + "theoretical_loss": 3.654851906241647, + "tokens_seen": 983872512 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003545035105315948, + "loss": 2.8175, + "theoretical_loss": 3.6548286691135354, + "tokens_seen": 983938048 + }, + { + "epoch": 3.03, + "learning_rate": 0.000354493480441324, + "loss": 2.9619, + "theoretical_loss": 3.6548054339664304, + "tokens_seen": 984003584 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035448345035105316, + "loss": 2.8656, + "theoretical_loss": 3.6547822008000326, + "tokens_seen": 984069120 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035447342026078234, + "loss": 2.7168, + "theoretical_loss": 3.654758969614041, + "tokens_seen": 984134656 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003544633901705115, + "loss": 2.741, + "theoretical_loss": 3.654735740408153, + "tokens_seen": 984200192 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003544533600802407, + "loss": 3.0168, + "theoretical_loss": 3.6547125131820706, + "tokens_seen": 984265728 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035444332998996994, + "loss": 3.0056, + "theoretical_loss": 3.654689287935492, + "tokens_seen": 984331264 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035443329989969907, + "loss": 2.8746, + "theoretical_loss": 3.654666064668117, + "tokens_seen": 984396800 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003544232698094283, + "loss": 3.1804, + "theoretical_loss": 3.654642843379645, + "tokens_seen": 984462336 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035441323971915743, + "loss": 3.1955, + "theoretical_loss": 3.6546196240697757, + "tokens_seen": 984527872 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035440320962888667, + "loss": 2.9598, + "theoretical_loss": 3.6545964067382086, + "tokens_seen": 984593408 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035439317953861585, + "loss": 2.938, + "theoretical_loss": 3.654573191384644, + "tokens_seen": 984658944 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1134314, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.677119493484497, + "objective/train/theoretical_loss": 3.6545673878552805, + "objective/train/tokens_used": 1005135328, + "theoretical_loss": 3.6545673878552805, + "tokens_seen": 984675328 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035438314944834503, + "loss": 3.1099, + "theoretical_loss": 3.654549978008782, + "tokens_seen": 984724480 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003543731193580742, + "loss": 3.0527, + "theoretical_loss": 3.6545267666103207, + "tokens_seen": 984790016 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035436308926780344, + "loss": 3.0231, + "theoretical_loss": 3.6545035571889626, + "tokens_seen": 984855552 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003543530591775326, + "loss": 2.8865, + "theoretical_loss": 3.6544803497444063, + "tokens_seen": 984921088 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003543430290872618, + "loss": 2.7854, + "theoretical_loss": 3.6544571442763516, + "tokens_seen": 984986624 + }, + { + "epoch": 3.03, + "learning_rate": 0.000354332998996991, + "loss": 2.712, + "theoretical_loss": 3.6544339407844997, + "tokens_seen": 985052160 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035432296890672017, + "loss": 2.8541, + "theoretical_loss": 3.6544107392685503, + "tokens_seen": 985117696 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003543129388164494, + "loss": 2.9849, + "theoretical_loss": 3.6543875397282037, + "tokens_seen": 985183232 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035430290872617853, + "loss": 2.9454, + "theoretical_loss": 3.65436434216316, + "tokens_seen": 985248768 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035429287863590777, + "loss": 3.1817, + "theoretical_loss": 3.654341146573121, + "tokens_seen": 985314304 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003542828485456369, + "loss": 2.9778, + "theoretical_loss": 3.6543179529577854, + "tokens_seen": 985379840 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035427281845536613, + "loss": 2.9435, + "theoretical_loss": 3.654294761316855, + "tokens_seen": 985445376 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003542627883650953, + "loss": 2.8006, + "theoretical_loss": 3.6542715716500296, + "tokens_seen": 985510912 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003542527582748245, + "loss": 2.9925, + "theoretical_loss": 3.6542483839570106, + "tokens_seen": 985576448 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035424272818455367, + "loss": 3.0758, + "theoretical_loss": 3.6542251982374983, + "tokens_seen": 985641984 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035423269809428285, + "loss": 2.8809, + "theoretical_loss": 3.6542020144911938, + "tokens_seen": 985707520 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035422266800401203, + "loss": 2.9749, + "theoretical_loss": 3.6541788327177978, + "tokens_seen": 985773056 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035421263791374127, + "loss": 3.0246, + "theoretical_loss": 3.6541556529170114, + "tokens_seen": 985838592 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003542026078234704, + "loss": 3.0333, + "theoretical_loss": 3.6541324750885362, + "tokens_seen": 985904128 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035419257773319963, + "loss": 2.9751, + "theoretical_loss": 3.654109299232072, + "tokens_seen": 985969664 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003541825476429288, + "loss": 3.002, + "theoretical_loss": 3.6540861253473205, + "tokens_seen": 986035200 + }, + { + "epoch": 3.03, + "learning_rate": 0.000354172517552658, + "loss": 3.0969, + "theoretical_loss": 3.654062953433984, + "tokens_seen": 986100736 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003541624874623872, + "loss": 3.0918, + "theoretical_loss": 3.654039783491762, + "tokens_seen": 986166272 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035415245737211636, + "loss": 2.738, + "theoretical_loss": 3.654016615520357, + "tokens_seen": 986231808 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035414242728184554, + "loss": 2.915, + "theoretical_loss": 3.65399344951947, + "tokens_seen": 986297344 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1135754, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.907013416290283, + "objective/train/theoretical_loss": 3.6539876583271065, + "objective/train/tokens_used": 1006773728, + "theoretical_loss": 3.6539876583271065, + "tokens_seen": 986313728 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035413239719157477, + "loss": 2.919, + "theoretical_loss": 3.6539702854888025, + "tokens_seen": 986362880 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003541223671013039, + "loss": 2.9234, + "theoretical_loss": 3.6539471234280567, + "tokens_seen": 986428416 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035411233701103314, + "loss": 2.8228, + "theoretical_loss": 3.6539239633369327, + "tokens_seen": 986493952 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035410230692076226, + "loss": 2.9489, + "theoretical_loss": 3.653900805215134, + "tokens_seen": 986559488 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003540922768304915, + "loss": 3.0417, + "theoretical_loss": 3.6538776490623612, + "tokens_seen": 986625024 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003540822467402207, + "loss": 2.7938, + "theoretical_loss": 3.6538544948783165, + "tokens_seen": 986690560 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035407221664994986, + "loss": 2.9534, + "theoretical_loss": 3.6538313426627016, + "tokens_seen": 986756096 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035406218655967904, + "loss": 2.8762, + "theoretical_loss": 3.6538081924152186, + "tokens_seen": 986821632 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003540521564694082, + "loss": 2.8072, + "theoretical_loss": 3.653785044135569, + "tokens_seen": 986887168 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003540421263791374, + "loss": 3.145, + "theoretical_loss": 3.653761897823456, + "tokens_seen": 986952704 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035403209628886664, + "loss": 3.0583, + "theoretical_loss": 3.6537387534785806, + "tokens_seen": 987018240 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035402206619859577, + "loss": 3.0636, + "theoretical_loss": 3.6537156111006457, + "tokens_seen": 987083776 + }, + { + "epoch": 3.03, + "learning_rate": 0.000354012036108325, + "loss": 2.9563, + "theoretical_loss": 3.6536924706893528, + "tokens_seen": 987149312 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003540020060180542, + "loss": 3.0312, + "theoretical_loss": 3.6536693322444047, + "tokens_seen": 987214848 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035399197592778336, + "loss": 3.0896, + "theoretical_loss": 3.6536461957655044, + "tokens_seen": 987280384 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035398194583751254, + "loss": 2.9546, + "theoretical_loss": 3.653623061252353, + "tokens_seen": 987345920 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003539719157472417, + "loss": 2.9942, + "theoretical_loss": 3.6535999287046543, + "tokens_seen": 987411456 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003539618856569709, + "loss": 3.0153, + "theoretical_loss": 3.65357679812211, + "tokens_seen": 987476992 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035395185556670014, + "loss": 2.7674, + "theoretical_loss": 3.6535536695044235, + "tokens_seen": 987542528 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035394182547642927, + "loss": 3.0445, + "theoretical_loss": 3.653530542851297, + "tokens_seen": 987608064 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003539317953861585, + "loss": 3.0266, + "theoretical_loss": 3.6535074181624334, + "tokens_seen": 987673600 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035392176529588763, + "loss": 2.6688, + "theoretical_loss": 3.6534842954375355, + "tokens_seen": 987739136 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035391173520561687, + "loss": 2.8888, + "theoretical_loss": 3.6534611746763064, + "tokens_seen": 987804672 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035390170511534605, + "loss": 3.0519, + "theoretical_loss": 3.653438055878449, + "tokens_seen": 987870208 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035389167502507523, + "loss": 2.8346, + "theoretical_loss": 3.6534149390436657, + "tokens_seen": 987935744 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1136387, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.134091854095459, + "objective/train/theoretical_loss": 3.6534091601416656, + "objective/train/tokens_used": 1008412128, + "theoretical_loss": 3.6534091601416656, + "tokens_seen": 987952128 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003538816449348044, + "loss": 2.9636, + "theoretical_loss": 3.6533918241716608, + "tokens_seen": 988001280 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035387161484453364, + "loss": 2.8623, + "theoretical_loss": 3.6533687112621367, + "tokens_seen": 988066816 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035386158475426277, + "loss": 3.0957, + "theoretical_loss": 3.6533456003147964, + "tokens_seen": 988132352 + }, + { + "epoch": 3.03, + "learning_rate": 0.000353851554663992, + "loss": 2.8578, + "theoretical_loss": 3.653322491329344, + "tokens_seen": 988197888 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035384152457372113, + "loss": 2.8403, + "theoretical_loss": 3.653299384305482, + "tokens_seen": 988263424 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035383149448345037, + "loss": 2.8318, + "theoretical_loss": 3.653276279242915, + "tokens_seen": 988328960 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035382146439317955, + "loss": 2.8225, + "theoretical_loss": 3.6532531761413454, + "tokens_seen": 988394496 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035381143430290873, + "loss": 2.7515, + "theoretical_loss": 3.6532300750004767, + "tokens_seen": 988460032 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003538014042126379, + "loss": 2.8674, + "theoretical_loss": 3.6532069758200136, + "tokens_seen": 988525568 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003537913741223671, + "loss": 2.8581, + "theoretical_loss": 3.653183878599659, + "tokens_seen": 988591104 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003537813440320963, + "loss": 2.8162, + "theoretical_loss": 3.6531607833391164, + "tokens_seen": 988656640 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003537713139418255, + "loss": 2.9214, + "theoretical_loss": 3.6531376900380903, + "tokens_seen": 988722176 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035376128385155464, + "loss": 2.9007, + "theoretical_loss": 3.653114598696284, + "tokens_seen": 988787712 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035375125376128387, + "loss": 3.061, + "theoretical_loss": 3.653091509313402, + "tokens_seen": 988853248 + }, + { + "epoch": 3.03, + "learning_rate": 0.000353741223671013, + "loss": 3.2347, + "theoretical_loss": 3.653068421889148, + "tokens_seen": 988918784 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035373119358074223, + "loss": 2.7098, + "theoretical_loss": 3.653045336423226, + "tokens_seen": 988984320 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003537211634904714, + "loss": 2.6685, + "theoretical_loss": 3.65302225291534, + "tokens_seen": 989049856 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003537111334002006, + "loss": 3.1113, + "theoretical_loss": 3.6529991713651944, + "tokens_seen": 989115392 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003537011033099298, + "loss": 2.8261, + "theoretical_loss": 3.652976091772494, + "tokens_seen": 989180928 + }, + { + "epoch": 3.03, + "learning_rate": 0.000353691073219659, + "loss": 2.706, + "theoretical_loss": 3.6529530141369424, + "tokens_seen": 989246464 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035368104312938814, + "loss": 2.8985, + "theoretical_loss": 3.652929938458244, + "tokens_seen": 989312000 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003536710130391174, + "loss": 2.8457, + "theoretical_loss": 3.652906864736103, + "tokens_seen": 989377536 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003536609829488465, + "loss": 2.9615, + "theoretical_loss": 3.6528837929702256, + "tokens_seen": 989443072 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035365095285857574, + "loss": 2.7433, + "theoretical_loss": 3.6528607231603143, + "tokens_seen": 989508608 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003536409227683049, + "loss": 2.8964, + "theoretical_loss": 3.6528376553060746, + "tokens_seen": 989574144 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1137615, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.197035074234009, + "objective/train/theoretical_loss": 3.6528318886480537, + "objective/train/tokens_used": 1010050528, + "theoretical_loss": 3.6528318886480537, + "tokens_seen": 989590528 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003536308926780341, + "loss": 3.1049, + "theoretical_loss": 3.6528145894072113, + "tokens_seen": 989639680 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003536208625877633, + "loss": 2.9796, + "theoretical_loss": 3.6527915254634293, + "tokens_seen": 989705216 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035361083249749246, + "loss": 2.7986, + "theoretical_loss": 3.652768463474433, + "tokens_seen": 989770752 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003536008024072217, + "loss": 2.8468, + "theoretical_loss": 3.652745403439928, + "tokens_seen": 989836288 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003535907723169509, + "loss": 2.8229, + "theoretical_loss": 3.652722345359618, + "tokens_seen": 989901824 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035358074222668006, + "loss": 2.7055, + "theoretical_loss": 3.65269928923321, + "tokens_seen": 989967360 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035357071213640924, + "loss": 2.593, + "theoretical_loss": 3.652676235060407, + "tokens_seen": 990032896 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003535606820461384, + "loss": 2.8729, + "theoretical_loss": 3.6526531828409157, + "tokens_seen": 990098432 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003535506519558676, + "loss": 2.9613, + "theoretical_loss": 3.6526301325744406, + "tokens_seen": 990163968 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035354062186559684, + "loss": 3.0164, + "theoretical_loss": 3.652607084260687, + "tokens_seen": 990229504 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035353059177532597, + "loss": 2.9109, + "theoretical_loss": 3.65258403789936, + "tokens_seen": 990295040 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003535205616850552, + "loss": 2.8497, + "theoretical_loss": 3.652560993490166, + "tokens_seen": 990360576 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003535105315947844, + "loss": 2.9282, + "theoretical_loss": 3.6525379510328095, + "tokens_seen": 990426112 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035350050150451356, + "loss": 2.9462, + "theoretical_loss": 3.652514910526997, + "tokens_seen": 990491648 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035349047141424274, + "loss": 2.7424, + "theoretical_loss": 3.652491871972433, + "tokens_seen": 990557184 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003534804413239719, + "loss": 2.738, + "theoretical_loss": 3.6524688353688237, + "tokens_seen": 990622720 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003534704112337011, + "loss": 3.0098, + "theoretical_loss": 3.652445800715875, + "tokens_seen": 990688256 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035346038114343034, + "loss": 2.6713, + "theoretical_loss": 3.6524227680132926, + "tokens_seen": 990753792 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035345035105315947, + "loss": 3.0378, + "theoretical_loss": 3.652399737260782, + "tokens_seen": 990819328 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003534403209628887, + "loss": 3.0039, + "theoretical_loss": 3.65237670845805, + "tokens_seen": 990884864 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035343029087261783, + "loss": 2.8203, + "theoretical_loss": 3.6523536816048017, + "tokens_seen": 990950400 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035342026078234707, + "loss": 2.9151, + "theoretical_loss": 3.6523306567007436, + "tokens_seen": 991015936 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035341023069207625, + "loss": 3.0935, + "theoretical_loss": 3.6523076337455813, + "tokens_seen": 991081472 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035340020060180543, + "loss": 2.7343, + "theoretical_loss": 3.652284612739022, + "tokens_seen": 991147008 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003533901705115346, + "loss": 2.9753, + "theoretical_loss": 3.6522615936807705, + "tokens_seen": 991212544 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1138406, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1054980754852295, + "objective/train/theoretical_loss": 3.6522558392205973, + "objective/train/tokens_used": 1011688928, + "theoretical_loss": 3.6522558392205973, + "tokens_seen": 991228928 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035338014042126384, + "loss": 2.9782, + "theoretical_loss": 3.652238576570535, + "tokens_seen": 991278080 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035337011033099297, + "loss": 2.844, + "theoretical_loss": 3.6522155614080205, + "tokens_seen": 991343616 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003533600802407222, + "loss": 2.9592, + "theoretical_loss": 3.652192548192933, + "tokens_seen": 991409152 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035335005015045133, + "loss": 2.976, + "theoretical_loss": 3.6521695369249807, + "tokens_seen": 991474688 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035334002006018057, + "loss": 2.9461, + "theoretical_loss": 3.652146527603869, + "tokens_seen": 991540224 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035332998996990975, + "loss": 2.8723, + "theoretical_loss": 3.6521235202293045, + "tokens_seen": 991605760 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035331995987963893, + "loss": 2.7463, + "theoretical_loss": 3.6521005148009946, + "tokens_seen": 991671296 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003533099297893681, + "loss": 2.8138, + "theoretical_loss": 3.6520775113186454, + "tokens_seen": 991736832 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003532998996990973, + "loss": 2.8062, + "theoretical_loss": 3.652054509781964, + "tokens_seen": 991802368 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003532898696088265, + "loss": 2.9703, + "theoretical_loss": 3.652031510190657, + "tokens_seen": 991867904 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003532798395185557, + "loss": 2.7611, + "theoretical_loss": 3.6520085125444313, + "tokens_seen": 991933440 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035326980942828484, + "loss": 2.7765, + "theoretical_loss": 3.651985516842995, + "tokens_seen": 991998976 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035325977933801407, + "loss": 2.7648, + "theoretical_loss": 3.6519625230860537, + "tokens_seen": 992064512 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003532497492477432, + "loss": 3.091, + "theoretical_loss": 3.6519395312733156, + "tokens_seen": 992130048 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035323971915747243, + "loss": 2.7488, + "theoretical_loss": 3.651916541404487, + "tokens_seen": 992195584 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003532296890672016, + "loss": 2.7365, + "theoretical_loss": 3.651893553479276, + "tokens_seen": 992261120 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003532196589769308, + "loss": 2.9087, + "theoretical_loss": 3.6518705674973893, + "tokens_seen": 992326656 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035320962888666, + "loss": 2.8225, + "theoretical_loss": 3.651847583458535, + "tokens_seen": 992392192 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003531995987963892, + "loss": 2.5887, + "theoretical_loss": 3.65182460136242, + "tokens_seen": 992457728 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035318956870611834, + "loss": 2.8481, + "theoretical_loss": 3.651801621208752, + "tokens_seen": 992523264 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003531795386158476, + "loss": 2.7536, + "theoretical_loss": 3.651778642997238, + "tokens_seen": 992588800 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003531695085255767, + "loss": 2.8793, + "theoretical_loss": 3.6517556667275866, + "tokens_seen": 992654336 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035315947843530594, + "loss": 2.8578, + "theoretical_loss": 3.651732692399505, + "tokens_seen": 992719872 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003531494483450351, + "loss": 3.0135, + "theoretical_loss": 3.6517097200127013, + "tokens_seen": 992785408 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003531394182547643, + "loss": 2.7965, + "theoretical_loss": 3.651686749566883, + "tokens_seen": 992850944 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1143666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9665863513946533, + "objective/train/theoretical_loss": 3.6516810072586727, + "objective/train/tokens_used": 1013327328, + "theoretical_loss": 3.6516810072586727, + "tokens_seen": 992867328 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003531293881644935, + "loss": 2.9326, + "theoretical_loss": 3.6516637810617576, + "tokens_seen": 992916480 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035311935807422266, + "loss": 2.9258, + "theoretical_loss": 3.651640814497034, + "tokens_seen": 992982016 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035310932798395184, + "loss": 3.1624, + "theoretical_loss": 3.6516178498724194, + "tokens_seen": 993047552 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003530992978936811, + "loss": 2.8843, + "theoretical_loss": 3.6515948871876227, + "tokens_seen": 993113088 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003530892678034102, + "loss": 2.7025, + "theoretical_loss": 3.6515719264423514, + "tokens_seen": 993178624 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035307923771313944, + "loss": 2.7606, + "theoretical_loss": 3.651548967636314, + "tokens_seen": 993244160 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035306920762286857, + "loss": 2.9293, + "theoretical_loss": 3.651526010769219, + "tokens_seen": 993309696 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003530591775325978, + "loss": 2.6736, + "theoretical_loss": 3.6515030558407737, + "tokens_seen": 993375232 + }, + { + "epoch": 3.03, + "learning_rate": 0.000353049147442327, + "loss": 2.9679, + "theoretical_loss": 3.651480102850688, + "tokens_seen": 993440768 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035303911735205617, + "loss": 2.6353, + "theoretical_loss": 3.651457151798669, + "tokens_seen": 993506304 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035302908726178535, + "loss": 2.9455, + "theoretical_loss": 3.6514342026844258, + "tokens_seen": 993571840 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003530190571715146, + "loss": 2.9826, + "theoretical_loss": 3.6514112555076674, + "tokens_seen": 993637376 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003530090270812437, + "loss": 2.8492, + "theoretical_loss": 3.651388310268102, + "tokens_seen": 993702912 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035299899699097294, + "loss": 2.8351, + "theoretical_loss": 3.6513653669654387, + "tokens_seen": 993768448 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035298896690070207, + "loss": 2.9885, + "theoretical_loss": 3.6513424255993856, + "tokens_seen": 993833984 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003529789368104313, + "loss": 2.7295, + "theoretical_loss": 3.651319486169652, + "tokens_seen": 993899520 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003529689067201605, + "loss": 2.9383, + "theoretical_loss": 3.6512965486759468, + "tokens_seen": 993965056 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035295887662988967, + "loss": 2.6859, + "theoretical_loss": 3.651273613117979, + "tokens_seen": 994030592 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035294884653961885, + "loss": 2.8415, + "theoretical_loss": 3.6512506794954573, + "tokens_seen": 994096128 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035293881644934803, + "loss": 2.697, + "theoretical_loss": 3.651227747808091, + "tokens_seen": 994161664 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003529287863590772, + "loss": 2.8838, + "theoretical_loss": 3.6512048180555894, + "tokens_seen": 994227200 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035291875626880645, + "loss": 2.9552, + "theoretical_loss": 3.6511818902376616, + "tokens_seen": 994292736 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003529087261785356, + "loss": 3.0095, + "theoretical_loss": 3.6511589643540168, + "tokens_seen": 994358272 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003528986960882648, + "loss": 2.6717, + "theoretical_loss": 3.6511360404043645, + "tokens_seen": 994423808 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035288866599799394, + "loss": 3.1065, + "theoretical_loss": 3.6511131183884142, + "tokens_seen": 994489344 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1148851, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7076807022094727, + "objective/train/theoretical_loss": 3.6511073881865337, + "objective/train/tokens_used": 1014965728, + "theoretical_loss": 3.6511073881865337, + "tokens_seen": 994505728 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035287863590772317, + "loss": 2.8518, + "theoretical_loss": 3.6510901983058757, + "tokens_seen": 994554880 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035286860581745235, + "loss": 3.0575, + "theoretical_loss": 3.6510672801564574, + "tokens_seen": 994620416 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035285857572718153, + "loss": 2.7546, + "theoretical_loss": 3.65104436393987, + "tokens_seen": 994685952 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035284854563691077, + "loss": 2.8476, + "theoretical_loss": 3.651021449655822, + "tokens_seen": 994751488 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035283851554663995, + "loss": 2.7623, + "theoretical_loss": 3.650998537304025, + "tokens_seen": 994817024 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035282848545636913, + "loss": 2.9836, + "theoretical_loss": 3.6509756268841866, + "tokens_seen": 994882560 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003528184553660983, + "loss": 2.7934, + "theoretical_loss": 3.6509527183960184, + "tokens_seen": 994948096 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003528084252758275, + "loss": 2.8664, + "theoretical_loss": 3.6509298118392293, + "tokens_seen": 995013632 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003527983951855567, + "loss": 2.8624, + "theoretical_loss": 3.650906907213529, + "tokens_seen": 995079168 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003527883650952859, + "loss": 2.958, + "theoretical_loss": 3.650884004518629, + "tokens_seen": 995144704 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035277833500501504, + "loss": 2.9275, + "theoretical_loss": 3.650861103754238, + "tokens_seen": 995210240 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035276830491474427, + "loss": 3.1767, + "theoretical_loss": 3.650838204920067, + "tokens_seen": 995275776 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003527582748244734, + "loss": 2.8752, + "theoretical_loss": 3.650815308015826, + "tokens_seen": 995341312 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035274824473420263, + "loss": 2.9867, + "theoretical_loss": 3.650792413041225, + "tokens_seen": 995406848 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003527382146439318, + "loss": 2.9126, + "theoretical_loss": 3.6507695199959747, + "tokens_seen": 995472384 + }, + { + "epoch": 3.03, + "learning_rate": 0.000352728184553661, + "loss": 2.957, + "theoretical_loss": 3.650746628879785, + "tokens_seen": 995537920 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003527181544633902, + "loss": 2.9834, + "theoretical_loss": 3.650723739692367, + "tokens_seen": 995603456 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003527081243731194, + "loss": 2.6613, + "theoretical_loss": 3.650700852433431, + "tokens_seen": 995668992 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035269809428284854, + "loss": 2.4629, + "theoretical_loss": 3.6506779671026877, + "tokens_seen": 995734528 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003526880641925778, + "loss": 2.7763, + "theoretical_loss": 3.6506550836998475, + "tokens_seen": 995800064 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003526780341023069, + "loss": 2.7792, + "theoretical_loss": 3.650632202224621, + "tokens_seen": 995865600 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035266800401203614, + "loss": 3.1098, + "theoretical_loss": 3.6506093226767193, + "tokens_seen": 995931136 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003526579739217653, + "loss": 2.7007, + "theoretical_loss": 3.650586445055853, + "tokens_seen": 995996672 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003526479438314945, + "loss": 2.9314, + "theoretical_loss": 3.650563569361734, + "tokens_seen": 996062208 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003526379137412237, + "loss": 3.065, + "theoretical_loss": 3.6505406955940725, + "tokens_seen": 996127744 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1153874, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.706052541732788, + "objective/train/theoretical_loss": 3.650534977453132, + "objective/train/tokens_used": 1016604128, + "theoretical_loss": 3.650534977453132, + "tokens_seen": 996144128 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035262788365095286, + "loss": 2.8504, + "theoretical_loss": 3.650517823752579, + "tokens_seen": 996193280 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035261785356068204, + "loss": 2.9067, + "theoretical_loss": 3.650494953836965, + "tokens_seen": 996258816 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003526078234704113, + "loss": 2.7841, + "theoretical_loss": 3.650472085846942, + "tokens_seen": 996324352 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003525977933801404, + "loss": 2.7586, + "theoretical_loss": 3.6504492197822214, + "tokens_seen": 996389888 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035258776328986964, + "loss": 2.6799, + "theoretical_loss": 3.650426355642514, + "tokens_seen": 996455424 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035257773319959877, + "loss": 2.8452, + "theoretical_loss": 3.6504034934275307, + "tokens_seen": 996520960 + }, + { + "epoch": 3.03, + "learning_rate": 0.000352567703109328, + "loss": 2.7273, + "theoretical_loss": 3.650380633136984, + "tokens_seen": 996586496 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003525576730190572, + "loss": 2.9257, + "theoretical_loss": 3.6503577747705847, + "tokens_seen": 996652032 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035254764292878637, + "loss": 2.8902, + "theoretical_loss": 3.6503349183280447, + "tokens_seen": 996717568 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035253761283851555, + "loss": 2.8466, + "theoretical_loss": 3.650312063809075, + "tokens_seen": 996783104 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003525275827482448, + "loss": 2.9663, + "theoretical_loss": 3.6502892112133876, + "tokens_seen": 996848640 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003525175526579739, + "loss": 2.9607, + "theoretical_loss": 3.6502663605406944, + "tokens_seen": 996914176 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035250752256770314, + "loss": 2.795, + "theoretical_loss": 3.6502435117907073, + "tokens_seen": 996979712 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035249749247743227, + "loss": 2.8096, + "theoretical_loss": 3.6502206649631375, + "tokens_seen": 997045248 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003524874623871615, + "loss": 2.9291, + "theoretical_loss": 3.6501978200576977, + "tokens_seen": 997110784 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003524774322968907, + "loss": 2.7279, + "theoretical_loss": 3.6501749770740988, + "tokens_seen": 997176320 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035246740220661987, + "loss": 2.9556, + "theoretical_loss": 3.6501521360120543, + "tokens_seen": 997241856 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035245737211634905, + "loss": 2.9142, + "theoretical_loss": 3.650129296871275, + "tokens_seen": 997307392 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035244734202607823, + "loss": 2.9961, + "theoretical_loss": 3.6501064596514734, + "tokens_seen": 997372928 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003524373119358074, + "loss": 2.7048, + "theoretical_loss": 3.6500836243523627, + "tokens_seen": 997438464 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035242728184553665, + "loss": 2.8819, + "theoretical_loss": 3.6500607909736535, + "tokens_seen": 997504000 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003524172517552658, + "loss": 2.9255, + "theoretical_loss": 3.6500379595150596, + "tokens_seen": 997569536 + }, + { + "epoch": 3.03, + "learning_rate": 0.000352407221664995, + "loss": 2.8446, + "theoretical_loss": 3.6500151299762926, + "tokens_seen": 997635072 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035239719157472414, + "loss": 2.6841, + "theoretical_loss": 3.649992302357065, + "tokens_seen": 997700608 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035238716148445337, + "loss": 2.8274, + "theoretical_loss": 3.64996947665709, + "tokens_seen": 997766144 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1156592, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5675315856933594, + "objective/train/theoretical_loss": 3.6499637705319454, + "objective/train/tokens_used": 1018242528, + "theoretical_loss": 3.6499637705319454, + "tokens_seen": 997782528 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035237713139418255, + "loss": 2.9074, + "theoretical_loss": 3.6499466528760793, + "tokens_seen": 997831680 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035236710130391173, + "loss": 2.8998, + "theoretical_loss": 3.649923831013746, + "tokens_seen": 997897216 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003523570712136409, + "loss": 2.7522, + "theoretical_loss": 3.6499010110698027, + "tokens_seen": 997962752 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035234704112337015, + "loss": 2.8103, + "theoretical_loss": 3.6498781930439623, + "tokens_seen": 998028288 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003523370110330993, + "loss": 2.8486, + "theoretical_loss": 3.6498553769359376, + "tokens_seen": 998093824 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003523269809428285, + "loss": 2.8552, + "theoretical_loss": 3.6498325627454413, + "tokens_seen": 998159360 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035231695085255764, + "loss": 2.8996, + "theoretical_loss": 3.649809750472187, + "tokens_seen": 998224896 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003523069207622869, + "loss": 2.8666, + "theoretical_loss": 3.649786940115887, + "tokens_seen": 998290432 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035229689067201606, + "loss": 2.8501, + "theoretical_loss": 3.649764131676255, + "tokens_seen": 998355968 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035228686058174524, + "loss": 2.7246, + "theoretical_loss": 3.649741325153003, + "tokens_seen": 998421504 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003522768304914744, + "loss": 2.7134, + "theoretical_loss": 3.6497185205458456, + "tokens_seen": 998487040 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003522668004012036, + "loss": 2.9018, + "theoretical_loss": 3.6496957178544953, + "tokens_seen": 998552576 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003522567703109328, + "loss": 2.7131, + "theoretical_loss": 3.649672917078666, + "tokens_seen": 998618112 + }, + { + "epoch": 3.03, + "learning_rate": 0.000352246740220662, + "loss": 2.9995, + "theoretical_loss": 3.6496501182180703, + "tokens_seen": 998683648 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035223671013039114, + "loss": 2.8589, + "theoretical_loss": 3.649627321272422, + "tokens_seen": 998749184 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003522266800401204, + "loss": 2.7328, + "theoretical_loss": 3.649604526241435, + "tokens_seen": 998814720 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035221664994984956, + "loss": 2.9369, + "theoretical_loss": 3.6495817331248226, + "tokens_seen": 998880256 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035220661985957874, + "loss": 3.0445, + "theoretical_loss": 3.6495589419222982, + "tokens_seen": 998945792 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003521965897693079, + "loss": 2.8777, + "theoretical_loss": 3.6495361526335754, + "tokens_seen": 999011328 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003521865596790371, + "loss": 2.8657, + "theoretical_loss": 3.649513365258369, + "tokens_seen": 999076864 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003521765295887663, + "loss": 2.7128, + "theoretical_loss": 3.6494905797963915, + "tokens_seen": 999142400 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003521664994984955, + "loss": 2.704, + "theoretical_loss": 3.649467796247358, + "tokens_seen": 999207936 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035215646940822465, + "loss": 2.8895, + "theoretical_loss": 3.649445014610981, + "tokens_seen": 999273472 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003521464393179539, + "loss": 2.8723, + "theoretical_loss": 3.649422234886976, + "tokens_seen": 999339008 + }, + { + "epoch": 3.03, + "learning_rate": 0.000352136409227683, + "loss": 3.0843, + "theoretical_loss": 3.6493994570750568, + "tokens_seen": 999404544 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1159661, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2150416374206543, + "objective/train/theoretical_loss": 3.6493937629208064, + "objective/train/tokens_used": 1019880928, + "theoretical_loss": 3.6493937629208064, + "tokens_seen": 999420928 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035212637913741224, + "loss": 2.8963, + "theoretical_loss": 3.649376681174936, + "tokens_seen": 999470080 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003521163490471414, + "loss": 3.0057, + "theoretical_loss": 3.64935390718633, + "tokens_seen": 999535616 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003521063189568706, + "loss": 2.9464, + "theoretical_loss": 3.6493311351089517, + "tokens_seen": 999601152 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035209628886659984, + "loss": 3.0535, + "theoretical_loss": 3.6493083649425153, + "tokens_seen": 999666688 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035208625877632897, + "loss": 3.0811, + "theoretical_loss": 3.649285596686736, + "tokens_seen": 999732224 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003520762286860582, + "loss": 2.69, + "theoretical_loss": 3.6492628303413275, + "tokens_seen": 999797760 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003520661985957874, + "loss": 3.0515, + "theoretical_loss": 3.649240065906005, + "tokens_seen": 999863296 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035205616850551657, + "loss": 2.9586, + "theoretical_loss": 3.6492173033804827, + "tokens_seen": 999928832 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035204613841524575, + "loss": 3.0117, + "theoretical_loss": 3.649194542764475, + "tokens_seen": 999994368 + }, + { + "epoch": 3.03, + "learning_rate": 0.000352036108324975, + "loss": 2.9995, + "theoretical_loss": 3.6491717840576974, + "tokens_seen": 1000059904 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003520260782347041, + "loss": 3.0554, + "theoretical_loss": 3.6491490272598637, + "tokens_seen": 1000125440 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035201604814443334, + "loss": 3.0433, + "theoretical_loss": 3.649126272370689, + "tokens_seen": 1000190976 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035200601805416247, + "loss": 2.8908, + "theoretical_loss": 3.6491035193898886, + "tokens_seen": 1000256512 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003519959879638917, + "loss": 2.9182, + "theoretical_loss": 3.6490807683171766, + "tokens_seen": 1000322048 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003519859578736209, + "loss": 2.7976, + "theoretical_loss": 3.649058019152269, + "tokens_seen": 1000387584 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035197592778335007, + "loss": 2.671, + "theoretical_loss": 3.6490352718948795, + "tokens_seen": 1000453120 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035196589769307925, + "loss": 2.8257, + "theoretical_loss": 3.649012526544725, + "tokens_seen": 1000518656 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035195586760280843, + "loss": 2.5004, + "theoretical_loss": 3.6489897831015194, + "tokens_seen": 1000584192 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003519458375125376, + "loss": 2.5683, + "theoretical_loss": 3.648967041564978, + "tokens_seen": 1000649728 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035193580742226685, + "loss": 3.0786, + "theoretical_loss": 3.648944301934817, + "tokens_seen": 1000715264 + }, + { + "epoch": 3.03, + "learning_rate": 0.000351925777331996, + "loss": 2.8131, + "theoretical_loss": 3.6489215642107506, + "tokens_seen": 1000780800 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003519157472417252, + "loss": 2.787, + "theoretical_loss": 3.6488988283924946, + "tokens_seen": 1000846336 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035190571715145434, + "loss": 2.8895, + "theoretical_loss": 3.648876094479765, + "tokens_seen": 1000911872 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035189568706118357, + "loss": 2.5759, + "theoretical_loss": 3.648853362472277, + "tokens_seen": 1000977408 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035188565697091275, + "loss": 2.6727, + "theoretical_loss": 3.648830632369746, + "tokens_seen": 1001042944 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1160836, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1073474884033203, + "objective/train/theoretical_loss": 3.648824950141729, + "objective/train/tokens_used": 1021519328, + "theoretical_loss": 3.648824950141729, + "tokens_seen": 1001059328 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035187562688064193, + "loss": 3.1555, + "theoretical_loss": 3.6488079041718877, + "tokens_seen": 1001108480 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003518655967903711, + "loss": 2.9301, + "theoretical_loss": 3.648785177878418, + "tokens_seen": 1001174016 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035185556670010035, + "loss": 2.7881, + "theoretical_loss": 3.648762453489053, + "tokens_seen": 1001239552 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003518455366098295, + "loss": 2.9136, + "theoretical_loss": 3.6487397310035083, + "tokens_seen": 1001305088 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003518355065195587, + "loss": 2.9075, + "theoretical_loss": 3.648717010421499, + "tokens_seen": 1001370624 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035182547642928784, + "loss": 2.561, + "theoretical_loss": 3.6486942917427427, + "tokens_seen": 1001436160 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003518154463390171, + "loss": 3.1295, + "theoretical_loss": 3.648671574966954, + "tokens_seen": 1001501696 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035180541624874626, + "loss": 2.7492, + "theoretical_loss": 3.6486488600938496, + "tokens_seen": 1001567232 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035179538615847544, + "loss": 2.8224, + "theoretical_loss": 3.648626147123146, + "tokens_seen": 1001632768 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003517853560682046, + "loss": 2.9366, + "theoretical_loss": 3.648603436054558, + "tokens_seen": 1001698304 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003517753259779338, + "loss": 2.9208, + "theoretical_loss": 3.648580726887803, + "tokens_seen": 1001763840 + }, + { + "epoch": 3.03, + "learning_rate": 0.000351765295887663, + "loss": 2.8788, + "theoretical_loss": 3.648558019622598, + "tokens_seen": 1001829376 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003517552657973922, + "loss": 2.9987, + "theoretical_loss": 3.648535314258658, + "tokens_seen": 1001894912 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035174523570712134, + "loss": 3.0191, + "theoretical_loss": 3.6485126107957004, + "tokens_seen": 1001960448 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003517352056168506, + "loss": 2.8817, + "theoretical_loss": 3.648489909233441, + "tokens_seen": 1002025984 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035172517552657976, + "loss": 2.9599, + "theoretical_loss": 3.648467209571597, + "tokens_seen": 1002091520 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035171514543630894, + "loss": 2.7182, + "theoretical_loss": 3.6484445118098847, + "tokens_seen": 1002157056 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003517051153460381, + "loss": 2.9031, + "theoretical_loss": 3.648421815948021, + "tokens_seen": 1002222592 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003516950852557673, + "loss": 2.6092, + "theoretical_loss": 3.6483991219857224, + "tokens_seen": 1002288128 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003516850551654965, + "loss": 2.6493, + "theoretical_loss": 3.648376429922706, + "tokens_seen": 1002353664 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003516750250752257, + "loss": 2.7576, + "theoretical_loss": 3.6483537397586887, + "tokens_seen": 1002419200 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035166499498495485, + "loss": 2.9421, + "theoretical_loss": 3.648331051493387, + "tokens_seen": 1002484736 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003516549648946841, + "loss": 2.6704, + "theoretical_loss": 3.6483083651265185, + "tokens_seen": 1002550272 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003516449348044132, + "loss": 2.8942, + "theoretical_loss": 3.6482856806578, + "tokens_seen": 1002615808 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035163490471414244, + "loss": 2.9315, + "theoretical_loss": 3.648262998086948, + "tokens_seen": 1002681344 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1161424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.439594030380249, + "objective/train/theoretical_loss": 3.648257327740744, + "objective/train/tokens_used": 1023157728, + "theoretical_loss": 3.648257327740744, + "tokens_seen": 1002697728 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003516248746238716, + "loss": 2.7253, + "theoretical_loss": 3.6482403174136806, + "tokens_seen": 1002746880 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003516148445336008, + "loss": 2.9394, + "theoretical_loss": 3.648217638637715, + "tokens_seen": 1002812416 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035160481444333, + "loss": 2.498, + "theoretical_loss": 3.648194961758768, + "tokens_seen": 1002877952 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035159478435305917, + "loss": 2.7678, + "theoretical_loss": 3.6481722867765574, + "tokens_seen": 1002943488 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035158475426278835, + "loss": 2.7227, + "theoretical_loss": 3.6481496136908, + "tokens_seen": 1003009024 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003515747241725176, + "loss": 2.6544, + "theoretical_loss": 3.6481269425012144, + "tokens_seen": 1003074560 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003515646940822467, + "loss": 2.7992, + "theoretical_loss": 3.648104273207517, + "tokens_seen": 1003140096 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035155466399197595, + "loss": 2.8121, + "theoretical_loss": 3.648081605809426, + "tokens_seen": 1003205632 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035154463390170513, + "loss": 2.8442, + "theoretical_loss": 3.648058940306659, + "tokens_seen": 1003271168 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003515346038114343, + "loss": 2.91, + "theoretical_loss": 3.648036276698934, + "tokens_seen": 1003336704 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003515245737211635, + "loss": 2.9568, + "theoretical_loss": 3.6480136149859677, + "tokens_seen": 1003402240 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035151454363089267, + "loss": 2.7311, + "theoretical_loss": 3.6479909551674794, + "tokens_seen": 1003467776 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035150451354062185, + "loss": 2.8667, + "theoretical_loss": 3.647968297243186, + "tokens_seen": 1003533312 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003514944834503511, + "loss": 2.7325, + "theoretical_loss": 3.6479456412128064, + "tokens_seen": 1003598848 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003514844533600802, + "loss": 2.9816, + "theoretical_loss": 3.6479229870760577, + "tokens_seen": 1003664384 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035147442326980945, + "loss": 2.8746, + "theoretical_loss": 3.6479003348326584, + "tokens_seen": 1003729920 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003514643931795386, + "loss": 2.9706, + "theoretical_loss": 3.647877684482326, + "tokens_seen": 1003795456 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003514543630892678, + "loss": 2.769, + "theoretical_loss": 3.64785503602478, + "tokens_seen": 1003860992 + }, + { + "epoch": 3.03, + "learning_rate": 0.000351444332998997, + "loss": 2.815, + "theoretical_loss": 3.647832389459738, + "tokens_seen": 1003926528 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003514343029087262, + "loss": 2.6748, + "theoretical_loss": 3.647809744786918, + "tokens_seen": 1003992064 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035142427281845536, + "loss": 2.787, + "theoretical_loss": 3.6477871020060393, + "tokens_seen": 1004057600 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035141424272818454, + "loss": 2.9168, + "theoretical_loss": 3.647764461116819, + "tokens_seen": 1004123136 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003514042126379137, + "loss": 2.7364, + "theoretical_loss": 3.6477418221189772, + "tokens_seen": 1004188672 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035139418254764295, + "loss": 2.7187, + "theoretical_loss": 3.647719185012231, + "tokens_seen": 1004254208 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003513841524573721, + "loss": 2.7207, + "theoretical_loss": 3.6476965497963, + "tokens_seen": 1004319744 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1162777, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7652225494384766, + "objective/train/theoretical_loss": 3.6476908912877244, + "objective/train/tokens_used": 1024796128, + "theoretical_loss": 3.6476908912877244, + "tokens_seen": 1004336128 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003513741223671013, + "loss": 2.9625, + "theoretical_loss": 3.647673916470903, + "tokens_seen": 1004385280 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003513640922768305, + "loss": 2.8803, + "theoretical_loss": 3.647651285035758, + "tokens_seen": 1004450816 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003513540621865597, + "loss": 2.7448, + "theoretical_loss": 3.647628655490584, + "tokens_seen": 1004516352 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003513440320962889, + "loss": 3.026, + "theoretical_loss": 3.6476060278351, + "tokens_seen": 1004581888 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035133400200601804, + "loss": 2.9949, + "theoretical_loss": 3.6475834020690256, + "tokens_seen": 1004647424 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003513239719157473, + "loss": 3.0162, + "theoretical_loss": 3.647560778192079, + "tokens_seen": 1004712960 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035131394182547646, + "loss": 2.7788, + "theoretical_loss": 3.6475381562039795, + "tokens_seen": 1004778496 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035130391173520564, + "loss": 2.9032, + "theoretical_loss": 3.647515536104446, + "tokens_seen": 1004844032 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003512938816449348, + "loss": 2.8642, + "theoretical_loss": 3.6474929178931976, + "tokens_seen": 1004909568 + }, + { + "epoch": 3.03, + "learning_rate": 0.000351283851554664, + "loss": 2.7253, + "theoretical_loss": 3.647470301569954, + "tokens_seen": 1004975104 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003512738214643932, + "loss": 2.9281, + "theoretical_loss": 3.647447687134435, + "tokens_seen": 1005040640 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003512637913741224, + "loss": 2.8002, + "theoretical_loss": 3.6474250745863586, + "tokens_seen": 1005106176 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035125376128385154, + "loss": 2.7841, + "theoretical_loss": 3.647402463925445, + "tokens_seen": 1005171712 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003512437311935808, + "loss": 2.7919, + "theoretical_loss": 3.6473798551514136, + "tokens_seen": 1005237248 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035123370110330996, + "loss": 2.9419, + "theoretical_loss": 3.647357248263984, + "tokens_seen": 1005302784 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035122367101303914, + "loss": 2.7465, + "theoretical_loss": 3.6473346432628753, + "tokens_seen": 1005368320 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003512136409227683, + "loss": 2.811, + "theoretical_loss": 3.647312040147808, + "tokens_seen": 1005433856 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003512036108324975, + "loss": 2.8367, + "theoretical_loss": 3.6472894389185013, + "tokens_seen": 1005499392 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003511935807422267, + "loss": 2.87, + "theoretical_loss": 3.647266839574675, + "tokens_seen": 1005564928 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003511835506519559, + "loss": 2.8118, + "theoretical_loss": 3.6472442421160487, + "tokens_seen": 1005630464 + }, + { + "epoch": 3.03, + "learning_rate": 0.00035117352056168505, + "loss": 2.947, + "theoretical_loss": 3.6472216465423424, + "tokens_seen": 1005696000 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003511634904714143, + "loss": 2.6745, + "theoretical_loss": 3.647199052853277, + "tokens_seen": 1005761536 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003511534603811434, + "loss": 2.8839, + "theoretical_loss": 3.6471764610485713, + "tokens_seen": 1005827072 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035114343029087264, + "loss": 2.8895, + "theoretical_loss": 3.647153871127946, + "tokens_seen": 1005892608 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003511334002006018, + "loss": 2.7014, + "theoretical_loss": 3.647131283091121, + "tokens_seen": 1005958144 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1163373, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.477107524871826, + "objective/train/theoretical_loss": 3.6471256363762254, + "objective/train/tokens_used": 1026434528, + "theoretical_loss": 3.6471256363762254, + "tokens_seen": 1005974528 + }, + { + "epoch": 3.04, + "learning_rate": 0.000351123370110331, + "loss": 2.8156, + "theoretical_loss": 3.6471086969378166, + "tokens_seen": 1006023680 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003511133400200602, + "loss": 2.5824, + "theoretical_loss": 3.647086112667753, + "tokens_seen": 1006089216 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035110330992978937, + "loss": 2.8562, + "theoretical_loss": 3.6470635302806507, + "tokens_seen": 1006154752 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035109327983951855, + "loss": 2.9694, + "theoretical_loss": 3.6470409497762297, + "tokens_seen": 1006220288 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003510832497492478, + "loss": 2.7441, + "theoretical_loss": 3.647018371154211, + "tokens_seen": 1006285824 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003510732196589769, + "loss": 2.8582, + "theoretical_loss": 3.6469957944143143, + "tokens_seen": 1006351360 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035106318956870615, + "loss": 2.764, + "theoretical_loss": 3.646973219556261, + "tokens_seen": 1006416896 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035105315947843533, + "loss": 2.6505, + "theoretical_loss": 3.6469506465797714, + "tokens_seen": 1006482432 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003510431293881645, + "loss": 2.6349, + "theoretical_loss": 3.6469280754845657, + "tokens_seen": 1006547968 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003510330992978937, + "loss": 2.8268, + "theoretical_loss": 3.6469055062703655, + "tokens_seen": 1006613504 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035102306920762287, + "loss": 2.8678, + "theoretical_loss": 3.6468829389368906, + "tokens_seen": 1006679040 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035101303911735205, + "loss": 3.0746, + "theoretical_loss": 3.6468603734838627, + "tokens_seen": 1006744576 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003510030090270813, + "loss": 2.9625, + "theoretical_loss": 3.6468378099110024, + "tokens_seen": 1006810112 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003509929789368104, + "loss": 2.8395, + "theoretical_loss": 3.6468152482180307, + "tokens_seen": 1006875648 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035098294884653965, + "loss": 2.5887, + "theoretical_loss": 3.646792688404669, + "tokens_seen": 1006941184 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003509729187562688, + "loss": 2.7411, + "theoretical_loss": 3.646770130470637, + "tokens_seen": 1007006720 + }, + { + "epoch": 3.04, + "learning_rate": 0.000350962888665998, + "loss": 3.0379, + "theoretical_loss": 3.6467475744156577, + "tokens_seen": 1007072256 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003509528585757272, + "loss": 2.7484, + "theoretical_loss": 3.646725020239451, + "tokens_seen": 1007137792 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003509428284854564, + "loss": 2.8597, + "theoretical_loss": 3.646702467941739, + "tokens_seen": 1007203328 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035093279839518556, + "loss": 2.8188, + "theoretical_loss": 3.646679917522243, + "tokens_seen": 1007268864 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035092276830491474, + "loss": 2.9834, + "theoretical_loss": 3.646657368980683, + "tokens_seen": 1007334400 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003509127382146439, + "loss": 2.7302, + "theoretical_loss": 3.6466348223167824, + "tokens_seen": 1007399936 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035090270812437315, + "loss": 2.7753, + "theoretical_loss": 3.646612277530261, + "tokens_seen": 1007465472 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003508926780341023, + "loss": 2.6536, + "theoretical_loss": 3.646589734620842, + "tokens_seen": 1007531008 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003508826479438315, + "loss": 2.6738, + "theoretical_loss": 3.646567193588246, + "tokens_seen": 1007596544 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1164906, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.129481554031372, + "objective/train/theoretical_loss": 3.6465615586233175, + "objective/train/tokens_used": 1028072928, + "theoretical_loss": 3.6465615586233175, + "tokens_seen": 1007612928 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003508726178535607, + "loss": 2.9174, + "theoretical_loss": 3.6465446544321947, + "tokens_seen": 1007662080 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003508625877632899, + "loss": 2.7308, + "theoretical_loss": 3.6465221171524105, + "tokens_seen": 1007727616 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035085255767301906, + "loss": 2.8025, + "theoretical_loss": 3.646499581748614, + "tokens_seen": 1007793152 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035084252758274824, + "loss": 2.8899, + "theoretical_loss": 3.646477048220528, + "tokens_seen": 1007858688 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003508324974924774, + "loss": 2.8594, + "theoretical_loss": 3.646454516567875, + "tokens_seen": 1007924224 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035082246740220666, + "loss": 2.9397, + "theoretical_loss": 3.6464319867903754, + "tokens_seen": 1007989760 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003508124373119358, + "loss": 2.8024, + "theoretical_loss": 3.646409458887752, + "tokens_seen": 1008055296 + }, + { + "epoch": 3.04, + "learning_rate": 0.000350802407221665, + "loss": 2.8186, + "theoretical_loss": 3.646386932859728, + "tokens_seen": 1008120832 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035079237713139415, + "loss": 2.7744, + "theoretical_loss": 3.646364408706024, + "tokens_seen": 1008186368 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003507823470411234, + "loss": 2.6754, + "theoretical_loss": 3.646341886426362, + "tokens_seen": 1008251904 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035077231695085256, + "loss": 2.9253, + "theoretical_loss": 3.646319366020466, + "tokens_seen": 1008317440 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035076228686058174, + "loss": 2.7929, + "theoretical_loss": 3.646296847488057, + "tokens_seen": 1008382976 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003507522567703109, + "loss": 2.9397, + "theoretical_loss": 3.6462743308288577, + "tokens_seen": 1008448512 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035074222668004016, + "loss": 2.8635, + "theoretical_loss": 3.646251816042591, + "tokens_seen": 1008514048 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003507321965897693, + "loss": 2.8216, + "theoretical_loss": 3.6462293031289787, + "tokens_seen": 1008579584 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003507221664994985, + "loss": 2.8308, + "theoretical_loss": 3.6462067920877437, + "tokens_seen": 1008645120 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035071213640922765, + "loss": 2.9977, + "theoretical_loss": 3.646184282918609, + "tokens_seen": 1008710656 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003507021063189569, + "loss": 2.8788, + "theoretical_loss": 3.6461617756212963, + "tokens_seen": 1008776192 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035069207622868607, + "loss": 2.6056, + "theoretical_loss": 3.6461392701955297, + "tokens_seen": 1008841728 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035068204613841525, + "loss": 2.8894, + "theoretical_loss": 3.646116766641031, + "tokens_seen": 1008907264 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035067201604814443, + "loss": 2.8793, + "theoretical_loss": 3.646094264957523, + "tokens_seen": 1008972800 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003506619859578736, + "loss": 2.6769, + "theoretical_loss": 3.64607176514473, + "tokens_seen": 1009038336 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003506519558676028, + "loss": 2.6217, + "theoretical_loss": 3.6460492672023728, + "tokens_seen": 1009103872 + }, + { + "epoch": 3.04, + "learning_rate": 0.000350641925777332, + "loss": 2.7837, + "theoretical_loss": 3.6460267711301766, + "tokens_seen": 1009169408 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035063189568706115, + "loss": 2.8821, + "theoretical_loss": 3.6460042769278633, + "tokens_seen": 1009234944 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1165274, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.818286657333374, + "objective/train/theoretical_loss": 3.6459986536694218, + "objective/train/tokens_used": 1029711328, + "theoretical_loss": 3.6459986536694218, + "tokens_seen": 1009251328 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003506218655967904, + "loss": 2.8747, + "theoretical_loss": 3.6459817845951563, + "tokens_seen": 1009300480 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003506118355065195, + "loss": 2.6386, + "theoretical_loss": 3.645959294131779, + "tokens_seen": 1009366016 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035060180541624875, + "loss": 2.8541, + "theoretical_loss": 3.6459368055374544, + "tokens_seen": 1009431552 + }, + { + "epoch": 3.04, + "learning_rate": 0.000350591775325978, + "loss": 2.925, + "theoretical_loss": 3.6459143188119056, + "tokens_seen": 1009497088 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003505817452357071, + "loss": 2.7224, + "theoretical_loss": 3.645891833954857, + "tokens_seen": 1009562624 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035057171514543635, + "loss": 2.9708, + "theoretical_loss": 3.645869350966031, + "tokens_seen": 1009628160 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035056168505516553, + "loss": 2.6602, + "theoretical_loss": 3.645846869845152, + "tokens_seen": 1009693696 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003505516549648947, + "loss": 2.8586, + "theoretical_loss": 3.645824390591943, + "tokens_seen": 1009759232 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003505416248746239, + "loss": 2.6716, + "theoretical_loss": 3.6458019132061272, + "tokens_seen": 1009824768 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035053159478435307, + "loss": 3.0043, + "theoretical_loss": 3.6457794376874295, + "tokens_seen": 1009890304 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035052156469408225, + "loss": 2.7406, + "theoretical_loss": 3.645756964035573, + "tokens_seen": 1009955840 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003505115346038115, + "loss": 2.8073, + "theoretical_loss": 3.6457344922502815, + "tokens_seen": 1010021376 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003505015045135406, + "loss": 2.9574, + "theoretical_loss": 3.6457120223312787, + "tokens_seen": 1010086912 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035049147442326985, + "loss": 2.8053, + "theoretical_loss": 3.645689554278289, + "tokens_seen": 1010152448 + }, + { + "epoch": 3.04, + "learning_rate": 0.000350481444332999, + "loss": 2.7435, + "theoretical_loss": 3.6456670880910362, + "tokens_seen": 1010217984 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003504714142427282, + "loss": 2.9574, + "theoretical_loss": 3.645644623769244, + "tokens_seen": 1010283520 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003504613841524574, + "loss": 2.8229, + "theoretical_loss": 3.645622161312637, + "tokens_seen": 1010349056 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003504513540621866, + "loss": 2.9436, + "theoretical_loss": 3.645599700720939, + "tokens_seen": 1010414592 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035044132397191576, + "loss": 2.8731, + "theoretical_loss": 3.6455772419938746, + "tokens_seen": 1010480128 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035043129388164494, + "loss": 2.8602, + "theoretical_loss": 3.645554785131168, + "tokens_seen": 1010545664 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003504212637913741, + "loss": 2.721, + "theoretical_loss": 3.645532330132543, + "tokens_seen": 1010611200 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035041123370110335, + "loss": 2.6924, + "theoretical_loss": 3.6455098769977248, + "tokens_seen": 1010676736 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003504012036108325, + "loss": 2.9447, + "theoretical_loss": 3.645487425726437, + "tokens_seen": 1010742272 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003503911735205617, + "loss": 2.7244, + "theoretical_loss": 3.6454649763184044, + "tokens_seen": 1010807808 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003503811434302909, + "loss": 2.8867, + "theoretical_loss": 3.6454425287733527, + "tokens_seen": 1010873344 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1166720, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7130308151245117, + "objective/train/theoretical_loss": 3.6454369171781478, + "objective/train/tokens_used": 1031349728, + "theoretical_loss": 3.6454369171781478, + "tokens_seen": 1010889728 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003503711133400201, + "loss": 2.7468, + "theoretical_loss": 3.645420083091005, + "tokens_seen": 1010938880 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035036108324974926, + "loss": 2.7862, + "theoretical_loss": 3.6453976392710867, + "tokens_seen": 1011004416 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035035105315947844, + "loss": 2.9374, + "theoretical_loss": 3.6453751973133226, + "tokens_seen": 1011069952 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003503410230692076, + "loss": 2.6861, + "theoretical_loss": 3.645352757217437, + "tokens_seen": 1011135488 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035033099297893686, + "loss": 2.7727, + "theoretical_loss": 3.6453303189831554, + "tokens_seen": 1011201024 + }, + { + "epoch": 3.04, + "learning_rate": 0.000350320962888666, + "loss": 2.9209, + "theoretical_loss": 3.6453078826102026, + "tokens_seen": 1011266560 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003503109327983952, + "loss": 2.7929, + "theoretical_loss": 3.6452854480983032, + "tokens_seen": 1011332096 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035030090270812435, + "loss": 2.8713, + "theoretical_loss": 3.6452630154471826, + "tokens_seen": 1011397632 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003502908726178536, + "loss": 2.8707, + "theoretical_loss": 3.645240584656566, + "tokens_seen": 1011463168 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035028084252758276, + "loss": 2.7903, + "theoretical_loss": 3.645218155726178, + "tokens_seen": 1011528704 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035027081243731194, + "loss": 2.8254, + "theoretical_loss": 3.6451957286557444, + "tokens_seen": 1011594240 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003502607823470411, + "loss": 2.9904, + "theoretical_loss": 3.64517330344499, + "tokens_seen": 1011659776 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035025075225677036, + "loss": 2.8842, + "theoretical_loss": 3.6451508800936407, + "tokens_seen": 1011725312 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003502407221664995, + "loss": 2.885, + "theoretical_loss": 3.6451284586014214, + "tokens_seen": 1011790848 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003502306920762287, + "loss": 2.5298, + "theoretical_loss": 3.6451060389680574, + "tokens_seen": 1011856384 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035022066198595785, + "loss": 2.8897, + "theoretical_loss": 3.645083621193275, + "tokens_seen": 1011921920 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003502106318956871, + "loss": 2.5561, + "theoretical_loss": 3.6450612052767997, + "tokens_seen": 1011987456 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035020060180541627, + "loss": 2.7428, + "theoretical_loss": 3.645038791218356, + "tokens_seen": 1012052992 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035019057171514545, + "loss": 3.0237, + "theoretical_loss": 3.6450163790176706, + "tokens_seen": 1012118528 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035018054162487463, + "loss": 2.6427, + "theoretical_loss": 3.644993968674469, + "tokens_seen": 1012184064 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003501705115346038, + "loss": 3.029, + "theoretical_loss": 3.644971560188477, + "tokens_seen": 1012249600 + }, + { + "epoch": 3.04, + "learning_rate": 0.000350160481444333, + "loss": 2.8482, + "theoretical_loss": 3.6449491535594207, + "tokens_seen": 1012315136 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003501504513540622, + "loss": 2.7981, + "theoretical_loss": 3.644926748787025, + "tokens_seen": 1012380672 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035014042126379135, + "loss": 2.9763, + "theoretical_loss": 3.6449043458710175, + "tokens_seen": 1012446208 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003501303911735206, + "loss": 2.8806, + "theoretical_loss": 3.6448819448111234, + "tokens_seen": 1012511744 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1167486, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.346987724304199, + "objective/train/theoretical_loss": 3.644876344836135, + "objective/train/tokens_used": 1032988128, + "theoretical_loss": 3.644876344836135, + "tokens_seen": 1012528128 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003501203610832497, + "loss": 2.9117, + "theoretical_loss": 3.6448595456070683, + "tokens_seen": 1012577280 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035011033099297895, + "loss": 2.748, + "theoretical_loss": 3.644837148258579, + "tokens_seen": 1012642816 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035010030090270813, + "loss": 2.9558, + "theoretical_loss": 3.644814752765382, + "tokens_seen": 1012708352 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003500902708124373, + "loss": 2.8949, + "theoretical_loss": 3.644792359127202, + "tokens_seen": 1012773888 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003500802407221665, + "loss": 2.8262, + "theoretical_loss": 3.644769967343767, + "tokens_seen": 1012839424 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035007021063189573, + "loss": 2.6742, + "theoretical_loss": 3.644747577414803, + "tokens_seen": 1012904960 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035006018054162486, + "loss": 2.7622, + "theoretical_loss": 3.6447251893400363, + "tokens_seen": 1012970496 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003500501504513541, + "loss": 2.7589, + "theoretical_loss": 3.644702803119193, + "tokens_seen": 1013036032 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003500401203610832, + "loss": 2.6765, + "theoretical_loss": 3.6446804187520003, + "tokens_seen": 1013101568 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035003009027081245, + "loss": 2.7089, + "theoretical_loss": 3.644658036238184, + "tokens_seen": 1013167104 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035002006018054163, + "loss": 2.7562, + "theoretical_loss": 3.644635655577472, + "tokens_seen": 1013232640 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003500100300902708, + "loss": 2.9357, + "theoretical_loss": 3.64461327676959, + "tokens_seen": 1013298176 + }, + { + "epoch": 3.04, + "learning_rate": 0.00035, + "loss": 2.6649, + "theoretical_loss": 3.6445908998142653, + "tokens_seen": 1013363712 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003499899699097292, + "loss": 2.4005, + "theoretical_loss": 3.6445685247112247, + "tokens_seen": 1013429248 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034997993981945836, + "loss": 3.0307, + "theoretical_loss": 3.6445461514601947, + "tokens_seen": 1013494784 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003499699097291876, + "loss": 2.6786, + "theoretical_loss": 3.6445237800609025, + "tokens_seen": 1013560320 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003499598796389167, + "loss": 2.9899, + "theoretical_loss": 3.6445014105130755, + "tokens_seen": 1013625856 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034994984954864596, + "loss": 2.7386, + "theoretical_loss": 3.64447904281644, + "tokens_seen": 1013691392 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003499398194583751, + "loss": 2.9648, + "theoretical_loss": 3.6444566769707243, + "tokens_seen": 1013756928 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003499297893681043, + "loss": 2.7837, + "theoretical_loss": 3.6444343129756547, + "tokens_seen": 1013822464 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003499197592778335, + "loss": 2.7852, + "theoretical_loss": 3.6444119508309583, + "tokens_seen": 1013888000 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003499097291875627, + "loss": 2.8927, + "theoretical_loss": 3.644389590536363, + "tokens_seen": 1013953536 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034989969909729186, + "loss": 2.9017, + "theoretical_loss": 3.6443672320915956, + "tokens_seen": 1014019072 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003498896690070211, + "loss": 2.9671, + "theoretical_loss": 3.644344875496384, + "tokens_seen": 1014084608 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003498796389167502, + "loss": 2.7448, + "theoretical_loss": 3.644322520750456, + "tokens_seen": 1014150144 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1168150, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5092766284942627, + "objective/train/theoretical_loss": 3.6443169323528926, + "objective/train/tokens_used": 1034626528, + "theoretical_loss": 3.6443169323528926, + "tokens_seen": 1014166528 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034986960882647946, + "loss": 2.6011, + "theoretical_loss": 3.644300167853538, + "tokens_seen": 1014215680 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003498595787362086, + "loss": 2.7115, + "theoretical_loss": 3.6442778168053587, + "tokens_seen": 1014281216 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003498495486459378, + "loss": 2.6693, + "theoretical_loss": 3.644255467605645, + "tokens_seen": 1014346752 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034983951855566706, + "loss": 2.6503, + "theoretical_loss": 3.6442331202541256, + "tokens_seen": 1014412288 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003498294884653962, + "loss": 2.8226, + "theoretical_loss": 3.644210774750527, + "tokens_seen": 1014477824 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003498194583751254, + "loss": 2.8758, + "theoretical_loss": 3.6441884310945785, + "tokens_seen": 1014543360 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034980942828485455, + "loss": 2.9343, + "theoretical_loss": 3.6441660892860064, + "tokens_seen": 1014608896 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003497993981945838, + "loss": 2.8126, + "theoretical_loss": 3.6441437493245394, + "tokens_seen": 1014674432 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034978936810431296, + "loss": 2.891, + "theoretical_loss": 3.6441214112099054, + "tokens_seen": 1014739968 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034977933801404214, + "loss": 2.9544, + "theoretical_loss": 3.644099074941833, + "tokens_seen": 1014805504 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003497693079237713, + "loss": 2.8482, + "theoretical_loss": 3.64407674052005, + "tokens_seen": 1014871040 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034975927783350056, + "loss": 2.7608, + "theoretical_loss": 3.6440544079442843, + "tokens_seen": 1014936576 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003497492477432297, + "loss": 2.5706, + "theoretical_loss": 3.644032077214264, + "tokens_seen": 1015002112 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003497392176529589, + "loss": 2.6036, + "theoretical_loss": 3.644009748329718, + "tokens_seen": 1015067648 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034972918756268805, + "loss": 2.8312, + "theoretical_loss": 3.6439874212903742, + "tokens_seen": 1015133184 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003497191574724173, + "loss": 2.8235, + "theoretical_loss": 3.643965096095961, + "tokens_seen": 1015198720 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034970912738214647, + "loss": 2.8537, + "theoretical_loss": 3.6439427727462075, + "tokens_seen": 1015264256 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034969909729187565, + "loss": 2.8265, + "theoretical_loss": 3.643920451240841, + "tokens_seen": 1015329792 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034968906720160483, + "loss": 2.8553, + "theoretical_loss": 3.6438981315795917, + "tokens_seen": 1015395328 + }, + { + "epoch": 3.04, + "learning_rate": 0.000349679037111334, + "loss": 2.6262, + "theoretical_loss": 3.643875813762187, + "tokens_seen": 1015460864 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003496690070210632, + "loss": 2.93, + "theoretical_loss": 3.6438534977883554, + "tokens_seen": 1015526400 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003496589769307924, + "loss": 2.9229, + "theoretical_loss": 3.643831183657827, + "tokens_seen": 1015591936 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034964894684052155, + "loss": 3.0048, + "theoretical_loss": 3.6438088713703296, + "tokens_seen": 1015657472 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003496389167502508, + "loss": 2.8082, + "theoretical_loss": 3.6437865609255917, + "tokens_seen": 1015723008 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003496288866599799, + "loss": 2.6069, + "theoretical_loss": 3.6437642523233436, + "tokens_seen": 1015788544 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1169618, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.381481885910034, + "objective/train/theoretical_loss": 3.6437586754606386, + "objective/train/tokens_used": 1036264928, + "theoretical_loss": 3.6437586754606386, + "tokens_seen": 1015804928 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034961885656970915, + "loss": 2.815, + "theoretical_loss": 3.6437419455633133, + "tokens_seen": 1015854080 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034960882647943833, + "loss": 2.7737, + "theoretical_loss": 3.64371964064523, + "tokens_seen": 1015919616 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003495987963891675, + "loss": 2.606, + "theoretical_loss": 3.6436973375688226, + "tokens_seen": 1015985152 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003495887662988967, + "loss": 2.8198, + "theoretical_loss": 3.6436750363338204, + "tokens_seen": 1016050688 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034957873620862593, + "loss": 2.8019, + "theoretical_loss": 3.643652736939953, + "tokens_seen": 1016116224 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034956870611835506, + "loss": 2.8078, + "theoretical_loss": 3.643630439386949, + "tokens_seen": 1016181760 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003495586760280843, + "loss": 2.7872, + "theoretical_loss": 3.6436081436745384, + "tokens_seen": 1016247296 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003495486459378134, + "loss": 2.7448, + "theoretical_loss": 3.6435858498024505, + "tokens_seen": 1016312832 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034953861584754265, + "loss": 2.9181, + "theoretical_loss": 3.643563557770414, + "tokens_seen": 1016378368 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034952858575727183, + "loss": 2.8828, + "theoretical_loss": 3.643541267578159, + "tokens_seen": 1016443904 + }, + { + "epoch": 3.04, + "learning_rate": 0.000349518555667001, + "loss": 2.9088, + "theoretical_loss": 3.643518979225415, + "tokens_seen": 1016509440 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003495085255767302, + "loss": 2.7972, + "theoretical_loss": 3.6434966927119117, + "tokens_seen": 1016574976 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003494984954864594, + "loss": 2.7734, + "theoretical_loss": 3.6434744080373784, + "tokens_seen": 1016640512 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034948846539618856, + "loss": 2.6963, + "theoretical_loss": 3.643452125201545, + "tokens_seen": 1016706048 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003494784353059178, + "loss": 2.9125, + "theoretical_loss": 3.6434298442041415, + "tokens_seen": 1016771584 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003494684052156469, + "loss": 2.8687, + "theoretical_loss": 3.643407565044898, + "tokens_seen": 1016837120 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034945837512537616, + "loss": 2.6719, + "theoretical_loss": 3.6433852877235435, + "tokens_seen": 1016902656 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003494483450351053, + "loss": 2.8081, + "theoretical_loss": 3.6433630122398086, + "tokens_seen": 1016968192 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003494383149448345, + "loss": 2.8221, + "theoretical_loss": 3.643340738593423, + "tokens_seen": 1017033728 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003494282848545637, + "loss": 2.6224, + "theoretical_loss": 3.643318466784117, + "tokens_seen": 1017099264 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003494182547642929, + "loss": 2.7745, + "theoretical_loss": 3.6432961968116206, + "tokens_seen": 1017164800 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034940822467402206, + "loss": 2.7596, + "theoretical_loss": 3.643273928675664, + "tokens_seen": 1017230336 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003493981945837513, + "loss": 2.5713, + "theoretical_loss": 3.643251662375978, + "tokens_seen": 1017295872 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003493881644934804, + "loss": 2.7915, + "theoretical_loss": 3.643229397912292, + "tokens_seen": 1017361408 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034937813440320966, + "loss": 2.6503, + "theoretical_loss": 3.6432071352843365, + "tokens_seen": 1017426944 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1170378, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.776099681854248, + "objective/train/theoretical_loss": 3.643201569914149, + "objective/train/tokens_used": 1037903328, + "theoretical_loss": 3.643201569914149, + "tokens_seen": 1017443328 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003493681043129388, + "loss": 3.0182, + "theoretical_loss": 3.6431848744918423, + "tokens_seen": 1017492480 + }, + { + "epoch": 3.04, + "learning_rate": 0.000349358074222668, + "loss": 2.9773, + "theoretical_loss": 3.6431626155345396, + "tokens_seen": 1017558016 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003493480441323972, + "loss": 2.822, + "theoretical_loss": 3.6431403584121593, + "tokens_seen": 1017623552 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003493380140421264, + "loss": 2.7286, + "theoretical_loss": 3.643118103124431, + "tokens_seen": 1017689088 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034932798395185557, + "loss": 2.7543, + "theoretical_loss": 3.643095849671087, + "tokens_seen": 1017754624 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034931795386158475, + "loss": 2.7698, + "theoretical_loss": 3.6430735980518563, + "tokens_seen": 1017820160 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034930792377131393, + "loss": 2.4205, + "theoretical_loss": 3.6430513482664706, + "tokens_seen": 1017885696 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034929789368104316, + "loss": 2.6192, + "theoretical_loss": 3.643029100314661, + "tokens_seen": 1017951232 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003492878635907723, + "loss": 2.8012, + "theoretical_loss": 3.643006854196157, + "tokens_seen": 1018016768 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003492778335005015, + "loss": 2.6011, + "theoretical_loss": 3.642984609910691, + "tokens_seen": 1018082304 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034926780341023065, + "loss": 2.6748, + "theoretical_loss": 3.6429623674579927, + "tokens_seen": 1018147840 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003492577733199599, + "loss": 2.789, + "theoretical_loss": 3.642940126837795, + "tokens_seen": 1018213376 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034924774322968907, + "loss": 2.691, + "theoretical_loss": 3.6429178880498263, + "tokens_seen": 1018278912 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034923771313941825, + "loss": 2.813, + "theoretical_loss": 3.64289565109382, + "tokens_seen": 1018344448 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034922768304914743, + "loss": 2.7174, + "theoretical_loss": 3.6428734159695066, + "tokens_seen": 1018409984 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034921765295887667, + "loss": 2.7577, + "theoretical_loss": 3.6428511826766172, + "tokens_seen": 1018475520 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003492076228686058, + "loss": 2.6778, + "theoretical_loss": 3.6428289512148835, + "tokens_seen": 1018541056 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034919759277833503, + "loss": 2.6804, + "theoretical_loss": 3.6428067215840363, + "tokens_seen": 1018606592 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034918756268806416, + "loss": 2.679, + "theoretical_loss": 3.6427844937838074, + "tokens_seen": 1018672128 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003491775325977934, + "loss": 2.8358, + "theoretical_loss": 3.6427622678139286, + "tokens_seen": 1018737664 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034916750250752257, + "loss": 2.7548, + "theoretical_loss": 3.642740043674131, + "tokens_seen": 1018803200 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034915747241725175, + "loss": 2.7795, + "theoretical_loss": 3.642717821364146, + "tokens_seen": 1018868736 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034914744232698093, + "loss": 2.7847, + "theoretical_loss": 3.6426956008837053, + "tokens_seen": 1018934272 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003491374122367101, + "loss": 2.8055, + "theoretical_loss": 3.642673382232541, + "tokens_seen": 1018999808 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003491273821464393, + "loss": 2.8508, + "theoretical_loss": 3.642651165410385, + "tokens_seen": 1019065344 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1171971, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9963715076446533, + "objective/train/theoretical_loss": 3.6426456114905967, + "objective/train/tokens_used": 1039541728, + "theoretical_loss": 3.6426456114905967, + "tokens_seen": 1019081728 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034911735205616853, + "loss": 2.9141, + "theoretical_loss": 3.642628950416969, + "tokens_seen": 1019130880 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034910732196589766, + "loss": 2.7184, + "theoretical_loss": 3.6426067372520246, + "tokens_seen": 1019196416 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003490972918756269, + "loss": 2.6736, + "theoretical_loss": 3.6425845259152836, + "tokens_seen": 1019261952 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034908726178535613, + "loss": 2.6398, + "theoretical_loss": 3.642562316406478, + "tokens_seen": 1019327488 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034907723169508526, + "loss": 2.7368, + "theoretical_loss": 3.642540108725341, + "tokens_seen": 1019393024 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003490672016048145, + "loss": 2.6245, + "theoretical_loss": 3.6425179028716035, + "tokens_seen": 1019458560 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003490571715145436, + "loss": 2.7568, + "theoretical_loss": 3.642495698844998, + "tokens_seen": 1019524096 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034904714142427285, + "loss": 2.9744, + "theoretical_loss": 3.6424734966452568, + "tokens_seen": 1019589632 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034903711133400203, + "loss": 2.8797, + "theoretical_loss": 3.642451296272112, + "tokens_seen": 1019655168 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003490270812437312, + "loss": 2.6659, + "theoretical_loss": 3.642429097725296, + "tokens_seen": 1019720704 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003490170511534604, + "loss": 2.8605, + "theoretical_loss": 3.642406901004542, + "tokens_seen": 1019786240 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003490070210631896, + "loss": 2.6881, + "theoretical_loss": 3.6423847061095813, + "tokens_seen": 1019851776 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034899699097291876, + "loss": 2.6267, + "theoretical_loss": 3.6423625130401467, + "tokens_seen": 1019917312 + }, + { + "epoch": 3.04, + "learning_rate": 0.000348986960882648, + "loss": 2.8009, + "theoretical_loss": 3.6423403217959707, + "tokens_seen": 1019982848 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003489769307923771, + "loss": 2.8462, + "theoretical_loss": 3.6423181323767864, + "tokens_seen": 1020048384 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034896690070210636, + "loss": 2.9541, + "theoretical_loss": 3.642295944782326, + "tokens_seen": 1020113920 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003489568706118355, + "loss": 2.6883, + "theoretical_loss": 3.6422737590123226, + "tokens_seen": 1020179456 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003489468405215647, + "loss": 2.6475, + "theoretical_loss": 3.6422515750665085, + "tokens_seen": 1020244992 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003489368104312939, + "loss": 2.7722, + "theoretical_loss": 3.6422293929446172, + "tokens_seen": 1020310528 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003489267803410231, + "loss": 2.7078, + "theoretical_loss": 3.6422072126463814, + "tokens_seen": 1020376064 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034891675025075226, + "loss": 2.7822, + "theoretical_loss": 3.6421850341715336, + "tokens_seen": 1020441600 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003489067201604815, + "loss": 2.6382, + "theoretical_loss": 3.642162857519807, + "tokens_seen": 1020507136 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003488966900702106, + "loss": 2.6841, + "theoretical_loss": 3.6421406826909353, + "tokens_seen": 1020572672 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034888665997993986, + "loss": 2.8371, + "theoretical_loss": 3.642118509684651, + "tokens_seen": 1020638208 + }, + { + "epoch": 3.04, + "learning_rate": 0.000348876629889669, + "loss": 2.7937, + "theoretical_loss": 3.642096338500687, + "tokens_seen": 1020703744 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1172849, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6258201599121094, + "objective/train/theoretical_loss": 3.6420907959894024, + "objective/train/tokens_used": 1041180128, + "theoretical_loss": 3.6420907959894024, + "tokens_seen": 1020720128 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003488665997993982, + "loss": 2.9837, + "theoretical_loss": 3.6420741691387772, + "tokens_seen": 1020769280 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003488565697091274, + "loss": 2.67, + "theoretical_loss": 3.642052001598655, + "tokens_seen": 1020834816 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003488465396188566, + "loss": 2.7801, + "theoretical_loss": 3.6420298358800527, + "tokens_seen": 1020900352 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034883650952858577, + "loss": 2.6078, + "theoretical_loss": 3.6420076719827046, + "tokens_seen": 1020965888 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034882647943831495, + "loss": 2.8249, + "theoretical_loss": 3.6419855099063443, + "tokens_seen": 1021031424 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034881644934804413, + "loss": 2.9316, + "theoretical_loss": 3.641963349650705, + "tokens_seen": 1021096960 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034880641925777336, + "loss": 2.6782, + "theoretical_loss": 3.64194119121552, + "tokens_seen": 1021162496 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003487963891675025, + "loss": 2.8126, + "theoretical_loss": 3.641919034600523, + "tokens_seen": 1021228032 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003487863590772317, + "loss": 2.7651, + "theoretical_loss": 3.6418968798054485, + "tokens_seen": 1021293568 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034877632898696085, + "loss": 2.8099, + "theoretical_loss": 3.6418747268300296, + "tokens_seen": 1021359104 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003487662988966901, + "loss": 2.6514, + "theoretical_loss": 3.6418525756739992, + "tokens_seen": 1021424640 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034875626880641927, + "loss": 2.7745, + "theoretical_loss": 3.6418304263370933, + "tokens_seen": 1021490176 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034874623871614845, + "loss": 2.7871, + "theoretical_loss": 3.6418082788190436, + "tokens_seen": 1021555712 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034873620862587763, + "loss": 2.4823, + "theoretical_loss": 3.6417861331195853, + "tokens_seen": 1021621248 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034872617853560687, + "loss": 2.7061, + "theoretical_loss": 3.6417639892384526, + "tokens_seen": 1021686784 + }, + { + "epoch": 3.04, + "learning_rate": 0.000348716148445336, + "loss": 2.7506, + "theoretical_loss": 3.6417418471753793, + "tokens_seen": 1021752320 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034870611835506523, + "loss": 2.7926, + "theoretical_loss": 3.6417197069300986, + "tokens_seen": 1021817856 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034869608826479436, + "loss": 2.745, + "theoretical_loss": 3.6416975685023463, + "tokens_seen": 1021883392 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003486860581745236, + "loss": 2.4741, + "theoretical_loss": 3.6416754318918554, + "tokens_seen": 1021948928 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034867602808425277, + "loss": 2.8679, + "theoretical_loss": 3.641653297098361, + "tokens_seen": 1022014464 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034866599799398195, + "loss": 2.9177, + "theoretical_loss": 3.6416311641215966, + "tokens_seen": 1022080000 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034865596790371113, + "loss": 2.7663, + "theoretical_loss": 3.6416090329612976, + "tokens_seen": 1022145536 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003486459378134403, + "loss": 2.794, + "theoretical_loss": 3.6415869036171973, + "tokens_seen": 1022211072 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003486359077231695, + "loss": 2.6762, + "theoretical_loss": 3.641564776089032, + "tokens_seen": 1022276608 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034862587763289873, + "loss": 2.6958, + "theoretical_loss": 3.6415426503765342, + "tokens_seen": 1022342144 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1174219, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4945321083068848, + "objective/train/theoretical_loss": 3.641537119232077, + "objective/train/tokens_used": 1042818528, + "theoretical_loss": 3.641537119232077, + "tokens_seen": 1022358528 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034861584754262786, + "loss": 2.6434, + "theoretical_loss": 3.6415205264794404, + "tokens_seen": 1022407680 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003486058174523571, + "loss": 2.8606, + "theoretical_loss": 3.641498404397484, + "tokens_seen": 1022473216 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003485957873620862, + "loss": 2.6391, + "theoretical_loss": 3.6414762841304, + "tokens_seen": 1022538752 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034858575727181546, + "loss": 2.6538, + "theoretical_loss": 3.6414541656779233, + "tokens_seen": 1022604288 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034857572718154464, + "loss": 2.664, + "theoretical_loss": 3.6414320490397896, + "tokens_seen": 1022669824 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003485656970912738, + "loss": 2.7767, + "theoretical_loss": 3.6414099342157327, + "tokens_seen": 1022735360 + }, + { + "epoch": 3.04, + "learning_rate": 0.000348555667001003, + "loss": 2.6541, + "theoretical_loss": 3.6413878212054875, + "tokens_seen": 1022800896 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034854563691073223, + "loss": 2.902, + "theoretical_loss": 3.64136571000879, + "tokens_seen": 1022866432 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034853560682046136, + "loss": 2.764, + "theoretical_loss": 3.641343600625375, + "tokens_seen": 1022931968 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003485255767301906, + "loss": 2.7848, + "theoretical_loss": 3.641321493054977, + "tokens_seen": 1022997504 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003485155466399197, + "loss": 2.6366, + "theoretical_loss": 3.6412993872973316, + "tokens_seen": 1023063040 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034850551654964896, + "loss": 2.6445, + "theoretical_loss": 3.6412772833521743, + "tokens_seen": 1023128576 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034849548645937814, + "loss": 2.8639, + "theoretical_loss": 3.6412551812192397, + "tokens_seen": 1023194112 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003484854563691073, + "loss": 2.8851, + "theoretical_loss": 3.641233080898264, + "tokens_seen": 1023259648 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003484754262788365, + "loss": 2.7964, + "theoretical_loss": 3.6412109823889818, + "tokens_seen": 1023325184 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003484653961885657, + "loss": 2.8088, + "theoretical_loss": 3.64118888569113, + "tokens_seen": 1023390720 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034845536609829486, + "loss": 2.6254, + "theoretical_loss": 3.641166790804442, + "tokens_seen": 1023456256 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003484453360080241, + "loss": 2.6364, + "theoretical_loss": 3.641144697728655, + "tokens_seen": 1023521792 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003484353059177532, + "loss": 2.822, + "theoretical_loss": 3.6411226064635045, + "tokens_seen": 1023587328 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034842527582748246, + "loss": 2.6328, + "theoretical_loss": 3.6411005170087254, + "tokens_seen": 1023652864 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034841524573721164, + "loss": 2.8881, + "theoretical_loss": 3.641078429364054, + "tokens_seen": 1023718400 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003484052156469408, + "loss": 2.8342, + "theoretical_loss": 3.6410563435292262, + "tokens_seen": 1023783936 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034839518555667, + "loss": 2.63, + "theoretical_loss": 3.6410342595039777, + "tokens_seen": 1023849472 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003483851554663992, + "loss": 2.7483, + "theoretical_loss": 3.6410121772880446, + "tokens_seen": 1023915008 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034837512537612837, + "loss": 2.6859, + "theoretical_loss": 3.640990096881162, + "tokens_seen": 1023980544 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1174839, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9092800617218018, + "objective/train/theoretical_loss": 3.640984577062075, + "objective/train/tokens_used": 1044456928, + "theoretical_loss": 3.640984577062075, + "tokens_seen": 1023996928 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003483650952858576, + "loss": 2.8045, + "theoretical_loss": 3.6409680182830675, + "tokens_seen": 1024046080 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034835506519558673, + "loss": 2.8116, + "theoretical_loss": 3.6409459414934955, + "tokens_seen": 1024111616 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034834503510531597, + "loss": 2.7341, + "theoretical_loss": 3.6409238665121832, + "tokens_seen": 1024177152 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034833500501504515, + "loss": 2.4435, + "theoretical_loss": 3.640901793338867, + "tokens_seen": 1024242688 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034832497492477433, + "loss": 2.6926, + "theoretical_loss": 3.640879721973282, + "tokens_seen": 1024308224 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034831494483450356, + "loss": 2.8559, + "theoretical_loss": 3.6408576524151655, + "tokens_seen": 1024373760 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003483049147442327, + "loss": 2.7453, + "theoretical_loss": 3.6408355846642535, + "tokens_seen": 1024439296 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003482948846539619, + "loss": 2.7137, + "theoretical_loss": 3.6408135187202824, + "tokens_seen": 1024504832 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034828485456369105, + "loss": 2.573, + "theoretical_loss": 3.640791454582989, + "tokens_seen": 1024570368 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003482748244734203, + "loss": 2.6886, + "theoretical_loss": 3.640769392252109, + "tokens_seen": 1024635904 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034826479438314947, + "loss": 2.9169, + "theoretical_loss": 3.64074733172738, + "tokens_seen": 1024701440 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034825476429287865, + "loss": 2.6807, + "theoretical_loss": 3.640725273008538, + "tokens_seen": 1024766976 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034824473420260783, + "loss": 2.8173, + "theoretical_loss": 3.64070321609532, + "tokens_seen": 1024832512 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034823470411233707, + "loss": 2.8759, + "theoretical_loss": 3.6406811609874628, + "tokens_seen": 1024898048 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003482246740220662, + "loss": 2.6937, + "theoretical_loss": 3.640659107684703, + "tokens_seen": 1024963584 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034821464393179543, + "loss": 2.8059, + "theoretical_loss": 3.640637056186777, + "tokens_seen": 1025029120 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034820461384152456, + "loss": 2.7328, + "theoretical_loss": 3.6406150064934226, + "tokens_seen": 1025094656 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003481945837512538, + "loss": 2.8426, + "theoretical_loss": 3.6405929586043766, + "tokens_seen": 1025160192 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034818455366098297, + "loss": 2.5166, + "theoretical_loss": 3.640570912519375, + "tokens_seen": 1025225728 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034817452357071215, + "loss": 2.7644, + "theoretical_loss": 3.640548868238157, + "tokens_seen": 1025291264 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034816449348044133, + "loss": 2.7304, + "theoretical_loss": 3.640526825760457, + "tokens_seen": 1025356800 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003481544633901705, + "loss": 2.6769, + "theoretical_loss": 3.6405047850860144, + "tokens_seen": 1025422336 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003481444332998997, + "loss": 2.8822, + "theoretical_loss": 3.6404827462145652, + "tokens_seen": 1025487872 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034813440320962893, + "loss": 2.8437, + "theoretical_loss": 3.6404607091458474, + "tokens_seen": 1025553408 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034812437311935806, + "loss": 2.6955, + "theoretical_loss": 3.640438673879598, + "tokens_seen": 1025618944 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1176332, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5118682384490967, + "objective/train/theoretical_loss": 3.640433165344641, + "objective/train/tokens_used": 1046095328, + "theoretical_loss": 3.640433165344641, + "tokens_seen": 1025635328 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003481143430290873, + "loss": 2.6492, + "theoretical_loss": 3.6404166404155545, + "tokens_seen": 1025684480 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003481043129388164, + "loss": 2.468, + "theoretical_loss": 3.6403946087534544, + "tokens_seen": 1025750016 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034809428284854566, + "loss": 2.7733, + "theoretical_loss": 3.6403725788930354, + "tokens_seen": 1025815552 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034808425275827484, + "loss": 2.8966, + "theoretical_loss": 3.6403505508340346, + "tokens_seen": 1025881088 + }, + { + "epoch": 3.04, + "learning_rate": 0.000348074222668004, + "loss": 2.8089, + "theoretical_loss": 3.6403285245761903, + "tokens_seen": 1025946624 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003480641925777332, + "loss": 2.7961, + "theoretical_loss": 3.640306500119239, + "tokens_seen": 1026012160 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034805416248746243, + "loss": 2.691, + "theoretical_loss": 3.64028447746292, + "tokens_seen": 1026077696 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034804413239719156, + "loss": 2.7155, + "theoretical_loss": 3.64026245660697, + "tokens_seen": 1026143232 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003480341023069208, + "loss": 2.7533, + "theoretical_loss": 3.6402404375511273, + "tokens_seen": 1026208768 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003480240722166499, + "loss": 2.8656, + "theoretical_loss": 3.6402184202951293, + "tokens_seen": 1026274304 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034801404212637916, + "loss": 2.9105, + "theoretical_loss": 3.640196404838715, + "tokens_seen": 1026339840 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034800401203610834, + "loss": 2.545, + "theoretical_loss": 3.6401743911816213, + "tokens_seen": 1026405376 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003479939819458375, + "loss": 2.7461, + "theoretical_loss": 3.6401523793235864, + "tokens_seen": 1026470912 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003479839518555667, + "loss": 2.6097, + "theoretical_loss": 3.640130369264349, + "tokens_seen": 1026536448 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003479739217652959, + "loss": 2.6789, + "theoretical_loss": 3.640108361003647, + "tokens_seen": 1026601984 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034796389167502506, + "loss": 2.8538, + "theoretical_loss": 3.6400863545412188, + "tokens_seen": 1026667520 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003479538615847543, + "loss": 2.9017, + "theoretical_loss": 3.640064349876803, + "tokens_seen": 1026733056 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034794383149448343, + "loss": 2.6433, + "theoretical_loss": 3.6400423470101364, + "tokens_seen": 1026798592 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034793380140421266, + "loss": 2.8106, + "theoretical_loss": 3.6400203459409592, + "tokens_seen": 1026864128 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034792377131394184, + "loss": 2.5955, + "theoretical_loss": 3.6399983466690085, + "tokens_seen": 1026929664 + }, + { + "epoch": 3.04, + "learning_rate": 0.000347913741223671, + "loss": 2.7773, + "theoretical_loss": 3.639976349194024, + "tokens_seen": 1026995200 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003479037111334002, + "loss": 2.7691, + "theoretical_loss": 3.6399543535157433, + "tokens_seen": 1027060736 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003478936810431294, + "loss": 2.9683, + "theoretical_loss": 3.639932359633906, + "tokens_seen": 1027126272 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034788365095285857, + "loss": 2.7609, + "theoretical_loss": 3.63991036754825, + "tokens_seen": 1027191808 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003478736208625878, + "loss": 2.8229, + "theoretical_loss": 3.639888377258514, + "tokens_seen": 1027257344 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1176787, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9666268825531006, + "objective/train/theoretical_loss": 3.639882879966662, + "objective/train/tokens_used": 1047733728, + "theoretical_loss": 3.639882879966662, + "tokens_seen": 1027273728 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034786359077231693, + "loss": 2.7606, + "theoretical_loss": 3.639866388764437, + "tokens_seen": 1027322880 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034785356068204617, + "loss": 2.8858, + "theoretical_loss": 3.6398444020657577, + "tokens_seen": 1027388416 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003478435305917753, + "loss": 2.8103, + "theoretical_loss": 3.6398224171622156, + "tokens_seen": 1027453952 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034783350050150453, + "loss": 2.8517, + "theoretical_loss": 3.639800434053549, + "tokens_seen": 1027519488 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003478234704112337, + "loss": 2.7297, + "theoretical_loss": 3.6397784527394967, + "tokens_seen": 1027585024 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003478134403209629, + "loss": 3.0084, + "theoretical_loss": 3.6397564732197987, + "tokens_seen": 1027650560 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034780341023069207, + "loss": 2.8202, + "theoretical_loss": 3.639734495494193, + "tokens_seen": 1027716096 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034779338014042125, + "loss": 2.7186, + "theoretical_loss": 3.6397125195624196, + "tokens_seen": 1027781632 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034778335005015043, + "loss": 2.8238, + "theoretical_loss": 3.6396905454242177, + "tokens_seen": 1027847168 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034777331995987967, + "loss": 2.8337, + "theoretical_loss": 3.6396685730793257, + "tokens_seen": 1027912704 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003477632898696088, + "loss": 2.7286, + "theoretical_loss": 3.6396466025274834, + "tokens_seen": 1027978240 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034775325977933803, + "loss": 2.846, + "theoretical_loss": 3.6396246337684306, + "tokens_seen": 1028043776 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003477432296890672, + "loss": 2.7812, + "theoretical_loss": 3.639602666801906, + "tokens_seen": 1028109312 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003477331995987964, + "loss": 2.6417, + "theoretical_loss": 3.6395807016276502, + "tokens_seen": 1028174848 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003477231695085256, + "loss": 2.6741, + "theoretical_loss": 3.6395587382454018, + "tokens_seen": 1028240384 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034771313941825476, + "loss": 2.9862, + "theoretical_loss": 3.639536776654901, + "tokens_seen": 1028305920 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034770310932798394, + "loss": 2.9435, + "theoretical_loss": 3.6395148168558866, + "tokens_seen": 1028371456 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034769307923771317, + "loss": 2.5491, + "theoretical_loss": 3.639492858848099, + "tokens_seen": 1028436992 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003476830491474423, + "loss": 2.8525, + "theoretical_loss": 3.6394709026312775, + "tokens_seen": 1028502528 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034767301905717153, + "loss": 2.7869, + "theoretical_loss": 3.6394489482051626, + "tokens_seen": 1028568064 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034766298896690066, + "loss": 2.9154, + "theoretical_loss": 3.639426995569494, + "tokens_seen": 1028633600 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003476529588766299, + "loss": 2.6809, + "theoretical_loss": 3.6394050447240107, + "tokens_seen": 1028699136 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003476429287863591, + "loss": 2.9911, + "theoretical_loss": 3.6393830956684536, + "tokens_seen": 1028764672 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034763289869608826, + "loss": 2.6227, + "theoretical_loss": 3.6393611484025628, + "tokens_seen": 1028830208 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034762286860581744, + "loss": 2.775, + "theoretical_loss": 3.639339202926078, + "tokens_seen": 1028895744 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1178361, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4620256423950195, + "objective/train/theoretical_loss": 3.6393337168365205, + "objective/train/tokens_used": 1049372128, + "theoretical_loss": 3.6393337168365205, + "tokens_seen": 1028912128 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003476128385155466, + "loss": 2.7881, + "theoretical_loss": 3.639317259238739, + "tokens_seen": 1028961280 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003476028084252758, + "loss": 2.8292, + "theoretical_loss": 3.639295317340287, + "tokens_seen": 1029026816 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034759277833500504, + "loss": 2.6422, + "theoretical_loss": 3.6392733772304613, + "tokens_seen": 1029092352 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003475827482447342, + "loss": 2.971, + "theoretical_loss": 3.639251438909003, + "tokens_seen": 1029157888 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003475727181544634, + "loss": 2.6485, + "theoretical_loss": 3.6392295023756516, + "tokens_seen": 1029223424 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034756268806419263, + "loss": 2.9472, + "theoretical_loss": 3.6392075676301485, + "tokens_seen": 1029288960 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034755265797392176, + "loss": 2.6777, + "theoretical_loss": 3.6391856346722333, + "tokens_seen": 1029354496 + }, + { + "epoch": 3.04, + "learning_rate": 0.000347542627883651, + "loss": 2.6996, + "theoretical_loss": 3.639163703501647, + "tokens_seen": 1029420032 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003475325977933801, + "loss": 2.8, + "theoretical_loss": 3.63914177411813, + "tokens_seen": 1029485568 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034752256770310936, + "loss": 2.7562, + "theoretical_loss": 3.6391198465214227, + "tokens_seen": 1029551104 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034751253761283854, + "loss": 2.6978, + "theoretical_loss": 3.6390979207112664, + "tokens_seen": 1029616640 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003475025075225677, + "loss": 2.6952, + "theoretical_loss": 3.639075996687401, + "tokens_seen": 1029682176 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003474924774322969, + "loss": 2.7032, + "theoretical_loss": 3.6390540744495685, + "tokens_seen": 1029747712 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003474824473420261, + "loss": 2.6098, + "theoretical_loss": 3.639032153997509, + "tokens_seen": 1029813248 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034747241725175527, + "loss": 2.9094, + "theoretical_loss": 3.6390102353309626, + "tokens_seen": 1029878784 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003474623871614845, + "loss": 2.6849, + "theoretical_loss": 3.6389883184496723, + "tokens_seen": 1029944320 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034745235707121363, + "loss": 2.8267, + "theoretical_loss": 3.638966403353377, + "tokens_seen": 1030009856 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034744232698094286, + "loss": 2.8079, + "theoretical_loss": 3.6389444900418186, + "tokens_seen": 1030075392 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034743229689067204, + "loss": 2.6925, + "theoretical_loss": 3.6389225785147388, + "tokens_seen": 1030140928 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003474222668004012, + "loss": 2.8071, + "theoretical_loss": 3.638900668771878, + "tokens_seen": 1030206464 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003474122367101304, + "loss": 2.8804, + "theoretical_loss": 3.638878760812978, + "tokens_seen": 1030272000 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003474022066198596, + "loss": 2.6228, + "theoretical_loss": 3.6388568546377793, + "tokens_seen": 1030337536 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034739217652958877, + "loss": 2.6782, + "theoretical_loss": 3.6388349502460238, + "tokens_seen": 1030403072 + }, + { + "epoch": 3.04, + "learning_rate": 0.000347382146439318, + "loss": 2.8143, + "theoretical_loss": 3.638813047637453, + "tokens_seen": 1030468608 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034737211634904713, + "loss": 2.5234, + "theoretical_loss": 3.6387911468118075, + "tokens_seen": 1030534144 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1179088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1466615200042725, + "objective/train/theoretical_loss": 3.6387856718839484, + "objective/train/tokens_used": 1051010528, + "theoretical_loss": 3.6387856718839484, + "tokens_seen": 1030550528 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034736208625877637, + "loss": 2.859, + "theoretical_loss": 3.6387692477688303, + "tokens_seen": 1030599680 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003473520561685055, + "loss": 2.7063, + "theoretical_loss": 3.6387473505082615, + "tokens_seen": 1030665216 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034734202607823473, + "loss": 2.4725, + "theoretical_loss": 3.6387254550298436, + "tokens_seen": 1030730752 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003473319959879639, + "loss": 2.8637, + "theoretical_loss": 3.6387035613333176, + "tokens_seen": 1030796288 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003473219658976931, + "loss": 2.9199, + "theoretical_loss": 3.6386816694184256, + "tokens_seen": 1030861824 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034731193580742227, + "loss": 2.8224, + "theoretical_loss": 3.6386597792849096, + "tokens_seen": 1030927360 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034730190571715145, + "loss": 2.6093, + "theoretical_loss": 3.638637890932511, + "tokens_seen": 1030992896 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034729187562688063, + "loss": 2.783, + "theoretical_loss": 3.638616004360972, + "tokens_seen": 1031058432 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034728184553660987, + "loss": 2.7837, + "theoretical_loss": 3.6385941195700338, + "tokens_seen": 1031123968 + }, + { + "epoch": 3.04, + "learning_rate": 0.000347271815446339, + "loss": 2.644, + "theoretical_loss": 3.638572236559439, + "tokens_seen": 1031189504 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034726178535606823, + "loss": 2.708, + "theoretical_loss": 3.63855035532893, + "tokens_seen": 1031255040 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003472517552657974, + "loss": 2.8875, + "theoretical_loss": 3.638528475878248, + "tokens_seen": 1031320576 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003472417251755266, + "loss": 2.712, + "theoretical_loss": 3.6385065982071354, + "tokens_seen": 1031386112 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003472316950852558, + "loss": 2.7669, + "theoretical_loss": 3.6384847223153347, + "tokens_seen": 1031451648 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034722166499498496, + "loss": 2.8086, + "theoretical_loss": 3.6384628482025887, + "tokens_seen": 1031517184 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034721163490471414, + "loss": 2.6404, + "theoretical_loss": 3.6384409758686385, + "tokens_seen": 1031582720 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034720160481444337, + "loss": 2.6375, + "theoretical_loss": 3.6384191053132264, + "tokens_seen": 1031648256 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003471915747241725, + "loss": 2.8026, + "theoretical_loss": 3.638397236536096, + "tokens_seen": 1031713792 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034718154463390173, + "loss": 2.5592, + "theoretical_loss": 3.6383753695369894, + "tokens_seen": 1031779328 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034717151454363086, + "loss": 2.6478, + "theoretical_loss": 3.6383535043156483, + "tokens_seen": 1031844864 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003471614844533601, + "loss": 2.977, + "theoretical_loss": 3.638331640871816, + "tokens_seen": 1031910400 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003471514543630893, + "loss": 2.4206, + "theoretical_loss": 3.638309779205235, + "tokens_seen": 1031975936 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034714142427281846, + "loss": 2.4946, + "theoretical_loss": 3.6382879193156485, + "tokens_seen": 1032041472 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034713139418254764, + "loss": 2.6483, + "theoretical_loss": 3.6382660612027977, + "tokens_seen": 1032107008 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003471213640922768, + "loss": 2.6798, + "theoretical_loss": 3.638244204866427, + "tokens_seen": 1032172544 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1180390, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7738821506500244, + "objective/train/theoretical_loss": 3.6382387410598787, + "objective/train/tokens_used": 1052648928, + "theoretical_loss": 3.6382387410598787, + "tokens_seen": 1032188928 + }, + { + "epoch": 3.04, + "learning_rate": 0.000347111334002006, + "loss": 2.8191, + "theoretical_loss": 3.638222350306278, + "tokens_seen": 1032238080 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034710130391173524, + "loss": 2.733, + "theoretical_loss": 3.638200497522095, + "tokens_seen": 1032303616 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034709127382146436, + "loss": 2.7866, + "theoretical_loss": 3.6381786465136194, + "tokens_seen": 1032369152 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003470812437311936, + "loss": 2.7378, + "theoretical_loss": 3.6381567972805957, + "tokens_seen": 1032434688 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003470712136409228, + "loss": 2.5182, + "theoretical_loss": 3.6381349498227653, + "tokens_seen": 1032500224 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034706118355065196, + "loss": 2.5097, + "theoretical_loss": 3.6381131041398724, + "tokens_seen": 1032565760 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034705115346038114, + "loss": 2.8444, + "theoretical_loss": 3.6380912602316604, + "tokens_seen": 1032631296 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003470411233701103, + "loss": 2.9808, + "theoretical_loss": 3.6380694180978717, + "tokens_seen": 1032696832 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003470310932798395, + "loss": 2.7574, + "theoretical_loss": 3.6380475777382504, + "tokens_seen": 1032762368 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034702106318956874, + "loss": 2.7757, + "theoretical_loss": 3.638025739152539, + "tokens_seen": 1032827904 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034701103309929787, + "loss": 2.676, + "theoretical_loss": 3.638003902340481, + "tokens_seen": 1032893440 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003470010030090271, + "loss": 2.6281, + "theoretical_loss": 3.63798206730182, + "tokens_seen": 1032958976 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034699097291875623, + "loss": 2.5487, + "theoretical_loss": 3.6379602340363, + "tokens_seen": 1033024512 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034698094282848547, + "loss": 2.8011, + "theoretical_loss": 3.637938402543664, + "tokens_seen": 1033090048 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034697091273821465, + "loss": 2.71, + "theoretical_loss": 3.6379165728236553, + "tokens_seen": 1033155584 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034696088264794383, + "loss": 2.7223, + "theoretical_loss": 3.637894744876018, + "tokens_seen": 1033221120 + }, + { + "epoch": 3.04, + "learning_rate": 0.000346950852557673, + "loss": 2.4191, + "theoretical_loss": 3.637872918700496, + "tokens_seen": 1033286656 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034694082246740224, + "loss": 2.9634, + "theoretical_loss": 3.6378510942968325, + "tokens_seen": 1033352192 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034693079237713137, + "loss": 2.8381, + "theoretical_loss": 3.6378292716647715, + "tokens_seen": 1033417728 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003469207622868606, + "loss": 2.7367, + "theoretical_loss": 3.637807450804057, + "tokens_seen": 1033483264 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034691073219658973, + "loss": 2.5099, + "theoretical_loss": 3.637785631714433, + "tokens_seen": 1033548800 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034690070210631897, + "loss": 2.8073, + "theoretical_loss": 3.637763814395643, + "tokens_seen": 1033614336 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034689067201604815, + "loss": 2.9439, + "theoretical_loss": 3.6377419988474315, + "tokens_seen": 1033679872 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034688064192577733, + "loss": 2.8287, + "theoretical_loss": 3.6377201850695418, + "tokens_seen": 1033745408 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003468706118355065, + "loss": 2.6707, + "theoretical_loss": 3.6376983730617187, + "tokens_seen": 1033810944 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1181093, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.383265256881714, + "objective/train/theoretical_loss": 3.637692920336306, + "objective/train/tokens_used": 1054287328, + "theoretical_loss": 3.637692920336306, + "tokens_seen": 1033827328 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003468605817452357, + "loss": 2.6716, + "theoretical_loss": 3.6376765628237067, + "tokens_seen": 1033876480 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003468505516549649, + "loss": 2.4495, + "theoretical_loss": 3.6376547543552493, + "tokens_seen": 1033942016 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003468405215646941, + "loss": 2.8331, + "theoretical_loss": 3.6376329476560914, + "tokens_seen": 1034007552 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003468304914744233, + "loss": 2.8199, + "theoretical_loss": 3.6376111427259765, + "tokens_seen": 1034073088 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034682046138415247, + "loss": 2.6642, + "theoretical_loss": 3.6375893395646495, + "tokens_seen": 1034138624 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034681043129388165, + "loss": 2.6186, + "theoretical_loss": 3.637567538171855, + "tokens_seen": 1034204160 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034680040120361083, + "loss": 2.8684, + "theoretical_loss": 3.637545738547337, + "tokens_seen": 1034269696 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034679037111334007, + "loss": 2.6562, + "theoretical_loss": 3.6375239406908406, + "tokens_seen": 1034335232 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003467803410230692, + "loss": 2.7554, + "theoretical_loss": 3.6375021446021103, + "tokens_seen": 1034400768 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034677031093279843, + "loss": 3.0065, + "theoretical_loss": 3.63748035028089, + "tokens_seen": 1034466304 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003467602808425276, + "loss": 2.8307, + "theoretical_loss": 3.6374585577269256, + "tokens_seen": 1034531840 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003467502507522568, + "loss": 2.647, + "theoretical_loss": 3.6374367669399605, + "tokens_seen": 1034597376 + }, + { + "epoch": 3.04, + "learning_rate": 0.000346740220661986, + "loss": 2.8376, + "theoretical_loss": 3.637414977919741, + "tokens_seen": 1034662912 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034673019057171516, + "loss": 2.7634, + "theoretical_loss": 3.6373931906660113, + "tokens_seen": 1034728448 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034672016048144434, + "loss": 2.6724, + "theoretical_loss": 3.6373714051785155, + "tokens_seen": 1034793984 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034671013039117357, + "loss": 2.6667, + "theoretical_loss": 3.637349621457, + "tokens_seen": 1034859520 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003467001003009027, + "loss": 3.0254, + "theoretical_loss": 3.6373278395012085, + "tokens_seen": 1034925056 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034669007021063193, + "loss": 2.7656, + "theoretical_loss": 3.6373060593108875, + "tokens_seen": 1034990592 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034668004012036106, + "loss": 2.6329, + "theoretical_loss": 3.6372842808857806, + "tokens_seen": 1035056128 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003466700100300903, + "loss": 2.7772, + "theoretical_loss": 3.6372625042256344, + "tokens_seen": 1035121664 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003466599799398195, + "loss": 2.7761, + "theoretical_loss": 3.637240729330193, + "tokens_seen": 1035187200 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034664994984954866, + "loss": 2.7121, + "theoretical_loss": 3.637218956199202, + "tokens_seen": 1035252736 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034663991975927784, + "loss": 2.8773, + "theoretical_loss": 3.637197184832407, + "tokens_seen": 1035318272 + }, + { + "epoch": 3.04, + "learning_rate": 0.000346629889669007, + "loss": 2.7028, + "theoretical_loss": 3.6371754152295535, + "tokens_seen": 1035383808 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003466198595787362, + "loss": 2.7789, + "theoretical_loss": 3.6371536473903863, + "tokens_seen": 1035449344 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1182527, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.771360158920288, + "objective/train/theoretical_loss": 3.637148205706141, + "objective/train/tokens_used": 1055925728, + "theoretical_loss": 3.637148205706141, + "tokens_seen": 1035465728 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034660982948846544, + "loss": 2.901, + "theoretical_loss": 3.6371318813146516, + "tokens_seen": 1035514880 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034659979939819456, + "loss": 2.9041, + "theoretical_loss": 3.637110117002095, + "tokens_seen": 1035580416 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003465897693079238, + "loss": 2.9562, + "theoretical_loss": 3.6370883544524615, + "tokens_seen": 1035645952 + }, + { + "epoch": 3.04, + "learning_rate": 0.000346579739217653, + "loss": 2.843, + "theoretical_loss": 3.637066593665497, + "tokens_seen": 1035711488 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034656970912738216, + "loss": 2.7811, + "theoretical_loss": 3.637044834640947, + "tokens_seen": 1035777024 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034655967903711134, + "loss": 2.5779, + "theoretical_loss": 3.637023077378558, + "tokens_seen": 1035842560 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003465496489468405, + "loss": 2.7218, + "theoretical_loss": 3.637001321878075, + "tokens_seen": 1035908096 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003465396188565697, + "loss": 2.7265, + "theoretical_loss": 3.6369795681392443, + "tokens_seen": 1035973632 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034652958876629894, + "loss": 2.8509, + "theoretical_loss": 3.6369578161618117, + "tokens_seen": 1036039168 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034651955867602807, + "loss": 2.6018, + "theoretical_loss": 3.636936065945523, + "tokens_seen": 1036104704 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003465095285857573, + "loss": 2.5405, + "theoretical_loss": 3.6369143174901244, + "tokens_seen": 1036170240 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034649949849548643, + "loss": 2.644, + "theoretical_loss": 3.6368925707953625, + "tokens_seen": 1036235776 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034648946840521567, + "loss": 2.7706, + "theoretical_loss": 3.6368708258609823, + "tokens_seen": 1036301312 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034647943831494485, + "loss": 2.7791, + "theoretical_loss": 3.636849082686731, + "tokens_seen": 1036366848 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034646940822467403, + "loss": 2.528, + "theoretical_loss": 3.6368273412723546, + "tokens_seen": 1036432384 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003464593781344032, + "loss": 2.7708, + "theoretical_loss": 3.6368056016175987, + "tokens_seen": 1036497920 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034644934804413244, + "loss": 2.8057, + "theoretical_loss": 3.6367838637222105, + "tokens_seen": 1036563456 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034643931795386157, + "loss": 2.8408, + "theoretical_loss": 3.6367621275859356, + "tokens_seen": 1036628992 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003464292878635908, + "loss": 2.7432, + "theoretical_loss": 3.6367403932085214, + "tokens_seen": 1036694528 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034641925777331993, + "loss": 2.7451, + "theoretical_loss": 3.636718660589714, + "tokens_seen": 1036760064 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034640922768304917, + "loss": 2.7478, + "theoretical_loss": 3.6366969297292595, + "tokens_seen": 1036825600 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034639919759277835, + "loss": 2.71, + "theoretical_loss": 3.6366752006269056, + "tokens_seen": 1036891136 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034638916750250753, + "loss": 2.8828, + "theoretical_loss": 3.636653473282397, + "tokens_seen": 1036956672 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003463791374122367, + "loss": 2.5511, + "theoretical_loss": 3.636631747695483, + "tokens_seen": 1037022208 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003463691073219659, + "loss": 2.2055, + "theoretical_loss": 3.6366100238659085, + "tokens_seen": 1037087744 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1183367, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4552834033966064, + "objective/train/theoretical_loss": 3.636604593183069, + "objective/train/tokens_used": 1057564128, + "theoretical_loss": 3.636604593183069, + "tokens_seen": 1037104128 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003463590772316951, + "loss": 2.5444, + "theoretical_loss": 3.63658830179342, + "tokens_seen": 1037153280 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003463490471414243, + "loss": 2.6151, + "theoretical_loss": 3.6365665814777666, + "tokens_seen": 1037218816 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034633901705115344, + "loss": 2.7626, + "theoretical_loss": 3.6365448629186927, + "tokens_seen": 1037284352 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034632898696088267, + "loss": 2.8501, + "theoretical_loss": 3.6365231461159473, + "tokens_seen": 1037349888 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003463189568706118, + "loss": 2.8395, + "theoretical_loss": 3.6365014310692754, + "tokens_seen": 1037415424 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034630892678034103, + "loss": 2.8243, + "theoretical_loss": 3.636479717778426, + "tokens_seen": 1037480960 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003462988966900702, + "loss": 2.8931, + "theoretical_loss": 3.6364580062431453, + "tokens_seen": 1037546496 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003462888665997994, + "loss": 2.4737, + "theoretical_loss": 3.6364362964631805, + "tokens_seen": 1037612032 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003462788365095286, + "loss": 2.6304, + "theoretical_loss": 3.636414588438279, + "tokens_seen": 1037677568 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003462688064192578, + "loss": 2.7085, + "theoretical_loss": 3.636392882168188, + "tokens_seen": 1037743104 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034625877632898694, + "loss": 2.7709, + "theoretical_loss": 3.636371177652655, + "tokens_seen": 1037808640 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003462487462387162, + "loss": 2.7462, + "theoretical_loss": 3.6363494748914267, + "tokens_seen": 1037874176 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003462387161484453, + "loss": 2.7757, + "theoretical_loss": 3.6363277738842514, + "tokens_seen": 1037939712 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034622868605817454, + "loss": 2.7106, + "theoretical_loss": 3.636306074630876, + "tokens_seen": 1038005248 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003462186559679037, + "loss": 2.5829, + "theoretical_loss": 3.636284377131049, + "tokens_seen": 1038070784 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003462086258776329, + "loss": 2.6035, + "theoretical_loss": 3.636262681384517, + "tokens_seen": 1038136320 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003461985957873621, + "loss": 2.6727, + "theoretical_loss": 3.636240987391028, + "tokens_seen": 1038201856 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034618856569709126, + "loss": 2.8162, + "theoretical_loss": 3.6362192951503296, + "tokens_seen": 1038267392 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034617853560682044, + "loss": 2.7681, + "theoretical_loss": 3.6361976046621702, + "tokens_seen": 1038332928 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003461685055165497, + "loss": 2.7701, + "theoretical_loss": 3.6361759159262963, + "tokens_seen": 1038398464 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003461584754262788, + "loss": 2.7035, + "theoretical_loss": 3.636154228942457, + "tokens_seen": 1038464000 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034614844533600804, + "loss": 2.986, + "theoretical_loss": 3.6361325437103993, + "tokens_seen": 1038529536 + }, + { + "epoch": 3.04, + "learning_rate": 0.00034613841524573717, + "loss": 2.9617, + "theoretical_loss": 3.636110860229872, + "tokens_seen": 1038595072 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003461283851554664, + "loss": 2.6108, + "theoretical_loss": 3.636089178500623, + "tokens_seen": 1038660608 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003461183550651956, + "loss": 2.639, + "theoretical_loss": 3.6360674985223995, + "tokens_seen": 1038726144 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1184791, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.915973663330078, + "objective/train/theoretical_loss": 3.636062078801412, + "objective/train/tokens_used": 1059202528, + "theoretical_loss": 3.636062078801412, + "tokens_seen": 1038742528 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034610832497492476, + "loss": 2.8912, + "theoretical_loss": 3.6360458202949504, + "tokens_seen": 1038791680 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034609829488465395, + "loss": 2.5813, + "theoretical_loss": 3.636024143818024, + "tokens_seen": 1038857216 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003460882647943832, + "loss": 2.761, + "theoretical_loss": 3.636002469091368, + "tokens_seen": 1038922752 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034607823470411236, + "loss": 2.6937, + "theoretical_loss": 3.6359807961147306, + "tokens_seen": 1038988288 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034606820461384154, + "loss": 2.5175, + "theoretical_loss": 3.6359591248878607, + "tokens_seen": 1039053824 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003460581745235707, + "loss": 2.9226, + "theoretical_loss": 3.635937455410507, + "tokens_seen": 1039119360 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003460481444332999, + "loss": 2.6023, + "theoretical_loss": 3.6359157876824164, + "tokens_seen": 1039184896 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034603811434302914, + "loss": 2.7549, + "theoretical_loss": 3.6358941217033385, + "tokens_seen": 1039250432 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034602808425275827, + "loss": 2.6038, + "theoretical_loss": 3.635872457473022, + "tokens_seen": 1039315968 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003460180541624875, + "loss": 2.8857, + "theoretical_loss": 3.635850794991215, + "tokens_seen": 1039381504 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034600802407221663, + "loss": 2.6358, + "theoretical_loss": 3.6358291342576665, + "tokens_seen": 1039447040 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034599799398194587, + "loss": 2.8824, + "theoretical_loss": 3.635807475272125, + "tokens_seen": 1039512576 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034598796389167505, + "loss": 2.8954, + "theoretical_loss": 3.635785818034339, + "tokens_seen": 1039578112 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034597793380140423, + "loss": 2.682, + "theoretical_loss": 3.6357641625440578, + "tokens_seen": 1039643648 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003459679037111334, + "loss": 2.6801, + "theoretical_loss": 3.6357425088010302, + "tokens_seen": 1039709184 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034595787362086264, + "loss": 2.5574, + "theoretical_loss": 3.635720856805005, + "tokens_seen": 1039774720 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034594784353059177, + "loss": 2.7004, + "theoretical_loss": 3.63569920655573, + "tokens_seen": 1039840256 + }, + { + "epoch": 3.05, + "learning_rate": 0.000345937813440321, + "loss": 2.4317, + "theoretical_loss": 3.635677558052956, + "tokens_seen": 1039905792 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034592778335005013, + "loss": 2.69, + "theoretical_loss": 3.635655911296431, + "tokens_seen": 1039971328 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034591775325977937, + "loss": 2.6949, + "theoretical_loss": 3.635634266285905, + "tokens_seen": 1040036864 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034590772316950855, + "loss": 2.9375, + "theoretical_loss": 3.6356126230211263, + "tokens_seen": 1040102400 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034589769307923773, + "loss": 2.5751, + "theoretical_loss": 3.635590981501845, + "tokens_seen": 1040167936 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003458876629889669, + "loss": 2.4966, + "theoretical_loss": 3.635569341727809, + "tokens_seen": 1040233472 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003458776328986961, + "loss": 2.6717, + "theoretical_loss": 3.6355477036987685, + "tokens_seen": 1040299008 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003458676028084253, + "loss": 2.7597, + "theoretical_loss": 3.6355260674144727, + "tokens_seen": 1040364544 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1185522, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9252572059631348, + "objective/train/theoretical_loss": 3.6355206586159863, + "objective/train/tokens_used": 1060840928, + "theoretical_loss": 3.6355206586159863, + "tokens_seen": 1040380928 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003458575727181545, + "loss": 2.8402, + "theoretical_loss": 3.6355044328746713, + "tokens_seen": 1040430080 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034584754262788364, + "loss": 2.8633, + "theoretical_loss": 3.6354828000791137, + "tokens_seen": 1040495616 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034583751253761287, + "loss": 2.6076, + "theoretical_loss": 3.6354611690275496, + "tokens_seen": 1040561152 + }, + { + "epoch": 3.05, + "learning_rate": 0.000345827482447342, + "loss": 2.67, + "theoretical_loss": 3.6354395397197274, + "tokens_seen": 1040626688 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034581745235707123, + "loss": 2.7122, + "theoretical_loss": 3.6354179121553987, + "tokens_seen": 1040692224 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003458074222668004, + "loss": 2.7741, + "theoretical_loss": 3.635396286334311, + "tokens_seen": 1040757760 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003457973921765296, + "loss": 2.7348, + "theoretical_loss": 3.6353746622562157, + "tokens_seen": 1040823296 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003457873620862588, + "loss": 2.8063, + "theoretical_loss": 3.6353530399208616, + "tokens_seen": 1040888832 + }, + { + "epoch": 3.05, + "learning_rate": 0.000345777331995988, + "loss": 2.4041, + "theoretical_loss": 3.6353314193279993, + "tokens_seen": 1040954368 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034576730190571714, + "loss": 2.9207, + "theoretical_loss": 3.635309800477378, + "tokens_seen": 1041019904 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003457572718154464, + "loss": 2.7454, + "theoretical_loss": 3.6352881833687483, + "tokens_seen": 1041085440 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003457472417251755, + "loss": 2.7788, + "theoretical_loss": 3.6352665680018594, + "tokens_seen": 1041150976 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034573721163490474, + "loss": 2.909, + "theoretical_loss": 3.6352449543764624, + "tokens_seen": 1041216512 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003457271815446339, + "loss": 2.8034, + "theoretical_loss": 3.6352233424923064, + "tokens_seen": 1041282048 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003457171514543631, + "loss": 2.8243, + "theoretical_loss": 3.6352017323491417, + "tokens_seen": 1041347584 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003457071213640923, + "loss": 2.6581, + "theoretical_loss": 3.635180123946719, + "tokens_seen": 1041413120 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034569709127382146, + "loss": 2.7839, + "theoretical_loss": 3.6351585172847884, + "tokens_seen": 1041478656 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034568706118355064, + "loss": 2.6316, + "theoretical_loss": 3.6351369123631, + "tokens_seen": 1041544192 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003456770310932799, + "loss": 2.5668, + "theoretical_loss": 3.635115309181404, + "tokens_seen": 1041609728 + }, + { + "epoch": 3.05, + "learning_rate": 0.000345667001003009, + "loss": 2.7055, + "theoretical_loss": 3.6350937077394514, + "tokens_seen": 1041675264 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034565697091273824, + "loss": 2.7145, + "theoretical_loss": 3.635072108036992, + "tokens_seen": 1041740800 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034564694082246737, + "loss": 2.8442, + "theoretical_loss": 3.6350505100737767, + "tokens_seen": 1041806336 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003456369107321966, + "loss": 2.6584, + "theoretical_loss": 3.6350289138495557, + "tokens_seen": 1041871872 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003456268806419258, + "loss": 2.7529, + "theoretical_loss": 3.63500731936408, + "tokens_seen": 1041937408 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034561685055165496, + "loss": 2.7175, + "theoretical_loss": 3.6349857266171, + "tokens_seen": 1042002944 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1186074, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.702031135559082, + "objective/train/theoretical_loss": 3.6349803287019657, + "objective/train/tokens_used": 1062479328, + "theoretical_loss": 3.6349803287019657, + "tokens_seen": 1042019328 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034560682046138415, + "loss": 2.7838, + "theoretical_loss": 3.634964135608367, + "tokens_seen": 1042068480 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003455967903711134, + "loss": 2.7338, + "theoretical_loss": 3.634942546337631, + "tokens_seen": 1042134016 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003455867602808425, + "loss": 2.6965, + "theoretical_loss": 3.634920958804643, + "tokens_seen": 1042199552 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034557673019057174, + "loss": 2.7916, + "theoretical_loss": 3.634899373009154, + "tokens_seen": 1042265088 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034556670010030087, + "loss": 2.4773, + "theoretical_loss": 3.6348777889509147, + "tokens_seen": 1042330624 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003455566700100301, + "loss": 2.3448, + "theoretical_loss": 3.634856206629677, + "tokens_seen": 1042396160 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003455466399197593, + "loss": 2.6472, + "theoretical_loss": 3.6348346260451905, + "tokens_seen": 1042461696 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034553660982948847, + "loss": 2.8073, + "theoretical_loss": 3.634813047197207, + "tokens_seen": 1042527232 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034552657973921765, + "loss": 2.7546, + "theoretical_loss": 3.6347914700854775, + "tokens_seen": 1042592768 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034551654964894683, + "loss": 2.8483, + "theoretical_loss": 3.6347698947097538, + "tokens_seen": 1042658304 + }, + { + "epoch": 3.05, + "learning_rate": 0.000345506519558676, + "loss": 2.6587, + "theoretical_loss": 3.634748321069786, + "tokens_seen": 1042723840 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034549648946840525, + "loss": 2.675, + "theoretical_loss": 3.634726749165326, + "tokens_seen": 1042789376 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003454864593781344, + "loss": 2.642, + "theoretical_loss": 3.634705178996125, + "tokens_seen": 1042854912 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003454764292878636, + "loss": 2.4778, + "theoretical_loss": 3.6346836105619347, + "tokens_seen": 1042920448 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034546639919759274, + "loss": 2.4542, + "theoretical_loss": 3.634662043862506, + "tokens_seen": 1042985984 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034545636910732197, + "loss": 2.4854, + "theoretical_loss": 3.634640478897591, + "tokens_seen": 1043051520 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034544633901705115, + "loss": 2.834, + "theoretical_loss": 3.6346189156669406, + "tokens_seen": 1043117056 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034543630892678033, + "loss": 3.0203, + "theoretical_loss": 3.6345973541703067, + "tokens_seen": 1043182592 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003454262788365095, + "loss": 2.9097, + "theoretical_loss": 3.634575794407441, + "tokens_seen": 1043248128 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034541624874623875, + "loss": 2.8772, + "theoretical_loss": 3.6345542363780954, + "tokens_seen": 1043313664 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003454062186559679, + "loss": 2.6594, + "theoretical_loss": 3.634532680082021, + "tokens_seen": 1043379200 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003453961885656971, + "loss": 2.8194, + "theoretical_loss": 3.63451112551897, + "tokens_seen": 1043444736 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034538615847542624, + "loss": 2.8311, + "theoretical_loss": 3.634489572688694, + "tokens_seen": 1043510272 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003453761283851555, + "loss": 2.6407, + "theoretical_loss": 3.634468021590945, + "tokens_seen": 1043575808 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034536609829488466, + "loss": 2.7576, + "theoretical_loss": 3.6344464722254757, + "tokens_seen": 1043641344 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1186076, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4055583477020264, + "objective/train/theoretical_loss": 3.634441085154748, + "objective/train/tokens_used": 1064117728, + "theoretical_loss": 3.634441085154748, + "tokens_seen": 1043657728 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034535606820461384, + "loss": 2.7181, + "theoretical_loss": 3.634424924592037, + "tokens_seen": 1043706880 + }, + { + "epoch": 3.05, + "learning_rate": 0.000345346038114343, + "loss": 2.7111, + "theoretical_loss": 3.634403378690381, + "tokens_seen": 1043772416 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003453360080240722, + "loss": 2.8271, + "theoretical_loss": 3.6343818345202603, + "tokens_seen": 1043837952 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034532597793380143, + "loss": 2.7099, + "theoretical_loss": 3.634360292081427, + "tokens_seen": 1043903488 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003453159478435306, + "loss": 2.8388, + "theoretical_loss": 3.634338751373633, + "tokens_seen": 1043969024 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003453059177532598, + "loss": 2.695, + "theoretical_loss": 3.634317212396631, + "tokens_seen": 1044034560 + }, + { + "epoch": 3.05, + "learning_rate": 0.000345295887662989, + "loss": 2.5767, + "theoretical_loss": 3.634295675150173, + "tokens_seen": 1044100096 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003452858575727182, + "loss": 2.8101, + "theoretical_loss": 3.634274139634011, + "tokens_seen": 1044165632 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034527582748244734, + "loss": 3.153, + "theoretical_loss": 3.634252605847898, + "tokens_seen": 1044231168 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003452657973921766, + "loss": 2.9676, + "theoretical_loss": 3.634231073791586, + "tokens_seen": 1044296704 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003452557673019057, + "loss": 2.6099, + "theoretical_loss": 3.634209543464828, + "tokens_seen": 1044362240 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034524573721163494, + "loss": 2.8402, + "theoretical_loss": 3.6341880148673766, + "tokens_seen": 1044427776 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003452357071213641, + "loss": 2.9848, + "theoretical_loss": 3.6341664879989835, + "tokens_seen": 1044493312 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003452256770310933, + "loss": 2.7283, + "theoretical_loss": 3.634144962859402, + "tokens_seen": 1044558848 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003452156469408225, + "loss": 2.6106, + "theoretical_loss": 3.634123439448385, + "tokens_seen": 1044624384 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034520561685055166, + "loss": 2.696, + "theoretical_loss": 3.634101917765685, + "tokens_seen": 1044689920 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034519558676028084, + "loss": 2.7361, + "theoretical_loss": 3.6340803978110543, + "tokens_seen": 1044755456 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003451855566700101, + "loss": 2.813, + "theoretical_loss": 3.634058879584247, + "tokens_seen": 1044820992 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003451755265797392, + "loss": 2.8119, + "theoretical_loss": 3.6340373630850147, + "tokens_seen": 1044886528 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034516549648946844, + "loss": 2.7531, + "theoretical_loss": 3.6340158483131115, + "tokens_seen": 1044952064 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034515546639919757, + "loss": 2.8369, + "theoretical_loss": 3.6339943352682895, + "tokens_seen": 1045017600 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003451454363089268, + "loss": 2.7817, + "theoretical_loss": 3.633972823950302, + "tokens_seen": 1045083136 + }, + { + "epoch": 3.05, + "learning_rate": 0.000345135406218656, + "loss": 2.572, + "theoretical_loss": 3.633951314358903, + "tokens_seen": 1045148672 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034512537612838517, + "loss": 3.0036, + "theoretical_loss": 3.6339298064938443, + "tokens_seen": 1045214208 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034511534603811435, + "loss": 2.9289, + "theoretical_loss": 3.6339083003548796, + "tokens_seen": 1045279744 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1186840, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6896257400512695, + "objective/train/theoretical_loss": 3.6339029240898117, + "objective/train/tokens_used": 1065756128, + "theoretical_loss": 3.6339029240898117, + "tokens_seen": 1045296128 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003451053159478436, + "loss": 2.7279, + "theoretical_loss": 3.6338867959417622, + "tokens_seen": 1045345280 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003450952858575727, + "loss": 2.8756, + "theoretical_loss": 3.633865293254246, + "tokens_seen": 1045410816 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034508525576730194, + "loss": 2.6824, + "theoretical_loss": 3.633843792292083, + "tokens_seen": 1045476352 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034507522567703107, + "loss": 2.7099, + "theoretical_loss": 3.6338222930550286, + "tokens_seen": 1045541888 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003450651955867603, + "loss": 2.6207, + "theoretical_loss": 3.6338007955428346, + "tokens_seen": 1045607424 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003450551654964895, + "loss": 2.7052, + "theoretical_loss": 3.633779299755255, + "tokens_seen": 1045672960 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034504513540621867, + "loss": 2.7676, + "theoretical_loss": 3.633757805692044, + "tokens_seen": 1045738496 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034503510531594785, + "loss": 2.7697, + "theoretical_loss": 3.633736313352954, + "tokens_seen": 1045804032 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034502507522567703, + "loss": 2.6363, + "theoretical_loss": 3.6337148227377396, + "tokens_seen": 1045869568 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003450150451354062, + "loss": 2.8698, + "theoretical_loss": 3.633693333846154, + "tokens_seen": 1045935104 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034500501504513545, + "loss": 2.7141, + "theoretical_loss": 3.6336718466779514, + "tokens_seen": 1046000640 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003449949849548646, + "loss": 2.8788, + "theoretical_loss": 3.6336503612328857, + "tokens_seen": 1046066176 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003449849548645938, + "loss": 2.9623, + "theoretical_loss": 3.63362887751071, + "tokens_seen": 1046131712 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034497492477432294, + "loss": 2.7589, + "theoretical_loss": 3.633607395511179, + "tokens_seen": 1046197248 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034496489468405217, + "loss": 2.716, + "theoretical_loss": 3.6335859152340464, + "tokens_seen": 1046262784 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034495486459378135, + "loss": 2.9333, + "theoretical_loss": 3.633564436679066, + "tokens_seen": 1046328320 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034494483450351053, + "loss": 2.7096, + "theoretical_loss": 3.6335429598459927, + "tokens_seen": 1046393856 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003449348044132397, + "loss": 2.5753, + "theoretical_loss": 3.6335214847345796, + "tokens_seen": 1046459392 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034492477432296895, + "loss": 2.7974, + "theoretical_loss": 3.633500011344582, + "tokens_seen": 1046524928 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003449147442326981, + "loss": 2.733, + "theoretical_loss": 3.6334785396757527, + "tokens_seen": 1046590464 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003449047141424273, + "loss": 2.6909, + "theoretical_loss": 3.633457069727847, + "tokens_seen": 1046656000 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034489468405215644, + "loss": 2.966, + "theoretical_loss": 3.6334356015006186, + "tokens_seen": 1046721536 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003448846539618857, + "loss": 2.6867, + "theoretical_loss": 3.633414134993823, + "tokens_seen": 1046787072 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034487462387161486, + "loss": 2.9336, + "theoretical_loss": 3.6333926702072135, + "tokens_seen": 1046852608 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034486459378134404, + "loss": 2.6975, + "theoretical_loss": 3.6333712071405446, + "tokens_seen": 1046918144 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1188135, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7929697036743164, + "objective/train/theoretical_loss": 3.63336584164259, + "objective/train/tokens_used": 1067394528, + "theoretical_loss": 3.63336584164259, + "tokens_seen": 1046934528 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003448545636910732, + "loss": 2.5809, + "theoretical_loss": 3.6333497457935717, + "tokens_seen": 1046983680 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003448445336008024, + "loss": 2.8726, + "theoretical_loss": 3.6333282861660487, + "tokens_seen": 1047049216 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003448345035105316, + "loss": 2.6964, + "theoretical_loss": 3.63330682825773, + "tokens_seen": 1047114752 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003448244734202608, + "loss": 2.8283, + "theoretical_loss": 3.6332853720683715, + "tokens_seen": 1047180288 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034481444332998994, + "loss": 3.0839, + "theoretical_loss": 3.633263917597726, + "tokens_seen": 1047245824 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003448044132397192, + "loss": 2.8138, + "theoretical_loss": 3.6332424648455506, + "tokens_seen": 1047311360 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034479438314944836, + "loss": 2.6576, + "theoretical_loss": 3.633221013811598, + "tokens_seen": 1047376896 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034478435305917754, + "loss": 2.7789, + "theoretical_loss": 3.6331995644956243, + "tokens_seen": 1047442432 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003447743229689067, + "loss": 2.8961, + "theoretical_loss": 3.6331781168973842, + "tokens_seen": 1047507968 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003447642928786359, + "loss": 2.6799, + "theoretical_loss": 3.6331566710166325, + "tokens_seen": 1047573504 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003447542627883651, + "loss": 2.9108, + "theoretical_loss": 3.6331352268531245, + "tokens_seen": 1047639040 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003447442326980943, + "loss": 2.7632, + "theoretical_loss": 3.633113784406615, + "tokens_seen": 1047704576 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034473420260782345, + "loss": 2.8166, + "theoretical_loss": 3.6330923436768594, + "tokens_seen": 1047770112 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003447241725175527, + "loss": 2.8229, + "theoretical_loss": 3.6330709046636125, + "tokens_seen": 1047835648 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003447141424272818, + "loss": 3.0599, + "theoretical_loss": 3.63304946736663, + "tokens_seen": 1047901184 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034470411233701104, + "loss": 2.8003, + "theoretical_loss": 3.633028031785667, + "tokens_seen": 1047966720 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003446940822467402, + "loss": 2.8111, + "theoretical_loss": 3.633006597920479, + "tokens_seen": 1048032256 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003446840521564694, + "loss": 2.7593, + "theoretical_loss": 3.6329851657708208, + "tokens_seen": 1048097792 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003446740220661986, + "loss": 2.6875, + "theoretical_loss": 3.632963735336448, + "tokens_seen": 1048163328 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034466399197592777, + "loss": 2.8356, + "theoretical_loss": 3.632942306617117, + "tokens_seen": 1048228864 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034465396188565695, + "loss": 2.591, + "theoretical_loss": 3.6329208796125823, + "tokens_seen": 1048294400 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003446439317953862, + "loss": 2.6914, + "theoretical_loss": 3.6328994543226, + "tokens_seen": 1048359936 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003446339017051153, + "loss": 2.757, + "theoretical_loss": 3.632878030746925, + "tokens_seen": 1048425472 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034462387161484455, + "loss": 2.9459, + "theoretical_loss": 3.632856608885314, + "tokens_seen": 1048491008 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034461384152457373, + "loss": 2.9613, + "theoretical_loss": 3.6328351887375225, + "tokens_seen": 1048556544 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1188820, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7827885150909424, + "objective/train/theoretical_loss": 3.63282983396833, + "objective/train/tokens_used": 1069032928, + "theoretical_loss": 3.63282983396833, + "tokens_seen": 1048572928 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003446038114343029, + "loss": 2.8882, + "theoretical_loss": 3.6328137703033057, + "tokens_seen": 1048622080 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034459378134403214, + "loss": 2.6796, + "theoretical_loss": 3.63279235358242, + "tokens_seen": 1048687616 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034458375125376127, + "loss": 2.7243, + "theoretical_loss": 3.632770938574621, + "tokens_seen": 1048753152 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003445737211634905, + "loss": 2.835, + "theoretical_loss": 3.6327495252796647, + "tokens_seen": 1048818688 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003445636910732197, + "loss": 2.756, + "theoretical_loss": 3.632728113697307, + "tokens_seen": 1048884224 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034455366098294887, + "loss": 2.7928, + "theoretical_loss": 3.6327067038273047, + "tokens_seen": 1048949760 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034454363089267805, + "loss": 2.5859, + "theoretical_loss": 3.632685295669413, + "tokens_seen": 1049015296 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034453360080240723, + "loss": 2.7753, + "theoretical_loss": 3.6326638892233873, + "tokens_seen": 1049080832 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003445235707121364, + "loss": 2.7935, + "theoretical_loss": 3.6326424844889855, + "tokens_seen": 1049146368 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034451354062186565, + "loss": 2.6532, + "theoretical_loss": 3.6326210814659636, + "tokens_seen": 1049211904 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003445035105315948, + "loss": 2.7596, + "theoretical_loss": 3.632599680154077, + "tokens_seen": 1049277440 + }, + { + "epoch": 3.05, + "learning_rate": 0.000344493480441324, + "loss": 2.7922, + "theoretical_loss": 3.632578280553082, + "tokens_seen": 1049342976 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034448345035105314, + "loss": 2.6624, + "theoretical_loss": 3.632556882662736, + "tokens_seen": 1049408512 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034447342026078237, + "loss": 2.6397, + "theoretical_loss": 3.6325354864827943, + "tokens_seen": 1049474048 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034446339017051155, + "loss": 2.7459, + "theoretical_loss": 3.6325140920130146, + "tokens_seen": 1049539584 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034445336008024073, + "loss": 2.6949, + "theoretical_loss": 3.6324926992531523, + "tokens_seen": 1049605120 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003444433299899699, + "loss": 2.4783, + "theoretical_loss": 3.6324713082029643, + "tokens_seen": 1049670656 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034443329989969915, + "loss": 2.788, + "theoretical_loss": 3.632449918862208, + "tokens_seen": 1049736192 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003444232698094283, + "loss": 2.8112, + "theoretical_loss": 3.632428531230639, + "tokens_seen": 1049801728 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003444132397191575, + "loss": 2.7413, + "theoretical_loss": 3.6324071453080147, + "tokens_seen": 1049867264 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034440320962888664, + "loss": 2.6558, + "theoretical_loss": 3.6323857610940915, + "tokens_seen": 1049932800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003443931795386159, + "loss": 2.7811, + "theoretical_loss": 3.6323643785886266, + "tokens_seen": 1049998336 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034438314944834506, + "loss": 2.9847, + "theoretical_loss": 3.6323429977913766, + "tokens_seen": 1050063872 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034437311935807424, + "loss": 2.6683, + "theoretical_loss": 3.6323216187020986, + "tokens_seen": 1050129408 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003443630892678034, + "loss": 2.7443, + "theoretical_loss": 3.632300241320549, + "tokens_seen": 1050194944 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1190162, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4736685752868652, + "objective/train/theoretical_loss": 3.6322948972419664, + "objective/train/tokens_used": 1070671328, + "theoretical_loss": 3.6322948972419664, + "tokens_seen": 1050211328 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003443530591775326, + "loss": 2.7721, + "theoretical_loss": 3.6322788656464864, + "tokens_seen": 1050260480 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003443430290872618, + "loss": 2.5936, + "theoretical_loss": 3.632257491679666, + "tokens_seen": 1050326016 + }, + { + "epoch": 3.05, + "learning_rate": 0.000344332998996991, + "loss": 2.5859, + "theoretical_loss": 3.6322361194198463, + "tokens_seen": 1050391552 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034432296890672014, + "loss": 2.5561, + "theoretical_loss": 3.632214748866784, + "tokens_seen": 1050457088 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003443129388164494, + "loss": 2.7315, + "theoretical_loss": 3.6321933800202357, + "tokens_seen": 1050522624 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034430290872617856, + "loss": 2.6789, + "theoretical_loss": 3.63217201287996, + "tokens_seen": 1050588160 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034429287863590774, + "loss": 2.7474, + "theoretical_loss": 3.632150647445713, + "tokens_seen": 1050653696 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003442828485456369, + "loss": 2.8795, + "theoretical_loss": 3.6321292837172523, + "tokens_seen": 1050719232 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003442728184553661, + "loss": 2.7036, + "theoretical_loss": 3.6321079216943364, + "tokens_seen": 1050784768 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003442627883650953, + "loss": 2.6738, + "theoretical_loss": 3.6320865613767213, + "tokens_seen": 1050850304 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003442527582748245, + "loss": 2.6909, + "theoretical_loss": 3.6320652027641653, + "tokens_seen": 1050915840 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034424272818455365, + "loss": 2.6339, + "theoretical_loss": 3.6320438458564266, + "tokens_seen": 1050981376 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003442326980942829, + "loss": 2.8449, + "theoretical_loss": 3.632022490653261, + "tokens_seen": 1051046912 + }, + { + "epoch": 3.05, + "learning_rate": 0.000344222668004012, + "loss": 2.759, + "theoretical_loss": 3.6320011371544285, + "tokens_seen": 1051112448 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034421263791374124, + "loss": 2.7718, + "theoretical_loss": 3.631979785359685, + "tokens_seen": 1051177984 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003442026078234704, + "loss": 2.5683, + "theoretical_loss": 3.6319584352687895, + "tokens_seen": 1051243520 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003441925777331996, + "loss": 2.87, + "theoretical_loss": 3.6319370868814986, + "tokens_seen": 1051309056 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003441825476429288, + "loss": 2.9015, + "theoretical_loss": 3.631915740197571, + "tokens_seen": 1051374592 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034417251755265797, + "loss": 2.717, + "theoretical_loss": 3.6318943952167646, + "tokens_seen": 1051440128 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034416248746238715, + "loss": 2.6497, + "theoretical_loss": 3.631873051938837, + "tokens_seen": 1051505664 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003441524573721164, + "loss": 2.7191, + "theoretical_loss": 3.631851710363547, + "tokens_seen": 1051571200 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003441424272818455, + "loss": 2.7562, + "theoretical_loss": 3.6318303704906514, + "tokens_seen": 1051636736 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034413239719157475, + "loss": 2.6796, + "theoretical_loss": 3.6318090323199095, + "tokens_seen": 1051702272 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034412236710130393, + "loss": 2.7178, + "theoretical_loss": 3.631787695851079, + "tokens_seen": 1051767808 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003441123370110331, + "loss": 2.9961, + "theoretical_loss": 3.6317663610839177, + "tokens_seen": 1051833344 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1190623, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.515566825866699, + "objective/train/theoretical_loss": 3.6317610276579853, + "objective/train/tokens_used": 1072309728, + "theoretical_loss": 3.6317610276579853, + "tokens_seen": 1051849728 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003441023069207623, + "loss": 2.5649, + "theoretical_loss": 3.6317450280181847, + "tokens_seen": 1051898880 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034409227683049147, + "loss": 2.9343, + "theoretical_loss": 3.631723696653638, + "tokens_seen": 1051964416 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034408224674022065, + "loss": 2.8018, + "theoretical_loss": 3.631702366990036, + "tokens_seen": 1052029952 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003440722166499499, + "loss": 2.5757, + "theoretical_loss": 3.6316810390271366, + "tokens_seen": 1052095488 + }, + { + "epoch": 3.05, + "learning_rate": 0.000344062186559679, + "loss": 2.7342, + "theoretical_loss": 3.631659712764699, + "tokens_seen": 1052161024 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034405215646940825, + "loss": 2.7447, + "theoretical_loss": 3.6316383882024814, + "tokens_seen": 1052226560 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003440421263791374, + "loss": 2.5221, + "theoretical_loss": 3.631617065340243, + "tokens_seen": 1052292096 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003440320962888666, + "loss": 2.8617, + "theoretical_loss": 3.6315957441777407, + "tokens_seen": 1052357632 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003440220661985958, + "loss": 2.8462, + "theoretical_loss": 3.631574424714735, + "tokens_seen": 1052423168 + }, + { + "epoch": 3.05, + "learning_rate": 0.000344012036108325, + "loss": 2.7823, + "theoretical_loss": 3.6315531069509834, + "tokens_seen": 1052488704 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034400200601805416, + "loss": 2.7497, + "theoretical_loss": 3.6315317908862457, + "tokens_seen": 1052554240 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034399197592778334, + "loss": 2.7485, + "theoretical_loss": 3.6315104765202797, + "tokens_seen": 1052619776 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003439819458375125, + "loss": 2.6464, + "theoretical_loss": 3.6314891638528453, + "tokens_seen": 1052685312 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034397191574724175, + "loss": 2.9019, + "theoretical_loss": 3.6314678528837003, + "tokens_seen": 1052750848 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003439618856569709, + "loss": 2.8454, + "theoretical_loss": 3.6314465436126047, + "tokens_seen": 1052816384 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003439518555667001, + "loss": 2.9119, + "theoretical_loss": 3.6314252360393167, + "tokens_seen": 1052881920 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003439418254764293, + "loss": 2.8862, + "theoretical_loss": 3.6314039301635956, + "tokens_seen": 1052947456 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003439317953861585, + "loss": 2.6123, + "theoretical_loss": 3.631382625985201, + "tokens_seen": 1053012992 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034392176529588766, + "loss": 2.746, + "theoretical_loss": 3.6313613235038913, + "tokens_seen": 1053078528 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034391173520561684, + "loss": 2.8266, + "theoretical_loss": 3.6313400227194266, + "tokens_seen": 1053144064 + }, + { + "epoch": 3.05, + "learning_rate": 0.000343901705115346, + "loss": 2.7095, + "theoretical_loss": 3.631318723631565, + "tokens_seen": 1053209600 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034389167502507526, + "loss": 2.7095, + "theoretical_loss": 3.631297426240067, + "tokens_seen": 1053275136 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003438816449348044, + "loss": 2.7669, + "theoretical_loss": 3.6312761305446912, + "tokens_seen": 1053340672 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003438716148445336, + "loss": 2.7469, + "theoretical_loss": 3.631254836545197, + "tokens_seen": 1053406208 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034386158475426275, + "loss": 2.6697, + "theoretical_loss": 3.6312335442413444, + "tokens_seen": 1053471744 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1192122, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.786816120147705, + "objective/train/theoretical_loss": 3.6312282214302964, + "objective/train/tokens_used": 1073948128, + "theoretical_loss": 3.6312282214302964, + "tokens_seen": 1053488128 + }, + { + "epoch": 3.05, + "learning_rate": 0.000343851554663992, + "loss": 2.7388, + "theoretical_loss": 3.631212253632892, + "tokens_seen": 1053537280 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003438415245737212, + "loss": 2.8104, + "theoretical_loss": 3.6311909647196003, + "tokens_seen": 1053602816 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034383149448345034, + "loss": 2.8182, + "theoretical_loss": 3.6311696775012283, + "tokens_seen": 1053668352 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003438214643931796, + "loss": 2.8246, + "theoretical_loss": 3.6311483919775362, + "tokens_seen": 1053733888 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034381143430290876, + "loss": 2.7451, + "theoretical_loss": 3.6311271081482834, + "tokens_seen": 1053799424 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034380140421263794, + "loss": 2.7069, + "theoretical_loss": 3.631105826013229, + "tokens_seen": 1053864960 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003437913741223671, + "loss": 2.5447, + "theoretical_loss": 3.631084545572134, + "tokens_seen": 1053930496 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003437813440320963, + "loss": 2.8172, + "theoretical_loss": 3.6310632668247576, + "tokens_seen": 1053996032 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003437713139418255, + "loss": 2.8105, + "theoretical_loss": 3.6310419897708597, + "tokens_seen": 1054061568 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003437612838515547, + "loss": 2.7633, + "theoretical_loss": 3.6310207144102, + "tokens_seen": 1054127104 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034375125376128385, + "loss": 2.7244, + "theoretical_loss": 3.630999440742539, + "tokens_seen": 1054192640 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003437412236710131, + "loss": 2.6294, + "theoretical_loss": 3.6309781687676366, + "tokens_seen": 1054258176 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003437311935807422, + "loss": 2.688, + "theoretical_loss": 3.6309568984852527, + "tokens_seen": 1054323712 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034372116349047144, + "loss": 2.7278, + "theoretical_loss": 3.6309356298951476, + "tokens_seen": 1054389248 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003437111334002006, + "loss": 2.8325, + "theoretical_loss": 3.630914362997082, + "tokens_seen": 1054454784 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003437011033099298, + "loss": 2.5317, + "theoretical_loss": 3.630893097790815, + "tokens_seen": 1054520320 + }, + { + "epoch": 3.05, + "learning_rate": 0.000343691073219659, + "loss": 2.6591, + "theoretical_loss": 3.6308718342761073, + "tokens_seen": 1054585856 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034368104312938817, + "loss": 2.8177, + "theoretical_loss": 3.6308505724527196, + "tokens_seen": 1054651392 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034367101303911735, + "loss": 2.5762, + "theoretical_loss": 3.6308293123204125, + "tokens_seen": 1054716928 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003436609829488466, + "loss": 2.7574, + "theoretical_loss": 3.630808053878946, + "tokens_seen": 1054782464 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003436509528585757, + "loss": 2.8421, + "theoretical_loss": 3.6307867971280796, + "tokens_seen": 1054848000 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034364092276830495, + "loss": 2.7248, + "theoretical_loss": 3.630765542067576, + "tokens_seen": 1054913536 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034363089267803413, + "loss": 2.6222, + "theoretical_loss": 3.630744288697194, + "tokens_seen": 1054979072 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003436208625877633, + "loss": 2.7392, + "theoretical_loss": 3.630723037016695, + "tokens_seen": 1055044608 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003436108324974925, + "loss": 2.685, + "theoretical_loss": 3.63070178702584, + "tokens_seen": 1055110144 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1192817, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.987161874771118, + "objective/train/theoretical_loss": 3.6306964747921047, + "objective/train/tokens_used": 1075586528, + "theoretical_loss": 3.6306964747921047, + "tokens_seen": 1055126528 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034360080240722167, + "loss": 2.8193, + "theoretical_loss": 3.6306805387243886, + "tokens_seen": 1055175680 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034359077231695085, + "loss": 2.6656, + "theoretical_loss": 3.6306592921121026, + "tokens_seen": 1055241216 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003435807422266801, + "loss": 2.8565, + "theoretical_loss": 3.630638047188742, + "tokens_seen": 1055306752 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003435707121364092, + "loss": 2.8089, + "theoretical_loss": 3.630616803954069, + "tokens_seen": 1055372288 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034356068204613845, + "loss": 2.7897, + "theoretical_loss": 3.6305955624078434, + "tokens_seen": 1055437824 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003435506519558676, + "loss": 2.8819, + "theoretical_loss": 3.630574322549826, + "tokens_seen": 1055503360 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003435406218655968, + "loss": 2.6583, + "theoretical_loss": 3.6305530843797786, + "tokens_seen": 1055568896 + }, + { + "epoch": 3.05, + "learning_rate": 0.000343530591775326, + "loss": 2.8231, + "theoretical_loss": 3.630531847897462, + "tokens_seen": 1055634432 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003435205616850552, + "loss": 2.5255, + "theoretical_loss": 3.6305106131026363, + "tokens_seen": 1055699968 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034351053159478436, + "loss": 2.8292, + "theoretical_loss": 3.6304893799950646, + "tokens_seen": 1055765504 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034350050150451354, + "loss": 2.8406, + "theoretical_loss": 3.630468148574507, + "tokens_seen": 1055831040 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003434904714142427, + "loss": 2.6635, + "theoretical_loss": 3.6304469188407245, + "tokens_seen": 1055896576 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034348044132397195, + "loss": 2.5332, + "theoretical_loss": 3.630425690793479, + "tokens_seen": 1055962112 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003434704112337011, + "loss": 2.7283, + "theoretical_loss": 3.6304044644325315, + "tokens_seen": 1056027648 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003434603811434303, + "loss": 2.8458, + "theoretical_loss": 3.6303832397576437, + "tokens_seen": 1056093184 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003434503510531595, + "loss": 2.8332, + "theoretical_loss": 3.6303620167685766, + "tokens_seen": 1056158720 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003434403209628887, + "loss": 2.8633, + "theoretical_loss": 3.6303407954650924, + "tokens_seen": 1056224256 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034343029087261786, + "loss": 2.8437, + "theoretical_loss": 3.630319575846952, + "tokens_seen": 1056289792 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034342026078234704, + "loss": 2.837, + "theoretical_loss": 3.6302983579139174, + "tokens_seen": 1056355328 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003434102306920762, + "loss": 2.7527, + "theoretical_loss": 3.6302771416657498, + "tokens_seen": 1056420864 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034340020060180546, + "loss": 2.8107, + "theoretical_loss": 3.6302559271022115, + "tokens_seen": 1056486400 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003433901705115346, + "loss": 2.751, + "theoretical_loss": 3.6302347142230644, + "tokens_seen": 1056551936 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003433801404212638, + "loss": 2.834, + "theoretical_loss": 3.6302135030280693, + "tokens_seen": 1056617472 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034337011033099295, + "loss": 2.6986, + "theoretical_loss": 3.6301922935169886, + "tokens_seen": 1056683008 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003433600802407222, + "loss": 2.7787, + "theoretical_loss": 3.630171085689584, + "tokens_seen": 1056748544 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1193467, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6657423973083496, + "objective/train/theoretical_loss": 3.6301657839957793, + "objective/train/tokens_used": 1077224928, + "theoretical_loss": 3.6301657839957793, + "tokens_seen": 1056764928 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034335005015045136, + "loss": 2.6549, + "theoretical_loss": 3.630149879545618, + "tokens_seen": 1056814080 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034334002006018054, + "loss": 2.6641, + "theoretical_loss": 3.630128675084852, + "tokens_seen": 1056879616 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003433299899699097, + "loss": 2.6756, + "theoretical_loss": 3.6301074723070483, + "tokens_seen": 1056945152 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003433199598796389, + "loss": 2.8726, + "theoretical_loss": 3.6300862712119693, + "tokens_seen": 1057010688 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003433099297893681, + "loss": 2.8562, + "theoretical_loss": 3.6300650717993763, + "tokens_seen": 1057076224 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003432998996990973, + "loss": 2.4702, + "theoretical_loss": 3.630043874069032, + "tokens_seen": 1057141760 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034328986960882645, + "loss": 2.7899, + "theoretical_loss": 3.630022678020699, + "tokens_seen": 1057207296 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003432798395185557, + "loss": 2.6209, + "theoretical_loss": 3.630001483654139, + "tokens_seen": 1057272832 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034326980942828487, + "loss": 2.8341, + "theoretical_loss": 3.6299802909691143, + "tokens_seen": 1057338368 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034325977933801405, + "loss": 2.6814, + "theoretical_loss": 3.6299590999653875, + "tokens_seen": 1057403904 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034324974924774323, + "loss": 2.6038, + "theoretical_loss": 3.629937910642721, + "tokens_seen": 1057469440 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003432397191574724, + "loss": 2.6944, + "theoretical_loss": 3.6299167230008775, + "tokens_seen": 1057534976 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003432296890672016, + "loss": 2.8614, + "theoretical_loss": 3.6298955370396193, + "tokens_seen": 1057600512 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003432196589769308, + "loss": 2.753, + "theoretical_loss": 3.6298743527587085, + "tokens_seen": 1057666048 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034320962888665995, + "loss": 2.697, + "theoretical_loss": 3.629853170157909, + "tokens_seen": 1057731584 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003431995987963892, + "loss": 2.6929, + "theoretical_loss": 3.6298319892369815, + "tokens_seen": 1057797120 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003431895687061183, + "loss": 2.7009, + "theoretical_loss": 3.629810809995691, + "tokens_seen": 1057862656 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034317953861584755, + "loss": 2.9626, + "theoretical_loss": 3.6297896324337984, + "tokens_seen": 1057928192 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034316950852557673, + "loss": 2.6848, + "theoretical_loss": 3.6297684565510675, + "tokens_seen": 1057993728 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003431594784353059, + "loss": 2.7164, + "theoretical_loss": 3.629747282347261, + "tokens_seen": 1058059264 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003431494483450351, + "loss": 2.4662, + "theoretical_loss": 3.6297261098221414, + "tokens_seen": 1058124800 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034313941825476433, + "loss": 2.8341, + "theoretical_loss": 3.629704938975472, + "tokens_seen": 1058190336 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034312938816449345, + "loss": 2.8161, + "theoretical_loss": 3.6296837698070163, + "tokens_seen": 1058255872 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003431193580742227, + "loss": 3.0423, + "theoretical_loss": 3.6296626023165364, + "tokens_seen": 1058321408 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003431093279839518, + "loss": 2.9207, + "theoretical_loss": 3.6296414365037957, + "tokens_seen": 1058386944 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1194725, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.449052095413208, + "objective/train/theoretical_loss": 3.6296361453127295, + "objective/train/tokens_used": 1078863328, + "theoretical_loss": 3.6296361453127295, + "tokens_seen": 1058403328 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034309929789368105, + "loss": 2.7992, + "theoretical_loss": 3.6296202723685576, + "tokens_seen": 1058452480 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003430892678034103, + "loss": 2.9941, + "theoretical_loss": 3.6295991099105853, + "tokens_seen": 1058518016 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003430792377131394, + "loss": 2.6418, + "theoretical_loss": 3.629577949129642, + "tokens_seen": 1058583552 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034306920762286865, + "loss": 2.7596, + "theoretical_loss": 3.6295567900254904, + "tokens_seen": 1058649088 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003430591775325978, + "loss": 2.7151, + "theoretical_loss": 3.6295356325978947, + "tokens_seen": 1058714624 + }, + { + "epoch": 3.05, + "learning_rate": 0.000343049147442327, + "loss": 2.839, + "theoretical_loss": 3.6295144768466177, + "tokens_seen": 1058780160 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003430391173520562, + "loss": 2.7535, + "theoretical_loss": 3.629493322771423, + "tokens_seen": 1058845696 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003430290872617854, + "loss": 2.7381, + "theoretical_loss": 3.6294721703720745, + "tokens_seen": 1058911232 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034301905717151456, + "loss": 2.8283, + "theoretical_loss": 3.629451019648335, + "tokens_seen": 1058976768 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034300902708124374, + "loss": 2.5179, + "theoretical_loss": 3.6294298705999686, + "tokens_seen": 1059042304 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003429989969909729, + "loss": 2.844, + "theoretical_loss": 3.6294087232267387, + "tokens_seen": 1059107840 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034298896690070215, + "loss": 2.883, + "theoretical_loss": 3.629387577528409, + "tokens_seen": 1059173376 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003429789368104313, + "loss": 2.6924, + "theoretical_loss": 3.6293664335047433, + "tokens_seen": 1059238912 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003429689067201605, + "loss": 2.8918, + "theoretical_loss": 3.6293452911555057, + "tokens_seen": 1059304448 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003429588766298897, + "loss": 2.8112, + "theoretical_loss": 3.6293241504804596, + "tokens_seen": 1059369984 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003429488465396189, + "loss": 2.9203, + "theoretical_loss": 3.6293030114793687, + "tokens_seen": 1059435520 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034293881644934806, + "loss": 2.7102, + "theoretical_loss": 3.629281874151997, + "tokens_seen": 1059501056 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034292878635907724, + "loss": 2.6518, + "theoretical_loss": 3.6292607384981093, + "tokens_seen": 1059566592 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003429187562688064, + "loss": 2.6567, + "theoretical_loss": 3.629239604517468, + "tokens_seen": 1059632128 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034290872617853566, + "loss": 2.9782, + "theoretical_loss": 3.629218472209839, + "tokens_seen": 1059697664 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003428986960882648, + "loss": 2.6975, + "theoretical_loss": 3.6291973415749847, + "tokens_seen": 1059763200 + }, + { + "epoch": 3.05, + "learning_rate": 0.000342888665997994, + "loss": 2.7407, + "theoretical_loss": 3.6291762126126708, + "tokens_seen": 1059828736 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034287863590772315, + "loss": 2.9337, + "theoretical_loss": 3.6291550853226604, + "tokens_seen": 1059894272 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003428686058174524, + "loss": 2.6709, + "theoretical_loss": 3.629133959704718, + "tokens_seen": 1059959808 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034285857572718156, + "loss": 2.6925, + "theoretical_loss": 3.629112835758608, + "tokens_seen": 1060025344 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1195453, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8471500873565674, + "objective/train/theoretical_loss": 3.629107555033277, + "objective/train/tokens_used": 1080501728, + "theoretical_loss": 3.629107555033277, + "tokens_seen": 1060041728 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034284854563691074, + "loss": 2.9138, + "theoretical_loss": 3.629091713484095, + "tokens_seen": 1060090880 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003428385155466399, + "loss": 2.6919, + "theoretical_loss": 3.629070592880943, + "tokens_seen": 1060156416 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003428284854563691, + "loss": 2.7959, + "theoretical_loss": 3.6290494739489167, + "tokens_seen": 1060221952 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003428184553660983, + "loss": 2.7872, + "theoretical_loss": 3.62902835668778, + "tokens_seen": 1060287488 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003428084252758275, + "loss": 2.6846, + "theoretical_loss": 3.6290072410972987, + "tokens_seen": 1060353024 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034279839518555665, + "loss": 2.8737, + "theoretical_loss": 3.628986127177236, + "tokens_seen": 1060418560 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003427883650952859, + "loss": 2.9303, + "theoretical_loss": 3.6289650149273576, + "tokens_seen": 1060484096 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034277833500501507, + "loss": 2.7662, + "theoretical_loss": 3.628943904347427, + "tokens_seen": 1060549632 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034276830491474425, + "loss": 2.8635, + "theoretical_loss": 3.6289227954372105, + "tokens_seen": 1060615168 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034275827482447343, + "loss": 2.7954, + "theoretical_loss": 3.6289016881964713, + "tokens_seen": 1060680704 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003427482447342026, + "loss": 2.7364, + "theoretical_loss": 3.6288805826249755, + "tokens_seen": 1060746240 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003427382146439318, + "loss": 2.9048, + "theoretical_loss": 3.628859478722487, + "tokens_seen": 1060811776 + }, + { + "epoch": 3.05, + "learning_rate": 0.000342728184553661, + "loss": 2.9484, + "theoretical_loss": 3.6288383764887713, + "tokens_seen": 1060877312 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034271815446339015, + "loss": 2.8514, + "theoretical_loss": 3.628817275923593, + "tokens_seen": 1060942848 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003427081243731194, + "loss": 2.9913, + "theoretical_loss": 3.628796177026718, + "tokens_seen": 1061008384 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003426980942828485, + "loss": 2.7835, + "theoretical_loss": 3.6287750797979097, + "tokens_seen": 1061073920 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034268806419257775, + "loss": 2.787, + "theoretical_loss": 3.628753984236935, + "tokens_seen": 1061139456 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034267803410230693, + "loss": 2.6012, + "theoretical_loss": 3.628732890343558, + "tokens_seen": 1061204992 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003426680040120361, + "loss": 2.8233, + "theoretical_loss": 3.6287117981175436, + "tokens_seen": 1061270528 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003426579739217653, + "loss": 2.9159, + "theoretical_loss": 3.6286907075586585, + "tokens_seen": 1061336064 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034264794383149453, + "loss": 2.754, + "theoretical_loss": 3.6286696186666667, + "tokens_seen": 1061401600 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034263791374122366, + "loss": 2.5168, + "theoretical_loss": 3.6286485314413337, + "tokens_seen": 1061467136 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003426278836509529, + "loss": 2.8263, + "theoretical_loss": 3.6286274458824255, + "tokens_seen": 1061532672 + }, + { + "epoch": 3.05, + "learning_rate": 0.000342617853560682, + "loss": 2.6702, + "theoretical_loss": 3.6286063619897067, + "tokens_seen": 1061598208 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034260782347041125, + "loss": 3.0237, + "theoretical_loss": 3.6285852797629436, + "tokens_seen": 1061663744 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1196907, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8292043209075928, + "objective/train/theoretical_loss": 3.628580009466531, + "objective/train/tokens_used": 1082140128, + "theoretical_loss": 3.628580009466531, + "tokens_seen": 1061680128 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034259779338014043, + "loss": 2.798, + "theoretical_loss": 3.628564199201901, + "tokens_seen": 1061729280 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003425877632898696, + "loss": 2.5776, + "theoretical_loss": 3.628543120306346, + "tokens_seen": 1061794816 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003425777331995988, + "loss": 2.8096, + "theoretical_loss": 3.628522043076042, + "tokens_seen": 1061860352 + }, + { + "epoch": 3.05, + "learning_rate": 0.000342567703109328, + "loss": 2.8131, + "theoretical_loss": 3.6285009675107567, + "tokens_seen": 1061925888 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034255767301905716, + "loss": 2.8516, + "theoretical_loss": 3.628479893610254, + "tokens_seen": 1061991424 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003425476429287864, + "loss": 2.6119, + "theoretical_loss": 3.628458821374301, + "tokens_seen": 1062056960 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003425376128385155, + "loss": 2.7599, + "theoretical_loss": 3.6284377508026635, + "tokens_seen": 1062122496 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034252758274824476, + "loss": 2.6915, + "theoretical_loss": 3.6284166818951067, + "tokens_seen": 1062188032 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003425175526579739, + "loss": 2.7241, + "theoretical_loss": 3.6283956146513967, + "tokens_seen": 1062253568 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003425075225677031, + "loss": 2.6911, + "theoretical_loss": 3.6283745490712995, + "tokens_seen": 1062319104 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003424974924774323, + "loss": 2.8022, + "theoretical_loss": 3.628353485154581, + "tokens_seen": 1062384640 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003424874623871615, + "loss": 2.5889, + "theoretical_loss": 3.6283324229010083, + "tokens_seen": 1062450176 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034247743229689066, + "loss": 2.8766, + "theoretical_loss": 3.6283113623103462, + "tokens_seen": 1062515712 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003424674022066199, + "loss": 2.7854, + "theoretical_loss": 3.6282903033823612, + "tokens_seen": 1062581248 + }, + { + "epoch": 3.05, + "learning_rate": 0.000342457372116349, + "loss": 2.838, + "theoretical_loss": 3.62826924611682, + "tokens_seen": 1062646784 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034244734202607826, + "loss": 2.7359, + "theoretical_loss": 3.628248190513488, + "tokens_seen": 1062712320 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003424373119358074, + "loss": 2.9226, + "theoretical_loss": 3.628227136572132, + "tokens_seen": 1062777856 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003424272818455366, + "loss": 2.942, + "theoretical_loss": 3.6282060842925183, + "tokens_seen": 1062843392 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003424172517552658, + "loss": 2.8092, + "theoretical_loss": 3.6281850336744137, + "tokens_seen": 1062908928 + }, + { + "epoch": 3.05, + "learning_rate": 0.000342407221664995, + "loss": 2.8666, + "theoretical_loss": 3.6281639847175837, + "tokens_seen": 1062974464 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034239719157472416, + "loss": 2.6878, + "theoretical_loss": 3.6281429374217953, + "tokens_seen": 1063040000 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034238716148445335, + "loss": 2.7101, + "theoretical_loss": 3.628121891786815, + "tokens_seen": 1063105536 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003423771313941825, + "loss": 2.9153, + "theoretical_loss": 3.62810084781241, + "tokens_seen": 1063171072 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034236710130391176, + "loss": 2.8423, + "theoretical_loss": 3.628079805498346, + "tokens_seen": 1063236608 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003423570712136409, + "loss": 2.5257, + "theoretical_loss": 3.62805876484439, + "tokens_seen": 1063302144 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1197465, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.75803279876709, + "objective/train/theoretical_loss": 3.628053504940265, + "objective/train/tokens_used": 1083778528, + "theoretical_loss": 3.628053504940265, + "tokens_seen": 1063318528 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003423470411233701, + "loss": 2.5156, + "theoretical_loss": 3.6280377258503087, + "tokens_seen": 1063367680 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003423370110330993, + "loss": 2.8494, + "theoretical_loss": 3.628016688515869, + "tokens_seen": 1063433216 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003423269809428285, + "loss": 2.8854, + "theoretical_loss": 3.6279956528408377, + "tokens_seen": 1063498752 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003423169508525577, + "loss": 2.6486, + "theoretical_loss": 3.627974618824981, + "tokens_seen": 1063564288 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034230692076228685, + "loss": 2.765, + "theoretical_loss": 3.6279535864680668, + "tokens_seen": 1063629824 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003422968906720161, + "loss": 2.6579, + "theoretical_loss": 3.6279325557698616, + "tokens_seen": 1063695360 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034228686058174527, + "loss": 2.7489, + "theoretical_loss": 3.6279115267301325, + "tokens_seen": 1063760896 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034227683049147445, + "loss": 2.7437, + "theoretical_loss": 3.6278904993486467, + "tokens_seen": 1063826432 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034226680040120363, + "loss": 2.9753, + "theoretical_loss": 3.6278694736251706, + "tokens_seen": 1063891968 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003422567703109328, + "loss": 2.8276, + "theoretical_loss": 3.627848449559472, + "tokens_seen": 1063957504 + }, + { + "epoch": 3.05, + "learning_rate": 0.000342246740220662, + "loss": 2.6558, + "theoretical_loss": 3.6278274271513182, + "tokens_seen": 1064023040 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003422367101303912, + "loss": 2.8123, + "theoretical_loss": 3.627806406400476, + "tokens_seen": 1064088576 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034222668004012035, + "loss": 2.515, + "theoretical_loss": 3.6277853873067127, + "tokens_seen": 1064154112 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003422166499498496, + "loss": 2.9208, + "theoretical_loss": 3.627764369869796, + "tokens_seen": 1064219648 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003422066198595787, + "loss": 2.8514, + "theoretical_loss": 3.6277433540894926, + "tokens_seen": 1064285184 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034219658976930795, + "loss": 2.6061, + "theoretical_loss": 3.6277223399655707, + "tokens_seen": 1064350720 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034218655967903713, + "loss": 2.8817, + "theoretical_loss": 3.627701327497798, + "tokens_seen": 1064416256 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003421765295887663, + "loss": 2.7702, + "theoretical_loss": 3.627680316685941, + "tokens_seen": 1064481792 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003421664994984955, + "loss": 2.9302, + "theoretical_loss": 3.6276593075297674, + "tokens_seen": 1064547328 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034215646940822473, + "loss": 2.8216, + "theoretical_loss": 3.6276383000290457, + "tokens_seen": 1064612864 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034214643931795386, + "loss": 2.6398, + "theoretical_loss": 3.627617294183543, + "tokens_seen": 1064678400 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003421364092276831, + "loss": 2.873, + "theoretical_loss": 3.6275962899930265, + "tokens_seen": 1064743936 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003421263791374122, + "loss": 2.887, + "theoretical_loss": 3.627575287457265, + "tokens_seen": 1064809472 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034211634904714145, + "loss": 2.7337, + "theoretical_loss": 3.6275542865760255, + "tokens_seen": 1064875008 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034210631895687063, + "loss": 3.067, + "theoretical_loss": 3.6275332873490767, + "tokens_seen": 1064940544 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1198703, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5579044818878174, + "objective/train/theoretical_loss": 3.627528037800795, + "objective/train/tokens_used": 1085416928, + "theoretical_loss": 3.627528037800795, + "tokens_seen": 1064956928 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003420962888665998, + "loss": 2.7133, + "theoretical_loss": 3.627512289776185, + "tokens_seen": 1065006080 + }, + { + "epoch": 3.05, + "learning_rate": 0.000342086258776329, + "loss": 2.7935, + "theoretical_loss": 3.62749129385712, + "tokens_seen": 1065071616 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003420762286860582, + "loss": 2.6849, + "theoretical_loss": 3.6274702995916486, + "tokens_seen": 1065137152 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034206619859578736, + "loss": 2.8924, + "theoretical_loss": 3.627449306979539, + "tokens_seen": 1065202688 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003420561685055166, + "loss": 2.7858, + "theoretical_loss": 3.62742831602056, + "tokens_seen": 1065268224 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003420461384152457, + "loss": 2.8484, + "theoretical_loss": 3.6274073267144793, + "tokens_seen": 1065333760 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034203610832497496, + "loss": 2.7231, + "theoretical_loss": 3.627386339061064, + "tokens_seen": 1065399296 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003420260782347041, + "loss": 2.8332, + "theoretical_loss": 3.6273653530600845, + "tokens_seen": 1065464832 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003420160481444333, + "loss": 2.7879, + "theoretical_loss": 3.627344368711307, + "tokens_seen": 1065530368 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003420060180541625, + "loss": 2.4803, + "theoretical_loss": 3.6273233860145013, + "tokens_seen": 1065595904 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003419959879638917, + "loss": 2.8658, + "theoretical_loss": 3.627302404969435, + "tokens_seen": 1065661440 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034198595787362086, + "loss": 2.7391, + "theoretical_loss": 3.6272814255758763, + "tokens_seen": 1065726976 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003419759277833501, + "loss": 2.9654, + "theoretical_loss": 3.6272604478335944, + "tokens_seen": 1065792512 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003419658976930792, + "loss": 2.5866, + "theoretical_loss": 3.6272394717423575, + "tokens_seen": 1065858048 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034195586760280846, + "loss": 2.7491, + "theoretical_loss": 3.627218497301934, + "tokens_seen": 1065923584 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003419458375125376, + "loss": 2.94, + "theoretical_loss": 3.627197524512092, + "tokens_seen": 1065989120 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003419358074222668, + "loss": 2.7148, + "theoretical_loss": 3.6271765533726015, + "tokens_seen": 1066054656 + }, + { + "epoch": 3.05, + "learning_rate": 0.000341925777331996, + "loss": 3.0207, + "theoretical_loss": 3.62715558388323, + "tokens_seen": 1066120192 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003419157472417252, + "loss": 2.8715, + "theoretical_loss": 3.6271346160437465, + "tokens_seen": 1066185728 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034190571715145436, + "loss": 2.8406, + "theoretical_loss": 3.6271136498539205, + "tokens_seen": 1066251264 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034189568706118355, + "loss": 2.6654, + "theoretical_loss": 3.6270926853135195, + "tokens_seen": 1066316800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003418856569709127, + "loss": 2.7278, + "theoretical_loss": 3.627071722422314, + "tokens_seen": 1066382336 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034187562688064196, + "loss": 2.7196, + "theoretical_loss": 3.6270507611800715, + "tokens_seen": 1066447872 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003418655967903711, + "loss": 2.8842, + "theoretical_loss": 3.6270298015865614, + "tokens_seen": 1066513408 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003418555667001003, + "loss": 2.5783, + "theoretical_loss": 3.627008843641553, + "tokens_seen": 1066578944 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1199964, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9109983444213867, + "objective/train/theoretical_loss": 3.6270036044128524, + "objective/train/tokens_used": 1087055328, + "theoretical_loss": 3.6270036044128524, + "tokens_seen": 1066595328 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034184553660982945, + "loss": 3.0025, + "theoretical_loss": 3.626987887344815, + "tokens_seen": 1066644480 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003418355065195587, + "loss": 2.7713, + "theoretical_loss": 3.626966932696117, + "tokens_seen": 1066710016 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034182547642928787, + "loss": 2.7343, + "theoretical_loss": 3.626945979695228, + "tokens_seen": 1066775552 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034181544633901705, + "loss": 2.942, + "theoretical_loss": 3.626925028341917, + "tokens_seen": 1066841088 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034180541624874623, + "loss": 3.002, + "theoretical_loss": 3.6269040786359534, + "tokens_seen": 1066906624 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034179538615847547, + "loss": 2.9148, + "theoretical_loss": 3.626883130577106, + "tokens_seen": 1066972160 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003417853560682046, + "loss": 2.864, + "theoretical_loss": 3.6268621841651445, + "tokens_seen": 1067037696 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034177532597793383, + "loss": 2.8888, + "theoretical_loss": 3.626841239399839, + "tokens_seen": 1067103232 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034176529588766295, + "loss": 2.9927, + "theoretical_loss": 3.626820296280958, + "tokens_seen": 1067168768 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003417552657973922, + "loss": 2.6687, + "theoretical_loss": 3.626799354808271, + "tokens_seen": 1067234304 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034174523570712137, + "loss": 2.7331, + "theoretical_loss": 3.626778414981548, + "tokens_seen": 1067299840 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034173520561685055, + "loss": 2.6517, + "theoretical_loss": 3.626757476800558, + "tokens_seen": 1067365376 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034172517552657973, + "loss": 3.0107, + "theoretical_loss": 3.6267365402650715, + "tokens_seen": 1067430912 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003417151454363089, + "loss": 3.1109, + "theoretical_loss": 3.6267156053748577, + "tokens_seen": 1067496448 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003417051153460381, + "loss": 2.6219, + "theoretical_loss": 3.6266946721296858, + "tokens_seen": 1067561984 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034169508525576733, + "loss": 2.7849, + "theoretical_loss": 3.626673740529326, + "tokens_seen": 1067627520 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034168505516549646, + "loss": 2.7899, + "theoretical_loss": 3.6266528105735487, + "tokens_seen": 1067693056 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003416750250752257, + "loss": 2.6937, + "theoretical_loss": 3.6266318822621226, + "tokens_seen": 1067758592 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003416649949849548, + "loss": 2.7576, + "theoretical_loss": 3.6266109555948187, + "tokens_seen": 1067824128 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034165496489468406, + "loss": 2.7506, + "theoretical_loss": 3.626590030571406, + "tokens_seen": 1067889664 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034164493480441324, + "loss": 2.7177, + "theoretical_loss": 3.626569107191655, + "tokens_seen": 1067955200 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003416349047141424, + "loss": 2.6327, + "theoretical_loss": 3.6265481854553356, + "tokens_seen": 1068020736 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003416248746238716, + "loss": 2.8429, + "theoretical_loss": 3.6265272653622183, + "tokens_seen": 1068086272 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034161484453360083, + "loss": 2.7674, + "theoretical_loss": 3.6265063469120724, + "tokens_seen": 1068151808 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034160481444332996, + "loss": 2.796, + "theoretical_loss": 3.626485430104669, + "tokens_seen": 1068217344 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1200699, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.70048451423645, + "objective/train/theoretical_loss": 3.6264802011594695, + "objective/train/tokens_used": 1088693728, + "theoretical_loss": 3.6264802011594695, + "tokens_seen": 1068233728 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003415947843530592, + "loss": 2.6839, + "theoretical_loss": 3.6264645149397774, + "tokens_seen": 1068282880 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003415847542627884, + "loss": 2.7033, + "theoretical_loss": 3.6264436014171686, + "tokens_seen": 1068348416 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034157472417251756, + "loss": 2.6647, + "theoretical_loss": 3.6264226895366125, + "tokens_seen": 1068413952 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003415646940822468, + "loss": 2.8639, + "theoretical_loss": 3.62640177929788, + "tokens_seen": 1068479488 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003415546639919759, + "loss": 2.8761, + "theoretical_loss": 3.626380870700741, + "tokens_seen": 1068545024 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034154463390170516, + "loss": 2.8552, + "theoretical_loss": 3.6263599637449655, + "tokens_seen": 1068610560 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003415346038114343, + "loss": 2.8734, + "theoretical_loss": 3.6263390584303252, + "tokens_seen": 1068676096 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003415245737211635, + "loss": 2.7517, + "theoretical_loss": 3.6263181547565893, + "tokens_seen": 1068741632 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003415145436308927, + "loss": 2.7747, + "theoretical_loss": 3.62629725272353, + "tokens_seen": 1068807168 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003415045135406219, + "loss": 2.8721, + "theoretical_loss": 3.6262763523309163, + "tokens_seen": 1068872704 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034149448345035106, + "loss": 2.7056, + "theoretical_loss": 3.62625545357852, + "tokens_seen": 1068938240 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003414844533600803, + "loss": 2.6699, + "theoretical_loss": 3.6262345564661116, + "tokens_seen": 1069003776 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003414744232698094, + "loss": 2.8384, + "theoretical_loss": 3.6262136609934617, + "tokens_seen": 1069069312 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034146439317953866, + "loss": 2.6113, + "theoretical_loss": 3.626192767160341, + "tokens_seen": 1069134848 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003414543630892678, + "loss": 2.8847, + "theoretical_loss": 3.6261718749665204, + "tokens_seen": 1069200384 + }, + { + "epoch": 3.05, + "learning_rate": 0.000341444332998997, + "loss": 2.9789, + "theoretical_loss": 3.6261509844117708, + "tokens_seen": 1069265920 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003414343029087262, + "loss": 2.69, + "theoretical_loss": 3.626130095495864, + "tokens_seen": 1069331456 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003414242728184554, + "loss": 2.8193, + "theoretical_loss": 3.62610920821857, + "tokens_seen": 1069396992 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034141424272818456, + "loss": 2.7256, + "theoretical_loss": 3.62608832257966, + "tokens_seen": 1069462528 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034140421263791375, + "loss": 2.8196, + "theoretical_loss": 3.626067438578905, + "tokens_seen": 1069528064 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034139418254764293, + "loss": 2.8286, + "theoretical_loss": 3.626046556216077, + "tokens_seen": 1069593600 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034138415245737216, + "loss": 2.8624, + "theoretical_loss": 3.626025675490946, + "tokens_seen": 1069659136 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003413741223671013, + "loss": 2.8349, + "theoretical_loss": 3.6260047964032847, + "tokens_seen": 1069724672 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003413640922768305, + "loss": 2.8984, + "theoretical_loss": 3.625983918952863, + "tokens_seen": 1069790208 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034135406218655965, + "loss": 2.7387, + "theoretical_loss": 3.6259630431394525, + "tokens_seen": 1069855744 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1201919, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.7883902788162231, + "objective/train/theoretical_loss": 3.625957824441856, + "objective/train/tokens_used": 1090332128, + "theoretical_loss": 3.625957824441856, + "tokens_seen": 1069872128 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003413440320962889, + "loss": 2.4208, + "theoretical_loss": 3.6259421689628253, + "tokens_seen": 1069921280 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034133400200601807, + "loss": 2.8423, + "theoretical_loss": 3.625921296422752, + "tokens_seen": 1069986816 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034132397191574725, + "loss": 2.7369, + "theoretical_loss": 3.625900425519004, + "tokens_seen": 1070052352 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034131394182547643, + "loss": 2.9399, + "theoretical_loss": 3.6258795562513537, + "tokens_seen": 1070117888 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034130391173520567, + "loss": 2.7031, + "theoretical_loss": 3.6258586886195725, + "tokens_seen": 1070183424 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003412938816449348, + "loss": 2.8105, + "theoretical_loss": 3.6258378226234314, + "tokens_seen": 1070248960 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034128385155466403, + "loss": 2.6984, + "theoretical_loss": 3.6258169582627024, + "tokens_seen": 1070314496 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034127382146439315, + "loss": 2.7699, + "theoretical_loss": 3.625796095537157, + "tokens_seen": 1070380032 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003412637913741224, + "loss": 2.79, + "theoretical_loss": 3.625775234446567, + "tokens_seen": 1070445568 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034125376128385157, + "loss": 2.5291, + "theoretical_loss": 3.6257543749907044, + "tokens_seen": 1070511104 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034124373119358075, + "loss": 2.7412, + "theoretical_loss": 3.625733517169341, + "tokens_seen": 1070576640 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034123370110330993, + "loss": 2.7322, + "theoretical_loss": 3.6257126609822485, + "tokens_seen": 1070642176 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003412236710130391, + "loss": 2.7944, + "theoretical_loss": 3.6256918064291987, + "tokens_seen": 1070707712 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003412136409227683, + "loss": 2.6609, + "theoretical_loss": 3.625670953509964, + "tokens_seen": 1070773248 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034120361083249753, + "loss": 2.6465, + "theoretical_loss": 3.625650102224316, + "tokens_seen": 1070838784 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034119358074222666, + "loss": 2.5707, + "theoretical_loss": 3.6256292525720273, + "tokens_seen": 1070904320 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003411835506519559, + "loss": 2.4912, + "theoretical_loss": 3.625608404552869, + "tokens_seen": 1070969856 + }, + { + "epoch": 3.05, + "learning_rate": 0.000341173520561685, + "loss": 2.5813, + "theoretical_loss": 3.6255875581666146, + "tokens_seen": 1071035392 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034116349047141426, + "loss": 2.8009, + "theoretical_loss": 3.6255667134130354, + "tokens_seen": 1071100928 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034115346038114344, + "loss": 2.7488, + "theoretical_loss": 3.6255458702919037, + "tokens_seen": 1071166464 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003411434302908726, + "loss": 2.8851, + "theoretical_loss": 3.6255250288029917, + "tokens_seen": 1071232000 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003411334002006018, + "loss": 2.879, + "theoretical_loss": 3.6255041889460724, + "tokens_seen": 1071297536 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034112337011033103, + "loss": 2.8509, + "theoretical_loss": 3.625483350720918, + "tokens_seen": 1071363072 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034111334002006016, + "loss": 2.7807, + "theoretical_loss": 3.6254625141273, + "tokens_seen": 1071428608 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003411033099297894, + "loss": 2.8458, + "theoretical_loss": 3.625441679164992, + "tokens_seen": 1071494144 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1202634, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0612995624542236, + "objective/train/theoretical_loss": 3.6254364706792805, + "objective/train/tokens_used": 1091970528, + "theoretical_loss": 3.6254364706792805, + "tokens_seen": 1071510528 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003410932798395185, + "loss": 2.8313, + "theoretical_loss": 3.6254208458337662, + "tokens_seen": 1071559680 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034108324974924776, + "loss": 2.7383, + "theoretical_loss": 3.625400014133395, + "tokens_seen": 1071625216 + }, + { + "epoch": 3.05, + "learning_rate": 0.00034107321965897694, + "loss": 2.9735, + "theoretical_loss": 3.6253791840636507, + "tokens_seen": 1071690752 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003410631895687061, + "loss": 2.9058, + "theoretical_loss": 3.6253583556243063, + "tokens_seen": 1071756288 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003410531594784353, + "loss": 2.8198, + "theoretical_loss": 3.625337528815135, + "tokens_seen": 1071821824 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003410431293881645, + "loss": 2.8492, + "theoretical_loss": 3.6253167036359084, + "tokens_seen": 1071887360 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034103309929789366, + "loss": 2.7865, + "theoretical_loss": 3.6252958800864006, + "tokens_seen": 1071952896 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003410230692076229, + "loss": 2.9489, + "theoretical_loss": 3.625275058166384, + "tokens_seen": 1072018432 + }, + { + "epoch": 3.06, + "learning_rate": 0.000341013039117352, + "loss": 2.99, + "theoretical_loss": 3.6252542378756303, + "tokens_seen": 1072083968 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034100300902708126, + "loss": 2.6486, + "theoretical_loss": 3.6252334192139144, + "tokens_seen": 1072149504 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034099297893681044, + "loss": 2.6888, + "theoretical_loss": 3.6252126021810076, + "tokens_seen": 1072215040 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003409829488465396, + "loss": 2.738, + "theoretical_loss": 3.625191786776684, + "tokens_seen": 1072280576 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003409729187562688, + "loss": 2.8253, + "theoretical_loss": 3.6251709730007162, + "tokens_seen": 1072346112 + }, + { + "epoch": 3.06, + "learning_rate": 0.000340962888665998, + "loss": 2.5057, + "theoretical_loss": 3.6251501608528773, + "tokens_seen": 1072411648 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034095285857572717, + "loss": 2.8812, + "theoretical_loss": 3.6251293503329407, + "tokens_seen": 1072477184 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003409428284854564, + "loss": 2.9102, + "theoretical_loss": 3.625108541440679, + "tokens_seen": 1072542720 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034093279839518553, + "loss": 2.6922, + "theoretical_loss": 3.6250877341758665, + "tokens_seen": 1072608256 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034092276830491477, + "loss": 2.8168, + "theoretical_loss": 3.6250669285382755, + "tokens_seen": 1072673792 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003409127382146439, + "loss": 2.7439, + "theoretical_loss": 3.62504612452768, + "tokens_seen": 1072739328 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034090270812437313, + "loss": 2.9342, + "theoretical_loss": 3.625025322143853, + "tokens_seen": 1072804864 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003408926780341023, + "loss": 2.8336, + "theoretical_loss": 3.625004521386568, + "tokens_seen": 1072870400 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003408826479438315, + "loss": 2.8846, + "theoretical_loss": 3.6249837222555987, + "tokens_seen": 1072935936 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034087261785356067, + "loss": 2.703, + "theoretical_loss": 3.624962924750718, + "tokens_seen": 1073001472 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034086258776328985, + "loss": 2.6356, + "theoretical_loss": 3.6249421288717, + "tokens_seen": 1073067008 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034085255767301903, + "loss": 2.8443, + "theoretical_loss": 3.6249213346183184, + "tokens_seen": 1073132544 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1203719, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.52689266204834, + "objective/train/theoretical_loss": 3.6249161363089524, + "objective/train/tokens_used": 1093608928, + "theoretical_loss": 3.6249161363089524, + "tokens_seen": 1073148928 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034084252758274827, + "loss": 2.6909, + "theoretical_loss": 3.624900541990346, + "tokens_seen": 1073198080 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034083249749247745, + "loss": 2.9078, + "theoretical_loss": 3.6248797509875583, + "tokens_seen": 1073263616 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034082246740220663, + "loss": 2.8621, + "theoretical_loss": 3.6248589616097275, + "tokens_seen": 1073329152 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034081243731193587, + "loss": 2.7082, + "theoretical_loss": 3.6248381738566278, + "tokens_seen": 1073394688 + }, + { + "epoch": 3.06, + "learning_rate": 0.000340802407221665, + "loss": 2.8163, + "theoretical_loss": 3.624817387728033, + "tokens_seen": 1073460224 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034079237713139423, + "loss": 2.8633, + "theoretical_loss": 3.624796603223717, + "tokens_seen": 1073525760 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034078234704112335, + "loss": 2.6537, + "theoretical_loss": 3.6247758203434537, + "tokens_seen": 1073591296 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003407723169508526, + "loss": 2.843, + "theoretical_loss": 3.624755039087017, + "tokens_seen": 1073656832 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034076228686058177, + "loss": 2.9657, + "theoretical_loss": 3.624734259454182, + "tokens_seen": 1073722368 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034075225677031095, + "loss": 2.7164, + "theoretical_loss": 3.6247134814447204, + "tokens_seen": 1073787904 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034074222668004013, + "loss": 2.6927, + "theoretical_loss": 3.6246927050584086, + "tokens_seen": 1073853440 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003407321965897693, + "loss": 2.7393, + "theoretical_loss": 3.62467193029502, + "tokens_seen": 1073918976 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003407221664994985, + "loss": 2.8091, + "theoretical_loss": 3.624651157154328, + "tokens_seen": 1073984512 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034071213640922773, + "loss": 2.6501, + "theoretical_loss": 3.6246303856361077, + "tokens_seen": 1074050048 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034070210631895686, + "loss": 2.7927, + "theoretical_loss": 3.6246096157401335, + "tokens_seen": 1074115584 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003406920762286861, + "loss": 2.8565, + "theoretical_loss": 3.6245888474661796, + "tokens_seen": 1074181120 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003406820461384152, + "loss": 2.7706, + "theoretical_loss": 3.62456808081402, + "tokens_seen": 1074246656 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034067201604814446, + "loss": 2.7304, + "theoretical_loss": 3.6245473157834294, + "tokens_seen": 1074312192 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034066198595787364, + "loss": 2.8795, + "theoretical_loss": 3.6245265523741823, + "tokens_seen": 1074377728 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003406519558676028, + "loss": 2.8952, + "theoretical_loss": 3.6245057905860527, + "tokens_seen": 1074443264 + }, + { + "epoch": 3.06, + "learning_rate": 0.000340641925777332, + "loss": 2.854, + "theoretical_loss": 3.624485030418816, + "tokens_seen": 1074508800 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034063189568706123, + "loss": 2.7502, + "theoretical_loss": 3.624464271872246, + "tokens_seen": 1074574336 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034062186559679036, + "loss": 2.5958, + "theoretical_loss": 3.6244435149461185, + "tokens_seen": 1074639872 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003406118355065196, + "loss": 2.8345, + "theoretical_loss": 3.624422759640207, + "tokens_seen": 1074705408 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003406018054162487, + "loss": 2.8674, + "theoretical_loss": 3.6244020059542867, + "tokens_seen": 1074770944 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1204483, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6257479190826416, + "objective/train/theoretical_loss": 3.624396817785904, + "objective/train/tokens_used": 1095247328, + "theoretical_loss": 3.624396817785904, + "tokens_seen": 1074787328 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034059177532597796, + "loss": 2.763, + "theoretical_loss": 3.6243812538881324, + "tokens_seen": 1074836480 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034058174523570714, + "loss": 2.8225, + "theoretical_loss": 3.624360503441519, + "tokens_seen": 1074902016 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003405717151454363, + "loss": 2.7167, + "theoretical_loss": 3.6243397546142218, + "tokens_seen": 1074967552 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003405616850551655, + "loss": 2.7695, + "theoretical_loss": 3.6243190074060143, + "tokens_seen": 1075033088 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003405516549648947, + "loss": 2.664, + "theoretical_loss": 3.624298261816673, + "tokens_seen": 1075098624 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034054162487462386, + "loss": 2.8657, + "theoretical_loss": 3.6242775178459716, + "tokens_seen": 1075164160 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003405315947843531, + "loss": 2.707, + "theoretical_loss": 3.624256775493687, + "tokens_seen": 1075229696 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003405215646940822, + "loss": 2.8292, + "theoretical_loss": 3.6242360347595923, + "tokens_seen": 1075295232 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034051153460381146, + "loss": 2.7094, + "theoretical_loss": 3.6242152956434635, + "tokens_seen": 1075360768 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034050150451354064, + "loss": 2.9164, + "theoretical_loss": 3.624194558145076, + "tokens_seen": 1075426304 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003404914744232698, + "loss": 2.6311, + "theoretical_loss": 3.624173822264205, + "tokens_seen": 1075491840 + }, + { + "epoch": 3.06, + "learning_rate": 0.000340481444332999, + "loss": 2.5965, + "theoretical_loss": 3.624153088000625, + "tokens_seen": 1075557376 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003404714142427282, + "loss": 2.8016, + "theoretical_loss": 3.6241323553541127, + "tokens_seen": 1075622912 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034046138415245737, + "loss": 2.7171, + "theoretical_loss": 3.624111624324442, + "tokens_seen": 1075688448 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003404513540621866, + "loss": 2.8304, + "theoretical_loss": 3.6240908949113897, + "tokens_seen": 1075753984 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034044132397191573, + "loss": 2.9595, + "theoretical_loss": 3.62407016711473, + "tokens_seen": 1075819520 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034043129388164497, + "loss": 2.6081, + "theoretical_loss": 3.6240494409342396, + "tokens_seen": 1075885056 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003404212637913741, + "loss": 3.0219, + "theoretical_loss": 3.6240287163696934, + "tokens_seen": 1075950592 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034041123370110333, + "loss": 2.8945, + "theoretical_loss": 3.624007993420866, + "tokens_seen": 1076016128 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003404012036108325, + "loss": 2.9308, + "theoretical_loss": 3.6239872720875352, + "tokens_seen": 1076081664 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003403911735205617, + "loss": 2.6346, + "theoretical_loss": 3.623966552369475, + "tokens_seen": 1076147200 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034038114343029087, + "loss": 2.754, + "theoretical_loss": 3.623945834266462, + "tokens_seen": 1076212736 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034037111334002005, + "loss": 2.5849, + "theoretical_loss": 3.6239251177782714, + "tokens_seen": 1076278272 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034036108324974923, + "loss": 2.8053, + "theoretical_loss": 3.6239044029046794, + "tokens_seen": 1076343808 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034035105315947847, + "loss": 2.7989, + "theoretical_loss": 3.623883689645462, + "tokens_seen": 1076409344 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1205647, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.873117446899414, + "objective/train/theoretical_loss": 3.623878511582877, + "objective/train/tokens_used": 1096885728, + "theoretical_loss": 3.623878511582877, + "tokens_seen": 1076425728 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003403410230692076, + "loss": 2.8016, + "theoretical_loss": 3.6238629780003944, + "tokens_seen": 1076474880 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034033099297893683, + "loss": 2.8755, + "theoretical_loss": 3.6238422679692532, + "tokens_seen": 1076540416 + }, + { + "epoch": 3.06, + "learning_rate": 0.000340320962888666, + "loss": 2.6786, + "theoretical_loss": 3.623821559551814, + "tokens_seen": 1076605952 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003403109327983952, + "loss": 2.5573, + "theoretical_loss": 3.623800852747854, + "tokens_seen": 1076671488 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003403009027081244, + "loss": 2.8492, + "theoretical_loss": 3.623780147557147, + "tokens_seen": 1076737024 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034029087261785356, + "loss": 2.7657, + "theoretical_loss": 3.623759443979471, + "tokens_seen": 1076802560 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034028084252758274, + "loss": 2.8066, + "theoretical_loss": 3.623738742014602, + "tokens_seen": 1076868096 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034027081243731197, + "loss": 2.7101, + "theoretical_loss": 3.6237180416623156, + "tokens_seen": 1076933632 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003402607823470411, + "loss": 2.8152, + "theoretical_loss": 3.623697342922388, + "tokens_seen": 1076999168 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034025075225677033, + "loss": 2.6212, + "theoretical_loss": 3.623676645794596, + "tokens_seen": 1077064704 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034024072216649946, + "loss": 2.7761, + "theoretical_loss": 3.6236559502787165, + "tokens_seen": 1077130240 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003402306920762287, + "loss": 2.7202, + "theoretical_loss": 3.6236352563745244, + "tokens_seen": 1077195776 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003402206619859579, + "loss": 2.6571, + "theoretical_loss": 3.623614564081797, + "tokens_seen": 1077261312 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034021063189568706, + "loss": 2.8752, + "theoretical_loss": 3.6235938734003117, + "tokens_seen": 1077326848 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034020060180541624, + "loss": 2.6293, + "theoretical_loss": 3.6235731843298433, + "tokens_seen": 1077392384 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003401905717151454, + "loss": 2.7159, + "theoretical_loss": 3.623552496870169, + "tokens_seen": 1077457920 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003401805416248746, + "loss": 2.7924, + "theoretical_loss": 3.6235318110210657, + "tokens_seen": 1077523456 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034017051153460384, + "loss": 2.7635, + "theoretical_loss": 3.62351112678231, + "tokens_seen": 1077588992 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034016048144433296, + "loss": 2.9351, + "theoretical_loss": 3.6234904441536786, + "tokens_seen": 1077654528 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003401504513540622, + "loss": 2.9664, + "theoretical_loss": 3.623469763134948, + "tokens_seen": 1077720064 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003401404212637914, + "loss": 2.7644, + "theoretical_loss": 3.6234490837258955, + "tokens_seen": 1077785600 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034013039117352056, + "loss": 2.8821, + "theoretical_loss": 3.6234284059262976, + "tokens_seen": 1077851136 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034012036108324974, + "loss": 2.9963, + "theoretical_loss": 3.6234077297359315, + "tokens_seen": 1077916672 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003401103309929789, + "loss": 2.5783, + "theoretical_loss": 3.6233870551545735, + "tokens_seen": 1077982208 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003401003009027081, + "loss": 2.8951, + "theoretical_loss": 3.6233663821820015, + "tokens_seen": 1078047744 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1210610, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8817059993743896, + "objective/train/theoretical_loss": 3.6233612141902043, + "objective/train/tokens_used": 1098524128, + "theoretical_loss": 3.6233612141902043, + "tokens_seen": 1078064128 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034009027081243734, + "loss": 2.9285, + "theoretical_loss": 3.623345710817991, + "tokens_seen": 1078113280 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003400802407221665, + "loss": 2.929, + "theoretical_loss": 3.623325041062321, + "tokens_seen": 1078178816 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003400702106318957, + "loss": 2.7234, + "theoretical_loss": 3.623304372914767, + "tokens_seen": 1078244352 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003400601805416249, + "loss": 2.6798, + "theoretical_loss": 3.6232837063751075, + "tokens_seen": 1078309888 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034005015045135406, + "loss": 2.9209, + "theoretical_loss": 3.6232630414431184, + "tokens_seen": 1078375424 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003400401203610833, + "loss": 2.576, + "theoretical_loss": 3.6232423781185776, + "tokens_seen": 1078440960 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003400300902708124, + "loss": 2.7724, + "theoretical_loss": 3.623221716401263, + "tokens_seen": 1078506496 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034002006018054166, + "loss": 2.7263, + "theoretical_loss": 3.6232010562909505, + "tokens_seen": 1078572032 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034001003009027084, + "loss": 2.8475, + "theoretical_loss": 3.623180397787419, + "tokens_seen": 1078637568 + }, + { + "epoch": 3.06, + "learning_rate": 0.00034, + "loss": 2.863, + "theoretical_loss": 3.623159740890445, + "tokens_seen": 1078703104 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003399899699097292, + "loss": 2.8764, + "theoretical_loss": 3.6231390855998056, + "tokens_seen": 1078768640 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003399799398194584, + "loss": 2.6731, + "theoretical_loss": 3.6231184319152794, + "tokens_seen": 1078834176 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033996990972918757, + "loss": 2.9074, + "theoretical_loss": 3.6230977798366437, + "tokens_seen": 1078899712 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003399598796389168, + "loss": 2.6713, + "theoretical_loss": 3.6230771293636757, + "tokens_seen": 1078965248 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033994984954864593, + "loss": 2.5203, + "theoretical_loss": 3.623056480496153, + "tokens_seen": 1079030784 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033993981945837517, + "loss": 2.8662, + "theoretical_loss": 3.6230358332338533, + "tokens_seen": 1079096320 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003399297893681043, + "loss": 2.7636, + "theoretical_loss": 3.623015187576555, + "tokens_seen": 1079161856 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033991975927783353, + "loss": 2.7715, + "theoretical_loss": 3.622994543524035, + "tokens_seen": 1079227392 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003399097291875627, + "loss": 2.8445, + "theoretical_loss": 3.6229739010760715, + "tokens_seen": 1079292928 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003398996990972919, + "loss": 2.9655, + "theoretical_loss": 3.6229532602324426, + "tokens_seen": 1079358464 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033988966900702107, + "loss": 2.6079, + "theoretical_loss": 3.622932620992926, + "tokens_seen": 1079424000 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033987963891675025, + "loss": 2.6488, + "theoretical_loss": 3.6229119833572994, + "tokens_seen": 1079489536 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033986960882647943, + "loss": 2.5739, + "theoretical_loss": 3.622891347325341, + "tokens_seen": 1079555072 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033985957873620867, + "loss": 2.8467, + "theoretical_loss": 3.6228707128968294, + "tokens_seen": 1079620608 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003398495486459378, + "loss": 2.8359, + "theoretical_loss": 3.622850080071542, + "tokens_seen": 1079686144 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1215587, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9586844444274902, + "objective/train/theoretical_loss": 3.6228449221156978, + "objective/train/tokens_used": 1100162528, + "theoretical_loss": 3.6228449221156978, + "tokens_seen": 1079702528 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033983951855566703, + "loss": 2.838, + "theoretical_loss": 3.6228294488492567, + "tokens_seen": 1079751680 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003398294884653962, + "loss": 2.6845, + "theoretical_loss": 3.6228088192297525, + "tokens_seen": 1079817216 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003398194583751254, + "loss": 2.6935, + "theoretical_loss": 3.622788191212807, + "tokens_seen": 1079882752 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003398094282848546, + "loss": 2.8, + "theoretical_loss": 3.622767564798199, + "tokens_seen": 1079948288 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033979939819458376, + "loss": 2.8069, + "theoretical_loss": 3.622746939985706, + "tokens_seen": 1080013824 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033978936810431294, + "loss": 2.7078, + "theoretical_loss": 3.622726316775107, + "tokens_seen": 1080079360 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033977933801404217, + "loss": 2.7325, + "theoretical_loss": 3.6227056951661805, + "tokens_seen": 1080144896 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003397693079237713, + "loss": 2.7843, + "theoretical_loss": 3.6226850751587043, + "tokens_seen": 1080210432 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033975927783350053, + "loss": 2.8463, + "theoretical_loss": 3.6226644567524575, + "tokens_seen": 1080275968 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033974924774322966, + "loss": 2.7421, + "theoretical_loss": 3.622643839947218, + "tokens_seen": 1080341504 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003397392176529589, + "loss": 2.8555, + "theoretical_loss": 3.622623224742765, + "tokens_seen": 1080407040 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003397291875626881, + "loss": 2.8457, + "theoretical_loss": 3.622602611138877, + "tokens_seen": 1080472576 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033971915747241726, + "loss": 2.8681, + "theoretical_loss": 3.6225819991353325, + "tokens_seen": 1080538112 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033970912738214644, + "loss": 2.5826, + "theoretical_loss": 3.6225613887319104, + "tokens_seen": 1080603648 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003396990972918756, + "loss": 2.5601, + "theoretical_loss": 3.622540779928389, + "tokens_seen": 1080669184 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003396890672016048, + "loss": 2.7977, + "theoretical_loss": 3.6225201727245473, + "tokens_seen": 1080734720 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033967903711133404, + "loss": 2.592, + "theoretical_loss": 3.6224995671201645, + "tokens_seen": 1080800256 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033966900702106316, + "loss": 2.6062, + "theoretical_loss": 3.622478963115019, + "tokens_seen": 1080865792 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003396589769307924, + "loss": 2.6532, + "theoretical_loss": 3.6224583607088903, + "tokens_seen": 1080931328 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003396489468405216, + "loss": 2.7851, + "theoretical_loss": 3.6224377599015565, + "tokens_seen": 1080996864 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033963891675025076, + "loss": 2.8381, + "theoretical_loss": 3.6224171606927973, + "tokens_seen": 1081062400 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033962888665997994, + "loss": 2.8092, + "theoretical_loss": 3.6223965630823916, + "tokens_seen": 1081127936 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003396188565697091, + "loss": 2.791, + "theoretical_loss": 3.6223759670701186, + "tokens_seen": 1081193472 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003396088264794383, + "loss": 2.8194, + "theoretical_loss": 3.622355372655757, + "tokens_seen": 1081259008 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033959879638916754, + "loss": 3.0145, + "theoretical_loss": 3.622334779839087, + "tokens_seen": 1081324544 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1220679, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.306006908416748, + "objective/train/theoretical_loss": 3.6223296318845324, + "objective/train/tokens_used": 1101800928, + "theoretical_loss": 3.6223296318845324, + "tokens_seen": 1081340928 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033958876629889667, + "loss": 2.6374, + "theoretical_loss": 3.622314188619886, + "tokens_seen": 1081390080 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003395787362086259, + "loss": 2.7159, + "theoretical_loss": 3.6222935989979352, + "tokens_seen": 1081455616 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033956870611835503, + "loss": 2.764, + "theoretical_loss": 3.622273010973013, + "tokens_seen": 1081521152 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033955867602808426, + "loss": 2.8377, + "theoretical_loss": 3.622252424544899, + "tokens_seen": 1081586688 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033954864593781345, + "loss": 2.7197, + "theoretical_loss": 3.622231839713372, + "tokens_seen": 1081652224 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003395386158475426, + "loss": 2.7794, + "theoretical_loss": 3.6222112564782125, + "tokens_seen": 1081717760 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003395285857572718, + "loss": 2.7446, + "theoretical_loss": 3.6221906748391994, + "tokens_seen": 1081783296 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033951855566700104, + "loss": 2.8116, + "theoretical_loss": 3.622170094796112, + "tokens_seen": 1081848832 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033950852557673017, + "loss": 2.8156, + "theoretical_loss": 3.6221495163487303, + "tokens_seen": 1081914368 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003394984954864594, + "loss": 2.8413, + "theoretical_loss": 3.622128939496834, + "tokens_seen": 1081979904 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033948846539618853, + "loss": 2.7511, + "theoretical_loss": 3.6221083642402023, + "tokens_seen": 1082045440 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033947843530591777, + "loss": 2.7974, + "theoretical_loss": 3.6220877905786155, + "tokens_seen": 1082110976 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033946840521564695, + "loss": 2.7031, + "theoretical_loss": 3.6220672185118525, + "tokens_seen": 1082176512 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033945837512537613, + "loss": 2.4711, + "theoretical_loss": 3.622046648039694, + "tokens_seen": 1082242048 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003394483450351053, + "loss": 2.6463, + "theoretical_loss": 3.6220260791619197, + "tokens_seen": 1082307584 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003394383149448345, + "loss": 2.7821, + "theoretical_loss": 3.622005511878309, + "tokens_seen": 1082373120 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003394282848545637, + "loss": 2.7633, + "theoretical_loss": 3.6219849461886424, + "tokens_seen": 1082438656 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003394182547642929, + "loss": 2.7907, + "theoretical_loss": 3.621964382092699, + "tokens_seen": 1082504192 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033940822467402204, + "loss": 2.5404, + "theoretical_loss": 3.62194381959026, + "tokens_seen": 1082569728 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033939819458375127, + "loss": 2.6696, + "theoretical_loss": 3.6219232586811048, + "tokens_seen": 1082635264 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003393881644934804, + "loss": 2.72, + "theoretical_loss": 3.6219026993650134, + "tokens_seen": 1082700800 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033937813440320963, + "loss": 2.7538, + "theoretical_loss": 3.621882141641766, + "tokens_seen": 1082766336 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003393681043129388, + "loss": 2.7411, + "theoretical_loss": 3.6218615855111436, + "tokens_seen": 1082831872 + }, + { + "epoch": 3.06, + "learning_rate": 0.000339358074222668, + "loss": 2.7286, + "theoretical_loss": 3.6218410309729254, + "tokens_seen": 1082897408 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003393480441323972, + "loss": 3.0193, + "theoretical_loss": 3.6218204780268914, + "tokens_seen": 1082962944 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1225756, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9983677864074707, + "objective/train/theoretical_loss": 3.621815340039136, + "objective/train/tokens_used": 1103439328, + "theoretical_loss": 3.621815340039136, + "tokens_seen": 1082979328 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003393380140421264, + "loss": 2.7483, + "theoretical_loss": 3.6217999266728236, + "tokens_seen": 1083028480 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003393279839518556, + "loss": 2.7398, + "theoretical_loss": 3.621779376910501, + "tokens_seen": 1083094016 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003393179538615848, + "loss": 2.7203, + "theoretical_loss": 3.6217588287397042, + "tokens_seen": 1083159552 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033930792377131396, + "loss": 2.6191, + "theoretical_loss": 3.621738282160214, + "tokens_seen": 1083225088 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033929789368104314, + "loss": 2.97, + "theoretical_loss": 3.6217177371718106, + "tokens_seen": 1083290624 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033928786359077237, + "loss": 2.6384, + "theoretical_loss": 3.621697193774275, + "tokens_seen": 1083356160 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003392778335005015, + "loss": 2.5806, + "theoretical_loss": 3.6216766519673866, + "tokens_seen": 1083421696 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033926780341023073, + "loss": 2.8141, + "theoretical_loss": 3.621656111750928, + "tokens_seen": 1083487232 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033925777331995986, + "loss": 2.7713, + "theoretical_loss": 3.621635573124678, + "tokens_seen": 1083552768 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003392477432296891, + "loss": 2.7386, + "theoretical_loss": 3.6216150360884183, + "tokens_seen": 1083618304 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003392377131394183, + "loss": 2.8081, + "theoretical_loss": 3.6215945006419297, + "tokens_seen": 1083683840 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033922768304914746, + "loss": 2.8118, + "theoretical_loss": 3.621573966784992, + "tokens_seen": 1083749376 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033921765295887664, + "loss": 3.0737, + "theoretical_loss": 3.621553434517388, + "tokens_seen": 1083814912 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003392076228686058, + "loss": 2.7613, + "theoretical_loss": 3.6215329038388964, + "tokens_seen": 1083880448 + }, + { + "epoch": 3.06, + "learning_rate": 0.000339197592778335, + "loss": 2.6075, + "theoretical_loss": 3.621512374749299, + "tokens_seen": 1083945984 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033918756268806424, + "loss": 2.7083, + "theoretical_loss": 3.621491847248378, + "tokens_seen": 1084011520 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033917753259779336, + "loss": 2.7446, + "theoretical_loss": 3.621471321335912, + "tokens_seen": 1084077056 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003391675025075226, + "loss": 2.9731, + "theoretical_loss": 3.6214507970116845, + "tokens_seen": 1084142592 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003391574724172518, + "loss": 2.687, + "theoretical_loss": 3.6214302742754754, + "tokens_seen": 1084208128 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033914744232698096, + "loss": 2.7052, + "theoretical_loss": 3.6214097531270655, + "tokens_seen": 1084273664 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033913741223671014, + "loss": 2.7163, + "theoretical_loss": 3.6213892335662363, + "tokens_seen": 1084339200 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003391273821464393, + "loss": 2.7634, + "theoretical_loss": 3.6213687155927694, + "tokens_seen": 1084404736 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003391173520561685, + "loss": 2.651, + "theoretical_loss": 3.6213481992064462, + "tokens_seen": 1084470272 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033910732196589774, + "loss": 2.7872, + "theoretical_loss": 3.6213276844070474, + "tokens_seen": 1084535808 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033909729187562687, + "loss": 2.7807, + "theoretical_loss": 3.6213071711943545, + "tokens_seen": 1084601344 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1230848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.071620225906372, + "objective/train/theoretical_loss": 3.621302043139079, + "objective/train/tokens_used": 1105077728, + "theoretical_loss": 3.621302043139079, + "tokens_seen": 1084617728 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003390872617853561, + "loss": 2.2767, + "theoretical_loss": 3.6212866595681494, + "tokens_seen": 1084666880 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033907723169508523, + "loss": 2.7164, + "theoretical_loss": 3.6212661495282132, + "tokens_seen": 1084732416 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033906720160481446, + "loss": 2.5967, + "theoretical_loss": 3.621245641074328, + "tokens_seen": 1084797952 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033905717151454365, + "loss": 2.7439, + "theoretical_loss": 3.6212251342062736, + "tokens_seen": 1084863488 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033904714142427283, + "loss": 2.8463, + "theoretical_loss": 3.6212046289238335, + "tokens_seen": 1084929024 + }, + { + "epoch": 3.06, + "learning_rate": 0.000339037111334002, + "loss": 2.9693, + "theoretical_loss": 3.6211841252267885, + "tokens_seen": 1084994560 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033902708124373124, + "loss": 2.7778, + "theoretical_loss": 3.6211636231149202, + "tokens_seen": 1085060096 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033901705115346037, + "loss": 2.9191, + "theoretical_loss": 3.621143122588011, + "tokens_seen": 1085125632 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003390070210631896, + "loss": 2.738, + "theoretical_loss": 3.6211226236458414, + "tokens_seen": 1085191168 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033899699097291873, + "loss": 2.6476, + "theoretical_loss": 3.6211021262881946, + "tokens_seen": 1085256704 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033898696088264797, + "loss": 2.8807, + "theoretical_loss": 3.621081630514851, + "tokens_seen": 1085322240 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033897693079237715, + "loss": 2.5783, + "theoretical_loss": 3.6210611363255945, + "tokens_seen": 1085387776 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033896690070210633, + "loss": 2.8129, + "theoretical_loss": 3.621040643720205, + "tokens_seen": 1085453312 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003389568706118355, + "loss": 2.9223, + "theoretical_loss": 3.621020152698465, + "tokens_seen": 1085518848 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003389468405215647, + "loss": 2.8439, + "theoretical_loss": 3.6209996632601573, + "tokens_seen": 1085584384 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003389368104312939, + "loss": 2.4224, + "theoretical_loss": 3.620979175405064, + "tokens_seen": 1085649920 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003389267803410231, + "loss": 2.8556, + "theoretical_loss": 3.6209586891329657, + "tokens_seen": 1085715456 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033891675025075224, + "loss": 2.9056, + "theoretical_loss": 3.620938204443646, + "tokens_seen": 1085780992 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033890672016048147, + "loss": 2.9773, + "theoretical_loss": 3.6209177213368866, + "tokens_seen": 1085846528 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003388966900702106, + "loss": 2.5649, + "theoretical_loss": 3.6208972398124697, + "tokens_seen": 1085912064 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033888665997993983, + "loss": 2.7481, + "theoretical_loss": 3.6208767598701774, + "tokens_seen": 1085977600 + }, + { + "epoch": 3.06, + "learning_rate": 0.000338876629889669, + "loss": 2.7252, + "theoretical_loss": 3.6208562815097927, + "tokens_seen": 1086043136 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003388665997993982, + "loss": 2.7418, + "theoretical_loss": 3.6208358047310973, + "tokens_seen": 1086108672 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003388565697091274, + "loss": 2.7919, + "theoretical_loss": 3.6208153295338734, + "tokens_seen": 1086174208 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003388465396188566, + "loss": 2.8255, + "theoretical_loss": 3.6207948559179046, + "tokens_seen": 1086239744 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1232129, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1969611644744873, + "objective/train/theoretical_loss": 3.6207897377609575, + "objective/train/tokens_used": 1106716128, + "theoretical_loss": 3.6207897377609575, + "tokens_seen": 1086256128 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033883650952858574, + "loss": 2.8786, + "theoretical_loss": 3.6207743838829725, + "tokens_seen": 1086305280 + }, + { + "epoch": 3.06, + "learning_rate": 0.000338826479438315, + "loss": 2.6704, + "theoretical_loss": 3.6207539134288593, + "tokens_seen": 1086370816 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003388164493480441, + "loss": 2.6908, + "theoretical_loss": 3.6207334445553485, + "tokens_seen": 1086436352 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033880641925777334, + "loss": 2.5391, + "theoretical_loss": 3.6207129772622224, + "tokens_seen": 1086501888 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003387963891675025, + "loss": 2.8369, + "theoretical_loss": 3.6206925115492634, + "tokens_seen": 1086567424 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003387863590772317, + "loss": 2.7543, + "theoretical_loss": 3.620672047416254, + "tokens_seen": 1086632960 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003387763289869609, + "loss": 2.7815, + "theoretical_loss": 3.6206515848629777, + "tokens_seen": 1086698496 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033876629889669006, + "loss": 2.6934, + "theoretical_loss": 3.6206311238892166, + "tokens_seen": 1086764032 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033875626880641924, + "loss": 2.7152, + "theoretical_loss": 3.6206106644947544, + "tokens_seen": 1086829568 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003387462387161485, + "loss": 3.0106, + "theoretical_loss": 3.620590206679373, + "tokens_seen": 1086895104 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003387362086258776, + "loss": 2.9409, + "theoretical_loss": 3.620569750442856, + "tokens_seen": 1086960640 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033872617853560684, + "loss": 2.7743, + "theoretical_loss": 3.6205492957849863, + "tokens_seen": 1087026176 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033871614844533597, + "loss": 2.5937, + "theoretical_loss": 3.620528842705546, + "tokens_seen": 1087091712 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003387061183550652, + "loss": 2.9943, + "theoretical_loss": 3.6205083912043197, + "tokens_seen": 1087157248 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003386960882647944, + "loss": 2.6698, + "theoretical_loss": 3.620487941281089, + "tokens_seen": 1087222784 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033868605817452356, + "loss": 2.5315, + "theoretical_loss": 3.6204674929356377, + "tokens_seen": 1087288320 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033867602808425275, + "loss": 2.8809, + "theoretical_loss": 3.620447046167749, + "tokens_seen": 1087353856 + }, + { + "epoch": 3.06, + "learning_rate": 0.000338665997993982, + "loss": 2.6834, + "theoretical_loss": 3.620426600977207, + "tokens_seen": 1087419392 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003386559679037111, + "loss": 2.7171, + "theoretical_loss": 3.620406157363793, + "tokens_seen": 1087484928 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033864593781344034, + "loss": 2.8595, + "theoretical_loss": 3.6203857153272923, + "tokens_seen": 1087550464 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033863590772316947, + "loss": 2.8387, + "theoretical_loss": 3.6203652748674866, + "tokens_seen": 1087616000 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003386258776328987, + "loss": 2.9063, + "theoretical_loss": 3.6203448359841603, + "tokens_seen": 1087681536 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003386158475426279, + "loss": 2.7254, + "theoretical_loss": 3.6203243986770968, + "tokens_seen": 1087747072 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033860581745235707, + "loss": 2.8621, + "theoretical_loss": 3.620303962946079, + "tokens_seen": 1087812608 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033859578736208625, + "loss": 2.8689, + "theoretical_loss": 3.6202835287908908, + "tokens_seen": 1087878144 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1232782, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.946256637573242, + "objective/train/theoretical_loss": 3.620278420498292, + "objective/train/tokens_used": 1108354528, + "theoretical_loss": 3.620278420498292, + "tokens_seen": 1087894528 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033858575727181543, + "loss": 2.9374, + "theoretical_loss": 3.620263096211316, + "tokens_seen": 1087943680 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033857572718154467, + "loss": 2.9483, + "theoretical_loss": 3.6202426652071376, + "tokens_seen": 1088009216 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033856569709127385, + "loss": 2.8696, + "theoretical_loss": 3.6202222357781397, + "tokens_seen": 1088074752 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033855566700100303, + "loss": 2.6028, + "theoretical_loss": 3.620201807924106, + "tokens_seen": 1088140288 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003385456369107322, + "loss": 2.7729, + "theoretical_loss": 3.62018138164482, + "tokens_seen": 1088205824 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033853560682046144, + "loss": 3.0473, + "theoretical_loss": 3.6201609569400657, + "tokens_seen": 1088271360 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033852557673019057, + "loss": 2.5592, + "theoretical_loss": 3.620140533809627, + "tokens_seen": 1088336896 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003385155466399198, + "loss": 2.6389, + "theoretical_loss": 3.620120112253287, + "tokens_seen": 1088402432 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033850551654964893, + "loss": 2.6717, + "theoretical_loss": 3.6200996922708306, + "tokens_seen": 1088467968 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033849548645937817, + "loss": 2.7892, + "theoretical_loss": 3.620079273862041, + "tokens_seen": 1088533504 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033848545636910735, + "loss": 2.7012, + "theoretical_loss": 3.620058857026703, + "tokens_seen": 1088599040 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033847542627883653, + "loss": 2.4863, + "theoretical_loss": 3.6200384417646, + "tokens_seen": 1088664576 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003384653961885657, + "loss": 2.754, + "theoretical_loss": 3.6200180280755165, + "tokens_seen": 1088730112 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003384553660982949, + "loss": 2.6993, + "theoretical_loss": 3.619997615959236, + "tokens_seen": 1088795648 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003384453360080241, + "loss": 2.6705, + "theoretical_loss": 3.6199772054155432, + "tokens_seen": 1088861184 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003384353059177533, + "loss": 2.8679, + "theoretical_loss": 3.619956796444222, + "tokens_seen": 1088926720 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033842527582748244, + "loss": 2.7584, + "theoretical_loss": 3.619936389045057, + "tokens_seen": 1088992256 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033841524573721167, + "loss": 2.7861, + "theoretical_loss": 3.6199159832178323, + "tokens_seen": 1089057792 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003384052156469408, + "loss": 2.8264, + "theoretical_loss": 3.619895578962332, + "tokens_seen": 1089123328 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033839518555667003, + "loss": 2.8653, + "theoretical_loss": 3.6198751762783408, + "tokens_seen": 1089188864 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003383851554663992, + "loss": 2.864, + "theoretical_loss": 3.619854775165643, + "tokens_seen": 1089254400 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003383751253761284, + "loss": 2.793, + "theoretical_loss": 3.6198343756240225, + "tokens_seen": 1089319936 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003383650952858576, + "loss": 2.7411, + "theoretical_loss": 3.6198139776532656, + "tokens_seen": 1089385472 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003383550651955868, + "loss": 2.8531, + "theoretical_loss": 3.6197935812531545, + "tokens_seen": 1089451008 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033834503510531594, + "loss": 2.6371, + "theoretical_loss": 3.6197731864234752, + "tokens_seen": 1089516544 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1234372, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.325249433517456, + "objective/train/theoretical_loss": 3.6197680879614103, + "objective/train/tokens_used": 1109992928, + "theoretical_loss": 3.6197680879614103, + "tokens_seen": 1089532928 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003383350050150452, + "loss": 2.4403, + "theoretical_loss": 3.6197527931640123, + "tokens_seen": 1089582080 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003383249749247743, + "loss": 2.7791, + "theoretical_loss": 3.6197324014745504, + "tokens_seen": 1089647616 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033831494483450354, + "loss": 2.9964, + "theoretical_loss": 3.6197120113548733, + "tokens_seen": 1089713152 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003383049147442327, + "loss": 2.7922, + "theoretical_loss": 3.619691622804767, + "tokens_seen": 1089778688 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003382948846539619, + "loss": 2.7788, + "theoretical_loss": 3.6196712358240157, + "tokens_seen": 1089844224 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003382848545636911, + "loss": 2.8688, + "theoretical_loss": 3.619650850412404, + "tokens_seen": 1089909760 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033827482447342026, + "loss": 2.7936, + "theoretical_loss": 3.6196304665697174, + "tokens_seen": 1089975296 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033826479438314944, + "loss": 2.8542, + "theoretical_loss": 3.619610084295741, + "tokens_seen": 1090040832 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003382547642928787, + "loss": 2.6332, + "theoretical_loss": 3.6195897035902584, + "tokens_seen": 1090106368 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003382447342026078, + "loss": 2.6517, + "theoretical_loss": 3.619569324453056, + "tokens_seen": 1090171904 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033823470411233704, + "loss": 2.6189, + "theoretical_loss": 3.6195489468839184, + "tokens_seen": 1090237440 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033822467402206617, + "loss": 2.8172, + "theoretical_loss": 3.619528570882631, + "tokens_seen": 1090302976 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003382146439317954, + "loss": 2.8496, + "theoretical_loss": 3.6195081964489777, + "tokens_seen": 1090368512 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003382046138415246, + "loss": 2.9001, + "theoretical_loss": 3.6194878235827455, + "tokens_seen": 1090434048 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033819458375125376, + "loss": 3.004, + "theoretical_loss": 3.6194674522837182, + "tokens_seen": 1090499584 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033818455366098295, + "loss": 2.5051, + "theoretical_loss": 3.6194470825516816, + "tokens_seen": 1090565120 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003381745235707122, + "loss": 2.6448, + "theoretical_loss": 3.619426714386421, + "tokens_seen": 1090630656 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003381644934804413, + "loss": 2.7133, + "theoretical_loss": 3.619406347787722, + "tokens_seen": 1090696192 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033815446339017054, + "loss": 2.8944, + "theoretical_loss": 3.61938598275537, + "tokens_seen": 1090761728 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033814443329989967, + "loss": 2.7101, + "theoretical_loss": 3.6193656192891495, + "tokens_seen": 1090827264 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003381344032096289, + "loss": 2.7511, + "theoretical_loss": 3.619345257388847, + "tokens_seen": 1090892800 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003381243731193581, + "loss": 2.7677, + "theoretical_loss": 3.6193248970542475, + "tokens_seen": 1090958336 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033811434302908727, + "loss": 2.7238, + "theoretical_loss": 3.619304538285137, + "tokens_seen": 1091023872 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033810431293881645, + "loss": 2.8521, + "theoretical_loss": 3.6192841810813006, + "tokens_seen": 1091089408 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033809428284854563, + "loss": 2.5637, + "theoretical_loss": 3.6192638254425242, + "tokens_seen": 1091154944 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1235119, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5149593353271484, + "objective/train/theoretical_loss": 3.619258736777346, + "objective/train/tokens_used": 1111631328, + "theoretical_loss": 3.619258736777346, + "tokens_seen": 1091171328 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003380842527582748, + "loss": 2.4153, + "theoretical_loss": 3.619243471368594, + "tokens_seen": 1091220480 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033807422266800405, + "loss": 2.6798, + "theoretical_loss": 3.6192231188592947, + "tokens_seen": 1091286016 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003380641925777332, + "loss": 2.8856, + "theoretical_loss": 3.6192027679144125, + "tokens_seen": 1091351552 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003380541624874624, + "loss": 2.6799, + "theoretical_loss": 3.619182418533734, + "tokens_seen": 1091417088 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033804413239719154, + "loss": 2.7602, + "theoretical_loss": 3.619162070717044, + "tokens_seen": 1091482624 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033803410230692077, + "loss": 2.7477, + "theoretical_loss": 3.6191417244641286, + "tokens_seen": 1091548160 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033802407221664995, + "loss": 2.757, + "theoretical_loss": 3.619121379774774, + "tokens_seen": 1091613696 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033801404212637913, + "loss": 2.7874, + "theoretical_loss": 3.6191010366487664, + "tokens_seen": 1091679232 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003380040120361083, + "loss": 2.8289, + "theoretical_loss": 3.619080695085891, + "tokens_seen": 1091744768 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033799398194583755, + "loss": 2.8967, + "theoretical_loss": 3.619060355085934, + "tokens_seen": 1091810304 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003379839518555667, + "loss": 2.6525, + "theoretical_loss": 3.6190400166486834, + "tokens_seen": 1091875840 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003379739217652959, + "loss": 2.4763, + "theoretical_loss": 3.6190196797739227, + "tokens_seen": 1091941376 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033796389167502504, + "loss": 2.6815, + "theoretical_loss": 3.6189993444614394, + "tokens_seen": 1092006912 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003379538615847543, + "loss": 2.6668, + "theoretical_loss": 3.61897901071102, + "tokens_seen": 1092072448 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033794383149448346, + "loss": 2.7857, + "theoretical_loss": 3.61895867852245, + "tokens_seen": 1092137984 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033793380140421264, + "loss": 2.7561, + "theoretical_loss": 3.6189383478955164, + "tokens_seen": 1092203520 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003379237713139418, + "loss": 2.8636, + "theoretical_loss": 3.618918018830005, + "tokens_seen": 1092269056 + }, + { + "epoch": 3.06, + "learning_rate": 0.000337913741223671, + "loss": 2.7024, + "theoretical_loss": 3.6188976913257025, + "tokens_seen": 1092334592 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003379037111334002, + "loss": 2.6201, + "theoretical_loss": 3.618877365382396, + "tokens_seen": 1092400128 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003378936810431294, + "loss": 2.7568, + "theoretical_loss": 3.6188570409998704, + "tokens_seen": 1092465664 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033788365095285854, + "loss": 2.8051, + "theoretical_loss": 3.618836718177913, + "tokens_seen": 1092531200 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003378736208625878, + "loss": 2.7044, + "theoretical_loss": 3.618816396916311, + "tokens_seen": 1092596736 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003378635907723169, + "loss": 2.8435, + "theoretical_loss": 3.6187960772148506, + "tokens_seen": 1092662272 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033785356068204614, + "loss": 2.6148, + "theoretical_loss": 3.6187757590733183, + "tokens_seen": 1092727808 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003378435305917753, + "loss": 2.8435, + "theoretical_loss": 3.6187554424915005, + "tokens_seen": 1092793344 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1236502, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.975677490234375, + "objective/train/theoretical_loss": 3.6187503635897267, + "objective/train/tokens_used": 1113269728, + "theoretical_loss": 3.6187503635897267, + "tokens_seen": 1092809728 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003378335005015045, + "loss": 2.84, + "theoretical_loss": 3.6187351274691846, + "tokens_seen": 1092858880 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033782347041123374, + "loss": 2.7963, + "theoretical_loss": 3.6187148140061574, + "tokens_seen": 1092924416 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003378134403209629, + "loss": 2.7715, + "theoretical_loss": 3.618694502102205, + "tokens_seen": 1092989952 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003378034102306921, + "loss": 2.7411, + "theoretical_loss": 3.618674191757115, + "tokens_seen": 1093055488 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003377933801404213, + "loss": 2.8463, + "theoretical_loss": 3.618653882970674, + "tokens_seen": 1093121024 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033778335005015046, + "loss": 2.7457, + "theoretical_loss": 3.6186335757426686, + "tokens_seen": 1093186560 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033777331995987964, + "loss": 2.7479, + "theoretical_loss": 3.6186132700728866, + "tokens_seen": 1093252096 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003377632898696089, + "loss": 2.8089, + "theoretical_loss": 3.6185929659611142, + "tokens_seen": 1093317632 + }, + { + "epoch": 3.06, + "learning_rate": 0.000337753259779338, + "loss": 2.8787, + "theoretical_loss": 3.6185726634071393, + "tokens_seen": 1093383168 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033774322968906724, + "loss": 2.6973, + "theoretical_loss": 3.6185523624107487, + "tokens_seen": 1093448704 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033773319959879637, + "loss": 2.8744, + "theoretical_loss": 3.6185320629717292, + "tokens_seen": 1093514240 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003377231695085256, + "loss": 2.5628, + "theoretical_loss": 3.6185117650898677, + "tokens_seen": 1093579776 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003377131394182548, + "loss": 2.6778, + "theoretical_loss": 3.6184914687649528, + "tokens_seen": 1093645312 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033770310932798396, + "loss": 2.8106, + "theoretical_loss": 3.6184711739967708, + "tokens_seen": 1093710848 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033769307923771315, + "loss": 2.6888, + "theoretical_loss": 3.618450880785109, + "tokens_seen": 1093776384 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003376830491474424, + "loss": 2.8349, + "theoretical_loss": 3.618430589129755, + "tokens_seen": 1093841920 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003376730190571715, + "loss": 2.8202, + "theoretical_loss": 3.6184102990304963, + "tokens_seen": 1093907456 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033766298896690074, + "loss": 2.6311, + "theoretical_loss": 3.6183900104871203, + "tokens_seen": 1093972992 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033765295887662987, + "loss": 2.8867, + "theoretical_loss": 3.6183697234994145, + "tokens_seen": 1094038528 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003376429287863591, + "loss": 2.5946, + "theoretical_loss": 3.6183494380671664, + "tokens_seen": 1094104064 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003376328986960883, + "loss": 2.5993, + "theoretical_loss": 3.618329154190163, + "tokens_seen": 1094169600 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033762286860581747, + "loss": 2.6903, + "theoretical_loss": 3.618308871868193, + "tokens_seen": 1094235136 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033761283851554665, + "loss": 2.8344, + "theoretical_loss": 3.6182885911010434, + "tokens_seen": 1094300672 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033760280842527583, + "loss": 2.6523, + "theoretical_loss": 3.618268311888502, + "tokens_seen": 1094366208 + }, + { + "epoch": 3.06, + "learning_rate": 0.000337592778335005, + "loss": 2.6933, + "theoretical_loss": 3.6182480342303567, + "tokens_seen": 1094431744 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1237024, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6148250102996826, + "objective/train/theoretical_loss": 3.6182429650586703, + "objective/train/tokens_used": 1114908128, + "theoretical_loss": 3.6182429650586703, + "tokens_seen": 1094448128 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033758274824473425, + "loss": 2.6435, + "theoretical_loss": 3.6182277581263955, + "tokens_seen": 1094497280 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003375727181544634, + "loss": 2.6805, + "theoretical_loss": 3.6182074835764055, + "tokens_seen": 1094562816 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003375626880641926, + "loss": 2.5208, + "theoretical_loss": 3.6181872105801753, + "tokens_seen": 1094628352 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033755265797392174, + "loss": 2.7542, + "theoretical_loss": 3.618166939137492, + "tokens_seen": 1094693888 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033754262788365097, + "loss": 2.6371, + "theoretical_loss": 3.6181466692481443, + "tokens_seen": 1094759424 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033753259779338015, + "loss": 2.8191, + "theoretical_loss": 3.61812640091192, + "tokens_seen": 1094824960 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033752256770310933, + "loss": 2.7634, + "theoretical_loss": 3.6181061341286074, + "tokens_seen": 1094890496 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003375125376128385, + "loss": 2.7474, + "theoretical_loss": 3.6180858688979938, + "tokens_seen": 1094956032 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033750250752256775, + "loss": 2.5913, + "theoretical_loss": 3.6180656052198676, + "tokens_seen": 1095021568 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003374924774322969, + "loss": 2.8893, + "theoretical_loss": 3.6180453430940176, + "tokens_seen": 1095087104 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003374824473420261, + "loss": 2.675, + "theoretical_loss": 3.618025082520232, + "tokens_seen": 1095152640 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033747241725175524, + "loss": 2.7979, + "theoretical_loss": 3.618004823498297, + "tokens_seen": 1095218176 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003374623871614845, + "loss": 2.8325, + "theoretical_loss": 3.617984566028004, + "tokens_seen": 1095283712 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033745235707121366, + "loss": 2.6937, + "theoretical_loss": 3.617964310109139, + "tokens_seen": 1095349248 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033744232698094284, + "loss": 2.777, + "theoretical_loss": 3.617944055741492, + "tokens_seen": 1095414784 + }, + { + "epoch": 3.06, + "learning_rate": 0.000337432296890672, + "loss": 2.5714, + "theoretical_loss": 3.6179238029248495, + "tokens_seen": 1095480320 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003374222668004012, + "loss": 2.7678, + "theoretical_loss": 3.6179035516590017, + "tokens_seen": 1095545856 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003374122367101304, + "loss": 2.6847, + "theoretical_loss": 3.617883301943736, + "tokens_seen": 1095611392 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003374022066198596, + "loss": 2.733, + "theoretical_loss": 3.6178630537788417, + "tokens_seen": 1095676928 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033739217652958874, + "loss": 2.5637, + "theoretical_loss": 3.6178428071641076, + "tokens_seen": 1095742464 + }, + { + "epoch": 3.06, + "learning_rate": 0.000337382146439318, + "loss": 2.716, + "theoretical_loss": 3.6178225620993207, + "tokens_seen": 1095808000 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003373721163490471, + "loss": 2.7915, + "theoretical_loss": 3.617802318584271, + "tokens_seen": 1095873536 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033736208625877634, + "loss": 2.8944, + "theoretical_loss": 3.6177820766187474, + "tokens_seen": 1095939072 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003373520561685055, + "loss": 2.7548, + "theoretical_loss": 3.617761836202538, + "tokens_seen": 1096004608 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003373420260782347, + "loss": 2.6252, + "theoretical_loss": 3.6177415973354314, + "tokens_seen": 1096070144 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1237677, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6507112979888916, + "objective/train/theoretical_loss": 3.6177365378606776, + "objective/train/tokens_used": 1116546528, + "theoretical_loss": 3.6177365378606776, + "tokens_seen": 1096086528 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003373319959879639, + "loss": 2.7237, + "theoretical_loss": 3.6177213600172173, + "tokens_seen": 1096135680 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003373219658976931, + "loss": 2.6584, + "theoretical_loss": 3.6177011242476835, + "tokens_seen": 1096201216 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033731193580742225, + "loss": 2.6034, + "theoretical_loss": 3.6176808900266204, + "tokens_seen": 1096266752 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003373019057171515, + "loss": 2.7871, + "theoretical_loss": 3.6176606573538153, + "tokens_seen": 1096332288 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003372918756268806, + "loss": 2.7767, + "theoretical_loss": 3.617640426229058, + "tokens_seen": 1096397824 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033728184553660984, + "loss": 2.5391, + "theoretical_loss": 3.6176201966521373, + "tokens_seen": 1096463360 + }, + { + "epoch": 3.06, + "learning_rate": 0.000337271815446339, + "loss": 2.7906, + "theoretical_loss": 3.6175999686228426, + "tokens_seen": 1096528896 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003372617853560682, + "loss": 2.8622, + "theoretical_loss": 3.6175797421409626, + "tokens_seen": 1096594432 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003372517552657974, + "loss": 2.6529, + "theoretical_loss": 3.6175595172062867, + "tokens_seen": 1096659968 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033724172517552657, + "loss": 2.5864, + "theoretical_loss": 3.617539293818605, + "tokens_seen": 1096725504 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033723169508525575, + "loss": 2.6748, + "theoretical_loss": 3.6175190719777044, + "tokens_seen": 1096791040 + }, + { + "epoch": 3.06, + "learning_rate": 0.000337221664994985, + "loss": 2.7057, + "theoretical_loss": 3.6174988516833766, + "tokens_seen": 1096856576 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003372116349047141, + "loss": 2.7422, + "theoretical_loss": 3.617478632935409, + "tokens_seen": 1096922112 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033720160481444335, + "loss": 2.813, + "theoretical_loss": 3.617458415733593, + "tokens_seen": 1096987648 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003371915747241725, + "loss": 2.785, + "theoretical_loss": 3.617438200077716, + "tokens_seen": 1097053184 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003371815446339017, + "loss": 2.5565, + "theoretical_loss": 3.617417985967569, + "tokens_seen": 1097118720 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003371715145436309, + "loss": 2.7586, + "theoretical_loss": 3.6173977734029403, + "tokens_seen": 1097184256 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033716148445336007, + "loss": 2.7171, + "theoretical_loss": 3.6173775623836204, + "tokens_seen": 1097249792 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033715145436308925, + "loss": 2.8573, + "theoretical_loss": 3.6173573529093983, + "tokens_seen": 1097315328 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003371414242728185, + "loss": 2.4732, + "theoretical_loss": 3.6173371449800635, + "tokens_seen": 1097380864 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003371313941825476, + "loss": 2.6875, + "theoretical_loss": 3.6173169385954065, + "tokens_seen": 1097446400 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033712136409227685, + "loss": 2.6837, + "theoretical_loss": 3.6172967337552153, + "tokens_seen": 1097511936 + }, + { + "epoch": 3.06, + "learning_rate": 0.000337111334002006, + "loss": 2.6919, + "theoretical_loss": 3.6172765304592813, + "tokens_seen": 1097577472 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003371013039117352, + "loss": 2.5633, + "theoretical_loss": 3.617256328707394, + "tokens_seen": 1097643008 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003370912738214644, + "loss": 2.4857, + "theoretical_loss": 3.617236128499343, + "tokens_seen": 1097708544 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1238996, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.87542986869812, + "objective/train/theoretical_loss": 3.6172310786885298, + "objective/train/tokens_used": 1118184928, + "theoretical_loss": 3.6172310786885298, + "tokens_seen": 1097724928 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003370812437311936, + "loss": 2.9281, + "theoretical_loss": 3.6172159298349174, + "tokens_seen": 1097774080 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003370712136409228, + "loss": 2.849, + "theoretical_loss": 3.617195732713908, + "tokens_seen": 1097839616 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033706118355065194, + "loss": 2.582, + "theoretical_loss": 3.617175537136105, + "tokens_seen": 1097905152 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033705115346038117, + "loss": 2.6484, + "theoretical_loss": 3.617155343101297, + "tokens_seen": 1097970688 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033704112337011035, + "loss": 2.8651, + "theoretical_loss": 3.6171351506092755, + "tokens_seen": 1098036224 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033703109327983953, + "loss": 2.8084, + "theoretical_loss": 3.61711495965983, + "tokens_seen": 1098101760 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003370210631895687, + "loss": 2.72, + "theoretical_loss": 3.6170947702527503, + "tokens_seen": 1098167296 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033701103309929795, + "loss": 2.5411, + "theoretical_loss": 3.6170745823878274, + "tokens_seen": 1098232832 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003370010030090271, + "loss": 2.634, + "theoretical_loss": 3.6170543960648507, + "tokens_seen": 1098298368 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003369909729187563, + "loss": 2.6711, + "theoretical_loss": 3.6170342112836105, + "tokens_seen": 1098363904 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033698094282848544, + "loss": 2.7764, + "theoretical_loss": 3.6170140280438967, + "tokens_seen": 1098429440 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003369709127382147, + "loss": 2.6749, + "theoretical_loss": 3.6169938463455003, + "tokens_seen": 1098494976 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033696088264794386, + "loss": 2.9666, + "theoretical_loss": 3.616973666188212, + "tokens_seen": 1098560512 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033695085255767304, + "loss": 2.7339, + "theoretical_loss": 3.6169534875718217, + "tokens_seen": 1098626048 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003369408224674022, + "loss": 2.6009, + "theoretical_loss": 3.616933310496119, + "tokens_seen": 1098691584 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003369307923771314, + "loss": 2.4912, + "theoretical_loss": 3.616913134960896, + "tokens_seen": 1098757120 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003369207622868606, + "loss": 2.8986, + "theoretical_loss": 3.6168929609659424, + "tokens_seen": 1098822656 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003369107321965898, + "loss": 2.7956, + "theoretical_loss": 3.6168727885110483, + "tokens_seen": 1098888192 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033690070210631894, + "loss": 2.6026, + "theoretical_loss": 3.6168526175960047, + "tokens_seen": 1098953728 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003368906720160482, + "loss": 3.0197, + "theoretical_loss": 3.6168324482206025, + "tokens_seen": 1099019264 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003368806419257773, + "loss": 2.8678, + "theoretical_loss": 3.6168122803846323, + "tokens_seen": 1099084800 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033687061183550654, + "loss": 2.5931, + "theoretical_loss": 3.6167921140878843, + "tokens_seen": 1099150336 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003368605817452357, + "loss": 2.5982, + "theoretical_loss": 3.6167719493301496, + "tokens_seen": 1099215872 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003368505516549649, + "loss": 2.6875, + "theoretical_loss": 3.6167517861112195, + "tokens_seen": 1099281408 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003368405215646941, + "loss": 2.6566, + "theoretical_loss": 3.6167316244308845, + "tokens_seen": 1099346944 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1239758, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1782515048980713, + "objective/train/theoretical_loss": 3.6167265842511815, + "objective/train/tokens_used": 1119823328, + "theoretical_loss": 3.6167265842511815, + "tokens_seen": 1099363328 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003368304914744233, + "loss": 2.6187, + "theoretical_loss": 3.616711464288935, + "tokens_seen": 1099412480 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033682046138415245, + "loss": 2.8087, + "theoretical_loss": 3.616691305685162, + "tokens_seen": 1099478016 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003368104312938817, + "loss": 2.7666, + "theoretical_loss": 3.616671148619357, + "tokens_seen": 1099543552 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003368004012036108, + "loss": 2.8175, + "theoretical_loss": 3.6166509930913113, + "tokens_seen": 1099609088 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033679037111334004, + "loss": 2.6504, + "theoretical_loss": 3.6166308391008153, + "tokens_seen": 1099674624 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003367803410230692, + "loss": 2.7364, + "theoretical_loss": 3.6166106866476593, + "tokens_seen": 1099740160 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003367703109327984, + "loss": 2.6164, + "theoretical_loss": 3.6165905357316364, + "tokens_seen": 1099805696 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003367602808425276, + "loss": 2.7391, + "theoretical_loss": 3.616570386352536, + "tokens_seen": 1099871232 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033675025075225677, + "loss": 2.539, + "theoretical_loss": 3.6165502385101505, + "tokens_seen": 1099936768 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033674022066198595, + "loss": 2.6233, + "theoretical_loss": 3.61653009220427, + "tokens_seen": 1100002304 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003367301905717152, + "loss": 2.7204, + "theoretical_loss": 3.616509947434687, + "tokens_seen": 1100067840 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003367201604814443, + "loss": 2.6064, + "theoretical_loss": 3.6164898042011924, + "tokens_seen": 1100133376 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033671013039117355, + "loss": 2.7523, + "theoretical_loss": 3.616469662503577, + "tokens_seen": 1100198912 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033670010030090273, + "loss": 2.7206, + "theoretical_loss": 3.616449522341633, + "tokens_seen": 1100264448 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003366900702106319, + "loss": 2.7963, + "theoretical_loss": 3.6164293837151513, + "tokens_seen": 1100329984 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003366800401203611, + "loss": 2.5174, + "theoretical_loss": 3.616409246623924, + "tokens_seen": 1100395520 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033667001003009027, + "loss": 2.8013, + "theoretical_loss": 3.6163891110677415, + "tokens_seen": 1100461056 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033665997993981945, + "loss": 2.6539, + "theoretical_loss": 3.6163689770463967, + "tokens_seen": 1100526592 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003366499498495487, + "loss": 2.6839, + "theoretical_loss": 3.6163488445596808, + "tokens_seen": 1100592128 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003366399197592778, + "loss": 2.7934, + "theoretical_loss": 3.616328713607385, + "tokens_seen": 1100657664 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033662988966900705, + "loss": 2.7716, + "theoretical_loss": 3.6163085841893015, + "tokens_seen": 1100723200 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003366198595787362, + "loss": 2.7189, + "theoretical_loss": 3.616288456305222, + "tokens_seen": 1100788736 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003366098294884654, + "loss": 2.5594, + "theoretical_loss": 3.616268329954938, + "tokens_seen": 1100854272 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003365997993981946, + "loss": 2.6865, + "theoretical_loss": 3.616248205138241, + "tokens_seen": 1100919808 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003365897693079238, + "loss": 2.812, + "theoretical_loss": 3.616228081854924, + "tokens_seen": 1100985344 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1241137, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8605268001556396, + "objective/train/theoretical_loss": 3.6162230512736606, + "objective/train/tokens_used": 1121461728, + "theoretical_loss": 3.6162230512736606, + "tokens_seen": 1101001728 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033657973921765295, + "loss": 2.9682, + "theoretical_loss": 3.616207960104778, + "tokens_seen": 1101050880 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033656970912738214, + "loss": 2.8834, + "theoretical_loss": 3.616187839887595, + "tokens_seen": 1101116416 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003365596790371113, + "loss": 2.8656, + "theoretical_loss": 3.6161677212031673, + "tokens_seen": 1101181952 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033654964894684055, + "loss": 2.6308, + "theoretical_loss": 3.616147604051287, + "tokens_seen": 1101247488 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003365396188565697, + "loss": 2.7918, + "theoretical_loss": 3.6161274884317454, + "tokens_seen": 1101313024 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003365295887662989, + "loss": 2.6168, + "theoretical_loss": 3.6161073743443355, + "tokens_seen": 1101378560 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003365195586760281, + "loss": 2.7806, + "theoretical_loss": 3.616087261788849, + "tokens_seen": 1101444096 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003365095285857573, + "loss": 2.6078, + "theoretical_loss": 3.6160671507650783, + "tokens_seen": 1101509632 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033649949849548646, + "loss": 2.5558, + "theoretical_loss": 3.6160470412728154, + "tokens_seen": 1101575168 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033648946840521564, + "loss": 3.0053, + "theoretical_loss": 3.616026933311853, + "tokens_seen": 1101640704 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003364794383149448, + "loss": 2.5638, + "theoretical_loss": 3.6160068268819825, + "tokens_seen": 1101706240 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033646940822467406, + "loss": 2.4866, + "theoretical_loss": 3.6159867219829973, + "tokens_seen": 1101771776 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003364593781344032, + "loss": 2.546, + "theoretical_loss": 3.615966618614689, + "tokens_seen": 1101837312 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003364493480441324, + "loss": 2.6268, + "theoretical_loss": 3.6159465167768507, + "tokens_seen": 1101902848 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033643931795386154, + "loss": 2.7514, + "theoretical_loss": 3.615926416469274, + "tokens_seen": 1101968384 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003364292878635908, + "loss": 2.7597, + "theoretical_loss": 3.6159063176917527, + "tokens_seen": 1102033920 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033641925777331996, + "loss": 2.7634, + "theoretical_loss": 3.615886220444078, + "tokens_seen": 1102099456 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033640922768304914, + "loss": 2.8582, + "theoretical_loss": 3.6158661247260433, + "tokens_seen": 1102164992 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003363991975927783, + "loss": 2.7127, + "theoretical_loss": 3.6158460305374405, + "tokens_seen": 1102230528 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003363891675025075, + "loss": 2.6674, + "theoretical_loss": 3.6158259378780633, + "tokens_seen": 1102296064 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003363791374122367, + "loss": 2.9135, + "theoretical_loss": 3.615805846747704, + "tokens_seen": 1102361600 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003363691073219659, + "loss": 2.7931, + "theoretical_loss": 3.6157857571461554, + "tokens_seen": 1102427136 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033635907723169505, + "loss": 2.7545, + "theoretical_loss": 3.61576566907321, + "tokens_seen": 1102492672 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003363490471414243, + "loss": 2.4555, + "theoretical_loss": 3.6157455825286604, + "tokens_seen": 1102558208 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033633901705115346, + "loss": 2.8385, + "theoretical_loss": 3.6157254975123, + "tokens_seen": 1102623744 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1241790, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5041520595550537, + "objective/train/theoretical_loss": 3.615720476496965, + "objective/train/tokens_used": 1123100128, + "theoretical_loss": 3.615720476496965, + "tokens_seen": 1102640128 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033632898696088265, + "loss": 2.6422, + "theoretical_loss": 3.6157054140239215, + "tokens_seen": 1102689280 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003363189568706119, + "loss": 2.8461, + "theoretical_loss": 3.615685332063318, + "tokens_seen": 1102754816 + }, + { + "epoch": 3.06, + "learning_rate": 0.000336308926780341, + "loss": 2.71, + "theoretical_loss": 3.6156652516302827, + "tokens_seen": 1102820352 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033629889669007024, + "loss": 2.789, + "theoretical_loss": 3.615645172724608, + "tokens_seen": 1102885888 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003362888665997994, + "loss": 2.747, + "theoretical_loss": 3.6156250953460876, + "tokens_seen": 1102951424 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003362788365095286, + "loss": 2.6397, + "theoretical_loss": 3.615605019494514, + "tokens_seen": 1103016960 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003362688064192578, + "loss": 2.6001, + "theoretical_loss": 3.6155849451696813, + "tokens_seen": 1103082496 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033625877632898697, + "loss": 2.4701, + "theoretical_loss": 3.615564872371382, + "tokens_seen": 1103148032 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033624874623871615, + "loss": 2.6813, + "theoretical_loss": 3.615544801099409, + "tokens_seen": 1103213568 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003362387161484454, + "loss": 2.7879, + "theoretical_loss": 3.615524731353556, + "tokens_seen": 1103279104 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003362286860581745, + "loss": 2.6216, + "theoretical_loss": 3.615504663133617, + "tokens_seen": 1103344640 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033621865596790375, + "loss": 2.383, + "theoretical_loss": 3.615484596439384, + "tokens_seen": 1103410176 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033620862587763293, + "loss": 2.4897, + "theoretical_loss": 3.6154645312706513, + "tokens_seen": 1103475712 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003361985957873621, + "loss": 2.7389, + "theoretical_loss": 3.615444467627212, + "tokens_seen": 1103541248 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003361885656970913, + "loss": 2.7377, + "theoretical_loss": 3.6154244055088602, + "tokens_seen": 1103606784 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033617853560682047, + "loss": 2.8105, + "theoretical_loss": 3.6154043449153885, + "tokens_seen": 1103672320 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033616850551654965, + "loss": 2.6996, + "theoretical_loss": 3.615384285846591, + "tokens_seen": 1103737856 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003361584754262789, + "loss": 2.4126, + "theoretical_loss": 3.6153642283022616, + "tokens_seen": 1103803392 + }, + { + "epoch": 3.06, + "learning_rate": 0.000336148445336008, + "loss": 2.6139, + "theoretical_loss": 3.6153441722821933, + "tokens_seen": 1103868928 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033613841524573725, + "loss": 2.6361, + "theoretical_loss": 3.6153241177861797, + "tokens_seen": 1103934464 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003361283851554664, + "loss": 2.4851, + "theoretical_loss": 3.615304064814015, + "tokens_seen": 1104000000 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003361183550651956, + "loss": 2.6684, + "theoretical_loss": 3.615284013365493, + "tokens_seen": 1104065536 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003361083249749248, + "loss": 2.7077, + "theoretical_loss": 3.615263963440407, + "tokens_seen": 1104131072 + }, + { + "epoch": 3.06, + "learning_rate": 0.000336098294884654, + "loss": 2.7662, + "theoretical_loss": 3.615243915038551, + "tokens_seen": 1104196608 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033608826479438316, + "loss": 2.6451, + "theoretical_loss": 3.61522386815972, + "tokens_seen": 1104262144 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1242899, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7701282501220703, + "objective/train/theoretical_loss": 3.61521885667796, + "objective/train/tokens_used": 1124738528, + "theoretical_loss": 3.61521885667796, + "tokens_seen": 1104278528 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033607823470411234, + "loss": 2.5712, + "theoretical_loss": 3.6152038228037062, + "tokens_seen": 1104327680 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003360682046138415, + "loss": 2.7366, + "theoretical_loss": 3.6151837789703043, + "tokens_seen": 1104393216 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033605817452357075, + "loss": 2.6724, + "theoretical_loss": 3.6151637366593086, + "tokens_seen": 1104458752 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003360481444332999, + "loss": 2.6507, + "theoretical_loss": 3.615143695870512, + "tokens_seen": 1104524288 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003360381143430291, + "loss": 2.8124, + "theoretical_loss": 3.6151236566037106, + "tokens_seen": 1104589824 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003360280842527583, + "loss": 2.6883, + "theoretical_loss": 3.6151036188586967, + "tokens_seen": 1104655360 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003360180541624875, + "loss": 2.6271, + "theoretical_loss": 3.6150835826352656, + "tokens_seen": 1104720896 + }, + { + "epoch": 3.06, + "learning_rate": 0.00033600802407221666, + "loss": 2.679, + "theoretical_loss": 3.6150635479332105, + "tokens_seen": 1104786432 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033599799398194584, + "loss": 2.6898, + "theoretical_loss": 3.615043514752326, + "tokens_seen": 1104851968 + }, + { + "epoch": 3.07, + "learning_rate": 0.000335987963891675, + "loss": 2.6468, + "theoretical_loss": 3.615023483092407, + "tokens_seen": 1104917504 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033597793380140426, + "loss": 2.7967, + "theoretical_loss": 3.6150034529532475, + "tokens_seen": 1104983040 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003359679037111334, + "loss": 2.8978, + "theoretical_loss": 3.6149834243346417, + "tokens_seen": 1105048576 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003359578736208626, + "loss": 2.6337, + "theoretical_loss": 3.6149633972363837, + "tokens_seen": 1105114112 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033594784353059174, + "loss": 2.6396, + "theoretical_loss": 3.614943371658268, + "tokens_seen": 1105179648 + }, + { + "epoch": 3.07, + "learning_rate": 0.000335937813440321, + "loss": 2.8301, + "theoretical_loss": 3.61492334760009, + "tokens_seen": 1105245184 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033592778335005016, + "loss": 3.037, + "theoretical_loss": 3.6149033250616434, + "tokens_seen": 1105310720 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033591775325977934, + "loss": 2.6608, + "theoretical_loss": 3.6148833040427224, + "tokens_seen": 1105376256 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003359077231695085, + "loss": 2.8969, + "theoretical_loss": 3.614863284543123, + "tokens_seen": 1105441792 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003358976930792377, + "loss": 2.7132, + "theoretical_loss": 3.6148432665626387, + "tokens_seen": 1105507328 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003358876629889669, + "loss": 2.4359, + "theoretical_loss": 3.6148232501010638, + "tokens_seen": 1105572864 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003358776328986961, + "loss": 2.9279, + "theoretical_loss": 3.6148032351581945, + "tokens_seen": 1105638400 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033586760280842525, + "loss": 2.7633, + "theoretical_loss": 3.6147832217338243, + "tokens_seen": 1105703936 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003358575727181545, + "loss": 2.7452, + "theoretical_loss": 3.614763209827749, + "tokens_seen": 1105769472 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033584754262788366, + "loss": 2.6413, + "theoretical_loss": 3.6147431994397623, + "tokens_seen": 1105835008 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033583751253761285, + "loss": 2.5344, + "theoretical_loss": 3.61472319056966, + "tokens_seen": 1105900544 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1243613, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9853389263153076, + "objective/train/theoretical_loss": 3.6147181885892796, + "objective/train/tokens_used": 1126376928, + "theoretical_loss": 3.6147181885892796, + "tokens_seen": 1105916928 + }, + { + "epoch": 3.07, + "learning_rate": 0.000335827482447342, + "loss": 2.7355, + "theoretical_loss": 3.6147031832172365, + "tokens_seen": 1105966080 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003358174523570712, + "loss": 2.6175, + "theoretical_loss": 3.614683177382287, + "tokens_seen": 1106031616 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003358074222668004, + "loss": 2.9476, + "theoretical_loss": 3.6146631730646064, + "tokens_seen": 1106097152 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003357973921765296, + "loss": 2.7897, + "theoretical_loss": 3.6146431702639896, + "tokens_seen": 1106162688 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033578736208625875, + "loss": 2.6465, + "theoretical_loss": 3.6146231689802324, + "tokens_seen": 1106228224 + }, + { + "epoch": 3.07, + "learning_rate": 0.000335777331995988, + "loss": 2.6784, + "theoretical_loss": 3.6146031692131295, + "tokens_seen": 1106293760 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003357673019057171, + "loss": 2.7595, + "theoretical_loss": 3.614583170962475, + "tokens_seen": 1106359296 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033575727181544635, + "loss": 2.5581, + "theoretical_loss": 3.614563174228066, + "tokens_seen": 1106424832 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033574724172517553, + "loss": 2.7814, + "theoretical_loss": 3.6145431790096962, + "tokens_seen": 1106490368 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003357372116349047, + "loss": 2.6407, + "theoretical_loss": 3.6145231853071618, + "tokens_seen": 1106555904 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003357271815446339, + "loss": 2.7503, + "theoretical_loss": 3.6145031931202576, + "tokens_seen": 1106621440 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033571715145436313, + "loss": 2.794, + "theoretical_loss": 3.6144832024487794, + "tokens_seen": 1106686976 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033570712136409225, + "loss": 2.765, + "theoretical_loss": 3.614463213292522, + "tokens_seen": 1106752512 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003356970912738215, + "loss": 2.7039, + "theoretical_loss": 3.6144432256512817, + "tokens_seen": 1106818048 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003356870611835506, + "loss": 2.5687, + "theoretical_loss": 3.6144232395248532, + "tokens_seen": 1106883584 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033567703109327985, + "loss": 2.7355, + "theoretical_loss": 3.6144032549130323, + "tokens_seen": 1106949120 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033566700100300903, + "loss": 2.7997, + "theoretical_loss": 3.6143832718156146, + "tokens_seen": 1107014656 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003356569709127382, + "loss": 2.6971, + "theoretical_loss": 3.6143632902323954, + "tokens_seen": 1107080192 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003356469408224674, + "loss": 2.8226, + "theoretical_loss": 3.6143433101631706, + "tokens_seen": 1107145728 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003356369107321966, + "loss": 2.458, + "theoretical_loss": 3.614323331607736, + "tokens_seen": 1107211264 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033562688064192576, + "loss": 2.6713, + "theoretical_loss": 3.6143033545658874, + "tokens_seen": 1107276800 + }, + { + "epoch": 3.07, + "learning_rate": 0.000335616850551655, + "loss": 2.5748, + "theoretical_loss": 3.61428337903742, + "tokens_seen": 1107342336 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003356068204613841, + "loss": 2.8478, + "theoretical_loss": 3.61426340502213, + "tokens_seen": 1107407872 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033559679037111336, + "loss": 2.8482, + "theoretical_loss": 3.614243432519813, + "tokens_seen": 1107473408 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003355867602808425, + "loss": 2.7905, + "theoretical_loss": 3.6142234615302655, + "tokens_seen": 1107538944 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1244863, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.85904860496521, + "objective/train/theoretical_loss": 3.6142184690192245, + "objective/train/tokens_used": 1128015328, + "theoretical_loss": 3.6142184690192245, + "tokens_seen": 1107555328 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003355767301905717, + "loss": 2.8847, + "theoretical_loss": 3.6142034920532824, + "tokens_seen": 1107604480 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033556670010030095, + "loss": 2.7116, + "theoretical_loss": 3.6141835240886606, + "tokens_seen": 1107670016 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003355566700100301, + "loss": 2.6555, + "theoretical_loss": 3.6141635576361955, + "tokens_seen": 1107735552 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003355466399197593, + "loss": 2.5725, + "theoretical_loss": 3.6141435926956835, + "tokens_seen": 1107801088 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003355366098294885, + "loss": 2.6107, + "theoretical_loss": 3.6141236292669205, + "tokens_seen": 1107866624 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003355265797392177, + "loss": 2.4844, + "theoretical_loss": 3.6141036673497027, + "tokens_seen": 1107932160 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033551654964894686, + "loss": 2.7763, + "theoretical_loss": 3.6140837069438256, + "tokens_seen": 1107997696 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033550651955867604, + "loss": 2.7287, + "theoretical_loss": 3.6140637480490865, + "tokens_seen": 1108063232 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003354964894684052, + "loss": 2.6421, + "theoretical_loss": 3.6140437906652805, + "tokens_seen": 1108128768 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033548645937813446, + "loss": 2.595, + "theoretical_loss": 3.614023834792205, + "tokens_seen": 1108194304 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003354764292878636, + "loss": 2.7723, + "theoretical_loss": 3.6140038804296557, + "tokens_seen": 1108259840 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003354663991975928, + "loss": 2.8769, + "theoretical_loss": 3.6139839275774293, + "tokens_seen": 1108325376 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033545636910732195, + "loss": 2.587, + "theoretical_loss": 3.613963976235321, + "tokens_seen": 1108390912 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003354463390170512, + "loss": 2.7087, + "theoretical_loss": 3.613944026403129, + "tokens_seen": 1108456448 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033543630892678036, + "loss": 2.8548, + "theoretical_loss": 3.6139240780806485, + "tokens_seen": 1108521984 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033542627883650954, + "loss": 2.5704, + "theoretical_loss": 3.6139041312676765, + "tokens_seen": 1108587520 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003354162487462387, + "loss": 2.7445, + "theoretical_loss": 3.61388418596401, + "tokens_seen": 1108653056 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003354062186559679, + "loss": 2.8618, + "theoretical_loss": 3.613864242169444, + "tokens_seen": 1108718592 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003353961885656971, + "loss": 2.6855, + "theoretical_loss": 3.6138442998837768, + "tokens_seen": 1108784128 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003353861584754263, + "loss": 2.51, + "theoretical_loss": 3.613824359106805, + "tokens_seen": 1108849664 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033537612838515545, + "loss": 2.6772, + "theoretical_loss": 3.6138044198383237, + "tokens_seen": 1108915200 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003353660982948847, + "loss": 2.6674, + "theoretical_loss": 3.613784482078131, + "tokens_seen": 1108980736 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033535606820461386, + "loss": 2.6406, + "theoretical_loss": 3.6137645458260237, + "tokens_seen": 1109046272 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033534603811434305, + "loss": 2.678, + "theoretical_loss": 3.613744611081798, + "tokens_seen": 1109111808 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003353360080240722, + "loss": 2.7416, + "theoretical_loss": 3.613724677845251, + "tokens_seen": 1109177344 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1245226, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.714931011199951, + "objective/train/theoretical_loss": 3.6137196947716657, + "objective/train/tokens_used": 1129653728, + "theoretical_loss": 3.6137196947716657, + "tokens_seen": 1109193728 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003353259779338014, + "loss": 2.8707, + "theoretical_loss": 3.6137047461161798, + "tokens_seen": 1109242880 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003353159478435306, + "loss": 2.908, + "theoretical_loss": 3.6136848158943815, + "tokens_seen": 1109308416 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003353059177532598, + "loss": 2.5868, + "theoretical_loss": 3.6136648871796524, + "tokens_seen": 1109373952 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033529588766298895, + "loss": 2.7557, + "theoretical_loss": 3.61364495997179, + "tokens_seen": 1109439488 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003352858575727182, + "loss": 2.4417, + "theoretical_loss": 3.6136250342705907, + "tokens_seen": 1109505024 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003352758274824473, + "loss": 2.693, + "theoretical_loss": 3.613605110075853, + "tokens_seen": 1109570560 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033526579739217655, + "loss": 2.9798, + "theoretical_loss": 3.6135851873873728, + "tokens_seen": 1109636096 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033525576730190573, + "loss": 2.6654, + "theoretical_loss": 3.613565266204948, + "tokens_seen": 1109701632 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003352457372116349, + "loss": 2.5048, + "theoretical_loss": 3.613545346528375, + "tokens_seen": 1109767168 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003352357071213641, + "loss": 2.8293, + "theoretical_loss": 3.613525428357452, + "tokens_seen": 1109832704 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033522567703109333, + "loss": 2.658, + "theoretical_loss": 3.6135055116919754, + "tokens_seen": 1109898240 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033521564694082245, + "loss": 2.6935, + "theoretical_loss": 3.6134855965317434, + "tokens_seen": 1109963776 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003352056168505517, + "loss": 2.8863, + "theoretical_loss": 3.6134656828765523, + "tokens_seen": 1110029312 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003351955867602808, + "loss": 2.6411, + "theoretical_loss": 3.613445770726201, + "tokens_seen": 1110094848 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033518555667001005, + "loss": 2.7521, + "theoretical_loss": 3.613425860080486, + "tokens_seen": 1110160384 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033517552657973923, + "loss": 2.4854, + "theoretical_loss": 3.6134059509392045, + "tokens_seen": 1110225920 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003351654964894684, + "loss": 2.8907, + "theoretical_loss": 3.6133860433021545, + "tokens_seen": 1110291456 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003351554663991976, + "loss": 2.616, + "theoretical_loss": 3.613366137169134, + "tokens_seen": 1110356992 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003351454363089268, + "loss": 2.7054, + "theoretical_loss": 3.6133462325399393, + "tokens_seen": 1110422528 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033513540621865596, + "loss": 2.7419, + "theoretical_loss": 3.6133263294143694, + "tokens_seen": 1110488064 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003351253761283852, + "loss": 2.721, + "theoretical_loss": 3.613306427792221, + "tokens_seen": 1110553600 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003351153460381143, + "loss": 2.791, + "theoretical_loss": 3.6132865276732926, + "tokens_seen": 1110619136 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033510531594784356, + "loss": 2.5685, + "theoretical_loss": 3.6132666290573816, + "tokens_seen": 1110684672 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003350952858575727, + "loss": 2.8731, + "theoretical_loss": 3.613246731944286, + "tokens_seen": 1110750208 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003350852557673019, + "loss": 2.5971, + "theoretical_loss": 3.613226836333803, + "tokens_seen": 1110815744 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1246492, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.347496747970581, + "objective/train/theoretical_loss": 3.6132218626659425, + "objective/train/tokens_used": 1131292128, + "theoretical_loss": 3.6132218626659425, + "tokens_seen": 1110832128 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003350752256770311, + "loss": 2.4939, + "theoretical_loss": 3.6132069422257316, + "tokens_seen": 1110881280 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003350651955867603, + "loss": 2.3556, + "theoretical_loss": 3.613187049619869, + "tokens_seen": 1110946816 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033505516549648946, + "loss": 2.7816, + "theoretical_loss": 3.6131671585160126, + "tokens_seen": 1111012352 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003350451354062187, + "loss": 2.3472, + "theoretical_loss": 3.6131472689139614, + "tokens_seen": 1111077888 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003350351053159478, + "loss": 2.8442, + "theoretical_loss": 3.6131273808135136, + "tokens_seen": 1111143424 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033502507522567706, + "loss": 2.6176, + "theoretical_loss": 3.613107494214466, + "tokens_seen": 1111208960 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003350150451354062, + "loss": 2.7964, + "theoretical_loss": 3.6130876091166177, + "tokens_seen": 1111274496 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003350050150451354, + "loss": 2.5386, + "theoretical_loss": 3.613067725519767, + "tokens_seen": 1111340032 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003349949849548646, + "loss": 2.7889, + "theoretical_loss": 3.613047843423711, + "tokens_seen": 1111405568 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003349849548645938, + "loss": 2.5481, + "theoretical_loss": 3.61302796282825, + "tokens_seen": 1111471104 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033497492477432296, + "loss": 2.804, + "theoretical_loss": 3.6130080837331797, + "tokens_seen": 1111536640 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033496489468405215, + "loss": 2.6183, + "theoretical_loss": 3.6129882061383, + "tokens_seen": 1111602176 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003349548645937813, + "loss": 2.6518, + "theoretical_loss": 3.612968330043409, + "tokens_seen": 1111667712 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033494483450351056, + "loss": 2.731, + "theoretical_loss": 3.612948455448305, + "tokens_seen": 1111733248 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003349348044132397, + "loss": 2.8229, + "theoretical_loss": 3.6129285823527866, + "tokens_seen": 1111798784 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003349247743229689, + "loss": 2.8859, + "theoretical_loss": 3.6129087107566518, + "tokens_seen": 1111864320 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033491474423269805, + "loss": 2.5248, + "theoretical_loss": 3.6128888406597, + "tokens_seen": 1111929856 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003349047141424273, + "loss": 2.869, + "theoretical_loss": 3.612868972061728, + "tokens_seen": 1111995392 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033489468405215647, + "loss": 2.6972, + "theoretical_loss": 3.6128491049625366, + "tokens_seen": 1112060928 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033488465396188565, + "loss": 2.6138, + "theoretical_loss": 3.612829239361923, + "tokens_seen": 1112126464 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033487462387161483, + "loss": 2.9363, + "theoretical_loss": 3.612809375259686, + "tokens_seen": 1112192000 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033486459378134406, + "loss": 2.9163, + "theoretical_loss": 3.612789512655625, + "tokens_seen": 1112257536 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003348545636910732, + "loss": 2.9406, + "theoretical_loss": 3.612769651549538, + "tokens_seen": 1112323072 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003348445336008024, + "loss": 2.6882, + "theoretical_loss": 3.6127497919412237, + "tokens_seen": 1112388608 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003348345035105316, + "loss": 2.8227, + "theoretical_loss": 3.6127299338304812, + "tokens_seen": 1112454144 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1247248, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.9692270755767822, + "objective/train/theoretical_loss": 3.6127249695367682, + "objective/train/tokens_used": 1132930528, + "theoretical_loss": 3.6127249695367682, + "tokens_seen": 1112470528 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003348244734202608, + "loss": 2.3856, + "theoretical_loss": 3.61271007721711, + "tokens_seen": 1112519680 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033481444332999, + "loss": 2.7777, + "theoretical_loss": 3.612690222100908, + "tokens_seen": 1112585216 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033480441323971915, + "loss": 2.5555, + "theoretical_loss": 3.612670368481675, + "tokens_seen": 1112650752 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003347943831494484, + "loss": 2.7394, + "theoretical_loss": 3.612650516359209, + "tokens_seen": 1112716288 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003347843530591775, + "loss": 2.5448, + "theoretical_loss": 3.6126306657333105, + "tokens_seen": 1112781824 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033477432296890675, + "loss": 2.6602, + "theoretical_loss": 3.6126108166037767, + "tokens_seen": 1112847360 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033476429287863593, + "loss": 2.5948, + "theoretical_loss": 3.612590968970408, + "tokens_seen": 1112912896 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003347542627883651, + "loss": 2.8068, + "theoretical_loss": 3.6125711228330024, + "tokens_seen": 1112978432 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003347442326980943, + "loss": 2.6638, + "theoretical_loss": 3.6125512781913605, + "tokens_seen": 1113043968 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033473420260782353, + "loss": 2.8068, + "theoretical_loss": 3.6125314350452804, + "tokens_seen": 1113109504 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033472417251755265, + "loss": 2.7284, + "theoretical_loss": 3.6125115933945624, + "tokens_seen": 1113175040 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003347141424272819, + "loss": 2.6465, + "theoretical_loss": 3.612491753239004, + "tokens_seen": 1113240576 + }, + { + "epoch": 3.07, + "learning_rate": 0.000334704112337011, + "loss": 2.5866, + "theoretical_loss": 3.6124719145784066, + "tokens_seen": 1113306112 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033469408224674025, + "loss": 2.776, + "theoretical_loss": 3.612452077412568, + "tokens_seen": 1113371648 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033468405215646943, + "loss": 2.5994, + "theoretical_loss": 3.612432241741289, + "tokens_seen": 1113437184 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003346740220661986, + "loss": 2.7386, + "theoretical_loss": 3.612412407564367, + "tokens_seen": 1113502720 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003346639919759278, + "loss": 2.4634, + "theoretical_loss": 3.6123925748816035, + "tokens_seen": 1113568256 + }, + { + "epoch": 3.07, + "learning_rate": 0.000334653961885657, + "loss": 2.6224, + "theoretical_loss": 3.6123727436927977, + "tokens_seen": 1113633792 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033464393179538616, + "loss": 2.8138, + "theoretical_loss": 3.612352913997748, + "tokens_seen": 1113699328 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003346339017051154, + "loss": 2.7124, + "theoretical_loss": 3.612333085796255, + "tokens_seen": 1113764864 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003346238716148445, + "loss": 2.6895, + "theoretical_loss": 3.6123132590881175, + "tokens_seen": 1113830400 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033461384152457376, + "loss": 2.6007, + "theoretical_loss": 3.612293433873136, + "tokens_seen": 1113895936 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003346038114343029, + "loss": 2.5594, + "theoretical_loss": 3.61227361015111, + "tokens_seen": 1113961472 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003345937813440321, + "loss": 2.8955, + "theoretical_loss": 3.612253787921839, + "tokens_seen": 1114027008 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003345837512537613, + "loss": 2.637, + "theoretical_loss": 3.612233967185123, + "tokens_seen": 1114092544 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1248654, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.329355001449585, + "objective/train/theoretical_loss": 3.612229012234132, + "objective/train/tokens_used": 1134568928, + "theoretical_loss": 3.612229012234132, + "tokens_seen": 1114108928 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003345737211634905, + "loss": 2.763, + "theoretical_loss": 3.6122141479407617, + "tokens_seen": 1114158080 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033456369107321966, + "loss": 2.599, + "theoretical_loss": 3.6121943301885553, + "tokens_seen": 1114223616 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003345536609829489, + "loss": 2.896, + "theoretical_loss": 3.612174513928303, + "tokens_seen": 1114289152 + }, + { + "epoch": 3.07, + "learning_rate": 0.000334543630892678, + "loss": 2.5523, + "theoretical_loss": 3.612154699159806, + "tokens_seen": 1114354688 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033453360080240726, + "loss": 2.5513, + "theoretical_loss": 3.6121348858828624, + "tokens_seen": 1114420224 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003345235707121364, + "loss": 2.6675, + "theoretical_loss": 3.612115074097274, + "tokens_seen": 1114485760 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003345135406218656, + "loss": 2.7643, + "theoretical_loss": 3.6120952638028396, + "tokens_seen": 1114551296 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003345035105315948, + "loss": 2.8072, + "theoretical_loss": 3.61207545499936, + "tokens_seen": 1114616832 + }, + { + "epoch": 3.07, + "learning_rate": 0.000334493480441324, + "loss": 2.7499, + "theoretical_loss": 3.6120556476866352, + "tokens_seen": 1114682368 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033448345035105316, + "loss": 2.7357, + "theoretical_loss": 3.612035841864466, + "tokens_seen": 1114747904 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033447342026078235, + "loss": 2.9083, + "theoretical_loss": 3.612016037532651, + "tokens_seen": 1114813440 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003344633901705115, + "loss": 2.9005, + "theoretical_loss": 3.6119962346909915, + "tokens_seen": 1114878976 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033445336008024076, + "loss": 2.7063, + "theoretical_loss": 3.6119764333392883, + "tokens_seen": 1114944512 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003344433299899699, + "loss": 2.5994, + "theoretical_loss": 3.61195663347734, + "tokens_seen": 1115010048 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003344332998996991, + "loss": 2.6802, + "theoretical_loss": 3.6119368351049492, + "tokens_seen": 1115075584 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033442326980942825, + "loss": 2.5207, + "theoretical_loss": 3.6119170382219146, + "tokens_seen": 1115141120 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003344132397191575, + "loss": 2.4613, + "theoretical_loss": 3.6118972428280376, + "tokens_seen": 1115206656 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033440320962888667, + "loss": 2.5894, + "theoretical_loss": 3.611877448923118, + "tokens_seen": 1115272192 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033439317953861585, + "loss": 2.5237, + "theoretical_loss": 3.611857656506957, + "tokens_seen": 1115337728 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033438314944834503, + "loss": 2.6439, + "theoretical_loss": 3.6118378655793544, + "tokens_seen": 1115403264 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033437311935807427, + "loss": 2.4564, + "theoretical_loss": 3.611818076140111, + "tokens_seen": 1115468800 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003343630892678034, + "loss": 2.7816, + "theoretical_loss": 3.611798288189028, + "tokens_seen": 1115534336 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033435305917753263, + "loss": 2.8158, + "theoretical_loss": 3.6117785017259054, + "tokens_seen": 1115599872 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033434302908726175, + "loss": 2.7997, + "theoretical_loss": 3.6117587167505447, + "tokens_seen": 1115665408 + }, + { + "epoch": 3.07, + "learning_rate": 0.000334332998996991, + "loss": 2.7841, + "theoretical_loss": 3.611738933262746, + "tokens_seen": 1115730944 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1249313, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.326357126235962, + "objective/train/theoretical_loss": 3.6117339876232046, + "objective/train/tokens_used": 1136207328, + "theoretical_loss": 3.6117339876232046, + "tokens_seen": 1115747328 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033432296890672017, + "loss": 2.5598, + "theoretical_loss": 3.6117191512623097, + "tokens_seen": 1115796480 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033431293881644935, + "loss": 2.6363, + "theoretical_loss": 3.6116993707490375, + "tokens_seen": 1115862016 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033430290872617853, + "loss": 2.5902, + "theoretical_loss": 3.61167959172273, + "tokens_seen": 1115927552 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003342928786359077, + "loss": 2.859, + "theoretical_loss": 3.611659814183188, + "tokens_seen": 1115993088 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003342828485456369, + "loss": 2.7009, + "theoretical_loss": 3.6116400381302123, + "tokens_seen": 1116058624 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033427281845536613, + "loss": 2.8771, + "theoretical_loss": 3.611620263563605, + "tokens_seen": 1116124160 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033426278836509526, + "loss": 2.7368, + "theoretical_loss": 3.611600490483165, + "tokens_seen": 1116189696 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003342527582748245, + "loss": 2.8553, + "theoretical_loss": 3.611580718888695, + "tokens_seen": 1116255232 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003342427281845536, + "loss": 2.6817, + "theoretical_loss": 3.611560948779996, + "tokens_seen": 1116320768 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033423269809428285, + "loss": 2.788, + "theoretical_loss": 3.611541180156868, + "tokens_seen": 1116386304 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033422266800401204, + "loss": 2.6987, + "theoretical_loss": 3.611521413019113, + "tokens_seen": 1116451840 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003342126379137412, + "loss": 2.5525, + "theoretical_loss": 3.6115016473665325, + "tokens_seen": 1116517376 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003342026078234704, + "loss": 2.5741, + "theoretical_loss": 3.6114818831989277, + "tokens_seen": 1116582912 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033419257773319963, + "loss": 2.6247, + "theoretical_loss": 3.611462120516099, + "tokens_seen": 1116648448 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033418254764292876, + "loss": 2.6781, + "theoretical_loss": 3.6114423593178486, + "tokens_seen": 1116713984 + }, + { + "epoch": 3.07, + "learning_rate": 0.000334172517552658, + "loss": 2.8599, + "theoretical_loss": 3.6114225996039773, + "tokens_seen": 1116779520 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003341624874623871, + "loss": 2.5877, + "theoretical_loss": 3.6114028413742867, + "tokens_seen": 1116845056 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033415245737211636, + "loss": 2.6957, + "theoretical_loss": 3.611383084628579, + "tokens_seen": 1116910592 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033414242728184554, + "loss": 2.8918, + "theoretical_loss": 3.6113633293666547, + "tokens_seen": 1116976128 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003341323971915747, + "loss": 2.6015, + "theoretical_loss": 3.6113435755883154, + "tokens_seen": 1117041664 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003341223671013039, + "loss": 2.7051, + "theoretical_loss": 3.611323823293363, + "tokens_seen": 1117107200 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003341123370110331, + "loss": 2.6845, + "theoretical_loss": 3.611304072481598, + "tokens_seen": 1117172736 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033410230692076226, + "loss": 2.5767, + "theoretical_loss": 3.6112843231528244, + "tokens_seen": 1117238272 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003340922768304915, + "loss": 2.4481, + "theoretical_loss": 3.6112645753068415, + "tokens_seen": 1117303808 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003340822467402207, + "loss": 2.6844, + "theoretical_loss": 3.6112448289434527, + "tokens_seen": 1117369344 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1250355, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9746341705322266, + "objective/train/theoretical_loss": 3.6112398925842375, + "objective/train/tokens_used": 1137845728, + "theoretical_loss": 3.6112398925842375, + "tokens_seen": 1117385728 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033407221664994986, + "loss": 2.6491, + "theoretical_loss": 3.6112250840624585, + "tokens_seen": 1117434880 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003340621865596791, + "loss": 2.6187, + "theoretical_loss": 3.6112053406636617, + "tokens_seen": 1117500416 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003340521564694082, + "loss": 2.5762, + "theoretical_loss": 3.611185598746863, + "tokens_seen": 1117565952 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033404212637913746, + "loss": 2.8125, + "theoretical_loss": 3.611165858311865, + "tokens_seen": 1117631488 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003340320962888666, + "loss": 2.8009, + "theoretical_loss": 3.6111461193584695, + "tokens_seen": 1117697024 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003340220661985958, + "loss": 2.7583, + "theoretical_loss": 3.6111263818864785, + "tokens_seen": 1117762560 + }, + { + "epoch": 3.07, + "learning_rate": 0.000334012036108325, + "loss": 2.6939, + "theoretical_loss": 3.6111066458956937, + "tokens_seen": 1117828096 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003340020060180542, + "loss": 2.9579, + "theoretical_loss": 3.6110869113859176, + "tokens_seen": 1117893632 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033399197592778336, + "loss": 2.9776, + "theoretical_loss": 3.611067178356952, + "tokens_seen": 1117959168 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033398194583751255, + "loss": 2.8083, + "theoretical_loss": 3.6110474468085987, + "tokens_seen": 1118024704 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003339719157472417, + "loss": 2.5772, + "theoretical_loss": 3.61102771674066, + "tokens_seen": 1118090240 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033396188565697096, + "loss": 2.6275, + "theoretical_loss": 3.611007988152938, + "tokens_seen": 1118155776 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003339518555667001, + "loss": 2.4687, + "theoretical_loss": 3.6109882610452355, + "tokens_seen": 1118221312 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003339418254764293, + "loss": 2.7554, + "theoretical_loss": 3.6109685354173537, + "tokens_seen": 1118286848 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033393179538615845, + "loss": 2.6071, + "theoretical_loss": 3.610948811269096, + "tokens_seen": 1118352384 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003339217652958877, + "loss": 2.705, + "theoretical_loss": 3.6109290886002636, + "tokens_seen": 1118417920 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033391173520561687, + "loss": 2.7084, + "theoretical_loss": 3.6109093674106596, + "tokens_seen": 1118483456 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033390170511534605, + "loss": 2.9054, + "theoretical_loss": 3.6108896477000862, + "tokens_seen": 1118548992 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033389167502507523, + "loss": 2.7773, + "theoretical_loss": 3.610869929468346, + "tokens_seen": 1118614528 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033388164493480447, + "loss": 2.8347, + "theoretical_loss": 3.610850212715241, + "tokens_seen": 1118680064 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003338716148445336, + "loss": 2.7266, + "theoretical_loss": 3.610830497440574, + "tokens_seen": 1118745600 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033386158475426283, + "loss": 2.4787, + "theoretical_loss": 3.610810783644148, + "tokens_seen": 1118811136 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033385155466399195, + "loss": 2.643, + "theoretical_loss": 3.6107910713257647, + "tokens_seen": 1118876672 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003338415245737212, + "loss": 2.7258, + "theoretical_loss": 3.610771360485227, + "tokens_seen": 1118942208 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033383149448345037, + "loss": 2.9903, + "theoretical_loss": 3.6107516511223383, + "tokens_seen": 1119007744 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1250984, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.452301502227783, + "objective/train/theoretical_loss": 3.6107467240124755, + "objective/train/tokens_used": 1139484128, + "theoretical_loss": 3.6107467240124755, + "tokens_seen": 1119024128 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033382146439317955, + "loss": 2.5613, + "theoretical_loss": 3.6107319432369005, + "tokens_seen": 1119073280 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033381143430290873, + "loss": 2.445, + "theoretical_loss": 3.610712236828716, + "tokens_seen": 1119138816 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003338014042126379, + "loss": 2.8158, + "theoretical_loss": 3.610692531897589, + "tokens_seen": 1119204352 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003337913741223671, + "loss": 2.8361, + "theoretical_loss": 3.610672828443321, + "tokens_seen": 1119269888 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033378134403209633, + "loss": 2.8296, + "theoretical_loss": 3.6106531264657153, + "tokens_seen": 1119335424 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033377131394182546, + "loss": 2.545, + "theoretical_loss": 3.6106334259645747, + "tokens_seen": 1119400960 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003337612838515547, + "loss": 2.8547, + "theoretical_loss": 3.610613726939702, + "tokens_seen": 1119466496 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003337512537612838, + "loss": 2.8374, + "theoretical_loss": 3.6105940293909002, + "tokens_seen": 1119532032 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033374122367101306, + "loss": 2.6208, + "theoretical_loss": 3.610574333317973, + "tokens_seen": 1119597568 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033373119358074224, + "loss": 2.6924, + "theoretical_loss": 3.610554638720722, + "tokens_seen": 1119663104 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003337211634904714, + "loss": 2.8094, + "theoretical_loss": 3.610534945598952, + "tokens_seen": 1119728640 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003337111334002006, + "loss": 2.7446, + "theoretical_loss": 3.6105152539524648, + "tokens_seen": 1119794176 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033370110330992983, + "loss": 2.81, + "theoretical_loss": 3.6104955637810643, + "tokens_seen": 1119859712 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033369107321965896, + "loss": 2.6061, + "theoretical_loss": 3.6104758750845534, + "tokens_seen": 1119925248 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003336810431293882, + "loss": 2.8181, + "theoretical_loss": 3.6104561878627353, + "tokens_seen": 1119990784 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003336710130391173, + "loss": 2.9375, + "theoretical_loss": 3.6104365021154132, + "tokens_seen": 1120056320 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033366098294884656, + "loss": 2.6833, + "theoretical_loss": 3.61041681784239, + "tokens_seen": 1120121856 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033365095285857574, + "loss": 2.9931, + "theoretical_loss": 3.61039713504347, + "tokens_seen": 1120187392 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003336409227683049, + "loss": 2.5654, + "theoretical_loss": 3.6103774537184563, + "tokens_seen": 1120252928 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003336308926780341, + "loss": 2.7439, + "theoretical_loss": 3.6103577738671513, + "tokens_seen": 1120318464 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003336208625877633, + "loss": 2.6787, + "theoretical_loss": 3.61033809548936, + "tokens_seen": 1120384000 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033361083249749246, + "loss": 2.7574, + "theoretical_loss": 3.6103184185848853, + "tokens_seen": 1120449536 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003336008024072217, + "loss": 2.5471, + "theoretical_loss": 3.61029874315353, + "tokens_seen": 1120515072 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003335907723169508, + "loss": 2.8783, + "theoretical_loss": 3.6102790691950988, + "tokens_seen": 1120580608 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033358074222668006, + "loss": 2.819, + "theoretical_loss": 3.6102593967093943, + "tokens_seen": 1120646144 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1252222, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9513473510742188, + "objective/train/theoretical_loss": 3.610254478818059, + "objective/train/tokens_used": 1141122528, + "theoretical_loss": 3.610254478818059, + "tokens_seen": 1120662528 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033357071213640924, + "loss": 2.7699, + "theoretical_loss": 3.6102397256962204, + "tokens_seen": 1120711680 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003335606820461384, + "loss": 2.6794, + "theoretical_loss": 3.6102200561553817, + "tokens_seen": 1120777216 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003335506519558676, + "loss": 2.694, + "theoretical_loss": 3.6102003880866804, + "tokens_seen": 1120842752 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003335406218655968, + "loss": 2.8837, + "theoretical_loss": 3.6101807214899218, + "tokens_seen": 1120908288 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033353059177532597, + "loss": 2.7733, + "theoretical_loss": 3.610161056364908, + "tokens_seen": 1120973824 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003335205616850552, + "loss": 2.7387, + "theoretical_loss": 3.6101413927114443, + "tokens_seen": 1121039360 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033351053159478433, + "loss": 2.7936, + "theoretical_loss": 3.6101217305293343, + "tokens_seen": 1121104896 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033350050150451356, + "loss": 2.9508, + "theoretical_loss": 3.610102069818381, + "tokens_seen": 1121170432 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003334904714142427, + "loss": 2.6335, + "theoretical_loss": 3.6100824105783893, + "tokens_seen": 1121235968 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003334804413239719, + "loss": 2.6319, + "theoretical_loss": 3.610062752809163, + "tokens_seen": 1121301504 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003334704112337011, + "loss": 2.7402, + "theoretical_loss": 3.6100430965105055, + "tokens_seen": 1121367040 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003334603811434303, + "loss": 2.7282, + "theoretical_loss": 3.6100234416822223, + "tokens_seen": 1121432576 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033345035105315947, + "loss": 2.7094, + "theoretical_loss": 3.6100037883241156, + "tokens_seen": 1121498112 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033344032096288865, + "loss": 2.8534, + "theoretical_loss": 3.6099841364359904, + "tokens_seen": 1121563648 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033343029087261783, + "loss": 2.909, + "theoretical_loss": 3.609964486017651, + "tokens_seen": 1121629184 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033342026078234707, + "loss": 2.9188, + "theoretical_loss": 3.609944837068902, + "tokens_seen": 1121694720 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003334102306920762, + "loss": 2.8513, + "theoretical_loss": 3.609925189589547, + "tokens_seen": 1121760256 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033340020060180543, + "loss": 2.5853, + "theoretical_loss": 3.60990554357939, + "tokens_seen": 1121825792 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003333901705115346, + "loss": 2.6128, + "theoretical_loss": 3.609885899038236, + "tokens_seen": 1121891328 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003333801404212638, + "loss": 2.6936, + "theoretical_loss": 3.6098662559658887, + "tokens_seen": 1121956864 + }, + { + "epoch": 3.07, + "learning_rate": 0.000333370110330993, + "loss": 2.5918, + "theoretical_loss": 3.6098466143621533, + "tokens_seen": 1122022400 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033336008024072215, + "loss": 2.6725, + "theoretical_loss": 3.609826974226834, + "tokens_seen": 1122087936 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033335005015045134, + "loss": 2.7519, + "theoretical_loss": 3.6098073355597347, + "tokens_seen": 1122153472 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033334002006018057, + "loss": 2.6089, + "theoretical_loss": 3.6097876983606607, + "tokens_seen": 1122219008 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033332998996990975, + "loss": 2.4769, + "theoretical_loss": 3.6097680626294157, + "tokens_seen": 1122284544 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1252729, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7386367321014404, + "objective/train/theoretical_loss": 3.60976315392593, + "objective/train/tokens_used": 1142760928, + "theoretical_loss": 3.60976315392593, + "tokens_seen": 1122300928 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033331995987963893, + "loss": 2.7276, + "theoretical_loss": 3.609748428365805, + "tokens_seen": 1122350080 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003333099297893681, + "loss": 2.7054, + "theoretical_loss": 3.609728795569633, + "tokens_seen": 1122415616 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003332998996990973, + "loss": 2.8766, + "theoretical_loss": 3.6097091642407038, + "tokens_seen": 1122481152 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033328986960882653, + "loss": 2.7833, + "theoretical_loss": 3.6096895343788225, + "tokens_seen": 1122546688 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033327983951855566, + "loss": 2.7231, + "theoretical_loss": 3.6096699059837944, + "tokens_seen": 1122612224 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003332698094282849, + "loss": 3.13, + "theoretical_loss": 3.6096502790554235, + "tokens_seen": 1122677760 + }, + { + "epoch": 3.07, + "learning_rate": 0.000333259779338014, + "loss": 2.7866, + "theoretical_loss": 3.6096306535935154, + "tokens_seen": 1122743296 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033324974924774326, + "loss": 2.7094, + "theoretical_loss": 3.6096110295978736, + "tokens_seen": 1122808832 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033323971915747244, + "loss": 2.699, + "theoretical_loss": 3.6095914070683044, + "tokens_seen": 1122874368 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003332296890672016, + "loss": 2.8131, + "theoretical_loss": 3.609571786004612, + "tokens_seen": 1122939904 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003332196589769308, + "loss": 2.669, + "theoretical_loss": 3.6095521664066013, + "tokens_seen": 1123005440 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033320962888666003, + "loss": 2.7178, + "theoretical_loss": 3.6095325482740774, + "tokens_seen": 1123070976 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033319959879638916, + "loss": 2.848, + "theoretical_loss": 3.6095129316068455, + "tokens_seen": 1123136512 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003331895687061184, + "loss": 2.7948, + "theoretical_loss": 3.609493316404711, + "tokens_seen": 1123202048 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003331795386158475, + "loss": 2.5098, + "theoretical_loss": 3.6094737026674775, + "tokens_seen": 1123267584 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033316950852557676, + "loss": 2.8703, + "theoretical_loss": 3.609454090394952, + "tokens_seen": 1123333120 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033315947843530594, + "loss": 2.6075, + "theoretical_loss": 3.6094344795869384, + "tokens_seen": 1123398656 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003331494483450351, + "loss": 2.7532, + "theoretical_loss": 3.609414870243243, + "tokens_seen": 1123464192 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003331394182547643, + "loss": 2.7733, + "theoretical_loss": 3.6093952623636696, + "tokens_seen": 1123529728 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003331293881644935, + "loss": 2.8531, + "theoretical_loss": 3.609375655948025, + "tokens_seen": 1123595264 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033311935807422266, + "loss": 2.5311, + "theoretical_loss": 3.609356050996113, + "tokens_seen": 1123660800 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003331093279839519, + "loss": 2.9696, + "theoretical_loss": 3.609336447507741, + "tokens_seen": 1123726336 + }, + { + "epoch": 3.07, + "learning_rate": 0.000333099297893681, + "loss": 2.9202, + "theoretical_loss": 3.609316845482712, + "tokens_seen": 1123791872 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033308926780341026, + "loss": 2.9049, + "theoretical_loss": 3.6092972449208336, + "tokens_seen": 1123857408 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033307923771313944, + "loss": 2.842, + "theoretical_loss": 3.60927764582191, + "tokens_seen": 1123922944 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1254028, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9574615955352783, + "objective/train/theoretical_loss": 3.609272746275743, + "objective/train/tokens_used": 1144399328, + "theoretical_loss": 3.609272746275743, + "tokens_seen": 1123939328 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003330692076228686, + "loss": 2.911, + "theoretical_loss": 3.6092580481857466, + "tokens_seen": 1123988480 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003330591775325978, + "loss": 2.7521, + "theoretical_loss": 3.60923845201215, + "tokens_seen": 1124054016 + }, + { + "epoch": 3.07, + "learning_rate": 0.000333049147442327, + "loss": 2.762, + "theoretical_loss": 3.6092188573009247, + "tokens_seen": 1124119552 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033303911735205617, + "loss": 3.0036, + "theoretical_loss": 3.6091992640518766, + "tokens_seen": 1124185088 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003330290872617854, + "loss": 2.87, + "theoretical_loss": 3.6091796722648124, + "tokens_seen": 1124250624 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033301905717151453, + "loss": 2.833, + "theoretical_loss": 3.609160081939536, + "tokens_seen": 1124316160 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033300902708124376, + "loss": 3.1339, + "theoretical_loss": 3.609140493075855, + "tokens_seen": 1124381696 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003329989969909729, + "loss": 2.846, + "theoretical_loss": 3.6091209056735734, + "tokens_seen": 1124447232 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003329889669007021, + "loss": 2.7986, + "theoretical_loss": 3.609101319732498, + "tokens_seen": 1124512768 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003329789368104313, + "loss": 2.8447, + "theoretical_loss": 3.6090817352524347, + "tokens_seen": 1124578304 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003329689067201605, + "loss": 2.6167, + "theoretical_loss": 3.6090621522331894, + "tokens_seen": 1124643840 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033295887662988967, + "loss": 2.6057, + "theoretical_loss": 3.6090425706745677, + "tokens_seen": 1124709376 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033294884653961885, + "loss": 2.8129, + "theoretical_loss": 3.609022990576376, + "tokens_seen": 1124774912 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033293881644934803, + "loss": 2.5224, + "theoretical_loss": 3.609003411938419, + "tokens_seen": 1124840448 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033292878635907727, + "loss": 2.826, + "theoretical_loss": 3.6089838347605045, + "tokens_seen": 1124905984 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003329187562688064, + "loss": 2.4935, + "theoretical_loss": 3.6089642590424376, + "tokens_seen": 1124971520 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033290872617853563, + "loss": 2.5188, + "theoretical_loss": 3.6089446847840243, + "tokens_seen": 1125037056 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003328986960882648, + "loss": 2.6992, + "theoretical_loss": 3.608925111985071, + "tokens_seen": 1125102592 + }, + { + "epoch": 3.07, + "learning_rate": 0.000332888665997994, + "loss": 2.7602, + "theoretical_loss": 3.6089055406453845, + "tokens_seen": 1125168128 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003328786359077232, + "loss": 2.6183, + "theoretical_loss": 3.60888597076477, + "tokens_seen": 1125233664 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033286860581745235, + "loss": 2.7598, + "theoretical_loss": 3.608866402343034, + "tokens_seen": 1125299200 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033285857572718154, + "loss": 2.5389, + "theoretical_loss": 3.608846835379983, + "tokens_seen": 1125364736 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033284854563691077, + "loss": 2.928, + "theoretical_loss": 3.6088272698754236, + "tokens_seen": 1125430272 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003328385155466399, + "loss": 2.8564, + "theoretical_loss": 3.6088077058291614, + "tokens_seen": 1125495808 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033282848545636913, + "loss": 2.6543, + "theoretical_loss": 3.6087881432410036, + "tokens_seen": 1125561344 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1254775, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.805901527404785, + "objective/train/theoretical_loss": 3.60878325282177, + "objective/train/tokens_used": 1146037728, + "theoretical_loss": 3.60878325282177, + "tokens_seen": 1125577728 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033281845536609826, + "loss": 2.7767, + "theoretical_loss": 3.608768582110756, + "tokens_seen": 1125626880 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003328084252758275, + "loss": 2.6417, + "theoretical_loss": 3.608749022438226, + "tokens_seen": 1125692416 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003327983951855567, + "loss": 2.6314, + "theoretical_loss": 3.6087294642232184, + "tokens_seen": 1125757952 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033278836509528586, + "loss": 2.4157, + "theoretical_loss": 3.6087099074655415, + "tokens_seen": 1125823488 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033277833500501504, + "loss": 2.8866, + "theoretical_loss": 3.6086903521650013, + "tokens_seen": 1125889024 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003327683049147442, + "loss": 2.9665, + "theoretical_loss": 3.608670798321404, + "tokens_seen": 1125954560 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003327582748244734, + "loss": 2.6735, + "theoretical_loss": 3.608651245934557, + "tokens_seen": 1126020096 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033274824473420264, + "loss": 2.656, + "theoretical_loss": 3.6086316950042665, + "tokens_seen": 1126085632 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033273821464393176, + "loss": 2.5484, + "theoretical_loss": 3.608612145530339, + "tokens_seen": 1126151168 + }, + { + "epoch": 3.07, + "learning_rate": 0.000332728184553661, + "loss": 2.5752, + "theoretical_loss": 3.608592597512582, + "tokens_seen": 1126216704 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003327181544633902, + "loss": 2.5263, + "theoretical_loss": 3.6085730509508016, + "tokens_seen": 1126282240 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033270812437311936, + "loss": 2.968, + "theoretical_loss": 3.608553505844805, + "tokens_seen": 1126347776 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033269809428284854, + "loss": 2.5893, + "theoretical_loss": 3.6085339621943993, + "tokens_seen": 1126413312 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003326880641925777, + "loss": 2.4944, + "theoretical_loss": 3.6085144199993904, + "tokens_seen": 1126478848 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003326780341023069, + "loss": 2.5355, + "theoretical_loss": 3.608494879259587, + "tokens_seen": 1126544384 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033266800401203614, + "loss": 2.702, + "theoretical_loss": 3.6084753399747944, + "tokens_seen": 1126609920 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033265797392176527, + "loss": 2.6876, + "theoretical_loss": 3.6084558021448205, + "tokens_seen": 1126675456 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003326479438314945, + "loss": 2.7097, + "theoretical_loss": 3.6084362657694724, + "tokens_seen": 1126740992 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033263791374122363, + "loss": 2.8215, + "theoretical_loss": 3.608416730848557, + "tokens_seen": 1126806528 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033262788365095286, + "loss": 2.7713, + "theoretical_loss": 3.608397197381881, + "tokens_seen": 1126872064 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033261785356068205, + "loss": 2.6377, + "theoretical_loss": 3.6083776653692525, + "tokens_seen": 1126937600 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003326078234704112, + "loss": 2.8388, + "theoretical_loss": 3.6083581348104783, + "tokens_seen": 1127003136 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003325977933801404, + "loss": 2.8913, + "theoretical_loss": 3.608338605705365, + "tokens_seen": 1127068672 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033258776328986964, + "loss": 2.7082, + "theoretical_loss": 3.6083190780537207, + "tokens_seen": 1127134208 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003325777331995988, + "loss": 2.6834, + "theoretical_loss": 3.608299551855353, + "tokens_seen": 1127199744 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1256132, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0457611083984375, + "objective/train/theoretical_loss": 3.6082946705328123, + "objective/train/tokens_used": 1147676128, + "theoretical_loss": 3.6082946705328123, + "tokens_seen": 1127216128 + }, + { + "epoch": 3.07, + "learning_rate": 0.000332567703109328, + "loss": 3.1307, + "theoretical_loss": 3.608280027110068, + "tokens_seen": 1127265280 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003325576730190572, + "loss": 2.5617, + "theoretical_loss": 3.6082605038176743, + "tokens_seen": 1127330816 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033254764292878637, + "loss": 2.6656, + "theoretical_loss": 3.6082409819779784, + "tokens_seen": 1127396352 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003325376128385156, + "loss": 2.9161, + "theoretical_loss": 3.608221461590789, + "tokens_seen": 1127461888 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033252758274824473, + "loss": 2.7213, + "theoretical_loss": 3.6082019426559127, + "tokens_seen": 1127527424 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033251755265797396, + "loss": 2.6276, + "theoretical_loss": 3.6081824251731565, + "tokens_seen": 1127592960 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003325075225677031, + "loss": 2.6259, + "theoretical_loss": 3.6081629091423295, + "tokens_seen": 1127658496 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033249749247743233, + "loss": 2.7656, + "theoretical_loss": 3.608143394563238, + "tokens_seen": 1127724032 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003324874623871615, + "loss": 2.7165, + "theoretical_loss": 3.6081238814356906, + "tokens_seen": 1127789568 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003324774322968907, + "loss": 2.8234, + "theoretical_loss": 3.608104369759494, + "tokens_seen": 1127855104 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033246740220661987, + "loss": 2.9237, + "theoretical_loss": 3.608084859534457, + "tokens_seen": 1127920640 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033245737211634905, + "loss": 2.7804, + "theoretical_loss": 3.6080653507603864, + "tokens_seen": 1127986176 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033244734202607823, + "loss": 2.7878, + "theoretical_loss": 3.6080458434370906, + "tokens_seen": 1128051712 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033243731193580747, + "loss": 2.8638, + "theoretical_loss": 3.608026337564377, + "tokens_seen": 1128117248 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003324272818455366, + "loss": 2.5568, + "theoretical_loss": 3.6080068331420545, + "tokens_seen": 1128182784 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033241725175526583, + "loss": 2.7804, + "theoretical_loss": 3.6079873301699292, + "tokens_seen": 1128248320 + }, + { + "epoch": 3.07, + "learning_rate": 0.000332407221664995, + "loss": 2.6939, + "theoretical_loss": 3.607967828647811, + "tokens_seen": 1128313856 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003323971915747242, + "loss": 2.9341, + "theoretical_loss": 3.6079483285755067, + "tokens_seen": 1128379392 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003323871614844534, + "loss": 2.8499, + "theoretical_loss": 3.6079288299528245, + "tokens_seen": 1128444928 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033237713139418255, + "loss": 2.6364, + "theoretical_loss": 3.6079093327795726, + "tokens_seen": 1128510464 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033236710130391174, + "loss": 2.692, + "theoretical_loss": 3.607889837055559, + "tokens_seen": 1128576000 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033235707121364097, + "loss": 2.7181, + "theoretical_loss": 3.607870342780592, + "tokens_seen": 1128641536 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003323470411233701, + "loss": 2.6819, + "theoretical_loss": 3.6078508499544797, + "tokens_seen": 1128707072 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033233701103309933, + "loss": 2.7742, + "theoretical_loss": 3.6078313585770303, + "tokens_seen": 1128772608 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033232698094282846, + "loss": 2.8938, + "theoretical_loss": 3.607811868648052, + "tokens_seen": 1128838144 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1256724, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9342129230499268, + "objective/train/theoretical_loss": 3.6078069963921084, + "objective/train/tokens_used": 1149314528, + "theoretical_loss": 3.6078069963921084, + "tokens_seen": 1128854528 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003323169508525577, + "loss": 2.6303, + "theoretical_loss": 3.6077923801673526, + "tokens_seen": 1128903680 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003323069207622869, + "loss": 2.9345, + "theoretical_loss": 3.607772893134741, + "tokens_seen": 1128969216 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033229689067201606, + "loss": 2.5573, + "theoretical_loss": 3.6077534075500264, + "tokens_seen": 1129034752 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033228686058174524, + "loss": 2.6129, + "theoretical_loss": 3.607733923413015, + "tokens_seen": 1129100288 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003322768304914744, + "loss": 2.7973, + "theoretical_loss": 3.607714440723517, + "tokens_seen": 1129165824 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003322668004012036, + "loss": 2.7817, + "theoretical_loss": 3.60769495948134, + "tokens_seen": 1129231360 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033225677031093284, + "loss": 2.9254, + "theoretical_loss": 3.6076754796862933, + "tokens_seen": 1129296896 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033224674022066196, + "loss": 2.8688, + "theoretical_loss": 3.6076560013381846, + "tokens_seen": 1129362432 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003322367101303912, + "loss": 2.731, + "theoretical_loss": 3.607636524436823, + "tokens_seen": 1129427968 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003322266800401204, + "loss": 2.3886, + "theoretical_loss": 3.607617048982017, + "tokens_seen": 1129493504 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033221664994984956, + "loss": 2.4788, + "theoretical_loss": 3.6075975749735747, + "tokens_seen": 1129559040 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033220661985957874, + "loss": 2.6113, + "theoretical_loss": 3.6075781024113054, + "tokens_seen": 1129624576 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003321965897693079, + "loss": 2.7369, + "theoretical_loss": 3.607558631295017, + "tokens_seen": 1129690112 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003321865596790371, + "loss": 2.7931, + "theoretical_loss": 3.60753916162452, + "tokens_seen": 1129755648 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033217652958876634, + "loss": 2.6653, + "theoretical_loss": 3.607519693399621, + "tokens_seen": 1129821184 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033216649949849547, + "loss": 2.771, + "theoretical_loss": 3.6075002266201306, + "tokens_seen": 1129886720 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003321564694082247, + "loss": 2.8361, + "theoretical_loss": 3.6074807612858573, + "tokens_seen": 1129952256 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033214643931795383, + "loss": 2.4755, + "theoretical_loss": 3.6074612973966085, + "tokens_seen": 1130017792 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033213640922768306, + "loss": 2.882, + "theoretical_loss": 3.607441834952195, + "tokens_seen": 1130083328 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033212637913741225, + "loss": 2.6837, + "theoretical_loss": 3.607422373952425, + "tokens_seen": 1130148864 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003321163490471414, + "loss": 2.825, + "theoretical_loss": 3.607402914397108, + "tokens_seen": 1130214400 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003321063189568706, + "loss": 2.7678, + "theoretical_loss": 3.607383456286051, + "tokens_seen": 1130279936 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033209628886659984, + "loss": 2.7812, + "theoretical_loss": 3.607363999619066, + "tokens_seen": 1130345472 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033208625877632897, + "loss": 2.7462, + "theoretical_loss": 3.6073445443959606, + "tokens_seen": 1130411008 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003320762286860582, + "loss": 2.9633, + "theoretical_loss": 3.607325090616544, + "tokens_seen": 1130476544 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1258334, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.567551851272583, + "objective/train/theoretical_loss": 3.607320227397244, + "objective/train/tokens_used": 1150952928, + "theoretical_loss": 3.607320227397244, + "tokens_seen": 1130492928 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033206619859578733, + "loss": 2.753, + "theoretical_loss": 3.607305638280626, + "tokens_seen": 1130542080 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033205616850551657, + "loss": 2.5546, + "theoretical_loss": 3.6072861873880147, + "tokens_seen": 1130607616 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033204613841524575, + "loss": 2.8395, + "theoretical_loss": 3.6072667379385197, + "tokens_seen": 1130673152 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033203610832497493, + "loss": 2.8606, + "theoretical_loss": 3.607247289931951, + "tokens_seen": 1130738688 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003320260782347041, + "loss": 2.7365, + "theoretical_loss": 3.6072278433681175, + "tokens_seen": 1130804224 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003320160481444333, + "loss": 2.7856, + "theoretical_loss": 3.6072083982468284, + "tokens_seen": 1130869760 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003320060180541625, + "loss": 2.5667, + "theoretical_loss": 3.607188954567894, + "tokens_seen": 1130935296 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003319959879638917, + "loss": 2.629, + "theoretical_loss": 3.6071695123311223, + "tokens_seen": 1131000832 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033198595787362084, + "loss": 2.7256, + "theoretical_loss": 3.6071500715363234, + "tokens_seen": 1131066368 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033197592778335007, + "loss": 3.0258, + "theoretical_loss": 3.607130632183307, + "tokens_seen": 1131131904 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003319658976930792, + "loss": 2.8214, + "theoretical_loss": 3.6071111942718836, + "tokens_seen": 1131197440 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033195586760280843, + "loss": 2.6264, + "theoretical_loss": 3.607091757801861, + "tokens_seen": 1131262976 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003319458375125376, + "loss": 2.6839, + "theoretical_loss": 3.607072322773049, + "tokens_seen": 1131328512 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003319358074222668, + "loss": 2.9318, + "theoretical_loss": 3.6070528891852582, + "tokens_seen": 1131394048 + }, + { + "epoch": 3.07, + "learning_rate": 0.000331925777331996, + "loss": 2.7424, + "theoretical_loss": 3.6070334570382983, + "tokens_seen": 1131459584 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003319157472417252, + "loss": 2.7137, + "theoretical_loss": 3.6070140263319783, + "tokens_seen": 1131525120 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033190571715145434, + "loss": 2.8819, + "theoretical_loss": 3.6069945970661084, + "tokens_seen": 1131590656 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003318956870611836, + "loss": 2.8069, + "theoretical_loss": 3.6069751692404983, + "tokens_seen": 1131656192 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003318856569709127, + "loss": 2.7572, + "theoretical_loss": 3.606955742854958, + "tokens_seen": 1131721728 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033187562688064194, + "loss": 2.646, + "theoretical_loss": 3.606936317909297, + "tokens_seen": 1131787264 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003318655967903711, + "loss": 2.8115, + "theoretical_loss": 3.6069168944033256, + "tokens_seen": 1131852800 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003318555667001003, + "loss": 2.629, + "theoretical_loss": 3.606897472336853, + "tokens_seen": 1131918336 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003318455366098295, + "loss": 2.5971, + "theoretical_loss": 3.6068780517096903, + "tokens_seen": 1131983872 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033183550651955866, + "loss": 2.7799, + "theoretical_loss": 3.606858632521647, + "tokens_seen": 1132049408 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003318254764292879, + "loss": 2.7061, + "theoretical_loss": 3.606839214772533, + "tokens_seen": 1132114944 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1259128, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0284805297851562, + "objective/train/theoretical_loss": 3.606834360560065, + "objective/train/tokens_used": 1152591328, + "theoretical_loss": 3.606834360560065, + "tokens_seen": 1132131328 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003318154463390171, + "loss": 2.8331, + "theoretical_loss": 3.6068197984621584, + "tokens_seen": 1132180480 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033180541624874626, + "loss": 2.4103, + "theoretical_loss": 3.6068003835903335, + "tokens_seen": 1132246016 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033179538615847544, + "loss": 2.5474, + "theoretical_loss": 3.6067809701568687, + "tokens_seen": 1132311552 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003317853560682046, + "loss": 2.5511, + "theoretical_loss": 3.6067615581615735, + "tokens_seen": 1132377088 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003317753259779338, + "loss": 2.7727, + "theoretical_loss": 3.606742147604259, + "tokens_seen": 1132442624 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033176529588766304, + "loss": 2.8224, + "theoretical_loss": 3.6067227384847347, + "tokens_seen": 1132508160 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033175526579739216, + "loss": 2.6972, + "theoretical_loss": 3.6067033308028114, + "tokens_seen": 1132573696 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003317452357071214, + "loss": 2.626, + "theoretical_loss": 3.6066839245582987, + "tokens_seen": 1132639232 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003317352056168506, + "loss": 2.6497, + "theoretical_loss": 3.6066645197510083, + "tokens_seen": 1132704768 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033172517552657976, + "loss": 2.6526, + "theoretical_loss": 3.6066451163807494, + "tokens_seen": 1132770304 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033171514543630894, + "loss": 2.8228, + "theoretical_loss": 3.606625714447333, + "tokens_seen": 1132835840 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003317051153460381, + "loss": 2.9014, + "theoretical_loss": 3.6066063139505697, + "tokens_seen": 1132901376 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003316950852557673, + "loss": 2.4233, + "theoretical_loss": 3.6065869148902694, + "tokens_seen": 1132966912 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033168505516549654, + "loss": 2.7184, + "theoretical_loss": 3.6065675172662433, + "tokens_seen": 1133032448 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033167502507522567, + "loss": 2.7, + "theoretical_loss": 3.606548121078302, + "tokens_seen": 1133097984 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003316649949849549, + "loss": 2.771, + "theoretical_loss": 3.6065287263262555, + "tokens_seen": 1133163520 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033165496489468403, + "loss": 2.7794, + "theoretical_loss": 3.6065093330099147, + "tokens_seen": 1133229056 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033164493480441326, + "loss": 2.5853, + "theoretical_loss": 3.606489941129091, + "tokens_seen": 1133294592 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033163490471414245, + "loss": 2.5687, + "theoretical_loss": 3.6064705506835946, + "tokens_seen": 1133360128 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003316248746238716, + "loss": 2.7562, + "theoretical_loss": 3.6064511616732355, + "tokens_seen": 1133425664 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003316148445336008, + "loss": 2.6475, + "theoretical_loss": 3.6064317740978264, + "tokens_seen": 1133491200 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033160481444333004, + "loss": 2.8928, + "theoretical_loss": 3.6064123879571763, + "tokens_seen": 1133556736 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033159478435305917, + "loss": 2.8312, + "theoretical_loss": 3.606393003251097, + "tokens_seen": 1133622272 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003315847542627884, + "loss": 2.723, + "theoretical_loss": 3.606373619979399, + "tokens_seen": 1133687808 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033157472417251753, + "loss": 2.8412, + "theoretical_loss": 3.606354238141894, + "tokens_seen": 1133753344 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1260370, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1477930545806885, + "objective/train/theoretical_loss": 3.606349392906588, + "objective/train/tokens_used": 1154229728, + "theoretical_loss": 3.606349392906588, + "tokens_seen": 1133769728 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033156469408224677, + "loss": 2.5493, + "theoretical_loss": 3.606334857738392, + "tokens_seen": 1133818880 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033155466399197595, + "loss": 2.5485, + "theoretical_loss": 3.606315478768704, + "tokens_seen": 1133884416 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033154463390170513, + "loss": 2.8706, + "theoretical_loss": 3.6062961012326427, + "tokens_seen": 1133949952 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003315346038114343, + "loss": 2.6468, + "theoretical_loss": 3.6062767251300176, + "tokens_seen": 1134015488 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003315245737211635, + "loss": 2.7919, + "theoretical_loss": 3.6062573504606403, + "tokens_seen": 1134081024 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003315145436308927, + "loss": 2.8601, + "theoretical_loss": 3.6062379772243216, + "tokens_seen": 1134146560 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003315045135406219, + "loss": 2.712, + "theoretical_loss": 3.6062186054208736, + "tokens_seen": 1134212096 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033149448345035104, + "loss": 2.5173, + "theoretical_loss": 3.6061992350501066, + "tokens_seen": 1134277632 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033148445336008027, + "loss": 2.4526, + "theoretical_loss": 3.606179866111832, + "tokens_seen": 1134343168 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003314744232698094, + "loss": 2.8311, + "theoretical_loss": 3.6061604986058615, + "tokens_seen": 1134408704 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033146439317953863, + "loss": 2.8495, + "theoretical_loss": 3.6061411325320067, + "tokens_seen": 1134474240 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003314543630892678, + "loss": 2.9322, + "theoretical_loss": 3.6061217678900785, + "tokens_seen": 1134539776 + }, + { + "epoch": 3.07, + "learning_rate": 0.000331444332998997, + "loss": 2.5131, + "theoretical_loss": 3.6061024046798886, + "tokens_seen": 1134605312 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003314343029087262, + "loss": 2.8674, + "theoretical_loss": 3.606083042901248, + "tokens_seen": 1134670848 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003314242728184554, + "loss": 2.8922, + "theoretical_loss": 3.6060636825539687, + "tokens_seen": 1134736384 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033141424272818454, + "loss": 2.7273, + "theoretical_loss": 3.6060443236378616, + "tokens_seen": 1134801920 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003314042126379138, + "loss": 2.7511, + "theoretical_loss": 3.606024966152739, + "tokens_seen": 1134867456 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003313941825476429, + "loss": 2.6649, + "theoretical_loss": 3.606005610098412, + "tokens_seen": 1134932992 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033138415245737214, + "loss": 2.8792, + "theoretical_loss": 3.6059862554746926, + "tokens_seen": 1134998528 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003313741223671013, + "loss": 2.6218, + "theoretical_loss": 3.605966902281392, + "tokens_seen": 1135064064 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003313640922768305, + "loss": 2.811, + "theoretical_loss": 3.605947550518322, + "tokens_seen": 1135129600 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003313540621865597, + "loss": 2.5705, + "theoretical_loss": 3.6059282001852946, + "tokens_seen": 1135195136 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033134403209628886, + "loss": 2.9007, + "theoretical_loss": 3.6059088512821216, + "tokens_seen": 1135260672 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033133400200601804, + "loss": 2.7223, + "theoretical_loss": 3.6058895038086147, + "tokens_seen": 1135326208 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003313239719157473, + "loss": 2.8814, + "theoretical_loss": 3.6058701577645857, + "tokens_seen": 1135391744 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1261008, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7761924266815186, + "objective/train/theoretical_loss": 3.6058653214769123, + "objective/train/tokens_used": 1155868128, + "theoretical_loss": 3.6058653214769123, + "tokens_seen": 1135408128 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003313139418254764, + "loss": 2.8254, + "theoretical_loss": 3.605850813149847, + "tokens_seen": 1135457280 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033130391173520564, + "loss": 2.6401, + "theoretical_loss": 3.605831469964209, + "tokens_seen": 1135522816 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033129388164493477, + "loss": 2.9165, + "theoretical_loss": 3.6058121282074853, + "tokens_seen": 1135588352 + }, + { + "epoch": 3.07, + "learning_rate": 0.000331283851554664, + "loss": 2.6665, + "theoretical_loss": 3.605792787879487, + "tokens_seen": 1135653888 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003312738214643932, + "loss": 2.8296, + "theoretical_loss": 3.6057734489800266, + "tokens_seen": 1135719424 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033126379137412236, + "loss": 2.4184, + "theoretical_loss": 3.605754111508916, + "tokens_seen": 1135784960 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033125376128385155, + "loss": 2.8367, + "theoretical_loss": 3.605734775465967, + "tokens_seen": 1135850496 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003312437311935808, + "loss": 2.6468, + "theoretical_loss": 3.605715440850992, + "tokens_seen": 1135916032 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003312337011033099, + "loss": 2.5701, + "theoretical_loss": 3.6056961076638037, + "tokens_seen": 1135981568 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033122367101303914, + "loss": 2.7412, + "theoretical_loss": 3.6056767759042136, + "tokens_seen": 1136047104 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033121364092276827, + "loss": 2.4669, + "theoretical_loss": 3.6056574455720334, + "tokens_seen": 1136112640 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003312036108324975, + "loss": 2.7462, + "theoretical_loss": 3.605638116667077, + "tokens_seen": 1136178176 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003311935807422267, + "loss": 2.8023, + "theoretical_loss": 3.605618789189155, + "tokens_seen": 1136243712 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033118355065195587, + "loss": 2.5532, + "theoretical_loss": 3.6055994631380806, + "tokens_seen": 1136309248 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033117352056168505, + "loss": 2.6124, + "theoretical_loss": 3.6055801385136665, + "tokens_seen": 1136374784 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033116349047141423, + "loss": 2.5696, + "theoretical_loss": 3.605560815315725, + "tokens_seen": 1136440320 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003311534603811434, + "loss": 2.5819, + "theoretical_loss": 3.605541493544067, + "tokens_seen": 1136505856 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033114343029087265, + "loss": 2.8007, + "theoretical_loss": 3.6055221731985068, + "tokens_seen": 1136571392 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033113340020060177, + "loss": 2.6863, + "theoretical_loss": 3.605502854278857, + "tokens_seen": 1136636928 + }, + { + "epoch": 3.07, + "learning_rate": 0.000331123370110331, + "loss": 2.8361, + "theoretical_loss": 3.605483536784929, + "tokens_seen": 1136702464 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033111334002006013, + "loss": 2.8174, + "theoretical_loss": 3.605464220716536, + "tokens_seen": 1136768000 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033110330992978937, + "loss": 2.7956, + "theoretical_loss": 3.6054449060734903, + "tokens_seen": 1136833536 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033109327983951855, + "loss": 2.8733, + "theoretical_loss": 3.605425592855605, + "tokens_seen": 1136899072 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033108324974924773, + "loss": 2.6193, + "theoretical_loss": 3.6054062810626926, + "tokens_seen": 1136964608 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033107321965897697, + "loss": 2.8384, + "theoretical_loss": 3.605386970694566, + "tokens_seen": 1137030144 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1262017, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1084911823272705, + "objective/train/theoretical_loss": 3.6053821433251354, + "objective/train/tokens_used": 1157506528, + "theoretical_loss": 3.6053821433251354, + "tokens_seen": 1137046528 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033106318956870615, + "loss": 2.7541, + "theoretical_loss": 3.6053676617510377, + "tokens_seen": 1137095680 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033105315947843533, + "loss": 2.5768, + "theoretical_loss": 3.6053483542319205, + "tokens_seen": 1137161216 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003310431293881645, + "loss": 2.6472, + "theoretical_loss": 3.6053290481370275, + "tokens_seen": 1137226752 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003310330992978937, + "loss": 2.4923, + "theoretical_loss": 3.6053097434661714, + "tokens_seen": 1137292288 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003310230692076229, + "loss": 2.7072, + "theoretical_loss": 3.6052904402191652, + "tokens_seen": 1137357824 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003310130391173521, + "loss": 2.7793, + "theoretical_loss": 3.6052711383958216, + "tokens_seen": 1137423360 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033100300902708124, + "loss": 2.8709, + "theoretical_loss": 3.605251837995954, + "tokens_seen": 1137488896 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033099297893681047, + "loss": 2.58, + "theoretical_loss": 3.605232539019375, + "tokens_seen": 1137554432 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003309829488465396, + "loss": 2.7207, + "theoretical_loss": 3.605213241465898, + "tokens_seen": 1137619968 + }, + { + "epoch": 3.07, + "learning_rate": 0.00033097291875626883, + "loss": 2.5586, + "theoretical_loss": 3.605193945335336, + "tokens_seen": 1137685504 + }, + { + "epoch": 3.07, + "learning_rate": 0.000330962888665998, + "loss": 2.5498, + "theoretical_loss": 3.605174650627502, + "tokens_seen": 1137751040 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003309528585757272, + "loss": 2.7564, + "theoretical_loss": 3.6051553573422086, + "tokens_seen": 1137816576 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003309428284854564, + "loss": 2.8811, + "theoretical_loss": 3.60513606547927, + "tokens_seen": 1137882112 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003309327983951856, + "loss": 2.7868, + "theoretical_loss": 3.6051167750384994, + "tokens_seen": 1137947648 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033092276830491474, + "loss": 2.8146, + "theoretical_loss": 3.6050974860197096, + "tokens_seen": 1138013184 + }, + { + "epoch": 3.08, + "learning_rate": 0.000330912738214644, + "loss": 2.7553, + "theoretical_loss": 3.6050781984227136, + "tokens_seen": 1138078720 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003309027081243731, + "loss": 2.8369, + "theoretical_loss": 3.6050589122473253, + "tokens_seen": 1138144256 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033089267803410234, + "loss": 2.5716, + "theoretical_loss": 3.605039627493358, + "tokens_seen": 1138209792 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003308826479438315, + "loss": 2.9157, + "theoretical_loss": 3.605020344160625, + "tokens_seen": 1138275328 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003308726178535607, + "loss": 2.565, + "theoretical_loss": 3.60500106224894, + "tokens_seen": 1138340864 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003308625877632899, + "loss": 2.4975, + "theoretical_loss": 3.6049817817581156, + "tokens_seen": 1138406400 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033085255767301906, + "loss": 2.6338, + "theoretical_loss": 3.6049625026879664, + "tokens_seen": 1138471936 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033084252758274824, + "loss": 2.6649, + "theoretical_loss": 3.604943225038305, + "tokens_seen": 1138537472 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003308324974924775, + "loss": 2.669, + "theoretical_loss": 3.604923948808946, + "tokens_seen": 1138603008 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003308224674022066, + "loss": 2.7445, + "theoretical_loss": 3.604904673999702, + "tokens_seen": 1138668544 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1262682, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.618945837020874, + "objective/train/theoretical_loss": 3.604899855519262, + "objective/train/tokens_used": 1159144928, + "theoretical_loss": 3.604899855519262, + "tokens_seen": 1138684928 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033081243731193584, + "loss": 2.759, + "theoretical_loss": 3.604885400610388, + "tokens_seen": 1138734080 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033080240722166497, + "loss": 2.7804, + "theoretical_loss": 3.6048661286408157, + "tokens_seen": 1138799616 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003307923771313942, + "loss": 2.8219, + "theoretical_loss": 3.6048468580908004, + "tokens_seen": 1138865152 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003307823470411234, + "loss": 2.6843, + "theoretical_loss": 3.6048275889601555, + "tokens_seen": 1138930688 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033077231695085256, + "loss": 2.9408, + "theoretical_loss": 3.6048083212486945, + "tokens_seen": 1138996224 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033076228686058175, + "loss": 2.6726, + "theoretical_loss": 3.604789054956232, + "tokens_seen": 1139061760 + }, + { + "epoch": 3.08, + "learning_rate": 0.000330752256770311, + "loss": 2.8145, + "theoretical_loss": 3.604769790082581, + "tokens_seen": 1139127296 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003307422266800401, + "loss": 2.6603, + "theoretical_loss": 3.604750526627555, + "tokens_seen": 1139192832 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033073219658976934, + "loss": 2.6343, + "theoretical_loss": 3.6047312645909697, + "tokens_seen": 1139258368 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033072216649949847, + "loss": 2.5703, + "theoretical_loss": 3.6047120039726375, + "tokens_seen": 1139323904 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003307121364092277, + "loss": 3.0216, + "theoretical_loss": 3.6046927447723727, + "tokens_seen": 1139389440 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003307021063189569, + "loss": 2.7889, + "theoretical_loss": 3.6046734869899892, + "tokens_seen": 1139454976 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033069207622868607, + "loss": 2.7531, + "theoretical_loss": 3.604654230625302, + "tokens_seen": 1139520512 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033068204613841525, + "loss": 2.6034, + "theoretical_loss": 3.6046349756781244, + "tokens_seen": 1139586048 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033067201604814443, + "loss": 2.7754, + "theoretical_loss": 3.604615722148271, + "tokens_seen": 1139651584 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003306619859578736, + "loss": 2.7181, + "theoretical_loss": 3.6045964700355553, + "tokens_seen": 1139717120 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033065195586760285, + "loss": 2.8112, + "theoretical_loss": 3.6045772193397916, + "tokens_seen": 1139782656 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033064192577733197, + "loss": 2.7175, + "theoretical_loss": 3.6045579700607955, + "tokens_seen": 1139848192 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003306318956870612, + "loss": 2.7196, + "theoretical_loss": 3.6045387221983796, + "tokens_seen": 1139913728 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033062186559679034, + "loss": 2.5488, + "theoretical_loss": 3.604519475752359, + "tokens_seen": 1139979264 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033061183550651957, + "loss": 2.7686, + "theoretical_loss": 3.604500230722548, + "tokens_seen": 1140044800 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033060180541624875, + "loss": 2.8398, + "theoretical_loss": 3.6044809871087606, + "tokens_seen": 1140110336 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033059177532597793, + "loss": 2.8059, + "theoretical_loss": 3.604461744910812, + "tokens_seen": 1140175872 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003305817452357071, + "loss": 2.8029, + "theoretical_loss": 3.6044425041285164, + "tokens_seen": 1140241408 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033057171514543635, + "loss": 2.8484, + "theoretical_loss": 3.604423264761688, + "tokens_seen": 1140306944 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1264116, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.787461757659912, + "objective/train/theoretical_loss": 3.6044184551411256, + "objective/train/tokens_used": 1160783328, + "theoretical_loss": 3.6044184551411256, + "tokens_seen": 1140323328 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003305616850551655, + "loss": 2.7743, + "theoretical_loss": 3.604404026810141, + "tokens_seen": 1140372480 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003305516549648947, + "loss": 2.7962, + "theoretical_loss": 3.6043847902736905, + "tokens_seen": 1140438016 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033054162487462384, + "loss": 2.5863, + "theoretical_loss": 3.6043655551521514, + "tokens_seen": 1140503552 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003305315947843531, + "loss": 2.7442, + "theoretical_loss": 3.6043463214453375, + "tokens_seen": 1140569088 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033052156469408225, + "loss": 2.7425, + "theoretical_loss": 3.604327089153064, + "tokens_seen": 1140634624 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033051153460381144, + "loss": 2.7151, + "theoretical_loss": 3.6043078582751455, + "tokens_seen": 1140700160 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003305015045135406, + "loss": 2.7677, + "theoretical_loss": 3.604288628811397, + "tokens_seen": 1140765696 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003304914744232698, + "loss": 2.8816, + "theoretical_loss": 3.6042694007616327, + "tokens_seen": 1140831232 + }, + { + "epoch": 3.08, + "learning_rate": 0.000330481444332999, + "loss": 2.75, + "theoretical_loss": 3.604250174125668, + "tokens_seen": 1140896768 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003304714142427282, + "loss": 2.8542, + "theoretical_loss": 3.604230948903317, + "tokens_seen": 1140962304 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033046138415245734, + "loss": 2.7008, + "theoretical_loss": 3.6042117250943955, + "tokens_seen": 1141027840 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003304513540621866, + "loss": 2.8737, + "theoretical_loss": 3.6041925026987176, + "tokens_seen": 1141093376 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003304413239719157, + "loss": 2.6164, + "theoretical_loss": 3.6041732817160987, + "tokens_seen": 1141158912 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033043129388164494, + "loss": 2.9281, + "theoretical_loss": 3.604154062146354, + "tokens_seen": 1141224448 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003304212637913741, + "loss": 2.714, + "theoretical_loss": 3.604134843989298, + "tokens_seen": 1141289984 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003304112337011033, + "loss": 2.6748, + "theoretical_loss": 3.604115627244746, + "tokens_seen": 1141355520 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003304012036108325, + "loss": 2.5346, + "theoretical_loss": 3.604096411912513, + "tokens_seen": 1141421056 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003303911735205617, + "loss": 2.7443, + "theoretical_loss": 3.604077197992414, + "tokens_seen": 1141486592 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033038114343029084, + "loss": 2.901, + "theoretical_loss": 3.604057985484265, + "tokens_seen": 1141552128 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003303711133400201, + "loss": 2.9053, + "theoretical_loss": 3.6040387743878797, + "tokens_seen": 1141617664 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003303610832497492, + "loss": 2.9141, + "theoretical_loss": 3.6040195647030746, + "tokens_seen": 1141683200 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033035105315947844, + "loss": 2.6902, + "theoretical_loss": 3.604000356429664, + "tokens_seen": 1141748736 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003303410230692076, + "loss": 2.7633, + "theoretical_loss": 3.6039811495674643, + "tokens_seen": 1141814272 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003303309929789368, + "loss": 2.9135, + "theoretical_loss": 3.60396194411629, + "tokens_seen": 1141879808 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033032096288866604, + "loss": 2.5842, + "theoretical_loss": 3.6039427400759565, + "tokens_seen": 1141945344 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1264658, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5670065879821777, + "objective/train/theoretical_loss": 3.603937939286295, + "objective/train/tokens_used": 1162421728, + "theoretical_loss": 3.603937939286295, + "tokens_seen": 1141961728 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033031093279839517, + "loss": 2.6734, + "theoretical_loss": 3.6039235374462795, + "tokens_seen": 1142010880 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003303009027081244, + "loss": 2.6518, + "theoretical_loss": 3.6039043362270746, + "tokens_seen": 1142076416 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003302908726178536, + "loss": 2.6918, + "theoretical_loss": 3.6038851364181568, + "tokens_seen": 1142141952 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033028084252758276, + "loss": 2.8456, + "theoretical_loss": 3.603865938019341, + "tokens_seen": 1142207488 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033027081243731195, + "loss": 2.6436, + "theoretical_loss": 3.603846741030445, + "tokens_seen": 1142273024 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003302607823470412, + "loss": 2.6678, + "theoretical_loss": 3.603827545451282, + "tokens_seen": 1142338560 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003302507522567703, + "loss": 2.7732, + "theoretical_loss": 3.6038083512816685, + "tokens_seen": 1142404096 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033024072216649954, + "loss": 2.7161, + "theoretical_loss": 3.6037891585214203, + "tokens_seen": 1142469632 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033023069207622867, + "loss": 2.7033, + "theoretical_loss": 3.6037699671703534, + "tokens_seen": 1142535168 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003302206619859579, + "loss": 2.7151, + "theoretical_loss": 3.6037507772282824, + "tokens_seen": 1142600704 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003302106318956871, + "loss": 2.7908, + "theoretical_loss": 3.603731588695024, + "tokens_seen": 1142666240 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033020060180541627, + "loss": 2.8021, + "theoretical_loss": 3.6037124015703936, + "tokens_seen": 1142731776 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033019057171514545, + "loss": 2.6327, + "theoretical_loss": 3.603693215854207, + "tokens_seen": 1142797312 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033018054162487463, + "loss": 2.7521, + "theoretical_loss": 3.6036740315462805, + "tokens_seen": 1142862848 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003301705115346038, + "loss": 2.6045, + "theoretical_loss": 3.6036548486464293, + "tokens_seen": 1142928384 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033016048144433305, + "loss": 2.6104, + "theoretical_loss": 3.60363566715447, + "tokens_seen": 1142993920 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003301504513540622, + "loss": 2.602, + "theoretical_loss": 3.603616487070218, + "tokens_seen": 1143059456 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003301404212637914, + "loss": 2.6504, + "theoretical_loss": 3.6035973083934896, + "tokens_seen": 1143124992 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033013039117352054, + "loss": 2.6241, + "theoretical_loss": 3.6035781311241006, + "tokens_seen": 1143190528 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033012036108324977, + "loss": 2.4047, + "theoretical_loss": 3.6035589552618674, + "tokens_seen": 1143256064 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033011033099297895, + "loss": 2.6619, + "theoretical_loss": 3.6035397808066056, + "tokens_seen": 1143321600 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033010030090270813, + "loss": 2.7529, + "theoretical_loss": 3.6035206077581314, + "tokens_seen": 1143387136 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003300902708124373, + "loss": 2.5172, + "theoretical_loss": 3.603501436116262, + "tokens_seen": 1143452672 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033008024072216655, + "loss": 2.3102, + "theoretical_loss": 3.603482265880812, + "tokens_seen": 1143518208 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003300702106318957, + "loss": 2.8067, + "theoretical_loss": 3.6034630970515984, + "tokens_seen": 1143583744 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1266087, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.628338098526001, + "objective/train/theoretical_loss": 3.603458305063998, + "objective/train/tokens_used": 1164060128, + "theoretical_loss": 3.603458305063998, + "tokens_seen": 1143600128 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003300601805416249, + "loss": 2.8557, + "theoretical_loss": 3.6034439296284377, + "tokens_seen": 1143649280 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033005015045135404, + "loss": 2.7171, + "theoretical_loss": 3.6034247636111463, + "tokens_seen": 1143714816 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003300401203610833, + "loss": 2.6378, + "theoretical_loss": 3.6034055989995393, + "tokens_seen": 1143780352 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033003009027081245, + "loss": 2.5898, + "theoretical_loss": 3.6033864357934347, + "tokens_seen": 1143845888 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033002006018054164, + "loss": 2.6563, + "theoretical_loss": 3.603367273992648, + "tokens_seen": 1143911424 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003300100300902708, + "loss": 2.9048, + "theoretical_loss": 3.603348113596996, + "tokens_seen": 1143976960 + }, + { + "epoch": 3.08, + "learning_rate": 0.00033, + "loss": 2.9012, + "theoretical_loss": 3.6033289546062948, + "tokens_seen": 1144042496 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003299899699097292, + "loss": 2.437, + "theoretical_loss": 3.603309797020361, + "tokens_seen": 1144108032 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003299799398194584, + "loss": 2.7824, + "theoretical_loss": 3.6032906408390115, + "tokens_seen": 1144173568 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032996990972918754, + "loss": 2.5932, + "theoretical_loss": 3.603271486062062, + "tokens_seen": 1144239104 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003299598796389168, + "loss": 2.8508, + "theoretical_loss": 3.60325233268933, + "tokens_seen": 1144304640 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003299498495486459, + "loss": 2.912, + "theoretical_loss": 3.6032331807206326, + "tokens_seen": 1144370176 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032993981945837514, + "loss": 2.7717, + "theoretical_loss": 3.603214030155785, + "tokens_seen": 1144435712 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003299297893681043, + "loss": 2.4534, + "theoretical_loss": 3.6031948809946046, + "tokens_seen": 1144501248 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003299197592778335, + "loss": 2.6931, + "theoretical_loss": 3.6031757332369088, + "tokens_seen": 1144566784 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003299097291875627, + "loss": 2.7032, + "theoretical_loss": 3.603156586882513, + "tokens_seen": 1144632320 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003298996990972919, + "loss": 2.4714, + "theoretical_loss": 3.6031374419312354, + "tokens_seen": 1144697856 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032988966900702104, + "loss": 2.7777, + "theoretical_loss": 3.603118298382892, + "tokens_seen": 1144763392 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003298796389167503, + "loss": 2.6707, + "theoretical_loss": 3.6030991562373003, + "tokens_seen": 1144828928 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003298696088264794, + "loss": 2.808, + "theoretical_loss": 3.6030800154942764, + "tokens_seen": 1144894464 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032985957873620864, + "loss": 2.8968, + "theoretical_loss": 3.603060876153638, + "tokens_seen": 1144960000 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003298495486459378, + "loss": 2.7766, + "theoretical_loss": 3.603041738215202, + "tokens_seen": 1145025536 + }, + { + "epoch": 3.08, + "learning_rate": 0.000329839518555667, + "loss": 2.5747, + "theoretical_loss": 3.6030226016787847, + "tokens_seen": 1145091072 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003298294884653962, + "loss": 2.5625, + "theoretical_loss": 3.603003466544204, + "tokens_seen": 1145156608 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032981945837512537, + "loss": 2.6457, + "theoretical_loss": 3.6029843328112765, + "tokens_seen": 1145222144 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1266852, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9177322387695312, + "objective/train/theoretical_loss": 3.602979549597032, + "objective/train/tokens_used": 1165698528, + "theoretical_loss": 3.602979549597032, + "tokens_seen": 1145238528 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032980942828485455, + "loss": 2.7197, + "theoretical_loss": 3.6029652004798196, + "tokens_seen": 1145287680 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003297993981945838, + "loss": 2.5158, + "theoretical_loss": 3.6029460695496507, + "tokens_seen": 1145353216 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003297893681043129, + "loss": 2.5897, + "theoretical_loss": 3.6029269400205868, + "tokens_seen": 1145418752 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032977933801404215, + "loss": 2.7719, + "theoretical_loss": 3.602907811892444, + "tokens_seen": 1145484288 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003297693079237713, + "loss": 2.9219, + "theoretical_loss": 3.6028886851650412, + "tokens_seen": 1145549824 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003297592778335005, + "loss": 2.6883, + "theoretical_loss": 3.6028695598381955, + "tokens_seen": 1145615360 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003297492477432297, + "loss": 2.7103, + "theoretical_loss": 3.6028504359117233, + "tokens_seen": 1145680896 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032973921765295887, + "loss": 2.7711, + "theoretical_loss": 3.6028313133854426, + "tokens_seen": 1145746432 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032972918756268805, + "loss": 2.6575, + "theoretical_loss": 3.6028121922591705, + "tokens_seen": 1145811968 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003297191574724173, + "loss": 2.783, + "theoretical_loss": 3.602793072532725, + "tokens_seen": 1145877504 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003297091273821464, + "loss": 2.6168, + "theoretical_loss": 3.6027739542059223, + "tokens_seen": 1145943040 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032969909729187565, + "loss": 2.8734, + "theoretical_loss": 3.6027548372785816, + "tokens_seen": 1146008576 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003296890672016048, + "loss": 2.8027, + "theoretical_loss": 3.60273572175052, + "tokens_seen": 1146074112 + }, + { + "epoch": 3.08, + "learning_rate": 0.000329679037111334, + "loss": 2.6679, + "theoretical_loss": 3.6027166076215535, + "tokens_seen": 1146139648 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003296690070210632, + "loss": 2.7966, + "theoretical_loss": 3.6026974948915016, + "tokens_seen": 1146205184 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003296589769307924, + "loss": 2.6869, + "theoretical_loss": 3.602678383560181, + "tokens_seen": 1146270720 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032964894684052155, + "loss": 2.7343, + "theoretical_loss": 3.6026592736274097, + "tokens_seen": 1146336256 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032963891675025074, + "loss": 2.6343, + "theoretical_loss": 3.602640165093005, + "tokens_seen": 1146401792 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003296288866599799, + "loss": 2.9379, + "theoretical_loss": 3.6026210579567852, + "tokens_seen": 1146467328 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032961885656970915, + "loss": 2.5409, + "theoretical_loss": 3.6026019522185684, + "tokens_seen": 1146532864 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003296088264794383, + "loss": 2.711, + "theoretical_loss": 3.602582847878171, + "tokens_seen": 1146598400 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003295987963891675, + "loss": 2.8479, + "theoretical_loss": 3.6025637449354124, + "tokens_seen": 1146663936 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003295887662988967, + "loss": 2.7248, + "theoretical_loss": 3.6025446433901096, + "tokens_seen": 1146729472 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003295787362086259, + "loss": 2.7041, + "theoretical_loss": 3.6025255432420806, + "tokens_seen": 1146795008 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003295687061183551, + "loss": 2.8072, + "theoretical_loss": 3.6025064444911434, + "tokens_seen": 1146860544 + }, + { + "debugging/Self-BLEU-5": 0.4473570585925922, + "debugging/distinct-1-grams": 0.7971195468618069, + "debugging/distinct-2-grams": 0.9608189858650149, + "debugging/entropy-1-grams": 5.677884863520164, + "debugging/entropy-2-grams": 6.546455298638856, + "debugging/length": 517.5, + "debugging/num_segments": 10, + "debugging/score": 0.004519826938251854, + "debugging/score_std": 0.004873429920306367, + "epoch": 3.08, + "objective/train/docs_used": 1267667, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.091930866241455, + "objective/train/theoretical_loss": 3.602501670021683, + "objective/train/tokens_used": 1167336928, + "theoretical_loss": 3.602501670021683, + "tokens_seen": 1146876928 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032955867602808424, + "loss": 2.711, + "theoretical_loss": 3.6024873471371155, + "tokens_seen": 1146926080 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003295486459378135, + "loss": 2.6344, + "theoretical_loss": 3.602468251179816, + "tokens_seen": 1146991616 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032953861584754265, + "loss": 2.6278, + "theoretical_loss": 3.6024491566190626, + "tokens_seen": 1147057152 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032952858575727184, + "loss": 2.5794, + "theoretical_loss": 3.6024300634546726, + "tokens_seen": 1147122688 + }, + { + "epoch": 3.08, + "learning_rate": 0.000329518555667001, + "loss": 2.5395, + "theoretical_loss": 3.6024109716864654, + "tokens_seen": 1147188224 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003295085255767302, + "loss": 2.5811, + "theoretical_loss": 3.602391881314258, + "tokens_seen": 1147253760 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003294984954864594, + "loss": 2.8294, + "theoretical_loss": 3.602372792337869, + "tokens_seen": 1147319296 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003294884653961886, + "loss": 2.4544, + "theoretical_loss": 3.6023537047571166, + "tokens_seen": 1147384832 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032947843530591774, + "loss": 2.4459, + "theoretical_loss": 3.6023346185718195, + "tokens_seen": 1147450368 + }, + { + "epoch": 3.08, + "learning_rate": 0.000329468405215647, + "loss": 2.7234, + "theoretical_loss": 3.6023155337817956, + "tokens_seen": 1147515904 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003294583751253761, + "loss": 2.7293, + "theoretical_loss": 3.602296450386863, + "tokens_seen": 1147581440 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032944834503510534, + "loss": 2.6907, + "theoretical_loss": 3.60227736838684, + "tokens_seen": 1147646976 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003294383149448345, + "loss": 2.7077, + "theoretical_loss": 3.602258287781545, + "tokens_seen": 1147712512 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003294282848545637, + "loss": 2.979, + "theoretical_loss": 3.6022392085707975, + "tokens_seen": 1147778048 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003294182547642929, + "loss": 2.687, + "theoretical_loss": 3.602220130754415, + "tokens_seen": 1147843584 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003294082246740221, + "loss": 2.7995, + "theoretical_loss": 3.602201054332216, + "tokens_seen": 1147909120 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032939819458375124, + "loss": 2.847, + "theoretical_loss": 3.602181979304019, + "tokens_seen": 1147974656 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003293881644934805, + "loss": 2.6341, + "theoretical_loss": 3.602162905669643, + "tokens_seen": 1148040192 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003293781344032096, + "loss": 2.773, + "theoretical_loss": 3.602143833428906, + "tokens_seen": 1148105728 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032936810431293884, + "loss": 2.7618, + "theoretical_loss": 3.6021247625816275, + "tokens_seen": 1148171264 + }, + { + "epoch": 3.08, + "learning_rate": 0.000329358074222668, + "loss": 2.6518, + "theoretical_loss": 3.6021056931276254, + "tokens_seen": 1148236800 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003293480441323972, + "loss": 2.7529, + "theoretical_loss": 3.6020866250667183, + "tokens_seen": 1148302336 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003293380140421264, + "loss": 2.71, + "theoretical_loss": 3.602067558398726, + "tokens_seen": 1148367872 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032932798395185557, + "loss": 2.8273, + "theoretical_loss": 3.6020484931234655, + "tokens_seen": 1148433408 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032931795386158475, + "loss": 2.8462, + "theoretical_loss": 3.6020294292407575, + "tokens_seen": 1148498944 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1268910, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8044888973236084, + "objective/train/theoretical_loss": 3.6020246634876454, + "objective/train/tokens_used": 1168975328, + "theoretical_loss": 3.6020246634876454, + "tokens_seen": 1148515328 + }, + { + "epoch": 3.08, + "learning_rate": 0.000329307923771314, + "loss": 2.5725, + "theoretical_loss": 3.6020103667504193, + "tokens_seen": 1148564480 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003292978936810431, + "loss": 2.7818, + "theoretical_loss": 3.601991305652271, + "tokens_seen": 1148630016 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032928786359077235, + "loss": 2.7852, + "theoretical_loss": 3.601972245946131, + "tokens_seen": 1148695552 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003292778335005015, + "loss": 2.8112, + "theoretical_loss": 3.6019531876318176, + "tokens_seen": 1148761088 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003292678034102307, + "loss": 2.7196, + "theoretical_loss": 3.60193413070915, + "tokens_seen": 1148826624 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003292577733199599, + "loss": 2.822, + "theoretical_loss": 3.6019150751779483, + "tokens_seen": 1148892160 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032924774322968907, + "loss": 2.8823, + "theoretical_loss": 3.601896021038031, + "tokens_seen": 1148957696 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032923771313941825, + "loss": 2.6873, + "theoretical_loss": 3.6018769682892158, + "tokens_seen": 1149023232 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003292276830491475, + "loss": 2.7185, + "theoretical_loss": 3.6018579169313236, + "tokens_seen": 1149088768 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003292176529588766, + "loss": 2.6924, + "theoretical_loss": 3.601838866964173, + "tokens_seen": 1149154304 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032920762286860585, + "loss": 2.6421, + "theoretical_loss": 3.6018198183875825, + "tokens_seen": 1149219840 + }, + { + "epoch": 3.08, + "learning_rate": 0.000329197592778335, + "loss": 2.6667, + "theoretical_loss": 3.6018007712013724, + "tokens_seen": 1149285376 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003291875626880642, + "loss": 2.5585, + "theoretical_loss": 3.6017817254053615, + "tokens_seen": 1149350912 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003291775325977934, + "loss": 2.761, + "theoretical_loss": 3.6017626809993684, + "tokens_seen": 1149416448 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003291675025075226, + "loss": 2.5061, + "theoretical_loss": 3.6017436379832133, + "tokens_seen": 1149481984 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032915747241725175, + "loss": 2.7733, + "theoretical_loss": 3.601724596356715, + "tokens_seen": 1149547520 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032914744232698094, + "loss": 2.4415, + "theoretical_loss": 3.6017055561196933, + "tokens_seen": 1149613056 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003291374122367101, + "loss": 2.5899, + "theoretical_loss": 3.6016865172719674, + "tokens_seen": 1149678592 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032912738214643935, + "loss": 2.5564, + "theoretical_loss": 3.601667479813356, + "tokens_seen": 1149744128 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003291173520561685, + "loss": 2.7516, + "theoretical_loss": 3.60164844374368, + "tokens_seen": 1149809664 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003291073219658977, + "loss": 2.7717, + "theoretical_loss": 3.6016294090627583, + "tokens_seen": 1149875200 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003290972918756269, + "loss": 2.81, + "theoretical_loss": 3.6016103757704103, + "tokens_seen": 1149940736 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003290872617853561, + "loss": 2.7642, + "theoretical_loss": 3.6015913438664553, + "tokens_seen": 1150006272 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032907723169508526, + "loss": 2.5971, + "theoretical_loss": 3.6015723133507134, + "tokens_seen": 1150071808 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032906720160481444, + "loss": 2.6199, + "theoretical_loss": 3.6015532842230042, + "tokens_seen": 1150137344 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1269509, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0626049041748047, + "objective/train/theoretical_loss": 3.601548527157936, + "objective/train/tokens_used": 1170613728, + "theoretical_loss": 3.601548527157936, + "tokens_seen": 1150153728 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003290571715145436, + "loss": 2.6663, + "theoretical_loss": 3.6015342564831476, + "tokens_seen": 1150202880 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032904714142427286, + "loss": 2.5544, + "theoretical_loss": 3.6015152301309623, + "tokens_seen": 1150268416 + }, + { + "epoch": 3.08, + "learning_rate": 0.000329037111334002, + "loss": 2.7881, + "theoretical_loss": 3.601496205166269, + "tokens_seen": 1150333952 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003290270812437312, + "loss": 2.8447, + "theoretical_loss": 3.601477181588887, + "tokens_seen": 1150399488 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032901705115346034, + "loss": 2.8106, + "theoretical_loss": 3.6014581593986366, + "tokens_seen": 1150465024 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003290070210631896, + "loss": 2.6651, + "theoretical_loss": 3.6014391385953375, + "tokens_seen": 1150530560 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032899699097291876, + "loss": 2.6976, + "theoretical_loss": 3.601420119178809, + "tokens_seen": 1150596096 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032898696088264794, + "loss": 2.6538, + "theoretical_loss": 3.601401101148872, + "tokens_seen": 1150661632 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003289769307923771, + "loss": 2.5883, + "theoretical_loss": 3.6013820845053455, + "tokens_seen": 1150727168 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003289669007021063, + "loss": 2.7749, + "theoretical_loss": 3.60136306924805, + "tokens_seen": 1150792704 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003289568706118355, + "loss": 2.7311, + "theoretical_loss": 3.6013440553768055, + "tokens_seen": 1150858240 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003289468405215647, + "loss": 2.7103, + "theoretical_loss": 3.601325042891432, + "tokens_seen": 1150923776 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032893681043129385, + "loss": 2.8096, + "theoretical_loss": 3.601306031791749, + "tokens_seen": 1150989312 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003289267803410231, + "loss": 2.6958, + "theoretical_loss": 3.6012870220775772, + "tokens_seen": 1151054848 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032891675025075226, + "loss": 2.6214, + "theoretical_loss": 3.6012680137487374, + "tokens_seen": 1151120384 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032890672016048145, + "loss": 2.5678, + "theoretical_loss": 3.6012490068050482, + "tokens_seen": 1151185920 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003288966900702106, + "loss": 2.8512, + "theoretical_loss": 3.6012300012463316, + "tokens_seen": 1151251456 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003288866599799398, + "loss": 2.8587, + "theoretical_loss": 3.601210997072406, + "tokens_seen": 1151316992 + }, + { + "epoch": 3.08, + "learning_rate": 0.000328876629889669, + "loss": 2.6661, + "theoretical_loss": 3.6011919942830932, + "tokens_seen": 1151382528 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003288665997993982, + "loss": 2.7665, + "theoretical_loss": 3.601172992878213, + "tokens_seen": 1151448064 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032885656970912735, + "loss": 2.7371, + "theoretical_loss": 3.6011539928575855, + "tokens_seen": 1151513600 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003288465396188566, + "loss": 2.5777, + "theoretical_loss": 3.601134994221032, + "tokens_seen": 1151579136 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003288365095285857, + "loss": 2.7746, + "theoretical_loss": 3.601115996968371, + "tokens_seen": 1151644672 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032882647943831495, + "loss": 2.6474, + "theoretical_loss": 3.6010970010994248, + "tokens_seen": 1151710208 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003288164493480442, + "loss": 2.5405, + "theoretical_loss": 3.6010780066140127, + "tokens_seen": 1151775744 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1270458, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7559614181518555, + "objective/train/theoretical_loss": 3.601073258208816, + "objective/train/tokens_used": 1172252128, + "theoretical_loss": 3.601073258208816, + "tokens_seen": 1151792128 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003288064192577733, + "loss": 2.6892, + "theoretical_loss": 3.6010590135119562, + "tokens_seen": 1151841280 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032879638916750255, + "loss": 2.5914, + "theoretical_loss": 3.601040021793075, + "tokens_seen": 1151906816 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003287863590772317, + "loss": 2.6654, + "theoretical_loss": 3.601021031457191, + "tokens_seen": 1151972352 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003287763289869609, + "loss": 2.6697, + "theoretical_loss": 3.601002042504123, + "tokens_seen": 1152037888 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003287662988966901, + "loss": 2.6694, + "theoretical_loss": 3.600983054933693, + "tokens_seen": 1152103424 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032875626880641927, + "loss": 2.8141, + "theoretical_loss": 3.600964068745721, + "tokens_seen": 1152168960 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032874623871614845, + "loss": 2.6888, + "theoretical_loss": 3.600945083940028, + "tokens_seen": 1152234496 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003287362086258777, + "loss": 2.8488, + "theoretical_loss": 3.6009261005164346, + "tokens_seen": 1152300032 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003287261785356068, + "loss": 2.5548, + "theoretical_loss": 3.600907118474762, + "tokens_seen": 1152365568 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032871614844533605, + "loss": 2.746, + "theoretical_loss": 3.600888137814831, + "tokens_seen": 1152431104 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003287061183550652, + "loss": 2.5498, + "theoretical_loss": 3.6008691585364616, + "tokens_seen": 1152496640 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003286960882647944, + "loss": 2.5718, + "theoretical_loss": 3.6008501806394753, + "tokens_seen": 1152562176 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003286860581745236, + "loss": 2.8185, + "theoretical_loss": 3.600831204123693, + "tokens_seen": 1152627712 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003286760280842528, + "loss": 2.7984, + "theoretical_loss": 3.6008122289889357, + "tokens_seen": 1152693248 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032866599799398195, + "loss": 2.6887, + "theoretical_loss": 3.600793255235024, + "tokens_seen": 1152758784 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032865596790371114, + "loss": 2.6701, + "theoretical_loss": 3.6007742828617797, + "tokens_seen": 1152824320 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003286459378134403, + "loss": 2.5288, + "theoretical_loss": 3.600755311869023, + "tokens_seen": 1152889856 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032863590772316955, + "loss": 2.7081, + "theoretical_loss": 3.6007363422565755, + "tokens_seen": 1152955392 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003286258776328987, + "loss": 2.7438, + "theoretical_loss": 3.600717374024258, + "tokens_seen": 1153020928 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003286158475426279, + "loss": 2.6553, + "theoretical_loss": 3.600698407171892, + "tokens_seen": 1153086464 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003286058174523571, + "loss": 2.8296, + "theoretical_loss": 3.600679441699299, + "tokens_seen": 1153152000 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003285957873620863, + "loss": 2.7507, + "theoretical_loss": 3.6006604776062994, + "tokens_seen": 1153217536 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032858575727181546, + "loss": 2.7123, + "theoretical_loss": 3.6006415148927147, + "tokens_seen": 1153283072 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032857572718154464, + "loss": 2.5924, + "theoretical_loss": 3.6006225535583667, + "tokens_seen": 1153348608 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003285656970912738, + "loss": 2.8264, + "theoretical_loss": 3.6006035936030756, + "tokens_seen": 1153414144 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1271091, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.858920097351074, + "objective/train/theoretical_loss": 3.60059885382971, + "objective/train/tokens_used": 1173890528, + "theoretical_loss": 3.60059885382971, + "tokens_seen": 1153430528 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032855566700100306, + "loss": 3.0625, + "theoretical_loss": 3.6005846350266637, + "tokens_seen": 1153479680 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003285456369107322, + "loss": 2.8801, + "theoretical_loss": 3.6005656778289525, + "tokens_seen": 1153545216 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003285356068204614, + "loss": 2.6872, + "theoretical_loss": 3.6005467220097627, + "tokens_seen": 1153610752 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032852557673019054, + "loss": 2.8549, + "theoretical_loss": 3.6005277675689165, + "tokens_seen": 1153676288 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003285155466399198, + "loss": 2.833, + "theoretical_loss": 3.6005088145062345, + "tokens_seen": 1153741824 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032850551654964896, + "loss": 2.7275, + "theoretical_loss": 3.600489862821539, + "tokens_seen": 1153807360 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032849548645937814, + "loss": 2.7133, + "theoretical_loss": 3.600470912514651, + "tokens_seen": 1153872896 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003284854563691073, + "loss": 2.6046, + "theoretical_loss": 3.6004519635853924, + "tokens_seen": 1153938432 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003284754262788365, + "loss": 2.4431, + "theoretical_loss": 3.600433016033585, + "tokens_seen": 1154003968 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003284653961885657, + "loss": 2.5941, + "theoretical_loss": 3.6004140698590503, + "tokens_seen": 1154069504 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003284553660982949, + "loss": 2.6592, + "theoretical_loss": 3.6003951250616097, + "tokens_seen": 1154135040 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032844533600802405, + "loss": 2.8864, + "theoretical_loss": 3.600376181641085, + "tokens_seen": 1154200576 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003284353059177533, + "loss": 2.7652, + "theoretical_loss": 3.6003572395972987, + "tokens_seen": 1154266112 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032842527582748246, + "loss": 2.7161, + "theoretical_loss": 3.6003382989300716, + "tokens_seen": 1154331648 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032841524573721165, + "loss": 2.7713, + "theoretical_loss": 3.600319359639226, + "tokens_seen": 1154397184 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003284052156469408, + "loss": 2.6318, + "theoretical_loss": 3.600300421724583, + "tokens_seen": 1154462720 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032839518555667, + "loss": 2.5957, + "theoretical_loss": 3.600281485185966, + "tokens_seen": 1154528256 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003283851554663992, + "loss": 2.536, + "theoretical_loss": 3.6002625500231957, + "tokens_seen": 1154593792 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003283751253761284, + "loss": 2.5904, + "theoretical_loss": 3.6002436162360945, + "tokens_seen": 1154659328 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032836509528585755, + "loss": 2.5183, + "theoretical_loss": 3.600224683824484, + "tokens_seen": 1154724864 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003283550651955868, + "loss": 2.7599, + "theoretical_loss": 3.6002057527881868, + "tokens_seen": 1154790400 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003283450351053159, + "loss": 2.7745, + "theoretical_loss": 3.600186823127024, + "tokens_seen": 1154855936 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032833500501504515, + "loss": 2.6968, + "theoretical_loss": 3.600167894840819, + "tokens_seen": 1154921472 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032832497492477433, + "loss": 2.7957, + "theoretical_loss": 3.6001489679293934, + "tokens_seen": 1154987008 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003283149448345035, + "loss": 2.8614, + "theoretical_loss": 3.6001300423925686, + "tokens_seen": 1155052544 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1272129, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.853865623474121, + "objective/train/theoretical_loss": 3.600125311223123, + "objective/train/tokens_used": 1175528928, + "theoretical_loss": 3.600125311223123, + "tokens_seen": 1155068928 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003283049147442327, + "loss": 2.8174, + "theoretical_loss": 3.600111118230168, + "tokens_seen": 1155118080 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003282948846539619, + "loss": 2.8958, + "theoretical_loss": 3.600092195442013, + "tokens_seen": 1155183616 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032828485456369105, + "loss": 2.7905, + "theoretical_loss": 3.6000732740279258, + "tokens_seen": 1155249152 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003282748244734203, + "loss": 2.7081, + "theoretical_loss": 3.600054353987729, + "tokens_seen": 1155314688 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003282647943831494, + "loss": 2.413, + "theoretical_loss": 3.600035435321245, + "tokens_seen": 1155380224 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032825476429287865, + "loss": 2.695, + "theoretical_loss": 3.6000165180282955, + "tokens_seen": 1155445760 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032824473420260783, + "loss": 2.5683, + "theoretical_loss": 3.5999976021087035, + "tokens_seen": 1155511296 + }, + { + "epoch": 3.08, + "learning_rate": 0.000328234704112337, + "loss": 2.5468, + "theoretical_loss": 3.5999786875622917, + "tokens_seen": 1155576832 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003282246740220662, + "loss": 2.413, + "theoretical_loss": 3.599959774388882, + "tokens_seen": 1155642368 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003282146439317954, + "loss": 2.7004, + "theoretical_loss": 3.599940862588297, + "tokens_seen": 1155707904 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032820461384152456, + "loss": 2.6909, + "theoretical_loss": 3.5999219521603587, + "tokens_seen": 1155773440 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003281945837512538, + "loss": 2.6059, + "theoretical_loss": 3.5999030431048906, + "tokens_seen": 1155838976 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003281845536609829, + "loss": 2.6892, + "theoretical_loss": 3.599884135421715, + "tokens_seen": 1155904512 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032817452357071215, + "loss": 2.5906, + "theoretical_loss": 3.5998652291106543, + "tokens_seen": 1155970048 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003281644934804413, + "loss": 2.7893, + "theoretical_loss": 3.599846324171531, + "tokens_seen": 1156035584 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003281544633901705, + "loss": 2.6951, + "theoretical_loss": 3.5998274206041683, + "tokens_seen": 1156101120 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003281444332998997, + "loss": 2.6377, + "theoretical_loss": 3.5998085184083886, + "tokens_seen": 1156166656 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003281344032096289, + "loss": 2.5826, + "theoretical_loss": 3.5997896175840145, + "tokens_seen": 1156232192 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032812437311935806, + "loss": 2.7148, + "theoretical_loss": 3.5997707181308685, + "tokens_seen": 1156297728 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003281143430290873, + "loss": 2.7672, + "theoretical_loss": 3.599751820048774, + "tokens_seen": 1156363264 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003281043129388164, + "loss": 2.8374, + "theoretical_loss": 3.5997329233375543, + "tokens_seen": 1156428800 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032809428284854566, + "loss": 2.6762, + "theoretical_loss": 3.599714027997031, + "tokens_seen": 1156494336 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003280842527582748, + "loss": 2.3282, + "theoretical_loss": 3.599695134027028, + "tokens_seen": 1156559872 + }, + { + "epoch": 3.08, + "learning_rate": 0.000328074222668004, + "loss": 2.7864, + "theoretical_loss": 3.5996762414273675, + "tokens_seen": 1156625408 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032806419257773326, + "loss": 2.8162, + "theoretical_loss": 3.599657350197873, + "tokens_seen": 1156690944 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1272864, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.16133189201355, + "objective/train/theoretical_loss": 3.5996526276045673, + "objective/train/tokens_used": 1177167328, + "theoretical_loss": 3.5996526276045673, + "tokens_seen": 1156707328 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003280541624874624, + "loss": 2.8585, + "theoretical_loss": 3.599638460338368, + "tokens_seen": 1156756480 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003280441323971916, + "loss": 2.5503, + "theoretical_loss": 3.599619571848674, + "tokens_seen": 1156822016 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032803410230692074, + "loss": 2.7527, + "theoretical_loss": 3.5996006847286157, + "tokens_seen": 1156887552 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032802407221665, + "loss": 2.6692, + "theoretical_loss": 3.599581798978015, + "tokens_seen": 1156953088 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032801404212637916, + "loss": 2.475, + "theoretical_loss": 3.599562914596696, + "tokens_seen": 1157018624 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032800401203610834, + "loss": 2.62, + "theoretical_loss": 3.5995440315844807, + "tokens_seen": 1157084160 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003279939819458375, + "loss": 2.8344, + "theoretical_loss": 3.5995251499411935, + "tokens_seen": 1157149696 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003279839518555667, + "loss": 2.6817, + "theoretical_loss": 3.5995062696666573, + "tokens_seen": 1157215232 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003279739217652959, + "loss": 2.7615, + "theoretical_loss": 3.599487390760695, + "tokens_seen": 1157280768 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003279638916750251, + "loss": 2.567, + "theoretical_loss": 3.5994685132231297, + "tokens_seen": 1157346304 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032795386158475425, + "loss": 2.4982, + "theoretical_loss": 3.5994496370537856, + "tokens_seen": 1157411840 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003279438314944835, + "loss": 2.7333, + "theoretical_loss": 3.599430762252486, + "tokens_seen": 1157477376 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032793380140421266, + "loss": 2.8056, + "theoretical_loss": 3.599411888819054, + "tokens_seen": 1157542912 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032792377131394185, + "loss": 2.5637, + "theoretical_loss": 3.599393016753312, + "tokens_seen": 1157608448 + }, + { + "epoch": 3.08, + "learning_rate": 0.000327913741223671, + "loss": 2.7559, + "theoretical_loss": 3.599374146055085, + "tokens_seen": 1157673984 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003279037111334002, + "loss": 2.7122, + "theoretical_loss": 3.599355276724196, + "tokens_seen": 1157739520 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003278936810431294, + "loss": 2.7572, + "theoretical_loss": 3.5993364087604682, + "tokens_seen": 1157805056 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003278836509528586, + "loss": 2.5657, + "theoretical_loss": 3.5993175421637256, + "tokens_seen": 1157870592 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032787362086258775, + "loss": 2.7728, + "theoretical_loss": 3.599298676933792, + "tokens_seen": 1157936128 + }, + { + "epoch": 3.08, + "learning_rate": 0.000327863590772317, + "loss": 2.2984, + "theoretical_loss": 3.59927981307049, + "tokens_seen": 1158001664 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003278535606820461, + "loss": 2.5368, + "theoretical_loss": 3.599260950573644, + "tokens_seen": 1158067200 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032784353059177535, + "loss": 2.6498, + "theoretical_loss": 3.599242089443078, + "tokens_seen": 1158132736 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032783350050150453, + "loss": 2.5435, + "theoretical_loss": 3.599223229678615, + "tokens_seen": 1158198272 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003278234704112337, + "loss": 2.6249, + "theoretical_loss": 3.5992043712800794, + "tokens_seen": 1158263808 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003278134403209629, + "loss": 2.8027, + "theoretical_loss": 3.599185514247295, + "tokens_seen": 1158329344 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1274329, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.622804641723633, + "objective/train/theoretical_loss": 3.5991808002024763, + "objective/train/tokens_used": 1178805728, + "theoretical_loss": 3.5991808002024763, + "tokens_seen": 1158345728 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032780341023069213, + "loss": 2.5089, + "theoretical_loss": 3.5991666585800846, + "tokens_seen": 1158394880 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032779338014042125, + "loss": 2.9317, + "theoretical_loss": 3.599147804278273, + "tokens_seen": 1158460416 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003277833500501505, + "loss": 2.7415, + "theoretical_loss": 3.599128951341684, + "tokens_seen": 1158525952 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003277733199598796, + "loss": 2.6007, + "theoretical_loss": 3.5991100997701415, + "tokens_seen": 1158591488 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032776328986960885, + "loss": 2.5627, + "theoretical_loss": 3.599091249563469, + "tokens_seen": 1158657024 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032775325977933803, + "loss": 2.5528, + "theoretical_loss": 3.5990724007214916, + "tokens_seen": 1158722560 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003277432296890672, + "loss": 2.4406, + "theoretical_loss": 3.5990535532440324, + "tokens_seen": 1158788096 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003277331995987964, + "loss": 2.4564, + "theoretical_loss": 3.5990347071309152, + "tokens_seen": 1158853632 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003277231695085256, + "loss": 2.8616, + "theoretical_loss": 3.5990158623819646, + "tokens_seen": 1158919168 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032771313941825476, + "loss": 2.6851, + "theoretical_loss": 3.5989970189970046, + "tokens_seen": 1158984704 + }, + { + "epoch": 3.08, + "learning_rate": 0.000327703109327984, + "loss": 2.8071, + "theoretical_loss": 3.5989781769758595, + "tokens_seen": 1159050240 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003276930792377131, + "loss": 2.7617, + "theoretical_loss": 3.5989593363183534, + "tokens_seen": 1159115776 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032768304914744235, + "loss": 2.6882, + "theoretical_loss": 3.598940497024311, + "tokens_seen": 1159181312 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003276730190571715, + "loss": 2.7579, + "theoretical_loss": 3.5989216590935555, + "tokens_seen": 1159246848 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003276629889669007, + "loss": 2.548, + "theoretical_loss": 3.5989028225259116, + "tokens_seen": 1159312384 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003276529588766299, + "loss": 2.6048, + "theoretical_loss": 3.598883987321204, + "tokens_seen": 1159377920 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003276429287863591, + "loss": 2.7181, + "theoretical_loss": 3.598865153479257, + "tokens_seen": 1159443456 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032763289869608826, + "loss": 2.7681, + "theoretical_loss": 3.5988463209998947, + "tokens_seen": 1159508992 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003276228686058175, + "loss": 2.7062, + "theoretical_loss": 3.5988274898829413, + "tokens_seen": 1159574528 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003276128385155466, + "loss": 2.6341, + "theoretical_loss": 3.598808660128222, + "tokens_seen": 1159640064 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032760280842527586, + "loss": 2.8696, + "theoretical_loss": 3.5987898317355604, + "tokens_seen": 1159705600 + }, + { + "epoch": 3.08, + "learning_rate": 0.000327592778335005, + "loss": 2.7675, + "theoretical_loss": 3.5987710047047816, + "tokens_seen": 1159771136 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003275827482447342, + "loss": 2.5033, + "theoretical_loss": 3.59875217903571, + "tokens_seen": 1159836672 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003275727181544634, + "loss": 2.7395, + "theoretical_loss": 3.5987333547281706, + "tokens_seen": 1159902208 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003275626880641926, + "loss": 2.6689, + "theoretical_loss": 3.598714531781987, + "tokens_seen": 1159967744 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1274819, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.889496326446533, + "objective/train/theoretical_loss": 3.5987098262581325, + "objective/train/tokens_used": 1180444128, + "theoretical_loss": 3.5987098262581325, + "tokens_seen": 1159984128 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032755265797392176, + "loss": 2.6646, + "theoretical_loss": 3.598695710196985, + "tokens_seen": 1160033280 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032754262788365094, + "loss": 2.621, + "theoretical_loss": 3.598676889972988, + "tokens_seen": 1160098816 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003275325977933801, + "loss": 2.7977, + "theoretical_loss": 3.598658071109822, + "tokens_seen": 1160164352 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032752256770310936, + "loss": 2.6562, + "theoretical_loss": 3.5986392536073106, + "tokens_seen": 1160229888 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003275125376128385, + "loss": 2.7267, + "theoretical_loss": 3.5986204374652795, + "tokens_seen": 1160295424 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003275025075225677, + "loss": 2.7961, + "theoretical_loss": 3.598601622683553, + "tokens_seen": 1160360960 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032749247743229685, + "loss": 2.5709, + "theoretical_loss": 3.598582809261956, + "tokens_seen": 1160426496 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003274824473420261, + "loss": 2.7136, + "theoretical_loss": 3.598563997200314, + "tokens_seen": 1160492032 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032747241725175527, + "loss": 2.6793, + "theoretical_loss": 3.59854518649845, + "tokens_seen": 1160557568 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032746238716148445, + "loss": 2.7906, + "theoretical_loss": 3.5985263771561913, + "tokens_seen": 1160623104 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032745235707121363, + "loss": 2.5507, + "theoretical_loss": 3.5985075691733615, + "tokens_seen": 1160688640 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032744232698094286, + "loss": 2.6493, + "theoretical_loss": 3.598488762549786, + "tokens_seen": 1160754176 + }, + { + "epoch": 3.08, + "learning_rate": 0.000327432296890672, + "loss": 2.5678, + "theoretical_loss": 3.59846995728529, + "tokens_seen": 1160819712 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003274222668004012, + "loss": 2.8937, + "theoretical_loss": 3.598451153379698, + "tokens_seen": 1160885248 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032741223671013035, + "loss": 2.802, + "theoretical_loss": 3.5984323508328355, + "tokens_seen": 1160950784 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003274022066198596, + "loss": 2.7575, + "theoretical_loss": 3.598413549644528, + "tokens_seen": 1161016320 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032739217652958877, + "loss": 2.5724, + "theoretical_loss": 3.5983947498145996, + "tokens_seen": 1161081856 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032738214643931795, + "loss": 2.695, + "theoretical_loss": 3.5983759513428764, + "tokens_seen": 1161147392 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032737211634904713, + "loss": 2.7675, + "theoretical_loss": 3.5983571542291832, + "tokens_seen": 1161212928 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003273620862587763, + "loss": 2.6807, + "theoretical_loss": 3.5983383584733453, + "tokens_seen": 1161278464 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003273520561685055, + "loss": 2.8354, + "theoretical_loss": 3.5983195640751884, + "tokens_seen": 1161344000 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032734202607823473, + "loss": 2.6488, + "theoretical_loss": 3.598300771034537, + "tokens_seen": 1161409536 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032733199598796386, + "loss": 2.4031, + "theoretical_loss": 3.598281979351217, + "tokens_seen": 1161475072 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003273219658976931, + "loss": 2.4916, + "theoretical_loss": 3.5982631890250545, + "tokens_seen": 1161540608 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032731193580742233, + "loss": 2.7317, + "theoretical_loss": 3.5982444000558735, + "tokens_seen": 1161606144 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1275406, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6565632820129395, + "objective/train/theoretical_loss": 3.5982397030255866, + "objective/train/tokens_used": 1182082528, + "theoretical_loss": 3.5982397030255866, + "tokens_seen": 1161622528 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032730190571715145, + "loss": 2.3889, + "theoretical_loss": 3.5982256124435006, + "tokens_seen": 1161671680 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003272918756268807, + "loss": 2.7799, + "theoretical_loss": 3.59820682618776, + "tokens_seen": 1161737216 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003272818455366098, + "loss": 2.7662, + "theoretical_loss": 3.598188041288479, + "tokens_seen": 1161802752 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032727181544633905, + "loss": 2.337, + "theoretical_loss": 3.5981692577454814, + "tokens_seen": 1161868288 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032726178535606823, + "loss": 2.8643, + "theoretical_loss": 3.598150475558594, + "tokens_seen": 1161933824 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003272517552657974, + "loss": 2.6224, + "theoretical_loss": 3.5981316947276416, + "tokens_seen": 1161999360 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003272417251755266, + "loss": 2.4621, + "theoretical_loss": 3.5981129152524503, + "tokens_seen": 1162064896 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003272316950852558, + "loss": 2.322, + "theoretical_loss": 3.598094137132846, + "tokens_seen": 1162130432 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032722166499498496, + "loss": 2.8936, + "theoretical_loss": 3.5980753603686537, + "tokens_seen": 1162195968 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003272116349047142, + "loss": 2.8057, + "theoretical_loss": 3.5980565849597, + "tokens_seen": 1162261504 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003272016048144433, + "loss": 2.7092, + "theoretical_loss": 3.5980378109058098, + "tokens_seen": 1162327040 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032719157472417256, + "loss": 2.6417, + "theoretical_loss": 3.598019038206809, + "tokens_seen": 1162392576 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003271815446339017, + "loss": 2.4224, + "theoretical_loss": 3.598000266862525, + "tokens_seen": 1162458112 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003271715145436309, + "loss": 2.4214, + "theoretical_loss": 3.597981496872781, + "tokens_seen": 1162523648 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003271614844533601, + "loss": 2.5387, + "theoretical_loss": 3.5979627282374054, + "tokens_seen": 1162589184 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003271514543630893, + "loss": 2.6946, + "theoretical_loss": 3.5979439609562225, + "tokens_seen": 1162654720 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032714142427281846, + "loss": 2.6044, + "theoretical_loss": 3.5979251950290587, + "tokens_seen": 1162720256 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003271313941825477, + "loss": 2.516, + "theoretical_loss": 3.5979064304557404, + "tokens_seen": 1162785792 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003271213640922768, + "loss": 2.5136, + "theoretical_loss": 3.5978876672360935, + "tokens_seen": 1162851328 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032711133400200606, + "loss": 2.625, + "theoretical_loss": 3.597868905369944, + "tokens_seen": 1162916864 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003271013039117352, + "loss": 2.4939, + "theoretical_loss": 3.5978501448571176, + "tokens_seen": 1162982400 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003270912738214644, + "loss": 2.684, + "theoretical_loss": 3.5978313856974413, + "tokens_seen": 1163047936 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003270812437311936, + "loss": 2.8434, + "theoretical_loss": 3.59781262789074, + "tokens_seen": 1163113472 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003270712136409228, + "loss": 2.5827, + "theoretical_loss": 3.597793871436841, + "tokens_seen": 1163179008 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032706118355065196, + "loss": 2.7304, + "theoretical_loss": 3.59777511633557, + "tokens_seen": 1163244544 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1276640, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.588357448577881, + "objective/train/theoretical_loss": 3.5977704277715805, + "objective/train/tokens_used": 1183720928, + "theoretical_loss": 3.5977704277715805, + "tokens_seen": 1163260928 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032705115346038114, + "loss": 2.678, + "theoretical_loss": 3.597756362586754, + "tokens_seen": 1163310080 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003270411233701103, + "loss": 2.7792, + "theoretical_loss": 3.597737610190218, + "tokens_seen": 1163375616 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032703109327983956, + "loss": 2.67, + "theoretical_loss": 3.597718859145789, + "tokens_seen": 1163441152 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003270210631895687, + "loss": 2.6397, + "theoretical_loss": 3.5977001094532937, + "tokens_seen": 1163506688 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003270110330992979, + "loss": 2.4046, + "theoretical_loss": 3.597681361112558, + "tokens_seen": 1163572224 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032700100300902705, + "loss": 2.55, + "theoretical_loss": 3.5976626141234087, + "tokens_seen": 1163637760 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003269909729187563, + "loss": 2.4804, + "theoretical_loss": 3.5976438684856715, + "tokens_seen": 1163703296 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032698094282848547, + "loss": 2.6268, + "theoretical_loss": 3.597625124199174, + "tokens_seen": 1163768832 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032697091273821465, + "loss": 2.5692, + "theoretical_loss": 3.5976063812637418, + "tokens_seen": 1163834368 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032696088264794383, + "loss": 2.5237, + "theoretical_loss": 3.5975876396792015, + "tokens_seen": 1163899904 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032695085255767306, + "loss": 2.6462, + "theoretical_loss": 3.5975688994453803, + "tokens_seen": 1163965440 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003269408224674022, + "loss": 2.7789, + "theoretical_loss": 3.597550160562104, + "tokens_seen": 1164030976 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003269307923771314, + "loss": 2.5586, + "theoretical_loss": 3.5975314230292, + "tokens_seen": 1164096512 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032692076228686055, + "loss": 2.6079, + "theoretical_loss": 3.5975126868464953, + "tokens_seen": 1164162048 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003269107321965898, + "loss": 2.5372, + "theoretical_loss": 3.5974939520138154, + "tokens_seen": 1164227584 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032690070210631897, + "loss": 2.4518, + "theoretical_loss": 3.5974752185309873, + "tokens_seen": 1164293120 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032689067201604815, + "loss": 2.4755, + "theoretical_loss": 3.5974564863978387, + "tokens_seen": 1164358656 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032688064192577733, + "loss": 2.6264, + "theoretical_loss": 3.5974377556141954, + "tokens_seen": 1164424192 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003268706118355065, + "loss": 2.4069, + "theoretical_loss": 3.5974190261798844, + "tokens_seen": 1164489728 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003268605817452357, + "loss": 2.5928, + "theoretical_loss": 3.5974002980947333, + "tokens_seen": 1164555264 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032685055165496493, + "loss": 2.5782, + "theoretical_loss": 3.597381571358568, + "tokens_seen": 1164620800 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032684052156469406, + "loss": 2.4116, + "theoretical_loss": 3.597362845971216, + "tokens_seen": 1164686336 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003268304914744233, + "loss": 2.8265, + "theoretical_loss": 3.597344121932504, + "tokens_seen": 1164751872 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003268204613841524, + "loss": 2.7104, + "theoretical_loss": 3.59732539924226, + "tokens_seen": 1164817408 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032681043129388165, + "loss": 2.5426, + "theoretical_loss": 3.5973066779003093, + "tokens_seen": 1164882944 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1277429, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5294029712677, + "objective/train/theoretical_loss": 3.5973019977754723, + "objective/train/tokens_used": 1185359328, + "theoretical_loss": 3.5973019977754723, + "tokens_seen": 1164899328 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032680040120361084, + "loss": 2.485, + "theoretical_loss": 3.59728795790648, + "tokens_seen": 1164948480 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032679037111334, + "loss": 2.661, + "theoretical_loss": 3.5972692392605996, + "tokens_seen": 1165014016 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003267803410230692, + "loss": 2.813, + "theoretical_loss": 3.5972505219624944, + "tokens_seen": 1165079552 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032677031093279843, + "loss": 2.4742, + "theoretical_loss": 3.597231806011991, + "tokens_seen": 1165145088 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032676028084252756, + "loss": 2.6719, + "theoretical_loss": 3.5972130914089187, + "tokens_seen": 1165210624 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003267502507522568, + "loss": 2.5704, + "theoretical_loss": 3.5971943781531026, + "tokens_seen": 1165276160 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003267402206619859, + "loss": 2.6898, + "theoretical_loss": 3.597175666244371, + "tokens_seen": 1165341696 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032673019057171516, + "loss": 2.3992, + "theoretical_loss": 3.597156955682551, + "tokens_seen": 1165407232 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032672016048144434, + "loss": 2.4696, + "theoretical_loss": 3.5971382464674697, + "tokens_seen": 1165472768 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003267101303911735, + "loss": 2.7106, + "theoretical_loss": 3.5971195385989545, + "tokens_seen": 1165538304 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003267001003009027, + "loss": 2.4775, + "theoretical_loss": 3.597100832076833, + "tokens_seen": 1165603840 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003266900702106319, + "loss": 2.4111, + "theoretical_loss": 3.5970821269009328, + "tokens_seen": 1165669376 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032668004012036106, + "loss": 2.6574, + "theoretical_loss": 3.5970634230710803, + "tokens_seen": 1165734912 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003266700100300903, + "loss": 2.5731, + "theoretical_loss": 3.597044720587104, + "tokens_seen": 1165800448 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003266599799398194, + "loss": 2.7557, + "theoretical_loss": 3.5970260194488315, + "tokens_seen": 1165865984 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032664994984954866, + "loss": 2.6017, + "theoretical_loss": 3.5970073196560897, + "tokens_seen": 1165931520 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032663991975927784, + "loss": 2.6644, + "theoretical_loss": 3.5969886212087063, + "tokens_seen": 1165997056 + }, + { + "epoch": 3.08, + "learning_rate": 0.000326629889669007, + "loss": 2.6361, + "theoretical_loss": 3.596969924106509, + "tokens_seen": 1166062592 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003266198595787362, + "loss": 2.7494, + "theoretical_loss": 3.596951228349325, + "tokens_seen": 1166128128 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003266098294884654, + "loss": 2.7595, + "theoretical_loss": 3.5969325339369833, + "tokens_seen": 1166193664 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032659979939819457, + "loss": 2.6563, + "theoretical_loss": 3.59691384086931, + "tokens_seen": 1166259200 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003265897693079238, + "loss": 2.7333, + "theoretical_loss": 3.5968951491461336, + "tokens_seen": 1166324736 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032657973921765293, + "loss": 2.7429, + "theoretical_loss": 3.596876458767282, + "tokens_seen": 1166390272 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032656970912738216, + "loss": 2.4125, + "theoretical_loss": 3.596857769732583, + "tokens_seen": 1166455808 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032655967903711135, + "loss": 2.5437, + "theoretical_loss": 3.596839082041864, + "tokens_seen": 1166521344 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0159366130828857, + "objective/train/theoretical_loss": 3.5968344103291607, + "objective/train/tokens_used": 1186997728, + "theoretical_loss": 3.5968344103291607, + "tokens_seen": 1166537728 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003265496489468405, + "loss": 2.6358, + "theoretical_loss": 3.596820395694953, + "tokens_seen": 1166586880 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032653961885656976, + "loss": 2.3213, + "theoretical_loss": 3.5968017106916776, + "tokens_seen": 1166652416 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003265295887662989, + "loss": 2.6772, + "theoretical_loss": 3.5967830270318664, + "tokens_seen": 1166717952 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003265195586760281, + "loss": 2.7451, + "theoretical_loss": 3.596764344715347, + "tokens_seen": 1166783488 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032650952858575725, + "loss": 2.7173, + "theoretical_loss": 3.596745663741947, + "tokens_seen": 1166849024 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003264994984954865, + "loss": 2.751, + "theoretical_loss": 3.596726984111495, + "tokens_seen": 1166914560 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032648946840521567, + "loss": 2.6119, + "theoretical_loss": 3.5967083058238187, + "tokens_seen": 1166980096 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032647943831494485, + "loss": 2.7055, + "theoretical_loss": 3.5966896288787464, + "tokens_seen": 1167045632 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032646940822467403, + "loss": 2.4983, + "theoretical_loss": 3.596670953276106, + "tokens_seen": 1167111168 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032645937813440326, + "loss": 2.6833, + "theoretical_loss": 3.596652279015726, + "tokens_seen": 1167176704 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003264493480441324, + "loss": 2.5952, + "theoretical_loss": 3.596633606097434, + "tokens_seen": 1167242240 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003264393179538616, + "loss": 2.6169, + "theoretical_loss": 3.596614934521059, + "tokens_seen": 1167307776 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032642928786359075, + "loss": 2.6897, + "theoretical_loss": 3.596596264286428, + "tokens_seen": 1167373312 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032641925777332, + "loss": 2.4475, + "theoretical_loss": 3.596577595393371, + "tokens_seen": 1167438848 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032640922768304917, + "loss": 2.7334, + "theoretical_loss": 3.5965589278417145, + "tokens_seen": 1167504384 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032639919759277835, + "loss": 2.5306, + "theoretical_loss": 3.596540261631288, + "tokens_seen": 1167569920 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032638916750250753, + "loss": 2.5529, + "theoretical_loss": 3.596521596761919, + "tokens_seen": 1167635456 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003263791374122367, + "loss": 2.9282, + "theoretical_loss": 3.5965029332334373, + "tokens_seen": 1167700992 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003263691073219659, + "loss": 2.487, + "theoretical_loss": 3.5964842710456697, + "tokens_seen": 1167766528 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032635907723169513, + "loss": 2.6882, + "theoretical_loss": 3.5964656101984454, + "tokens_seen": 1167832064 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032634904714142426, + "loss": 2.8419, + "theoretical_loss": 3.5964469506915933, + "tokens_seen": 1167897600 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003263390170511535, + "loss": 2.5661, + "theoretical_loss": 3.5964282925249407, + "tokens_seen": 1167963136 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003263289869608826, + "loss": 2.821, + "theoretical_loss": 3.5964096356983175, + "tokens_seen": 1168028672 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032631895687061185, + "loss": 2.7562, + "theoretical_loss": 3.5963909802115515, + "tokens_seen": 1168094208 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032630892678034104, + "loss": 2.3968, + "theoretical_loss": 3.5963723260644715, + "tokens_seen": 1168159744 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3768837451934814, + "objective/train/theoretical_loss": 3.5963676627370074, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.5963676627370074, + "tokens_seen": 1168176128 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003262988966900702, + "loss": 2.6501, + "theoretical_loss": 3.596353673256906, + "tokens_seen": 1168225280 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003262888665997994, + "loss": 2.4647, + "theoretical_loss": 3.5963350217886845, + "tokens_seen": 1168290816 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032627883650952863, + "loss": 2.7632, + "theoretical_loss": 3.5963163716596345, + "tokens_seen": 1168356352 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032626880641925776, + "loss": 2.441, + "theoretical_loss": 3.5962977228695854, + "tokens_seen": 1168421888 + }, + { + "epoch": 3.08, + "learning_rate": 0.000326258776328987, + "loss": 2.7156, + "theoretical_loss": 3.596279075418366, + "tokens_seen": 1168487424 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003262487462387161, + "loss": 2.5267, + "theoretical_loss": 3.5962604293058043, + "tokens_seen": 1168552960 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032623871614844536, + "loss": 2.628, + "theoretical_loss": 3.5962417845317303, + "tokens_seen": 1168618496 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032622868605817454, + "loss": 2.5469, + "theoretical_loss": 3.596223141095972, + "tokens_seen": 1168684032 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003262186559679037, + "loss": 2.6036, + "theoretical_loss": 3.5962044989983593, + "tokens_seen": 1168749568 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003262086258776329, + "loss": 2.3583, + "theoretical_loss": 3.5961858582387194, + "tokens_seen": 1168815104 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003261985957873621, + "loss": 2.6966, + "theoretical_loss": 3.596167218816883, + "tokens_seen": 1168880640 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032618856569709126, + "loss": 2.6841, + "theoretical_loss": 3.5961485807326783, + "tokens_seen": 1168946176 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003261785356068205, + "loss": 2.6477, + "theoretical_loss": 3.5961299439859347, + "tokens_seen": 1169011712 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003261685055165496, + "loss": 2.7623, + "theoretical_loss": 3.596111308576481, + "tokens_seen": 1169077248 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032615847542627886, + "loss": 2.5495, + "theoretical_loss": 3.5960926745041455, + "tokens_seen": 1169142784 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032614844533600804, + "loss": 2.7324, + "theoretical_loss": 3.596074041768759, + "tokens_seen": 1169208320 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003261384152457372, + "loss": 2.5712, + "theoretical_loss": 3.596055410370149, + "tokens_seen": 1169273856 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003261283851554664, + "loss": 2.6998, + "theoretical_loss": 3.596036780308146, + "tokens_seen": 1169339392 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003261183550651956, + "loss": 2.6135, + "theoretical_loss": 3.5960181515825784, + "tokens_seen": 1169404928 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032610832497492477, + "loss": 2.577, + "theoretical_loss": 3.595999524193276, + "tokens_seen": 1169470464 + }, + { + "epoch": 3.08, + "learning_rate": 0.000326098294884654, + "loss": 2.8685, + "theoretical_loss": 3.595980898140067, + "tokens_seen": 1169536000 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032608826479438313, + "loss": 2.5749, + "theoretical_loss": 3.5959622734227823, + "tokens_seen": 1169601536 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032607823470411236, + "loss": 2.7083, + "theoretical_loss": 3.5959436500412494, + "tokens_seen": 1169667072 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003260682046138415, + "loss": 2.5683, + "theoretical_loss": 3.5959250279952997, + "tokens_seen": 1169732608 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003260581745235707, + "loss": 2.7942, + "theoretical_loss": 3.595906407284761, + "tokens_seen": 1169798144 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.560084581375122, + "objective/train/theoretical_loss": 3.595901752315764, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.595901752315764, + "tokens_seen": 1169814528 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003260481444332999, + "loss": 2.7187, + "theoretical_loss": 3.595887787909463, + "tokens_seen": 1169863680 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003260381143430291, + "loss": 2.909, + "theoretical_loss": 3.5958691698692355, + "tokens_seen": 1169929216 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032602808425275827, + "loss": 2.7557, + "theoretical_loss": 3.595850553163908, + "tokens_seen": 1169994752 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032601805416248745, + "loss": 2.5457, + "theoretical_loss": 3.59583193779331, + "tokens_seen": 1170060288 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032600802407221663, + "loss": 2.6619, + "theoretical_loss": 3.5958133237572714, + "tokens_seen": 1170125824 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032599799398194587, + "loss": 2.5647, + "theoretical_loss": 3.595794711055621, + "tokens_seen": 1170191360 + }, + { + "epoch": 3.08, + "learning_rate": 0.000325987963891675, + "loss": 2.7163, + "theoretical_loss": 3.595776099688189, + "tokens_seen": 1170256896 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032597793380140423, + "loss": 2.4849, + "theoretical_loss": 3.5957574896548046, + "tokens_seen": 1170322432 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003259679037111334, + "loss": 2.8802, + "theoretical_loss": 3.595738880955298, + "tokens_seen": 1170387968 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003259578736208626, + "loss": 2.296, + "theoretical_loss": 3.5957202735894986, + "tokens_seen": 1170453504 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003259478435305918, + "loss": 2.6945, + "theoretical_loss": 3.595701667557236, + "tokens_seen": 1170519040 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032593781344032095, + "loss": 2.6302, + "theoretical_loss": 3.5956830628583405, + "tokens_seen": 1170584576 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032592778335005014, + "loss": 2.4871, + "theoretical_loss": 3.5956644594926415, + "tokens_seen": 1170650112 + }, + { + "epoch": 3.08, + "learning_rate": 0.00032591775325977937, + "loss": 2.6558, + "theoretical_loss": 3.5956458574599686, + "tokens_seen": 1170715648 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003259077231695085, + "loss": 2.5413, + "theoretical_loss": 3.595627256760152, + "tokens_seen": 1170781184 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032589769307923773, + "loss": 2.4552, + "theoretical_loss": 3.5956086573930217, + "tokens_seen": 1170846720 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032588766298896686, + "loss": 2.4495, + "theoretical_loss": 3.5955900593584076, + "tokens_seen": 1170912256 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003258776328986961, + "loss": 2.7012, + "theoretical_loss": 3.5955714626561397, + "tokens_seen": 1170977792 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003258676028084253, + "loss": 2.4193, + "theoretical_loss": 3.5955528672860475, + "tokens_seen": 1171043328 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032585757271815446, + "loss": 2.7626, + "theoretical_loss": 3.5955342732479614, + "tokens_seen": 1171108864 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032584754262788364, + "loss": 2.6797, + "theoretical_loss": 3.5955156805417117, + "tokens_seen": 1171174400 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003258375125376128, + "loss": 2.3862, + "theoretical_loss": 3.595497089167128, + "tokens_seen": 1171239936 + }, + { + "epoch": 3.09, + "learning_rate": 0.000325827482447342, + "loss": 2.4888, + "theoretical_loss": 3.5954784991240407, + "tokens_seen": 1171305472 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032581745235707124, + "loss": 2.5438, + "theoretical_loss": 3.5954599104122797, + "tokens_seen": 1171371008 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003258074222668004, + "loss": 2.3983, + "theoretical_loss": 3.5954413230316757, + "tokens_seen": 1171436544 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6180059909820557, + "objective/train/theoretical_loss": 3.595436676394498, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.595436676394498, + "tokens_seen": 1171452928 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003257973921765296, + "loss": 2.658, + "theoretical_loss": 3.5954227369820586, + "tokens_seen": 1171502080 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032578736208625883, + "loss": 2.4644, + "theoretical_loss": 3.5954041522632583, + "tokens_seen": 1171567616 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032577733199598796, + "loss": 2.6907, + "theoretical_loss": 3.5953855688751055, + "tokens_seen": 1171633152 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003257673019057172, + "loss": 2.7324, + "theoretical_loss": 3.5953669868174307, + "tokens_seen": 1171698688 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003257572718154463, + "loss": 2.2496, + "theoretical_loss": 3.595348406090064, + "tokens_seen": 1171764224 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032574724172517556, + "loss": 2.7043, + "theoretical_loss": 3.595329826692835, + "tokens_seen": 1171829760 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032573721163490474, + "loss": 2.5151, + "theoretical_loss": 3.5953112486255754, + "tokens_seen": 1171895296 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003257271815446339, + "loss": 2.5465, + "theoretical_loss": 3.5952926718881146, + "tokens_seen": 1171960832 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003257171514543631, + "loss": 2.6868, + "theoretical_loss": 3.595274096480284, + "tokens_seen": 1172026368 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003257071213640923, + "loss": 2.7925, + "theoretical_loss": 3.5952555224019136, + "tokens_seen": 1172091904 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032569709127382146, + "loss": 2.7214, + "theoretical_loss": 3.595236949652833, + "tokens_seen": 1172157440 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003256870611835507, + "loss": 2.564, + "theoretical_loss": 3.5952183782328744, + "tokens_seen": 1172222976 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003256770310932798, + "loss": 2.4106, + "theoretical_loss": 3.5951998081418672, + "tokens_seen": 1172288512 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032566700100300906, + "loss": 2.505, + "theoretical_loss": 3.5951812393796434, + "tokens_seen": 1172354048 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032565697091273824, + "loss": 2.5024, + "theoretical_loss": 3.595162671946032, + "tokens_seen": 1172419584 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003256469408224674, + "loss": 2.4508, + "theoretical_loss": 3.5951441058408644, + "tokens_seen": 1172485120 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003256369107321966, + "loss": 2.7553, + "theoretical_loss": 3.5951255410639713, + "tokens_seen": 1172550656 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003256268806419258, + "loss": 2.5269, + "theoretical_loss": 3.595106977615184, + "tokens_seen": 1172616192 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032561685055165497, + "loss": 2.6031, + "theoretical_loss": 3.595088415494332, + "tokens_seen": 1172681728 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003256068204613842, + "loss": 2.5579, + "theoretical_loss": 3.5950698547012463, + "tokens_seen": 1172747264 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032559679037111333, + "loss": 2.374, + "theoretical_loss": 3.595051295235759, + "tokens_seen": 1172812800 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032558676028084256, + "loss": 2.6482, + "theoretical_loss": 3.5950327370977, + "tokens_seen": 1172878336 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003255767301905717, + "loss": 2.7096, + "theoretical_loss": 3.595014180286901, + "tokens_seen": 1172943872 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003255667001003009, + "loss": 2.4358, + "theoretical_loss": 3.5949956248031913, + "tokens_seen": 1173009408 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003255566700100301, + "loss": 2.5746, + "theoretical_loss": 3.5949770706464035, + "tokens_seen": 1173074944 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.576780080795288, + "objective/train/theoretical_loss": 3.5949724323145182, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.5949724323145182, + "tokens_seen": 1173091328 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003255466399197593, + "loss": 2.6818, + "theoretical_loss": 3.594958517816368, + "tokens_seen": 1173140480 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032553660982948847, + "loss": 2.6777, + "theoretical_loss": 3.5949399663129156, + "tokens_seen": 1173206016 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032552657973921765, + "loss": 2.7612, + "theoretical_loss": 3.5949214161358776, + "tokens_seen": 1173271552 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032551654964894683, + "loss": 2.5333, + "theoretical_loss": 3.594902867285085, + "tokens_seen": 1173337088 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032550651955867607, + "loss": 2.6658, + "theoretical_loss": 3.594884319760369, + "tokens_seen": 1173402624 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003254964894684052, + "loss": 2.8645, + "theoretical_loss": 3.59486577356156, + "tokens_seen": 1173468160 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032548645937813443, + "loss": 2.7192, + "theoretical_loss": 3.594847228688491, + "tokens_seen": 1173533696 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003254764292878636, + "loss": 2.7719, + "theoretical_loss": 3.594828685140991, + "tokens_seen": 1173599232 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003254663991975928, + "loss": 2.535, + "theoretical_loss": 3.5948101429188926, + "tokens_seen": 1173664768 + }, + { + "epoch": 3.09, + "learning_rate": 0.000325456369107322, + "loss": 2.5644, + "theoretical_loss": 3.5947916020220267, + "tokens_seen": 1173730304 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032544633901705115, + "loss": 2.6096, + "theoretical_loss": 3.594773062450225, + "tokens_seen": 1173795840 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032543630892678034, + "loss": 2.14, + "theoretical_loss": 3.594754524203318, + "tokens_seen": 1173861376 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032542627883650957, + "loss": 2.4617, + "theoretical_loss": 3.5947359872811377, + "tokens_seen": 1173926912 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003254162487462387, + "loss": 2.5123, + "theoretical_loss": 3.5947174516835156, + "tokens_seen": 1173992448 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032540621865596793, + "loss": 2.6792, + "theoretical_loss": 3.5946989174102826, + "tokens_seen": 1174057984 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032539618856569706, + "loss": 2.5653, + "theoretical_loss": 3.5946803844612703, + "tokens_seen": 1174123520 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003253861584754263, + "loss": 2.5136, + "theoretical_loss": 3.5946618528363103, + "tokens_seen": 1174189056 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003253761283851555, + "loss": 2.6376, + "theoretical_loss": 3.594643322535234, + "tokens_seen": 1174254592 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032536609829488466, + "loss": 2.4409, + "theoretical_loss": 3.5946247935578732, + "tokens_seen": 1174320128 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032535606820461384, + "loss": 2.6197, + "theoretical_loss": 3.5946062659040594, + "tokens_seen": 1174385664 + }, + { + "epoch": 3.09, + "learning_rate": 0.000325346038114343, + "loss": 2.6027, + "theoretical_loss": 3.5945877395736243, + "tokens_seen": 1174451200 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003253360080240722, + "loss": 2.634, + "theoretical_loss": 3.5945692145663988, + "tokens_seen": 1174516736 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032532597793380144, + "loss": 2.8002, + "theoretical_loss": 3.5945506908822153, + "tokens_seen": 1174582272 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032531594784353056, + "loss": 2.607, + "theoretical_loss": 3.5945321685209053, + "tokens_seen": 1174647808 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003253059177532598, + "loss": 2.6358, + "theoretical_loss": 3.5945136474823007, + "tokens_seen": 1174713344 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.806351900100708, + "objective/train/theoretical_loss": 3.5945090174293024, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.5945090174293024, + "tokens_seen": 1174729728 + }, + { + "epoch": 3.09, + "learning_rate": 0.000325295887662989, + "loss": 2.546, + "theoretical_loss": 3.594495127766233, + "tokens_seen": 1174778880 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032528585757271816, + "loss": 2.735, + "theoretical_loss": 3.594476609372534, + "tokens_seen": 1174844416 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032527582748244734, + "loss": 2.649, + "theoretical_loss": 3.5944580923010356, + "tokens_seen": 1174909952 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003252657973921765, + "loss": 2.1022, + "theoretical_loss": 3.5944395765515695, + "tokens_seen": 1174975488 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003252557673019057, + "loss": 2.6525, + "theoretical_loss": 3.594421062123968, + "tokens_seen": 1175041024 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032524573721163494, + "loss": 2.6477, + "theoretical_loss": 3.5944025490180627, + "tokens_seen": 1175106560 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032523570712136407, + "loss": 2.6208, + "theoretical_loss": 3.5943840372336853, + "tokens_seen": 1175172096 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003252256770310933, + "loss": 2.603, + "theoretical_loss": 3.5943655267706687, + "tokens_seen": 1175237632 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032521564694082243, + "loss": 2.6506, + "theoretical_loss": 3.594347017628844, + "tokens_seen": 1175303168 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032520561685055166, + "loss": 2.7226, + "theoretical_loss": 3.5943285098080433, + "tokens_seen": 1175368704 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032519558676028084, + "loss": 2.6465, + "theoretical_loss": 3.594310003308099, + "tokens_seen": 1175434240 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032518555667001, + "loss": 2.893, + "theoretical_loss": 3.594291498128843, + "tokens_seen": 1175499776 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003251755265797392, + "loss": 2.506, + "theoretical_loss": 3.5942729942701077, + "tokens_seen": 1175565312 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032516549648946844, + "loss": 2.6732, + "theoretical_loss": 3.594254491731725, + "tokens_seen": 1175630848 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032515546639919757, + "loss": 2.6036, + "theoretical_loss": 3.5942359905135275, + "tokens_seen": 1175696384 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003251454363089268, + "loss": 2.6352, + "theoretical_loss": 3.5942174906153466, + "tokens_seen": 1175761920 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032513540621865593, + "loss": 2.4589, + "theoretical_loss": 3.5941989920370148, + "tokens_seen": 1175827456 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032512537612838517, + "loss": 2.6806, + "theoretical_loss": 3.594180494778365, + "tokens_seen": 1175892992 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032511534603811435, + "loss": 2.6492, + "theoretical_loss": 3.5941619988392293, + "tokens_seen": 1175958528 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032510531594784353, + "loss": 2.6898, + "theoretical_loss": 3.5941435042194394, + "tokens_seen": 1176024064 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003250952858575727, + "loss": 2.6345, + "theoretical_loss": 3.5941250109188285, + "tokens_seen": 1176089600 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003250852557673019, + "loss": 2.4554, + "theoretical_loss": 3.5941065189372288, + "tokens_seen": 1176155136 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032507522567703107, + "loss": 2.2642, + "theoretical_loss": 3.594088028274472, + "tokens_seen": 1176220672 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003250651955867603, + "loss": 2.7183, + "theoretical_loss": 3.5940695389303916, + "tokens_seen": 1176286208 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003250551654964895, + "loss": 2.3044, + "theoretical_loss": 3.5940510509048194, + "tokens_seen": 1176351744 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1171867847442627, + "objective/train/theoretical_loss": 3.5940464291044236, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.5940464291044236, + "tokens_seen": 1176368128 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032504513540621867, + "loss": 2.7721, + "theoretical_loss": 3.5940325641975885, + "tokens_seen": 1176417280 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032503510531594785, + "loss": 2.6341, + "theoretical_loss": 3.594014078808531, + "tokens_seen": 1176482816 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032502507522567703, + "loss": 2.5146, + "theoretical_loss": 3.5939955947374793, + "tokens_seen": 1176548352 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032501504513540627, + "loss": 2.6868, + "theoretical_loss": 3.5939771119842665, + "tokens_seen": 1176613888 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003250050150451354, + "loss": 2.3658, + "theoretical_loss": 3.593958630548725, + "tokens_seen": 1176679424 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032499498495486463, + "loss": 2.6863, + "theoretical_loss": 3.593940150430688, + "tokens_seen": 1176744960 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003249849548645938, + "loss": 2.6961, + "theoretical_loss": 3.5939216716299875, + "tokens_seen": 1176810496 + }, + { + "epoch": 3.09, + "learning_rate": 0.000324974924774323, + "loss": 2.5861, + "theoretical_loss": 3.5939031941464563, + "tokens_seen": 1176876032 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003249648946840522, + "loss": 2.667, + "theoretical_loss": 3.5938847179799276, + "tokens_seen": 1176941568 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032495486459378135, + "loss": 2.5671, + "theoretical_loss": 3.593866243130234, + "tokens_seen": 1177007104 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032494483450351054, + "loss": 2.2381, + "theoretical_loss": 3.5938477695972084, + "tokens_seen": 1177072640 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032493480441323977, + "loss": 2.5901, + "theoretical_loss": 3.593829297380683, + "tokens_seen": 1177138176 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003249247743229689, + "loss": 2.635, + "theoretical_loss": 3.593810826480492, + "tokens_seen": 1177203712 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032491474423269813, + "loss": 2.5594, + "theoretical_loss": 3.5937923568964676, + "tokens_seen": 1177269248 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032490471414242726, + "loss": 2.6957, + "theoretical_loss": 3.5937738886284425, + "tokens_seen": 1177334784 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003248946840521565, + "loss": 2.7526, + "theoretical_loss": 3.5937554216762497, + "tokens_seen": 1177400320 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003248846539618857, + "loss": 2.8548, + "theoretical_loss": 3.5937369560397228, + "tokens_seen": 1177465856 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032487462387161486, + "loss": 2.6545, + "theoretical_loss": 3.593718491718694, + "tokens_seen": 1177531392 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032486459378134404, + "loss": 2.6746, + "theoretical_loss": 3.5937000287129974, + "tokens_seen": 1177596928 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003248545636910732, + "loss": 2.6491, + "theoretical_loss": 3.5936815670224656, + "tokens_seen": 1177662464 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003248445336008024, + "loss": 2.5586, + "theoretical_loss": 3.5936631066469316, + "tokens_seen": 1177728000 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032483450351053164, + "loss": 2.8073, + "theoretical_loss": 3.5936446475862285, + "tokens_seen": 1177793536 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032482447342026076, + "loss": 2.401, + "theoretical_loss": 3.59362618984019, + "tokens_seen": 1177859072 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032481444332999, + "loss": 2.5078, + "theoretical_loss": 3.593607733408649, + "tokens_seen": 1177924608 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003248044132397192, + "loss": 2.5333, + "theoretical_loss": 3.5935892782914385, + "tokens_seen": 1177990144 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5997087955474854, + "objective/train/theoretical_loss": 3.5935846647174805, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.5935846647174805, + "tokens_seen": 1178006528 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032479438314944836, + "loss": 2.6478, + "theoretical_loss": 3.593570824488392, + "tokens_seen": 1178055680 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032478435305917754, + "loss": 2.6641, + "theoretical_loss": 3.593552371999343, + "tokens_seen": 1178121216 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003247743229689067, + "loss": 2.6132, + "theoretical_loss": 3.5935339208241253, + "tokens_seen": 1178186752 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003247642928786359, + "loss": 2.5248, + "theoretical_loss": 3.593515470962571, + "tokens_seen": 1178252288 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032475426278836514, + "loss": 2.4717, + "theoretical_loss": 3.5934970224145144, + "tokens_seen": 1178317824 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032474423269809427, + "loss": 2.4565, + "theoretical_loss": 3.5934785751797893, + "tokens_seen": 1178383360 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003247342026078235, + "loss": 2.5356, + "theoretical_loss": 3.593460129258228, + "tokens_seen": 1178448896 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032472417251755263, + "loss": 2.4481, + "theoretical_loss": 3.5934416846496644, + "tokens_seen": 1178514432 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032471414242728186, + "loss": 2.679, + "theoretical_loss": 3.593423241353933, + "tokens_seen": 1178579968 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032470411233701104, + "loss": 2.4818, + "theoretical_loss": 3.593404799370867, + "tokens_seen": 1178645504 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003246940822467402, + "loss": 2.6474, + "theoretical_loss": 3.5933863587002985, + "tokens_seen": 1178711040 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003246840521564694, + "loss": 2.5939, + "theoretical_loss": 3.5933679193420627, + "tokens_seen": 1178776576 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032467402206619864, + "loss": 2.6959, + "theoretical_loss": 3.593349481295993, + "tokens_seen": 1178842112 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032466399197592777, + "loss": 2.6948, + "theoretical_loss": 3.5933310445619226, + "tokens_seen": 1178907648 + }, + { + "epoch": 3.09, + "learning_rate": 0.000324653961885657, + "loss": 2.3513, + "theoretical_loss": 3.593312609139686, + "tokens_seen": 1178973184 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032464393179538613, + "loss": 2.438, + "theoretical_loss": 3.593294175029116, + "tokens_seen": 1179038720 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032463390170511537, + "loss": 2.4049, + "theoretical_loss": 3.593275742230047, + "tokens_seen": 1179104256 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032462387161484455, + "loss": 2.7499, + "theoretical_loss": 3.5932573107423127, + "tokens_seen": 1179169792 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032461384152457373, + "loss": 2.6122, + "theoretical_loss": 3.593238880565747, + "tokens_seen": 1179235328 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003246038114343029, + "loss": 2.5427, + "theoretical_loss": 3.5932204517001836, + "tokens_seen": 1179300864 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003245937813440321, + "loss": 2.5502, + "theoretical_loss": 3.593202024145456, + "tokens_seen": 1179366400 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032458375125376127, + "loss": 2.5535, + "theoretical_loss": 3.593183597901399, + "tokens_seen": 1179431936 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003245737211634905, + "loss": 2.6025, + "theoretical_loss": 3.593165172967846, + "tokens_seen": 1179497472 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032456369107321963, + "loss": 2.736, + "theoretical_loss": 3.5931467493446307, + "tokens_seen": 1179563008 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032455366098294887, + "loss": 2.3614, + "theoretical_loss": 3.593128327031588, + "tokens_seen": 1179628544 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.648859977722168, + "objective/train/theoretical_loss": 3.5931237216580225, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.5931237216580225, + "tokens_seen": 1179644928 + }, + { + "epoch": 3.09, + "learning_rate": 0.000324543630892678, + "loss": 2.65, + "theoretical_loss": 3.593109906028552, + "tokens_seen": 1179694080 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032453360080240723, + "loss": 2.6016, + "theoretical_loss": 3.5930914863353554, + "tokens_seen": 1179759616 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003245235707121364, + "loss": 2.6044, + "theoretical_loss": 3.5930730679518335, + "tokens_seen": 1179825152 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003245135406218656, + "loss": 2.4911, + "theoretical_loss": 3.5930546508778196, + "tokens_seen": 1179890688 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003245035105315948, + "loss": 2.5624, + "theoretical_loss": 3.593036235113149, + "tokens_seen": 1179956224 + }, + { + "epoch": 3.09, + "learning_rate": 0.000324493480441324, + "loss": 2.6682, + "theoretical_loss": 3.5930178206576553, + "tokens_seen": 1180021760 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032448345035105314, + "loss": 2.8837, + "theoretical_loss": 3.592999407511172, + "tokens_seen": 1180087296 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003244734202607824, + "loss": 2.643, + "theoretical_loss": 3.592980995673535, + "tokens_seen": 1180152832 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003244633901705115, + "loss": 2.408, + "theoretical_loss": 3.5929625851445772, + "tokens_seen": 1180218368 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032445336008024074, + "loss": 2.7895, + "theoretical_loss": 3.592944175924133, + "tokens_seen": 1180283904 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003244433299899699, + "loss": 2.629, + "theoretical_loss": 3.592925768012038, + "tokens_seen": 1180349440 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003244332998996991, + "loss": 2.3755, + "theoretical_loss": 3.592907361408125, + "tokens_seen": 1180414976 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003244232698094283, + "loss": 2.7623, + "theoretical_loss": 3.5928889561122292, + "tokens_seen": 1180480512 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032441323971915746, + "loss": 2.7398, + "theoretical_loss": 3.592870552124185, + "tokens_seen": 1180546048 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032440320962888664, + "loss": 2.6753, + "theoretical_loss": 3.592852149443827, + "tokens_seen": 1180611584 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003243931795386159, + "loss": 2.5979, + "theoretical_loss": 3.5928337480709898, + "tokens_seen": 1180677120 + }, + { + "epoch": 3.09, + "learning_rate": 0.000324383149448345, + "loss": 2.5014, + "theoretical_loss": 3.592815348005507, + "tokens_seen": 1180742656 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032437311935807424, + "loss": 2.2792, + "theoretical_loss": 3.592796949247214, + "tokens_seen": 1180808192 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032436308926780337, + "loss": 2.755, + "theoretical_loss": 3.592778551795945, + "tokens_seen": 1180873728 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003243530591775326, + "loss": 2.6092, + "theoretical_loss": 3.5927601556515354, + "tokens_seen": 1180939264 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003243430290872618, + "loss": 2.592, + "theoretical_loss": 3.5927417608138192, + "tokens_seen": 1181004800 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032433299899699096, + "loss": 2.7324, + "theoretical_loss": 3.5927233672826304, + "tokens_seen": 1181070336 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003243229689067202, + "loss": 2.6515, + "theoretical_loss": 3.5927049750578055, + "tokens_seen": 1181135872 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003243129388164494, + "loss": 2.7052, + "theoretical_loss": 3.5926865841391775, + "tokens_seen": 1181201408 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032430290872617856, + "loss": 2.8725, + "theoretical_loss": 3.592668194526582, + "tokens_seen": 1181266944 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2769789695739746, + "objective/train/theoretical_loss": 3.5926635973274816, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.5926635973274816, + "tokens_seen": 1181283328 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032429287863590774, + "loss": 2.4443, + "theoretical_loss": 3.592649806219854, + "tokens_seen": 1181332480 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003242828485456369, + "loss": 2.4296, + "theoretical_loss": 3.5926314192188276, + "tokens_seen": 1181398016 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003242728184553661, + "loss": 2.5088, + "theoretical_loss": 3.592613033523339, + "tokens_seen": 1181463552 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032426278836509534, + "loss": 2.3114, + "theoretical_loss": 3.592594649133221, + "tokens_seen": 1181529088 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032425275827482447, + "loss": 2.7753, + "theoretical_loss": 3.59257626604831, + "tokens_seen": 1181594624 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003242427281845537, + "loss": 2.4749, + "theoretical_loss": 3.592557884268441, + "tokens_seen": 1181660160 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032423269809428283, + "loss": 2.7379, + "theoretical_loss": 3.592539503793448, + "tokens_seen": 1181725696 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032422266800401206, + "loss": 2.6699, + "theoretical_loss": 3.5925211246231674, + "tokens_seen": 1181791232 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032421263791374125, + "loss": 2.4808, + "theoretical_loss": 3.5925027467574333, + "tokens_seen": 1181856768 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003242026078234704, + "loss": 2.5653, + "theoretical_loss": 3.5924843701960807, + "tokens_seen": 1181922304 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003241925777331996, + "loss": 2.6213, + "theoretical_loss": 3.5924659949389453, + "tokens_seen": 1181987840 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032418254764292884, + "loss": 2.4843, + "theoretical_loss": 3.592447620985862, + "tokens_seen": 1182053376 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032417251755265797, + "loss": 2.5115, + "theoretical_loss": 3.592429248336665, + "tokens_seen": 1182118912 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003241624874623872, + "loss": 2.8637, + "theoretical_loss": 3.592410876991191, + "tokens_seen": 1182184448 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032415245737211633, + "loss": 2.6917, + "theoretical_loss": 3.5923925069492744, + "tokens_seen": 1182249984 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032414242728184557, + "loss": 2.5962, + "theoretical_loss": 3.592374138210751, + "tokens_seen": 1182315520 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032413239719157475, + "loss": 2.2916, + "theoretical_loss": 3.5923557707754554, + "tokens_seen": 1182381056 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032412236710130393, + "loss": 2.2378, + "theoretical_loss": 3.5923374046432226, + "tokens_seen": 1182446592 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003241123370110331, + "loss": 2.6255, + "theoretical_loss": 3.5923190398138893, + "tokens_seen": 1182512128 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003241023069207623, + "loss": 2.641, + "theoretical_loss": 3.59230067628729, + "tokens_seen": 1182577664 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032409227683049147, + "loss": 2.6365, + "theoretical_loss": 3.59228231406326, + "tokens_seen": 1182643200 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003240822467402207, + "loss": 2.683, + "theoretical_loss": 3.5922639531416354, + "tokens_seen": 1182708736 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032407221664994984, + "loss": 2.7049, + "theoretical_loss": 3.592245593522251, + "tokens_seen": 1182774272 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032406218655967907, + "loss": 2.6362, + "theoretical_loss": 3.592227235204942, + "tokens_seen": 1182839808 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003240521564694082, + "loss": 2.5488, + "theoretical_loss": 3.5922088781895454, + "tokens_seen": 1182905344 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5676214694976807, + "objective/train/theoretical_loss": 3.5922042891391004, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.5922042891391004, + "tokens_seen": 1182921728 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032404212637913743, + "loss": 2.6056, + "theoretical_loss": 3.592190522475895, + "tokens_seen": 1182970880 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003240320962888666, + "loss": 2.6205, + "theoretical_loss": 3.5921721680638274, + "tokens_seen": 1183036416 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003240220661985958, + "loss": 2.6659, + "theoretical_loss": 3.5921538149531784, + "tokens_seen": 1183101952 + }, + { + "epoch": 3.09, + "learning_rate": 0.000324012036108325, + "loss": 2.4522, + "theoretical_loss": 3.5921354631437827, + "tokens_seen": 1183167488 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003240020060180542, + "loss": 2.6169, + "theoretical_loss": 3.5921171126354765, + "tokens_seen": 1183233024 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032399197592778334, + "loss": 2.527, + "theoretical_loss": 3.592098763428096, + "tokens_seen": 1183298560 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003239819458375126, + "loss": 2.5272, + "theoretical_loss": 3.5920804155214756, + "tokens_seen": 1183364096 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003239719157472417, + "loss": 2.5678, + "theoretical_loss": 3.5920620689154523, + "tokens_seen": 1183429632 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032396188565697094, + "loss": 2.5987, + "theoretical_loss": 3.5920437236098612, + "tokens_seen": 1183495168 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003239518555667001, + "loss": 2.8055, + "theoretical_loss": 3.592025379604539, + "tokens_seen": 1183560704 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003239418254764293, + "loss": 2.6494, + "theoretical_loss": 3.5920070368993207, + "tokens_seen": 1183626240 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003239317953861585, + "loss": 2.6893, + "theoretical_loss": 3.5919886954940425, + "tokens_seen": 1183691776 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032392176529588766, + "loss": 2.1192, + "theoretical_loss": 3.59197035538854, + "tokens_seen": 1183757312 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032391173520561684, + "loss": 2.5386, + "theoretical_loss": 3.591952016582649, + "tokens_seen": 1183822848 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003239017051153461, + "loss": 2.7836, + "theoretical_loss": 3.591933679076207, + "tokens_seen": 1183888384 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003238916750250752, + "loss": 2.5445, + "theoretical_loss": 3.5919153428690476, + "tokens_seen": 1183953920 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032388164493480444, + "loss": 2.7289, + "theoretical_loss": 3.591897007961008, + "tokens_seen": 1184019456 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032387161484453357, + "loss": 2.8839, + "theoretical_loss": 3.5918786743519258, + "tokens_seen": 1184084992 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003238615847542628, + "loss": 2.4107, + "theoretical_loss": 3.5918603420416346, + "tokens_seen": 1184150528 + }, + { + "epoch": 3.09, + "learning_rate": 0.000323851554663992, + "loss": 2.5578, + "theoretical_loss": 3.5918420110299714, + "tokens_seen": 1184216064 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032384152457372116, + "loss": 2.6188, + "theoretical_loss": 3.5918236813167725, + "tokens_seen": 1184281600 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032383149448345034, + "loss": 2.7958, + "theoretical_loss": 3.591805352901874, + "tokens_seen": 1184347136 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003238214643931796, + "loss": 2.8208, + "theoretical_loss": 3.5917870257851128, + "tokens_seen": 1184412672 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003238114343029087, + "loss": 2.4828, + "theoretical_loss": 3.591768699966324, + "tokens_seen": 1184478208 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032380140421263794, + "loss": 2.5075, + "theoretical_loss": 3.5917503754453444, + "tokens_seen": 1184543744 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8725781440734863, + "objective/train/theoretical_loss": 3.5917457945178626, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.5917457945178626, + "tokens_seen": 1184560128 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032379137412236707, + "loss": 2.8068, + "theoretical_loss": 3.59173205222201, + "tokens_seen": 1184609280 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003237813440320963, + "loss": 2.456, + "theoretical_loss": 3.591713730296158, + "tokens_seen": 1184674816 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003237713139418255, + "loss": 2.7284, + "theoretical_loss": 3.591695409667624, + "tokens_seen": 1184740352 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032376128385155467, + "loss": 2.3995, + "theoretical_loss": 3.591677090336244, + "tokens_seen": 1184805888 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032375125376128385, + "loss": 2.7092, + "theoretical_loss": 3.591658772301855, + "tokens_seen": 1184871424 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032374122367101303, + "loss": 2.6974, + "theoretical_loss": 3.591640455564294, + "tokens_seen": 1184936960 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003237311935807422, + "loss": 2.286, + "theoretical_loss": 3.5916221401233965, + "tokens_seen": 1185002496 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032372116349047145, + "loss": 2.3438, + "theoretical_loss": 3.591603825978999, + "tokens_seen": 1185068032 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032371113340020057, + "loss": 2.4572, + "theoretical_loss": 3.591585513130939, + "tokens_seen": 1185133568 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003237011033099298, + "loss": 2.7635, + "theoretical_loss": 3.5915672015790525, + "tokens_seen": 1185199104 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032369107321965893, + "loss": 2.781, + "theoretical_loss": 3.5915488913231757, + "tokens_seen": 1185264640 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032368104312938817, + "loss": 2.649, + "theoretical_loss": 3.5915305823631454, + "tokens_seen": 1185330176 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032367101303911735, + "loss": 2.5419, + "theoretical_loss": 3.5915122746987986, + "tokens_seen": 1185395712 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032366098294884653, + "loss": 2.563, + "theoretical_loss": 3.5914939683299716, + "tokens_seen": 1185461248 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003236509528585757, + "loss": 2.7375, + "theoretical_loss": 3.591475663256502, + "tokens_seen": 1185526784 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032364092276830495, + "loss": 2.3776, + "theoretical_loss": 3.591457359478225, + "tokens_seen": 1185592320 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003236308926780341, + "loss": 2.5519, + "theoretical_loss": 3.591439056994979, + "tokens_seen": 1185657856 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003236208625877633, + "loss": 2.798, + "theoretical_loss": 3.5914207558066, + "tokens_seen": 1185723392 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032361083249749244, + "loss": 2.7447, + "theoretical_loss": 3.591402455912924, + "tokens_seen": 1185788928 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003236008024072217, + "loss": 2.5904, + "theoretical_loss": 3.5913841573137892, + "tokens_seen": 1185854464 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032359077231695085, + "loss": 2.6773, + "theoretical_loss": 3.5913658600090326, + "tokens_seen": 1185920000 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032358074222668004, + "loss": 2.6649, + "theoretical_loss": 3.5913475639984895, + "tokens_seen": 1185985536 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032357071213640927, + "loss": 2.5266, + "theoretical_loss": 3.5913292692819985, + "tokens_seen": 1186051072 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003235606820461384, + "loss": 2.7652, + "theoretical_loss": 3.591310975859396, + "tokens_seen": 1186116608 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032355065195586763, + "loss": 2.641, + "theoretical_loss": 3.5912926837305186, + "tokens_seen": 1186182144 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 1282424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.350708246231079, + "objective/train/theoretical_loss": 3.5912881109004244, + "objective/train/tokens_used": 1187029472, + "theoretical_loss": 3.5912881109004244, + "tokens_seen": 1186198528 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003235406218655968, + "loss": 2.4416, + "theoretical_loss": 3.591274392895204, + "tokens_seen": 1186247680 + }, + { + "epoch": 3.09, + "learning_rate": 0.000323530591775326, + "loss": 2.4279, + "theoretical_loss": 3.5912561033532886, + "tokens_seen": 1186313216 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003235205616850552, + "loss": 2.6713, + "theoretical_loss": 3.59123781510461, + "tokens_seen": 1186378752 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003235105315947844, + "loss": 2.8526, + "theoretical_loss": 3.591219528149005, + "tokens_seen": 1186444288 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032350050150451354, + "loss": 2.4494, + "theoretical_loss": 3.5912012424863113, + "tokens_seen": 1186509824 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003234904714142428, + "loss": 2.661, + "theoretical_loss": 3.5911829581163657, + "tokens_seen": 1186575360 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003234804413239719, + "loss": 2.5418, + "theoretical_loss": 3.591164675039005, + "tokens_seen": 1186640896 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032347041123370114, + "loss": 2.596, + "theoretical_loss": 3.5911463932540677, + "tokens_seen": 1186706432 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003234603811434303, + "loss": 2.421, + "theoretical_loss": 3.5911281127613894, + "tokens_seen": 1186771968 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003234503510531595, + "loss": 2.4505, + "theoretical_loss": 3.591109833560809, + "tokens_seen": 1186837504 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003234403209628887, + "loss": 2.5962, + "theoretical_loss": 3.591091555652162, + "tokens_seen": 1186903040 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032343029087261786, + "loss": 2.823, + "theoretical_loss": 3.591073279035288, + "tokens_seen": 1186968576 + }, + { + "epoch": 3.09, + "learning_rate": 0.00032342026078234704, + "loss": 2.4926, + "theoretical_loss": 3.591055003710023, + "tokens_seen": 1187034112 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003234102306920763, + "loss": 3.3894, + "theoretical_loss": 3.5910324468863224, + "tokens_seen": 1187115008 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003234002006018054, + "loss": 2.6997, + "theoretical_loss": 3.5910141744464092, + "tokens_seen": 1187180544 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032339017051153464, + "loss": 2.7738, + "theoretical_loss": 3.5909959032975793, + "tokens_seen": 1187246080 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032338014042126377, + "loss": 2.723, + "theoretical_loss": 3.590977633439671, + "tokens_seen": 1187311616 + }, + { + "epoch": 4.0, + "learning_rate": 0.000323370110330993, + "loss": 2.6097, + "theoretical_loss": 3.5909593648725204, + "tokens_seen": 1187377152 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003233600802407222, + "loss": 2.744, + "theoretical_loss": 3.5909410975959664, + "tokens_seen": 1187442688 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032335005015045136, + "loss": 2.8156, + "theoretical_loss": 3.590922831609846, + "tokens_seen": 1187508224 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032334002006018054, + "loss": 2.8714, + "theoretical_loss": 3.5909045669139967, + "tokens_seen": 1187573760 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003233299899699098, + "loss": 2.5544, + "theoretical_loss": 3.590886303508257, + "tokens_seen": 1187639296 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003233199598796389, + "loss": 2.7742, + "theoretical_loss": 3.5908680413924636, + "tokens_seen": 1187704832 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032330992978936814, + "loss": 2.8224, + "theoretical_loss": 3.5908497805664545, + "tokens_seen": 1187770368 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 1347233, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8168835639953613, + "objective/train/theoretical_loss": 3.5908315210300685, + "objective/train/tokens_used": 1208295904, + "theoretical_loss": 3.5908315210300685, + "tokens_seen": 1187835904 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032329989969909727, + "loss": 2.6189, + "theoretical_loss": 3.5908315210300685, + "tokens_seen": 1187835904 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003232898696088265, + "loss": 2.699, + "theoretical_loss": 3.5908132627831417, + "tokens_seen": 1187901440 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003232798395185557, + "loss": 2.7227, + "theoretical_loss": 3.5907950058255125, + "tokens_seen": 1187966976 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032326980942828487, + "loss": 2.7502, + "theoretical_loss": 3.590776750157019, + "tokens_seen": 1188032512 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032325977933801405, + "loss": 2.6845, + "theoretical_loss": 3.5907584957774996, + "tokens_seen": 1188098048 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032324974924774323, + "loss": 2.8323, + "theoretical_loss": 3.590740242686791, + "tokens_seen": 1188163584 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003232397191574724, + "loss": 2.6911, + "theoretical_loss": 3.590721990884732, + "tokens_seen": 1188229120 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032322968906720165, + "loss": 2.8486, + "theoretical_loss": 3.5907037403711604, + "tokens_seen": 1188294656 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032321965897693077, + "loss": 2.7744, + "theoretical_loss": 3.590685491145914, + "tokens_seen": 1188360192 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032320962888666, + "loss": 2.7593, + "theoretical_loss": 3.5906672432088302, + "tokens_seen": 1188425728 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032319959879638913, + "loss": 2.7644, + "theoretical_loss": 3.5906489965597483, + "tokens_seen": 1188491264 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032318956870611837, + "loss": 2.7176, + "theoretical_loss": 3.5906307511985056, + "tokens_seen": 1188556800 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032317953861584755, + "loss": 2.8788, + "theoretical_loss": 3.5906125071249404, + "tokens_seen": 1188622336 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032316950852557673, + "loss": 2.7639, + "theoretical_loss": 3.5905942643388906, + "tokens_seen": 1188687872 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003231594784353059, + "loss": 2.5787, + "theoretical_loss": 3.590576022840195, + "tokens_seen": 1188753408 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032314944834503515, + "loss": 2.7243, + "theoretical_loss": 3.590557782628691, + "tokens_seen": 1188818944 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003231394182547643, + "loss": 2.7841, + "theoretical_loss": 3.5905395437042174, + "tokens_seen": 1188884480 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003231293881644935, + "loss": 2.5921, + "theoretical_loss": 3.5905213060666124, + "tokens_seen": 1188950016 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032311935807422264, + "loss": 2.8621, + "theoretical_loss": 3.590503069715714, + "tokens_seen": 1189015552 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003231093279839519, + "loss": 2.7728, + "theoretical_loss": 3.5904848346513605, + "tokens_seen": 1189081088 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032309929789368105, + "loss": 2.8418, + "theoretical_loss": 3.5904666008733903, + "tokens_seen": 1189146624 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032308926780341024, + "loss": 2.7546, + "theoretical_loss": 3.590448368381642, + "tokens_seen": 1189212160 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003230792377131394, + "loss": 2.4472, + "theoretical_loss": 3.5904301371759533, + "tokens_seen": 1189277696 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003230692076228686, + "loss": 2.7317, + "theoretical_loss": 3.590411907256164, + "tokens_seen": 1189343232 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003230591775325978, + "loss": 2.7694, + "theoretical_loss": 3.590393678622111, + "tokens_seen": 1189408768 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 1352239, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0367116928100586, + "objective/train/theoretical_loss": 3.5903754512736334, + "objective/train/tokens_used": 1209934304, + "theoretical_loss": 3.5903754512736334, + "tokens_seen": 1189474304 + }, + { + "epoch": 4.0, + "learning_rate": 0.000323049147442327, + "loss": 2.7382, + "theoretical_loss": 3.5903754512736334, + "tokens_seen": 1189474304 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032303911735205614, + "loss": 2.4576, + "theoretical_loss": 3.59035722521057, + "tokens_seen": 1189539840 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003230290872617854, + "loss": 2.648, + "theoretical_loss": 3.5903390004327593, + "tokens_seen": 1189605376 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003230190571715145, + "loss": 2.7522, + "theoretical_loss": 3.5903207769400396, + "tokens_seen": 1189670912 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032300902708124374, + "loss": 2.87, + "theoretical_loss": 3.5903025547322494, + "tokens_seen": 1189736448 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003229989969909729, + "loss": 2.7883, + "theoretical_loss": 3.590284333809228, + "tokens_seen": 1189801984 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003229889669007021, + "loss": 2.8014, + "theoretical_loss": 3.590266114170813, + "tokens_seen": 1189867520 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003229789368104313, + "loss": 2.9751, + "theoretical_loss": 3.5902478958168436, + "tokens_seen": 1189933056 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003229689067201605, + "loss": 2.8034, + "theoretical_loss": 3.590229678747159, + "tokens_seen": 1189998592 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032295887662988964, + "loss": 2.6914, + "theoretical_loss": 3.590211462961597, + "tokens_seen": 1190064128 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003229488465396189, + "loss": 2.7211, + "theoretical_loss": 3.5901932484599977, + "tokens_seen": 1190129664 + }, + { + "epoch": 4.0, + "learning_rate": 0.000322938816449348, + "loss": 2.7461, + "theoretical_loss": 3.590175035242199, + "tokens_seen": 1190195200 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032292878635907724, + "loss": 2.6821, + "theoretical_loss": 3.5901568233080394, + "tokens_seen": 1190260736 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003229187562688064, + "loss": 2.7034, + "theoretical_loss": 3.5901386126573582, + "tokens_seen": 1190326272 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003229087261785356, + "loss": 2.6984, + "theoretical_loss": 3.590120403289994, + "tokens_seen": 1190391808 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003228986960882648, + "loss": 2.8506, + "theoretical_loss": 3.5901021952057866, + "tokens_seen": 1190457344 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032288866599799397, + "loss": 2.8324, + "theoretical_loss": 3.590083988404574, + "tokens_seen": 1190522880 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032287863590772315, + "loss": 2.7436, + "theoretical_loss": 3.590065782886196, + "tokens_seen": 1190588416 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003228686058174524, + "loss": 2.669, + "theoretical_loss": 3.590047578650491, + "tokens_seen": 1190653952 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003228585757271815, + "loss": 2.7477, + "theoretical_loss": 3.5900293756972976, + "tokens_seen": 1190719488 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032284854563691074, + "loss": 2.7421, + "theoretical_loss": 3.590011174026456, + "tokens_seen": 1190785024 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003228385155466399, + "loss": 2.7889, + "theoretical_loss": 3.5899929736378047, + "tokens_seen": 1190850560 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003228284854563691, + "loss": 2.8669, + "theoretical_loss": 3.589974774531182, + "tokens_seen": 1190916096 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032281845536609834, + "loss": 2.7807, + "theoretical_loss": 3.5899565767064288, + "tokens_seen": 1190981632 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032280842527582747, + "loss": 2.707, + "theoretical_loss": 3.5899383801633835, + "tokens_seen": 1191047168 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 1357258, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0113937854766846, + "objective/train/theoretical_loss": 3.5899201849018842, + "objective/train/tokens_used": 1211572704, + "theoretical_loss": 3.5899201849018842, + "tokens_seen": 1191112704 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003227983951855567, + "loss": 2.8706, + "theoretical_loss": 3.5899201849018842, + "tokens_seen": 1191112704 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003227883650952859, + "loss": 2.815, + "theoretical_loss": 3.5899019909217724, + "tokens_seen": 1191178240 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032277833500501507, + "loss": 2.82, + "theoretical_loss": 3.589883798222885, + "tokens_seen": 1191243776 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032276830491474425, + "loss": 2.4651, + "theoretical_loss": 3.5898656068050627, + "tokens_seen": 1191309312 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032275827482447343, + "loss": 2.6, + "theoretical_loss": 3.589847416668145, + "tokens_seen": 1191374848 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003227482447342026, + "loss": 2.7682, + "theoretical_loss": 3.58982922781197, + "tokens_seen": 1191440384 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032273821464393185, + "loss": 2.7798, + "theoretical_loss": 3.5898110402363788, + "tokens_seen": 1191505920 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032272818455366097, + "loss": 2.6923, + "theoretical_loss": 3.5897928539412094, + "tokens_seen": 1191571456 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003227181544633902, + "loss": 2.6961, + "theoretical_loss": 3.5897746689263017, + "tokens_seen": 1191636992 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032270812437311933, + "loss": 2.7842, + "theoretical_loss": 3.5897564851914954, + "tokens_seen": 1191702528 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032269809428284857, + "loss": 2.6957, + "theoretical_loss": 3.589738302736629, + "tokens_seen": 1191768064 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032268806419257775, + "loss": 2.8474, + "theoretical_loss": 3.5897201215615437, + "tokens_seen": 1191833600 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032267803410230693, + "loss": 2.754, + "theoretical_loss": 3.589701941666078, + "tokens_seen": 1191899136 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003226680040120361, + "loss": 2.6007, + "theoretical_loss": 3.589683763050071, + "tokens_seen": 1191964672 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032265797392176535, + "loss": 2.9095, + "theoretical_loss": 3.5896655857133637, + "tokens_seen": 1192030208 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003226479438314945, + "loss": 2.7656, + "theoretical_loss": 3.589647409655795, + "tokens_seen": 1192095744 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003226379137412237, + "loss": 2.7206, + "theoretical_loss": 3.5896292348772043, + "tokens_seen": 1192161280 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032262788365095284, + "loss": 2.9005, + "theoretical_loss": 3.5896110613774317, + "tokens_seen": 1192226816 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003226178535606821, + "loss": 2.8052, + "theoretical_loss": 3.5895928891563167, + "tokens_seen": 1192292352 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032260782347041125, + "loss": 2.7781, + "theoretical_loss": 3.589574718213699, + "tokens_seen": 1192357888 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032259779338014044, + "loss": 2.7395, + "theoretical_loss": 3.589556548549419, + "tokens_seen": 1192423424 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003225877632898696, + "loss": 2.931, + "theoretical_loss": 3.589538380163316, + "tokens_seen": 1192488960 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003225777331995988, + "loss": 2.7551, + "theoretical_loss": 3.5895202130552297, + "tokens_seen": 1192554496 + }, + { + "epoch": 4.0, + "learning_rate": 0.000322567703109328, + "loss": 2.7762, + "theoretical_loss": 3.5895020472250003, + "tokens_seen": 1192620032 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003225576730190572, + "loss": 2.6019, + "theoretical_loss": 3.589483882672468, + "tokens_seen": 1192685568 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 1362192, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9670515060424805, + "objective/train/theoretical_loss": 3.589465719397472, + "objective/train/tokens_used": 1213211104, + "theoretical_loss": 3.589465719397472, + "tokens_seen": 1192751104 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032254764292878634, + "loss": 2.8065, + "theoretical_loss": 3.589465719397472, + "tokens_seen": 1192751104 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003225376128385156, + "loss": 2.6473, + "theoretical_loss": 3.5894475573998528, + "tokens_seen": 1192816640 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003225275827482447, + "loss": 2.7868, + "theoretical_loss": 3.58942939667945, + "tokens_seen": 1192882176 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032251755265797394, + "loss": 2.7406, + "theoretical_loss": 3.589411237236104, + "tokens_seen": 1192947712 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003225075225677031, + "loss": 2.8387, + "theoretical_loss": 3.589393079069654, + "tokens_seen": 1193013248 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003224974924774323, + "loss": 2.6082, + "theoretical_loss": 3.5893749221799416, + "tokens_seen": 1193078784 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003224874623871615, + "loss": 2.7082, + "theoretical_loss": 3.589356766566806, + "tokens_seen": 1193144320 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003224774322968907, + "loss": 2.7234, + "theoretical_loss": 3.5893386122300868, + "tokens_seen": 1193209856 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032246740220661984, + "loss": 2.6437, + "theoretical_loss": 3.5893204591696253, + "tokens_seen": 1193275392 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003224573721163491, + "loss": 2.6607, + "theoretical_loss": 3.5893023073852612, + "tokens_seen": 1193340928 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003224473420260782, + "loss": 2.6923, + "theoretical_loss": 3.5892841568768343, + "tokens_seen": 1193406464 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032243731193580744, + "loss": 2.758, + "theoretical_loss": 3.589266007644185, + "tokens_seen": 1193472000 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003224272818455366, + "loss": 2.7518, + "theoretical_loss": 3.5892478596871547, + "tokens_seen": 1193537536 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003224172517552658, + "loss": 2.7189, + "theoretical_loss": 3.589229713005582, + "tokens_seen": 1193603072 + }, + { + "epoch": 4.0, + "learning_rate": 0.000322407221664995, + "loss": 2.7066, + "theoretical_loss": 3.5892115675993077, + "tokens_seen": 1193668608 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032239719157472417, + "loss": 2.6636, + "theoretical_loss": 3.5891934234681733, + "tokens_seen": 1193734144 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032238716148445335, + "loss": 2.6847, + "theoretical_loss": 3.589175280612018, + "tokens_seen": 1193799680 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003223771313941826, + "loss": 2.7743, + "theoretical_loss": 3.589157139030683, + "tokens_seen": 1193865216 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003223671013039117, + "loss": 2.8292, + "theoretical_loss": 3.589138998724008, + "tokens_seen": 1193930752 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032235707121364095, + "loss": 2.8121, + "theoretical_loss": 3.5891208596918345, + "tokens_seen": 1193996288 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003223470411233701, + "loss": 2.7214, + "theoretical_loss": 3.589102721934002, + "tokens_seen": 1194061824 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003223370110330993, + "loss": 2.7198, + "theoretical_loss": 3.589084585450351, + "tokens_seen": 1194127360 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003223269809428285, + "loss": 2.7321, + "theoretical_loss": 3.589066450240723, + "tokens_seen": 1194192896 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032231695085255767, + "loss": 2.669, + "theoretical_loss": 3.5890483163049574, + "tokens_seen": 1194258432 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032230692076228685, + "loss": 2.8583, + "theoretical_loss": 3.5890301836428957, + "tokens_seen": 1194323968 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 1367200, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5147886276245117, + "objective/train/theoretical_loss": 3.589012052254379, + "objective/train/tokens_used": 1214849504, + "theoretical_loss": 3.589012052254379, + "tokens_seen": 1194389504 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003222968906720161, + "loss": 2.6539, + "theoretical_loss": 3.589012052254379, + "tokens_seen": 1194389504 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003222868605817452, + "loss": 2.8133, + "theoretical_loss": 3.5889939221392466, + "tokens_seen": 1194455040 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032227683049147445, + "loss": 2.7431, + "theoretical_loss": 3.5889757932973403, + "tokens_seen": 1194520576 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003222668004012036, + "loss": 2.7153, + "theoretical_loss": 3.5889576657285, + "tokens_seen": 1194586112 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003222567703109328, + "loss": 2.691, + "theoretical_loss": 3.5889395394325674, + "tokens_seen": 1194651648 + }, + { + "epoch": 4.0, + "learning_rate": 0.000322246740220662, + "loss": 2.784, + "theoretical_loss": 3.5889214144093824, + "tokens_seen": 1194717184 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032223671013039117, + "loss": 2.6861, + "theoretical_loss": 3.588903290658787, + "tokens_seen": 1194782720 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032222668004012035, + "loss": 2.567, + "theoretical_loss": 3.5888851681806204, + "tokens_seen": 1194848256 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032221664994984953, + "loss": 2.6833, + "theoretical_loss": 3.588867046974725, + "tokens_seen": 1194913792 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003222066198595787, + "loss": 2.7416, + "theoretical_loss": 3.588848927040941, + "tokens_seen": 1194979328 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032219658976930795, + "loss": 2.6881, + "theoretical_loss": 3.588830808379109, + "tokens_seen": 1195044864 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003221865596790371, + "loss": 2.5894, + "theoretical_loss": 3.588812690989071, + "tokens_seen": 1195110400 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003221765295887663, + "loss": 2.8525, + "theoretical_loss": 3.588794574870667, + "tokens_seen": 1195175936 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003221664994984955, + "loss": 2.6549, + "theoretical_loss": 3.588776460023739, + "tokens_seen": 1195241472 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003221564694082247, + "loss": 2.8704, + "theoretical_loss": 3.5887583464481265, + "tokens_seen": 1195307008 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032214643931795386, + "loss": 2.7525, + "theoretical_loss": 3.5887402341436725, + "tokens_seen": 1195372544 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032213640922768304, + "loss": 2.745, + "theoretical_loss": 3.588722123110217, + "tokens_seen": 1195438080 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003221263791374122, + "loss": 2.8724, + "theoretical_loss": 3.588704013347601, + "tokens_seen": 1195503616 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032211634904714145, + "loss": 2.6772, + "theoretical_loss": 3.588685904855666, + "tokens_seen": 1195569152 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003221063189568706, + "loss": 2.5685, + "theoretical_loss": 3.588667797634253, + "tokens_seen": 1195634688 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003220962888665998, + "loss": 2.6183, + "theoretical_loss": 3.5886496916832034, + "tokens_seen": 1195700224 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032208625877632894, + "loss": 2.8578, + "theoretical_loss": 3.588631587002359, + "tokens_seen": 1195765760 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003220762286860582, + "loss": 2.4787, + "theoretical_loss": 3.58861348359156, + "tokens_seen": 1195831296 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003220661985957874, + "loss": 2.4697, + "theoretical_loss": 3.5885953814506486, + "tokens_seen": 1195896832 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032205616850551654, + "loss": 2.6127, + "theoretical_loss": 3.588577280579466, + "tokens_seen": 1195962368 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 1372299, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.829193592071533, + "objective/train/theoretical_loss": 3.5885591809778523, + "objective/train/tokens_used": 1216487904, + "theoretical_loss": 3.5885591809778523, + "tokens_seen": 1196027904 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003220461384152458, + "loss": 2.8615, + "theoretical_loss": 3.5885591809778523, + "tokens_seen": 1196027904 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003220361083249749, + "loss": 2.649, + "theoretical_loss": 3.588541082645651, + "tokens_seen": 1196093440 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032202607823470414, + "loss": 2.816, + "theoretical_loss": 3.588522985582702, + "tokens_seen": 1196158976 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003220160481444333, + "loss": 2.7417, + "theoretical_loss": 3.5885048897888474, + "tokens_seen": 1196224512 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003220060180541625, + "loss": 2.7062, + "theoretical_loss": 3.588486795263928, + "tokens_seen": 1196290048 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003219959879638917, + "loss": 2.618, + "theoretical_loss": 3.588468702007787, + "tokens_seen": 1196355584 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003219859578736209, + "loss": 2.4866, + "theoretical_loss": 3.5884506100202636, + "tokens_seen": 1196421120 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032197592778335004, + "loss": 2.5016, + "theoretical_loss": 3.5884325193012008, + "tokens_seen": 1196486656 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003219658976930793, + "loss": 2.5037, + "theoretical_loss": 3.58841442985044, + "tokens_seen": 1196552192 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003219558676028084, + "loss": 2.5967, + "theoretical_loss": 3.588396341667823, + "tokens_seen": 1196617728 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032194583751253764, + "loss": 2.6184, + "theoretical_loss": 3.5883782547531906, + "tokens_seen": 1196683264 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003219358074222668, + "loss": 2.8586, + "theoretical_loss": 3.5883601691063856, + "tokens_seen": 1196748800 + }, + { + "epoch": 4.0, + "learning_rate": 0.000321925777331996, + "loss": 2.7405, + "theoretical_loss": 3.5883420847272487, + "tokens_seen": 1196814336 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003219157472417252, + "loss": 2.8422, + "theoretical_loss": 3.5883240016156224, + "tokens_seen": 1196879872 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032190571715145437, + "loss": 2.7227, + "theoretical_loss": 3.588305919771348, + "tokens_seen": 1196945408 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032189568706118355, + "loss": 2.8192, + "theoretical_loss": 3.5882878391942676, + "tokens_seen": 1197010944 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003218856569709128, + "loss": 2.6226, + "theoretical_loss": 3.5882697598842226, + "tokens_seen": 1197076480 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003218756268806419, + "loss": 2.6056, + "theoretical_loss": 3.5882516818410557, + "tokens_seen": 1197142016 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032186559679037115, + "loss": 2.751, + "theoretical_loss": 3.588233605064608, + "tokens_seen": 1197207552 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003218555667001003, + "loss": 2.5782, + "theoretical_loss": 3.588215529554722, + "tokens_seen": 1197273088 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003218455366098295, + "loss": 2.5359, + "theoretical_loss": 3.5881974553112386, + "tokens_seen": 1197338624 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003218355065195587, + "loss": 2.6353, + "theoretical_loss": 3.5881793823340007, + "tokens_seen": 1197404160 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032182547642928787, + "loss": 2.4617, + "theoretical_loss": 3.58816131062285, + "tokens_seen": 1197469696 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032181544633901705, + "loss": 2.8249, + "theoretical_loss": 3.5881432401776285, + "tokens_seen": 1197535232 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003218054162487463, + "loss": 2.753, + "theoretical_loss": 3.588125170998178, + "tokens_seen": 1197600768 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 1377331, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7911863327026367, + "objective/train/theoretical_loss": 3.588107103084342, + "objective/train/tokens_used": 1218126304, + "theoretical_loss": 3.588107103084342, + "tokens_seen": 1197666304 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003217953861584754, + "loss": 2.8607, + "theoretical_loss": 3.588107103084342, + "tokens_seen": 1197666304 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032178535606820465, + "loss": 2.597, + "theoretical_loss": 3.58808903643596, + "tokens_seen": 1197731840 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003217753259779338, + "loss": 2.684, + "theoretical_loss": 3.5880709710528764, + "tokens_seen": 1197797376 + }, + { + "epoch": 4.0, + "learning_rate": 0.000321765295887663, + "loss": 2.819, + "theoretical_loss": 3.5880529069349327, + "tokens_seen": 1197862912 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003217552657973922, + "loss": 2.6415, + "theoretical_loss": 3.588034844081971, + "tokens_seen": 1197928448 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032174523570712137, + "loss": 2.6505, + "theoretical_loss": 3.588016782493833, + "tokens_seen": 1197993984 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032173520561685055, + "loss": 2.799, + "theoretical_loss": 3.587998722170362, + "tokens_seen": 1198059520 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032172517552657974, + "loss": 2.7641, + "theoretical_loss": 3.5879806631113995, + "tokens_seen": 1198125056 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003217151454363089, + "loss": 2.7987, + "theoretical_loss": 3.587962605316788, + "tokens_seen": 1198190592 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032170511534603815, + "loss": 2.6003, + "theoretical_loss": 3.5879445487863695, + "tokens_seen": 1198256128 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003216950852557673, + "loss": 2.8261, + "theoretical_loss": 3.587926493519987, + "tokens_seen": 1198321664 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003216850551654965, + "loss": 2.6754, + "theoretical_loss": 3.5879084395174825, + "tokens_seen": 1198387200 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003216750250752257, + "loss": 2.7249, + "theoretical_loss": 3.5878903867786986, + "tokens_seen": 1198452736 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003216649949849549, + "loss": 2.6604, + "theoretical_loss": 3.5878723353034783, + "tokens_seen": 1198518272 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032165496489468406, + "loss": 2.925, + "theoretical_loss": 3.5878542850916624, + "tokens_seen": 1198583808 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032164493480441324, + "loss": 2.7317, + "theoretical_loss": 3.587836236143095, + "tokens_seen": 1198649344 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003216349047141424, + "loss": 2.8008, + "theoretical_loss": 3.587818188457618, + "tokens_seen": 1198714880 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032162487462387165, + "loss": 2.689, + "theoretical_loss": 3.587800142035074, + "tokens_seen": 1198780416 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003216148445336008, + "loss": 2.7237, + "theoretical_loss": 3.5877820968753054, + "tokens_seen": 1198845952 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032160481444333, + "loss": 2.6511, + "theoretical_loss": 3.587764052978155, + "tokens_seen": 1198911488 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032159478435305914, + "loss": 2.5991, + "theoretical_loss": 3.587746010343466, + "tokens_seen": 1198977024 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003215847542627884, + "loss": 2.7523, + "theoretical_loss": 3.5877279689710804, + "tokens_seen": 1199042560 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032157472417251756, + "loss": 2.8926, + "theoretical_loss": 3.587709928860841, + "tokens_seen": 1199108096 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032156469408224674, + "loss": 2.6584, + "theoretical_loss": 3.5876918900125903, + "tokens_seen": 1199173632 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003215546639919759, + "loss": 2.5422, + "theoretical_loss": 3.5876738524261715, + "tokens_seen": 1199239168 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 1382178, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.804316282272339, + "objective/train/theoretical_loss": 3.587655816101427, + "objective/train/tokens_used": 1219764704, + "theoretical_loss": 3.587655816101427, + "tokens_seen": 1199304704 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003215446339017051, + "loss": 2.7885, + "theoretical_loss": 3.587655816101427, + "tokens_seen": 1199304704 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003215346038114343, + "loss": 2.8044, + "theoretical_loss": 3.5876377810382003, + "tokens_seen": 1199370240 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003215245737211635, + "loss": 2.7155, + "theoretical_loss": 3.5876197472363334, + "tokens_seen": 1199435776 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032151454363089265, + "loss": 2.5803, + "theoretical_loss": 3.58760171469567, + "tokens_seen": 1199501312 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003215045135406219, + "loss": 2.6165, + "theoretical_loss": 3.587583683416052, + "tokens_seen": 1199566848 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032149448345035106, + "loss": 2.7678, + "theoretical_loss": 3.5875656533973235, + "tokens_seen": 1199632384 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032148445336008024, + "loss": 2.8196, + "theoretical_loss": 3.5875476246393263, + "tokens_seen": 1199697920 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003214744232698094, + "loss": 2.549, + "theoretical_loss": 3.587529597141904, + "tokens_seen": 1199763456 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003214643931795386, + "loss": 2.85, + "theoretical_loss": 3.5875115709048995, + "tokens_seen": 1199828992 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003214543630892678, + "loss": 2.6543, + "theoretical_loss": 3.587493545928156, + "tokens_seen": 1199894528 + }, + { + "epoch": 4.0, + "learning_rate": 0.000321444332998997, + "loss": 2.7943, + "theoretical_loss": 3.587475522211516, + "tokens_seen": 1199960064 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032143430290872615, + "loss": 2.6293, + "theoretical_loss": 3.587457499754823, + "tokens_seen": 1200025600 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003214242728184554, + "loss": 2.7308, + "theoretical_loss": 3.5874394785579207, + "tokens_seen": 1200091136 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003214142427281845, + "loss": 2.6226, + "theoretical_loss": 3.587421458620651, + "tokens_seen": 1200156672 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032140421263791375, + "loss": 2.615, + "theoretical_loss": 3.5874034399428583, + "tokens_seen": 1200222208 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032139418254764293, + "loss": 2.7029, + "theoretical_loss": 3.5873854225243855, + "tokens_seen": 1200287744 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003213841524573721, + "loss": 2.6413, + "theoretical_loss": 3.587367406365075, + "tokens_seen": 1200353280 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003213741223671013, + "loss": 2.6256, + "theoretical_loss": 3.5873493914647705, + "tokens_seen": 1200418816 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003213640922768305, + "loss": 2.756, + "theoretical_loss": 3.5873313778233165, + "tokens_seen": 1200484352 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032135406218655965, + "loss": 2.6721, + "theoretical_loss": 3.5873133654405542, + "tokens_seen": 1200549888 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003213440320962889, + "loss": 2.6894, + "theoretical_loss": 3.5872953543163284, + "tokens_seen": 1200615424 + }, + { + "epoch": 4.0, + "learning_rate": 0.000321334002006018, + "loss": 2.7949, + "theoretical_loss": 3.587277344450482, + "tokens_seen": 1200680960 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032132397191574725, + "loss": 2.8017, + "theoretical_loss": 3.587259335842859, + "tokens_seen": 1200746496 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003213139418254765, + "loss": 2.7304, + "theoretical_loss": 3.5872413284933016, + "tokens_seen": 1200812032 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003213039117352056, + "loss": 2.8873, + "theoretical_loss": 3.5872233224016545, + "tokens_seen": 1200877568 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 1387236, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.934062957763672, + "objective/train/theoretical_loss": 3.5872053175677605, + "objective/train/tokens_used": 1221403104, + "theoretical_loss": 3.5872053175677605, + "tokens_seen": 1200943104 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032129388164493485, + "loss": 2.7567, + "theoretical_loss": 3.5872053175677605, + "tokens_seen": 1200943104 + }, + { + "epoch": 4.0, + "learning_rate": 0.000321283851554664, + "loss": 2.6779, + "theoretical_loss": 3.587187313991463, + "tokens_seen": 1201008640 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003212738214643932, + "loss": 2.7564, + "theoretical_loss": 3.587169311672606, + "tokens_seen": 1201074176 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003212637913741224, + "loss": 2.8282, + "theoretical_loss": 3.5871513106110333, + "tokens_seen": 1201139712 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003212537612838516, + "loss": 2.5829, + "theoretical_loss": 3.587133310806588, + "tokens_seen": 1201205248 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032124373119358075, + "loss": 2.6074, + "theoretical_loss": 3.5871153122591135, + "tokens_seen": 1201270784 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032123370110330994, + "loss": 2.5451, + "theoretical_loss": 3.587097314968454, + "tokens_seen": 1201336320 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003212236710130391, + "loss": 2.7113, + "theoretical_loss": 3.5870793189344523, + "tokens_seen": 1201401856 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032121364092276835, + "loss": 2.4835, + "theoretical_loss": 3.587061324156954, + "tokens_seen": 1201467392 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003212036108324975, + "loss": 2.5967, + "theoretical_loss": 3.587043330635801, + "tokens_seen": 1201532928 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003211935807422267, + "loss": 2.6397, + "theoretical_loss": 3.5870253383708377, + "tokens_seen": 1201598464 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003211835506519559, + "loss": 2.7351, + "theoretical_loss": 3.5870073473619075, + "tokens_seen": 1201664000 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003211735205616851, + "loss": 2.6823, + "theoretical_loss": 3.5869893576088545, + "tokens_seen": 1201729536 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032116349047141426, + "loss": 2.5033, + "theoretical_loss": 3.5869713691115237, + "tokens_seen": 1201795072 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032115346038114344, + "loss": 2.7195, + "theoretical_loss": 3.586953381869757, + "tokens_seen": 1201860608 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003211434302908726, + "loss": 2.691, + "theoretical_loss": 3.5869353958833994, + "tokens_seen": 1201926144 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032113340020060185, + "loss": 2.7591, + "theoretical_loss": 3.586917411152295, + "tokens_seen": 1201991680 + }, + { + "epoch": 4.0, + "learning_rate": 0.000321123370110331, + "loss": 2.7759, + "theoretical_loss": 3.586899427676287, + "tokens_seen": 1202057216 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003211133400200602, + "loss": 2.5341, + "theoretical_loss": 3.5868814454552203, + "tokens_seen": 1202122752 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032110330992978934, + "loss": 2.7228, + "theoretical_loss": 3.586863464488938, + "tokens_seen": 1202188288 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003210932798395186, + "loss": 2.7055, + "theoretical_loss": 3.5868454847772853, + "tokens_seen": 1202253824 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032108324974924776, + "loss": 2.6004, + "theoretical_loss": 3.586827506320105, + "tokens_seen": 1202319360 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032107321965897694, + "loss": 2.7184, + "theoretical_loss": 3.5868095291172413, + "tokens_seen": 1202384896 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003210631895687061, + "loss": 2.7188, + "theoretical_loss": 3.5867915531685397, + "tokens_seen": 1202450432 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003210531594784353, + "loss": 2.7433, + "theoretical_loss": 3.586773578473843, + "tokens_seen": 1202515968 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 1392256, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8471696376800537, + "objective/train/theoretical_loss": 3.586755605032996, + "objective/train/tokens_used": 1223041504, + "theoretical_loss": 3.586755605032996, + "tokens_seen": 1202581504 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003210431293881645, + "loss": 2.7683, + "theoretical_loss": 3.586755605032996, + "tokens_seen": 1202581504 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003210330992978937, + "loss": 2.6809, + "theoretical_loss": 3.5867376328458427, + "tokens_seen": 1202647040 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032102306920762285, + "loss": 2.7461, + "theoretical_loss": 3.5867196619122272, + "tokens_seen": 1202712576 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003210130391173521, + "loss": 2.7872, + "theoretical_loss": 3.586701692231994, + "tokens_seen": 1202778112 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032100300902708126, + "loss": 2.7223, + "theoretical_loss": 3.5866837238049873, + "tokens_seen": 1202843648 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032099297893681044, + "loss": 2.6087, + "theoretical_loss": 3.586665756631052, + "tokens_seen": 1202909184 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003209829488465396, + "loss": 2.6453, + "theoretical_loss": 3.5866477907100314, + "tokens_seen": 1202974720 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003209729187562688, + "loss": 2.6654, + "theoretical_loss": 3.5866298260417704, + "tokens_seen": 1203040256 + }, + { + "epoch": 4.0, + "learning_rate": 0.000320962888665998, + "loss": 2.6581, + "theoretical_loss": 3.586611862626114, + "tokens_seen": 1203105792 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003209528585757272, + "loss": 2.6549, + "theoretical_loss": 3.5865939004629057, + "tokens_seen": 1203171328 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032094282848545635, + "loss": 2.7228, + "theoretical_loss": 3.58657593955199, + "tokens_seen": 1203236864 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003209327983951856, + "loss": 2.8043, + "theoretical_loss": 3.586557979893212, + "tokens_seen": 1203302400 + }, + { + "epoch": 4.0, + "learning_rate": 0.0003209227683049147, + "loss": 2.5721, + "theoretical_loss": 3.5865400214864156, + "tokens_seen": 1203367936 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032091273821464395, + "loss": 2.6316, + "theoretical_loss": 3.5865220643314464, + "tokens_seen": 1203433472 + }, + { + "epoch": 4.0, + "learning_rate": 0.00032090270812437313, + "loss": 2.6906, + "theoretical_loss": 3.586504108428148, + "tokens_seen": 1203499008 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003208926780341023, + "loss": 2.6847, + "theoretical_loss": 3.586486153776365, + "tokens_seen": 1203564544 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003208826479438315, + "loss": 2.7254, + "theoretical_loss": 3.586468200375942, + "tokens_seen": 1203630080 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003208726178535607, + "loss": 2.7631, + "theoretical_loss": 3.5864502482267246, + "tokens_seen": 1203695616 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032086258776328985, + "loss": 2.6306, + "theoretical_loss": 3.5864322973285567, + "tokens_seen": 1203761152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003208525576730191, + "loss": 2.806, + "theoretical_loss": 3.5864143476812824, + "tokens_seen": 1203826688 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003208425275827482, + "loss": 2.6084, + "theoretical_loss": 3.586396399284748, + "tokens_seen": 1203892224 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032083249749247745, + "loss": 2.7025, + "theoretical_loss": 3.5863784521387974, + "tokens_seen": 1203957760 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032082246740220663, + "loss": 2.7901, + "theoretical_loss": 3.5863605062432753, + "tokens_seen": 1204023296 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003208124373119358, + "loss": 2.7009, + "theoretical_loss": 3.586342561598027, + "tokens_seen": 1204088832 + }, + { + "epoch": 4.01, + "learning_rate": 0.000320802407221665, + "loss": 2.6494, + "theoretical_loss": 3.5863246182028963, + "tokens_seen": 1204154368 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1397361, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8406195640563965, + "objective/train/theoretical_loss": 3.5863066760577293, + "objective/train/tokens_used": 1224679904, + "theoretical_loss": 3.5863066760577293, + "tokens_seen": 1204219904 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003207923771313942, + "loss": 2.6758, + "theoretical_loss": 3.5863066760577293, + "tokens_seen": 1204219904 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032078234704112336, + "loss": 2.831, + "theoretical_loss": 3.5862887351623702, + "tokens_seen": 1204285440 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003207723169508526, + "loss": 2.8496, + "theoretical_loss": 3.5862707955166644, + "tokens_seen": 1204350976 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003207622868605817, + "loss": 2.6733, + "theoretical_loss": 3.5862528571204564, + "tokens_seen": 1204416512 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032075225677031095, + "loss": 2.7109, + "theoretical_loss": 3.5862349199735912, + "tokens_seen": 1204482048 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003207422266800401, + "loss": 2.7204, + "theoretical_loss": 3.5862169840759144, + "tokens_seen": 1204547584 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003207321965897693, + "loss": 2.6926, + "theoretical_loss": 3.586199049427271, + "tokens_seen": 1204613120 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003207221664994985, + "loss": 2.6064, + "theoretical_loss": 3.586181116027505, + "tokens_seen": 1204678656 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003207121364092277, + "loss": 2.5865, + "theoretical_loss": 3.5861631838764625, + "tokens_seen": 1204744192 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032070210631895686, + "loss": 2.5053, + "theoretical_loss": 3.5861452529739886, + "tokens_seen": 1204809728 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003206920762286861, + "loss": 2.751, + "theoretical_loss": 3.5861273233199276, + "tokens_seen": 1204875264 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003206820461384152, + "loss": 2.8412, + "theoretical_loss": 3.586109394914126, + "tokens_seen": 1204940800 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032067201604814446, + "loss": 2.79, + "theoretical_loss": 3.586091467756428, + "tokens_seen": 1205006336 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003206619859578736, + "loss": 2.7373, + "theoretical_loss": 3.586073541846679, + "tokens_seen": 1205071872 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003206519558676028, + "loss": 2.7581, + "theoretical_loss": 3.5860556171847247, + "tokens_seen": 1205137408 + }, + { + "epoch": 4.01, + "learning_rate": 0.000320641925777332, + "loss": 2.9763, + "theoretical_loss": 3.58603769377041, + "tokens_seen": 1205202944 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003206318956870612, + "loss": 2.7726, + "theoretical_loss": 3.5860197716035804, + "tokens_seen": 1205268480 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032062186559679036, + "loss": 2.6571, + "theoretical_loss": 3.586001850684081, + "tokens_seen": 1205334016 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032061183550651954, + "loss": 2.6722, + "theoretical_loss": 3.5859839310117576, + "tokens_seen": 1205399552 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003206018054162487, + "loss": 2.6794, + "theoretical_loss": 3.5859660125864554, + "tokens_seen": 1205465088 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032059177532597796, + "loss": 2.7808, + "theoretical_loss": 3.5859480954080194, + "tokens_seen": 1205530624 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003205817452357071, + "loss": 2.7927, + "theoretical_loss": 3.585930179476296, + "tokens_seen": 1205596160 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003205717151454363, + "loss": 2.6525, + "theoretical_loss": 3.58591226479113, + "tokens_seen": 1205661696 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003205616850551655, + "loss": 2.7671, + "theoretical_loss": 3.5858943513523664, + "tokens_seen": 1205727232 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003205516549648947, + "loss": 2.771, + "theoretical_loss": 3.5858764391598523, + "tokens_seen": 1205792768 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1402482, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.825957775115967, + "objective/train/theoretical_loss": 3.585858528213432, + "objective/train/tokens_used": 1226318304, + "theoretical_loss": 3.585858528213432, + "tokens_seen": 1205858304 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003205416248746239, + "loss": 2.6716, + "theoretical_loss": 3.585858528213432, + "tokens_seen": 1205858304 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032053159478435305, + "loss": 2.5004, + "theoretical_loss": 3.5858406185129517, + "tokens_seen": 1205923840 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003205215646940823, + "loss": 2.6681, + "theoretical_loss": 3.585822710058257, + "tokens_seen": 1205989376 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032051153460381146, + "loss": 2.6805, + "theoretical_loss": 3.585804802849193, + "tokens_seen": 1206054912 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032050150451354064, + "loss": 2.557, + "theoretical_loss": 3.5857868968856055, + "tokens_seen": 1206120448 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003204914744232698, + "loss": 2.7872, + "theoretical_loss": 3.585768992167341, + "tokens_seen": 1206185984 + }, + { + "epoch": 4.01, + "learning_rate": 0.000320481444332999, + "loss": 2.696, + "theoretical_loss": 3.585751088694244, + "tokens_seen": 1206251520 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003204714142427282, + "loss": 2.8453, + "theoretical_loss": 3.585733186466161, + "tokens_seen": 1206317056 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003204613841524574, + "loss": 2.7537, + "theoretical_loss": 3.585715285482938, + "tokens_seen": 1206382592 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032045135406218655, + "loss": 2.7335, + "theoretical_loss": 3.585697385744421, + "tokens_seen": 1206448128 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003204413239719158, + "loss": 2.2961, + "theoretical_loss": 3.585679487250455, + "tokens_seen": 1206513664 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003204312938816449, + "loss": 2.6176, + "theoretical_loss": 3.5856615900008864, + "tokens_seen": 1206579200 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032042126379137415, + "loss": 2.553, + "theoretical_loss": 3.5856436939955607, + "tokens_seen": 1206644736 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032041123370110333, + "loss": 2.5838, + "theoretical_loss": 3.5856257992343243, + "tokens_seen": 1206710272 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003204012036108325, + "loss": 2.813, + "theoretical_loss": 3.5856079057170227, + "tokens_seen": 1206775808 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003203911735205617, + "loss": 2.719, + "theoretical_loss": 3.585590013443502, + "tokens_seen": 1206841344 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003203811434302909, + "loss": 2.7019, + "theoretical_loss": 3.585572122413609, + "tokens_seen": 1206906880 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032037111334002005, + "loss": 2.4981, + "theoretical_loss": 3.5855542326271888, + "tokens_seen": 1206972416 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003203610832497493, + "loss": 2.6035, + "theoretical_loss": 3.5855363440840873, + "tokens_seen": 1207037952 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003203510531594784, + "loss": 2.6704, + "theoretical_loss": 3.5855184567841514, + "tokens_seen": 1207103488 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032034102306920765, + "loss": 2.5376, + "theoretical_loss": 3.585500570727227, + "tokens_seen": 1207169024 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032033099297893683, + "loss": 2.5535, + "theoretical_loss": 3.58548268591316, + "tokens_seen": 1207234560 + }, + { + "epoch": 4.01, + "learning_rate": 0.000320320962888666, + "loss": 2.7184, + "theoretical_loss": 3.5854648023417965, + "tokens_seen": 1207300096 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003203109327983952, + "loss": 2.746, + "theoretical_loss": 3.585446920012983, + "tokens_seen": 1207365632 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003203009027081244, + "loss": 2.7058, + "theoretical_loss": 3.5854290389265655, + "tokens_seen": 1207431168 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1407494, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6062726974487305, + "objective/train/theoretical_loss": 3.5854111590823905, + "objective/train/tokens_used": 1227956704, + "theoretical_loss": 3.5854111590823905, + "tokens_seen": 1207496704 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032029087261785356, + "loss": 2.688, + "theoretical_loss": 3.5854111590823905, + "tokens_seen": 1207496704 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003202808425275828, + "loss": 2.8863, + "theoretical_loss": 3.5853932804803037, + "tokens_seen": 1207562240 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003202708124373119, + "loss": 2.6487, + "theoretical_loss": 3.5853754031201523, + "tokens_seen": 1207627776 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032026078234704115, + "loss": 2.7036, + "theoretical_loss": 3.585357527001782, + "tokens_seen": 1207693312 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003202507522567703, + "loss": 2.7126, + "theoretical_loss": 3.5853396521250396, + "tokens_seen": 1207758848 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003202407221664995, + "loss": 2.543, + "theoretical_loss": 3.585321778489771, + "tokens_seen": 1207824384 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003202306920762287, + "loss": 2.7911, + "theoretical_loss": 3.585303906095823, + "tokens_seen": 1207889920 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003202206619859579, + "loss": 2.6455, + "theoretical_loss": 3.585286034943042, + "tokens_seen": 1207955456 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032021063189568706, + "loss": 2.6103, + "theoretical_loss": 3.585268165031274, + "tokens_seen": 1208020992 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003202006018054163, + "loss": 2.6886, + "theoretical_loss": 3.5852502963603667, + "tokens_seen": 1208086528 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003201905717151454, + "loss": 2.6231, + "theoretical_loss": 3.585232428930165, + "tokens_seen": 1208152064 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032018054162487466, + "loss": 2.7009, + "theoretical_loss": 3.585214562740517, + "tokens_seen": 1208217600 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003201705115346038, + "loss": 2.8047, + "theoretical_loss": 3.585196697791268, + "tokens_seen": 1208283136 + }, + { + "epoch": 4.01, + "learning_rate": 0.000320160481444333, + "loss": 2.7623, + "theoretical_loss": 3.5851788340822655, + "tokens_seen": 1208348672 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003201504513540622, + "loss": 2.7484, + "theoretical_loss": 3.5851609716133552, + "tokens_seen": 1208414208 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003201404212637914, + "loss": 2.6339, + "theoretical_loss": 3.585143110384385, + "tokens_seen": 1208479744 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032013039117352056, + "loss": 2.6358, + "theoretical_loss": 3.585125250395201, + "tokens_seen": 1208545280 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032012036108324974, + "loss": 2.6491, + "theoretical_loss": 3.58510739164565, + "tokens_seen": 1208610816 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003201103309929789, + "loss": 2.7829, + "theoretical_loss": 3.585089534135578, + "tokens_seen": 1208676352 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032010030090270816, + "loss": 2.5954, + "theoretical_loss": 3.585071677864833, + "tokens_seen": 1208741888 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003200902708124373, + "loss": 2.6858, + "theoretical_loss": 3.585053822833261, + "tokens_seen": 1208807424 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003200802407221665, + "loss": 2.7955, + "theoretical_loss": 3.585035969040709, + "tokens_seen": 1208872960 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032007021063189565, + "loss": 2.7785, + "theoretical_loss": 3.5850181164870243, + "tokens_seen": 1208938496 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003200601805416249, + "loss": 2.6245, + "theoretical_loss": 3.5850002651720527, + "tokens_seen": 1209004032 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032005015045135407, + "loss": 2.5587, + "theoretical_loss": 3.5849824150956424, + "tokens_seen": 1209069568 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1410378, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7569308280944824, + "objective/train/theoretical_loss": 3.5849645662576393, + "objective/train/tokens_used": 1229595104, + "theoretical_loss": 3.5849645662576393, + "tokens_seen": 1209135104 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032004012036108325, + "loss": 2.7375, + "theoretical_loss": 3.5849645662576393, + "tokens_seen": 1209135104 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032003009027081243, + "loss": 2.5791, + "theoretical_loss": 3.584946718657891, + "tokens_seen": 1209200640 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032002006018054166, + "loss": 2.6087, + "theoretical_loss": 3.5849288722962447, + "tokens_seen": 1209266176 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003200100300902708, + "loss": 2.8597, + "theoretical_loss": 3.584911027172547, + "tokens_seen": 1209331712 + }, + { + "epoch": 4.01, + "learning_rate": 0.00032, + "loss": 2.6192, + "theoretical_loss": 3.584893183286644, + "tokens_seen": 1209397248 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031998996990972915, + "loss": 2.6508, + "theoretical_loss": 3.5848753406383844, + "tokens_seen": 1209462784 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003199799398194584, + "loss": 2.564, + "theoretical_loss": 3.5848574992276148, + "tokens_seen": 1209528320 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031996990972918757, + "loss": 2.6464, + "theoretical_loss": 3.584839659054182, + "tokens_seen": 1209593856 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031995987963891675, + "loss": 2.6648, + "theoretical_loss": 3.5848218201179334, + "tokens_seen": 1209659392 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031994984954864593, + "loss": 2.698, + "theoretical_loss": 3.584803982418716, + "tokens_seen": 1209724928 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003199398194583751, + "loss": 2.6806, + "theoretical_loss": 3.584786145956377, + "tokens_seen": 1209790464 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003199297893681043, + "loss": 2.7552, + "theoretical_loss": 3.584768310730764, + "tokens_seen": 1209856000 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031991975927783353, + "loss": 2.7092, + "theoretical_loss": 3.584750476741724, + "tokens_seen": 1209921536 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031990972918756266, + "loss": 2.6622, + "theoretical_loss": 3.5847326439891045, + "tokens_seen": 1209987072 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003198996990972919, + "loss": 2.8497, + "theoretical_loss": 3.584714812472752, + "tokens_seen": 1210052608 + }, + { + "epoch": 4.01, + "learning_rate": 0.000319889669007021, + "loss": 2.592, + "theoretical_loss": 3.584696982192515, + "tokens_seen": 1210118144 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031987963891675025, + "loss": 2.5142, + "theoretical_loss": 3.5846791531482403, + "tokens_seen": 1210183680 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031986960882647943, + "loss": 2.7154, + "theoretical_loss": 3.584661325339775, + "tokens_seen": 1210249216 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003198595787362086, + "loss": 2.7834, + "theoretical_loss": 3.584643498766968, + "tokens_seen": 1210314752 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003198495486459378, + "loss": 2.5158, + "theoretical_loss": 3.584625673429665, + "tokens_seen": 1210380288 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031983951855566703, + "loss": 2.6208, + "theoretical_loss": 3.584607849327714, + "tokens_seen": 1210445824 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031982948846539616, + "loss": 2.6352, + "theoretical_loss": 3.5845900264609623, + "tokens_seen": 1210511360 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003198194583751254, + "loss": 2.7371, + "theoretical_loss": 3.584572204829258, + "tokens_seen": 1210576896 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003198094282848546, + "loss": 2.5, + "theoretical_loss": 3.5845543844324483, + "tokens_seen": 1210642432 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031979939819458376, + "loss": 2.6279, + "theoretical_loss": 3.5845365652703816, + "tokens_seen": 1210707968 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1413162, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.503354787826538, + "objective/train/theoretical_loss": 3.584518747342904, + "objective/train/tokens_used": 1231233504, + "theoretical_loss": 3.584518747342904, + "tokens_seen": 1210773504 + }, + { + "epoch": 4.01, + "learning_rate": 0.000319789368104313, + "loss": 2.5154, + "theoretical_loss": 3.584518747342904, + "tokens_seen": 1210773504 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003197793380140421, + "loss": 2.8354, + "theoretical_loss": 3.5845009306498645, + "tokens_seen": 1210839040 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031976930792377135, + "loss": 2.9319, + "theoretical_loss": 3.58448311519111, + "tokens_seen": 1210904576 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003197592778335005, + "loss": 2.6632, + "theoretical_loss": 3.584465300966489, + "tokens_seen": 1210970112 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003197492477432297, + "loss": 2.6491, + "theoretical_loss": 3.584447487975848, + "tokens_seen": 1211035648 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003197392176529589, + "loss": 2.7853, + "theoretical_loss": 3.5844296762190355, + "tokens_seen": 1211101184 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003197291875626881, + "loss": 2.7017, + "theoretical_loss": 3.5844118656958996, + "tokens_seen": 1211166720 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031971915747241726, + "loss": 2.7758, + "theoretical_loss": 3.5843940564062873, + "tokens_seen": 1211232256 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003197091273821465, + "loss": 2.8573, + "theoretical_loss": 3.584376248350047, + "tokens_seen": 1211297792 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003196990972918756, + "loss": 2.6947, + "theoretical_loss": 3.5843584415270264, + "tokens_seen": 1211363328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031968906720160486, + "loss": 2.6019, + "theoretical_loss": 3.5843406359370737, + "tokens_seen": 1211428864 + }, + { + "epoch": 4.01, + "learning_rate": 0.000319679037111334, + "loss": 2.7002, + "theoretical_loss": 3.584322831580036, + "tokens_seen": 1211494400 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003196690070210632, + "loss": 2.4958, + "theoretical_loss": 3.584305028455762, + "tokens_seen": 1211559936 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003196589769307924, + "loss": 2.7174, + "theoretical_loss": 3.5842872265640993, + "tokens_seen": 1211625472 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003196489468405216, + "loss": 2.6841, + "theoretical_loss": 3.584269425904896, + "tokens_seen": 1211691008 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031963891675025076, + "loss": 2.6407, + "theoretical_loss": 3.584251626478, + "tokens_seen": 1211756544 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031962888665997994, + "loss": 2.8254, + "theoretical_loss": 3.58423382828326, + "tokens_seen": 1211822080 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003196188565697091, + "loss": 2.7335, + "theoretical_loss": 3.5842160313205236, + "tokens_seen": 1211887616 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031960882647943836, + "loss": 2.6786, + "theoretical_loss": 3.5841982355896382, + "tokens_seen": 1211953152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003195987963891675, + "loss": 2.8014, + "theoretical_loss": 3.584180441090453, + "tokens_seen": 1212018688 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003195887662988967, + "loss": 2.6308, + "theoretical_loss": 3.584162647822816, + "tokens_seen": 1212084224 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031957873620862585, + "loss": 2.566, + "theoretical_loss": 3.5841448557865743, + "tokens_seen": 1212149760 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003195687061183551, + "loss": 2.5022, + "theoretical_loss": 3.584127064981578, + "tokens_seen": 1212215296 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031955867602808427, + "loss": 2.831, + "theoretical_loss": 3.5841092754076733, + "tokens_seen": 1212280832 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031954864593781345, + "loss": 2.3898, + "theoretical_loss": 3.5840914870647103, + "tokens_seen": 1212346368 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1414282, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5722250938415527, + "objective/train/theoretical_loss": 3.5840736999525356, + "objective/train/tokens_used": 1232871904, + "theoretical_loss": 3.5840736999525356, + "tokens_seen": 1212411904 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031953861584754263, + "loss": 2.7378, + "theoretical_loss": 3.5840736999525356, + "tokens_seen": 1212411904 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031952858575727186, + "loss": 2.5702, + "theoretical_loss": 3.5840559140709987, + "tokens_seen": 1212477440 + }, + { + "epoch": 4.01, + "learning_rate": 0.000319518555667001, + "loss": 2.7093, + "theoretical_loss": 3.5840381294199477, + "tokens_seen": 1212542976 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003195085255767302, + "loss": 2.8855, + "theoretical_loss": 3.584020345999231, + "tokens_seen": 1212608512 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031949849548645935, + "loss": 2.7143, + "theoretical_loss": 3.5840025638086965, + "tokens_seen": 1212674048 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003194884653961886, + "loss": 2.7684, + "theoretical_loss": 3.5839847828481934, + "tokens_seen": 1212739584 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031947843530591777, + "loss": 2.6283, + "theoretical_loss": 3.5839670031175697, + "tokens_seen": 1212805120 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031946840521564695, + "loss": 2.4975, + "theoretical_loss": 3.5839492246166733, + "tokens_seen": 1212870656 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031945837512537613, + "loss": 2.7088, + "theoretical_loss": 3.583931447345354, + "tokens_seen": 1212936192 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003194483450351053, + "loss": 2.6488, + "theoretical_loss": 3.5839136713034594, + "tokens_seen": 1213001728 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003194383149448345, + "loss": 2.6018, + "theoretical_loss": 3.583895896490838, + "tokens_seen": 1213067264 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031942828485456373, + "loss": 2.7103, + "theoretical_loss": 3.5838781229073398, + "tokens_seen": 1213132800 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031941825476429286, + "loss": 2.6472, + "theoretical_loss": 3.5838603505528113, + "tokens_seen": 1213198336 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003194082246740221, + "loss": 2.5772, + "theoretical_loss": 3.5838425794271025, + "tokens_seen": 1213263872 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003193981945837512, + "loss": 2.4455, + "theoretical_loss": 3.5838248095300616, + "tokens_seen": 1213329408 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031938816449348045, + "loss": 2.7764, + "theoretical_loss": 3.583807040861538, + "tokens_seen": 1213394944 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031937813440320964, + "loss": 2.7773, + "theoretical_loss": 3.583789273421379, + "tokens_seen": 1213460480 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003193681043129388, + "loss": 2.7144, + "theoretical_loss": 3.583771507209435, + "tokens_seen": 1213526016 + }, + { + "epoch": 4.01, + "learning_rate": 0.000319358074222668, + "loss": 2.6453, + "theoretical_loss": 3.5837537422255537, + "tokens_seen": 1213591552 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031934804413239723, + "loss": 2.6259, + "theoretical_loss": 3.583735978469584, + "tokens_seen": 1213657088 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031933801404212636, + "loss": 2.6677, + "theoretical_loss": 3.5837182159413747, + "tokens_seen": 1213722624 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003193279839518556, + "loss": 2.5048, + "theoretical_loss": 3.583700454640775, + "tokens_seen": 1213788160 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003193179538615847, + "loss": 2.6941, + "theoretical_loss": 3.583682694567634, + "tokens_seen": 1213853696 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031930792377131396, + "loss": 2.821, + "theoretical_loss": 3.5836649357218002, + "tokens_seen": 1213919232 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031929789368104314, + "loss": 2.8494, + "theoretical_loss": 3.5836471781031225, + "tokens_seen": 1213984768 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1415070, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7974677085876465, + "objective/train/theoretical_loss": 3.58362942171145, + "objective/train/tokens_used": 1234510304, + "theoretical_loss": 3.58362942171145, + "tokens_seen": 1214050304 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003192878635907723, + "loss": 2.8067, + "theoretical_loss": 3.58362942171145, + "tokens_seen": 1214050304 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003192778335005015, + "loss": 2.7528, + "theoretical_loss": 3.5836116665466315, + "tokens_seen": 1214115840 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003192678034102307, + "loss": 2.6348, + "theoretical_loss": 3.583593912608516, + "tokens_seen": 1214181376 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031925777331995986, + "loss": 2.5474, + "theoretical_loss": 3.5835761598969524, + "tokens_seen": 1214246912 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003192477432296891, + "loss": 2.6987, + "theoretical_loss": 3.5835584084117906, + "tokens_seen": 1214312448 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003192377131394182, + "loss": 2.6957, + "theoretical_loss": 3.583540658152879, + "tokens_seen": 1214377984 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031922768304914746, + "loss": 2.6414, + "theoretical_loss": 3.583522909120067, + "tokens_seen": 1214443520 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003192176529588766, + "loss": 2.6054, + "theoretical_loss": 3.583505161313204, + "tokens_seen": 1214509056 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003192076228686058, + "loss": 2.7971, + "theoretical_loss": 3.5834874147321383, + "tokens_seen": 1214574592 + }, + { + "epoch": 4.01, + "learning_rate": 0.000319197592778335, + "loss": 2.8113, + "theoretical_loss": 3.5834696693767194, + "tokens_seen": 1214640128 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003191875626880642, + "loss": 2.7911, + "theoretical_loss": 3.583451925246797, + "tokens_seen": 1214705664 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031917753259779337, + "loss": 2.7277, + "theoretical_loss": 3.58343418234222, + "tokens_seen": 1214771200 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003191675025075226, + "loss": 2.6605, + "theoretical_loss": 3.583416440662838, + "tokens_seen": 1214836736 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031915747241725173, + "loss": 2.6873, + "theoretical_loss": 3.5833987002085, + "tokens_seen": 1214902272 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031914744232698096, + "loss": 2.6275, + "theoretical_loss": 3.5833809609790555, + "tokens_seen": 1214967808 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003191374122367101, + "loss": 2.6332, + "theoretical_loss": 3.5833632229743535, + "tokens_seen": 1215033344 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003191273821464393, + "loss": 2.6789, + "theoretical_loss": 3.5833454861942435, + "tokens_seen": 1215098880 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003191173520561685, + "loss": 2.7704, + "theoretical_loss": 3.583327750638576, + "tokens_seen": 1215164416 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003191073219658977, + "loss": 2.7698, + "theoretical_loss": 3.5833100163071987, + "tokens_seen": 1215229952 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031909729187562687, + "loss": 2.8002, + "theoretical_loss": 3.5832922831999623, + "tokens_seen": 1215295488 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031908726178535605, + "loss": 2.6626, + "theoretical_loss": 3.5832745513167152, + "tokens_seen": 1215361024 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031907723169508523, + "loss": 2.6527, + "theoretical_loss": 3.5832568206573088, + "tokens_seen": 1215426560 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031906720160481447, + "loss": 2.6216, + "theoretical_loss": 3.58323909122159, + "tokens_seen": 1215492096 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031905717151454365, + "loss": 2.6442, + "theoretical_loss": 3.583221363009411, + "tokens_seen": 1215557632 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031904714142427283, + "loss": 2.6907, + "theoretical_loss": 3.5832036360206203, + "tokens_seen": 1215623168 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1416232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.866744041442871, + "objective/train/theoretical_loss": 3.5831859102550667, + "objective/train/tokens_used": 1236148704, + "theoretical_loss": 3.5831859102550667, + "tokens_seen": 1215688704 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031903711133400206, + "loss": 2.7375, + "theoretical_loss": 3.5831859102550667, + "tokens_seen": 1215688704 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003190270812437312, + "loss": 2.9198, + "theoretical_loss": 3.583168185712601, + "tokens_seen": 1215754240 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003190170511534604, + "loss": 2.5112, + "theoretical_loss": 3.5831504623930726, + "tokens_seen": 1215819776 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031900702106318955, + "loss": 2.8218, + "theoretical_loss": 3.583132740296331, + "tokens_seen": 1215885312 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003189969909729188, + "loss": 2.6373, + "theoretical_loss": 3.583115019422226, + "tokens_seen": 1215950848 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031898696088264797, + "loss": 2.6703, + "theoretical_loss": 3.5830972997706074, + "tokens_seen": 1216016384 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031897693079237715, + "loss": 2.6533, + "theoretical_loss": 3.583079581341325, + "tokens_seen": 1216081920 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031896690070210633, + "loss": 2.705, + "theoretical_loss": 3.583061864134229, + "tokens_seen": 1216147456 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003189568706118355, + "loss": 2.8154, + "theoretical_loss": 3.583044148149168, + "tokens_seen": 1216212992 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003189468405215647, + "loss": 2.7847, + "theoretical_loss": 3.583026433385993, + "tokens_seen": 1216278528 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031893681043129393, + "loss": 2.7064, + "theoretical_loss": 3.583008719844554, + "tokens_seen": 1216344064 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031892678034102306, + "loss": 2.677, + "theoretical_loss": 3.5829910075247007, + "tokens_seen": 1216409600 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003189167502507523, + "loss": 2.5897, + "theoretical_loss": 3.5829732964262826, + "tokens_seen": 1216475136 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003189067201604814, + "loss": 2.6573, + "theoretical_loss": 3.58295558654915, + "tokens_seen": 1216540672 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031889669007021065, + "loss": 2.7557, + "theoretical_loss": 3.582937877893153, + "tokens_seen": 1216606208 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031888665997993984, + "loss": 2.7416, + "theoretical_loss": 3.582920170458141, + "tokens_seen": 1216671744 + }, + { + "epoch": 4.01, + "learning_rate": 0.000318876629889669, + "loss": 2.6819, + "theoretical_loss": 3.582902464243965, + "tokens_seen": 1216737280 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003188665997993982, + "loss": 2.6083, + "theoretical_loss": 3.582884759250474, + "tokens_seen": 1216802816 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031885656970912743, + "loss": 2.7381, + "theoretical_loss": 3.5828670554775197, + "tokens_seen": 1216868352 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031884653961885656, + "loss": 2.8159, + "theoretical_loss": 3.5828493529249505, + "tokens_seen": 1216933888 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003188365095285858, + "loss": 2.5582, + "theoretical_loss": 3.5828316515926177, + "tokens_seen": 1216999424 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003188264794383149, + "loss": 2.7293, + "theoretical_loss": 3.5828139514803707, + "tokens_seen": 1217064960 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031881644934804416, + "loss": 2.5561, + "theoretical_loss": 3.582796252588061, + "tokens_seen": 1217130496 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031880641925777334, + "loss": 2.8685, + "theoretical_loss": 3.582778554915537, + "tokens_seen": 1217196032 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003187963891675025, + "loss": 2.5221, + "theoretical_loss": 3.5827608584626502, + "tokens_seen": 1217261568 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1416846, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6486992835998535, + "objective/train/theoretical_loss": 3.5827431632292503, + "objective/train/tokens_used": 1237787104, + "theoretical_loss": 3.5827431632292503, + "tokens_seen": 1217327104 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003187863590772317, + "loss": 2.7341, + "theoretical_loss": 3.5827431632292503, + "tokens_seen": 1217327104 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003187763289869609, + "loss": 2.8365, + "theoretical_loss": 3.5827254692151884, + "tokens_seen": 1217392640 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031876629889669006, + "loss": 2.6498, + "theoretical_loss": 3.582707776420315, + "tokens_seen": 1217458176 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003187562688064193, + "loss": 2.532, + "theoretical_loss": 3.5826900848444785, + "tokens_seen": 1217523712 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003187462387161484, + "loss": 2.4932, + "theoretical_loss": 3.582672394487531, + "tokens_seen": 1217589248 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031873620862587766, + "loss": 2.7149, + "theoretical_loss": 3.582654705349323, + "tokens_seen": 1217654784 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003187261785356068, + "loss": 2.8224, + "theoretical_loss": 3.5826370174297044, + "tokens_seen": 1217720320 + }, + { + "epoch": 4.01, + "learning_rate": 0.000318716148445336, + "loss": 2.4685, + "theoretical_loss": 3.5826193307285257, + "tokens_seen": 1217785856 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003187061183550652, + "loss": 2.5885, + "theoretical_loss": 3.5826016452456377, + "tokens_seen": 1217851392 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003186960882647944, + "loss": 2.7836, + "theoretical_loss": 3.5825839609808905, + "tokens_seen": 1217916928 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031868605817452357, + "loss": 2.8991, + "theoretical_loss": 3.5825662779341347, + "tokens_seen": 1217982464 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003186760280842528, + "loss": 2.5322, + "theoretical_loss": 3.5825485961052213, + "tokens_seen": 1218048000 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031866599799398193, + "loss": 2.5996, + "theoretical_loss": 3.582530915494001, + "tokens_seen": 1218113536 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031865596790371116, + "loss": 2.7052, + "theoretical_loss": 3.5825132361003234, + "tokens_seen": 1218179072 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003186459378134403, + "loss": 2.6446, + "theoretical_loss": 3.58249555792404, + "tokens_seen": 1218244608 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003186359077231695, + "loss": 2.7679, + "theoretical_loss": 3.5824778809650017, + "tokens_seen": 1218310144 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003186258776328987, + "loss": 2.6478, + "theoretical_loss": 3.582460205223059, + "tokens_seen": 1218375680 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003186158475426279, + "loss": 2.5404, + "theoretical_loss": 3.582442530698062, + "tokens_seen": 1218441216 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031860581745235707, + "loss": 2.7493, + "theoretical_loss": 3.5824248573898623, + "tokens_seen": 1218506752 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031859578736208625, + "loss": 2.6604, + "theoretical_loss": 3.58240718529831, + "tokens_seen": 1218572288 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031858575727181543, + "loss": 2.6678, + "theoretical_loss": 3.582389514423257, + "tokens_seen": 1218637824 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031857572718154467, + "loss": 2.9431, + "theoretical_loss": 3.5823718447645527, + "tokens_seen": 1218703360 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003185656970912738, + "loss": 2.8017, + "theoretical_loss": 3.582354176322049, + "tokens_seen": 1218768896 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031855566700100303, + "loss": 2.5855, + "theoretical_loss": 3.5823365090955965, + "tokens_seen": 1218834432 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003185456369107322, + "loss": 2.7062, + "theoretical_loss": 3.582318843085046, + "tokens_seen": 1218899968 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1418472, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8614342212677, + "objective/train/theoretical_loss": 3.5823011782902485, + "objective/train/tokens_used": 1239425504, + "theoretical_loss": 3.5823011782902485, + "tokens_seen": 1218965504 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003185356068204614, + "loss": 2.7619, + "theoretical_loss": 3.5823011782902485, + "tokens_seen": 1218965504 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031852557673019057, + "loss": 2.8181, + "theoretical_loss": 3.5822835147110554, + "tokens_seen": 1219031040 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031851554663991975, + "loss": 2.5943, + "theoretical_loss": 3.582265852347317, + "tokens_seen": 1219096576 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031850551654964893, + "loss": 2.6939, + "theoretical_loss": 3.582248191198885, + "tokens_seen": 1219162112 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031849548645937817, + "loss": 2.5154, + "theoretical_loss": 3.58223053126561, + "tokens_seen": 1219227648 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003184854563691073, + "loss": 2.5464, + "theoretical_loss": 3.5822128725473434, + "tokens_seen": 1219293184 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031847542627883653, + "loss": 2.5821, + "theoretical_loss": 3.582195215043936, + "tokens_seen": 1219358720 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031846539618856566, + "loss": 2.4363, + "theoretical_loss": 3.582177558755239, + "tokens_seen": 1219424256 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003184553660982949, + "loss": 2.6333, + "theoretical_loss": 3.5821599036811036, + "tokens_seen": 1219489792 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003184453360080241, + "loss": 2.7076, + "theoretical_loss": 3.582142249821381, + "tokens_seen": 1219555328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031843530591775326, + "loss": 2.626, + "theoretical_loss": 3.5821245971759224, + "tokens_seen": 1219620864 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031842527582748244, + "loss": 2.4653, + "theoretical_loss": 3.582106945744579, + "tokens_seen": 1219686400 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003184152457372116, + "loss": 2.5816, + "theoretical_loss": 3.5820892955272026, + "tokens_seen": 1219751936 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003184052156469408, + "loss": 2.7463, + "theoretical_loss": 3.5820716465236435, + "tokens_seen": 1219817472 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031839518555667004, + "loss": 2.65, + "theoretical_loss": 3.5820539987337536, + "tokens_seen": 1219883008 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031838515546639916, + "loss": 2.8067, + "theoretical_loss": 3.5820363521573846, + "tokens_seen": 1219948544 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003183751253761284, + "loss": 2.7794, + "theoretical_loss": 3.582018706794387, + "tokens_seen": 1220014080 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003183650952858576, + "loss": 2.6482, + "theoretical_loss": 3.582001062644613, + "tokens_seen": 1220079616 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031835506519558676, + "loss": 2.542, + "theoretical_loss": 3.5819834197079135, + "tokens_seen": 1220145152 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031834503510531594, + "loss": 2.5878, + "theoretical_loss": 3.58196577798414, + "tokens_seen": 1220210688 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003183350050150451, + "loss": 2.4764, + "theoretical_loss": 3.581948137473144, + "tokens_seen": 1220276224 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003183249749247743, + "loss": 2.6947, + "theoretical_loss": 3.5819304981747777, + "tokens_seen": 1220341760 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031831494483450354, + "loss": 2.6588, + "theoretical_loss": 3.5819128600888916, + "tokens_seen": 1220407296 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003183049147442327, + "loss": 2.7199, + "theoretical_loss": 3.581895223215337, + "tokens_seen": 1220472832 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003182948846539619, + "loss": 2.6414, + "theoretical_loss": 3.5818775875539672, + "tokens_seen": 1220538368 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1419245, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.898358106613159, + "objective/train/theoretical_loss": 3.5818599531046322, + "objective/train/tokens_used": 1241063904, + "theoretical_loss": 3.5818599531046322, + "tokens_seen": 1220603904 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003182848545636911, + "loss": 2.807, + "theoretical_loss": 3.5818599531046322, + "tokens_seen": 1220603904 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031827482447342026, + "loss": 2.5729, + "theoretical_loss": 3.5818423198671847, + "tokens_seen": 1220669440 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003182647943831495, + "loss": 2.6056, + "theoretical_loss": 3.5818246878414755, + "tokens_seen": 1220734976 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003182547642928786, + "loss": 2.5781, + "theoretical_loss": 3.5818070570273566, + "tokens_seen": 1220800512 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031824473420260786, + "loss": 2.726, + "theoretical_loss": 3.58178942742468, + "tokens_seen": 1220866048 + }, + { + "epoch": 4.01, + "learning_rate": 0.000318234704112337, + "loss": 2.7488, + "theoretical_loss": 3.581771799033297, + "tokens_seen": 1220931584 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003182246740220662, + "loss": 2.4787, + "theoretical_loss": 3.58175417185306, + "tokens_seen": 1220997120 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003182146439317954, + "loss": 2.6335, + "theoretical_loss": 3.5817365458838193, + "tokens_seen": 1221062656 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003182046138415246, + "loss": 2.6146, + "theoretical_loss": 3.581718921125429, + "tokens_seen": 1221128192 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031819458375125377, + "loss": 2.8097, + "theoretical_loss": 3.5817012975777387, + "tokens_seen": 1221193728 + }, + { + "epoch": 4.01, + "learning_rate": 0.000318184553660983, + "loss": 2.6973, + "theoretical_loss": 3.5816836752406016, + "tokens_seen": 1221259264 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031817452357071213, + "loss": 2.677, + "theoretical_loss": 3.581666054113869, + "tokens_seen": 1221324800 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031816449348044136, + "loss": 2.6547, + "theoretical_loss": 3.5816484341973935, + "tokens_seen": 1221390336 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003181544633901705, + "loss": 2.465, + "theoretical_loss": 3.5816308154910264, + "tokens_seen": 1221455872 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003181444332998997, + "loss": 2.6044, + "theoretical_loss": 3.58161319799462, + "tokens_seen": 1221521408 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003181344032096289, + "loss": 2.7337, + "theoretical_loss": 3.5815955817080263, + "tokens_seen": 1221586944 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003181243731193581, + "loss": 2.8371, + "theoretical_loss": 3.581577966631097, + "tokens_seen": 1221652480 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031811434302908727, + "loss": 2.5997, + "theoretical_loss": 3.5815603527636846, + "tokens_seen": 1221718016 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031810431293881645, + "loss": 2.5927, + "theoretical_loss": 3.5815427401056406, + "tokens_seen": 1221783552 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031809428284854563, + "loss": 2.7329, + "theoretical_loss": 3.5815251286568177, + "tokens_seen": 1221849088 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031808425275827487, + "loss": 2.6258, + "theoretical_loss": 3.5815075184170673, + "tokens_seen": 1221914624 + }, + { + "epoch": 4.01, + "learning_rate": 0.000318074222668004, + "loss": 2.8222, + "theoretical_loss": 3.5814899093862422, + "tokens_seen": 1221980160 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031806419257773323, + "loss": 3.0164, + "theoretical_loss": 3.5814723015641947, + "tokens_seen": 1222045696 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003180541624874624, + "loss": 2.7932, + "theoretical_loss": 3.5814546949507764, + "tokens_seen": 1222111232 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003180441323971916, + "loss": 2.6886, + "theoretical_loss": 3.5814370895458403, + "tokens_seen": 1222176768 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1420806, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.511540174484253, + "objective/train/theoretical_loss": 3.5814194853492376, + "objective/train/tokens_used": 1242702304, + "theoretical_loss": 3.5814194853492376, + "tokens_seen": 1222242304 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031803410230692077, + "loss": 2.626, + "theoretical_loss": 3.5814194853492376, + "tokens_seen": 1222242304 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031802407221664995, + "loss": 2.7816, + "theoretical_loss": 3.5814018823608214, + "tokens_seen": 1222307840 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031801404212637913, + "loss": 2.6152, + "theoretical_loss": 3.5813842805804437, + "tokens_seen": 1222373376 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031800401203610837, + "loss": 2.6969, + "theoretical_loss": 3.581366680007957, + "tokens_seen": 1222438912 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003179939819458375, + "loss": 2.663, + "theoretical_loss": 3.581349080643214, + "tokens_seen": 1222504448 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031798395185556673, + "loss": 2.8343, + "theoretical_loss": 3.581331482486066, + "tokens_seen": 1222569984 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031797392176529586, + "loss": 2.7702, + "theoretical_loss": 3.581313885536366, + "tokens_seen": 1222635520 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003179638916750251, + "loss": 2.7763, + "theoretical_loss": 3.581296289793967, + "tokens_seen": 1222701056 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003179538615847543, + "loss": 2.8597, + "theoretical_loss": 3.581278695258721, + "tokens_seen": 1222766592 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031794383149448346, + "loss": 2.7061, + "theoretical_loss": 3.58126110193048, + "tokens_seen": 1222832128 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031793380140421264, + "loss": 2.643, + "theoretical_loss": 3.5812435098090973, + "tokens_seen": 1222897664 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003179237713139418, + "loss": 2.85, + "theoretical_loss": 3.5812259188944253, + "tokens_seen": 1222963200 + }, + { + "epoch": 4.01, + "learning_rate": 0.000317913741223671, + "loss": 2.6904, + "theoretical_loss": 3.5812083291863157, + "tokens_seen": 1223028736 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031790371113340024, + "loss": 2.6822, + "theoretical_loss": 3.5811907406846224, + "tokens_seen": 1223094272 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031789368104312936, + "loss": 2.7356, + "theoretical_loss": 3.5811731533891975, + "tokens_seen": 1223159808 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003178836509528586, + "loss": 2.7586, + "theoretical_loss": 3.581155567299893, + "tokens_seen": 1223225344 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003178736208625878, + "loss": 2.8288, + "theoretical_loss": 3.581137982416563, + "tokens_seen": 1223290880 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031786359077231696, + "loss": 2.5823, + "theoretical_loss": 3.581120398739059, + "tokens_seen": 1223356416 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031785356068204614, + "loss": 2.5514, + "theoretical_loss": 3.581102816267234, + "tokens_seen": 1223421952 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003178435305917753, + "loss": 2.6135, + "theoretical_loss": 3.5810852350009403, + "tokens_seen": 1223487488 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003178335005015045, + "loss": 2.6177, + "theoretical_loss": 3.5810676549400315, + "tokens_seen": 1223553024 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031782347041123374, + "loss": 2.7085, + "theoretical_loss": 3.5810500760843604, + "tokens_seen": 1223618560 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031781344032096287, + "loss": 2.257, + "theoretical_loss": 3.581032498433779, + "tokens_seen": 1223684096 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003178034102306921, + "loss": 2.7905, + "theoretical_loss": 3.581014921988141, + "tokens_seen": 1223749632 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031779338014042123, + "loss": 2.6344, + "theoretical_loss": 3.580997346747299, + "tokens_seen": 1223815168 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1421310, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.000223159790039, + "objective/train/theoretical_loss": 3.580979772711106, + "objective/train/tokens_used": 1244340704, + "theoretical_loss": 3.580979772711106, + "tokens_seen": 1223880704 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031778335005015046, + "loss": 2.6517, + "theoretical_loss": 3.580979772711106, + "tokens_seen": 1223880704 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031777331995987964, + "loss": 2.7078, + "theoretical_loss": 3.580962199879414, + "tokens_seen": 1223946240 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003177632898696088, + "loss": 2.8244, + "theoretical_loss": 3.580944628252077, + "tokens_seen": 1224011776 + }, + { + "epoch": 4.01, + "learning_rate": 0.000317753259779338, + "loss": 2.7608, + "theoretical_loss": 3.5809270578289483, + "tokens_seen": 1224077312 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003177432296890672, + "loss": 2.671, + "theoretical_loss": 3.58090948860988, + "tokens_seen": 1224142848 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031773319959879637, + "loss": 2.6531, + "theoretical_loss": 3.580891920594725, + "tokens_seen": 1224208384 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003177231695085256, + "loss": 2.5167, + "theoretical_loss": 3.5808743537833374, + "tokens_seen": 1224273920 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031771313941825473, + "loss": 2.7415, + "theoretical_loss": 3.58085678817557, + "tokens_seen": 1224339456 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031770310932798397, + "loss": 2.5674, + "theoretical_loss": 3.580839223771275, + "tokens_seen": 1224404992 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031769307923771315, + "loss": 2.7093, + "theoretical_loss": 3.5808216605703063, + "tokens_seen": 1224470528 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031768304914744233, + "loss": 2.5677, + "theoretical_loss": 3.580804098572517, + "tokens_seen": 1224536064 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003176730190571715, + "loss": 2.8175, + "theoretical_loss": 3.58078653777776, + "tokens_seen": 1224601600 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003176629889669007, + "loss": 2.701, + "theoretical_loss": 3.580768978185889, + "tokens_seen": 1224667136 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031765295887662987, + "loss": 2.7692, + "theoretical_loss": 3.5807514197967567, + "tokens_seen": 1224732672 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003176429287863591, + "loss": 2.7475, + "theoretical_loss": 3.580733862610217, + "tokens_seen": 1224798208 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031763289869608823, + "loss": 2.8054, + "theoretical_loss": 3.5807163066261225, + "tokens_seen": 1224863744 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031762286860581747, + "loss": 2.6037, + "theoretical_loss": 3.580698751844327, + "tokens_seen": 1224929280 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003176128385155466, + "loss": 2.5773, + "theoretical_loss": 3.580681198264684, + "tokens_seen": 1224994816 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031760280842527583, + "loss": 2.7677, + "theoretical_loss": 3.580663645887046, + "tokens_seen": 1225060352 + }, + { + "epoch": 4.01, + "learning_rate": 0.000317592778335005, + "loss": 2.6867, + "theoretical_loss": 3.580646094711267, + "tokens_seen": 1225125888 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003175827482447342, + "loss": 2.708, + "theoretical_loss": 3.5806285447372006, + "tokens_seen": 1225191424 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003175727181544634, + "loss": 2.8949, + "theoretical_loss": 3.5806109959647, + "tokens_seen": 1225256960 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003175626880641926, + "loss": 2.3915, + "theoretical_loss": 3.5805934483936186, + "tokens_seen": 1225322496 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003175526579739218, + "loss": 2.7247, + "theoretical_loss": 3.58057590202381, + "tokens_seen": 1225388032 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031754262788365097, + "loss": 2.745, + "theoretical_loss": 3.5805583568551276, + "tokens_seen": 1225453568 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1422763, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5950467586517334, + "objective/train/theoretical_loss": 3.5805408128874254, + "objective/train/tokens_used": 1245979104, + "theoretical_loss": 3.5805408128874254, + "tokens_seen": 1225519104 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031753259779338015, + "loss": 2.619, + "theoretical_loss": 3.5805408128874254, + "tokens_seen": 1225519104 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031752256770310933, + "loss": 2.767, + "theoretical_loss": 3.5805232701205565, + "tokens_seen": 1225584640 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031751253761283857, + "loss": 2.8689, + "theoretical_loss": 3.580505728554374, + "tokens_seen": 1225650176 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003175025075225677, + "loss": 2.6551, + "theoretical_loss": 3.580488188188733, + "tokens_seen": 1225715712 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031749247743229693, + "loss": 2.7369, + "theoretical_loss": 3.5804706490234857, + "tokens_seen": 1225781248 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031748244734202606, + "loss": 2.4896, + "theoretical_loss": 3.580453111058487, + "tokens_seen": 1225846784 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003174724172517553, + "loss": 2.6095, + "theoretical_loss": 3.58043557429359, + "tokens_seen": 1225912320 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003174623871614845, + "loss": 2.8006, + "theoretical_loss": 3.5804180387286477, + "tokens_seen": 1225977856 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031745235707121366, + "loss": 2.6738, + "theoretical_loss": 3.5804005043635154, + "tokens_seen": 1226043392 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031744232698094284, + "loss": 2.7993, + "theoretical_loss": 3.5803829711980453, + "tokens_seen": 1226108928 + }, + { + "epoch": 4.01, + "learning_rate": 0.000317432296890672, + "loss": 2.5852, + "theoretical_loss": 3.580365439232092, + "tokens_seen": 1226174464 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003174222668004012, + "loss": 2.6334, + "theoretical_loss": 3.5803479084655097, + "tokens_seen": 1226240000 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031741223671013044, + "loss": 2.8156, + "theoretical_loss": 3.580330378898152, + "tokens_seen": 1226305536 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031740220661985956, + "loss": 2.5313, + "theoretical_loss": 3.5803128505298725, + "tokens_seen": 1226371072 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003173921765295888, + "loss": 2.848, + "theoretical_loss": 3.5802953233605246, + "tokens_seen": 1226436608 + }, + { + "epoch": 4.01, + "learning_rate": 0.000317382146439318, + "loss": 2.7893, + "theoretical_loss": 3.5802777973899635, + "tokens_seen": 1226502144 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031737211634904716, + "loss": 2.5006, + "theoretical_loss": 3.580260272618043, + "tokens_seen": 1226567680 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031736208625877634, + "loss": 2.6533, + "theoretical_loss": 3.580242749044616, + "tokens_seen": 1226633216 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003173520561685055, + "loss": 2.6162, + "theoretical_loss": 3.580225226669537, + "tokens_seen": 1226698752 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003173420260782347, + "loss": 2.7069, + "theoretical_loss": 3.5802077054926604, + "tokens_seen": 1226764288 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031733199598796394, + "loss": 2.8578, + "theoretical_loss": 3.58019018551384, + "tokens_seen": 1226829824 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031732196589769307, + "loss": 2.5383, + "theoretical_loss": 3.5801726667329294, + "tokens_seen": 1226895360 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003173119358074223, + "loss": 2.6596, + "theoretical_loss": 3.5801551491497836, + "tokens_seen": 1226960896 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031730190571715143, + "loss": 2.6207, + "theoretical_loss": 3.5801376327642567, + "tokens_seen": 1227026432 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031729187562688066, + "loss": 2.6281, + "theoretical_loss": 3.5801201175762025, + "tokens_seen": 1227091968 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1423509, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.779162645339966, + "objective/train/theoretical_loss": 3.580102603585475, + "objective/train/tokens_used": 1247617504, + "theoretical_loss": 3.580102603585475, + "tokens_seen": 1227157504 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031728184553660984, + "loss": 2.7531, + "theoretical_loss": 3.580102603585475, + "tokens_seen": 1227157504 + }, + { + "epoch": 4.01, + "learning_rate": 0.000317271815446339, + "loss": 2.6181, + "theoretical_loss": 3.5800850907919286, + "tokens_seen": 1227223040 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003172617853560682, + "loss": 2.6957, + "theoretical_loss": 3.5800675791954175, + "tokens_seen": 1227288576 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003172517552657974, + "loss": 2.5925, + "theoretical_loss": 3.5800500687957966, + "tokens_seen": 1227354112 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031724172517552657, + "loss": 2.5597, + "theoretical_loss": 3.5800325595929188, + "tokens_seen": 1227419648 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003172316950852558, + "loss": 2.7134, + "theoretical_loss": 3.58001505158664, + "tokens_seen": 1227485184 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031722166499498493, + "loss": 2.4009, + "theoretical_loss": 3.5799975447768135, + "tokens_seen": 1227550720 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031721163490471417, + "loss": 2.7466, + "theoretical_loss": 3.579980039163294, + "tokens_seen": 1227616256 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031720160481444335, + "loss": 2.7712, + "theoretical_loss": 3.5799625347459356, + "tokens_seen": 1227681792 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031719157472417253, + "loss": 2.5761, + "theoretical_loss": 3.579945031524593, + "tokens_seen": 1227747328 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003171815446339017, + "loss": 2.5698, + "theoretical_loss": 3.5799275294991206, + "tokens_seen": 1227812864 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003171715145436309, + "loss": 2.7248, + "theoretical_loss": 3.5799100286693735, + "tokens_seen": 1227878400 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031716148445336007, + "loss": 2.7414, + "theoretical_loss": 3.579892529035205, + "tokens_seen": 1227943936 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003171514543630893, + "loss": 2.6472, + "theoretical_loss": 3.57987503059647, + "tokens_seen": 1228009472 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031714142427281843, + "loss": 2.7689, + "theoretical_loss": 3.579857533353024, + "tokens_seen": 1228075008 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031713139418254767, + "loss": 2.5172, + "theoretical_loss": 3.57984003730472, + "tokens_seen": 1228140544 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003171213640922768, + "loss": 2.7424, + "theoretical_loss": 3.5798225424514145, + "tokens_seen": 1228206080 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031711133400200603, + "loss": 2.6671, + "theoretical_loss": 3.5798050487929602, + "tokens_seen": 1228271616 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003171013039117352, + "loss": 2.4961, + "theoretical_loss": 3.5797875563292125, + "tokens_seen": 1228337152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003170912738214644, + "loss": 2.8536, + "theoretical_loss": 3.5797700650600266, + "tokens_seen": 1228402688 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003170812437311936, + "loss": 2.6944, + "theoretical_loss": 3.579752574985257, + "tokens_seen": 1228468224 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003170712136409228, + "loss": 2.6289, + "theoretical_loss": 3.5797350861047574, + "tokens_seen": 1228533760 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031706118355065194, + "loss": 2.8619, + "theoretical_loss": 3.5797175984183838, + "tokens_seen": 1228599296 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003170511534603812, + "loss": 2.6812, + "theoretical_loss": 3.5797001119259906, + "tokens_seen": 1228664832 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003170411233701103, + "loss": 2.5994, + "theoretical_loss": 3.5796826266274326, + "tokens_seen": 1228730368 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1425069, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5875046253204346, + "objective/train/theoretical_loss": 3.5796651425225643, + "objective/train/tokens_used": 1249255904, + "theoretical_loss": 3.5796651425225643, + "tokens_seen": 1228795904 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031703109327983954, + "loss": 2.6018, + "theoretical_loss": 3.5796651425225643, + "tokens_seen": 1228795904 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003170210631895687, + "loss": 2.8186, + "theoretical_loss": 3.5796476596112408, + "tokens_seen": 1228861440 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003170110330992979, + "loss": 2.7534, + "theoretical_loss": 3.5796301778933173, + "tokens_seen": 1228926976 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003170010030090271, + "loss": 2.7221, + "theoretical_loss": 3.579612697368648, + "tokens_seen": 1228992512 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031699097291875626, + "loss": 2.6292, + "theoretical_loss": 3.5795952180370882, + "tokens_seen": 1229058048 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031698094282848544, + "loss": 2.5586, + "theoretical_loss": 3.579577739898493, + "tokens_seen": 1229123584 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003169709127382147, + "loss": 2.5425, + "theoretical_loss": 3.5795602629527172, + "tokens_seen": 1229189120 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003169608826479438, + "loss": 2.4003, + "theoretical_loss": 3.5795427871996157, + "tokens_seen": 1229254656 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031695085255767304, + "loss": 2.7679, + "theoretical_loss": 3.579525312639044, + "tokens_seen": 1229320192 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031694082246740217, + "loss": 2.8011, + "theoretical_loss": 3.579507839270857, + "tokens_seen": 1229385728 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003169307923771314, + "loss": 2.7587, + "theoretical_loss": 3.579490367094909, + "tokens_seen": 1229451264 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003169207622868606, + "loss": 2.7321, + "theoretical_loss": 3.579472896111056, + "tokens_seen": 1229516800 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031691073219658976, + "loss": 2.8625, + "theoretical_loss": 3.5794554263191527, + "tokens_seen": 1229582336 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031690070210631894, + "loss": 2.5965, + "theoretical_loss": 3.5794379577190547, + "tokens_seen": 1229647872 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003168906720160482, + "loss": 2.7067, + "theoretical_loss": 3.579420490310617, + "tokens_seen": 1229713408 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003168806419257773, + "loss": 2.8471, + "theoretical_loss": 3.5794030240936943, + "tokens_seen": 1229778944 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031687061183550654, + "loss": 2.7518, + "theoretical_loss": 3.5793855590681423, + "tokens_seen": 1229844480 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031686058174523567, + "loss": 2.8173, + "theoretical_loss": 3.579368095233816, + "tokens_seen": 1229910016 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003168505516549649, + "loss": 2.6816, + "theoretical_loss": 3.579350632590571, + "tokens_seen": 1229975552 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003168405215646941, + "loss": 2.7107, + "theoretical_loss": 3.5793331711382628, + "tokens_seen": 1230041088 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031683049147442327, + "loss": 2.7466, + "theoretical_loss": 3.5793157108767457, + "tokens_seen": 1230106624 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031682046138415245, + "loss": 2.6286, + "theoretical_loss": 3.5792982518058762, + "tokens_seen": 1230172160 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031681043129388163, + "loss": 2.683, + "theoretical_loss": 3.579280793925509, + "tokens_seen": 1230237696 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031680040120361086, + "loss": 2.5633, + "theoretical_loss": 3.5792633372355, + "tokens_seen": 1230303232 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031679037111334004, + "loss": 2.626, + "theoretical_loss": 3.5792458817357042, + "tokens_seen": 1230368768 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1425869, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.124709367752075, + "objective/train/theoretical_loss": 3.579228427425978, + "objective/train/tokens_used": 1250894304, + "theoretical_loss": 3.579228427425978, + "tokens_seen": 1230434304 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003167803410230692, + "loss": 2.5447, + "theoretical_loss": 3.579228427425978, + "tokens_seen": 1230434304 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003167703109327984, + "loss": 2.7377, + "theoretical_loss": 3.579210974306175, + "tokens_seen": 1230499840 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003167602808425276, + "loss": 2.8242, + "theoretical_loss": 3.5791935223761526, + "tokens_seen": 1230565376 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031675025075225677, + "loss": 2.8878, + "theoretical_loss": 3.5791760716357652, + "tokens_seen": 1230630912 + }, + { + "epoch": 4.01, + "learning_rate": 0.000316740220661986, + "loss": 2.6552, + "theoretical_loss": 3.579158622084869, + "tokens_seen": 1230696448 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031673019057171513, + "loss": 2.7781, + "theoretical_loss": 3.579141173723319, + "tokens_seen": 1230761984 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031672016048144437, + "loss": 2.572, + "theoretical_loss": 3.5791237265509714, + "tokens_seen": 1230827520 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031671013039117355, + "loss": 2.5123, + "theoretical_loss": 3.5791062805676814, + "tokens_seen": 1230893056 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031670010030090273, + "loss": 2.4936, + "theoretical_loss": 3.5790888357733053, + "tokens_seen": 1230958592 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003166900702106319, + "loss": 2.703, + "theoretical_loss": 3.5790713921676978, + "tokens_seen": 1231024128 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003166800401203611, + "loss": 2.8052, + "theoretical_loss": 3.5790539497507154, + "tokens_seen": 1231089664 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031667001003009027, + "loss": 2.6364, + "theoretical_loss": 3.579036508522213, + "tokens_seen": 1231155200 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003166599799398195, + "loss": 2.4755, + "theoretical_loss": 3.5790190684820473, + "tokens_seen": 1231220736 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031664994984954863, + "loss": 2.8016, + "theoretical_loss": 3.579001629630074, + "tokens_seen": 1231286272 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031663991975927787, + "loss": 2.4563, + "theoretical_loss": 3.5789841919661485, + "tokens_seen": 1231351808 + }, + { + "epoch": 4.01, + "learning_rate": 0.000316629889669007, + "loss": 2.9177, + "theoretical_loss": 3.5789667554901268, + "tokens_seen": 1231417344 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031661985957873623, + "loss": 2.4134, + "theoretical_loss": 3.5789493202018643, + "tokens_seen": 1231482880 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003166098294884654, + "loss": 2.7808, + "theoretical_loss": 3.5789318861012176, + "tokens_seen": 1231548416 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003165997993981946, + "loss": 2.8649, + "theoretical_loss": 3.5789144531880424, + "tokens_seen": 1231613952 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003165897693079238, + "loss": 2.8881, + "theoretical_loss": 3.5788970214621942, + "tokens_seen": 1231679488 + }, + { + "epoch": 4.01, + "learning_rate": 0.000316579739217653, + "loss": 2.8825, + "theoretical_loss": 3.57887959092353, + "tokens_seen": 1231745024 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031656970912738214, + "loss": 2.644, + "theoretical_loss": 3.5788621615719043, + "tokens_seen": 1231810560 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003165596790371114, + "loss": 2.6932, + "theoretical_loss": 3.5788447334071747, + "tokens_seen": 1231876096 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003165496489468405, + "loss": 2.7675, + "theoretical_loss": 3.578827306429196, + "tokens_seen": 1231941632 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031653961885656974, + "loss": 2.6962, + "theoretical_loss": 3.5788098806378246, + "tokens_seen": 1232007168 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1427266, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6803057193756104, + "objective/train/theoretical_loss": 3.578792456032917, + "objective/train/tokens_used": 1252532704, + "theoretical_loss": 3.578792456032917, + "tokens_seen": 1232072704 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003165295887662989, + "loss": 2.6417, + "theoretical_loss": 3.578792456032917, + "tokens_seen": 1232072704 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003165195586760281, + "loss": 2.4745, + "theoretical_loss": 3.5787750326143293, + "tokens_seen": 1232138240 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003165095285857573, + "loss": 2.571, + "theoretical_loss": 3.578757610381917, + "tokens_seen": 1232203776 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031649949849548646, + "loss": 2.7561, + "theoretical_loss": 3.578740189335537, + "tokens_seen": 1232269312 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031648946840521564, + "loss": 2.5433, + "theoretical_loss": 3.5787227694750445, + "tokens_seen": 1232334848 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003164794383149449, + "loss": 3.0216, + "theoretical_loss": 3.578705350800297, + "tokens_seen": 1232400384 + }, + { + "epoch": 4.01, + "learning_rate": 0.000316469408224674, + "loss": 2.4573, + "theoretical_loss": 3.5786879333111496, + "tokens_seen": 1232465920 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031645937813440324, + "loss": 2.8524, + "theoretical_loss": 3.5786705170074598, + "tokens_seen": 1232531456 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031644934804413237, + "loss": 2.7487, + "theoretical_loss": 3.578653101889082, + "tokens_seen": 1232596992 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003164393179538616, + "loss": 2.6817, + "theoretical_loss": 3.5786356879558747, + "tokens_seen": 1232662528 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003164292878635908, + "loss": 2.6149, + "theoretical_loss": 3.5786182752076927, + "tokens_seen": 1232728064 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031641925777331996, + "loss": 2.6015, + "theoretical_loss": 3.578600863644393, + "tokens_seen": 1232793600 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031640922768304914, + "loss": 2.5019, + "theoretical_loss": 3.5785834532658316, + "tokens_seen": 1232859136 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003163991975927784, + "loss": 2.8141, + "theoretical_loss": 3.5785660440718656, + "tokens_seen": 1232924672 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003163891675025075, + "loss": 2.5939, + "theoretical_loss": 3.5785486360623504, + "tokens_seen": 1232990208 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031637913741223674, + "loss": 2.7359, + "theoretical_loss": 3.5785312292371434, + "tokens_seen": 1233055744 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031636910732196587, + "loss": 2.5487, + "theoretical_loss": 3.5785138235961007, + "tokens_seen": 1233121280 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003163590772316951, + "loss": 2.5529, + "theoretical_loss": 3.578496419139079, + "tokens_seen": 1233186816 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003163490471414243, + "loss": 3.0039, + "theoretical_loss": 3.5784790158659345, + "tokens_seen": 1233252352 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031633901705115347, + "loss": 2.5339, + "theoretical_loss": 3.578461613776524, + "tokens_seen": 1233317888 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031632898696088265, + "loss": 2.7298, + "theoretical_loss": 3.5784442128707044, + "tokens_seen": 1233383424 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031631895687061183, + "loss": 2.8187, + "theoretical_loss": 3.578426813148332, + "tokens_seen": 1233448960 + }, + { + "epoch": 4.01, + "learning_rate": 0.000316308926780341, + "loss": 2.7148, + "theoretical_loss": 3.578409414609263, + "tokens_seen": 1233514496 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031629889669007024, + "loss": 2.6501, + "theoretical_loss": 3.5783920172533543, + "tokens_seen": 1233580032 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031628886659979937, + "loss": 2.741, + "theoretical_loss": 3.5783746210804637, + "tokens_seen": 1233645568 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1427860, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9305169582366943, + "objective/train/theoretical_loss": 3.5783572260904464, + "objective/train/tokens_used": 1254171104, + "theoretical_loss": 3.5783572260904464, + "tokens_seen": 1233711104 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003162788365095286, + "loss": 2.7442, + "theoretical_loss": 3.5783572260904464, + "tokens_seen": 1233711104 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031626880641925773, + "loss": 2.633, + "theoretical_loss": 3.57833983228316, + "tokens_seen": 1233776640 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031625877632898697, + "loss": 2.5109, + "theoretical_loss": 3.5783224396584608, + "tokens_seen": 1233842176 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031624874623871615, + "loss": 2.4862, + "theoretical_loss": 3.5783050482162055, + "tokens_seen": 1233907712 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031623871614844533, + "loss": 2.6074, + "theoretical_loss": 3.5782876579562517, + "tokens_seen": 1233973248 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003162286860581745, + "loss": 2.5857, + "theoretical_loss": 3.5782702688784553, + "tokens_seen": 1234038784 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031621865596790375, + "loss": 2.5521, + "theoretical_loss": 3.578252880982674, + "tokens_seen": 1234104320 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003162086258776329, + "loss": 2.809, + "theoretical_loss": 3.578235494268764, + "tokens_seen": 1234169856 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003161985957873621, + "loss": 2.7293, + "theoretical_loss": 3.5782181087365825, + "tokens_seen": 1234235392 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031618856569709124, + "loss": 2.6321, + "theoretical_loss": 3.5782007243859866, + "tokens_seen": 1234300928 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031617853560682047, + "loss": 2.3845, + "theoretical_loss": 3.5781833412168336, + "tokens_seen": 1234366464 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031616850551654965, + "loss": 2.6426, + "theoretical_loss": 3.578165959228979, + "tokens_seen": 1234432000 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031615847542627883, + "loss": 2.6782, + "theoretical_loss": 3.5781485784222813, + "tokens_seen": 1234497536 + }, + { + "epoch": 4.01, + "learning_rate": 0.000316148445336008, + "loss": 2.6913, + "theoretical_loss": 3.5781311987965974, + "tokens_seen": 1234563072 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003161384152457372, + "loss": 2.6628, + "theoretical_loss": 3.578113820351784, + "tokens_seen": 1234628608 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003161283851554664, + "loss": 2.8064, + "theoretical_loss": 3.5780964430876976, + "tokens_seen": 1234694144 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003161183550651956, + "loss": 2.513, + "theoretical_loss": 3.5780790670041966, + "tokens_seen": 1234759680 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031610832497492474, + "loss": 2.624, + "theoretical_loss": 3.578061692101137, + "tokens_seen": 1234825216 + }, + { + "epoch": 4.01, + "learning_rate": 0.000316098294884654, + "loss": 2.6117, + "theoretical_loss": 3.5780443183783763, + "tokens_seen": 1234890752 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003160882647943831, + "loss": 2.6111, + "theoretical_loss": 3.5780269458357723, + "tokens_seen": 1234956288 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031607823470411234, + "loss": 2.5321, + "theoretical_loss": 3.5780095744731817, + "tokens_seen": 1235021824 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003160682046138415, + "loss": 2.5517, + "theoretical_loss": 3.5779922042904615, + "tokens_seen": 1235087360 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003160581745235707, + "loss": 2.3564, + "theoretical_loss": 3.5779748352874696, + "tokens_seen": 1235152896 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031604814443329994, + "loss": 2.561, + "theoretical_loss": 3.577957467464062, + "tokens_seen": 1235218432 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003160381143430291, + "loss": 2.8883, + "theoretical_loss": 3.577940100820098, + "tokens_seen": 1235283968 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 1429255, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2701942920684814, + "objective/train/theoretical_loss": 3.5779227353554335, + "objective/train/tokens_used": 1255809504, + "theoretical_loss": 3.5779227353554335, + "tokens_seen": 1235349504 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003160280842527583, + "loss": 2.5859, + "theoretical_loss": 3.5779227353554335, + "tokens_seen": 1235349504 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003160180541624875, + "loss": 2.9854, + "theoretical_loss": 3.5779053710699262, + "tokens_seen": 1235415040 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031600802407221666, + "loss": 2.5211, + "theoretical_loss": 3.577888007963433, + "tokens_seen": 1235480576 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031599799398194584, + "loss": 2.6688, + "theoretical_loss": 3.5778706460358127, + "tokens_seen": 1235546112 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003159879638916751, + "loss": 2.6484, + "theoretical_loss": 3.5778532852869214, + "tokens_seen": 1235611648 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003159779338014042, + "loss": 2.6478, + "theoretical_loss": 3.5778359257166175, + "tokens_seen": 1235677184 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031596790371113344, + "loss": 2.5795, + "theoretical_loss": 3.5778185673247576, + "tokens_seen": 1235742720 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031595787362086257, + "loss": 2.5562, + "theoretical_loss": 3.5778012101111996, + "tokens_seen": 1235808256 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003159478435305918, + "loss": 2.8139, + "theoretical_loss": 3.577783854075801, + "tokens_seen": 1235873792 + }, + { + "epoch": 4.01, + "learning_rate": 0.000315937813440321, + "loss": 2.7729, + "theoretical_loss": 3.5777664992184195, + "tokens_seen": 1235939328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031592778335005016, + "loss": 2.8157, + "theoretical_loss": 3.5777491455389123, + "tokens_seen": 1236004864 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031591775325977934, + "loss": 2.5978, + "theoretical_loss": 3.577731793037138, + "tokens_seen": 1236070400 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003159077231695086, + "loss": 2.8192, + "theoretical_loss": 3.577714441712953, + "tokens_seen": 1236135936 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003158976930792377, + "loss": 2.5671, + "theoretical_loss": 3.577697091566216, + "tokens_seen": 1236201472 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031588766298896694, + "loss": 2.6757, + "theoretical_loss": 3.577679742596784, + "tokens_seen": 1236267008 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031587763289869607, + "loss": 2.6728, + "theoretical_loss": 3.5776623948045145, + "tokens_seen": 1236332544 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003158676028084253, + "loss": 2.5845, + "theoretical_loss": 3.5776450481892663, + "tokens_seen": 1236398080 + }, + { + "epoch": 4.01, + "learning_rate": 0.0003158575727181545, + "loss": 2.5337, + "theoretical_loss": 3.5776277027508963, + "tokens_seen": 1236463616 + }, + { + "epoch": 4.01, + "learning_rate": 0.00031584754262788367, + "loss": 2.8174, + "theoretical_loss": 3.5776103584892622, + "tokens_seen": 1236529152 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031583751253761285, + "loss": 2.6758, + "theoretical_loss": 3.577593015404222, + "tokens_seen": 1236594688 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031582748244734203, + "loss": 2.5004, + "theoretical_loss": 3.577575673495634, + "tokens_seen": 1236660224 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003158174523570712, + "loss": 2.5351, + "theoretical_loss": 3.5775583327633553, + "tokens_seen": 1236725760 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031580742226680044, + "loss": 2.7012, + "theoretical_loss": 3.577540993207245, + "tokens_seen": 1236791296 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031579739217652957, + "loss": 2.4612, + "theoretical_loss": 3.5775236548271594, + "tokens_seen": 1236856832 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003157873620862588, + "loss": 2.6514, + "theoretical_loss": 3.5775063176229573, + "tokens_seen": 1236922368 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1430103, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.250220537185669, + "objective/train/theoretical_loss": 3.577488981594497, + "objective/train/tokens_used": 1257447904, + "theoretical_loss": 3.577488981594497, + "tokens_seen": 1236987904 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031577733199598793, + "loss": 2.5234, + "theoretical_loss": 3.577488981594497, + "tokens_seen": 1236987904 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031576730190571717, + "loss": 2.7337, + "theoretical_loss": 3.5774716467416354, + "tokens_seen": 1237053440 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031575727181544635, + "loss": 2.7496, + "theoretical_loss": 3.5774543130642313, + "tokens_seen": 1237118976 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031574724172517553, + "loss": 2.5515, + "theoretical_loss": 3.577436980562143, + "tokens_seen": 1237184512 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003157372116349047, + "loss": 2.8153, + "theoretical_loss": 3.5774196492352277, + "tokens_seen": 1237250048 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031572718154463395, + "loss": 2.5813, + "theoretical_loss": 3.577402319083344, + "tokens_seen": 1237315584 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003157171514543631, + "loss": 2.5498, + "theoretical_loss": 3.57738499010635, + "tokens_seen": 1237381120 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003157071213640923, + "loss": 2.7301, + "theoretical_loss": 3.5773676623041037, + "tokens_seen": 1237446656 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031569709127382144, + "loss": 2.7963, + "theoretical_loss": 3.5773503356764635, + "tokens_seen": 1237512192 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031568706118355067, + "loss": 2.4556, + "theoretical_loss": 3.577333010223287, + "tokens_seen": 1237577728 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031567703109327985, + "loss": 2.4602, + "theoretical_loss": 3.5773156859444333, + "tokens_seen": 1237643264 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031566700100300903, + "loss": 2.5788, + "theoretical_loss": 3.5772983628397594, + "tokens_seen": 1237708800 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003156569709127382, + "loss": 2.7927, + "theoretical_loss": 3.5772810409091247, + "tokens_seen": 1237774336 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003156469408224674, + "loss": 2.7631, + "theoretical_loss": 3.577263720152387, + "tokens_seen": 1237839872 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003156369107321966, + "loss": 2.6811, + "theoretical_loss": 3.577246400569405, + "tokens_seen": 1237905408 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003156268806419258, + "loss": 2.5648, + "theoretical_loss": 3.5772290821600357, + "tokens_seen": 1237970944 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031561685055165494, + "loss": 2.5189, + "theoretical_loss": 3.5772117649241393, + "tokens_seen": 1238036480 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003156068204613842, + "loss": 2.9144, + "theoretical_loss": 3.577194448861573, + "tokens_seen": 1238102016 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003155967903711133, + "loss": 2.7917, + "theoretical_loss": 3.5771771339721954, + "tokens_seen": 1238167552 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031558676028084254, + "loss": 2.8326, + "theoretical_loss": 3.577159820255865, + "tokens_seen": 1238233088 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003155767301905717, + "loss": 2.6731, + "theoretical_loss": 3.5771425077124404, + "tokens_seen": 1238298624 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003155667001003009, + "loss": 2.6652, + "theoretical_loss": 3.57712519634178, + "tokens_seen": 1238364160 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003155566700100301, + "loss": 2.5107, + "theoretical_loss": 3.5771078861437418, + "tokens_seen": 1238429696 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003155466399197593, + "loss": 2.5704, + "theoretical_loss": 3.577090577118185, + "tokens_seen": 1238495232 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031553660982948844, + "loss": 2.736, + "theoretical_loss": 3.577073269264968, + "tokens_seen": 1238560768 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1431596, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.694431781768799, + "objective/train/theoretical_loss": 3.577055962583949, + "objective/train/tokens_used": 1259086304, + "theoretical_loss": 3.577055962583949, + "tokens_seen": 1238626304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003155265797392177, + "loss": 2.7258, + "theoretical_loss": 3.577055962583949, + "tokens_seen": 1238626304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003155165496489468, + "loss": 2.8068, + "theoretical_loss": 3.5770386570749872, + "tokens_seen": 1238691840 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031550651955867604, + "loss": 2.5443, + "theoretical_loss": 3.5770213527379404, + "tokens_seen": 1238757376 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003154964894684052, + "loss": 2.6994, + "theoretical_loss": 3.5770040495726687, + "tokens_seen": 1238822912 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003154864593781344, + "loss": 2.7348, + "theoretical_loss": 3.576986747579029, + "tokens_seen": 1238888448 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003154764292878636, + "loss": 2.727, + "theoretical_loss": 3.5769694467568813, + "tokens_seen": 1238953984 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031546639919759277, + "loss": 2.8486, + "theoretical_loss": 3.5769521471060832, + "tokens_seen": 1239019520 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031545636910732195, + "loss": 2.7562, + "theoretical_loss": 3.5769348486264945, + "tokens_seen": 1239085056 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003154463390170512, + "loss": 2.8638, + "theoretical_loss": 3.5769175513179734, + "tokens_seen": 1239150592 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003154363089267803, + "loss": 2.577, + "theoretical_loss": 3.5769002551803792, + "tokens_seen": 1239216128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031542627883650954, + "loss": 2.6711, + "theoretical_loss": 3.5768829602135694, + "tokens_seen": 1239281664 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003154162487462387, + "loss": 2.6202, + "theoretical_loss": 3.5768656664174046, + "tokens_seen": 1239347200 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003154062186559679, + "loss": 2.7357, + "theoretical_loss": 3.5768483737917425, + "tokens_seen": 1239412736 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003153961885656971, + "loss": 2.7097, + "theoretical_loss": 3.576831082336443, + "tokens_seen": 1239478272 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031538615847542627, + "loss": 2.6539, + "theoretical_loss": 3.5768137920513636, + "tokens_seen": 1239543808 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031537612838515545, + "loss": 2.5496, + "theoretical_loss": 3.5767965029363644, + "tokens_seen": 1239609344 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003153660982948847, + "loss": 2.8014, + "theoretical_loss": 3.5767792149913036, + "tokens_seen": 1239674880 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003153560682046138, + "loss": 2.5942, + "theoretical_loss": 3.5767619282160403, + "tokens_seen": 1239740416 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031534603811434305, + "loss": 2.5592, + "theoretical_loss": 3.576744642610435, + "tokens_seen": 1239805952 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003153360080240722, + "loss": 2.4833, + "theoretical_loss": 3.5767273581743444, + "tokens_seen": 1239871488 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003153259779338014, + "loss": 2.8076, + "theoretical_loss": 3.576710074907629, + "tokens_seen": 1239937024 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003153159478435306, + "loss": 2.751, + "theoretical_loss": 3.576692792810147, + "tokens_seen": 1240002560 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031530591775325977, + "loss": 2.8131, + "theoretical_loss": 3.576675511881759, + "tokens_seen": 1240068096 + }, + { + "epoch": 4.02, + "learning_rate": 0.000315295887662989, + "loss": 2.7056, + "theoretical_loss": 3.5766582321223224, + "tokens_seen": 1240133632 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031528585757271813, + "loss": 2.6935, + "theoretical_loss": 3.5766409535316974, + "tokens_seen": 1240199168 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1432295, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.533489227294922, + "objective/train/theoretical_loss": 3.576623676109743, + "objective/train/tokens_used": 1260724704, + "theoretical_loss": 3.576623676109743, + "tokens_seen": 1240264704 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031527582748244737, + "loss": 2.8536, + "theoretical_loss": 3.576623676109743, + "tokens_seen": 1240264704 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031526579739217655, + "loss": 2.5282, + "theoretical_loss": 3.5766063998563187, + "tokens_seen": 1240330240 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031525576730190573, + "loss": 2.5231, + "theoretical_loss": 3.5765891247712824, + "tokens_seen": 1240395776 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003152457372116349, + "loss": 2.6814, + "theoretical_loss": 3.5765718508544952, + "tokens_seen": 1240461312 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031523570712136415, + "loss": 2.5898, + "theoretical_loss": 3.5765545781058155, + "tokens_seen": 1240526848 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003152256770310933, + "loss": 2.8252, + "theoretical_loss": 3.5765373065251023, + "tokens_seen": 1240592384 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003152156469408225, + "loss": 2.6618, + "theoretical_loss": 3.576520036112215, + "tokens_seen": 1240657920 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031520561685055164, + "loss": 2.8285, + "theoretical_loss": 3.5765027668670135, + "tokens_seen": 1240723456 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031519558676028087, + "loss": 2.5839, + "theoretical_loss": 3.576485498789357, + "tokens_seen": 1240788992 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031518555667001005, + "loss": 2.6935, + "theoretical_loss": 3.5764682318791046, + "tokens_seen": 1240854528 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031517552657973924, + "loss": 2.8678, + "theoretical_loss": 3.576450966136116, + "tokens_seen": 1240920064 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003151654964894684, + "loss": 2.66, + "theoretical_loss": 3.5764337015602505, + "tokens_seen": 1240985600 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003151554663991976, + "loss": 2.5715, + "theoretical_loss": 3.5764164381513677, + "tokens_seen": 1241051136 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003151454363089268, + "loss": 2.6386, + "theoretical_loss": 3.5763991759093265, + "tokens_seen": 1241116672 + }, + { + "epoch": 4.02, + "learning_rate": 0.000315135406218656, + "loss": 2.6935, + "theoretical_loss": 3.5763819148339877, + "tokens_seen": 1241182208 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031512537612838514, + "loss": 2.6195, + "theoretical_loss": 3.57636465492521, + "tokens_seen": 1241247744 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003151153460381144, + "loss": 2.6906, + "theoretical_loss": 3.576347396182853, + "tokens_seen": 1241313280 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003151053159478435, + "loss": 2.5668, + "theoretical_loss": 3.5763301386067763, + "tokens_seen": 1241378816 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031509528585757274, + "loss": 2.9009, + "theoretical_loss": 3.5763128821968393, + "tokens_seen": 1241444352 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003150852557673019, + "loss": 2.7263, + "theoretical_loss": 3.576295626952902, + "tokens_seen": 1241509888 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003150752256770311, + "loss": 2.7097, + "theoretical_loss": 3.5762783728748246, + "tokens_seen": 1241575424 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003150651955867603, + "loss": 2.636, + "theoretical_loss": 3.576261119962466, + "tokens_seen": 1241640960 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003150551654964895, + "loss": 2.792, + "theoretical_loss": 3.576243868215686, + "tokens_seen": 1241706496 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031504513540621864, + "loss": 2.7841, + "theoretical_loss": 3.5762266176343447, + "tokens_seen": 1241772032 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003150351053159479, + "loss": 2.8194, + "theoretical_loss": 3.576209368218301, + "tokens_seen": 1241837568 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1433476, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.985485315322876, + "objective/train/theoretical_loss": 3.576192119967416, + "objective/train/tokens_used": 1262363104, + "theoretical_loss": 3.576192119967416, + "tokens_seen": 1241903104 + }, + { + "epoch": 4.02, + "learning_rate": 0.000315025075225677, + "loss": 2.8603, + "theoretical_loss": 3.576192119967416, + "tokens_seen": 1241903104 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031501504513540624, + "loss": 2.7702, + "theoretical_loss": 3.5761748728815483, + "tokens_seen": 1241968640 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003150050150451354, + "loss": 2.7794, + "theoretical_loss": 3.576157626960559, + "tokens_seen": 1242034176 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003149949849548646, + "loss": 2.6736, + "theoretical_loss": 3.576140382204307, + "tokens_seen": 1242099712 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003149849548645938, + "loss": 2.6884, + "theoretical_loss": 3.576123138612652, + "tokens_seen": 1242165248 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031497492477432297, + "loss": 2.4335, + "theoretical_loss": 3.576105896185455, + "tokens_seen": 1242230784 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031496489468405215, + "loss": 2.4949, + "theoretical_loss": 3.5760886549225748, + "tokens_seen": 1242296320 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003149548645937814, + "loss": 2.5471, + "theoretical_loss": 3.576071414823872, + "tokens_seen": 1242361856 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003149448345035105, + "loss": 2.7882, + "theoretical_loss": 3.576054175889207, + "tokens_seen": 1242427392 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031493480441323974, + "loss": 2.6353, + "theoretical_loss": 3.5760369381184383, + "tokens_seen": 1242492928 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003149247743229689, + "loss": 2.764, + "theoretical_loss": 3.5760197015114277, + "tokens_seen": 1242558464 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003149147442326981, + "loss": 2.7535, + "theoretical_loss": 3.576002466068034, + "tokens_seen": 1242624000 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003149047141424273, + "loss": 2.5901, + "theoretical_loss": 3.575985231788118, + "tokens_seen": 1242689536 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031489468405215647, + "loss": 2.6292, + "theoretical_loss": 3.575967998671539, + "tokens_seen": 1242755072 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031488465396188565, + "loss": 2.3893, + "theoretical_loss": 3.575950766718158, + "tokens_seen": 1242820608 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003148746238716149, + "loss": 2.6584, + "theoretical_loss": 3.5759335359278355, + "tokens_seen": 1242886144 + }, + { + "epoch": 4.02, + "learning_rate": 0.000314864593781344, + "loss": 2.6964, + "theoretical_loss": 3.5759163063004302, + "tokens_seen": 1242951680 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031485456369107325, + "loss": 2.7812, + "theoretical_loss": 3.5758990778358033, + "tokens_seen": 1243017216 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003148445336008024, + "loss": 2.7795, + "theoretical_loss": 3.575881850533815, + "tokens_seen": 1243082752 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003148345035105316, + "loss": 2.6807, + "theoretical_loss": 3.5758646243943253, + "tokens_seen": 1243148288 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003148244734202608, + "loss": 2.715, + "theoretical_loss": 3.5758473994171944, + "tokens_seen": 1243213824 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031481444332998997, + "loss": 2.5597, + "theoretical_loss": 3.575830175602283, + "tokens_seen": 1243279360 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031480441323971915, + "loss": 2.8623, + "theoretical_loss": 3.575812952949451, + "tokens_seen": 1243344896 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031479438314944833, + "loss": 2.3986, + "theoretical_loss": 3.575795731458559, + "tokens_seen": 1243410432 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003147843530591775, + "loss": 2.5205, + "theoretical_loss": 3.5757785111294673, + "tokens_seen": 1243475968 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1434244, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7484705448150635, + "objective/train/theoretical_loss": 3.5757612919620367, + "objective/train/tokens_used": 1264001504, + "theoretical_loss": 3.5757612919620367, + "tokens_seen": 1243541504 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031477432296890675, + "loss": 2.7885, + "theoretical_loss": 3.5757612919620367, + "tokens_seen": 1243541504 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003147642928786359, + "loss": 2.8115, + "theoretical_loss": 3.575744073956127, + "tokens_seen": 1243607040 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003147542627883651, + "loss": 2.6593, + "theoretical_loss": 3.575726857111598, + "tokens_seen": 1243672576 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003147442326980943, + "loss": 2.73, + "theoretical_loss": 3.575709641428312, + "tokens_seen": 1243738112 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003147342026078235, + "loss": 2.6826, + "theoretical_loss": 3.5756924269061288, + "tokens_seen": 1243803648 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031472417251755266, + "loss": 2.8426, + "theoretical_loss": 3.575675213544908, + "tokens_seen": 1243869184 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031471414242728184, + "loss": 2.8811, + "theoretical_loss": 3.575658001344511, + "tokens_seen": 1243934720 + }, + { + "epoch": 4.02, + "learning_rate": 0.000314704112337011, + "loss": 2.726, + "theoretical_loss": 3.5756407903047984, + "tokens_seen": 1244000256 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031469408224674025, + "loss": 2.6208, + "theoretical_loss": 3.5756235804256304, + "tokens_seen": 1244065792 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003146840521564694, + "loss": 2.5834, + "theoretical_loss": 3.5756063717068676, + "tokens_seen": 1244131328 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003146740220661986, + "loss": 2.7418, + "theoretical_loss": 3.5755891641483712, + "tokens_seen": 1244196864 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031466399197592774, + "loss": 2.7675, + "theoretical_loss": 3.575571957750001, + "tokens_seen": 1244262400 + }, + { + "epoch": 4.02, + "learning_rate": 0.000314653961885657, + "loss": 2.6723, + "theoretical_loss": 3.5755547525116187, + "tokens_seen": 1244327936 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031464393179538616, + "loss": 2.5414, + "theoretical_loss": 3.575537548433084, + "tokens_seen": 1244393472 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031463390170511534, + "loss": 2.5029, + "theoretical_loss": 3.575520345514258, + "tokens_seen": 1244459008 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003146238716148445, + "loss": 2.8787, + "theoretical_loss": 3.575503143755002, + "tokens_seen": 1244524544 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003146138415245737, + "loss": 2.6157, + "theoretical_loss": 3.575485943155176, + "tokens_seen": 1244590080 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003146038114343029, + "loss": 2.671, + "theoretical_loss": 3.5754687437146417, + "tokens_seen": 1244655616 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003145937813440321, + "loss": 2.527, + "theoretical_loss": 3.5754515454332587, + "tokens_seen": 1244721152 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031458375125376125, + "loss": 2.3568, + "theoretical_loss": 3.575434348310889, + "tokens_seen": 1244786688 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003145737211634905, + "loss": 2.4167, + "theoretical_loss": 3.575417152347393, + "tokens_seen": 1244852224 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003145636910732197, + "loss": 2.5593, + "theoretical_loss": 3.5753999575426314, + "tokens_seen": 1244917760 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031455366098294884, + "loss": 2.78, + "theoretical_loss": 3.5753827638964655, + "tokens_seen": 1244983296 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003145436308926781, + "loss": 2.7571, + "theoretical_loss": 3.575365571408756, + "tokens_seen": 1245048832 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003145336008024072, + "loss": 2.4951, + "theoretical_loss": 3.5753483800793644, + "tokens_seen": 1245114368 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1435669, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.428680419921875, + "objective/train/theoretical_loss": 3.575331189908151, + "objective/train/tokens_used": 1265639904, + "theoretical_loss": 3.575331189908151, + "tokens_seen": 1245179904 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031452357071213644, + "loss": 2.7965, + "theoretical_loss": 3.575331189908151, + "tokens_seen": 1245179904 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003145135406218656, + "loss": 2.6784, + "theoretical_loss": 3.575314000894976, + "tokens_seen": 1245245440 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003145035105315948, + "loss": 2.4244, + "theoretical_loss": 3.575296813039703, + "tokens_seen": 1245310976 + }, + { + "epoch": 4.02, + "learning_rate": 0.000314493480441324, + "loss": 2.7376, + "theoretical_loss": 3.575279626342191, + "tokens_seen": 1245376512 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031448345035105317, + "loss": 2.7399, + "theoretical_loss": 3.5752624408023017, + "tokens_seen": 1245442048 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031447342026078235, + "loss": 2.7588, + "theoretical_loss": 3.5752452564198967, + "tokens_seen": 1245507584 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003144633901705116, + "loss": 2.6889, + "theoretical_loss": 3.575228073194836, + "tokens_seen": 1245573120 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003144533600802407, + "loss": 2.6662, + "theoretical_loss": 3.575210891126982, + "tokens_seen": 1245638656 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031444332998996994, + "loss": 2.5576, + "theoretical_loss": 3.575193710216195, + "tokens_seen": 1245704192 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003144332998996991, + "loss": 2.7878, + "theoretical_loss": 3.575176530462337, + "tokens_seen": 1245769728 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003144232698094283, + "loss": 2.7706, + "theoretical_loss": 3.5751593518652682, + "tokens_seen": 1245835264 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003144132397191575, + "loss": 2.7004, + "theoretical_loss": 3.575142174424851, + "tokens_seen": 1245900800 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031440320962888667, + "loss": 2.8504, + "theoretical_loss": 3.5751249981409456, + "tokens_seen": 1245966336 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031439317953861585, + "loss": 2.6992, + "theoretical_loss": 3.575107823013414, + "tokens_seen": 1246031872 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003143831494483451, + "loss": 2.6023, + "theoretical_loss": 3.5750906490421173, + "tokens_seen": 1246097408 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003143731193580742, + "loss": 2.5395, + "theoretical_loss": 3.575073476226917, + "tokens_seen": 1246162944 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031436308926780345, + "loss": 2.5867, + "theoretical_loss": 3.575056304567674, + "tokens_seen": 1246228480 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003143530591775326, + "loss": 2.572, + "theoretical_loss": 3.5750391340642507, + "tokens_seen": 1246294016 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003143430290872618, + "loss": 2.6351, + "theoretical_loss": 3.575021964716508, + "tokens_seen": 1246359552 + }, + { + "epoch": 4.02, + "learning_rate": 0.000314332998996991, + "loss": 2.4288, + "theoretical_loss": 3.5750047965243072, + "tokens_seen": 1246425088 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031432296890672017, + "loss": 2.5788, + "theoretical_loss": 3.57498762948751, + "tokens_seen": 1246490624 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031431293881644935, + "loss": 2.7888, + "theoretical_loss": 3.574970463605977, + "tokens_seen": 1246556160 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031430290872617853, + "loss": 2.682, + "theoretical_loss": 3.5749532988795716, + "tokens_seen": 1246621696 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003142928786359077, + "loss": 2.5778, + "theoretical_loss": 3.5749361353081537, + "tokens_seen": 1246687232 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031428284854563695, + "loss": 2.5332, + "theoretical_loss": 3.5749189728915853, + "tokens_seen": 1246752768 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1436465, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.797041893005371, + "objective/train/theoretical_loss": 3.5749018116297284, + "objective/train/tokens_used": 1267278304, + "theoretical_loss": 3.5749018116297284, + "tokens_seen": 1246818304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003142728184553661, + "loss": 2.7708, + "theoretical_loss": 3.5749018116297284, + "tokens_seen": 1246818304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003142627883650953, + "loss": 2.3856, + "theoretical_loss": 3.574884651522444, + "tokens_seen": 1246883840 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003142527582748245, + "loss": 2.7342, + "theoretical_loss": 3.5748674925695942, + "tokens_seen": 1246949376 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003142427281845537, + "loss": 2.5475, + "theoretical_loss": 3.5748503347710407, + "tokens_seen": 1247014912 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031423269809428286, + "loss": 2.4742, + "theoretical_loss": 3.574833178126645, + "tokens_seen": 1247080448 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031422266800401204, + "loss": 2.6937, + "theoretical_loss": 3.574816022636269, + "tokens_seen": 1247145984 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003142126379137412, + "loss": 2.6389, + "theoretical_loss": 3.5747988682997747, + "tokens_seen": 1247211520 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031420260782347045, + "loss": 2.6282, + "theoretical_loss": 3.5747817151170227, + "tokens_seen": 1247277056 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003141925777331996, + "loss": 2.6403, + "theoretical_loss": 3.574764563087876, + "tokens_seen": 1247342592 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003141825476429288, + "loss": 2.8371, + "theoretical_loss": 3.5747474122121954, + "tokens_seen": 1247408128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031417251755265794, + "loss": 2.5975, + "theoretical_loss": 3.574730262489844, + "tokens_seen": 1247473664 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003141624874623872, + "loss": 2.5355, + "theoretical_loss": 3.5747131139206827, + "tokens_seen": 1247539200 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031415245737211636, + "loss": 2.6023, + "theoretical_loss": 3.5746959665045734, + "tokens_seen": 1247604736 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031414242728184554, + "loss": 2.7003, + "theoretical_loss": 3.5746788202413784, + "tokens_seen": 1247670272 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003141323971915747, + "loss": 2.3663, + "theoretical_loss": 3.57466167513096, + "tokens_seen": 1247735808 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003141223671013039, + "loss": 2.7268, + "theoretical_loss": 3.5746445311731794, + "tokens_seen": 1247801344 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003141123370110331, + "loss": 2.6805, + "theoretical_loss": 3.5746273883678983, + "tokens_seen": 1247866880 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003141023069207623, + "loss": 2.7947, + "theoretical_loss": 3.57461024671498, + "tokens_seen": 1247932416 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031409227683049145, + "loss": 2.8079, + "theoretical_loss": 3.574593106214285, + "tokens_seen": 1247997952 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003140822467402207, + "loss": 2.6496, + "theoretical_loss": 3.5745759668656767, + "tokens_seen": 1248063488 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031407221664994986, + "loss": 2.6264, + "theoretical_loss": 3.574558828669016, + "tokens_seen": 1248129024 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031406218655967904, + "loss": 2.2906, + "theoretical_loss": 3.574541691624166, + "tokens_seen": 1248194560 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003140521564694082, + "loss": 2.902, + "theoretical_loss": 3.574524555730988, + "tokens_seen": 1248260096 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003140421263791374, + "loss": 2.7177, + "theoretical_loss": 3.5745074209893453, + "tokens_seen": 1248325632 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003140320962888666, + "loss": 2.7631, + "theoretical_loss": 3.5744902873990982, + "tokens_seen": 1248391168 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1437026, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7030203342437744, + "objective/train/theoretical_loss": 3.5744731549601108, + "objective/train/tokens_used": 1268916704, + "theoretical_loss": 3.5744731549601108, + "tokens_seen": 1248456704 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003140220661985958, + "loss": 2.6616, + "theoretical_loss": 3.5744731549601108, + "tokens_seen": 1248456704 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031401203610832495, + "loss": 2.6589, + "theoretical_loss": 3.5744560236722442, + "tokens_seen": 1248522240 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003140020060180542, + "loss": 2.6114, + "theoretical_loss": 3.574438893535361, + "tokens_seen": 1248587776 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003139919759277833, + "loss": 2.7657, + "theoretical_loss": 3.5744217645493235, + "tokens_seen": 1248653312 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031398194583751255, + "loss": 2.6823, + "theoretical_loss": 3.574404636713994, + "tokens_seen": 1248718848 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031397191574724173, + "loss": 2.5717, + "theoretical_loss": 3.5743875100292346, + "tokens_seen": 1248784384 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003139618856569709, + "loss": 2.6765, + "theoretical_loss": 3.574370384494908, + "tokens_seen": 1248849920 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003139518555667001, + "loss": 2.6439, + "theoretical_loss": 3.574353260110876, + "tokens_seen": 1248915456 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003139418254764293, + "loss": 2.5065, + "theoretical_loss": 3.574336136877001, + "tokens_seen": 1248980992 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031393179538615845, + "loss": 2.713, + "theoretical_loss": 3.5743190147931463, + "tokens_seen": 1249046528 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003139217652958877, + "loss": 2.5401, + "theoretical_loss": 3.5743018938591735, + "tokens_seen": 1249112064 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003139117352056168, + "loss": 2.6175, + "theoretical_loss": 3.574284774074945, + "tokens_seen": 1249177600 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031390170511534605, + "loss": 2.6046, + "theoretical_loss": 3.574267655440323, + "tokens_seen": 1249243136 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031389167502507523, + "loss": 2.6326, + "theoretical_loss": 3.574250537955172, + "tokens_seen": 1249308672 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003138816449348044, + "loss": 2.4136, + "theoretical_loss": 3.5742334216193523, + "tokens_seen": 1249374208 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003138716148445336, + "loss": 2.4814, + "theoretical_loss": 3.574216306432727, + "tokens_seen": 1249439744 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003138615847542628, + "loss": 2.6682, + "theoretical_loss": 3.5741991923951595, + "tokens_seen": 1249505280 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031385155466399196, + "loss": 2.7518, + "theoretical_loss": 3.574182079506511, + "tokens_seen": 1249570816 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003138415245737212, + "loss": 2.5976, + "theoretical_loss": 3.5741649677666456, + "tokens_seen": 1249636352 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003138314944834503, + "loss": 2.7658, + "theoretical_loss": 3.574147857175425, + "tokens_seen": 1249701888 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031382146439317955, + "loss": 2.8964, + "theoretical_loss": 3.574130747732712, + "tokens_seen": 1249767424 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031381143430290873, + "loss": 2.6613, + "theoretical_loss": 3.5741136394383695, + "tokens_seen": 1249832960 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003138014042126379, + "loss": 2.5456, + "theoretical_loss": 3.57409653229226, + "tokens_seen": 1249898496 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031379137412236715, + "loss": 2.7456, + "theoretical_loss": 3.5740794262942464, + "tokens_seen": 1249964032 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003137813440320963, + "loss": 2.5863, + "theoretical_loss": 3.574062321444192, + "tokens_seen": 1250029568 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1438333, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5628535747528076, + "objective/train/theoretical_loss": 3.5740452177419577, + "objective/train/tokens_used": 1270555104, + "theoretical_loss": 3.5740452177419577, + "tokens_seen": 1250095104 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003137713139418255, + "loss": 2.7452, + "theoretical_loss": 3.5740452177419577, + "tokens_seen": 1250095104 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003137612838515547, + "loss": 2.5718, + "theoretical_loss": 3.5740281151874087, + "tokens_seen": 1250160640 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003137512537612839, + "loss": 2.5954, + "theoretical_loss": 3.5740110137804066, + "tokens_seen": 1250226176 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031374122367101306, + "loss": 2.577, + "theoretical_loss": 3.5739939135208143, + "tokens_seen": 1250291712 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031373119358074224, + "loss": 2.5118, + "theoretical_loss": 3.5739768144084945, + "tokens_seen": 1250357248 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003137211634904714, + "loss": 2.7736, + "theoretical_loss": 3.5739597164433103, + "tokens_seen": 1250422784 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031371113340020065, + "loss": 2.8973, + "theoretical_loss": 3.5739426196251247, + "tokens_seen": 1250488320 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003137011033099298, + "loss": 2.9025, + "theoretical_loss": 3.5739255239538013, + "tokens_seen": 1250553856 + }, + { + "epoch": 4.02, + "learning_rate": 0.000313691073219659, + "loss": 2.8072, + "theoretical_loss": 3.573908429429202, + "tokens_seen": 1250619392 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031368104312938814, + "loss": 2.5561, + "theoretical_loss": 3.5738913360511897, + "tokens_seen": 1250684928 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003136710130391174, + "loss": 2.9685, + "theoretical_loss": 3.5738742438196285, + "tokens_seen": 1250750464 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031366098294884656, + "loss": 2.7795, + "theoretical_loss": 3.5738571527343805, + "tokens_seen": 1250816000 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031365095285857574, + "loss": 2.7314, + "theoretical_loss": 3.573840062795309, + "tokens_seen": 1250881536 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003136409227683049, + "loss": 2.8282, + "theoretical_loss": 3.5738229740022778, + "tokens_seen": 1250947072 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003136308926780341, + "loss": 2.6157, + "theoretical_loss": 3.5738058863551494, + "tokens_seen": 1251012608 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003136208625877633, + "loss": 2.8974, + "theoretical_loss": 3.5737887998537867, + "tokens_seen": 1251078144 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003136108324974925, + "loss": 2.7745, + "theoretical_loss": 3.5737717144980534, + "tokens_seen": 1251143680 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031360080240722165, + "loss": 2.7687, + "theoretical_loss": 3.573754630287812, + "tokens_seen": 1251209216 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003135907723169509, + "loss": 2.6226, + "theoretical_loss": 3.573737547222926, + "tokens_seen": 1251274752 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031358074222668006, + "loss": 2.568, + "theoretical_loss": 3.5737204653032597, + "tokens_seen": 1251340288 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031357071213640924, + "loss": 2.5222, + "theoretical_loss": 3.5737033845286748, + "tokens_seen": 1251405824 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003135606820461384, + "loss": 2.7934, + "theoretical_loss": 3.573686304899035, + "tokens_seen": 1251471360 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003135506519558676, + "loss": 2.5712, + "theoretical_loss": 3.5736692264142036, + "tokens_seen": 1251536896 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003135406218655968, + "loss": 2.4537, + "theoretical_loss": 3.5736521490740447, + "tokens_seen": 1251602432 + }, + { + "epoch": 4.02, + "learning_rate": 0.000313530591775326, + "loss": 2.4182, + "theoretical_loss": 3.573635072878421, + "tokens_seen": 1251667968 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1439071, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.880246639251709, + "objective/train/theoretical_loss": 3.5736179978271956, + "objective/train/tokens_used": 1272193504, + "theoretical_loss": 3.5736179978271956, + "tokens_seen": 1251733504 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031352056168505515, + "loss": 2.4664, + "theoretical_loss": 3.5736179978271956, + "tokens_seen": 1251733504 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003135105315947844, + "loss": 2.862, + "theoretical_loss": 3.5736009239202327, + "tokens_seen": 1251799040 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003135005015045135, + "loss": 2.8991, + "theoretical_loss": 3.5735838511573945, + "tokens_seen": 1251864576 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031349047141424275, + "loss": 2.9548, + "theoretical_loss": 3.573566779538546, + "tokens_seen": 1251930112 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031348044132397193, + "loss": 2.5068, + "theoretical_loss": 3.5735497090635495, + "tokens_seen": 1251995648 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003134704112337011, + "loss": 2.6644, + "theoretical_loss": 3.573532639732269, + "tokens_seen": 1252061184 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003134603811434303, + "loss": 2.6921, + "theoretical_loss": 3.5735155715445677, + "tokens_seen": 1252126720 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003134503510531595, + "loss": 2.7023, + "theoretical_loss": 3.573498504500309, + "tokens_seen": 1252192256 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031344032096288865, + "loss": 2.725, + "theoretical_loss": 3.573481438599357, + "tokens_seen": 1252257792 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003134302908726179, + "loss": 2.7769, + "theoretical_loss": 3.5734643738415754, + "tokens_seen": 1252323328 + }, + { + "epoch": 4.02, + "learning_rate": 0.000313420260782347, + "loss": 2.5722, + "theoretical_loss": 3.573447310226827, + "tokens_seen": 1252388864 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031341023069207625, + "loss": 2.7007, + "theoretical_loss": 3.5734302477549758, + "tokens_seen": 1252454400 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031340020060180543, + "loss": 2.6761, + "theoretical_loss": 3.573413186425886, + "tokens_seen": 1252519936 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003133901705115346, + "loss": 2.5791, + "theoretical_loss": 3.5733961262394205, + "tokens_seen": 1252585472 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003133801404212638, + "loss": 2.5603, + "theoretical_loss": 3.5733790671954435, + "tokens_seen": 1252651008 + }, + { + "epoch": 4.02, + "learning_rate": 0.000313370110330993, + "loss": 2.617, + "theoretical_loss": 3.5733620092938185, + "tokens_seen": 1252716544 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031336008024072216, + "loss": 2.7296, + "theoretical_loss": 3.573344952534409, + "tokens_seen": 1252782080 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003133500501504514, + "loss": 2.5422, + "theoretical_loss": 3.573327896917079, + "tokens_seen": 1252847616 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003133400200601805, + "loss": 2.6829, + "theoretical_loss": 3.5733108424416926, + "tokens_seen": 1252913152 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031332998996990975, + "loss": 2.48, + "theoretical_loss": 3.5732937891081136, + "tokens_seen": 1252978688 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003133199598796389, + "loss": 2.9448, + "theoretical_loss": 3.573276736916205, + "tokens_seen": 1253044224 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003133099297893681, + "loss": 2.6787, + "theoretical_loss": 3.5732596858658314, + "tokens_seen": 1253109760 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003132998996990973, + "loss": 2.6809, + "theoretical_loss": 3.5732426359568565, + "tokens_seen": 1253175296 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003132898696088265, + "loss": 2.4739, + "theoretical_loss": 3.5732255871891443, + "tokens_seen": 1253240832 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031327983951855566, + "loss": 2.7654, + "theoretical_loss": 3.5732085395625592, + "tokens_seen": 1253306368 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1440666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5984394550323486, + "objective/train/theoretical_loss": 3.573191493076964, + "objective/train/tokens_used": 1273831904, + "theoretical_loss": 3.573191493076964, + "tokens_seen": 1253371904 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003132698094282849, + "loss": 2.6108, + "theoretical_loss": 3.573191493076964, + "tokens_seen": 1253371904 + }, + { + "epoch": 4.02, + "learning_rate": 0.000313259779338014, + "loss": 2.734, + "theoretical_loss": 3.5731744477322236, + "tokens_seen": 1253437440 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031324974924774326, + "loss": 2.7576, + "theoretical_loss": 3.5731574035282017, + "tokens_seen": 1253502976 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003132397191574724, + "loss": 2.8248, + "theoretical_loss": 3.5731403604647625, + "tokens_seen": 1253568512 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003132296890672016, + "loss": 2.7149, + "theoretical_loss": 3.5731233185417697, + "tokens_seen": 1253634048 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003132196589769308, + "loss": 2.5068, + "theoretical_loss": 3.5731062777590874, + "tokens_seen": 1253699584 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031320962888666, + "loss": 2.4724, + "theoretical_loss": 3.57308923811658, + "tokens_seen": 1253765120 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031319959879638916, + "loss": 2.5921, + "theoretical_loss": 3.5730721996141117, + "tokens_seen": 1253830656 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031318956870611834, + "loss": 2.7723, + "theoretical_loss": 3.5730551622515465, + "tokens_seen": 1253896192 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003131795386158475, + "loss": 2.5817, + "theoretical_loss": 3.5730381260287483, + "tokens_seen": 1253961728 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031316950852557676, + "loss": 2.4434, + "theoretical_loss": 3.5730210909455815, + "tokens_seen": 1254027264 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003131594784353059, + "loss": 2.8277, + "theoretical_loss": 3.57300405700191, + "tokens_seen": 1254092800 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003131494483450351, + "loss": 2.4262, + "theoretical_loss": 3.572987024197599, + "tokens_seen": 1254158336 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031313941825476425, + "loss": 2.32, + "theoretical_loss": 3.572969992532512, + "tokens_seen": 1254223872 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003131293881644935, + "loss": 2.4445, + "theoretical_loss": 3.5729529620065126, + "tokens_seen": 1254289408 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031311935807422267, + "loss": 2.7825, + "theoretical_loss": 3.5729359326194663, + "tokens_seen": 1254354944 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031310932798395185, + "loss": 2.6967, + "theoretical_loss": 3.572918904371237, + "tokens_seen": 1254420480 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031309929789368103, + "loss": 2.5315, + "theoretical_loss": 3.57290187726169, + "tokens_seen": 1254486016 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031308926780341026, + "loss": 2.3605, + "theoretical_loss": 3.5728848512906874, + "tokens_seen": 1254551552 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003130792377131394, + "loss": 2.5921, + "theoretical_loss": 3.5728678264580958, + "tokens_seen": 1254617088 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003130692076228686, + "loss": 2.8032, + "theoretical_loss": 3.572850802763778, + "tokens_seen": 1254682624 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003130591775325978, + "loss": 2.7727, + "theoretical_loss": 3.5728337802076, + "tokens_seen": 1254748160 + }, + { + "epoch": 4.02, + "learning_rate": 0.000313049147442327, + "loss": 2.5223, + "theoretical_loss": 3.5728167587894255, + "tokens_seen": 1254813696 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003130391173520562, + "loss": 2.7716, + "theoretical_loss": 3.5727997385091186, + "tokens_seen": 1254879232 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031302908726178535, + "loss": 2.7501, + "theoretical_loss": 3.5727827193665442, + "tokens_seen": 1254944768 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1441890, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5261950492858887, + "objective/train/theoretical_loss": 3.5727657013615666, + "objective/train/tokens_used": 1275470304, + "theoretical_loss": 3.5727657013615666, + "tokens_seen": 1255010304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003130190571715146, + "loss": 2.4896, + "theoretical_loss": 3.5727657013615666, + "tokens_seen": 1255010304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003130090270812437, + "loss": 2.6838, + "theoretical_loss": 3.572748684494051, + "tokens_seen": 1255075840 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031299899699097295, + "loss": 2.6958, + "theoretical_loss": 3.5727316687638613, + "tokens_seen": 1255141376 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031298896690070213, + "loss": 2.7371, + "theoretical_loss": 3.5727146541708628, + "tokens_seen": 1255206912 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003129789368104313, + "loss": 2.6208, + "theoretical_loss": 3.572697640714919, + "tokens_seen": 1255272448 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003129689067201605, + "loss": 2.7077, + "theoretical_loss": 3.5726806283958963, + "tokens_seen": 1255337984 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003129588766298897, + "loss": 2.81, + "theoretical_loss": 3.5726636172136574, + "tokens_seen": 1255403520 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031294884653961885, + "loss": 2.9799, + "theoretical_loss": 3.5726466071680685, + "tokens_seen": 1255469056 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003129388164493481, + "loss": 2.8607, + "theoretical_loss": 3.5726295982589935, + "tokens_seen": 1255534592 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003129287863590772, + "loss": 2.9026, + "theoretical_loss": 3.5726125904862975, + "tokens_seen": 1255600128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031291875626880645, + "loss": 2.6078, + "theoretical_loss": 3.5725955838498455, + "tokens_seen": 1255665664 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031290872617853563, + "loss": 2.4904, + "theoretical_loss": 3.5725785783495017, + "tokens_seen": 1255731200 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003128986960882648, + "loss": 2.7038, + "theoretical_loss": 3.572561573985131, + "tokens_seen": 1255796736 + }, + { + "epoch": 4.02, + "learning_rate": 0.000312888665997994, + "loss": 2.6649, + "theoretical_loss": 3.5725445707565986, + "tokens_seen": 1255862272 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003128786359077232, + "loss": 2.5088, + "theoretical_loss": 3.5725275686637694, + "tokens_seen": 1255927808 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031286860581745236, + "loss": 2.8002, + "theoretical_loss": 3.572510567706508, + "tokens_seen": 1255993344 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003128585757271816, + "loss": 2.9148, + "theoretical_loss": 3.572493567884679, + "tokens_seen": 1256058880 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003128485456369107, + "loss": 2.6956, + "theoretical_loss": 3.572476569198148, + "tokens_seen": 1256124416 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031283851554663995, + "loss": 2.6343, + "theoretical_loss": 3.57245957164678, + "tokens_seen": 1256189952 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003128284854563691, + "loss": 2.7083, + "theoretical_loss": 3.5724425752304394, + "tokens_seen": 1256255488 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003128184553660983, + "loss": 2.7523, + "theoretical_loss": 3.5724255799489915, + "tokens_seen": 1256321024 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003128084252758275, + "loss": 2.9487, + "theoretical_loss": 3.572408585802301, + "tokens_seen": 1256386560 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003127983951855567, + "loss": 2.5786, + "theoretical_loss": 3.5723915927902334, + "tokens_seen": 1256452096 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031278836509528586, + "loss": 2.5373, + "theoretical_loss": 3.572374600912654, + "tokens_seen": 1256517632 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003127783350050151, + "loss": 2.825, + "theoretical_loss": 3.572357610169427, + "tokens_seen": 1256583168 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1442594, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8732402324676514, + "objective/train/theoretical_loss": 3.5723406205604182, + "objective/train/tokens_used": 1277108704, + "theoretical_loss": 3.5723406205604182, + "tokens_seen": 1256648704 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003127683049147442, + "loss": 2.6564, + "theoretical_loss": 3.5723406205604182, + "tokens_seen": 1256648704 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031275827482447346, + "loss": 2.6124, + "theoretical_loss": 3.572323632085493, + "tokens_seen": 1256714240 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003127482447342026, + "loss": 2.7265, + "theoretical_loss": 3.5723066447445158, + "tokens_seen": 1256779776 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003127382146439318, + "loss": 2.6958, + "theoretical_loss": 3.572289658537352, + "tokens_seen": 1256845312 + }, + { + "epoch": 4.02, + "learning_rate": 0.000312728184553661, + "loss": 2.7697, + "theoretical_loss": 3.572272673463867, + "tokens_seen": 1256910848 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003127181544633902, + "loss": 2.5176, + "theoretical_loss": 3.5722556895239257, + "tokens_seen": 1256976384 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031270812437311936, + "loss": 2.8485, + "theoretical_loss": 3.5722387067173935, + "tokens_seen": 1257041920 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031269809428284854, + "loss": 2.9032, + "theoretical_loss": 3.5722217250441366, + "tokens_seen": 1257107456 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003126880641925777, + "loss": 2.7659, + "theoretical_loss": 3.5722047445040186, + "tokens_seen": 1257172992 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031267803410230696, + "loss": 2.6285, + "theoretical_loss": 3.5721877650969063, + "tokens_seen": 1257238528 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003126680040120361, + "loss": 2.7843, + "theoretical_loss": 3.572170786822664, + "tokens_seen": 1257304064 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003126579739217653, + "loss": 2.5797, + "theoretical_loss": 3.572153809681158, + "tokens_seen": 1257369600 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031264794383149445, + "loss": 2.5359, + "theoretical_loss": 3.572136833672253, + "tokens_seen": 1257435136 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003126379137412237, + "loss": 2.5155, + "theoretical_loss": 3.5721198587958147, + "tokens_seen": 1257500672 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031262788365095287, + "loss": 2.6478, + "theoretical_loss": 3.5721028850517085, + "tokens_seen": 1257566208 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031261785356068205, + "loss": 2.7653, + "theoretical_loss": 3.5720859124397997, + "tokens_seen": 1257631744 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031260782347041123, + "loss": 2.6135, + "theoretical_loss": 3.572068940959954, + "tokens_seen": 1257697280 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031259779338014046, + "loss": 2.6955, + "theoretical_loss": 3.572051970612037, + "tokens_seen": 1257762816 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003125877632898696, + "loss": 2.7319, + "theoretical_loss": 3.5720350013959137, + "tokens_seen": 1257828352 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003125777331995988, + "loss": 2.7793, + "theoretical_loss": 3.5720180333114504, + "tokens_seen": 1257893888 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031256770310932795, + "loss": 2.6664, + "theoretical_loss": 3.572001066358512, + "tokens_seen": 1257959424 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003125576730190572, + "loss": 2.7865, + "theoretical_loss": 3.571984100536964, + "tokens_seen": 1258024960 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031254764292878637, + "loss": 2.5398, + "theoretical_loss": 3.5719671358466734, + "tokens_seen": 1258090496 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031253761283851555, + "loss": 2.8289, + "theoretical_loss": 3.5719501722875044, + "tokens_seen": 1258156032 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031252758274824473, + "loss": 2.6467, + "theoretical_loss": 3.5719332098593233, + "tokens_seen": 1258221568 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1443926, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4860124588012695, + "objective/train/theoretical_loss": 3.5719162485619953, + "objective/train/tokens_used": 1278747104, + "theoretical_loss": 3.5719162485619953, + "tokens_seen": 1258287104 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003125175526579739, + "loss": 2.571, + "theoretical_loss": 3.5719162485619953, + "tokens_seen": 1258287104 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003125075225677031, + "loss": 2.7123, + "theoretical_loss": 3.5718992883953864, + "tokens_seen": 1258352640 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031249749247743233, + "loss": 2.6302, + "theoretical_loss": 3.571882329359363, + "tokens_seen": 1258418176 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031248746238716146, + "loss": 2.7074, + "theoretical_loss": 3.57186537145379, + "tokens_seen": 1258483712 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003124774322968907, + "loss": 2.6631, + "theoretical_loss": 3.571848414678533, + "tokens_seen": 1258549248 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003124674022066198, + "loss": 2.6379, + "theoretical_loss": 3.5718314590334583, + "tokens_seen": 1258614784 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031245737211634905, + "loss": 2.8167, + "theoretical_loss": 3.571814504518432, + "tokens_seen": 1258680320 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031244734202607823, + "loss": 2.6393, + "theoretical_loss": 3.5717975511333195, + "tokens_seen": 1258745856 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003124373119358074, + "loss": 2.586, + "theoretical_loss": 3.5717805988779867, + "tokens_seen": 1258811392 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003124272818455366, + "loss": 2.6975, + "theoretical_loss": 3.5717636477522996, + "tokens_seen": 1258876928 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031241725175526583, + "loss": 2.8738, + "theoretical_loss": 3.5717466977561245, + "tokens_seen": 1258942464 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031240722166499496, + "loss": 2.5827, + "theoretical_loss": 3.5717297488893265, + "tokens_seen": 1259008000 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003123971915747242, + "loss": 2.835, + "theoretical_loss": 3.5717128011517723, + "tokens_seen": 1259073536 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003123871614844533, + "loss": 2.6795, + "theoretical_loss": 3.5716958545433277, + "tokens_seen": 1259139072 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031237713139418256, + "loss": 2.5723, + "theoretical_loss": 3.5716789090638583, + "tokens_seen": 1259204608 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031236710130391174, + "loss": 2.5694, + "theoretical_loss": 3.5716619647132304, + "tokens_seen": 1259270144 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003123570712136409, + "loss": 2.7332, + "theoretical_loss": 3.5716450214913102, + "tokens_seen": 1259335680 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003123470411233701, + "loss": 2.7726, + "theoretical_loss": 3.5716280793979642, + "tokens_seen": 1259401216 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003123370110330993, + "loss": 2.6838, + "theoretical_loss": 3.5716111384330578, + "tokens_seen": 1259466752 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031232698094282846, + "loss": 2.7907, + "theoretical_loss": 3.5715941985964568, + "tokens_seen": 1259532288 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003123169508525577, + "loss": 2.6441, + "theoretical_loss": 3.571577259888028, + "tokens_seen": 1259597824 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003123069207622869, + "loss": 2.5449, + "theoretical_loss": 3.5715603223076378, + "tokens_seen": 1259663360 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031229689067201606, + "loss": 2.6923, + "theoretical_loss": 3.5715433858551524, + "tokens_seen": 1259728896 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003122868605817453, + "loss": 2.7129, + "theoretical_loss": 3.571526450530437, + "tokens_seen": 1259794432 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003122768304914744, + "loss": 2.7213, + "theoretical_loss": 3.571509516333359, + "tokens_seen": 1259859968 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1444504, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6051039695739746, + "objective/train/theoretical_loss": 3.5714925832637836, + "objective/train/tokens_used": 1280385504, + "theoretical_loss": 3.5714925832637836, + "tokens_seen": 1259925504 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031226680040120366, + "loss": 2.6518, + "theoretical_loss": 3.5714925832637836, + "tokens_seen": 1259925504 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003122567703109328, + "loss": 2.6353, + "theoretical_loss": 3.571475651321578, + "tokens_seen": 1259991040 + }, + { + "epoch": 4.02, + "learning_rate": 0.000312246740220662, + "loss": 2.8753, + "theoretical_loss": 3.571458720506608, + "tokens_seen": 1260056576 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003122367101303912, + "loss": 2.6656, + "theoretical_loss": 3.5714417908187404, + "tokens_seen": 1260122112 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003122266800401204, + "loss": 2.6615, + "theoretical_loss": 3.5714248622578415, + "tokens_seen": 1260187648 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031221664994984956, + "loss": 2.6877, + "theoretical_loss": 3.571407934823777, + "tokens_seen": 1260253184 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031220661985957874, + "loss": 2.7432, + "theoretical_loss": 3.5713910085164136, + "tokens_seen": 1260318720 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003121965897693079, + "loss": 2.6483, + "theoretical_loss": 3.571374083335618, + "tokens_seen": 1260384256 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031218655967903716, + "loss": 2.4913, + "theoretical_loss": 3.5713571592812565, + "tokens_seen": 1260449792 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003121765295887663, + "loss": 2.5252, + "theoretical_loss": 3.571340236353196, + "tokens_seen": 1260515328 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003121664994984955, + "loss": 2.9449, + "theoretical_loss": 3.5713233145513024, + "tokens_seen": 1260580864 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031215646940822465, + "loss": 2.8003, + "theoretical_loss": 3.5713063938754424, + "tokens_seen": 1260646400 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003121464393179539, + "loss": 2.5984, + "theoretical_loss": 3.571289474325482, + "tokens_seen": 1260711936 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031213640922768307, + "loss": 2.7581, + "theoretical_loss": 3.571272555901289, + "tokens_seen": 1260777472 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031212637913741225, + "loss": 2.8671, + "theoretical_loss": 3.5712556386027288, + "tokens_seen": 1260843008 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031211634904714143, + "loss": 2.5729, + "theoretical_loss": 3.571238722429669, + "tokens_seen": 1260908544 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031210631895687066, + "loss": 2.7379, + "theoretical_loss": 3.571221807381975, + "tokens_seen": 1260974080 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003120962888665998, + "loss": 2.4806, + "theoretical_loss": 3.571204893459515, + "tokens_seen": 1261039616 + }, + { + "epoch": 4.02, + "learning_rate": 0.000312086258776329, + "loss": 2.725, + "theoretical_loss": 3.5711879806621543, + "tokens_seen": 1261105152 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031207622868605815, + "loss": 2.7716, + "theoretical_loss": 3.5711710689897602, + "tokens_seen": 1261170688 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003120661985957874, + "loss": 2.5052, + "theoretical_loss": 3.5711541584421993, + "tokens_seen": 1261236224 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031205616850551657, + "loss": 2.6935, + "theoretical_loss": 3.5711372490193383, + "tokens_seen": 1261301760 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031204613841524575, + "loss": 2.5666, + "theoretical_loss": 3.571120340721044, + "tokens_seen": 1261367296 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031203610832497493, + "loss": 2.6823, + "theoretical_loss": 3.571103433547184, + "tokens_seen": 1261432832 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003120260782347041, + "loss": 2.7387, + "theoretical_loss": 3.5710865274976236, + "tokens_seen": 1261498368 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1445499, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4663262367248535, + "objective/train/theoretical_loss": 3.5710696225722307, + "objective/train/tokens_used": 1282023904, + "theoretical_loss": 3.5710696225722307, + "tokens_seen": 1261563904 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003120160481444333, + "loss": 2.7363, + "theoretical_loss": 3.5710696225722307, + "tokens_seen": 1261563904 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031200601805416253, + "loss": 2.6551, + "theoretical_loss": 3.571052718770871, + "tokens_seen": 1261629440 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031199598796389166, + "loss": 2.9558, + "theoretical_loss": 3.5710358160934135, + "tokens_seen": 1261694976 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003119859578736209, + "loss": 2.7521, + "theoretical_loss": 3.571018914539723, + "tokens_seen": 1261760512 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031197592778335, + "loss": 2.6555, + "theoretical_loss": 3.5710020141096677, + "tokens_seen": 1261826048 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031196589769307925, + "loss": 2.6724, + "theoretical_loss": 3.570985114803114, + "tokens_seen": 1261891584 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031195586760280843, + "loss": 2.766, + "theoretical_loss": 3.570968216619929, + "tokens_seen": 1261957120 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003119458375125376, + "loss": 2.7561, + "theoretical_loss": 3.5709513195599794, + "tokens_seen": 1262022656 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003119358074222668, + "loss": 2.7715, + "theoretical_loss": 3.5709344236231324, + "tokens_seen": 1262088192 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031192577733199603, + "loss": 2.7651, + "theoretical_loss": 3.5709175288092556, + "tokens_seen": 1262153728 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031191574724172516, + "loss": 2.5331, + "theoretical_loss": 3.5709006351182153, + "tokens_seen": 1262219264 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003119057171514544, + "loss": 2.7933, + "theoretical_loss": 3.5708837425498787, + "tokens_seen": 1262284800 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003118956870611835, + "loss": 2.5899, + "theoretical_loss": 3.570866851104113, + "tokens_seen": 1262350336 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031188565697091276, + "loss": 2.7355, + "theoretical_loss": 3.570849960780786, + "tokens_seen": 1262415872 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031187562688064194, + "loss": 2.7772, + "theoretical_loss": 3.570833071579764, + "tokens_seen": 1262481408 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003118655967903711, + "loss": 2.3021, + "theoretical_loss": 3.570816183500914, + "tokens_seen": 1262546944 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003118555667001003, + "loss": 2.7408, + "theoretical_loss": 3.570799296544104, + "tokens_seen": 1262612480 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003118455366098295, + "loss": 2.6958, + "theoretical_loss": 3.5707824107092003, + "tokens_seen": 1262678016 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031183550651955866, + "loss": 3.0141, + "theoretical_loss": 3.5707655259960713, + "tokens_seen": 1262743552 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003118254764292879, + "loss": 2.7682, + "theoretical_loss": 3.5707486424045833, + "tokens_seen": 1262809088 + }, + { + "epoch": 4.02, + "learning_rate": 0.000311815446339017, + "loss": 2.6249, + "theoretical_loss": 3.5707317599346036, + "tokens_seen": 1262874624 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031180541624874626, + "loss": 2.6679, + "theoretical_loss": 3.5707148785860006, + "tokens_seen": 1262940160 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003117953861584754, + "loss": 2.719, + "theoretical_loss": 3.57069799835864, + "tokens_seen": 1263005696 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003117853560682046, + "loss": 2.9154, + "theoretical_loss": 3.5706811192523906, + "tokens_seen": 1263071232 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003117753259779338, + "loss": 2.6642, + "theoretical_loss": 3.570664241267119, + "tokens_seen": 1263136768 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1446182, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.874471664428711, + "objective/train/theoretical_loss": 3.570647364402693, + "objective/train/tokens_used": 1283662304, + "theoretical_loss": 3.570647364402693, + "tokens_seen": 1263202304 + }, + { + "epoch": 4.02, + "learning_rate": 0.000311765295887663, + "loss": 2.6386, + "theoretical_loss": 3.570647364402693, + "tokens_seen": 1263202304 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031175526579739217, + "loss": 2.6591, + "theoretical_loss": 3.5706304886589795, + "tokens_seen": 1263267840 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003117452357071214, + "loss": 2.8144, + "theoretical_loss": 3.5706136140358464, + "tokens_seen": 1263333376 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031173520561685053, + "loss": 2.7972, + "theoretical_loss": 3.570596740533161, + "tokens_seen": 1263398912 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031172517552657976, + "loss": 2.7657, + "theoretical_loss": 3.5705798681507908, + "tokens_seen": 1263464448 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003117151454363089, + "loss": 2.8032, + "theoretical_loss": 3.5705629968886035, + "tokens_seen": 1263529984 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003117051153460381, + "loss": 2.6886, + "theoretical_loss": 3.570546126746466, + "tokens_seen": 1263595520 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003116950852557673, + "loss": 2.5839, + "theoretical_loss": 3.570529257724247, + "tokens_seen": 1263661056 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003116850551654965, + "loss": 2.6547, + "theoretical_loss": 3.5705123898218125, + "tokens_seen": 1263726592 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031167502507522567, + "loss": 2.67, + "theoretical_loss": 3.5704955230390314, + "tokens_seen": 1263792128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031166499498495485, + "loss": 2.6661, + "theoretical_loss": 3.570478657375771, + "tokens_seen": 1263857664 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031165496489468403, + "loss": 2.6153, + "theoretical_loss": 3.5704617928318987, + "tokens_seen": 1263923200 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031164493480441327, + "loss": 2.671, + "theoretical_loss": 3.5704449294072824, + "tokens_seen": 1263988736 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003116349047141424, + "loss": 2.8392, + "theoretical_loss": 3.5704280671017896, + "tokens_seen": 1264054272 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031162487462387163, + "loss": 2.9227, + "theoretical_loss": 3.5704112059152884, + "tokens_seen": 1264119808 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003116148445336008, + "loss": 2.6377, + "theoretical_loss": 3.570394345847646, + "tokens_seen": 1264185344 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031160481444333, + "loss": 2.5467, + "theoretical_loss": 3.570377486898731, + "tokens_seen": 1264250880 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031159478435305917, + "loss": 2.7076, + "theoretical_loss": 3.57036062906841, + "tokens_seen": 1264316416 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031158475426278835, + "loss": 2.6186, + "theoretical_loss": 3.5703437723565514, + "tokens_seen": 1264381952 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031157472417251753, + "loss": 2.7294, + "theoretical_loss": 3.5703269167630234, + "tokens_seen": 1264447488 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031156469408224677, + "loss": 2.7033, + "theoretical_loss": 3.5703100622876933, + "tokens_seen": 1264513024 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031155466399197595, + "loss": 2.4419, + "theoretical_loss": 3.570293208930429, + "tokens_seen": 1264578560 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031154463390170513, + "loss": 2.6084, + "theoretical_loss": 3.5702763566910987, + "tokens_seen": 1264644096 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003115346038114343, + "loss": 2.5783, + "theoretical_loss": 3.5702595055695703, + "tokens_seen": 1264709632 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003115245737211635, + "loss": 2.7979, + "theoretical_loss": 3.5702426555657114, + "tokens_seen": 1264775168 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1446866, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.502274751663208, + "objective/train/theoretical_loss": 3.57022580667939, + "objective/train/tokens_used": 1285300704, + "theoretical_loss": 3.57022580667939, + "tokens_seen": 1264840704 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031151454363089273, + "loss": 2.7241, + "theoretical_loss": 3.57022580667939, + "tokens_seen": 1264840704 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031150451354062186, + "loss": 2.6458, + "theoretical_loss": 3.5702089589104746, + "tokens_seen": 1264906240 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003114944834503511, + "loss": 2.719, + "theoretical_loss": 3.570192112258833, + "tokens_seen": 1264971776 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003114844533600802, + "loss": 2.7096, + "theoretical_loss": 3.5701752667243327, + "tokens_seen": 1265037312 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031147442326980945, + "loss": 2.929, + "theoretical_loss": 3.570158422306842, + "tokens_seen": 1265102848 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031146439317953863, + "loss": 3.0498, + "theoretical_loss": 3.57014157900623, + "tokens_seen": 1265168384 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003114543630892678, + "loss": 2.6962, + "theoretical_loss": 3.570124736822363, + "tokens_seen": 1265233920 + }, + { + "epoch": 4.02, + "learning_rate": 0.000311444332998997, + "loss": 2.5755, + "theoretical_loss": 3.5701078957551107, + "tokens_seen": 1265299456 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031143430290872623, + "loss": 2.7734, + "theoretical_loss": 3.5700910558043404, + "tokens_seen": 1265364992 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031142427281845536, + "loss": 2.714, + "theoretical_loss": 3.5700742169699202, + "tokens_seen": 1265430528 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003114142427281846, + "loss": 2.6307, + "theoretical_loss": 3.5700573792517187, + "tokens_seen": 1265496064 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003114042126379137, + "loss": 2.5619, + "theoretical_loss": 3.570040542649604, + "tokens_seen": 1265561600 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031139418254764296, + "loss": 2.8593, + "theoretical_loss": 3.570023707163444, + "tokens_seen": 1265627136 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031138415245737214, + "loss": 2.5305, + "theoretical_loss": 3.5700068727931074, + "tokens_seen": 1265692672 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003113741223671013, + "loss": 2.7596, + "theoretical_loss": 3.569990039538462, + "tokens_seen": 1265758208 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003113640922768305, + "loss": 2.7541, + "theoretical_loss": 3.569973207399377, + "tokens_seen": 1265823744 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003113540621865597, + "loss": 2.6803, + "theoretical_loss": 3.5699563763757194, + "tokens_seen": 1265889280 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031134403209628886, + "loss": 2.9374, + "theoretical_loss": 3.5699395464673587, + "tokens_seen": 1265954816 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003113340020060181, + "loss": 2.5742, + "theoretical_loss": 3.569922717674163, + "tokens_seen": 1266020352 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003113239719157472, + "loss": 2.4029, + "theoretical_loss": 3.569905889996, + "tokens_seen": 1266085888 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031131394182547646, + "loss": 2.7946, + "theoretical_loss": 3.5698890634327385, + "tokens_seen": 1266151424 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003113039117352056, + "loss": 2.6271, + "theoretical_loss": 3.5698722379842476, + "tokens_seen": 1266216960 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003112938816449348, + "loss": 2.8781, + "theoretical_loss": 3.569855413650395, + "tokens_seen": 1266282496 + }, + { + "epoch": 4.02, + "learning_rate": 0.000311283851554664, + "loss": 2.6544, + "theoretical_loss": 3.569838590431049, + "tokens_seen": 1266348032 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003112738214643932, + "loss": 2.8127, + "theoretical_loss": 3.5698217683260784, + "tokens_seen": 1266413568 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1448419, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8012731075286865, + "objective/train/theoretical_loss": 3.569804947335353, + "objective/train/tokens_used": 1286939104, + "theoretical_loss": 3.569804947335353, + "tokens_seen": 1266479104 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031126379137412237, + "loss": 2.6329, + "theoretical_loss": 3.569804947335353, + "tokens_seen": 1266479104 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003112537612838516, + "loss": 2.7451, + "theoretical_loss": 3.5697881274587386, + "tokens_seen": 1266544640 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031124373119358073, + "loss": 2.5991, + "theoretical_loss": 3.5697713086961063, + "tokens_seen": 1266610176 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031123370110330996, + "loss": 2.6133, + "theoretical_loss": 3.5697544910473233, + "tokens_seen": 1266675712 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003112236710130391, + "loss": 2.5417, + "theoretical_loss": 3.569737674512259, + "tokens_seen": 1266741248 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003112136409227683, + "loss": 2.6459, + "theoretical_loss": 3.569720859090781, + "tokens_seen": 1266806784 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003112036108324975, + "loss": 2.6508, + "theoretical_loss": 3.569704044782759, + "tokens_seen": 1266872320 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003111935807422267, + "loss": 2.8711, + "theoretical_loss": 3.5696872315880612, + "tokens_seen": 1266937856 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031118355065195587, + "loss": 2.6125, + "theoretical_loss": 3.5696704195065565, + "tokens_seen": 1267003392 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031117352056168505, + "loss": 2.6055, + "theoretical_loss": 3.569653608538113, + "tokens_seen": 1267068928 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031116349047141423, + "loss": 2.6969, + "theoretical_loss": 3.5696367986826005, + "tokens_seen": 1267134464 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031115346038114347, + "loss": 2.6464, + "theoretical_loss": 3.569619989939887, + "tokens_seen": 1267200000 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003111434302908726, + "loss": 2.5475, + "theoretical_loss": 3.5696031823098413, + "tokens_seen": 1267265536 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031113340020060183, + "loss": 2.7453, + "theoretical_loss": 3.5695863757923325, + "tokens_seen": 1267331072 + }, + { + "epoch": 4.02, + "learning_rate": 0.000311123370110331, + "loss": 2.5926, + "theoretical_loss": 3.5695695703872294, + "tokens_seen": 1267396608 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003111133400200602, + "loss": 2.687, + "theoretical_loss": 3.5695527660944006, + "tokens_seen": 1267462144 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031110330992978937, + "loss": 2.6837, + "theoretical_loss": 3.569535962913715, + "tokens_seen": 1267527680 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031109327983951855, + "loss": 2.942, + "theoretical_loss": 3.5695191608450423, + "tokens_seen": 1267593216 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031108324974924773, + "loss": 2.4684, + "theoretical_loss": 3.5695023598882507, + "tokens_seen": 1267658752 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031107321965897697, + "loss": 2.6351, + "theoretical_loss": 3.5694855600432085, + "tokens_seen": 1267724288 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003110631895687061, + "loss": 2.4623, + "theoretical_loss": 3.569468761309786, + "tokens_seen": 1267789824 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031105315947843533, + "loss": 2.663, + "theoretical_loss": 3.5694519636878512, + "tokens_seen": 1267855360 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031104312938816446, + "loss": 2.5563, + "theoretical_loss": 3.569435167177274, + "tokens_seen": 1267920896 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003110330992978937, + "loss": 2.6268, + "theoretical_loss": 3.569418371777923, + "tokens_seen": 1267986432 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003110230692076229, + "loss": 2.6797, + "theoretical_loss": 3.5694015774896664, + "tokens_seen": 1268051968 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 1449162, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7662107944488525, + "objective/train/theoretical_loss": 3.5693847843123745, + "objective/train/tokens_used": 1288577504, + "theoretical_loss": 3.5693847843123745, + "tokens_seen": 1268117504 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031101303911735206, + "loss": 2.8176, + "theoretical_loss": 3.5693847843123745, + "tokens_seen": 1268117504 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031100300902708124, + "loss": 2.7783, + "theoretical_loss": 3.569367992245916, + "tokens_seen": 1268183040 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003109929789368104, + "loss": 2.613, + "theoretical_loss": 3.56935120129016, + "tokens_seen": 1268248576 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003109829488465396, + "loss": 2.6012, + "theoretical_loss": 3.5693344114449754, + "tokens_seen": 1268314112 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031097291875626883, + "loss": 2.6034, + "theoretical_loss": 3.5693176227102317, + "tokens_seen": 1268379648 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031096288866599796, + "loss": 2.8255, + "theoretical_loss": 3.569300835085798, + "tokens_seen": 1268445184 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003109528585757272, + "loss": 2.5776, + "theoretical_loss": 3.5692840485715434, + "tokens_seen": 1268510720 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003109428284854564, + "loss": 2.5631, + "theoretical_loss": 3.5692672631673372, + "tokens_seen": 1268576256 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031093279839518556, + "loss": 2.9934, + "theoretical_loss": 3.569250478873049, + "tokens_seen": 1268641792 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031092276830491474, + "loss": 2.7714, + "theoretical_loss": 3.5692336956885473, + "tokens_seen": 1268707328 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003109127382146439, + "loss": 2.7042, + "theoretical_loss": 3.569216913613702, + "tokens_seen": 1268772864 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003109027081243731, + "loss": 2.6858, + "theoretical_loss": 3.5692001326483824, + "tokens_seen": 1268838400 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031089267803410234, + "loss": 2.4343, + "theoretical_loss": 3.569183352792458, + "tokens_seen": 1268903936 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031088264794383147, + "loss": 2.6434, + "theoretical_loss": 3.569166574045797, + "tokens_seen": 1268969472 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003108726178535607, + "loss": 2.7497, + "theoretical_loss": 3.5691497964082703, + "tokens_seen": 1269035008 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031086258776328983, + "loss": 2.7075, + "theoretical_loss": 3.5691330198797466, + "tokens_seen": 1269100544 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031085255767301906, + "loss": 2.7228, + "theoretical_loss": 3.569116244460096, + "tokens_seen": 1269166080 + }, + { + "epoch": 4.02, + "learning_rate": 0.00031084252758274824, + "loss": 2.5925, + "theoretical_loss": 3.5690994701491863, + "tokens_seen": 1269231616 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003108324974924774, + "loss": 2.6519, + "theoretical_loss": 3.5690826969468885, + "tokens_seen": 1269297152 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003108224674022066, + "loss": 2.727, + "theoretical_loss": 3.5690659248530716, + "tokens_seen": 1269362688 + }, + { + "epoch": 4.02, + "learning_rate": 0.0003108124373119358, + "loss": 2.7352, + "theoretical_loss": 3.5690491538676055, + "tokens_seen": 1269428224 + }, + { + "epoch": 4.02, + "learning_rate": 0.000310802407221665, + "loss": 2.61, + "theoretical_loss": 3.5690323839903586, + "tokens_seen": 1269493760 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003107923771313942, + "loss": 2.6601, + "theoretical_loss": 3.5690156152212023, + "tokens_seen": 1269559296 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003107823470411234, + "loss": 2.7277, + "theoretical_loss": 3.568998847560004, + "tokens_seen": 1269624832 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031077231695085257, + "loss": 2.6063, + "theoretical_loss": 3.5689820810066353, + "tokens_seen": 1269690368 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1449667, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5712544918060303, + "objective/train/theoretical_loss": 3.568965315560965, + "objective/train/tokens_used": 1290215904, + "theoretical_loss": 3.568965315560965, + "tokens_seen": 1269755904 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003107622868605818, + "loss": 2.55, + "theoretical_loss": 3.568965315560965, + "tokens_seen": 1269755904 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031075225677031093, + "loss": 2.6712, + "theoretical_loss": 3.5689485512228623, + "tokens_seen": 1269821440 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031074222668004016, + "loss": 2.435, + "theoretical_loss": 3.568931787992198, + "tokens_seen": 1269886976 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003107321965897693, + "loss": 2.6075, + "theoretical_loss": 3.56891502586884, + "tokens_seen": 1269952512 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003107221664994985, + "loss": 2.5393, + "theoretical_loss": 3.5688982648526597, + "tokens_seen": 1270018048 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003107121364092277, + "loss": 2.5827, + "theoretical_loss": 3.5688815049435263, + "tokens_seen": 1270083584 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003107021063189569, + "loss": 2.6393, + "theoretical_loss": 3.5688647461413097, + "tokens_seen": 1270149120 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031069207622868607, + "loss": 2.7534, + "theoretical_loss": 3.5688479884458797, + "tokens_seen": 1270214656 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031068204613841525, + "loss": 2.713, + "theoretical_loss": 3.5688312318571054, + "tokens_seen": 1270280192 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031067201604814443, + "loss": 2.6002, + "theoretical_loss": 3.5688144763748575, + "tokens_seen": 1270345728 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031066198595787367, + "loss": 2.7751, + "theoretical_loss": 3.5687977219990055, + "tokens_seen": 1270411264 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003106519558676028, + "loss": 2.5598, + "theoretical_loss": 3.5687809687294187, + "tokens_seen": 1270476800 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031064192577733203, + "loss": 2.7079, + "theoretical_loss": 3.5687642165659685, + "tokens_seen": 1270542336 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003106318956870612, + "loss": 2.467, + "theoretical_loss": 3.5687474655085234, + "tokens_seen": 1270607872 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003106218655967904, + "loss": 2.8467, + "theoretical_loss": 3.5687307155569536, + "tokens_seen": 1270673408 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031061183550651957, + "loss": 2.9576, + "theoretical_loss": 3.56871396671113, + "tokens_seen": 1270738944 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031060180541624875, + "loss": 2.5267, + "theoretical_loss": 3.5686972189709216, + "tokens_seen": 1270804480 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031059177532597793, + "loss": 2.7555, + "theoretical_loss": 3.5686804723361982, + "tokens_seen": 1270870016 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031058174523570717, + "loss": 2.6888, + "theoretical_loss": 3.5686637268068306, + "tokens_seen": 1270935552 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003105717151454363, + "loss": 2.7137, + "theoretical_loss": 3.568646982382689, + "tokens_seen": 1271001088 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031056168505516553, + "loss": 2.8593, + "theoretical_loss": 3.5686302390636424, + "tokens_seen": 1271066624 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031055165496489466, + "loss": 2.7043, + "theoretical_loss": 3.568613496849562, + "tokens_seen": 1271132160 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003105416248746239, + "loss": 2.8863, + "theoretical_loss": 3.568596755740317, + "tokens_seen": 1271197696 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003105315947843531, + "loss": 2.6993, + "theoretical_loss": 3.5685800157357788, + "tokens_seen": 1271263232 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031052156469408226, + "loss": 2.6819, + "theoretical_loss": 3.5685632768358158, + "tokens_seen": 1271328768 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1450856, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6388139724731445, + "objective/train/theoretical_loss": 3.5685465390402995, + "objective/train/tokens_used": 1291854304, + "theoretical_loss": 3.5685465390402995, + "tokens_seen": 1271394304 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031051153460381144, + "loss": 2.5599, + "theoretical_loss": 3.5685465390402995, + "tokens_seen": 1271394304 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003105015045135406, + "loss": 2.6993, + "theoretical_loss": 3.5685298023491, + "tokens_seen": 1271459840 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003104914744232698, + "loss": 2.7917, + "theoretical_loss": 3.568513066762087, + "tokens_seen": 1271525376 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031048144433299904, + "loss": 2.7555, + "theoretical_loss": 3.5684963322791305, + "tokens_seen": 1271590912 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031047141424272816, + "loss": 2.6375, + "theoretical_loss": 3.568479598900102, + "tokens_seen": 1271656448 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003104613841524574, + "loss": 2.581, + "theoretical_loss": 3.5684628666248708, + "tokens_seen": 1271721984 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003104513540621866, + "loss": 2.6204, + "theoretical_loss": 3.568446135453307, + "tokens_seen": 1271787520 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031044132397191576, + "loss": 2.7401, + "theoretical_loss": 3.568429405385282, + "tokens_seen": 1271853056 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031043129388164494, + "loss": 2.6455, + "theoretical_loss": 3.5684126764206656, + "tokens_seen": 1271918592 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003104212637913741, + "loss": 2.6757, + "theoretical_loss": 3.5683959485593277, + "tokens_seen": 1271984128 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003104112337011033, + "loss": 2.7583, + "theoretical_loss": 3.5683792218011394, + "tokens_seen": 1272049664 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031040120361083254, + "loss": 2.589, + "theoretical_loss": 3.568362496145971, + "tokens_seen": 1272115200 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031039117352056167, + "loss": 2.8462, + "theoretical_loss": 3.5683457715936924, + "tokens_seen": 1272180736 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003103811434302909, + "loss": 2.777, + "theoretical_loss": 3.5683290481441747, + "tokens_seen": 1272246272 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031037111334002003, + "loss": 2.5992, + "theoretical_loss": 3.5683123257972884, + "tokens_seen": 1272311808 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031036108324974926, + "loss": 2.7123, + "theoretical_loss": 3.568295604552903, + "tokens_seen": 1272377344 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031035105315947844, + "loss": 2.9384, + "theoretical_loss": 3.5682788844108906, + "tokens_seen": 1272442880 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003103410230692076, + "loss": 2.812, + "theoretical_loss": 3.5682621653711206, + "tokens_seen": 1272508416 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003103309929789368, + "loss": 2.7367, + "theoretical_loss": 3.5682454474334637, + "tokens_seen": 1272573952 + }, + { + "epoch": 4.03, + "learning_rate": 0.000310320962888666, + "loss": 2.761, + "theoretical_loss": 3.568228730597791, + "tokens_seen": 1272639488 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031031093279839517, + "loss": 2.8438, + "theoretical_loss": 3.568212014863973, + "tokens_seen": 1272705024 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003103009027081244, + "loss": 3.026, + "theoretical_loss": 3.56819530023188, + "tokens_seen": 1272770560 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031029087261785353, + "loss": 2.6869, + "theoretical_loss": 3.568178586701383, + "tokens_seen": 1272836096 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031028084252758277, + "loss": 2.7515, + "theoretical_loss": 3.568161874272352, + "tokens_seen": 1272901632 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031027081243731195, + "loss": 2.6262, + "theoretical_loss": 3.568145162944659, + "tokens_seen": 1272967168 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1451655, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.884730577468872, + "objective/train/theoretical_loss": 3.568128452718174, + "objective/train/tokens_used": 1293492704, + "theoretical_loss": 3.568128452718174, + "tokens_seen": 1273032704 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031026078234704113, + "loss": 2.7017, + "theoretical_loss": 3.568128452718174, + "tokens_seen": 1273032704 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003102507522567703, + "loss": 2.817, + "theoretical_loss": 3.568111743592767, + "tokens_seen": 1273098240 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003102407221664995, + "loss": 2.6407, + "theoretical_loss": 3.5680950355683096, + "tokens_seen": 1273163776 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031023069207622867, + "loss": 2.6872, + "theoretical_loss": 3.5680783286446727, + "tokens_seen": 1273229312 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003102206619859579, + "loss": 2.9628, + "theoretical_loss": 3.568061622821727, + "tokens_seen": 1273294848 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031021063189568703, + "loss": 2.7832, + "theoretical_loss": 3.568044918099343, + "tokens_seen": 1273360384 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031020060180541627, + "loss": 2.6355, + "theoretical_loss": 3.5680282144773923, + "tokens_seen": 1273425920 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003101905717151454, + "loss": 2.9272, + "theoretical_loss": 3.5680115119557447, + "tokens_seen": 1273491456 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031018054162487463, + "loss": 2.7187, + "theoretical_loss": 3.5679948105342714, + "tokens_seen": 1273556992 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003101705115346038, + "loss": 2.4965, + "theoretical_loss": 3.5679781102128443, + "tokens_seen": 1273622528 + }, + { + "epoch": 4.03, + "learning_rate": 0.000310160481444333, + "loss": 2.6559, + "theoretical_loss": 3.5679614109913333, + "tokens_seen": 1273688064 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003101504513540622, + "loss": 2.5532, + "theoretical_loss": 3.5679447128696102, + "tokens_seen": 1273753600 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003101404212637914, + "loss": 2.6703, + "theoretical_loss": 3.567928015847545, + "tokens_seen": 1273819136 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031013039117352054, + "loss": 2.7403, + "theoretical_loss": 3.567911319925009, + "tokens_seen": 1273884672 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031012036108324977, + "loss": 2.8405, + "theoretical_loss": 3.567894625101874, + "tokens_seen": 1273950208 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003101103309929789, + "loss": 2.8008, + "theoretical_loss": 3.5678779313780105, + "tokens_seen": 1274015744 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031010030090270813, + "loss": 2.6953, + "theoretical_loss": 3.567861238753289, + "tokens_seen": 1274081280 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003100902708124373, + "loss": 2.7145, + "theoretical_loss": 3.5678445472275815, + "tokens_seen": 1274146816 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003100802407221665, + "loss": 2.7383, + "theoretical_loss": 3.567827856800759, + "tokens_seen": 1274212352 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003100702106318957, + "loss": 2.6356, + "theoretical_loss": 3.5678111674726924, + "tokens_seen": 1274277888 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031006018054162486, + "loss": 2.8844, + "theoretical_loss": 3.5677944792432528, + "tokens_seen": 1274343424 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003100501504513541, + "loss": 2.789, + "theoretical_loss": 3.5677777921123113, + "tokens_seen": 1274408960 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003100401203610833, + "loss": 2.8134, + "theoretical_loss": 3.5677611060797396, + "tokens_seen": 1274474496 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031003009027081246, + "loss": 2.5342, + "theoretical_loss": 3.5677444211454086, + "tokens_seen": 1274540032 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031002006018054164, + "loss": 2.6298, + "theoretical_loss": 3.5677277373091894, + "tokens_seen": 1274605568 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1452721, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9962716102600098, + "objective/train/theoretical_loss": 3.5677110545709536, + "objective/train/tokens_used": 1295131104, + "theoretical_loss": 3.5677110545709536, + "tokens_seen": 1274671104 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003100100300902708, + "loss": 2.7639, + "theoretical_loss": 3.5677110545709536, + "tokens_seen": 1274671104 + }, + { + "epoch": 4.03, + "learning_rate": 0.00031, + "loss": 2.8508, + "theoretical_loss": 3.5676943729305726, + "tokens_seen": 1274736640 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030998996990972924, + "loss": 2.6672, + "theoretical_loss": 3.567677692387917, + "tokens_seen": 1274802176 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030997993981945836, + "loss": 2.6414, + "theoretical_loss": 3.5676610129428585, + "tokens_seen": 1274867712 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003099699097291876, + "loss": 2.8492, + "theoretical_loss": 3.5676443345952693, + "tokens_seen": 1274933248 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003099598796389168, + "loss": 2.8881, + "theoretical_loss": 3.567627657345019, + "tokens_seen": 1274998784 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030994984954864596, + "loss": 2.8469, + "theoretical_loss": 3.567610981191981, + "tokens_seen": 1275064320 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030993981945837514, + "loss": 2.5649, + "theoretical_loss": 3.5675943061360256, + "tokens_seen": 1275129856 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003099297893681043, + "loss": 2.7739, + "theoretical_loss": 3.5675776321770236, + "tokens_seen": 1275195392 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003099197592778335, + "loss": 2.5517, + "theoretical_loss": 3.5675609593148483, + "tokens_seen": 1275260928 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030990972918756274, + "loss": 2.7727, + "theoretical_loss": 3.56754428754937, + "tokens_seen": 1275326464 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030989969909729187, + "loss": 2.8845, + "theoretical_loss": 3.5675276168804597, + "tokens_seen": 1275392000 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003098896690070211, + "loss": 2.5749, + "theoretical_loss": 3.5675109473079902, + "tokens_seen": 1275457536 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030987963891675023, + "loss": 2.7791, + "theoretical_loss": 3.5674942788318322, + "tokens_seen": 1275523072 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030986960882647946, + "loss": 2.5838, + "theoretical_loss": 3.567477611451858, + "tokens_seen": 1275588608 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030985957873620864, + "loss": 2.7737, + "theoretical_loss": 3.567460945167938, + "tokens_seen": 1275654144 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003098495486459378, + "loss": 2.6846, + "theoretical_loss": 3.567444279979945, + "tokens_seen": 1275719680 + }, + { + "epoch": 4.03, + "learning_rate": 0.000309839518555667, + "loss": 2.7138, + "theoretical_loss": 3.56742761588775, + "tokens_seen": 1275785216 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003098294884653962, + "loss": 2.6993, + "theoretical_loss": 3.5674109528912252, + "tokens_seen": 1275850752 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030981945837512537, + "loss": 2.8416, + "theoretical_loss": 3.567394290990242, + "tokens_seen": 1275916288 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003098094282848546, + "loss": 2.7672, + "theoretical_loss": 3.567377630184672, + "tokens_seen": 1275981824 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030979939819458373, + "loss": 2.9553, + "theoretical_loss": 3.567360970474387, + "tokens_seen": 1276047360 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030978936810431297, + "loss": 2.5491, + "theoretical_loss": 3.567344311859258, + "tokens_seen": 1276112896 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030977933801404215, + "loss": 2.9679, + "theoretical_loss": 3.567327654339158, + "tokens_seen": 1276178432 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030976930792377133, + "loss": 2.7076, + "theoretical_loss": 3.5673109979139586, + "tokens_seen": 1276243968 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1453859, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2308290004730225, + "objective/train/theoretical_loss": 3.567294342583531, + "objective/train/tokens_used": 1296769504, + "theoretical_loss": 3.567294342583531, + "tokens_seen": 1276309504 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003097592778335005, + "loss": 2.9256, + "theoretical_loss": 3.567294342583531, + "tokens_seen": 1276309504 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003097492477432297, + "loss": 2.9109, + "theoretical_loss": 3.5672776883477475, + "tokens_seen": 1276375040 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030973921765295887, + "loss": 2.7835, + "theoretical_loss": 3.5672610352064797, + "tokens_seen": 1276440576 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003097291875626881, + "loss": 2.6977, + "theoretical_loss": 3.5672443831595997, + "tokens_seen": 1276506112 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030971915747241723, + "loss": 2.9232, + "theoretical_loss": 3.567227732206979, + "tokens_seen": 1276571648 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030970912738214647, + "loss": 2.8935, + "theoretical_loss": 3.56721108234849, + "tokens_seen": 1276637184 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003096990972918756, + "loss": 3.0577, + "theoretical_loss": 3.5671944335840045, + "tokens_seen": 1276702720 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030968906720160483, + "loss": 2.892, + "theoretical_loss": 3.567177785913394, + "tokens_seen": 1276768256 + }, + { + "epoch": 4.03, + "learning_rate": 0.000309679037111334, + "loss": 2.8298, + "theoretical_loss": 3.5671611393365312, + "tokens_seen": 1276833792 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003096690070210632, + "loss": 2.7139, + "theoretical_loss": 3.567144493853288, + "tokens_seen": 1276899328 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003096589769307924, + "loss": 2.8356, + "theoretical_loss": 3.567127849463536, + "tokens_seen": 1276964864 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003096489468405216, + "loss": 2.8401, + "theoretical_loss": 3.5671112061671475, + "tokens_seen": 1277030400 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030963891675025074, + "loss": 2.8479, + "theoretical_loss": 3.5670945639639946, + "tokens_seen": 1277095936 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030962888665997997, + "loss": 2.8155, + "theoretical_loss": 3.567077922853949, + "tokens_seen": 1277161472 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003096188565697091, + "loss": 2.6403, + "theoretical_loss": 3.567061282836884, + "tokens_seen": 1277227008 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030960882647943833, + "loss": 2.7317, + "theoretical_loss": 3.56704464391267, + "tokens_seen": 1277292544 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003095987963891675, + "loss": 2.6416, + "theoretical_loss": 3.5670280060811805, + "tokens_seen": 1277358080 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003095887662988967, + "loss": 2.8647, + "theoretical_loss": 3.5670113693422874, + "tokens_seen": 1277423616 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003095787362086259, + "loss": 2.7733, + "theoretical_loss": 3.5669947336958625, + "tokens_seen": 1277489152 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030956870611835506, + "loss": 2.9185, + "theoretical_loss": 3.566978099141778, + "tokens_seen": 1277554688 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030955867602808424, + "loss": 2.8959, + "theoretical_loss": 3.566961465679907, + "tokens_seen": 1277620224 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003095486459378135, + "loss": 2.9382, + "theoretical_loss": 3.566944833310121, + "tokens_seen": 1277685760 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003095386158475426, + "loss": 2.7745, + "theoretical_loss": 3.566928202032292, + "tokens_seen": 1277751296 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030952858575727184, + "loss": 2.6371, + "theoretical_loss": 3.566911571846293, + "tokens_seen": 1277816832 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030951855566700096, + "loss": 2.8015, + "theoretical_loss": 3.566894942751997, + "tokens_seen": 1277882368 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1454164, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9620399475097656, + "objective/train/theoretical_loss": 3.5668783147492746, + "objective/train/tokens_used": 1298407904, + "theoretical_loss": 3.5668783147492746, + "tokens_seen": 1277947904 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003095085255767302, + "loss": 2.9085, + "theoretical_loss": 3.5668783147492746, + "tokens_seen": 1277947904 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003094984954864594, + "loss": 2.9253, + "theoretical_loss": 3.5668616878379993, + "tokens_seen": 1278013440 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030948846539618856, + "loss": 2.6875, + "theoretical_loss": 3.5668450620180425, + "tokens_seen": 1278078976 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030947843530591774, + "loss": 2.8784, + "theoretical_loss": 3.566828437289278, + "tokens_seen": 1278144512 + }, + { + "epoch": 4.03, + "learning_rate": 0.000309468405215647, + "loss": 2.7315, + "theoretical_loss": 3.5668118136515776, + "tokens_seen": 1278210048 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003094583751253761, + "loss": 2.7856, + "theoretical_loss": 3.5667951911048137, + "tokens_seen": 1278275584 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030944834503510534, + "loss": 2.8026, + "theoretical_loss": 3.5667785696488585, + "tokens_seen": 1278341120 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030943831494483447, + "loss": 2.7517, + "theoretical_loss": 3.566761949283585, + "tokens_seen": 1278406656 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003094282848545637, + "loss": 2.9017, + "theoretical_loss": 3.5667453300088656, + "tokens_seen": 1278472192 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003094182547642929, + "loss": 2.6789, + "theoretical_loss": 3.566728711824573, + "tokens_seen": 1278537728 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030940822467402207, + "loss": 2.7979, + "theoretical_loss": 3.5667120947305793, + "tokens_seen": 1278603264 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030939819458375125, + "loss": 2.7914, + "theoretical_loss": 3.5666954787267575, + "tokens_seen": 1278668800 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030938816449348043, + "loss": 2.665, + "theoretical_loss": 3.56667886381298, + "tokens_seen": 1278734336 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003093781344032096, + "loss": 2.9661, + "theoretical_loss": 3.566662249989119, + "tokens_seen": 1278799872 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030936810431293884, + "loss": 2.9095, + "theoretical_loss": 3.566645637255048, + "tokens_seen": 1278865408 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030935807422266797, + "loss": 2.8724, + "theoretical_loss": 3.5666290256106397, + "tokens_seen": 1278930944 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003093480441323972, + "loss": 2.8794, + "theoretical_loss": 3.566612415055766, + "tokens_seen": 1278996480 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030933801404212633, + "loss": 2.7036, + "theoretical_loss": 3.5665958055903, + "tokens_seen": 1279062016 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030932798395185557, + "loss": 2.6173, + "theoretical_loss": 3.5665791972141148, + "tokens_seen": 1279127552 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030931795386158475, + "loss": 2.891, + "theoretical_loss": 3.5665625899270825, + "tokens_seen": 1279193088 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030930792377131393, + "loss": 3.0057, + "theoretical_loss": 3.5665459837290765, + "tokens_seen": 1279258624 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030929789368104317, + "loss": 2.7406, + "theoretical_loss": 3.566529378619969, + "tokens_seen": 1279324160 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030928786359077235, + "loss": 2.5874, + "theoretical_loss": 3.5665127745996332, + "tokens_seen": 1279389696 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030927783350050153, + "loss": 2.8983, + "theoretical_loss": 3.566496171667942, + "tokens_seen": 1279455232 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003092678034102307, + "loss": 2.8011, + "theoretical_loss": 3.566479569824768, + "tokens_seen": 1279520768 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1454164, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.791790723800659, + "objective/train/theoretical_loss": 3.566462969069984, + "objective/train/tokens_used": 1300046304, + "theoretical_loss": 3.566462969069984, + "tokens_seen": 1279586304 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003092577733199599, + "loss": 3.0594, + "theoretical_loss": 3.566462969069984, + "tokens_seen": 1279586304 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030924774322968907, + "loss": 2.856, + "theoretical_loss": 3.5664463694034634, + "tokens_seen": 1279651840 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003092377131394183, + "loss": 2.4826, + "theoretical_loss": 3.5664297708250787, + "tokens_seen": 1279717376 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030922768304914743, + "loss": 2.989, + "theoretical_loss": 3.5664131733347033, + "tokens_seen": 1279782912 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030921765295887667, + "loss": 2.8264, + "theoretical_loss": 3.5663965769322097, + "tokens_seen": 1279848448 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003092076228686058, + "loss": 2.9237, + "theoretical_loss": 3.5663799816174713, + "tokens_seen": 1279913984 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030919759277833503, + "loss": 2.7125, + "theoretical_loss": 3.5663633873903606, + "tokens_seen": 1279979520 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003091875626880642, + "loss": 2.7873, + "theoretical_loss": 3.566346794250751, + "tokens_seen": 1280045056 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003091775325977934, + "loss": 2.9169, + "theoretical_loss": 3.5663302021985155, + "tokens_seen": 1280110592 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003091675025075226, + "loss": 3.0449, + "theoretical_loss": 3.5663136112335274, + "tokens_seen": 1280176128 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003091574724172518, + "loss": 3.1691, + "theoretical_loss": 3.566297021355659, + "tokens_seen": 1280241664 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030914744232698094, + "loss": 2.9161, + "theoretical_loss": 3.5662804325647848, + "tokens_seen": 1280307200 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030913741223671017, + "loss": 2.8993, + "theoretical_loss": 3.5662638448607766, + "tokens_seen": 1280372736 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003091273821464393, + "loss": 3.1612, + "theoretical_loss": 3.566247258243508, + "tokens_seen": 1280438272 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030911735205616853, + "loss": 2.7969, + "theoretical_loss": 3.5662306727128525, + "tokens_seen": 1280503808 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003091073219658977, + "loss": 2.802, + "theoretical_loss": 3.5662140882686835, + "tokens_seen": 1280569344 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003090972918756269, + "loss": 2.9697, + "theoretical_loss": 3.5661975049108734, + "tokens_seen": 1280634880 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003090872617853561, + "loss": 2.8291, + "theoretical_loss": 3.5661809226392958, + "tokens_seen": 1280700416 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030907723169508526, + "loss": 2.9147, + "theoretical_loss": 3.566164341453824, + "tokens_seen": 1280765952 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030906720160481444, + "loss": 3.0401, + "theoretical_loss": 3.5661477613543315, + "tokens_seen": 1280831488 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003090571715145437, + "loss": 3.0241, + "theoretical_loss": 3.5661311823406914, + "tokens_seen": 1280897024 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003090471414242728, + "loss": 2.6549, + "theoretical_loss": 3.5661146044127774, + "tokens_seen": 1280962560 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030903711133400204, + "loss": 2.9967, + "theoretical_loss": 3.566098027570462, + "tokens_seen": 1281028096 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030902708124373116, + "loss": 2.6381, + "theoretical_loss": 3.5660814518136195, + "tokens_seen": 1281093632 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003090170511534604, + "loss": 2.6968, + "theoretical_loss": 3.566064877142123, + "tokens_seen": 1281159168 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1454920, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6659634113311768, + "objective/train/theoretical_loss": 3.5660483035558457, + "objective/train/tokens_used": 1301684704, + "theoretical_loss": 3.5660483035558457, + "tokens_seen": 1281224704 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003090070210631896, + "loss": 2.4981, + "theoretical_loss": 3.5660483035558457, + "tokens_seen": 1281224704 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030899699097291876, + "loss": 2.7639, + "theoretical_loss": 3.566031731054661, + "tokens_seen": 1281290240 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030898696088264794, + "loss": 2.7521, + "theoretical_loss": 3.566015159638442, + "tokens_seen": 1281355776 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003089769307923772, + "loss": 2.7621, + "theoretical_loss": 3.565998589307064, + "tokens_seen": 1281421312 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003089669007021063, + "loss": 3.0341, + "theoretical_loss": 3.5659820200603987, + "tokens_seen": 1281486848 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030895687061183554, + "loss": 2.8547, + "theoretical_loss": 3.56596545189832, + "tokens_seen": 1281552384 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030894684052156467, + "loss": 2.8013, + "theoretical_loss": 3.5659488848207013, + "tokens_seen": 1281617920 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003089368104312939, + "loss": 2.8409, + "theoretical_loss": 3.565932318827417, + "tokens_seen": 1281683456 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003089267803410231, + "loss": 2.8523, + "theoretical_loss": 3.5659157539183397, + "tokens_seen": 1281748992 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030891675025075227, + "loss": 3.1319, + "theoretical_loss": 3.5658991900933437, + "tokens_seen": 1281814528 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030890672016048145, + "loss": 2.5981, + "theoretical_loss": 3.5658826273523028, + "tokens_seen": 1281880064 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030889669007021063, + "loss": 2.8412, + "theoretical_loss": 3.56586606569509, + "tokens_seen": 1281945600 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003088866599799398, + "loss": 2.7512, + "theoretical_loss": 3.565849505121579, + "tokens_seen": 1282011136 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030887662988966904, + "loss": 2.7909, + "theoretical_loss": 3.5658329456316435, + "tokens_seen": 1282076672 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030886659979939817, + "loss": 2.9902, + "theoretical_loss": 3.5658163872251576, + "tokens_seen": 1282142208 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003088565697091274, + "loss": 3.0112, + "theoretical_loss": 3.5657998299019953, + "tokens_seen": 1282207744 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030884653961885653, + "loss": 2.8936, + "theoretical_loss": 3.5657832736620296, + "tokens_seen": 1282273280 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030883650952858577, + "loss": 2.9248, + "theoretical_loss": 3.5657667185051354, + "tokens_seen": 1282338816 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030882647943831495, + "loss": 2.8855, + "theoretical_loss": 3.5657501644311846, + "tokens_seen": 1282404352 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030881644934804413, + "loss": 2.9276, + "theoretical_loss": 3.5657336114400526, + "tokens_seen": 1282469888 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003088064192577733, + "loss": 2.8715, + "theoretical_loss": 3.5657170595316128, + "tokens_seen": 1282535424 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030879638916750255, + "loss": 2.7902, + "theoretical_loss": 3.565700508705739, + "tokens_seen": 1282600960 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003087863590772317, + "loss": 2.809, + "theoretical_loss": 3.565683958962305, + "tokens_seen": 1282666496 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003087763289869609, + "loss": 2.7277, + "theoretical_loss": 3.5656674103011854, + "tokens_seen": 1282732032 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030876629889669004, + "loss": 2.9848, + "theoretical_loss": 3.5656508627222534, + "tokens_seen": 1282797568 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1455734, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.738227367401123, + "objective/train/theoretical_loss": 3.5656343162253825, + "objective/train/tokens_used": 1303323104, + "theoretical_loss": 3.5656343162253825, + "tokens_seen": 1282863104 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030875626880641927, + "loss": 2.8359, + "theoretical_loss": 3.5656343162253825, + "tokens_seen": 1282863104 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030874623871614845, + "loss": 2.641, + "theoretical_loss": 3.565617770810448, + "tokens_seen": 1282928640 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030873620862587763, + "loss": 2.9164, + "theoretical_loss": 3.5656012264773227, + "tokens_seen": 1282994176 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003087261785356068, + "loss": 2.7244, + "theoretical_loss": 3.5655846832258815, + "tokens_seen": 1283059712 + }, + { + "epoch": 4.03, + "learning_rate": 0.000308716148445336, + "loss": 3.0738, + "theoretical_loss": 3.565568141055998, + "tokens_seen": 1283125248 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003087061183550652, + "loss": 2.7018, + "theoretical_loss": 3.565551599967546, + "tokens_seen": 1283190784 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003086960882647944, + "loss": 2.5522, + "theoretical_loss": 3.5655350599604, + "tokens_seen": 1283256320 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030868605817452354, + "loss": 2.6565, + "theoretical_loss": 3.5655185210344333, + "tokens_seen": 1283321856 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003086760280842528, + "loss": 2.8782, + "theoretical_loss": 3.5655019831895216, + "tokens_seen": 1283387392 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003086659979939819, + "loss": 2.9636, + "theoretical_loss": 3.5654854464255377, + "tokens_seen": 1283452928 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030865596790371114, + "loss": 2.8601, + "theoretical_loss": 3.5654689107423563, + "tokens_seen": 1283518464 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003086459378134403, + "loss": 2.5827, + "theoretical_loss": 3.5654523761398513, + "tokens_seen": 1283584000 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003086359077231695, + "loss": 3.0475, + "theoretical_loss": 3.565435842617897, + "tokens_seen": 1283649536 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003086258776328987, + "loss": 2.9249, + "theoretical_loss": 3.5654193101763685, + "tokens_seen": 1283715072 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003086158475426279, + "loss": 2.855, + "theoretical_loss": 3.5654027788151383, + "tokens_seen": 1283780608 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030860581745235704, + "loss": 3.0105, + "theoretical_loss": 3.5653862485340824, + "tokens_seen": 1283846144 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003085957873620863, + "loss": 2.9229, + "theoretical_loss": 3.5653697193330736, + "tokens_seen": 1283911680 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003085857572718154, + "loss": 2.8418, + "theoretical_loss": 3.565353191211987, + "tokens_seen": 1283977216 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030857572718154464, + "loss": 2.8892, + "theoretical_loss": 3.565336664170697, + "tokens_seen": 1284042752 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003085656970912738, + "loss": 2.7801, + "theoretical_loss": 3.565320138209078, + "tokens_seen": 1284108288 + }, + { + "epoch": 4.03, + "learning_rate": 0.000308555667001003, + "loss": 2.8704, + "theoretical_loss": 3.5653036133270035, + "tokens_seen": 1284173824 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030854563691073224, + "loss": 2.884, + "theoretical_loss": 3.565287089524349, + "tokens_seen": 1284239360 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030853560682046137, + "loss": 2.8085, + "theoretical_loss": 3.5652705668009883, + "tokens_seen": 1284304896 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003085255767301906, + "loss": 2.5732, + "theoretical_loss": 3.565254045156796, + "tokens_seen": 1284370432 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003085155466399198, + "loss": 2.8816, + "theoretical_loss": 3.5652375245916463, + "tokens_seen": 1284435968 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1456993, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8912301063537598, + "objective/train/theoretical_loss": 3.565221005105414, + "objective/train/tokens_used": 1304961504, + "theoretical_loss": 3.565221005105414, + "tokens_seen": 1284501504 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030850551654964896, + "loss": 2.9473, + "theoretical_loss": 3.565221005105414, + "tokens_seen": 1284501504 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030849548645937814, + "loss": 2.8586, + "theoretical_loss": 3.5652044866979735, + "tokens_seen": 1284567040 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003084854563691074, + "loss": 2.6153, + "theoretical_loss": 3.5651879693692, + "tokens_seen": 1284632576 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003084754262788365, + "loss": 2.9459, + "theoretical_loss": 3.565171453118966, + "tokens_seen": 1284698112 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030846539618856574, + "loss": 2.7047, + "theoretical_loss": 3.5651549379471486, + "tokens_seen": 1284763648 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030845536609829487, + "loss": 2.8943, + "theoretical_loss": 3.565138423853621, + "tokens_seen": 1284829184 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003084453360080241, + "loss": 2.6728, + "theoretical_loss": 3.5651219108382577, + "tokens_seen": 1284894720 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003084353059177533, + "loss": 2.8053, + "theoretical_loss": 3.5651053989009336, + "tokens_seen": 1284960256 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030842527582748247, + "loss": 2.7473, + "theoretical_loss": 3.5650888880415237, + "tokens_seen": 1285025792 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030841524573721165, + "loss": 2.9395, + "theoretical_loss": 3.5650723782599023, + "tokens_seen": 1285091328 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030840521564694083, + "loss": 2.6765, + "theoretical_loss": 3.565055869555944, + "tokens_seen": 1285156864 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030839518555667, + "loss": 2.8818, + "theoretical_loss": 3.5650393619295233, + "tokens_seen": 1285222400 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030838515546639924, + "loss": 2.7072, + "theoretical_loss": 3.5650228553805157, + "tokens_seen": 1285287936 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030837512537612837, + "loss": 2.7556, + "theoretical_loss": 3.5650063499087956, + "tokens_seen": 1285353472 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003083650952858576, + "loss": 3.0278, + "theoretical_loss": 3.564989845514237, + "tokens_seen": 1285419008 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030835506519558673, + "loss": 2.9658, + "theoretical_loss": 3.564973342196716, + "tokens_seen": 1285484544 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030834503510531597, + "loss": 3.0249, + "theoretical_loss": 3.5649568399561065, + "tokens_seen": 1285550080 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030833500501504515, + "loss": 2.7389, + "theoretical_loss": 3.564940338792284, + "tokens_seen": 1285615616 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030832497492477433, + "loss": 2.589, + "theoretical_loss": 3.5649238387051225, + "tokens_seen": 1285681152 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003083149448345035, + "loss": 2.8527, + "theoretical_loss": 3.5649073396944972, + "tokens_seen": 1285746688 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030830491474423275, + "loss": 2.6517, + "theoretical_loss": 3.564890841760284, + "tokens_seen": 1285812224 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003082948846539619, + "loss": 2.85, + "theoretical_loss": 3.564874344902356, + "tokens_seen": 1285877760 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003082848545636911, + "loss": 2.8013, + "theoretical_loss": 3.56485784912059, + "tokens_seen": 1285943296 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030827482447342024, + "loss": 2.8096, + "theoretical_loss": 3.564841354414859, + "tokens_seen": 1286008832 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030826479438314947, + "loss": 2.8349, + "theoretical_loss": 3.5648248607850395, + "tokens_seen": 1286074368 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1457561, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.149444818496704, + "objective/train/theoretical_loss": 3.5648083682310063, + "objective/train/tokens_used": 1306599904, + "theoretical_loss": 3.5648083682310063, + "tokens_seen": 1286139904 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030825476429287865, + "loss": 2.9877, + "theoretical_loss": 3.5648083682310063, + "tokens_seen": 1286139904 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030824473420260783, + "loss": 2.9448, + "theoretical_loss": 3.5647918767526336, + "tokens_seen": 1286205440 + }, + { + "epoch": 4.03, + "learning_rate": 0.000308234704112337, + "loss": 2.8879, + "theoretical_loss": 3.5647753863497975, + "tokens_seen": 1286270976 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003082246740220662, + "loss": 2.8146, + "theoretical_loss": 3.5647588970223723, + "tokens_seen": 1286336512 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003082146439317954, + "loss": 2.9556, + "theoretical_loss": 3.5647424087702335, + "tokens_seen": 1286402048 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003082046138415246, + "loss": 2.9042, + "theoretical_loss": 3.564725921593256, + "tokens_seen": 1286467584 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030819458375125374, + "loss": 2.7621, + "theoretical_loss": 3.5647094354913147, + "tokens_seen": 1286533120 + }, + { + "epoch": 4.03, + "learning_rate": 0.000308184553660983, + "loss": 2.7691, + "theoretical_loss": 3.564692950464285, + "tokens_seen": 1286598656 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003081745235707121, + "loss": 2.7588, + "theoretical_loss": 3.5646764665120427, + "tokens_seen": 1286664192 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030816449348044134, + "loss": 2.8401, + "theoretical_loss": 3.5646599836344617, + "tokens_seen": 1286729728 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003081544633901705, + "loss": 2.8366, + "theoretical_loss": 3.564643501831418, + "tokens_seen": 1286795264 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003081444332998997, + "loss": 2.7475, + "theoretical_loss": 3.564627021102787, + "tokens_seen": 1286860800 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003081344032096289, + "loss": 2.7572, + "theoretical_loss": 3.5646105414484435, + "tokens_seen": 1286926336 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003081243731193581, + "loss": 2.779, + "theoretical_loss": 3.5645940628682626, + "tokens_seen": 1286991872 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030811434302908724, + "loss": 2.9851, + "theoretical_loss": 3.5645775853621204, + "tokens_seen": 1287057408 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003081043129388165, + "loss": 2.7908, + "theoretical_loss": 3.564561108929891, + "tokens_seen": 1287122944 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003080942828485456, + "loss": 2.9733, + "theoretical_loss": 3.564544633571451, + "tokens_seen": 1287188480 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030808425275827484, + "loss": 2.7524, + "theoretical_loss": 3.5645281592866755, + "tokens_seen": 1287254016 + }, + { + "epoch": 4.03, + "learning_rate": 0.000308074222668004, + "loss": 2.7138, + "theoretical_loss": 3.5645116860754387, + "tokens_seen": 1287319552 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003080641925777332, + "loss": 2.8349, + "theoretical_loss": 3.564495213937618, + "tokens_seen": 1287385088 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003080541624874624, + "loss": 2.5351, + "theoretical_loss": 3.564478742873087, + "tokens_seen": 1287450624 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030804413239719157, + "loss": 2.5995, + "theoretical_loss": 3.5644622728817215, + "tokens_seen": 1287516160 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030803410230692075, + "loss": 2.8158, + "theoretical_loss": 3.564445803963398, + "tokens_seen": 1287581696 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030802407221665, + "loss": 2.7517, + "theoretical_loss": 3.564429336117991, + "tokens_seen": 1287647232 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003080140421263791, + "loss": 2.9679, + "theoretical_loss": 3.564412869345376, + "tokens_seen": 1287712768 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1459012, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3446168899536133, + "objective/train/theoretical_loss": 3.5643964036454294, + "objective/train/tokens_used": 1308238304, + "theoretical_loss": 3.5643964036454294, + "tokens_seen": 1287778304 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030800401203610834, + "loss": 2.78, + "theoretical_loss": 3.5643964036454294, + "tokens_seen": 1287778304 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003079939819458375, + "loss": 2.8933, + "theoretical_loss": 3.5643799390180257, + "tokens_seen": 1287843840 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003079839518555667, + "loss": 2.6891, + "theoretical_loss": 3.564363475463041, + "tokens_seen": 1287909376 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003079739217652959, + "loss": 2.7915, + "theoretical_loss": 3.5643470129803507, + "tokens_seen": 1287974912 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030796389167502507, + "loss": 2.6668, + "theoretical_loss": 3.5643305515698307, + "tokens_seen": 1288040448 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030795386158475425, + "loss": 2.6905, + "theoretical_loss": 3.5643140912313562, + "tokens_seen": 1288105984 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003079438314944835, + "loss": 2.9587, + "theoretical_loss": 3.5642976319648034, + "tokens_seen": 1288171520 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003079338014042126, + "loss": 2.9989, + "theoretical_loss": 3.5642811737700475, + "tokens_seen": 1288237056 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030792377131394185, + "loss": 2.8329, + "theoretical_loss": 3.5642647166469636, + "tokens_seen": 1288302592 + }, + { + "epoch": 4.03, + "learning_rate": 0.000307913741223671, + "loss": 2.7683, + "theoretical_loss": 3.564248260595429, + "tokens_seen": 1288368128 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003079037111334002, + "loss": 2.8708, + "theoretical_loss": 3.5642318056153184, + "tokens_seen": 1288433664 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003078936810431294, + "loss": 2.6001, + "theoretical_loss": 3.5642153517065074, + "tokens_seen": 1288499200 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030788365095285857, + "loss": 2.8064, + "theoretical_loss": 3.564198898868872, + "tokens_seen": 1288564736 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030787362086258775, + "loss": 2.7796, + "theoretical_loss": 3.5641824471022883, + "tokens_seen": 1288630272 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030786359077231693, + "loss": 2.6772, + "theoretical_loss": 3.564165996406632, + "tokens_seen": 1288695808 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003078535606820461, + "loss": 3.1331, + "theoretical_loss": 3.5641495467817785, + "tokens_seen": 1288761344 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030784353059177535, + "loss": 2.9419, + "theoretical_loss": 3.564133098227604, + "tokens_seen": 1288826880 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003078335005015045, + "loss": 2.7909, + "theoretical_loss": 3.5641166507439843, + "tokens_seen": 1288892416 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003078234704112337, + "loss": 2.5918, + "theoretical_loss": 3.5641002043307957, + "tokens_seen": 1288957952 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003078134403209629, + "loss": 2.8002, + "theoretical_loss": 3.564083758987913, + "tokens_seen": 1289023488 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003078034102306921, + "loss": 3.0434, + "theoretical_loss": 3.5640673147152135, + "tokens_seen": 1289089024 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003077933801404213, + "loss": 2.5849, + "theoretical_loss": 3.5640508715125723, + "tokens_seen": 1289154560 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030778335005015044, + "loss": 2.9531, + "theoretical_loss": 3.5640344293798654, + "tokens_seen": 1289220096 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030777331995987967, + "loss": 2.8258, + "theoretical_loss": 3.5640179883169694, + "tokens_seen": 1289285632 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030776328986960885, + "loss": 2.7369, + "theoretical_loss": 3.5640015483237595, + "tokens_seen": 1289351168 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1461873, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.972330093383789, + "objective/train/theoretical_loss": 3.5639851094001127, + "objective/train/tokens_used": 1309876704, + "theoretical_loss": 3.5639851094001127, + "tokens_seen": 1289416704 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030775325977933803, + "loss": 2.8289, + "theoretical_loss": 3.5639851094001127, + "tokens_seen": 1289416704 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003077432296890672, + "loss": 2.6794, + "theoretical_loss": 3.5639686715459042, + "tokens_seen": 1289482240 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003077331995987964, + "loss": 2.9203, + "theoretical_loss": 3.56395223476101, + "tokens_seen": 1289547776 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003077231695085256, + "loss": 2.7811, + "theoretical_loss": 3.563935799045307, + "tokens_seen": 1289613312 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003077131394182548, + "loss": 2.8663, + "theoretical_loss": 3.5639193643986706, + "tokens_seen": 1289678848 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030770310932798394, + "loss": 2.8262, + "theoretical_loss": 3.5639029308209778, + "tokens_seen": 1289744384 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003076930792377132, + "loss": 2.7288, + "theoretical_loss": 3.5638864983121037, + "tokens_seen": 1289809920 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003076830491474423, + "loss": 2.7878, + "theoretical_loss": 3.5638700668719254, + "tokens_seen": 1289875456 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030767301905717154, + "loss": 3.0314, + "theoretical_loss": 3.5638536365003186, + "tokens_seen": 1289940992 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003076629889669007, + "loss": 2.5894, + "theoretical_loss": 3.5638372071971594, + "tokens_seen": 1290006528 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003076529588766299, + "loss": 2.6758, + "theoretical_loss": 3.5638207789623246, + "tokens_seen": 1290072064 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003076429287863591, + "loss": 2.7737, + "theoretical_loss": 3.5638043517956897, + "tokens_seen": 1290137600 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003076328986960883, + "loss": 2.7539, + "theoretical_loss": 3.563787925697132, + "tokens_seen": 1290203136 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030762286860581744, + "loss": 2.8386, + "theoretical_loss": 3.563771500666527, + "tokens_seen": 1290268672 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003076128385155467, + "loss": 2.6868, + "theoretical_loss": 3.5637550767037514, + "tokens_seen": 1290334208 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003076028084252758, + "loss": 2.832, + "theoretical_loss": 3.563738653808681, + "tokens_seen": 1290399744 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030759277833500504, + "loss": 2.8009, + "theoretical_loss": 3.5637222319811928, + "tokens_seen": 1290465280 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003075827482447342, + "loss": 2.6638, + "theoretical_loss": 3.563705811221163, + "tokens_seen": 1290530816 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003075727181544634, + "loss": 2.754, + "theoretical_loss": 3.5636893915284675, + "tokens_seen": 1290596352 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003075626880641926, + "loss": 2.8253, + "theoretical_loss": 3.563672972902984, + "tokens_seen": 1290661888 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030755265797392177, + "loss": 2.7206, + "theoretical_loss": 3.563656555344588, + "tokens_seen": 1290727424 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030754262788365095, + "loss": 2.8326, + "theoretical_loss": 3.563640138853156, + "tokens_seen": 1290792960 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003075325977933802, + "loss": 2.8658, + "theoretical_loss": 3.5636237234285644, + "tokens_seen": 1290858496 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003075225677031093, + "loss": 2.9978, + "theoretical_loss": 3.56360730907069, + "tokens_seen": 1290924032 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030751253761283854, + "loss": 2.8136, + "theoretical_loss": 3.5635908957794094, + "tokens_seen": 1290989568 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1467150, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8942673206329346, + "objective/train/theoretical_loss": 3.563574483554599, + "objective/train/tokens_used": 1311515104, + "theoretical_loss": 3.563574483554599, + "tokens_seen": 1291055104 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003075025075225677, + "loss": 2.7327, + "theoretical_loss": 3.563574483554599, + "tokens_seen": 1291055104 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003074924774322969, + "loss": 2.8427, + "theoretical_loss": 3.5635580723961353, + "tokens_seen": 1291120640 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003074824473420261, + "loss": 2.8505, + "theoretical_loss": 3.563541662303895, + "tokens_seen": 1291186176 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030747241725175527, + "loss": 2.923, + "theoretical_loss": 3.563525253277755, + "tokens_seen": 1291251712 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030746238716148445, + "loss": 2.7578, + "theoretical_loss": 3.5635088453175916, + "tokens_seen": 1291317248 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003074523570712137, + "loss": 2.9013, + "theoretical_loss": 3.563492438423281, + "tokens_seen": 1291382784 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003074423269809428, + "loss": 2.6727, + "theoretical_loss": 3.563476032594701, + "tokens_seen": 1291448320 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030743229689067205, + "loss": 2.8296, + "theoretical_loss": 3.563459627831727, + "tokens_seen": 1291513856 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003074222668004012, + "loss": 2.6678, + "theoretical_loss": 3.563443224134237, + "tokens_seen": 1291579392 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003074122367101304, + "loss": 2.6133, + "theoretical_loss": 3.563426821502107, + "tokens_seen": 1291644928 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003074022066198596, + "loss": 2.7212, + "theoretical_loss": 3.563410419935214, + "tokens_seen": 1291710464 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030739217652958877, + "loss": 2.7894, + "theoretical_loss": 3.563394019433434, + "tokens_seen": 1291776000 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030738214643931795, + "loss": 2.9254, + "theoretical_loss": 3.563377619996645, + "tokens_seen": 1291841536 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030737211634904713, + "loss": 3.0763, + "theoretical_loss": 3.5633612216247235, + "tokens_seen": 1291907072 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003073620862587763, + "loss": 2.9024, + "theoretical_loss": 3.563344824317546, + "tokens_seen": 1291972608 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030735205616850555, + "loss": 3.1284, + "theoretical_loss": 3.5633284280749895, + "tokens_seen": 1292038144 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003073420260782347, + "loss": 2.679, + "theoretical_loss": 3.563312032896931, + "tokens_seen": 1292103680 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003073319959879639, + "loss": 2.7707, + "theoretical_loss": 3.563295638783247, + "tokens_seen": 1292169216 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003073219658976931, + "loss": 2.778, + "theoretical_loss": 3.5632792457338147, + "tokens_seen": 1292234752 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003073119358074223, + "loss": 2.8644, + "theoretical_loss": 3.563262853748511, + "tokens_seen": 1292300288 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030730190571715146, + "loss": 2.8502, + "theoretical_loss": 3.5632464628272134, + "tokens_seen": 1292365824 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030729187562688064, + "loss": 2.6877, + "theoretical_loss": 3.563230072969798, + "tokens_seen": 1292431360 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003072818455366098, + "loss": 2.8643, + "theoretical_loss": 3.563213684176142, + "tokens_seen": 1292496896 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030727181544633905, + "loss": 2.8738, + "theoretical_loss": 3.5631972964461234, + "tokens_seen": 1292562432 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003072617853560682, + "loss": 2.7507, + "theoretical_loss": 3.563180909779618, + "tokens_seen": 1292627968 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1472020, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7531261444091797, + "objective/train/theoretical_loss": 3.5631645241765026, + "objective/train/tokens_used": 1313153504, + "theoretical_loss": 3.5631645241765026, + "tokens_seen": 1292693504 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003072517552657974, + "loss": 2.6529, + "theoretical_loss": 3.5631645241765026, + "tokens_seen": 1292693504 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030724172517552654, + "loss": 2.8054, + "theoretical_loss": 3.563148139636656, + "tokens_seen": 1292759040 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003072316950852558, + "loss": 2.904, + "theoretical_loss": 3.5631317561599545, + "tokens_seen": 1292824576 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030722166499498496, + "loss": 2.7999, + "theoretical_loss": 3.5631153737462746, + "tokens_seen": 1292890112 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030721163490471414, + "loss": 2.7538, + "theoretical_loss": 3.563098992395494, + "tokens_seen": 1292955648 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003072016048144433, + "loss": 2.7512, + "theoretical_loss": 3.5630826121074897, + "tokens_seen": 1293021184 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003071915747241725, + "loss": 2.4014, + "theoretical_loss": 3.5630662328821385, + "tokens_seen": 1293086720 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003071815446339017, + "loss": 2.5846, + "theoretical_loss": 3.563049854719319, + "tokens_seen": 1293152256 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003071715145436309, + "loss": 2.75, + "theoretical_loss": 3.563033477618907, + "tokens_seen": 1293217792 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030716148445336005, + "loss": 2.8021, + "theoretical_loss": 3.5630171015807806, + "tokens_seen": 1293283328 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003071514543630893, + "loss": 2.7755, + "theoretical_loss": 3.563000726604816, + "tokens_seen": 1293348864 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030714142427281846, + "loss": 2.8792, + "theoretical_loss": 3.5629843526908918, + "tokens_seen": 1293414400 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030713139418254764, + "loss": 2.8276, + "theoretical_loss": 3.562967979838885, + "tokens_seen": 1293479936 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003071213640922768, + "loss": 2.8245, + "theoretical_loss": 3.562951608048672, + "tokens_seen": 1293545472 + }, + { + "epoch": 4.03, + "learning_rate": 0.000307111334002006, + "loss": 2.7646, + "theoretical_loss": 3.562935237320131, + "tokens_seen": 1293611008 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003071013039117352, + "loss": 2.6934, + "theoretical_loss": 3.5629188676531394, + "tokens_seen": 1293676544 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003070912738214644, + "loss": 2.8052, + "theoretical_loss": 3.5629024990475746, + "tokens_seen": 1293742080 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030708124373119355, + "loss": 2.7833, + "theoretical_loss": 3.562886131503314, + "tokens_seen": 1293807616 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003070712136409228, + "loss": 2.8311, + "theoretical_loss": 3.562869765020234, + "tokens_seen": 1293873152 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003070611835506519, + "loss": 2.8201, + "theoretical_loss": 3.562853399598213, + "tokens_seen": 1293938688 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030705115346038115, + "loss": 2.7745, + "theoretical_loss": 3.562837035237129, + "tokens_seen": 1294004224 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003070411233701104, + "loss": 3.03, + "theoretical_loss": 3.5628206719368585, + "tokens_seen": 1294069760 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003070310932798395, + "loss": 2.877, + "theoretical_loss": 3.5628043096972792, + "tokens_seen": 1294135296 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030702106318956874, + "loss": 2.8951, + "theoretical_loss": 3.5627879485182685, + "tokens_seen": 1294200832 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003070110330992979, + "loss": 2.7721, + "theoretical_loss": 3.562771588399705, + "tokens_seen": 1294266368 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1477198, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.093344211578369, + "objective/train/theoretical_loss": 3.562755229341465, + "objective/train/tokens_used": 1314791904, + "theoretical_loss": 3.562755229341465, + "tokens_seen": 1294331904 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003070010030090271, + "loss": 2.9489, + "theoretical_loss": 3.562755229341465, + "tokens_seen": 1294331904 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003069909729187563, + "loss": 2.6401, + "theoretical_loss": 3.562738871343427, + "tokens_seen": 1294397440 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030698094282848547, + "loss": 2.7694, + "theoretical_loss": 3.5627225144054684, + "tokens_seen": 1294462976 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030697091273821465, + "loss": 2.7897, + "theoretical_loss": 3.562706158527466, + "tokens_seen": 1294528512 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003069608826479439, + "loss": 2.6688, + "theoretical_loss": 3.562689803709299, + "tokens_seen": 1294594048 + }, + { + "epoch": 4.03, + "learning_rate": 0.000306950852557673, + "loss": 2.8461, + "theoretical_loss": 3.562673449950844, + "tokens_seen": 1294659584 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030694082246740225, + "loss": 3.0205, + "theoretical_loss": 3.5626570972519787, + "tokens_seen": 1294725120 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003069307923771314, + "loss": 2.9597, + "theoretical_loss": 3.5626407456125806, + "tokens_seen": 1294790656 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003069207622868606, + "loss": 2.7252, + "theoretical_loss": 3.562624395032528, + "tokens_seen": 1294856192 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003069107321965898, + "loss": 2.7873, + "theoretical_loss": 3.5626080455116993, + "tokens_seen": 1294921728 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030690070210631897, + "loss": 2.7035, + "theoretical_loss": 3.562591697049971, + "tokens_seen": 1294987264 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030689067201604815, + "loss": 2.8921, + "theoretical_loss": 3.5625753496472212, + "tokens_seen": 1295052800 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030688064192577733, + "loss": 2.6662, + "theoretical_loss": 3.5625590033033285, + "tokens_seen": 1295118336 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003068706118355065, + "loss": 2.9771, + "theoretical_loss": 3.5625426580181703, + "tokens_seen": 1295183872 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030686058174523575, + "loss": 2.6901, + "theoretical_loss": 3.562526313791624, + "tokens_seen": 1295249408 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003068505516549649, + "loss": 2.9169, + "theoretical_loss": 3.5625099706235677, + "tokens_seen": 1295314944 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003068405215646941, + "loss": 2.6764, + "theoretical_loss": 3.562493628513879, + "tokens_seen": 1295380480 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003068304914744233, + "loss": 2.6589, + "theoretical_loss": 3.5624772874624373, + "tokens_seen": 1295446016 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003068204613841525, + "loss": 2.3541, + "theoretical_loss": 3.5624609474691185, + "tokens_seen": 1295511552 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030681043129388166, + "loss": 2.6614, + "theoretical_loss": 3.562444608533802, + "tokens_seen": 1295577088 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030680040120361084, + "loss": 2.6918, + "theoretical_loss": 3.5624282706563655, + "tokens_seen": 1295642624 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030679037111334, + "loss": 2.4951, + "theoretical_loss": 3.562411933836686, + "tokens_seen": 1295708160 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030678034102306925, + "loss": 2.7433, + "theoretical_loss": 3.5623955980746436, + "tokens_seen": 1295773696 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003067703109327984, + "loss": 2.8738, + "theoretical_loss": 3.562379263370114, + "tokens_seen": 1295839232 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003067602808425276, + "loss": 2.6308, + "theoretical_loss": 3.562362929722977, + "tokens_seen": 1295904768 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1480267, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.763779640197754, + "objective/train/theoretical_loss": 3.56234659713311, + "objective/train/tokens_used": 1316430304, + "theoretical_loss": 3.56234659713311, + "tokens_seen": 1295970304 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030675025075225674, + "loss": 2.5947, + "theoretical_loss": 3.56234659713311, + "tokens_seen": 1295970304 + }, + { + "epoch": 4.03, + "learning_rate": 0.000306740220661986, + "loss": 2.6591, + "theoretical_loss": 3.562330265600391, + "tokens_seen": 1296035840 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030673019057171516, + "loss": 2.8535, + "theoretical_loss": 3.562313935124698, + "tokens_seen": 1296101376 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030672016048144434, + "loss": 2.7508, + "theoretical_loss": 3.56229760570591, + "tokens_seen": 1296166912 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003067101303911735, + "loss": 2.7883, + "theoretical_loss": 3.562281277343904, + "tokens_seen": 1296232448 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003067001003009027, + "loss": 2.8509, + "theoretical_loss": 3.5622649500385593, + "tokens_seen": 1296297984 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003066900702106319, + "loss": 2.8038, + "theoretical_loss": 3.562248623789753, + "tokens_seen": 1296363520 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003066800401203611, + "loss": 2.7852, + "theoretical_loss": 3.5622322985973645, + "tokens_seen": 1296429056 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030667001003009025, + "loss": 2.7057, + "theoretical_loss": 3.562215974461271, + "tokens_seen": 1296494592 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003066599799398195, + "loss": 2.6778, + "theoretical_loss": 3.562199651381351, + "tokens_seen": 1296560128 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030664994984954866, + "loss": 2.7529, + "theoretical_loss": 3.5621833293574836, + "tokens_seen": 1296625664 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030663991975927784, + "loss": 2.5541, + "theoretical_loss": 3.562167008389546, + "tokens_seen": 1296691200 + }, + { + "epoch": 4.03, + "learning_rate": 0.000306629889669007, + "loss": 2.7384, + "theoretical_loss": 3.5621506884774172, + "tokens_seen": 1296756736 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003066198595787362, + "loss": 2.9487, + "theoretical_loss": 3.562134369620975, + "tokens_seen": 1296822272 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003066098294884654, + "loss": 2.71, + "theoretical_loss": 3.5621180518200988, + "tokens_seen": 1296887808 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003065997993981946, + "loss": 2.8281, + "theoretical_loss": 3.562101735074666, + "tokens_seen": 1296953344 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030658976930792375, + "loss": 2.6336, + "theoretical_loss": 3.5620854193845553, + "tokens_seen": 1297018880 + }, + { + "epoch": 4.03, + "learning_rate": 0.000306579739217653, + "loss": 2.8688, + "theoretical_loss": 3.562069104749645, + "tokens_seen": 1297084416 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003065697091273821, + "loss": 2.8225, + "theoretical_loss": 3.5620527911698137, + "tokens_seen": 1297149952 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030655967903711135, + "loss": 2.664, + "theoretical_loss": 3.56203647864494, + "tokens_seen": 1297215488 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030654964894684053, + "loss": 2.7885, + "theoretical_loss": 3.5620201671749023, + "tokens_seen": 1297281024 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003065396188565697, + "loss": 2.8634, + "theoretical_loss": 3.562003856759579, + "tokens_seen": 1297346560 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003065295887662989, + "loss": 2.7129, + "theoretical_loss": 3.5619875473988487, + "tokens_seen": 1297412096 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003065195586760281, + "loss": 2.7556, + "theoretical_loss": 3.56197123909259, + "tokens_seen": 1297477632 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030650952858575725, + "loss": 2.7793, + "theoretical_loss": 3.561954931840681, + "tokens_seen": 1297543168 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1480743, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7817656993865967, + "objective/train/theoretical_loss": 3.561938625643001, + "objective/train/tokens_used": 1318068704, + "theoretical_loss": 3.561938625643001, + "tokens_seen": 1297608704 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003064994984954865, + "loss": 2.6356, + "theoretical_loss": 3.561938625643001, + "tokens_seen": 1297608704 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003064894684052156, + "loss": 2.5468, + "theoretical_loss": 3.5619223204994284, + "tokens_seen": 1297674240 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030647943831494485, + "loss": 2.9011, + "theoretical_loss": 3.5619060164098415, + "tokens_seen": 1297739776 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030646940822467403, + "loss": 3.0608, + "theoretical_loss": 3.5618897133741196, + "tokens_seen": 1297805312 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003064593781344032, + "loss": 2.73, + "theoretical_loss": 3.56187341139214, + "tokens_seen": 1297870848 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003064493480441324, + "loss": 2.7114, + "theoretical_loss": 3.5618571104637833, + "tokens_seen": 1297936384 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003064393179538616, + "loss": 2.8433, + "theoretical_loss": 3.5618408105889268, + "tokens_seen": 1298001920 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030642928786359076, + "loss": 2.943, + "theoretical_loss": 3.5618245117674494, + "tokens_seen": 1298067456 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030641925777332, + "loss": 2.9349, + "theoretical_loss": 3.5618082139992304, + "tokens_seen": 1298132992 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003064092276830491, + "loss": 2.8906, + "theoretical_loss": 3.561791917284148, + "tokens_seen": 1298198528 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030639919759277835, + "loss": 2.8165, + "theoretical_loss": 3.5617756216220817, + "tokens_seen": 1298264064 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003063891675025075, + "loss": 2.6757, + "theoretical_loss": 3.5617593270129095, + "tokens_seen": 1298329600 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003063791374122367, + "loss": 2.7742, + "theoretical_loss": 3.561743033456511, + "tokens_seen": 1298395136 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003063691073219659, + "loss": 2.8762, + "theoretical_loss": 3.561726740952764, + "tokens_seen": 1298460672 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003063590772316951, + "loss": 2.4558, + "theoretical_loss": 3.5617104495015486, + "tokens_seen": 1298526208 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030634904714142426, + "loss": 2.8145, + "theoretical_loss": 3.561694159102743, + "tokens_seen": 1298591744 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003063390170511535, + "loss": 2.7078, + "theoretical_loss": 3.561677869756226, + "tokens_seen": 1298657280 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003063289869608826, + "loss": 2.8624, + "theoretical_loss": 3.561661581461877, + "tokens_seen": 1298722816 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030631895687061186, + "loss": 2.8161, + "theoretical_loss": 3.5616452942195744, + "tokens_seen": 1298788352 + }, + { + "epoch": 4.03, + "learning_rate": 0.000306308926780341, + "loss": 2.606, + "theoretical_loss": 3.5616290080291977, + "tokens_seen": 1298853888 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003062988966900702, + "loss": 2.7127, + "theoretical_loss": 3.561612722890626, + "tokens_seen": 1298919424 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030628886659979945, + "loss": 2.884, + "theoretical_loss": 3.561596438803737, + "tokens_seen": 1298984960 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003062788365095286, + "loss": 2.7747, + "theoretical_loss": 3.5615801557684117, + "tokens_seen": 1299050496 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003062688064192578, + "loss": 2.6148, + "theoretical_loss": 3.5615638737845274, + "tokens_seen": 1299116032 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030625877632898694, + "loss": 2.895, + "theoretical_loss": 3.561547592851964, + "tokens_seen": 1299181568 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1482030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.784689426422119, + "objective/train/theoretical_loss": 3.5615313129706005, + "objective/train/tokens_used": 1319707104, + "theoretical_loss": 3.5615313129706005, + "tokens_seen": 1299247104 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003062487462387162, + "loss": 2.6131, + "theoretical_loss": 3.5615313129706005, + "tokens_seen": 1299247104 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030623871614844536, + "loss": 2.7551, + "theoretical_loss": 3.5615150341403163, + "tokens_seen": 1299312640 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030622868605817454, + "loss": 2.7255, + "theoretical_loss": 3.56149875636099, + "tokens_seen": 1299378176 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003062186559679037, + "loss": 2.6936, + "theoretical_loss": 3.5614824796325006, + "tokens_seen": 1299443712 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003062086258776329, + "loss": 2.6426, + "theoretical_loss": 3.5614662039547285, + "tokens_seen": 1299509248 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003061985957873621, + "loss": 2.7086, + "theoretical_loss": 3.5614499293275514, + "tokens_seen": 1299574784 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003061885656970913, + "loss": 2.8353, + "theoretical_loss": 3.5614336557508492, + "tokens_seen": 1299640320 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030617853560682045, + "loss": 2.8087, + "theoretical_loss": 3.5614173832245015, + "tokens_seen": 1299705856 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003061685055165497, + "loss": 2.8922, + "theoretical_loss": 3.5614011117483866, + "tokens_seen": 1299771392 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030615847542627886, + "loss": 2.7872, + "theoretical_loss": 3.5613848413223845, + "tokens_seen": 1299836928 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030614844533600804, + "loss": 2.7919, + "theoretical_loss": 3.5613685719463746, + "tokens_seen": 1299902464 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003061384152457372, + "loss": 2.6497, + "theoretical_loss": 3.5613523036202355, + "tokens_seen": 1299968000 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003061283851554664, + "loss": 2.6103, + "theoretical_loss": 3.561336036343847, + "tokens_seen": 1300033536 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003061183550651956, + "loss": 2.468, + "theoretical_loss": 3.5613197701170884, + "tokens_seen": 1300099072 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003061083249749248, + "loss": 2.9235, + "theoretical_loss": 3.561303504939839, + "tokens_seen": 1300164608 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030609829488465395, + "loss": 2.8431, + "theoretical_loss": 3.5612872408119784, + "tokens_seen": 1300230144 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003060882647943832, + "loss": 2.9829, + "theoretical_loss": 3.5612709777333853, + "tokens_seen": 1300295680 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003060782347041123, + "loss": 2.7384, + "theoretical_loss": 3.5612547157039405, + "tokens_seen": 1300361216 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030606820461384155, + "loss": 2.9586, + "theoretical_loss": 3.561238454723522, + "tokens_seen": 1300426752 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030605817452357073, + "loss": 2.8804, + "theoretical_loss": 3.56122219479201, + "tokens_seen": 1300492288 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003060481444332999, + "loss": 2.764, + "theoretical_loss": 3.5612059359092836, + "tokens_seen": 1300557824 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003060381143430291, + "loss": 2.8745, + "theoretical_loss": 3.5611896780752232, + "tokens_seen": 1300623360 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003060280842527583, + "loss": 2.8413, + "theoretical_loss": 3.5611734212897073, + "tokens_seen": 1300688896 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030601805416248745, + "loss": 2.6772, + "theoretical_loss": 3.5611571655526157, + "tokens_seen": 1300754432 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003060080240722167, + "loss": 2.6801, + "theoretical_loss": 3.5611409108638283, + "tokens_seen": 1300819968 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1482698, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8064839839935303, + "objective/train/theoretical_loss": 3.5611246572232242, + "objective/train/tokens_used": 1321345504, + "theoretical_loss": 3.5611246572232242, + "tokens_seen": 1300885504 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003059979939819458, + "loss": 2.7375, + "theoretical_loss": 3.5611246572232242, + "tokens_seen": 1300885504 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030598796389167505, + "loss": 2.8112, + "theoretical_loss": 3.5611084046306836, + "tokens_seen": 1300951040 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030597793380140423, + "loss": 2.9242, + "theoretical_loss": 3.561092153086086, + "tokens_seen": 1301016576 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003059679037111334, + "loss": 2.624, + "theoretical_loss": 3.5610759025893106, + "tokens_seen": 1301082112 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003059578736208626, + "loss": 2.7119, + "theoretical_loss": 3.5610596531402376, + "tokens_seen": 1301147648 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003059478435305918, + "loss": 2.8299, + "theoretical_loss": 3.5610434047387463, + "tokens_seen": 1301213184 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030593781344032096, + "loss": 2.7453, + "theoretical_loss": 3.5610271573847165, + "tokens_seen": 1301278720 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003059277833500502, + "loss": 2.6769, + "theoretical_loss": 3.561010911078028, + "tokens_seen": 1301344256 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003059177532597793, + "loss": 2.5735, + "theoretical_loss": 3.5609946658185607, + "tokens_seen": 1301409792 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030590772316950855, + "loss": 2.814, + "theoretical_loss": 3.560978421606194, + "tokens_seen": 1301475328 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003058976930792377, + "loss": 2.7553, + "theoretical_loss": 3.560962178440808, + "tokens_seen": 1301540864 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003058876629889669, + "loss": 3.0141, + "theoretical_loss": 3.560945936322282, + "tokens_seen": 1301606400 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003058776328986961, + "loss": 2.7595, + "theoretical_loss": 3.560929695250497, + "tokens_seen": 1301671936 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003058676028084253, + "loss": 2.778, + "theoretical_loss": 3.5609134552253314, + "tokens_seen": 1301737472 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030585757271815446, + "loss": 2.8669, + "theoretical_loss": 3.5608972162466657, + "tokens_seen": 1301803008 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003058475426278837, + "loss": 2.7928, + "theoretical_loss": 3.5608809783143798, + "tokens_seen": 1301868544 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003058375125376128, + "loss": 2.6076, + "theoretical_loss": 3.560864741428354, + "tokens_seen": 1301934080 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030582748244734206, + "loss": 2.8176, + "theoretical_loss": 3.560848505588468, + "tokens_seen": 1301999616 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003058174523570712, + "loss": 2.7657, + "theoretical_loss": 3.560832270794601, + "tokens_seen": 1302065152 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003058074222668004, + "loss": 2.6445, + "theoretical_loss": 3.5608160370466333, + "tokens_seen": 1302130688 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003057973921765296, + "loss": 2.8125, + "theoretical_loss": 3.560799804344446, + "tokens_seen": 1302196224 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003057873620862588, + "loss": 2.5955, + "theoretical_loss": 3.5607835726879173, + "tokens_seen": 1302261760 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030577733199598796, + "loss": 2.7631, + "theoretical_loss": 3.560767342076929, + "tokens_seen": 1302327296 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030576730190571714, + "loss": 2.9111, + "theoretical_loss": 3.56075111251136, + "tokens_seen": 1302392832 + }, + { + "epoch": 4.03, + "learning_rate": 0.0003057572718154463, + "loss": 2.7766, + "theoretical_loss": 3.5607348839910906, + "tokens_seen": 1302458368 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 1483979, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.209777355194092, + "objective/train/theoretical_loss": 3.560718656516001, + "objective/train/tokens_used": 1322983904, + "theoretical_loss": 3.560718656516001, + "tokens_seen": 1302523904 + }, + { + "epoch": 4.03, + "learning_rate": 0.00030574724172517556, + "loss": 2.6928, + "theoretical_loss": 3.560718656516001, + "tokens_seen": 1302523904 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003057372116349047, + "loss": 2.7563, + "theoretical_loss": 3.5607024300859713, + "tokens_seen": 1302589440 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003057271815446339, + "loss": 2.7126, + "theoretical_loss": 3.5606862047008816, + "tokens_seen": 1302654976 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030571715145436305, + "loss": 2.735, + "theoretical_loss": 3.5606699803606117, + "tokens_seen": 1302720512 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003057071213640923, + "loss": 2.7698, + "theoretical_loss": 3.560653757065042, + "tokens_seen": 1302786048 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030569709127382147, + "loss": 2.9375, + "theoretical_loss": 3.5606375348140533, + "tokens_seen": 1302851584 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030568706118355065, + "loss": 2.8591, + "theoretical_loss": 3.560621313607525, + "tokens_seen": 1302917120 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030567703109327983, + "loss": 2.5649, + "theoretical_loss": 3.560605093445338, + "tokens_seen": 1302982656 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030566700100300906, + "loss": 2.9389, + "theoretical_loss": 3.5605888743273715, + "tokens_seen": 1303048192 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003056569709127382, + "loss": 2.5524, + "theoretical_loss": 3.5605726562535067, + "tokens_seen": 1303113728 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003056469408224674, + "loss": 2.9435, + "theoretical_loss": 3.5605564392236237, + "tokens_seen": 1303179264 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030563691073219655, + "loss": 2.9062, + "theoretical_loss": 3.560540223237602, + "tokens_seen": 1303244800 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003056268806419258, + "loss": 2.752, + "theoretical_loss": 3.5605240082953236, + "tokens_seen": 1303310336 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030561685055165497, + "loss": 2.8643, + "theoretical_loss": 3.5605077943966674, + "tokens_seen": 1303375872 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030560682046138415, + "loss": 2.9482, + "theoretical_loss": 3.5604915815415143, + "tokens_seen": 1303441408 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030559679037111333, + "loss": 2.8014, + "theoretical_loss": 3.560475369729744, + "tokens_seen": 1303506944 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003055867602808425, + "loss": 2.4184, + "theoretical_loss": 3.560459158961238, + "tokens_seen": 1303572480 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003055767301905717, + "loss": 2.7457, + "theoretical_loss": 3.5604429492358767, + "tokens_seen": 1303638016 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030556670010030093, + "loss": 2.69, + "theoretical_loss": 3.560426740553539, + "tokens_seen": 1303703552 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030555667001003006, + "loss": 2.5995, + "theoretical_loss": 3.5604105329141067, + "tokens_seen": 1303769088 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003055466399197593, + "loss": 2.9198, + "theoretical_loss": 3.5603943263174607, + "tokens_seen": 1303834624 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003055366098294885, + "loss": 2.5802, + "theoretical_loss": 3.56037812076348, + "tokens_seen": 1303900160 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030552657973921765, + "loss": 2.797, + "theoretical_loss": 3.560361916252046, + "tokens_seen": 1303965696 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003055165496489469, + "loss": 2.8847, + "theoretical_loss": 3.5603457127830396, + "tokens_seen": 1304031232 + }, + { + "epoch": 4.04, + "learning_rate": 0.000305506519558676, + "loss": 2.694, + "theoretical_loss": 3.5603295103563406, + "tokens_seen": 1304096768 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1484701, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.840122699737549, + "objective/train/theoretical_loss": 3.5603133089718293, + "objective/train/tokens_used": 1324622304, + "theoretical_loss": 3.5603133089718293, + "tokens_seen": 1304162304 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030549648946840525, + "loss": 2.7326, + "theoretical_loss": 3.5603133089718293, + "tokens_seen": 1304162304 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030548645937813443, + "loss": 2.8274, + "theoretical_loss": 3.5602971086293875, + "tokens_seen": 1304227840 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003054764292878636, + "loss": 2.5019, + "theoretical_loss": 3.5602809093288954, + "tokens_seen": 1304293376 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003054663991975928, + "loss": 2.6509, + "theoretical_loss": 3.560264711070233, + "tokens_seen": 1304358912 + }, + { + "epoch": 4.04, + "learning_rate": 0.000305456369107322, + "loss": 2.5558, + "theoretical_loss": 3.5602485138532813, + "tokens_seen": 1304424448 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030544633901705116, + "loss": 2.5316, + "theoretical_loss": 3.560232317677921, + "tokens_seen": 1304489984 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003054363089267804, + "loss": 2.8007, + "theoretical_loss": 3.560216122544033, + "tokens_seen": 1304555520 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003054262788365095, + "loss": 2.9023, + "theoretical_loss": 3.560199928451498, + "tokens_seen": 1304621056 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030541624874623875, + "loss": 2.6966, + "theoretical_loss": 3.560183735400196, + "tokens_seen": 1304686592 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003054062186559679, + "loss": 2.8067, + "theoretical_loss": 3.5601675433900093, + "tokens_seen": 1304752128 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003053961885656971, + "loss": 2.7268, + "theoretical_loss": 3.5601513524208173, + "tokens_seen": 1304817664 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003053861584754263, + "loss": 2.7663, + "theoretical_loss": 3.560135162492501, + "tokens_seen": 1304883200 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003053761283851555, + "loss": 2.6635, + "theoretical_loss": 3.5601189736049417, + "tokens_seen": 1304948736 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030536609829488466, + "loss": 2.8489, + "theoretical_loss": 3.5601027857580196, + "tokens_seen": 1305014272 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003053560682046139, + "loss": 2.8466, + "theoretical_loss": 3.5600865989516164, + "tokens_seen": 1305079808 + }, + { + "epoch": 4.04, + "learning_rate": 0.000305346038114343, + "loss": 2.8473, + "theoretical_loss": 3.5600704131856125, + "tokens_seen": 1305145344 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030533600802407226, + "loss": 2.6758, + "theoretical_loss": 3.5600542284598884, + "tokens_seen": 1305210880 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003053259779338014, + "loss": 2.598, + "theoretical_loss": 3.560038044774325, + "tokens_seen": 1305276416 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003053159478435306, + "loss": 2.6272, + "theoretical_loss": 3.5600218621288047, + "tokens_seen": 1305341952 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003053059177532598, + "loss": 2.7393, + "theoretical_loss": 3.5600056805232065, + "tokens_seen": 1305407488 + }, + { + "epoch": 4.04, + "learning_rate": 0.000305295887662989, + "loss": 2.8424, + "theoretical_loss": 3.5599894999574127, + "tokens_seen": 1305473024 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030528585757271816, + "loss": 2.7768, + "theoretical_loss": 3.559973320431304, + "tokens_seen": 1305538560 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030527582748244734, + "loss": 2.6083, + "theoretical_loss": 3.5599571419447607, + "tokens_seen": 1305604096 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003052657973921765, + "loss": 2.8214, + "theoretical_loss": 3.559940964497664, + "tokens_seen": 1305669632 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030525576730190576, + "loss": 2.8903, + "theoretical_loss": 3.559924788089896, + "tokens_seen": 1305735168 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1485880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7324297428131104, + "objective/train/theoretical_loss": 3.5599086127213373, + "objective/train/tokens_used": 1326260704, + "theoretical_loss": 3.5599086127213373, + "tokens_seen": 1305800704 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003052457372116349, + "loss": 2.7493, + "theoretical_loss": 3.5599086127213373, + "tokens_seen": 1305800704 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003052357071213641, + "loss": 2.8219, + "theoretical_loss": 3.559892438391868, + "tokens_seen": 1305866240 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030522567703109325, + "loss": 3.01, + "theoretical_loss": 3.559876265101371, + "tokens_seen": 1305931776 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003052156469408225, + "loss": 2.8747, + "theoretical_loss": 3.5598600928497257, + "tokens_seen": 1305997312 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030520561685055167, + "loss": 2.5935, + "theoretical_loss": 3.559843921636814, + "tokens_seen": 1306062848 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030519558676028085, + "loss": 2.9397, + "theoretical_loss": 3.5598277514625174, + "tokens_seen": 1306128384 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030518555667001003, + "loss": 2.9605, + "theoretical_loss": 3.5598115823267165, + "tokens_seen": 1306193920 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030517552657973926, + "loss": 2.6671, + "theoretical_loss": 3.5597954142292925, + "tokens_seen": 1306259456 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003051654964894684, + "loss": 2.8636, + "theoretical_loss": 3.559779247170127, + "tokens_seen": 1306324992 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003051554663991976, + "loss": 2.6677, + "theoretical_loss": 3.559763081149101, + "tokens_seen": 1306390528 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030514543630892675, + "loss": 2.8608, + "theoretical_loss": 3.5597469161660955, + "tokens_seen": 1306456064 + }, + { + "epoch": 4.04, + "learning_rate": 0.000305135406218656, + "loss": 2.7773, + "theoretical_loss": 3.559730752220993, + "tokens_seen": 1306521600 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030512537612838517, + "loss": 2.5519, + "theoretical_loss": 3.559714589313673, + "tokens_seen": 1306587136 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030511534603811435, + "loss": 2.752, + "theoretical_loss": 3.5596984274440184, + "tokens_seen": 1306652672 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030510531594784353, + "loss": 2.8003, + "theoretical_loss": 3.55968226661191, + "tokens_seen": 1306718208 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003050952858575727, + "loss": 2.7637, + "theoretical_loss": 3.5596661068172284, + "tokens_seen": 1306783744 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003050852557673019, + "loss": 2.5725, + "theoretical_loss": 3.559649948059856, + "tokens_seen": 1306849280 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030507522567703113, + "loss": 2.9491, + "theoretical_loss": 3.5596337903396735, + "tokens_seen": 1306914816 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030506519558676026, + "loss": 3.0898, + "theoretical_loss": 3.559617633656563, + "tokens_seen": 1306980352 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003050551654964895, + "loss": 2.7713, + "theoretical_loss": 3.559601478010406, + "tokens_seen": 1307045888 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003050451354062186, + "loss": 2.734, + "theoretical_loss": 3.559585323401083, + "tokens_seen": 1307111424 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030503510531594785, + "loss": 2.5877, + "theoretical_loss": 3.559569169828476, + "tokens_seen": 1307176960 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030502507522567703, + "loss": 2.5828, + "theoretical_loss": 3.5595530172924663, + "tokens_seen": 1307242496 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003050150451354062, + "loss": 2.6721, + "theoretical_loss": 3.5595368657929365, + "tokens_seen": 1307308032 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003050050150451354, + "loss": 2.7474, + "theoretical_loss": 3.559520715329767, + "tokens_seen": 1307373568 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1486544, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7242846488952637, + "objective/train/theoretical_loss": 3.559504565902839, + "objective/train/tokens_used": 1327899104, + "theoretical_loss": 3.559504565902839, + "tokens_seen": 1307439104 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030499498495486463, + "loss": 2.6497, + "theoretical_loss": 3.559504565902839, + "tokens_seen": 1307439104 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030498495486459376, + "loss": 2.7098, + "theoretical_loss": 3.559488417512035, + "tokens_seen": 1307504640 + }, + { + "epoch": 4.04, + "learning_rate": 0.000304974924774323, + "loss": 2.7243, + "theoretical_loss": 3.5594722701572366, + "tokens_seen": 1307570176 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003049648946840521, + "loss": 2.7246, + "theoretical_loss": 3.559456123838325, + "tokens_seen": 1307635712 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030495486459378136, + "loss": 2.7058, + "theoretical_loss": 3.559439978555182, + "tokens_seen": 1307701248 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030494483450351054, + "loss": 2.8295, + "theoretical_loss": 3.5594238343076894, + "tokens_seen": 1307766784 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003049348044132397, + "loss": 2.8496, + "theoretical_loss": 3.5594076910957284, + "tokens_seen": 1307832320 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003049247743229689, + "loss": 2.6376, + "theoretical_loss": 3.559391548919181, + "tokens_seen": 1307897856 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003049147442326981, + "loss": 2.869, + "theoretical_loss": 3.559375407777929, + "tokens_seen": 1307963392 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030490471414242726, + "loss": 2.6927, + "theoretical_loss": 3.5593592676718533, + "tokens_seen": 1308028928 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003048946840521565, + "loss": 2.7517, + "theoretical_loss": 3.5593431286008372, + "tokens_seen": 1308094464 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003048846539618856, + "loss": 2.6785, + "theoretical_loss": 3.5593269905647613, + "tokens_seen": 1308160000 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030487462387161486, + "loss": 2.5399, + "theoretical_loss": 3.559310853563508, + "tokens_seen": 1308225536 + }, + { + "epoch": 4.04, + "learning_rate": 0.000304864593781344, + "loss": 2.8535, + "theoretical_loss": 3.5592947175969587, + "tokens_seen": 1308291072 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003048545636910732, + "loss": 2.7204, + "theoretical_loss": 3.5592785826649953, + "tokens_seen": 1308356608 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003048445336008024, + "loss": 2.5783, + "theoretical_loss": 3.5592624487675, + "tokens_seen": 1308422144 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003048345035105316, + "loss": 2.5697, + "theoretical_loss": 3.559246315904354, + "tokens_seen": 1308487680 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030482447342026076, + "loss": 2.7008, + "theoretical_loss": 3.55923018407544, + "tokens_seen": 1308553216 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030481444332999, + "loss": 2.6793, + "theoretical_loss": 3.5592140532806393, + "tokens_seen": 1308618752 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003048044132397192, + "loss": 2.6712, + "theoretical_loss": 3.559197923519834, + "tokens_seen": 1308684288 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030479438314944836, + "loss": 2.9295, + "theoretical_loss": 3.559181794792906, + "tokens_seen": 1308749824 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030478435305917754, + "loss": 2.8828, + "theoretical_loss": 3.5591656670997374, + "tokens_seen": 1308815360 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003047743229689067, + "loss": 2.6114, + "theoretical_loss": 3.5591495404402096, + "tokens_seen": 1308880896 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030476429287863596, + "loss": 2.6058, + "theoretical_loss": 3.5591334148142058, + "tokens_seen": 1308946432 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003047542627883651, + "loss": 2.7359, + "theoretical_loss": 3.559117290221607, + "tokens_seen": 1309011968 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1488092, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8345699310302734, + "objective/train/theoretical_loss": 3.5591011666622956, + "objective/train/tokens_used": 1329537504, + "theoretical_loss": 3.5591011666622956, + "tokens_seen": 1309077504 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003047442326980943, + "loss": 2.831, + "theoretical_loss": 3.5591011666622956, + "tokens_seen": 1309077504 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030473420260782345, + "loss": 2.7527, + "theoretical_loss": 3.5590850441361535, + "tokens_seen": 1309143040 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003047241725175527, + "loss": 2.9376, + "theoretical_loss": 3.559068922643063, + "tokens_seen": 1309208576 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030471414242728187, + "loss": 2.8627, + "theoretical_loss": 3.559052802182906, + "tokens_seen": 1309274112 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030470411233701105, + "loss": 2.8548, + "theoretical_loss": 3.559036682755565, + "tokens_seen": 1309339648 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030469408224674023, + "loss": 2.5759, + "theoretical_loss": 3.559020564360922, + "tokens_seen": 1309405184 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030468405215646946, + "loss": 2.7288, + "theoretical_loss": 3.559004446998858, + "tokens_seen": 1309470720 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003046740220661986, + "loss": 2.814, + "theoretical_loss": 3.5589883306692576, + "tokens_seen": 1309536256 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003046639919759278, + "loss": 2.7851, + "theoretical_loss": 3.558972215372001, + "tokens_seen": 1309601792 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030465396188565695, + "loss": 2.7324, + "theoretical_loss": 3.5589561011069706, + "tokens_seen": 1309667328 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003046439317953862, + "loss": 2.5386, + "theoretical_loss": 3.5589399878740493, + "tokens_seen": 1309732864 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030463390170511537, + "loss": 2.7966, + "theoretical_loss": 3.558923875673119, + "tokens_seen": 1309798400 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030462387161484455, + "loss": 2.8575, + "theoretical_loss": 3.5589077645040623, + "tokens_seen": 1309863936 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030461384152457373, + "loss": 2.7993, + "theoretical_loss": 3.5588916543667617, + "tokens_seen": 1309929472 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003046038114343029, + "loss": 2.8267, + "theoretical_loss": 3.558875545261098, + "tokens_seen": 1309995008 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003045937813440321, + "loss": 2.6302, + "theoretical_loss": 3.558859437186956, + "tokens_seen": 1310060544 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030458375125376133, + "loss": 2.8977, + "theoretical_loss": 3.5588433301442155, + "tokens_seen": 1310126080 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030457372116349046, + "loss": 2.5146, + "theoretical_loss": 3.5588272241327603, + "tokens_seen": 1310191616 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003045636910732197, + "loss": 2.5984, + "theoretical_loss": 3.5588111191524727, + "tokens_seen": 1310257152 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003045536609829488, + "loss": 2.7796, + "theoretical_loss": 3.558795015203235, + "tokens_seen": 1310322688 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030454363089267805, + "loss": 2.5794, + "theoretical_loss": 3.5587789122849296, + "tokens_seen": 1310388224 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030453360080240723, + "loss": 2.601, + "theoretical_loss": 3.5587628103974387, + "tokens_seen": 1310453760 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003045235707121364, + "loss": 2.7363, + "theoretical_loss": 3.558746709540645, + "tokens_seen": 1310519296 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003045135406218656, + "loss": 2.8367, + "theoretical_loss": 3.5587306097144307, + "tokens_seen": 1310584832 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030450351053159483, + "loss": 2.9193, + "theoretical_loss": 3.558714510918679, + "tokens_seen": 1310650368 + }, + { + "debugging/Self-BLEU-5": 0.5335497100843765, + "debugging/distinct-1-grams": 0.7520819374414864, + "debugging/distinct-2-grams": 0.9491809766666045, + "debugging/entropy-1-grams": 5.943300109412918, + "debugging/entropy-2-grams": 7.027315581762769, + "debugging/length": 477.3888888888889, + "debugging/num_segments": 18, + "debugging/score": 0.0018735238636605285, + "debugging/score_std": 0.0038943973989434763, + "epoch": 4.04, + "objective/train/docs_used": 1488756, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.642608642578125, + "objective/train/theoretical_loss": 3.558698413153272, + "objective/train/tokens_used": 1331175904, + "theoretical_loss": 3.558698413153272, + "tokens_seen": 1310715904 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030449348044132396, + "loss": 2.7676, + "theoretical_loss": 3.558698413153272, + "tokens_seen": 1310715904 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003044834503510532, + "loss": 2.6587, + "theoretical_loss": 3.558682316418092, + "tokens_seen": 1310781440 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003044734202607823, + "loss": 2.7785, + "theoretical_loss": 3.5586662207130217, + "tokens_seen": 1310846976 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030446339017051156, + "loss": 2.6989, + "theoretical_loss": 3.558650126037944, + "tokens_seen": 1310912512 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030445336008024074, + "loss": 2.6195, + "theoretical_loss": 3.5586340323927415, + "tokens_seen": 1310978048 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003044433299899699, + "loss": 2.8218, + "theoretical_loss": 3.558617939777297, + "tokens_seen": 1311043584 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003044332998996991, + "loss": 2.6592, + "theoretical_loss": 3.5586018481914916, + "tokens_seen": 1311109120 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003044232698094283, + "loss": 2.3855, + "theoretical_loss": 3.5585857576352096, + "tokens_seen": 1311174656 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030441323971915746, + "loss": 2.6516, + "theoretical_loss": 3.5585696681083334, + "tokens_seen": 1311240192 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003044032096288867, + "loss": 2.6568, + "theoretical_loss": 3.558553579610745, + "tokens_seen": 1311305728 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003043931795386158, + "loss": 2.5956, + "theoretical_loss": 3.558537492142328, + "tokens_seen": 1311371264 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030438314944834506, + "loss": 2.7413, + "theoretical_loss": 3.558521405702965, + "tokens_seen": 1311436800 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003043731193580742, + "loss": 2.4865, + "theoretical_loss": 3.558505320292538, + "tokens_seen": 1311502336 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003043630892678034, + "loss": 2.5645, + "theoretical_loss": 3.5584892359109306, + "tokens_seen": 1311567872 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003043530591775326, + "loss": 2.7924, + "theoretical_loss": 3.558473152558025, + "tokens_seen": 1311633408 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003043430290872618, + "loss": 2.6965, + "theoretical_loss": 3.558457070233704, + "tokens_seen": 1311698944 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030433299899699097, + "loss": 2.5144, + "theoretical_loss": 3.558440988937851, + "tokens_seen": 1311764480 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003043229689067202, + "loss": 2.6407, + "theoretical_loss": 3.558424908670349, + "tokens_seen": 1311830016 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030431293881644933, + "loss": 2.6782, + "theoretical_loss": 3.5584088294310803, + "tokens_seen": 1311895552 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030430290872617856, + "loss": 2.6771, + "theoretical_loss": 3.558392751219927, + "tokens_seen": 1311961088 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003042928786359077, + "loss": 2.7834, + "theoretical_loss": 3.558376674036774, + "tokens_seen": 1312026624 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003042828485456369, + "loss": 2.7159, + "theoretical_loss": 3.558360597881503, + "tokens_seen": 1312092160 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003042728184553661, + "loss": 2.6537, + "theoretical_loss": 3.5583445227539965, + "tokens_seen": 1312157696 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003042627883650953, + "loss": 2.79, + "theoretical_loss": 3.5583284486541387, + "tokens_seen": 1312223232 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030425275827482447, + "loss": 2.9484, + "theoretical_loss": 3.558312375581812, + "tokens_seen": 1312288768 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1490224, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.775263786315918, + "objective/train/theoretical_loss": 3.5582963035368986, + "objective/train/tokens_used": 1332814304, + "theoretical_loss": 3.5582963035368986, + "tokens_seen": 1312354304 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030424272818455365, + "loss": 2.5093, + "theoretical_loss": 3.5582963035368986, + "tokens_seen": 1312354304 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030423269809428283, + "loss": 2.9231, + "theoretical_loss": 3.558280232519283, + "tokens_seen": 1312419840 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030422266800401207, + "loss": 2.684, + "theoretical_loss": 3.558264162528847, + "tokens_seen": 1312485376 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003042126379137412, + "loss": 2.7024, + "theoretical_loss": 3.5582480935654743, + "tokens_seen": 1312550912 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030420260782347043, + "loss": 2.7182, + "theoretical_loss": 3.5582320256290485, + "tokens_seen": 1312616448 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003041925777331996, + "loss": 2.6307, + "theoretical_loss": 3.5582159587194515, + "tokens_seen": 1312681984 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003041825476429288, + "loss": 2.626, + "theoretical_loss": 3.5581998928365675, + "tokens_seen": 1312747520 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030417251755265797, + "loss": 2.7163, + "theoretical_loss": 3.5581838279802787, + "tokens_seen": 1312813056 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030416248746238715, + "loss": 3.0121, + "theoretical_loss": 3.5581677641504683, + "tokens_seen": 1312878592 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030415245737211633, + "loss": 2.7012, + "theoretical_loss": 3.558151701347021, + "tokens_seen": 1312944128 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030414242728184557, + "loss": 2.6484, + "theoretical_loss": 3.5581356395698176, + "tokens_seen": 1313009664 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003041323971915747, + "loss": 2.6568, + "theoretical_loss": 3.5581195788187436, + "tokens_seen": 1313075200 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030412236710130393, + "loss": 2.6755, + "theoretical_loss": 3.558103519093681, + "tokens_seen": 1313140736 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030411233701103306, + "loss": 2.6254, + "theoretical_loss": 3.558087460394513, + "tokens_seen": 1313206272 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003041023069207623, + "loss": 2.6578, + "theoretical_loss": 3.558071402721123, + "tokens_seen": 1313271808 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003040922768304915, + "loss": 2.8261, + "theoretical_loss": 3.558055346073395, + "tokens_seen": 1313337344 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030408224674022066, + "loss": 2.8016, + "theoretical_loss": 3.5580392904512115, + "tokens_seen": 1313402880 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030407221664994984, + "loss": 2.7408, + "theoretical_loss": 3.558023235854456, + "tokens_seen": 1313468416 + }, + { + "epoch": 4.04, + "learning_rate": 0.000304062186559679, + "loss": 2.6432, + "theoretical_loss": 3.5580071822830117, + "tokens_seen": 1313533952 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030405215646940825, + "loss": 2.6584, + "theoretical_loss": 3.5579911297367626, + "tokens_seen": 1313599488 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030404212637913743, + "loss": 2.5022, + "theoretical_loss": 3.5579750782155917, + "tokens_seen": 1313665024 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003040320962888666, + "loss": 2.6291, + "theoretical_loss": 3.557959027719382, + "tokens_seen": 1313730560 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003040220661985958, + "loss": 2.7724, + "theoretical_loss": 3.557942978248018, + "tokens_seen": 1313796096 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030401203610832503, + "loss": 2.5832, + "theoretical_loss": 3.557926929801382, + "tokens_seen": 1313861632 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030400200601805416, + "loss": 2.4701, + "theoretical_loss": 3.5579108823793577, + "tokens_seen": 1313927168 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1490984, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.301514148712158, + "objective/train/theoretical_loss": 3.5578948359818297, + "objective/train/tokens_used": 1334452704, + "theoretical_loss": 3.5578948359818297, + "tokens_seen": 1313992704 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003039919759277834, + "loss": 2.5106, + "theoretical_loss": 3.5578948359818297, + "tokens_seen": 1313992704 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003039819458375125, + "loss": 2.7581, + "theoretical_loss": 3.5578787906086795, + "tokens_seen": 1314058240 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030397191574724176, + "loss": 2.7226, + "theoretical_loss": 3.5578627462597927, + "tokens_seen": 1314123776 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030396188565697094, + "loss": 2.9072, + "theoretical_loss": 3.557846702935051, + "tokens_seen": 1314189312 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003039518555667001, + "loss": 2.5841, + "theoretical_loss": 3.5578306606343393, + "tokens_seen": 1314254848 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003039418254764293, + "loss": 2.3466, + "theoretical_loss": 3.557814619357541, + "tokens_seen": 1314320384 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003039317953861585, + "loss": 2.5913, + "theoretical_loss": 3.557798579104539, + "tokens_seen": 1314385920 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030392176529588766, + "loss": 2.6581, + "theoretical_loss": 3.5577825398752174, + "tokens_seen": 1314451456 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003039117352056169, + "loss": 2.61, + "theoretical_loss": 3.5577665016694597, + "tokens_seen": 1314516992 + }, + { + "epoch": 4.04, + "learning_rate": 0.000303901705115346, + "loss": 2.9948, + "theoretical_loss": 3.5577504644871496, + "tokens_seen": 1314582528 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030389167502507526, + "loss": 2.6986, + "theoretical_loss": 3.557734428328171, + "tokens_seen": 1314648064 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003038816449348044, + "loss": 2.808, + "theoretical_loss": 3.557718393192407, + "tokens_seen": 1314713600 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003038716148445336, + "loss": 2.6255, + "theoretical_loss": 3.5577023590797423, + "tokens_seen": 1314779136 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003038615847542628, + "loss": 2.5779, + "theoretical_loss": 3.5576863259900597, + "tokens_seen": 1314844672 + }, + { + "epoch": 4.04, + "learning_rate": 0.000303851554663992, + "loss": 2.6617, + "theoretical_loss": 3.5576702939232434, + "tokens_seen": 1314910208 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030384152457372117, + "loss": 2.8716, + "theoretical_loss": 3.5576542628791765, + "tokens_seen": 1314975744 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003038314944834504, + "loss": 2.8262, + "theoretical_loss": 3.557638232857744, + "tokens_seen": 1315041280 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030382146439317953, + "loss": 2.7641, + "theoretical_loss": 3.5576222038588288, + "tokens_seen": 1315106816 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030381143430290876, + "loss": 2.8674, + "theoretical_loss": 3.557606175882315, + "tokens_seen": 1315172352 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003038014042126379, + "loss": 2.8339, + "theoretical_loss": 3.5575901489280866, + "tokens_seen": 1315237888 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003037913741223671, + "loss": 2.6716, + "theoretical_loss": 3.557574122996027, + "tokens_seen": 1315303424 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003037813440320963, + "loss": 2.8439, + "theoretical_loss": 3.5575580980860204, + "tokens_seen": 1315368960 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003037713139418255, + "loss": 2.7056, + "theoretical_loss": 3.5575420741979507, + "tokens_seen": 1315434496 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030376128385155467, + "loss": 2.7797, + "theoretical_loss": 3.5575260513317017, + "tokens_seen": 1315500032 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030375125376128385, + "loss": 2.5237, + "theoretical_loss": 3.5575100294871573, + "tokens_seen": 1315565568 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1492577, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.998716354370117, + "objective/train/theoretical_loss": 3.557494008664202, + "objective/train/tokens_used": 1336091104, + "theoretical_loss": 3.557494008664202, + "tokens_seen": 1315631104 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030374122367101303, + "loss": 2.6291, + "theoretical_loss": 3.557494008664202, + "tokens_seen": 1315631104 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030373119358074227, + "loss": 2.5316, + "theoretical_loss": 3.557477988862719, + "tokens_seen": 1315696640 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003037211634904714, + "loss": 2.768, + "theoretical_loss": 3.5574619700825925, + "tokens_seen": 1315762176 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030371113340020063, + "loss": 2.7363, + "theoretical_loss": 3.557445952323707, + "tokens_seen": 1315827712 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003037011033099298, + "loss": 2.671, + "theoretical_loss": 3.557429935585946, + "tokens_seen": 1315893248 + }, + { + "epoch": 4.04, + "learning_rate": 0.000303691073219659, + "loss": 2.6929, + "theoretical_loss": 3.5574139198691936, + "tokens_seen": 1315958784 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030368104312938817, + "loss": 2.8983, + "theoretical_loss": 3.5573979051733344, + "tokens_seen": 1316024320 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030367101303911735, + "loss": 2.7451, + "theoretical_loss": 3.5573818914982525, + "tokens_seen": 1316089856 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030366098294884653, + "loss": 2.506, + "theoretical_loss": 3.557365878843831, + "tokens_seen": 1316155392 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030365095285857577, + "loss": 2.5843, + "theoretical_loss": 3.557349867209955, + "tokens_seen": 1316220928 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003036409227683049, + "loss": 2.8426, + "theoretical_loss": 3.5573338565965074, + "tokens_seen": 1316286464 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030363089267803413, + "loss": 2.8309, + "theoretical_loss": 3.5573178470033744, + "tokens_seen": 1316352000 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030362086258776326, + "loss": 2.8832, + "theoretical_loss": 3.5573018384304382, + "tokens_seen": 1316417536 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003036108324974925, + "loss": 2.7057, + "theoretical_loss": 3.5572858308775843, + "tokens_seen": 1316483072 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003036008024072217, + "loss": 2.7707, + "theoretical_loss": 3.5572698243446963, + "tokens_seen": 1316548608 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030359077231695086, + "loss": 2.3975, + "theoretical_loss": 3.5572538188316587, + "tokens_seen": 1316614144 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030358074222668004, + "loss": 2.8102, + "theoretical_loss": 3.5572378143383556, + "tokens_seen": 1316679680 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003035707121364092, + "loss": 2.6688, + "theoretical_loss": 3.5572218108646716, + "tokens_seen": 1316745216 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003035606820461384, + "loss": 2.969, + "theoretical_loss": 3.5572058084104907, + "tokens_seen": 1316810752 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030355065195586763, + "loss": 2.5797, + "theoretical_loss": 3.557189806975697, + "tokens_seen": 1316876288 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030354062186559676, + "loss": 2.4623, + "theoretical_loss": 3.5571738065601757, + "tokens_seen": 1316941824 + }, + { + "epoch": 4.04, + "learning_rate": 0.000303530591775326, + "loss": 2.8467, + "theoretical_loss": 3.55715780716381, + "tokens_seen": 1317007360 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003035205616850552, + "loss": 2.8461, + "theoretical_loss": 3.5571418087864854, + "tokens_seen": 1317072896 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030351053159478436, + "loss": 2.6863, + "theoretical_loss": 3.5571258114280853, + "tokens_seen": 1317138432 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030350050150451354, + "loss": 2.8625, + "theoretical_loss": 3.557109815088495, + "tokens_seen": 1317203968 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1493455, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.698178291320801, + "objective/train/theoretical_loss": 3.557093819767598, + "objective/train/tokens_used": 1337729504, + "theoretical_loss": 3.557093819767598, + "tokens_seen": 1317269504 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003034904714142427, + "loss": 2.6329, + "theoretical_loss": 3.557093819767598, + "tokens_seen": 1317269504 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003034804413239719, + "loss": 2.5847, + "theoretical_loss": 3.5570778254652793, + "tokens_seen": 1317335040 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030347041123370114, + "loss": 2.5977, + "theoretical_loss": 3.5570618321814234, + "tokens_seen": 1317400576 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030346038114343026, + "loss": 2.711, + "theoretical_loss": 3.5570458399159146, + "tokens_seen": 1317466112 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003034503510531595, + "loss": 2.7197, + "theoretical_loss": 3.5570298486686376, + "tokens_seen": 1317531648 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003034403209628886, + "loss": 2.5191, + "theoretical_loss": 3.557013858439477, + "tokens_seen": 1317597184 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030343029087261786, + "loss": 2.568, + "theoretical_loss": 3.556997869228317, + "tokens_seen": 1317662720 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030342026078234704, + "loss": 2.6477, + "theoretical_loss": 3.5569818810350418, + "tokens_seen": 1317728256 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003034102306920762, + "loss": 2.7502, + "theoretical_loss": 3.5569658938595374, + "tokens_seen": 1317793792 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003034002006018054, + "loss": 2.7715, + "theoretical_loss": 3.5569499077016866, + "tokens_seen": 1317859328 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003033901705115346, + "loss": 2.7943, + "theoretical_loss": 3.5569339225613756, + "tokens_seen": 1317924864 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030338014042126377, + "loss": 2.6835, + "theoretical_loss": 3.556917938438488, + "tokens_seen": 1317990400 + }, + { + "epoch": 4.04, + "learning_rate": 0.000303370110330993, + "loss": 2.6136, + "theoretical_loss": 3.5569019553329095, + "tokens_seen": 1318055936 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030336008024072213, + "loss": 2.7573, + "theoretical_loss": 3.5568859732445235, + "tokens_seen": 1318121472 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030335005015045137, + "loss": 2.6267, + "theoretical_loss": 3.5568699921732154, + "tokens_seen": 1318187008 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030334002006018055, + "loss": 2.652, + "theoretical_loss": 3.5568540121188703, + "tokens_seen": 1318252544 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030332998996990973, + "loss": 2.6998, + "theoretical_loss": 3.556838033081372, + "tokens_seen": 1318318080 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003033199598796389, + "loss": 2.7622, + "theoretical_loss": 3.5568220550606053, + "tokens_seen": 1318383616 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003033099297893681, + "loss": 2.6623, + "theoretical_loss": 3.5568060780564563, + "tokens_seen": 1318449152 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003032998996990973, + "loss": 2.5731, + "theoretical_loss": 3.556790102068808, + "tokens_seen": 1318514688 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003032898696088265, + "loss": 2.7561, + "theoretical_loss": 3.5567741270975466, + "tokens_seen": 1318580224 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003032798395185557, + "loss": 2.7818, + "theoretical_loss": 3.5567581531425563, + "tokens_seen": 1318645760 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030326980942828487, + "loss": 2.5645, + "theoretical_loss": 3.556742180203722, + "tokens_seen": 1318711296 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030325977933801405, + "loss": 2.8654, + "theoretical_loss": 3.5567262082809283, + "tokens_seen": 1318776832 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030324974924774323, + "loss": 2.4856, + "theoretical_loss": 3.5567102373740607, + "tokens_seen": 1318842368 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1494825, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.679054021835327, + "objective/train/theoretical_loss": 3.5566942674830035, + "objective/train/tokens_used": 1339367904, + "theoretical_loss": 3.5566942674830035, + "tokens_seen": 1318907904 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030323971915747247, + "loss": 2.5742, + "theoretical_loss": 3.5566942674830035, + "tokens_seen": 1318907904 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003032296890672016, + "loss": 2.674, + "theoretical_loss": 3.5566782986076424, + "tokens_seen": 1318973440 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030321965897693083, + "loss": 2.6931, + "theoretical_loss": 3.5566623307478613, + "tokens_seen": 1319038976 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030320962888666, + "loss": 2.5451, + "theoretical_loss": 3.556646363903546, + "tokens_seen": 1319104512 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003031995987963892, + "loss": 2.7607, + "theoretical_loss": 3.556630398074581, + "tokens_seen": 1319170048 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030318956870611837, + "loss": 2.6781, + "theoretical_loss": 3.556614433260852, + "tokens_seen": 1319235584 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030317953861584755, + "loss": 2.9642, + "theoretical_loss": 3.5565984694622426, + "tokens_seen": 1319301120 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030316950852557673, + "loss": 2.8698, + "theoretical_loss": 3.556582506678639, + "tokens_seen": 1319366656 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030315947843530597, + "loss": 2.4901, + "theoretical_loss": 3.556566544909926, + "tokens_seen": 1319432192 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003031494483450351, + "loss": 2.6169, + "theoretical_loss": 3.556550584155989, + "tokens_seen": 1319497728 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030313941825476433, + "loss": 2.7552, + "theoretical_loss": 3.5565346244167126, + "tokens_seen": 1319563264 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030312938816449346, + "loss": 2.5759, + "theoretical_loss": 3.556518665691982, + "tokens_seen": 1319628800 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003031193580742227, + "loss": 2.8975, + "theoretical_loss": 3.556502707981682, + "tokens_seen": 1319694336 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003031093279839519, + "loss": 2.7001, + "theoretical_loss": 3.556486751285699, + "tokens_seen": 1319759872 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030309929789368106, + "loss": 2.5405, + "theoretical_loss": 3.556470795603916, + "tokens_seen": 1319825408 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030308926780341024, + "loss": 2.7051, + "theoretical_loss": 3.5564548409362207, + "tokens_seen": 1319890944 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030308926780341024, + "loss": 2.6966, + "theoretical_loss": 3.5564388872824964, + "tokens_seen": 1319956480 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003030792377131394, + "loss": 2.593, + "theoretical_loss": 3.556422934642629, + "tokens_seen": 1320022016 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003030692076228686, + "loss": 2.5576, + "theoretical_loss": 3.5564069830165037, + "tokens_seen": 1320087552 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030305917753259783, + "loss": 2.7134, + "theoretical_loss": 3.5563910324040062, + "tokens_seen": 1320153088 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030304914744232696, + "loss": 2.7155, + "theoretical_loss": 3.5563750828050207, + "tokens_seen": 1320218624 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003030391173520562, + "loss": 2.4824, + "theoretical_loss": 3.5563591342194334, + "tokens_seen": 1320284160 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003030290872617854, + "loss": 2.7657, + "theoretical_loss": 3.5563431866471293, + "tokens_seen": 1320349696 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030301905717151456, + "loss": 2.6831, + "theoretical_loss": 3.556327240087994, + "tokens_seen": 1320415232 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030300902708124374, + "loss": 2.77, + "theoretical_loss": 3.5563112945419126, + "tokens_seen": 1320480768 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1495445, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9724247455596924, + "objective/train/theoretical_loss": 3.55629535000877, + "objective/train/tokens_used": 1341006304, + "theoretical_loss": 3.55629535000877, + "tokens_seen": 1320546304 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003029989969909729, + "loss": 2.9013, + "theoretical_loss": 3.55629535000877, + "tokens_seen": 1320546304 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003029889669007021, + "loss": 2.7225, + "theoretical_loss": 3.5562794064884526, + "tokens_seen": 1320611840 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030297893681043134, + "loss": 2.4485, + "theoretical_loss": 3.5562634639808453, + "tokens_seen": 1320677376 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030296890672016046, + "loss": 2.7318, + "theoretical_loss": 3.556247522485833, + "tokens_seen": 1320742912 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003029588766298897, + "loss": 2.7806, + "theoretical_loss": 3.556231582003303, + "tokens_seen": 1320808448 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003029488465396188, + "loss": 2.6104, + "theoretical_loss": 3.5562156425331377, + "tokens_seen": 1320873984 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030293881644934806, + "loss": 2.7759, + "theoretical_loss": 3.5561997040752256, + "tokens_seen": 1320939520 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030292878635907724, + "loss": 2.5906, + "theoretical_loss": 3.5561837666294505, + "tokens_seen": 1321005056 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003029187562688064, + "loss": 2.8147, + "theoretical_loss": 3.5561678301956983, + "tokens_seen": 1321070592 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003029087261785356, + "loss": 2.6234, + "theoretical_loss": 3.556151894773854, + "tokens_seen": 1321136128 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003028986960882648, + "loss": 2.8506, + "theoretical_loss": 3.5561359603638047, + "tokens_seen": 1321201664 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030288866599799397, + "loss": 2.7474, + "theoretical_loss": 3.5561200269654343, + "tokens_seen": 1321267200 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003028786359077232, + "loss": 2.8021, + "theoretical_loss": 3.5561040945786293, + "tokens_seen": 1321332736 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030286860581745233, + "loss": 2.818, + "theoretical_loss": 3.556088163203275, + "tokens_seen": 1321398272 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030285857572718157, + "loss": 2.7076, + "theoretical_loss": 3.5560722328392576, + "tokens_seen": 1321463808 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030284854563691075, + "loss": 2.6369, + "theoretical_loss": 3.5560563034864616, + "tokens_seen": 1321529344 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030283851554663993, + "loss": 2.5183, + "theoretical_loss": 3.556040375144774, + "tokens_seen": 1321594880 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003028284854563691, + "loss": 2.545, + "theoretical_loss": 3.556024447814079, + "tokens_seen": 1321660416 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003028184553660983, + "loss": 2.4651, + "theoretical_loss": 3.556008521494264, + "tokens_seen": 1321725952 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030280842527582747, + "loss": 2.6089, + "theoretical_loss": 3.555992596185213, + "tokens_seen": 1321791488 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003027983951855567, + "loss": 2.6557, + "theoretical_loss": 3.555976671886813, + "tokens_seen": 1321857024 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030278836509528583, + "loss": 2.6763, + "theoretical_loss": 3.5559607485989497, + "tokens_seen": 1321922560 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030277833500501507, + "loss": 2.6129, + "theoretical_loss": 3.5559448263215083, + "tokens_seen": 1321988096 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003027683049147442, + "loss": 2.5999, + "theoretical_loss": 3.555928905054375, + "tokens_seen": 1322053632 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030275827482447343, + "loss": 2.923, + "theoretical_loss": 3.555912984797435, + "tokens_seen": 1322119168 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1496938, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7434074878692627, + "objective/train/theoretical_loss": 3.555897065550575, + "objective/train/tokens_used": 1342644704, + "theoretical_loss": 3.555897065550575, + "tokens_seen": 1322184704 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003027482447342026, + "loss": 2.7297, + "theoretical_loss": 3.555897065550575, + "tokens_seen": 1322184704 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003027382146439318, + "loss": 2.804, + "theoretical_loss": 3.5558811473136807, + "tokens_seen": 1322250240 + }, + { + "epoch": 4.04, + "learning_rate": 0.000302728184553661, + "loss": 2.3758, + "theoretical_loss": 3.5558652300866367, + "tokens_seen": 1322315776 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003027181544633902, + "loss": 2.5547, + "theoretical_loss": 3.555849313869331, + "tokens_seen": 1322381312 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030270812437311934, + "loss": 2.663, + "theoretical_loss": 3.5558333986616475, + "tokens_seen": 1322446848 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030269809428284857, + "loss": 2.5285, + "theoretical_loss": 3.555817484463474, + "tokens_seen": 1322512384 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003026880641925777, + "loss": 2.6559, + "theoretical_loss": 3.555801571274695, + "tokens_seen": 1322577920 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030267803410230693, + "loss": 2.7535, + "theoretical_loss": 3.5557856590951964, + "tokens_seen": 1322643456 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003026680040120361, + "loss": 2.8497, + "theoretical_loss": 3.5557697479248653, + "tokens_seen": 1322708992 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003026579739217653, + "loss": 2.6702, + "theoretical_loss": 3.555753837763587, + "tokens_seen": 1322774528 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003026479438314945, + "loss": 2.6117, + "theoretical_loss": 3.555737928611248, + "tokens_seen": 1322840064 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030263791374122366, + "loss": 2.5529, + "theoretical_loss": 3.5557220204677336, + "tokens_seen": 1322905600 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030262788365095284, + "loss": 2.6863, + "theoretical_loss": 3.5557061133329304, + "tokens_seen": 1322971136 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003026178535606821, + "loss": 2.7612, + "theoretical_loss": 3.555690207206724, + "tokens_seen": 1323036672 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003026078234704112, + "loss": 2.8795, + "theoretical_loss": 3.5556743020890007, + "tokens_seen": 1323102208 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030259779338014044, + "loss": 2.7482, + "theoretical_loss": 3.5556583979796477, + "tokens_seen": 1323167744 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030258776328986956, + "loss": 2.6481, + "theoretical_loss": 3.5556424948785494, + "tokens_seen": 1323233280 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003025777331995988, + "loss": 2.561, + "theoretical_loss": 3.555626592785593, + "tokens_seen": 1323298816 + }, + { + "epoch": 4.04, + "learning_rate": 0.000302567703109328, + "loss": 2.6666, + "theoretical_loss": 3.555610691700664, + "tokens_seen": 1323364352 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030255767301905716, + "loss": 2.6272, + "theoretical_loss": 3.5555947916236494, + "tokens_seen": 1323429888 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003025476429287864, + "loss": 2.755, + "theoretical_loss": 3.5555788925544345, + "tokens_seen": 1323495424 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003025376128385156, + "loss": 2.7486, + "theoretical_loss": 3.5555629944929064, + "tokens_seen": 1323560960 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030252758274824476, + "loss": 2.768, + "theoretical_loss": 3.555547097438951, + "tokens_seen": 1323626496 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030251755265797394, + "loss": 2.7912, + "theoretical_loss": 3.555531201392454, + "tokens_seen": 1323692032 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003025075225677031, + "loss": 2.7348, + "theoretical_loss": 3.5555153063533025, + "tokens_seen": 1323757568 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1497393, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.852140426635742, + "objective/train/theoretical_loss": 3.5554994123213826, + "objective/train/tokens_used": 1344283104, + "theoretical_loss": 3.5554994123213826, + "tokens_seen": 1323823104 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003024974924774323, + "loss": 2.8627, + "theoretical_loss": 3.5554994123213826, + "tokens_seen": 1323823104 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030248746238716154, + "loss": 2.9006, + "theoretical_loss": 3.55548351929658, + "tokens_seen": 1323888640 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030247743229689066, + "loss": 2.5953, + "theoretical_loss": 3.555467627278782, + "tokens_seen": 1323954176 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003024674022066199, + "loss": 2.5436, + "theoretical_loss": 3.555451736267874, + "tokens_seen": 1324019712 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030245737211634903, + "loss": 2.6353, + "theoretical_loss": 3.5554358462637436, + "tokens_seen": 1324085248 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030244734202607826, + "loss": 2.6937, + "theoretical_loss": 3.5554199572662757, + "tokens_seen": 1324150784 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030243731193580744, + "loss": 2.849, + "theoretical_loss": 3.5554040692753577, + "tokens_seen": 1324216320 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003024272818455366, + "loss": 2.7065, + "theoretical_loss": 3.555388182290876, + "tokens_seen": 1324281856 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003024172517552658, + "loss": 2.8507, + "theoretical_loss": 3.555372296312717, + "tokens_seen": 1324347392 + }, + { + "epoch": 4.04, + "learning_rate": 0.000302407221664995, + "loss": 2.6622, + "theoretical_loss": 3.555356411340766, + "tokens_seen": 1324412928 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030239719157472417, + "loss": 2.7269, + "theoretical_loss": 3.555340527374911, + "tokens_seen": 1324478464 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003023871614844534, + "loss": 2.6154, + "theoretical_loss": 3.5553246444150384, + "tokens_seen": 1324544000 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030237713139418253, + "loss": 2.501, + "theoretical_loss": 3.5553087624610336, + "tokens_seen": 1324609536 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030236710130391177, + "loss": 2.6562, + "theoretical_loss": 3.555292881512784, + "tokens_seen": 1324675072 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030235707121364095, + "loss": 2.5376, + "theoretical_loss": 3.5552770015701762, + "tokens_seen": 1324740608 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030234704112337013, + "loss": 2.8098, + "theoretical_loss": 3.5552611226330963, + "tokens_seen": 1324806144 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003023370110330993, + "loss": 2.767, + "theoretical_loss": 3.555245244701431, + "tokens_seen": 1324871680 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003023269809428285, + "loss": 2.5934, + "theoretical_loss": 3.5552293677750675, + "tokens_seen": 1324937216 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030231695085255767, + "loss": 2.7604, + "theoretical_loss": 3.5552134918538916, + "tokens_seen": 1325002752 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003023069207622869, + "loss": 2.7105, + "theoretical_loss": 3.5551976169377904, + "tokens_seen": 1325068288 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030229689067201603, + "loss": 2.5927, + "theoretical_loss": 3.5551817430266506, + "tokens_seen": 1325133824 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030228686058174527, + "loss": 2.7396, + "theoretical_loss": 3.5551658701203586, + "tokens_seen": 1325199360 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003022768304914744, + "loss": 2.7238, + "theoretical_loss": 3.5551499982188015, + "tokens_seen": 1325264896 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030226680040120363, + "loss": 2.7457, + "theoretical_loss": 3.5551341273218657, + "tokens_seen": 1325330432 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003022567703109328, + "loss": 2.7258, + "theoretical_loss": 3.555118257429438, + "tokens_seen": 1325395968 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1498967, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8756134510040283, + "objective/train/theoretical_loss": 3.555102388541405, + "objective/train/tokens_used": 1345921504, + "theoretical_loss": 3.555102388541405, + "tokens_seen": 1325461504 + }, + { + "epoch": 4.04, + "learning_rate": 0.000302246740220662, + "loss": 2.9374, + "theoretical_loss": 3.555102388541405, + "tokens_seen": 1325461504 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003022367101303912, + "loss": 2.7265, + "theoretical_loss": 3.555086520657654, + "tokens_seen": 1325527040 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003022266800401204, + "loss": 2.7696, + "theoretical_loss": 3.5550706537780714, + "tokens_seen": 1325592576 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030221664994984954, + "loss": 2.6507, + "theoretical_loss": 3.5550547879025443, + "tokens_seen": 1325658112 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030220661985957877, + "loss": 2.7139, + "theoretical_loss": 3.555038923030959, + "tokens_seen": 1325723648 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003021965897693079, + "loss": 2.8103, + "theoretical_loss": 3.5550230591632026, + "tokens_seen": 1325789184 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030218655967903713, + "loss": 2.656, + "theoretical_loss": 3.5550071962991616, + "tokens_seen": 1325854720 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003021765295887663, + "loss": 2.6585, + "theoretical_loss": 3.554991334438724, + "tokens_seen": 1325920256 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003021664994984955, + "loss": 2.7376, + "theoretical_loss": 3.554975473581776, + "tokens_seen": 1325985792 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003021564694082247, + "loss": 2.6548, + "theoretical_loss": 3.5549596137282045, + "tokens_seen": 1326051328 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030214643931795386, + "loss": 2.7974, + "theoretical_loss": 3.554943754877896, + "tokens_seen": 1326116864 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030213640922768304, + "loss": 2.6282, + "theoretical_loss": 3.5549278970307383, + "tokens_seen": 1326182400 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003021263791374123, + "loss": 2.695, + "theoretical_loss": 3.554912040186618, + "tokens_seen": 1326247936 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003021163490471414, + "loss": 2.7362, + "theoretical_loss": 3.554896184345422, + "tokens_seen": 1326313472 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030210631895687064, + "loss": 2.5531, + "theoretical_loss": 3.5548803295070375, + "tokens_seen": 1326379008 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030209628886659976, + "loss": 2.6168, + "theoretical_loss": 3.5548644756713514, + "tokens_seen": 1326444544 + }, + { + "epoch": 4.04, + "learning_rate": 0.000302086258776329, + "loss": 2.9134, + "theoretical_loss": 3.554848622838251, + "tokens_seen": 1326510080 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003020762286860582, + "loss": 2.7424, + "theoretical_loss": 3.5548327710076233, + "tokens_seen": 1326575616 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030206619859578736, + "loss": 2.648, + "theoretical_loss": 3.5548169201793547, + "tokens_seen": 1326641152 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030205616850551654, + "loss": 2.8463, + "theoretical_loss": 3.5548010703533333, + "tokens_seen": 1326706688 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003020461384152458, + "loss": 2.5416, + "theoretical_loss": 3.5547852215294453, + "tokens_seen": 1326772224 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003020361083249749, + "loss": 2.6082, + "theoretical_loss": 3.554769373707579, + "tokens_seen": 1326837760 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030202607823470414, + "loss": 2.6569, + "theoretical_loss": 3.5547535268876205, + "tokens_seen": 1326903296 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030201604814443327, + "loss": 2.6231, + "theoretical_loss": 3.554737681069458, + "tokens_seen": 1326968832 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003020060180541625, + "loss": 2.7764, + "theoretical_loss": 3.5547218362529773, + "tokens_seen": 1327034368 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1499694, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.877124547958374, + "objective/train/theoretical_loss": 3.5547059924380666, + "objective/train/tokens_used": 1347559904, + "theoretical_loss": 3.5547059924380666, + "tokens_seen": 1327099904 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003019959879638917, + "loss": 2.8643, + "theoretical_loss": 3.5547059924380666, + "tokens_seen": 1327099904 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030198595787362087, + "loss": 2.7271, + "theoretical_loss": 3.554690149624613, + "tokens_seen": 1327165440 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030197592778335005, + "loss": 2.7456, + "theoretical_loss": 3.554674307812504, + "tokens_seen": 1327230976 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030196589769307923, + "loss": 2.6898, + "theoretical_loss": 3.5546584670016257, + "tokens_seen": 1327296512 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003019558676028084, + "loss": 2.5976, + "theoretical_loss": 3.5546426271918667, + "tokens_seen": 1327362048 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030194583751253764, + "loss": 2.7791, + "theoretical_loss": 3.554626788383114, + "tokens_seen": 1327427584 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030193580742226677, + "loss": 2.7436, + "theoretical_loss": 3.554610950575255, + "tokens_seen": 1327493120 + }, + { + "epoch": 4.04, + "learning_rate": 0.000301925777331996, + "loss": 2.3796, + "theoretical_loss": 3.554595113768176, + "tokens_seen": 1327558656 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030191574724172513, + "loss": 2.3177, + "theoretical_loss": 3.554579277961766, + "tokens_seen": 1327624192 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030190571715145437, + "loss": 2.7672, + "theoretical_loss": 3.554563443155911, + "tokens_seen": 1327689728 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030189568706118355, + "loss": 2.8033, + "theoretical_loss": 3.5545476093504993, + "tokens_seen": 1327755264 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030188565697091273, + "loss": 2.5326, + "theoretical_loss": 3.554531776545418, + "tokens_seen": 1327820800 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003018756268806419, + "loss": 2.7771, + "theoretical_loss": 3.554515944740554, + "tokens_seen": 1327886336 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030186559679037115, + "loss": 2.9061, + "theoretical_loss": 3.554500113935796, + "tokens_seen": 1327951872 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003018555667001003, + "loss": 2.6614, + "theoretical_loss": 3.5544842841310302, + "tokens_seen": 1328017408 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003018455366098295, + "loss": 2.7331, + "theoretical_loss": 3.5544684553261447, + "tokens_seen": 1328082944 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030183550651955864, + "loss": 2.5361, + "theoretical_loss": 3.5544526275210266, + "tokens_seen": 1328148480 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030182547642928787, + "loss": 2.7823, + "theoretical_loss": 3.554436800715564, + "tokens_seen": 1328214016 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030181544633901705, + "loss": 2.5981, + "theoretical_loss": 3.5544209749096445, + "tokens_seen": 1328279552 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030180541624874623, + "loss": 2.6959, + "theoretical_loss": 3.554405150103155, + "tokens_seen": 1328345088 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030179538615847547, + "loss": 2.7439, + "theoretical_loss": 3.5543893262959836, + "tokens_seen": 1328410624 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003017853560682046, + "loss": 2.6138, + "theoretical_loss": 3.5543735034880175, + "tokens_seen": 1328476160 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030177532597793383, + "loss": 2.5255, + "theoretical_loss": 3.5543576816791447, + "tokens_seen": 1328541696 + }, + { + "epoch": 4.04, + "learning_rate": 0.000301765295887663, + "loss": 2.6184, + "theoretical_loss": 3.554341860869253, + "tokens_seen": 1328607232 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003017552657973922, + "loss": 2.557, + "theoretical_loss": 3.5543260410582294, + "tokens_seen": 1328672768 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1500360, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.750994920730591, + "objective/train/theoretical_loss": 3.554310222245962, + "objective/train/tokens_used": 1349198304, + "theoretical_loss": 3.554310222245962, + "tokens_seen": 1328738304 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003017452357071214, + "loss": 2.6728, + "theoretical_loss": 3.554310222245962, + "tokens_seen": 1328738304 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003017352056168506, + "loss": 2.6421, + "theoretical_loss": 3.5542944044323384, + "tokens_seen": 1328803840 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030172517552657974, + "loss": 2.787, + "theoretical_loss": 3.5542785876172465, + "tokens_seen": 1328869376 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030171514543630897, + "loss": 2.7608, + "theoretical_loss": 3.5542627718005733, + "tokens_seen": 1328934912 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003017051153460381, + "loss": 2.5684, + "theoretical_loss": 3.5542469569822073, + "tokens_seen": 1329000448 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030169508525576733, + "loss": 2.539, + "theoretical_loss": 3.554231143162036, + "tokens_seen": 1329065984 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003016850551654965, + "loss": 2.5904, + "theoretical_loss": 3.5542153303399475, + "tokens_seen": 1329131520 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003016750250752257, + "loss": 2.5785, + "theoretical_loss": 3.554199518515829, + "tokens_seen": 1329197056 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003016649949849549, + "loss": 2.8468, + "theoretical_loss": 3.554183707689569, + "tokens_seen": 1329262592 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030165496489468406, + "loss": 2.7968, + "theoretical_loss": 3.554167897861055, + "tokens_seen": 1329328128 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030164493480441324, + "loss": 2.4569, + "theoretical_loss": 3.5541520890301745, + "tokens_seen": 1329393664 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003016349047141425, + "loss": 2.6965, + "theoretical_loss": 3.554136281196816, + "tokens_seen": 1329459200 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003016248746238716, + "loss": 2.555, + "theoretical_loss": 3.5541204743608663, + "tokens_seen": 1329524736 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030161484453360084, + "loss": 2.7436, + "theoretical_loss": 3.5541046685222146, + "tokens_seen": 1329590272 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030160481444332996, + "loss": 2.76, + "theoretical_loss": 3.5540888636807484, + "tokens_seen": 1329655808 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003015947843530592, + "loss": 2.6591, + "theoretical_loss": 3.5540730598363552, + "tokens_seen": 1329721344 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003015847542627884, + "loss": 2.5531, + "theoretical_loss": 3.5540572569889237, + "tokens_seen": 1329786880 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030157472417251756, + "loss": 2.7457, + "theoretical_loss": 3.554041455138342, + "tokens_seen": 1329852416 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030156469408224674, + "loss": 2.6176, + "theoretical_loss": 3.5540256542844966, + "tokens_seen": 1329917952 + }, + { + "epoch": 4.04, + "learning_rate": 0.000301554663991976, + "loss": 2.508, + "theoretical_loss": 3.554009854427277, + "tokens_seen": 1329983488 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003015446339017051, + "loss": 2.8276, + "theoretical_loss": 3.5539940555665703, + "tokens_seen": 1330049024 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030153460381143434, + "loss": 2.7321, + "theoretical_loss": 3.5539782577022656, + "tokens_seen": 1330114560 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030152457372116347, + "loss": 2.5992, + "theoretical_loss": 3.55396246083425, + "tokens_seen": 1330180096 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003015145436308927, + "loss": 2.5204, + "theoretical_loss": 3.553946664962412, + "tokens_seen": 1330245632 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003015045135406219, + "loss": 2.8735, + "theoretical_loss": 3.5539308700866394, + "tokens_seen": 1330311168 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1501699, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.85687518119812, + "objective/train/theoretical_loss": 3.5539150762068212, + "objective/train/tokens_used": 1350836704, + "theoretical_loss": 3.5539150762068212, + "tokens_seen": 1330376704 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030149448345035107, + "loss": 2.6929, + "theoretical_loss": 3.5539150762068212, + "tokens_seen": 1330376704 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030148445336008025, + "loss": 2.5348, + "theoretical_loss": 3.5538992833228447, + "tokens_seen": 1330442240 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030147442326980943, + "loss": 2.8463, + "theoretical_loss": 3.553883491434598, + "tokens_seen": 1330507776 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003014643931795386, + "loss": 2.6109, + "theoretical_loss": 3.5538677005419697, + "tokens_seen": 1330573312 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030145436308926784, + "loss": 2.8107, + "theoretical_loss": 3.5538519106448483, + "tokens_seen": 1330638848 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030144433299899697, + "loss": 2.5365, + "theoretical_loss": 3.553836121743121, + "tokens_seen": 1330704384 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003014343029087262, + "loss": 2.7058, + "theoretical_loss": 3.5538203338366765, + "tokens_seen": 1330769920 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030142427281845533, + "loss": 2.6364, + "theoretical_loss": 3.553804546925404, + "tokens_seen": 1330835456 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030141424272818457, + "loss": 2.2382, + "theoretical_loss": 3.5537887610091903, + "tokens_seen": 1330900992 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030140421263791375, + "loss": 2.6748, + "theoretical_loss": 3.553772976087924, + "tokens_seen": 1330966528 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030139418254764293, + "loss": 2.7097, + "theoretical_loss": 3.5537571921614943, + "tokens_seen": 1331032064 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003013841524573721, + "loss": 2.6677, + "theoretical_loss": 3.5537414092297888, + "tokens_seen": 1331097600 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030137412236710135, + "loss": 2.5247, + "theoretical_loss": 3.553725627292696, + "tokens_seen": 1331163136 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003013640922768305, + "loss": 2.4289, + "theoretical_loss": 3.5537098463501042, + "tokens_seen": 1331228672 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003013540621865597, + "loss": 2.6032, + "theoretical_loss": 3.553694066401902, + "tokens_seen": 1331294208 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030134403209628884, + "loss": 2.7124, + "theoretical_loss": 3.553678287447977, + "tokens_seen": 1331359744 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030133400200601807, + "loss": 2.6053, + "theoretical_loss": 3.553662509488219, + "tokens_seen": 1331425280 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030132397191574725, + "loss": 2.6833, + "theoretical_loss": 3.553646732522515, + "tokens_seen": 1331490816 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030131394182547643, + "loss": 2.7973, + "theoretical_loss": 3.5536309565507542, + "tokens_seen": 1331556352 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003013039117352056, + "loss": 2.5321, + "theoretical_loss": 3.5536151815728254, + "tokens_seen": 1331621888 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003012938816449348, + "loss": 2.7239, + "theoretical_loss": 3.5535994075886164, + "tokens_seen": 1331687424 + }, + { + "epoch": 4.04, + "learning_rate": 0.000301283851554664, + "loss": 2.8171, + "theoretical_loss": 3.5535836345980165, + "tokens_seen": 1331752960 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003012738214643932, + "loss": 2.7051, + "theoretical_loss": 3.5535678626009126, + "tokens_seen": 1331818496 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030126379137412234, + "loss": 2.6428, + "theoretical_loss": 3.5535520915971954, + "tokens_seen": 1331884032 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003012537612838516, + "loss": 2.857, + "theoretical_loss": 3.5535363215867517, + "tokens_seen": 1331949568 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1502342, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4824728965759277, + "objective/train/theoretical_loss": 3.553520552569471, + "objective/train/tokens_used": 1352475104, + "theoretical_loss": 3.553520552569471, + "tokens_seen": 1332015104 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003012437311935807, + "loss": 2.8718, + "theoretical_loss": 3.553520552569471, + "tokens_seen": 1332015104 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030123370110330994, + "loss": 2.7301, + "theoretical_loss": 3.5535047845452414, + "tokens_seen": 1332080640 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003012236710130391, + "loss": 2.6574, + "theoretical_loss": 3.553489017513952, + "tokens_seen": 1332146176 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003012136409227683, + "loss": 2.6469, + "theoretical_loss": 3.5534732514754914, + "tokens_seen": 1332211712 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003012036108324975, + "loss": 2.8646, + "theoretical_loss": 3.5534574864297475, + "tokens_seen": 1332277248 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003011935807422267, + "loss": 2.8386, + "theoretical_loss": 3.5534417223766095, + "tokens_seen": 1332342784 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030118355065195584, + "loss": 2.7916, + "theoretical_loss": 3.553425959315967, + "tokens_seen": 1332408320 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003011735205616851, + "loss": 2.7231, + "theoretical_loss": 3.5534101972477066, + "tokens_seen": 1332473856 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003011634904714142, + "loss": 2.6288, + "theoretical_loss": 3.553394436171719, + "tokens_seen": 1332539392 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030115346038114344, + "loss": 2.6364, + "theoretical_loss": 3.553378676087892, + "tokens_seen": 1332604928 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003011434302908726, + "loss": 2.5164, + "theoretical_loss": 3.553362916996114, + "tokens_seen": 1332670464 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003011334002006018, + "loss": 2.5531, + "theoretical_loss": 3.553347158896275, + "tokens_seen": 1332736000 + }, + { + "epoch": 4.04, + "learning_rate": 0.000301123370110331, + "loss": 2.8268, + "theoretical_loss": 3.5533314017882622, + "tokens_seen": 1332801536 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030111334002006016, + "loss": 2.658, + "theoretical_loss": 3.553315645671966, + "tokens_seen": 1332867072 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030110330992978935, + "loss": 2.6564, + "theoretical_loss": 3.553299890547274, + "tokens_seen": 1332932608 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003010932798395186, + "loss": 2.4595, + "theoretical_loss": 3.553284136414076, + "tokens_seen": 1332998144 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003010832497492477, + "loss": 2.5884, + "theoretical_loss": 3.5532683832722602, + "tokens_seen": 1333063680 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030107321965897694, + "loss": 2.6104, + "theoretical_loss": 3.553252631121716, + "tokens_seen": 1333129216 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030106318956870607, + "loss": 2.8035, + "theoretical_loss": 3.5532368799623315, + "tokens_seen": 1333194752 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003010531594784353, + "loss": 2.641, + "theoretical_loss": 3.553221129793996, + "tokens_seen": 1333260288 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030104312938816454, + "loss": 2.4992, + "theoretical_loss": 3.553205380616599, + "tokens_seen": 1333325824 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030103309929789367, + "loss": 2.7852, + "theoretical_loss": 3.5531896324300285, + "tokens_seen": 1333391360 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003010230692076229, + "loss": 2.8122, + "theoretical_loss": 3.553173885234174, + "tokens_seen": 1333456896 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003010130391173521, + "loss": 2.769, + "theoretical_loss": 3.553158139028925, + "tokens_seen": 1333522432 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030100300902708127, + "loss": 2.7795, + "theoretical_loss": 3.5531423938141695, + "tokens_seen": 1333587968 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1503973, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.984467029571533, + "objective/train/theoretical_loss": 3.553126649589797, + "objective/train/tokens_used": 1354113504, + "theoretical_loss": 3.553126649589797, + "tokens_seen": 1333653504 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030099297893681045, + "loss": 2.9471, + "theoretical_loss": 3.553126649589797, + "tokens_seen": 1333653504 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030098294884653963, + "loss": 2.8916, + "theoretical_loss": 3.5531109063556965, + "tokens_seen": 1333719040 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003009729187562688, + "loss": 2.3241, + "theoretical_loss": 3.553095164111757, + "tokens_seen": 1333784576 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030096288866599804, + "loss": 2.926, + "theoretical_loss": 3.553079422857868, + "tokens_seen": 1333850112 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030095285857572717, + "loss": 2.8648, + "theoretical_loss": 3.553063682593918, + "tokens_seen": 1333915648 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003009428284854564, + "loss": 2.6997, + "theoretical_loss": 3.5530479433197963, + "tokens_seen": 1333981184 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030093279839518553, + "loss": 2.6958, + "theoretical_loss": 3.553032205035392, + "tokens_seen": 1334046720 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030092276830491477, + "loss": 2.6967, + "theoretical_loss": 3.553016467740594, + "tokens_seen": 1334112256 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030091273821464395, + "loss": 2.5381, + "theoretical_loss": 3.5530007314352927, + "tokens_seen": 1334177792 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030090270812437313, + "loss": 2.7095, + "theoretical_loss": 3.552984996119376, + "tokens_seen": 1334243328 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003008926780341023, + "loss": 2.9267, + "theoretical_loss": 3.5529692617927333, + "tokens_seen": 1334308864 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030088264794383155, + "loss": 2.9963, + "theoretical_loss": 3.5529535284552543, + "tokens_seen": 1334374400 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003008726178535607, + "loss": 2.7728, + "theoretical_loss": 3.5529377961068276, + "tokens_seen": 1334439936 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003008625877632899, + "loss": 2.7666, + "theoretical_loss": 3.5529220647473427, + "tokens_seen": 1334505472 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030085255767301904, + "loss": 2.7262, + "theoretical_loss": 3.552906334376689, + "tokens_seen": 1334571008 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030084252758274827, + "loss": 2.5032, + "theoretical_loss": 3.552890604994756, + "tokens_seen": 1334636544 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030083249749247745, + "loss": 2.8243, + "theoretical_loss": 3.5528748766014333, + "tokens_seen": 1334702080 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030082246740220663, + "loss": 2.6245, + "theoretical_loss": 3.5528591491966086, + "tokens_seen": 1334767616 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003008124373119358, + "loss": 2.6308, + "theoretical_loss": 3.552843422780173, + "tokens_seen": 1334833152 + }, + { + "epoch": 4.04, + "learning_rate": 0.000300802407221665, + "loss": 2.6874, + "theoretical_loss": 3.552827697352015, + "tokens_seen": 1334898688 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003007923771313942, + "loss": 2.753, + "theoretical_loss": 3.5528119729120244, + "tokens_seen": 1334964224 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003007823470411234, + "loss": 2.6477, + "theoretical_loss": 3.5527962494600906, + "tokens_seen": 1335029760 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030077231695085254, + "loss": 2.7341, + "theoretical_loss": 3.552780526996102, + "tokens_seen": 1335095296 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003007622868605818, + "loss": 2.6776, + "theoretical_loss": 3.552764805519949, + "tokens_seen": 1335160832 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003007522567703109, + "loss": 2.6165, + "theoretical_loss": 3.5527490850315218, + "tokens_seen": 1335226368 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 1504757, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.535205125808716, + "objective/train/theoretical_loss": 3.552733365530708, + "objective/train/tokens_used": 1355751904, + "theoretical_loss": 3.552733365530708, + "tokens_seen": 1335291904 + }, + { + "epoch": 4.04, + "learning_rate": 0.00030074222668004014, + "loss": 2.5726, + "theoretical_loss": 3.552733365530708, + "tokens_seen": 1335291904 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003007321965897693, + "loss": 2.7941, + "theoretical_loss": 3.5527176470173982, + "tokens_seen": 1335357440 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003007221664994985, + "loss": 2.77, + "theoretical_loss": 3.552701929491482, + "tokens_seen": 1335422976 + }, + { + "epoch": 4.04, + "learning_rate": 0.0003007121364092277, + "loss": 2.4994, + "theoretical_loss": 3.5526862129528487, + "tokens_seen": 1335488512 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003007021063189569, + "loss": 2.6934, + "theoretical_loss": 3.5526704974013876, + "tokens_seen": 1335554048 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030069207622868604, + "loss": 2.7728, + "theoretical_loss": 3.5526547828369885, + "tokens_seen": 1335619584 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003006820461384153, + "loss": 2.7509, + "theoretical_loss": 3.5526390692595413, + "tokens_seen": 1335685120 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003006720160481444, + "loss": 2.7018, + "theoretical_loss": 3.552623356668935, + "tokens_seen": 1335750656 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030066198595787364, + "loss": 2.6671, + "theoretical_loss": 3.552607645065059, + "tokens_seen": 1335816192 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003006519558676028, + "loss": 2.7556, + "theoretical_loss": 3.552591934447804, + "tokens_seen": 1335881728 + }, + { + "epoch": 4.05, + "learning_rate": 0.000300641925777332, + "loss": 2.8379, + "theoretical_loss": 3.5525762248170585, + "tokens_seen": 1335947264 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003006318956870612, + "loss": 2.5638, + "theoretical_loss": 3.552560516172713, + "tokens_seen": 1336012800 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030062186559679036, + "loss": 2.7779, + "theoretical_loss": 3.552544808514657, + "tokens_seen": 1336078336 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030061183550651955, + "loss": 2.6395, + "theoretical_loss": 3.5525291018427803, + "tokens_seen": 1336143872 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003006018054162488, + "loss": 2.7103, + "theoretical_loss": 3.5525133961569715, + "tokens_seen": 1336209408 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003005917753259779, + "loss": 2.5746, + "theoretical_loss": 3.5524976914571216, + "tokens_seen": 1336274944 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030058174523570714, + "loss": 2.5816, + "theoretical_loss": 3.5524819877431204, + "tokens_seen": 1336340480 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030057171514543627, + "loss": 2.7479, + "theoretical_loss": 3.552466285014857, + "tokens_seen": 1336406016 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003005616850551655, + "loss": 2.7497, + "theoretical_loss": 3.5524505832722215, + "tokens_seen": 1336471552 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003005516549648947, + "loss": 2.7455, + "theoretical_loss": 3.5524348825151035, + "tokens_seen": 1336537088 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030054162487462387, + "loss": 2.5357, + "theoretical_loss": 3.5524191827433933, + "tokens_seen": 1336602624 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030053159478435305, + "loss": 2.6763, + "theoretical_loss": 3.5524034839569802, + "tokens_seen": 1336668160 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003005215646940823, + "loss": 2.6913, + "theoretical_loss": 3.5523877861557542, + "tokens_seen": 1336733696 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003005115346038114, + "loss": 2.693, + "theoretical_loss": 3.552372089339605, + "tokens_seen": 1336799232 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030050150451354065, + "loss": 2.6484, + "theoretical_loss": 3.5523563935084232, + "tokens_seen": 1336864768 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1506128, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.197422742843628, + "objective/train/theoretical_loss": 3.552340698662098, + "objective/train/tokens_used": 1357390304, + "theoretical_loss": 3.552340698662098, + "tokens_seen": 1336930304 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003004914744232698, + "loss": 2.6837, + "theoretical_loss": 3.552340698662098, + "tokens_seen": 1336930304 + }, + { + "epoch": 4.05, + "learning_rate": 0.000300481444332999, + "loss": 2.4243, + "theoretical_loss": 3.5523250048005197, + "tokens_seen": 1336995840 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003004714142427282, + "loss": 2.6001, + "theoretical_loss": 3.5523093119235782, + "tokens_seen": 1337061376 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030046138415245737, + "loss": 2.7931, + "theoretical_loss": 3.5522936200311634, + "tokens_seen": 1337126912 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030045135406218655, + "loss": 2.5818, + "theoretical_loss": 3.552277929123165, + "tokens_seen": 1337192448 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030044132397191573, + "loss": 2.7045, + "theoretical_loss": 3.5522622391994734, + "tokens_seen": 1337257984 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003004312938816449, + "loss": 2.6828, + "theoretical_loss": 3.552246550259979, + "tokens_seen": 1337323520 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030042126379137415, + "loss": 2.4758, + "theoretical_loss": 3.552230862304571, + "tokens_seen": 1337389056 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003004112337011033, + "loss": 2.7366, + "theoretical_loss": 3.55221517533314, + "tokens_seen": 1337454592 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003004012036108325, + "loss": 2.8237, + "theoretical_loss": 3.5521994893455755, + "tokens_seen": 1337520128 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003003911735205617, + "loss": 2.7176, + "theoretical_loss": 3.5521838043417686, + "tokens_seen": 1337585664 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003003811434302909, + "loss": 2.6029, + "theoretical_loss": 3.552168120321608, + "tokens_seen": 1337651200 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030037111334002006, + "loss": 2.7121, + "theoretical_loss": 3.5521524372849846, + "tokens_seen": 1337716736 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030036108324974924, + "loss": 2.8973, + "theoretical_loss": 3.552136755231789, + "tokens_seen": 1337782272 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003003510531594784, + "loss": 2.8893, + "theoretical_loss": 3.552121074161911, + "tokens_seen": 1337847808 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030034102306920765, + "loss": 2.7755, + "theoretical_loss": 3.5521053940752405, + "tokens_seen": 1337913344 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003003309929789368, + "loss": 2.8843, + "theoretical_loss": 3.552089714971668, + "tokens_seen": 1337978880 + }, + { + "epoch": 4.05, + "learning_rate": 0.000300320962888666, + "loss": 2.6604, + "theoretical_loss": 3.5520740368510832, + "tokens_seen": 1338044416 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030031093279839514, + "loss": 2.7421, + "theoretical_loss": 3.552058359713377, + "tokens_seen": 1338109952 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003003009027081244, + "loss": 2.6681, + "theoretical_loss": 3.5520426835584393, + "tokens_seen": 1338175488 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003002908726178536, + "loss": 2.7552, + "theoretical_loss": 3.552027008386161, + "tokens_seen": 1338241024 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030028084252758274, + "loss": 2.633, + "theoretical_loss": 3.552011334196431, + "tokens_seen": 1338306560 + }, + { + "epoch": 4.05, + "learning_rate": 0.000300270812437312, + "loss": 2.6558, + "theoretical_loss": 3.5519956609891405, + "tokens_seen": 1338372096 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003002607823470411, + "loss": 2.635, + "theoretical_loss": 3.55197998876418, + "tokens_seen": 1338437632 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030025075225677034, + "loss": 2.8198, + "theoretical_loss": 3.5519643175214397, + "tokens_seen": 1338503168 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1506680, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8979873657226562, + "objective/train/theoretical_loss": 3.5519486472608097, + "objective/train/tokens_used": 1359028704, + "theoretical_loss": 3.5519486472608097, + "tokens_seen": 1338568704 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003002407221664995, + "loss": 3.014, + "theoretical_loss": 3.5519486472608097, + "tokens_seen": 1338568704 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003002306920762287, + "loss": 2.7011, + "theoretical_loss": 3.5519329779821804, + "tokens_seen": 1338634240 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003002206619859579, + "loss": 2.7054, + "theoretical_loss": 3.5519173096854426, + "tokens_seen": 1338699776 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003002106318956871, + "loss": 2.6003, + "theoretical_loss": 3.551901642370486, + "tokens_seen": 1338765312 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030020060180541624, + "loss": 2.5723, + "theoretical_loss": 3.5518859760372017, + "tokens_seen": 1338830848 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003001905717151455, + "loss": 2.5742, + "theoretical_loss": 3.55187031068548, + "tokens_seen": 1338896384 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003001805416248746, + "loss": 2.903, + "theoretical_loss": 3.5518546463152108, + "tokens_seen": 1338961920 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030017051153460384, + "loss": 2.753, + "theoretical_loss": 3.5518389829262853, + "tokens_seen": 1339027456 + }, + { + "epoch": 4.05, + "learning_rate": 0.000300160481444333, + "loss": 2.5019, + "theoretical_loss": 3.5518233205185936, + "tokens_seen": 1339092992 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003001504513540622, + "loss": 2.7562, + "theoretical_loss": 3.5518076590920264, + "tokens_seen": 1339158528 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003001404212637914, + "loss": 2.602, + "theoretical_loss": 3.5517919986464745, + "tokens_seen": 1339224064 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030013039117352056, + "loss": 2.6566, + "theoretical_loss": 3.551776339181828, + "tokens_seen": 1339289600 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030012036108324975, + "loss": 2.5779, + "theoretical_loss": 3.551760680697977, + "tokens_seen": 1339355136 + }, + { + "epoch": 4.05, + "learning_rate": 0.000300110330992979, + "loss": 2.7822, + "theoretical_loss": 3.5517450231948127, + "tokens_seen": 1339420672 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003001003009027081, + "loss": 2.9438, + "theoretical_loss": 3.551729366672226, + "tokens_seen": 1339486208 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030009027081243734, + "loss": 2.7497, + "theoretical_loss": 3.551713711130107, + "tokens_seen": 1339551744 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030008024072216647, + "loss": 2.8555, + "theoretical_loss": 3.5516980565683465, + "tokens_seen": 1339617280 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003000702106318957, + "loss": 2.4172, + "theoretical_loss": 3.5516824029868355, + "tokens_seen": 1339682816 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003000601805416249, + "loss": 2.5359, + "theoretical_loss": 3.551666750385464, + "tokens_seen": 1339748352 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030005015045135407, + "loss": 2.7497, + "theoretical_loss": 3.5516510987641228, + "tokens_seen": 1339813888 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030004012036108325, + "loss": 2.9051, + "theoretical_loss": 3.5516354481227035, + "tokens_seen": 1339879424 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003000300902708125, + "loss": 2.5491, + "theoretical_loss": 3.551619798461095, + "tokens_seen": 1339944960 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003000200601805416, + "loss": 2.9152, + "theoretical_loss": 3.55160414977919, + "tokens_seen": 1340010496 + }, + { + "epoch": 4.05, + "learning_rate": 0.00030001003009027085, + "loss": 2.7579, + "theoretical_loss": 3.5515885020768785, + "tokens_seen": 1340076032 + }, + { + "epoch": 4.05, + "learning_rate": 0.0003, + "loss": 2.7868, + "theoretical_loss": 3.551572855354051, + "tokens_seen": 1340141568 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1506682, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5435173511505127, + "objective/train/theoretical_loss": 3.551557209610599, + "objective/train/tokens_used": 1360667104, + "theoretical_loss": 3.551557209610599, + "tokens_seen": 1340207104 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002999899699097292, + "loss": 2.5745, + "theoretical_loss": 3.551557209610599, + "tokens_seen": 1340207104 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002999799398194584, + "loss": 2.315, + "theoretical_loss": 3.551541564846412, + "tokens_seen": 1340272640 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029996990972918757, + "loss": 2.8512, + "theoretical_loss": 3.551525921061382, + "tokens_seen": 1340338176 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029995987963891675, + "loss": 2.7299, + "theoretical_loss": 3.5515102782553996, + "tokens_seen": 1340403712 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029994984954864593, + "loss": 2.7094, + "theoretical_loss": 3.5514946364283557, + "tokens_seen": 1340469248 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002999398194583751, + "loss": 2.6636, + "theoretical_loss": 3.5514789955801405, + "tokens_seen": 1340534784 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029992978936810435, + "loss": 2.7232, + "theoretical_loss": 3.551463355710646, + "tokens_seen": 1340600320 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002999197592778335, + "loss": 2.6671, + "theoretical_loss": 3.5514477168197622, + "tokens_seen": 1340665856 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002999097291875627, + "loss": 2.8143, + "theoretical_loss": 3.5514320789073808, + "tokens_seen": 1340731392 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002998996990972919, + "loss": 2.4735, + "theoretical_loss": 3.551416441973392, + "tokens_seen": 1340796928 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002998896690070211, + "loss": 2.7152, + "theoretical_loss": 3.5514008060176874, + "tokens_seen": 1340862464 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029987963891675026, + "loss": 2.6733, + "theoretical_loss": 3.5513851710401574, + "tokens_seen": 1340928000 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029986960882647944, + "loss": 2.8356, + "theoretical_loss": 3.551369537040693, + "tokens_seen": 1340993536 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002998595787362086, + "loss": 2.7239, + "theoretical_loss": 3.5513539040191864, + "tokens_seen": 1341059072 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029984954864593785, + "loss": 2.6926, + "theoretical_loss": 3.5513382719755273, + "tokens_seen": 1341124608 + }, + { + "epoch": 4.05, + "learning_rate": 0.000299839518555667, + "loss": 2.537, + "theoretical_loss": 3.551322640909607, + "tokens_seen": 1341190144 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002998294884653962, + "loss": 2.826, + "theoretical_loss": 3.551307010821317, + "tokens_seen": 1341255680 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029981945837512534, + "loss": 2.705, + "theoretical_loss": 3.5512913817105485, + "tokens_seen": 1341321216 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002998094282848546, + "loss": 2.6646, + "theoretical_loss": 3.5512757535771917, + "tokens_seen": 1341386752 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029979939819458376, + "loss": 2.6163, + "theoretical_loss": 3.5512601264211385, + "tokens_seen": 1341452288 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029978936810431294, + "loss": 2.6182, + "theoretical_loss": 3.55124450024228, + "tokens_seen": 1341517824 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002997793380140421, + "loss": 2.6714, + "theoretical_loss": 3.551228875040507, + "tokens_seen": 1341583360 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002997693079237713, + "loss": 2.7248, + "theoretical_loss": 3.551213250815711, + "tokens_seen": 1341648896 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002997592778335005, + "loss": 2.832, + "theoretical_loss": 3.5511976275677837, + "tokens_seen": 1341714432 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002997492477432297, + "loss": 2.7048, + "theoretical_loss": 3.551182005296615, + "tokens_seen": 1341779968 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1507446, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8023407459259033, + "objective/train/theoretical_loss": 3.551166384002097, + "objective/train/tokens_used": 1362305504, + "theoretical_loss": 3.551166384002097, + "tokens_seen": 1341845504 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029973921765295885, + "loss": 2.6677, + "theoretical_loss": 3.551166384002097, + "tokens_seen": 1341845504 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002997291875626881, + "loss": 2.635, + "theoretical_loss": 3.5511507636841206, + "tokens_seen": 1341911040 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029971915747241726, + "loss": 2.8049, + "theoretical_loss": 3.5511351443425774, + "tokens_seen": 1341976576 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029970912738214644, + "loss": 2.6777, + "theoretical_loss": 3.5511195259773585, + "tokens_seen": 1342042112 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002996990972918756, + "loss": 2.8039, + "theoretical_loss": 3.551103908588355, + "tokens_seen": 1342107648 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002996890672016048, + "loss": 2.7576, + "theoretical_loss": 3.551088292175459, + "tokens_seen": 1342173184 + }, + { + "epoch": 4.05, + "learning_rate": 0.000299679037111334, + "loss": 2.5265, + "theoretical_loss": 3.551072676738561, + "tokens_seen": 1342238720 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002996690070210632, + "loss": 2.6238, + "theoretical_loss": 3.551057062277553, + "tokens_seen": 1342304256 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029965897693079235, + "loss": 2.8529, + "theoretical_loss": 3.5510414487923256, + "tokens_seen": 1342369792 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002996489468405216, + "loss": 2.7188, + "theoretical_loss": 3.5510258362827707, + "tokens_seen": 1342435328 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002996389167502507, + "loss": 2.469, + "theoretical_loss": 3.5510102247487794, + "tokens_seen": 1342500864 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029962888665997995, + "loss": 2.737, + "theoretical_loss": 3.5509946141902433, + "tokens_seen": 1342566400 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029961885656970913, + "loss": 2.7072, + "theoretical_loss": 3.550979004607054, + "tokens_seen": 1342631936 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002996088264794383, + "loss": 2.8882, + "theoretical_loss": 3.550963395999103, + "tokens_seen": 1342697472 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002995987963891675, + "loss": 2.6852, + "theoretical_loss": 3.5509477883662814, + "tokens_seen": 1342763008 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029958876629889667, + "loss": 2.8004, + "theoretical_loss": 3.550932181708481, + "tokens_seen": 1342828544 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029957873620862585, + "loss": 2.566, + "theoretical_loss": 3.5509165760255934, + "tokens_seen": 1342894080 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002995687061183551, + "loss": 2.4521, + "theoretical_loss": 3.5509009713175095, + "tokens_seen": 1342959616 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002995586760280842, + "loss": 2.9367, + "theoretical_loss": 3.5508853675841214, + "tokens_seen": 1343025152 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029954864593781345, + "loss": 2.634, + "theoretical_loss": 3.5508697648253205, + "tokens_seen": 1343090688 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002995386158475427, + "loss": 2.6086, + "theoretical_loss": 3.5508541630409987, + "tokens_seen": 1343156224 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002995285857572718, + "loss": 2.542, + "theoretical_loss": 3.5508385622310468, + "tokens_seen": 1343221760 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029951855566700105, + "loss": 2.6771, + "theoretical_loss": 3.550822962395357, + "tokens_seen": 1343287296 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002995085255767302, + "loss": 2.6823, + "theoretical_loss": 3.5508073635338206, + "tokens_seen": 1343352832 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002994984954864594, + "loss": 2.4627, + "theoretical_loss": 3.55079176564633, + "tokens_seen": 1343418368 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1508187, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9692583084106445, + "objective/train/theoretical_loss": 3.550776168732776, + "objective/train/tokens_used": 1363943904, + "theoretical_loss": 3.550776168732776, + "tokens_seen": 1343483904 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002994884653961886, + "loss": 2.5772, + "theoretical_loss": 3.550776168732776, + "tokens_seen": 1343483904 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029947843530591777, + "loss": 2.6239, + "theoretical_loss": 3.550760572793051, + "tokens_seen": 1343549440 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029946840521564695, + "loss": 2.4927, + "theoretical_loss": 3.5507449778270463, + "tokens_seen": 1343614976 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029945837512537613, + "loss": 2.5679, + "theoretical_loss": 3.5507293838346534, + "tokens_seen": 1343680512 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002994483450351053, + "loss": 2.9101, + "theoretical_loss": 3.550713790815765, + "tokens_seen": 1343746048 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029943831494483455, + "loss": 2.6945, + "theoretical_loss": 3.550698198770271, + "tokens_seen": 1343811584 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002994282848545637, + "loss": 2.7543, + "theoretical_loss": 3.550682607698065, + "tokens_seen": 1343877120 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002994182547642929, + "loss": 2.6513, + "theoretical_loss": 3.5506670175990376, + "tokens_seen": 1343942656 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002994082246740221, + "loss": 2.7973, + "theoretical_loss": 3.5506514284730812, + "tokens_seen": 1344008192 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002993981945837513, + "loss": 2.803, + "theoretical_loss": 3.550635840320088, + "tokens_seen": 1344073728 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029938816449348046, + "loss": 2.5209, + "theoretical_loss": 3.5506202531399493, + "tokens_seen": 1344139264 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029937813440320964, + "loss": 2.601, + "theoretical_loss": 3.5506046669325566, + "tokens_seen": 1344204800 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002993681043129388, + "loss": 2.5979, + "theoretical_loss": 3.5505890816978027, + "tokens_seen": 1344270336 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029935807422266805, + "loss": 2.7865, + "theoretical_loss": 3.550573497435578, + "tokens_seen": 1344335872 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002993480441323972, + "loss": 2.6167, + "theoretical_loss": 3.5505579141457764, + "tokens_seen": 1344401408 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002993380140421264, + "loss": 2.5938, + "theoretical_loss": 3.5505423318282885, + "tokens_seen": 1344466944 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029932798395185554, + "loss": 2.6116, + "theoretical_loss": 3.5505267504830065, + "tokens_seen": 1344532480 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002993179538615848, + "loss": 2.6737, + "theoretical_loss": 3.5505111701098224, + "tokens_seen": 1344598016 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029930792377131396, + "loss": 2.6748, + "theoretical_loss": 3.550495590708628, + "tokens_seen": 1344663552 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029929789368104314, + "loss": 2.6336, + "theoretical_loss": 3.5504800122793156, + "tokens_seen": 1344729088 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002992878635907723, + "loss": 2.6307, + "theoretical_loss": 3.550464434821777, + "tokens_seen": 1344794624 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002992778335005015, + "loss": 2.6766, + "theoretical_loss": 3.5504488583359044, + "tokens_seen": 1344860160 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002992678034102307, + "loss": 2.5858, + "theoretical_loss": 3.55043328282159, + "tokens_seen": 1344925696 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002992577733199599, + "loss": 2.8844, + "theoretical_loss": 3.5504177082787254, + "tokens_seen": 1344991232 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029924774322968905, + "loss": 2.7651, + "theoretical_loss": 3.550402134707203, + "tokens_seen": 1345056768 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1509426, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.554690361022949, + "objective/train/theoretical_loss": 3.5503865621069144, + "objective/train/tokens_used": 1365582304, + "theoretical_loss": 3.5503865621069144, + "tokens_seen": 1345122304 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002992377131394183, + "loss": 2.6236, + "theoretical_loss": 3.5503865621069144, + "tokens_seen": 1345122304 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029922768304914746, + "loss": 2.7252, + "theoretical_loss": 3.5503709904777523, + "tokens_seen": 1345187840 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029921765295887664, + "loss": 2.8637, + "theoretical_loss": 3.550355419819609, + "tokens_seen": 1345253376 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002992076228686058, + "loss": 2.5164, + "theoretical_loss": 3.550339850132376, + "tokens_seen": 1345318912 + }, + { + "epoch": 4.05, + "learning_rate": 0.000299197592778335, + "loss": 2.7611, + "theoretical_loss": 3.550324281415946, + "tokens_seen": 1345384448 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002991875626880642, + "loss": 2.8538, + "theoretical_loss": 3.5503087136702103, + "tokens_seen": 1345449984 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002991775325977934, + "loss": 2.5655, + "theoretical_loss": 3.5502931468950623, + "tokens_seen": 1345515520 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029916750250752255, + "loss": 2.8136, + "theoretical_loss": 3.550277581090394, + "tokens_seen": 1345581056 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002991574724172518, + "loss": 2.8745, + "theoretical_loss": 3.5502620162560965, + "tokens_seen": 1345646592 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002991474423269809, + "loss": 2.728, + "theoretical_loss": 3.550246452392063, + "tokens_seen": 1345712128 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029913741223671015, + "loss": 2.765, + "theoretical_loss": 3.550230889498186, + "tokens_seen": 1345777664 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029912738214643933, + "loss": 2.8291, + "theoretical_loss": 3.550215327574357, + "tokens_seen": 1345843200 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002991173520561685, + "loss": 2.4863, + "theoretical_loss": 3.550199766620469, + "tokens_seen": 1345908736 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002991073219658977, + "loss": 2.7966, + "theoretical_loss": 3.550184206636414, + "tokens_seen": 1345974272 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029909729187562687, + "loss": 2.6954, + "theoretical_loss": 3.550168647622084, + "tokens_seen": 1346039808 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029908726178535605, + "loss": 2.8217, + "theoretical_loss": 3.550153089577372, + "tokens_seen": 1346105344 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002990772316950853, + "loss": 2.6436, + "theoretical_loss": 3.5501375325021707, + "tokens_seen": 1346170880 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002990672016048144, + "loss": 2.8473, + "theoretical_loss": 3.550121976396371, + "tokens_seen": 1346236416 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029905717151454365, + "loss": 2.7741, + "theoretical_loss": 3.5501064212598665, + "tokens_seen": 1346301952 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029904714142427283, + "loss": 2.477, + "theoretical_loss": 3.5500908670925493, + "tokens_seen": 1346367488 + }, + { + "epoch": 4.05, + "learning_rate": 0.000299037111334002, + "loss": 2.5492, + "theoretical_loss": 3.550075313894312, + "tokens_seen": 1346433024 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002990270812437312, + "loss": 2.7839, + "theoretical_loss": 3.5500597616650467, + "tokens_seen": 1346498560 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002990170511534604, + "loss": 2.6982, + "theoretical_loss": 3.5500442104046463, + "tokens_seen": 1346564096 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029900702106318956, + "loss": 2.6283, + "theoretical_loss": 3.5500286601130027, + "tokens_seen": 1346629632 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002989969909729188, + "loss": 2.6279, + "theoretical_loss": 3.550013110790009, + "tokens_seen": 1346695168 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1510013, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7701847553253174, + "objective/train/theoretical_loss": 3.549997562435558, + "objective/train/tokens_used": 1367220704, + "theoretical_loss": 3.549997562435558, + "tokens_seen": 1346760704 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002989869608826479, + "loss": 2.6528, + "theoretical_loss": 3.549997562435558, + "tokens_seen": 1346760704 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029897693079237715, + "loss": 2.8473, + "theoretical_loss": 3.549982015049541, + "tokens_seen": 1346826240 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002989669007021063, + "loss": 2.8033, + "theoretical_loss": 3.549966468631852, + "tokens_seen": 1346891776 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002989568706118355, + "loss": 2.8413, + "theoretical_loss": 3.5499509231823825, + "tokens_seen": 1346957312 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002989468405215647, + "loss": 2.7241, + "theoretical_loss": 3.5499353787010257, + "tokens_seen": 1347022848 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002989368104312939, + "loss": 2.5255, + "theoretical_loss": 3.5499198351876737, + "tokens_seen": 1347088384 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029892678034102306, + "loss": 2.782, + "theoretical_loss": 3.5499042926422195, + "tokens_seen": 1347153920 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002989167502507523, + "loss": 2.6674, + "theoretical_loss": 3.549888751064556, + "tokens_seen": 1347219456 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002989067201604814, + "loss": 2.7379, + "theoretical_loss": 3.5498732104545754, + "tokens_seen": 1347284992 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029889669007021066, + "loss": 2.7572, + "theoretical_loss": 3.5498576708121705, + "tokens_seen": 1347350528 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002988866599799398, + "loss": 2.8585, + "theoretical_loss": 3.549842132137234, + "tokens_seen": 1347416064 + }, + { + "epoch": 4.05, + "learning_rate": 0.000298876629889669, + "loss": 2.6141, + "theoretical_loss": 3.5498265944296588, + "tokens_seen": 1347481600 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002988665997993982, + "loss": 2.6897, + "theoretical_loss": 3.549811057689337, + "tokens_seen": 1347547136 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002988565697091274, + "loss": 2.9036, + "theoretical_loss": 3.5497955219161623, + "tokens_seen": 1347612672 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029884653961885656, + "loss": 2.8636, + "theoretical_loss": 3.549779987110027, + "tokens_seen": 1347678208 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029883650952858574, + "loss": 2.7638, + "theoretical_loss": 3.549764453270824, + "tokens_seen": 1347743744 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002988264794383149, + "loss": 2.8871, + "theoretical_loss": 3.5497489203984456, + "tokens_seen": 1347809280 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029881644934804416, + "loss": 2.513, + "theoretical_loss": 3.5497333884927853, + "tokens_seen": 1347874816 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002988064192577733, + "loss": 2.6083, + "theoretical_loss": 3.549717857553736, + "tokens_seen": 1347940352 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002987963891675025, + "loss": 2.7726, + "theoretical_loss": 3.549702327581189, + "tokens_seen": 1348005888 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002987863590772317, + "loss": 2.8613, + "theoretical_loss": 3.5496867985750393, + "tokens_seen": 1348071424 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002987763289869609, + "loss": 2.7175, + "theoretical_loss": 3.5496712705351787, + "tokens_seen": 1348136960 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002987662988966901, + "loss": 2.8827, + "theoretical_loss": 3.5496557434615, + "tokens_seen": 1348202496 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029875626880641925, + "loss": 2.7725, + "theoretical_loss": 3.549640217353897, + "tokens_seen": 1348268032 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002987462387161485, + "loss": 2.8003, + "theoretical_loss": 3.5496246922122614, + "tokens_seen": 1348333568 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1511229, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.716949939727783, + "objective/train/theoretical_loss": 3.549609168036487, + "objective/train/tokens_used": 1368859104, + "theoretical_loss": 3.549609168036487, + "tokens_seen": 1348399104 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029873620862587766, + "loss": 2.5916, + "theoretical_loss": 3.549609168036487, + "tokens_seen": 1348399104 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029872617853560684, + "loss": 2.5237, + "theoretical_loss": 3.5495936448264667, + "tokens_seen": 1348464640 + }, + { + "epoch": 4.05, + "learning_rate": 0.000298716148445336, + "loss": 2.7218, + "theoretical_loss": 3.549578122582093, + "tokens_seen": 1348530176 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002987061183550652, + "loss": 2.7952, + "theoretical_loss": 3.549562601303259, + "tokens_seen": 1348595712 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002986960882647944, + "loss": 2.7038, + "theoretical_loss": 3.5495470809898584, + "tokens_seen": 1348661248 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002986860581745236, + "loss": 2.7883, + "theoretical_loss": 3.5495315616417837, + "tokens_seen": 1348726784 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029867602808425275, + "loss": 2.5982, + "theoretical_loss": 3.549516043258928, + "tokens_seen": 1348792320 + }, + { + "epoch": 4.05, + "learning_rate": 0.000298665997993982, + "loss": 2.3621, + "theoretical_loss": 3.549500525841185, + "tokens_seen": 1348857856 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002986559679037111, + "loss": 2.8927, + "theoretical_loss": 3.5494850093884462, + "tokens_seen": 1348923392 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029864593781344035, + "loss": 2.95, + "theoretical_loss": 3.5494694939006064, + "tokens_seen": 1348988928 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029863590772316953, + "loss": 2.5967, + "theoretical_loss": 3.549453979377558, + "tokens_seen": 1349054464 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002986258776328987, + "loss": 2.8127, + "theoretical_loss": 3.549438465819194, + "tokens_seen": 1349120000 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002986158475426279, + "loss": 2.6664, + "theoretical_loss": 3.549422953225408, + "tokens_seen": 1349185536 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029860581745235707, + "loss": 2.7774, + "theoretical_loss": 3.5494074415960926, + "tokens_seen": 1349251072 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029859578736208625, + "loss": 2.5631, + "theoretical_loss": 3.5493919309311415, + "tokens_seen": 1349316608 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002985857572718155, + "loss": 2.6669, + "theoretical_loss": 3.5493764212304475, + "tokens_seen": 1349382144 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002985757271815446, + "loss": 2.6485, + "theoretical_loss": 3.5493609124939045, + "tokens_seen": 1349447680 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029856569709127385, + "loss": 2.6763, + "theoretical_loss": 3.549345404721405, + "tokens_seen": 1349513216 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029855566700100303, + "loss": 2.6191, + "theoretical_loss": 3.549329897912843, + "tokens_seen": 1349578752 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002985456369107322, + "loss": 2.8153, + "theoretical_loss": 3.5493143920681107, + "tokens_seen": 1349644288 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002985356068204614, + "loss": 2.9487, + "theoretical_loss": 3.5492988871871023, + "tokens_seen": 1349709824 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002985255767301906, + "loss": 2.7807, + "theoretical_loss": 3.549283383269711, + "tokens_seen": 1349775360 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029851554663991976, + "loss": 2.4485, + "theoretical_loss": 3.5492678803158295, + "tokens_seen": 1349840896 + }, + { + "epoch": 4.05, + "learning_rate": 0.000298505516549649, + "loss": 2.6957, + "theoretical_loss": 3.5492523783253525, + "tokens_seen": 1349906432 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002984954864593781, + "loss": 2.695, + "theoretical_loss": 3.549236877298172, + "tokens_seen": 1349971968 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1512083, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1851322650909424, + "objective/train/theoretical_loss": 3.549221377234182, + "objective/train/tokens_used": 1370497504, + "theoretical_loss": 3.549221377234182, + "tokens_seen": 1350037504 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029848545636910735, + "loss": 2.5258, + "theoretical_loss": 3.549221377234182, + "tokens_seen": 1350037504 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002984754262788365, + "loss": 2.8254, + "theoretical_loss": 3.5492058781332756, + "tokens_seen": 1350103040 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002984653961885657, + "loss": 2.7243, + "theoretical_loss": 3.5491903799953466, + "tokens_seen": 1350168576 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002984553660982949, + "loss": 2.6613, + "theoretical_loss": 3.549174882820288, + "tokens_seen": 1350234112 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002984453360080241, + "loss": 2.676, + "theoretical_loss": 3.549159386607993, + "tokens_seen": 1350299648 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029843530591775326, + "loss": 2.7808, + "theoretical_loss": 3.549143891358357, + "tokens_seen": 1350365184 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002984252758274825, + "loss": 2.6886, + "theoretical_loss": 3.5491283970712706, + "tokens_seen": 1350430720 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002984152457372116, + "loss": 2.4495, + "theoretical_loss": 3.549112903746629, + "tokens_seen": 1350496256 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029840521564694086, + "loss": 2.6895, + "theoretical_loss": 3.549097411384326, + "tokens_seen": 1350561792 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029839518555667, + "loss": 2.8342, + "theoretical_loss": 3.5490819199842543, + "tokens_seen": 1350627328 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002983851554663992, + "loss": 2.4988, + "theoretical_loss": 3.5490664295463077, + "tokens_seen": 1350692864 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002983751253761284, + "loss": 2.7996, + "theoretical_loss": 3.549050940070379, + "tokens_seen": 1350758400 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002983650952858576, + "loss": 2.7887, + "theoretical_loss": 3.549035451556364, + "tokens_seen": 1350823936 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029835506519558676, + "loss": 2.7817, + "theoretical_loss": 3.549019964004154, + "tokens_seen": 1350889472 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029834503510531594, + "loss": 2.747, + "theoretical_loss": 3.5490044774136433, + "tokens_seen": 1350955008 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002983350050150451, + "loss": 2.7483, + "theoretical_loss": 3.548988991784726, + "tokens_seen": 1351020544 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029832497492477436, + "loss": 2.6019, + "theoretical_loss": 3.548973507117296, + "tokens_seen": 1351086080 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002983149448345035, + "loss": 2.6762, + "theoretical_loss": 3.5489580234112457, + "tokens_seen": 1351151616 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002983049147442327, + "loss": 2.8147, + "theoretical_loss": 3.54894254066647, + "tokens_seen": 1351217152 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029829488465396185, + "loss": 2.6882, + "theoretical_loss": 3.5489270588828616, + "tokens_seen": 1351282688 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002982848545636911, + "loss": 2.5956, + "theoretical_loss": 3.548911578060315, + "tokens_seen": 1351348224 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029827482447342026, + "loss": 2.7254, + "theoretical_loss": 3.548896098198724, + "tokens_seen": 1351413760 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029826479438314945, + "loss": 2.5003, + "theoretical_loss": 3.5488806192979814, + "tokens_seen": 1351479296 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002982547642928786, + "loss": 2.9524, + "theoretical_loss": 3.5488651413579824, + "tokens_seen": 1351544832 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029824473420260786, + "loss": 2.7396, + "theoretical_loss": 3.5488496643786194, + "tokens_seen": 1351610368 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1513423, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7058820724487305, + "objective/train/theoretical_loss": 3.5488341883597867, + "objective/train/tokens_used": 1372135904, + "theoretical_loss": 3.5488341883597867, + "tokens_seen": 1351675904 + }, + { + "epoch": 4.05, + "learning_rate": 0.000298234704112337, + "loss": 2.7002, + "theoretical_loss": 3.5488341883597867, + "tokens_seen": 1351675904 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002982246740220662, + "loss": 2.4233, + "theoretical_loss": 3.5488187133013787, + "tokens_seen": 1351741440 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029821464393179535, + "loss": 2.6596, + "theoretical_loss": 3.5488032392032887, + "tokens_seen": 1351806976 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002982046138415246, + "loss": 2.768, + "theoretical_loss": 3.54878776606541, + "tokens_seen": 1351872512 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029819458375125377, + "loss": 2.6433, + "theoretical_loss": 3.5487722938876374, + "tokens_seen": 1351938048 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029818455366098295, + "loss": 2.6283, + "theoretical_loss": 3.5487568226698647, + "tokens_seen": 1352003584 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029817452357071213, + "loss": 2.8337, + "theoretical_loss": 3.5487413524119855, + "tokens_seen": 1352069120 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002981644934804413, + "loss": 2.7834, + "theoretical_loss": 3.5487258831138937, + "tokens_seen": 1352134656 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002981544633901705, + "loss": 2.5967, + "theoretical_loss": 3.548710414775483, + "tokens_seen": 1352200192 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029814443329989973, + "loss": 2.69, + "theoretical_loss": 3.5486949473966485, + "tokens_seen": 1352265728 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029813440320962885, + "loss": 2.7136, + "theoretical_loss": 3.548679480977283, + "tokens_seen": 1352331264 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002981243731193581, + "loss": 2.4809, + "theoretical_loss": 3.5486640155172804, + "tokens_seen": 1352396800 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002981143430290872, + "loss": 2.668, + "theoretical_loss": 3.5486485510165355, + "tokens_seen": 1352462336 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029810431293881645, + "loss": 2.6052, + "theoretical_loss": 3.5486330874749425, + "tokens_seen": 1352527872 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029809428284854563, + "loss": 2.697, + "theoretical_loss": 3.5486176248923944, + "tokens_seen": 1352593408 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002980842527582748, + "loss": 2.6051, + "theoretical_loss": 3.548602163268786, + "tokens_seen": 1352658944 + }, + { + "epoch": 4.05, + "learning_rate": 0.000298074222668004, + "loss": 2.6487, + "theoretical_loss": 3.5485867026040108, + "tokens_seen": 1352724480 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029806419257773323, + "loss": 2.7828, + "theoretical_loss": 3.5485712428979634, + "tokens_seen": 1352790016 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029805416248746236, + "loss": 2.5762, + "theoretical_loss": 3.5485557841505377, + "tokens_seen": 1352855552 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002980441323971916, + "loss": 2.6515, + "theoretical_loss": 3.548540326361628, + "tokens_seen": 1352921088 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002980341023069208, + "loss": 2.8252, + "theoretical_loss": 3.5485248695311284, + "tokens_seen": 1352986624 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029802407221664996, + "loss": 2.6945, + "theoretical_loss": 3.5485094136589326, + "tokens_seen": 1353052160 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002980140421263792, + "loss": 2.8939, + "theoretical_loss": 3.5484939587449356, + "tokens_seen": 1353117696 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002980040120361083, + "loss": 2.7708, + "theoretical_loss": 3.5484785047890304, + "tokens_seen": 1353183232 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029799398194583755, + "loss": 2.4756, + "theoretical_loss": 3.5484630517911127, + "tokens_seen": 1353248768 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1514073, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4900929927825928, + "objective/train/theoretical_loss": 3.548447599751076, + "objective/train/tokens_used": 1373774304, + "theoretical_loss": 3.548447599751076, + "tokens_seen": 1353314304 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002979839518555667, + "loss": 2.7386, + "theoretical_loss": 3.548447599751076, + "tokens_seen": 1353314304 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002979739217652959, + "loss": 2.6159, + "theoretical_loss": 3.548432148668814, + "tokens_seen": 1353379840 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002979638916750251, + "loss": 2.7703, + "theoretical_loss": 3.548416698544222, + "tokens_seen": 1353445376 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002979538615847543, + "loss": 2.6948, + "theoretical_loss": 3.5484012493771933, + "tokens_seen": 1353510912 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029794383149448346, + "loss": 2.7864, + "theoretical_loss": 3.5483858011676226, + "tokens_seen": 1353576448 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002979338014042127, + "loss": 2.5321, + "theoretical_loss": 3.548370353915405, + "tokens_seen": 1353641984 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002979237713139418, + "loss": 2.614, + "theoretical_loss": 3.548354907620433, + "tokens_seen": 1353707520 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029791374122367106, + "loss": 2.539, + "theoretical_loss": 3.5483394622826028, + "tokens_seen": 1353773056 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002979037111334002, + "loss": 2.6795, + "theoretical_loss": 3.548324017901807, + "tokens_seen": 1353838592 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002978936810431294, + "loss": 2.4953, + "theoretical_loss": 3.548308574477942, + "tokens_seen": 1353904128 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002978836509528586, + "loss": 2.658, + "theoretical_loss": 3.548293132010901, + "tokens_seen": 1353969664 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002978736208625878, + "loss": 2.5627, + "theoretical_loss": 3.548277690500578, + "tokens_seen": 1354035200 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029786359077231696, + "loss": 2.6826, + "theoretical_loss": 3.548262249946869, + "tokens_seen": 1354100736 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029785356068204614, + "loss": 2.5905, + "theoretical_loss": 3.5482468103496663, + "tokens_seen": 1354166272 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002978435305917753, + "loss": 2.7626, + "theoretical_loss": 3.548231371708866, + "tokens_seen": 1354231808 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029783350050150456, + "loss": 2.8008, + "theoretical_loss": 3.5482159340243618, + "tokens_seen": 1354297344 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002978234704112337, + "loss": 2.6798, + "theoretical_loss": 3.548200497296049, + "tokens_seen": 1354362880 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002978134403209629, + "loss": 2.7777, + "theoretical_loss": 3.548185061523821, + "tokens_seen": 1354428416 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029780341023069205, + "loss": 2.4268, + "theoretical_loss": 3.548169626707573, + "tokens_seen": 1354493952 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002977933801404213, + "loss": 2.4828, + "theoretical_loss": 3.5481541928472, + "tokens_seen": 1354559488 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029778335005015046, + "loss": 2.6172, + "theoretical_loss": 3.5481387599425953, + "tokens_seen": 1354625024 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029777331995987965, + "loss": 2.6898, + "theoretical_loss": 3.5481233279936544, + "tokens_seen": 1354690560 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029776328986960883, + "loss": 2.8771, + "theoretical_loss": 3.5481078970002713, + "tokens_seen": 1354756096 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029775325977933806, + "loss": 2.8269, + "theoretical_loss": 3.5480924669623413, + "tokens_seen": 1354821632 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002977432296890672, + "loss": 2.6392, + "theoretical_loss": 3.548077037879759, + "tokens_seen": 1354887168 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1515331, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.898995876312256, + "objective/train/theoretical_loss": 3.548061609752419, + "objective/train/tokens_used": 1375412704, + "theoretical_loss": 3.548061609752419, + "tokens_seen": 1354952704 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002977331995987964, + "loss": 2.8321, + "theoretical_loss": 3.548061609752419, + "tokens_seen": 1354952704 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029772316950852555, + "loss": 2.7478, + "theoretical_loss": 3.5480461825802148, + "tokens_seen": 1355018240 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002977131394182548, + "loss": 2.7196, + "theoretical_loss": 3.548030756363042, + "tokens_seen": 1355083776 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029770310932798397, + "loss": 2.8133, + "theoretical_loss": 3.548015331100796, + "tokens_seen": 1355149312 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029769307923771315, + "loss": 2.7607, + "theoretical_loss": 3.5479999067933705, + "tokens_seen": 1355214848 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029768304914744233, + "loss": 2.879, + "theoretical_loss": 3.5479844834406604, + "tokens_seen": 1355280384 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002976730190571715, + "loss": 2.8117, + "theoretical_loss": 3.5479690610425605, + "tokens_seen": 1355345920 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002976629889669007, + "loss": 2.6956, + "theoretical_loss": 3.547953639598966, + "tokens_seen": 1355411456 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029765295887662993, + "loss": 2.6193, + "theoretical_loss": 3.5479382191097706, + "tokens_seen": 1355476992 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029764292878635905, + "loss": 2.5348, + "theoretical_loss": 3.5479227995748706, + "tokens_seen": 1355542528 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002976328986960883, + "loss": 2.741, + "theoretical_loss": 3.5479073809941593, + "tokens_seen": 1355608064 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002976228686058174, + "loss": 2.6227, + "theoretical_loss": 3.5478919633675328, + "tokens_seen": 1355673600 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029761283851554665, + "loss": 2.7048, + "theoretical_loss": 3.5478765466948854, + "tokens_seen": 1355739136 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029760280842527583, + "loss": 2.6778, + "theoretical_loss": 3.5478611309761114, + "tokens_seen": 1355804672 + }, + { + "epoch": 4.05, + "learning_rate": 0.000297592778335005, + "loss": 2.7497, + "theoretical_loss": 3.5478457162111066, + "tokens_seen": 1355870208 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002975827482447342, + "loss": 2.9066, + "theoretical_loss": 3.547830302399765, + "tokens_seen": 1355935744 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029757271815446343, + "loss": 2.7068, + "theoretical_loss": 3.547814889541983, + "tokens_seen": 1356001280 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029756268806419256, + "loss": 2.4788, + "theoretical_loss": 3.547799477637654, + "tokens_seen": 1356066816 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002975526579739218, + "loss": 2.5697, + "theoretical_loss": 3.547784066686673, + "tokens_seen": 1356132352 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002975426278836509, + "loss": 2.4808, + "theoretical_loss": 3.547768656688936, + "tokens_seen": 1356197888 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029753259779338016, + "loss": 2.8969, + "theoretical_loss": 3.5477532476443376, + "tokens_seen": 1356263424 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029752256770310934, + "loss": 2.5684, + "theoretical_loss": 3.5477378395527723, + "tokens_seen": 1356328960 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002975125376128385, + "loss": 2.6905, + "theoretical_loss": 3.5477224324141354, + "tokens_seen": 1356394496 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002975025075225677, + "loss": 2.6665, + "theoretical_loss": 3.547707026228322, + "tokens_seen": 1356460032 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002974924774322969, + "loss": 2.6812, + "theoretical_loss": 3.547691620995227, + "tokens_seen": 1356525568 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1516059, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.887085437774658, + "objective/train/theoretical_loss": 3.5476762167147458, + "objective/train/tokens_used": 1377051104, + "theoretical_loss": 3.5476762167147458, + "tokens_seen": 1356591104 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029748244734202606, + "loss": 2.7841, + "theoretical_loss": 3.5476762167147458, + "tokens_seen": 1356591104 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002974724172517553, + "loss": 2.8691, + "theoretical_loss": 3.5476608133867735, + "tokens_seen": 1356656640 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002974623871614844, + "loss": 2.8379, + "theoretical_loss": 3.547645411011205, + "tokens_seen": 1356722176 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029745235707121366, + "loss": 2.6562, + "theoretical_loss": 3.5476300095879347, + "tokens_seen": 1356787712 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002974423269809428, + "loss": 2.8677, + "theoretical_loss": 3.5476146091168586, + "tokens_seen": 1356853248 + }, + { + "epoch": 4.05, + "learning_rate": 0.000297432296890672, + "loss": 2.5165, + "theoretical_loss": 3.5475992095978715, + "tokens_seen": 1356918784 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002974222668004012, + "loss": 2.6703, + "theoretical_loss": 3.547583811030869, + "tokens_seen": 1356984320 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002974122367101304, + "loss": 2.6626, + "theoretical_loss": 3.5475684134157457, + "tokens_seen": 1357049856 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029740220661985956, + "loss": 2.6634, + "theoretical_loss": 3.5475530167523974, + "tokens_seen": 1357115392 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002973921765295888, + "loss": 2.6589, + "theoretical_loss": 3.5475376210407186, + "tokens_seen": 1357180928 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002973821464393179, + "loss": 2.7632, + "theoretical_loss": 3.5475222262806048, + "tokens_seen": 1357246464 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029737211634904716, + "loss": 2.8114, + "theoretical_loss": 3.5475068324719516, + "tokens_seen": 1357312000 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002973620862587763, + "loss": 2.6815, + "theoretical_loss": 3.547491439614654, + "tokens_seen": 1357377536 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002973520561685055, + "loss": 2.6894, + "theoretical_loss": 3.5474760477086074, + "tokens_seen": 1357443072 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002973420260782347, + "loss": 2.8562, + "theoretical_loss": 3.5474606567537066, + "tokens_seen": 1357508608 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002973319959879639, + "loss": 2.5874, + "theoretical_loss": 3.5474452667498477, + "tokens_seen": 1357574144 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029732196589769307, + "loss": 2.5583, + "theoretical_loss": 3.547429877696925, + "tokens_seen": 1357639680 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029731193580742225, + "loss": 2.5643, + "theoretical_loss": 3.5474144895948347, + "tokens_seen": 1357705216 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029730190571715143, + "loss": 2.6758, + "theoretical_loss": 3.547399102443472, + "tokens_seen": 1357770752 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029729187562688067, + "loss": 2.7373, + "theoretical_loss": 3.547383716242732, + "tokens_seen": 1357836288 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029728184553660985, + "loss": 2.7633, + "theoretical_loss": 3.5473683309925104, + "tokens_seen": 1357901824 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029727181544633903, + "loss": 2.643, + "theoretical_loss": 3.547352946692703, + "tokens_seen": 1357967360 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029726178535606826, + "loss": 2.4956, + "theoretical_loss": 3.5473375633432043, + "tokens_seen": 1358032896 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002972517552657974, + "loss": 2.7176, + "theoretical_loss": 3.54732218094391, + "tokens_seen": 1358098432 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002972417251755266, + "loss": 2.9503, + "theoretical_loss": 3.5473067994947156, + "tokens_seen": 1358163968 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1517513, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5391719341278076, + "objective/train/theoretical_loss": 3.547291418995517, + "objective/train/tokens_used": 1378689504, + "theoretical_loss": 3.547291418995517, + "tokens_seen": 1358229504 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029723169508525575, + "loss": 2.6743, + "theoretical_loss": 3.547291418995517, + "tokens_seen": 1358229504 + }, + { + "epoch": 4.05, + "learning_rate": 0.000297221664994985, + "loss": 2.8176, + "theoretical_loss": 3.547276039446209, + "tokens_seen": 1358295040 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029721163490471417, + "loss": 2.7804, + "theoretical_loss": 3.547260660846688, + "tokens_seen": 1358360576 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029720160481444335, + "loss": 2.6297, + "theoretical_loss": 3.5472452831968484, + "tokens_seen": 1358426112 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029719157472417253, + "loss": 2.7697, + "theoretical_loss": 3.547229906496587, + "tokens_seen": 1358491648 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002971815446339017, + "loss": 2.6438, + "theoretical_loss": 3.547214530745798, + "tokens_seen": 1358557184 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002971715145436309, + "loss": 2.8875, + "theoretical_loss": 3.5471991559443783, + "tokens_seen": 1358622720 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029716148445336013, + "loss": 2.5986, + "theoretical_loss": 3.5471837820922225, + "tokens_seen": 1358688256 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029715145436308926, + "loss": 2.7038, + "theoretical_loss": 3.5471684091892266, + "tokens_seen": 1358753792 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002971414242728185, + "loss": 2.7325, + "theoretical_loss": 3.5471530372352866, + "tokens_seen": 1358819328 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002971313941825476, + "loss": 2.5486, + "theoretical_loss": 3.5471376662302974, + "tokens_seen": 1358884864 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029712136409227685, + "loss": 2.9332, + "theoretical_loss": 3.547122296174155, + "tokens_seen": 1358950400 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029711133400200603, + "loss": 2.789, + "theoretical_loss": 3.5471069270667552, + "tokens_seen": 1359015936 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002971013039117352, + "loss": 2.6935, + "theoretical_loss": 3.5470915589079937, + "tokens_seen": 1359081472 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002970912738214644, + "loss": 2.6713, + "theoretical_loss": 3.547076191697766, + "tokens_seen": 1359147008 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029708124373119363, + "loss": 2.8167, + "theoretical_loss": 3.5470608254359672, + "tokens_seen": 1359212544 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029707121364092276, + "loss": 2.4853, + "theoretical_loss": 3.5470454601224946, + "tokens_seen": 1359278080 + }, + { + "epoch": 4.05, + "learning_rate": 0.000297061183550652, + "loss": 2.7376, + "theoretical_loss": 3.5470300957572425, + "tokens_seen": 1359343616 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002970511534603811, + "loss": 2.7228, + "theoretical_loss": 3.5470147323401076, + "tokens_seen": 1359409152 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029704112337011036, + "loss": 2.958, + "theoretical_loss": 3.546999369870985, + "tokens_seen": 1359474688 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029703109327983954, + "loss": 2.7322, + "theoretical_loss": 3.5469840083497717, + "tokens_seen": 1359540224 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002970210631895687, + "loss": 2.7878, + "theoretical_loss": 3.546968647776362, + "tokens_seen": 1359605760 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002970110330992979, + "loss": 2.3531, + "theoretical_loss": 3.5469532881506525, + "tokens_seen": 1359671296 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002970010030090271, + "loss": 2.8062, + "theoretical_loss": 3.5469379294725387, + "tokens_seen": 1359736832 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029699097291875626, + "loss": 2.582, + "theoretical_loss": 3.5469225717419173, + "tokens_seen": 1359802368 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1518071, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3069539070129395, + "objective/train/theoretical_loss": 3.5469072149586833, + "objective/train/tokens_used": 1380327904, + "theoretical_loss": 3.5469072149586833, + "tokens_seen": 1359867904 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002969809428284855, + "loss": 2.5244, + "theoretical_loss": 3.5469072149586833, + "tokens_seen": 1359867904 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002969709127382146, + "loss": 2.8248, + "theoretical_loss": 3.5468918591227334, + "tokens_seen": 1359933440 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029696088264794386, + "loss": 2.7735, + "theoretical_loss": 3.5468765042339623, + "tokens_seen": 1359998976 + }, + { + "epoch": 4.05, + "learning_rate": 0.000296950852557673, + "loss": 2.522, + "theoretical_loss": 3.5468611502922673, + "tokens_seen": 1360064512 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002969408224674022, + "loss": 2.7694, + "theoretical_loss": 3.546845797297544, + "tokens_seen": 1360130048 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002969408224674022, + "loss": 2.6656, + "theoretical_loss": 3.5468304452496873, + "tokens_seen": 1360195584 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002969307923771314, + "loss": 2.591, + "theoretical_loss": 3.5468150941485943, + "tokens_seen": 1360261120 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002969207622868606, + "loss": 2.7995, + "theoretical_loss": 3.5467997439941605, + "tokens_seen": 1360326656 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029691073219658976, + "loss": 2.8339, + "theoretical_loss": 3.5467843947862825, + "tokens_seen": 1360392192 + }, + { + "epoch": 4.05, + "learning_rate": 0.000296900702106319, + "loss": 2.5979, + "theoretical_loss": 3.5467690465248563, + "tokens_seen": 1360457728 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002968906720160481, + "loss": 2.6479, + "theoretical_loss": 3.546753699209777, + "tokens_seen": 1360523264 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029688064192577736, + "loss": 2.5995, + "theoretical_loss": 3.5467383528409417, + "tokens_seen": 1360588800 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002968706118355065, + "loss": 2.6558, + "theoretical_loss": 3.5467230074182456, + "tokens_seen": 1360654336 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002968605817452357, + "loss": 2.7648, + "theoretical_loss": 3.5467076629415857, + "tokens_seen": 1360719872 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002968505516549649, + "loss": 2.6445, + "theoretical_loss": 3.5466923194108575, + "tokens_seen": 1360785408 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002968405215646941, + "loss": 2.3767, + "theoretical_loss": 3.546676976825957, + "tokens_seen": 1360850944 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029683049147442327, + "loss": 2.5866, + "theoretical_loss": 3.5466616351867812, + "tokens_seen": 1360916480 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029682046138415245, + "loss": 2.742, + "theoretical_loss": 3.5466462944932253, + "tokens_seen": 1360982016 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029681043129388163, + "loss": 2.731, + "theoretical_loss": 3.5466309547451864, + "tokens_seen": 1361047552 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029680040120361087, + "loss": 2.913, + "theoretical_loss": 3.54661561594256, + "tokens_seen": 1361113088 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029679037111334, + "loss": 2.7811, + "theoretical_loss": 3.5466002780852426, + "tokens_seen": 1361178624 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029678034102306923, + "loss": 2.7598, + "theoretical_loss": 3.5465849411731307, + "tokens_seen": 1361244160 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002967703109327984, + "loss": 2.6426, + "theoretical_loss": 3.5465696052061197, + "tokens_seen": 1361309696 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002967602808425276, + "loss": 2.5915, + "theoretical_loss": 3.5465542701841066, + "tokens_seen": 1361375232 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029675025075225677, + "loss": 2.855, + "theoretical_loss": 3.5465389361069874, + "tokens_seen": 1361440768 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1519309, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.060181140899658, + "objective/train/theoretical_loss": 3.5465236029746583, + "objective/train/tokens_used": 1381966304, + "theoretical_loss": 3.5465236029746583, + "tokens_seen": 1361506304 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029674022066198595, + "loss": 2.7874, + "theoretical_loss": 3.5465236029746583, + "tokens_seen": 1361506304 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029673019057171513, + "loss": 2.7999, + "theoretical_loss": 3.546508270787016, + "tokens_seen": 1361571840 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029672016048144437, + "loss": 2.5814, + "theoretical_loss": 3.546492939543957, + "tokens_seen": 1361637376 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002967101303911735, + "loss": 2.6692, + "theoretical_loss": 3.5464776092453767, + "tokens_seen": 1361702912 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029670010030090273, + "loss": 2.9344, + "theoretical_loss": 3.5464622798911725, + "tokens_seen": 1361768448 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029669007021063186, + "loss": 2.7264, + "theoretical_loss": 3.5464469514812396, + "tokens_seen": 1361833984 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002966800401203611, + "loss": 2.8269, + "theoretical_loss": 3.546431624015476, + "tokens_seen": 1361899520 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002966700100300903, + "loss": 2.7462, + "theoretical_loss": 3.5464162974937765, + "tokens_seen": 1361965056 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029665997993981946, + "loss": 2.7305, + "theoretical_loss": 3.5464009719160385, + "tokens_seen": 1362030592 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029664994984954864, + "loss": 2.5155, + "theoretical_loss": 3.546385647282158, + "tokens_seen": 1362096128 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002966399197592778, + "loss": 2.6736, + "theoretical_loss": 3.546370323592032, + "tokens_seen": 1362161664 + }, + { + "epoch": 4.05, + "learning_rate": 0.000296629889669007, + "loss": 2.7749, + "theoretical_loss": 3.5463550008455567, + "tokens_seen": 1362227200 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029661985957873623, + "loss": 2.802, + "theoretical_loss": 3.546339679042628, + "tokens_seen": 1362292736 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029660982948846536, + "loss": 2.684, + "theoretical_loss": 3.546324358183143, + "tokens_seen": 1362358272 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002965997993981946, + "loss": 2.5993, + "theoretical_loss": 3.5463090382669984, + "tokens_seen": 1362423808 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002965897693079238, + "loss": 2.6696, + "theoretical_loss": 3.5462937192940904, + "tokens_seen": 1362489344 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029657973921765296, + "loss": 2.8625, + "theoretical_loss": 3.546278401264316, + "tokens_seen": 1362554880 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029656970912738214, + "loss": 2.7273, + "theoretical_loss": 3.5462630841775713, + "tokens_seen": 1362620416 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002965596790371113, + "loss": 2.6861, + "theoretical_loss": 3.546247768033753, + "tokens_seen": 1362685952 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002965496489468405, + "loss": 2.6665, + "theoretical_loss": 3.546232452832758, + "tokens_seen": 1362751488 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029653961885656974, + "loss": 2.6598, + "theoretical_loss": 3.546217138574482, + "tokens_seen": 1362817024 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002965295887662989, + "loss": 2.8421, + "theoretical_loss": 3.546201825258823, + "tokens_seen": 1362882560 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002965195586760281, + "loss": 2.7694, + "theoretical_loss": 3.546186512885676, + "tokens_seen": 1362948096 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002965095285857573, + "loss": 2.7711, + "theoretical_loss": 3.5461712014549396, + "tokens_seen": 1363013632 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029649949849548646, + "loss": 2.6412, + "theoretical_loss": 3.546155890966509, + "tokens_seen": 1363079168 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1519912, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8462095260620117, + "objective/train/theoretical_loss": 3.546140581420282, + "objective/train/tokens_used": 1383604704, + "theoretical_loss": 3.546140581420282, + "tokens_seen": 1363144704 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002964894684052157, + "loss": 2.7918, + "theoretical_loss": 3.546140581420282, + "tokens_seen": 1363144704 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002964794383149448, + "loss": 2.6971, + "theoretical_loss": 3.5461252728161545, + "tokens_seen": 1363210240 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029646940822467406, + "loss": 2.6667, + "theoretical_loss": 3.5461099651540233, + "tokens_seen": 1363275776 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002964593781344032, + "loss": 2.8574, + "theoretical_loss": 3.546094658433786, + "tokens_seen": 1363341312 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002964493480441324, + "loss": 2.8651, + "theoretical_loss": 3.5460793526553376, + "tokens_seen": 1363406848 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002964393179538616, + "loss": 2.6643, + "theoretical_loss": 3.546064047818577, + "tokens_seen": 1363472384 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002964292878635908, + "loss": 2.831, + "theoretical_loss": 3.5460487439233996, + "tokens_seen": 1363537920 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029641925777331996, + "loss": 2.587, + "theoretical_loss": 3.546033440969703, + "tokens_seen": 1363603456 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002964092276830492, + "loss": 2.6461, + "theoretical_loss": 3.546018138957383, + "tokens_seen": 1363668992 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002963991975927783, + "loss": 2.7813, + "theoretical_loss": 3.5460028378863377, + "tokens_seen": 1363734528 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029638916750250756, + "loss": 2.617, + "theoretical_loss": 3.5459875377564636, + "tokens_seen": 1363800064 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002963791374122367, + "loss": 2.8451, + "theoretical_loss": 3.5459722385676566, + "tokens_seen": 1363865600 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002963691073219659, + "loss": 2.7216, + "theoretical_loss": 3.5459569403198152, + "tokens_seen": 1363931136 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002963590772316951, + "loss": 2.6646, + "theoretical_loss": 3.545941643012835, + "tokens_seen": 1363996672 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002963490471414243, + "loss": 2.5081, + "theoretical_loss": 3.5459263466466133, + "tokens_seen": 1364062208 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029633901705115347, + "loss": 2.8276, + "theoretical_loss": 3.5459110512210477, + "tokens_seen": 1364127744 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029632898696088265, + "loss": 2.5364, + "theoretical_loss": 3.5458957567360345, + "tokens_seen": 1364193280 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029631895687061183, + "loss": 2.8552, + "theoretical_loss": 3.5458804631914704, + "tokens_seen": 1364258816 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029630892678034107, + "loss": 2.7965, + "theoretical_loss": 3.5458651705872537, + "tokens_seen": 1364324352 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002962988966900702, + "loss": 2.8659, + "theoretical_loss": 3.54584987892328, + "tokens_seen": 1364389888 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029628886659979943, + "loss": 2.7637, + "theoretical_loss": 3.5458345881994466, + "tokens_seen": 1364455424 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002962788365095286, + "loss": 2.7808, + "theoretical_loss": 3.5458192984156516, + "tokens_seen": 1364520960 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002962688064192578, + "loss": 2.6831, + "theoretical_loss": 3.545804009571791, + "tokens_seen": 1364586496 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029625877632898697, + "loss": 2.7481, + "theoretical_loss": 3.545788721667762, + "tokens_seen": 1364652032 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029624874623871615, + "loss": 2.6898, + "theoretical_loss": 3.545773434703462, + "tokens_seen": 1364717568 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1521305, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.574596881866455, + "objective/train/theoretical_loss": 3.5457581486787877, + "objective/train/tokens_used": 1385243104, + "theoretical_loss": 3.5457581486787877, + "tokens_seen": 1364783104 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029623871614844533, + "loss": 2.6539, + "theoretical_loss": 3.5457581486787877, + "tokens_seen": 1364783104 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029622868605817457, + "loss": 2.8171, + "theoretical_loss": 3.545742863593637, + "tokens_seen": 1364848640 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002962186559679037, + "loss": 2.6755, + "theoretical_loss": 3.5457275794479064, + "tokens_seen": 1364914176 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029620862587763293, + "loss": 2.5866, + "theoretical_loss": 3.5457122962414935, + "tokens_seen": 1364979712 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029619859578736206, + "loss": 2.6154, + "theoretical_loss": 3.5456970139742947, + "tokens_seen": 1365045248 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002961885656970913, + "loss": 2.6771, + "theoretical_loss": 3.545681732646208, + "tokens_seen": 1365110784 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002961785356068205, + "loss": 2.8346, + "theoretical_loss": 3.54566645225713, + "tokens_seen": 1365176320 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029616850551654966, + "loss": 2.6756, + "theoretical_loss": 3.5456511728069584, + "tokens_seen": 1365241856 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029615847542627884, + "loss": 2.7741, + "theoretical_loss": 3.5456358942955903, + "tokens_seen": 1365307392 + }, + { + "epoch": 4.05, + "learning_rate": 0.000296148445336008, + "loss": 2.8655, + "theoretical_loss": 3.545620616722923, + "tokens_seen": 1365372928 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002961384152457372, + "loss": 2.7293, + "theoretical_loss": 3.5456053400888536, + "tokens_seen": 1365438464 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029612838515546643, + "loss": 2.7781, + "theoretical_loss": 3.54559006439328, + "tokens_seen": 1365504000 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029611835506519556, + "loss": 2.8338, + "theoretical_loss": 3.545574789636098, + "tokens_seen": 1365569536 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002961083249749248, + "loss": 2.719, + "theoretical_loss": 3.545559515817207, + "tokens_seen": 1365635072 + }, + { + "epoch": 4.05, + "learning_rate": 0.000296098294884654, + "loss": 2.5393, + "theoretical_loss": 3.5455442429365025, + "tokens_seen": 1365700608 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029608826479438316, + "loss": 2.82, + "theoretical_loss": 3.545528970993883, + "tokens_seen": 1365766144 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029607823470411234, + "loss": 2.5854, + "theoretical_loss": 3.545513699989246, + "tokens_seen": 1365831680 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002960682046138415, + "loss": 2.6055, + "theoretical_loss": 3.545498429922487, + "tokens_seen": 1365897216 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002960581745235707, + "loss": 2.7797, + "theoretical_loss": 3.545483160793506, + "tokens_seen": 1365962752 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029604814443329994, + "loss": 2.7801, + "theoretical_loss": 3.5454678926021987, + "tokens_seen": 1366028288 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029603811434302906, + "loss": 2.6195, + "theoretical_loss": 3.5454526253484633, + "tokens_seen": 1366093824 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002960280842527583, + "loss": 2.8992, + "theoretical_loss": 3.5454373590321966, + "tokens_seen": 1366159360 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002960180541624874, + "loss": 2.5448, + "theoretical_loss": 3.545422093653297, + "tokens_seen": 1366224896 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029600802407221666, + "loss": 3.0915, + "theoretical_loss": 3.545406829211661, + "tokens_seen": 1366290432 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029599799398194584, + "loss": 2.6512, + "theoretical_loss": 3.545391565707187, + "tokens_seen": 1366355968 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1521877, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7869224548339844, + "objective/train/theoretical_loss": 3.5453763031397716, + "objective/train/tokens_used": 1386881504, + "theoretical_loss": 3.5453763031397716, + "tokens_seen": 1366421504 + }, + { + "epoch": 4.05, + "learning_rate": 0.000295987963891675, + "loss": 2.62, + "theoretical_loss": 3.5453763031397716, + "tokens_seen": 1366421504 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002959779338014042, + "loss": 2.8634, + "theoretical_loss": 3.545361041509313, + "tokens_seen": 1366487040 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002959679037111334, + "loss": 2.8177, + "theoretical_loss": 3.5453457808157087, + "tokens_seen": 1366552576 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029595787362086257, + "loss": 2.8319, + "theoretical_loss": 3.545330521058856, + "tokens_seen": 1366618112 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002959478435305918, + "loss": 2.8518, + "theoretical_loss": 3.5453152622386526, + "tokens_seen": 1366683648 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029593781344032093, + "loss": 2.6757, + "theoretical_loss": 3.545300004354996, + "tokens_seen": 1366749184 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029592778335005016, + "loss": 2.5993, + "theoretical_loss": 3.545284747407784, + "tokens_seen": 1366814720 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029591775325977935, + "loss": 2.7115, + "theoretical_loss": 3.5452694913969145, + "tokens_seen": 1366880256 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029590772316950853, + "loss": 2.9373, + "theoretical_loss": 3.5452542363222843, + "tokens_seen": 1366945792 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002958976930792377, + "loss": 2.7642, + "theoretical_loss": 3.5452389821837915, + "tokens_seen": 1367011328 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002958876629889669, + "loss": 2.6147, + "theoretical_loss": 3.545223728981334, + "tokens_seen": 1367076864 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029587763289869607, + "loss": 2.5695, + "theoretical_loss": 3.5452084767148095, + "tokens_seen": 1367142400 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002958676028084253, + "loss": 2.5337, + "theoretical_loss": 3.545193225384115, + "tokens_seen": 1367207936 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029585757271815443, + "loss": 2.5786, + "theoretical_loss": 3.545177974989149, + "tokens_seen": 1367273472 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029584754262788367, + "loss": 2.6016, + "theoretical_loss": 3.545162725529809, + "tokens_seen": 1367339008 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002958375125376128, + "loss": 2.6825, + "theoretical_loss": 3.545147477005993, + "tokens_seen": 1367404544 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029582748244734203, + "loss": 2.7878, + "theoretical_loss": 3.545132229417599, + "tokens_seen": 1367470080 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002958174523570712, + "loss": 2.369, + "theoretical_loss": 3.5451169827645233, + "tokens_seen": 1367535616 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002958074222668004, + "loss": 2.4724, + "theoretical_loss": 3.5451017370466653, + "tokens_seen": 1367601152 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002957973921765296, + "loss": 2.6906, + "theoretical_loss": 3.5450864922639225, + "tokens_seen": 1367666688 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002957873620862588, + "loss": 2.7822, + "theoretical_loss": 3.5450712484161917, + "tokens_seen": 1367732224 + }, + { + "epoch": 4.05, + "learning_rate": 0.000295777331995988, + "loss": 2.6861, + "theoretical_loss": 3.5450560055033717, + "tokens_seen": 1367797760 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029576730190571717, + "loss": 2.868, + "theoretical_loss": 3.5450407635253605, + "tokens_seen": 1367863296 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029575727181544635, + "loss": 2.5099, + "theoretical_loss": 3.5450255224820557, + "tokens_seen": 1367928832 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029574724172517553, + "loss": 2.6857, + "theoretical_loss": 3.5450102823733554, + "tokens_seen": 1367994368 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 1523240, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.509470224380493, + "objective/train/theoretical_loss": 3.5449950431991573, + "objective/train/tokens_used": 1388519904, + "theoretical_loss": 3.5449950431991573, + "tokens_seen": 1368059904 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029573721163490477, + "loss": 2.6579, + "theoretical_loss": 3.5449950431991573, + "tokens_seen": 1368059904 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002957271815446339, + "loss": 2.673, + "theoretical_loss": 3.544979804959359, + "tokens_seen": 1368125440 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029571715145436313, + "loss": 2.5067, + "theoretical_loss": 3.544964567653859, + "tokens_seen": 1368190976 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029570712136409226, + "loss": 2.6669, + "theoretical_loss": 3.544949331282555, + "tokens_seen": 1368256512 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002956970912738215, + "loss": 2.496, + "theoretical_loss": 3.544934095845345, + "tokens_seen": 1368322048 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002956870611835507, + "loss": 2.662, + "theoretical_loss": 3.5449188613421274, + "tokens_seen": 1368387584 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029567703109327986, + "loss": 2.8064, + "theoretical_loss": 3.5449036277727997, + "tokens_seen": 1368453120 + }, + { + "epoch": 4.05, + "learning_rate": 0.00029566700100300904, + "loss": 2.6266, + "theoretical_loss": 3.54488839513726, + "tokens_seen": 1368518656 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002956569709127382, + "loss": 2.9046, + "theoretical_loss": 3.5448731634354065, + "tokens_seen": 1368584192 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002956469408224674, + "loss": 2.6641, + "theoretical_loss": 3.5448579326671372, + "tokens_seen": 1368649728 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029563691073219663, + "loss": 2.8689, + "theoretical_loss": 3.5448427028323506, + "tokens_seen": 1368715264 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029562688064192576, + "loss": 2.6824, + "theoretical_loss": 3.544827473930944, + "tokens_seen": 1368780800 + }, + { + "epoch": 4.06, + "learning_rate": 0.000295616850551655, + "loss": 2.6393, + "theoretical_loss": 3.5448122459628157, + "tokens_seen": 1368846336 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002956068204613842, + "loss": 2.5224, + "theoretical_loss": 3.544797018927864, + "tokens_seen": 1368911872 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029559679037111336, + "loss": 2.7649, + "theoretical_loss": 3.5447817928259875, + "tokens_seen": 1368977408 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029558676028084254, + "loss": 2.516, + "theoretical_loss": 3.544766567657084, + "tokens_seen": 1369042944 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002955767301905717, + "loss": 2.4305, + "theoretical_loss": 3.5447513434210514, + "tokens_seen": 1369108480 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002955667001003009, + "loss": 2.9265, + "theoretical_loss": 3.544736120117788, + "tokens_seen": 1369174016 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029555667001003014, + "loss": 2.8476, + "theoretical_loss": 3.544720897747192, + "tokens_seen": 1369239552 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029554663991975926, + "loss": 2.7277, + "theoretical_loss": 3.544705676309162, + "tokens_seen": 1369305088 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002955366098294885, + "loss": 2.6965, + "theoretical_loss": 3.544690455803596, + "tokens_seen": 1369370624 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002955265797392176, + "loss": 2.6146, + "theoretical_loss": 3.5446752362303924, + "tokens_seen": 1369436160 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029551654964894686, + "loss": 2.7972, + "theoretical_loss": 3.5446600175894485, + "tokens_seen": 1369501696 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029550651955867604, + "loss": 2.6935, + "theoretical_loss": 3.544644799880664, + "tokens_seen": 1369567232 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002954964894684052, + "loss": 2.7009, + "theoretical_loss": 3.5446295831039363, + "tokens_seen": 1369632768 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1523836, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.621335744857788, + "objective/train/theoretical_loss": 3.5446143672591646, + "objective/train/tokens_used": 1390158304, + "theoretical_loss": 3.5446143672591646, + "tokens_seen": 1369698304 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002954864593781344, + "loss": 2.811, + "theoretical_loss": 3.5446143672591646, + "tokens_seen": 1369698304 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002954764292878636, + "loss": 2.7063, + "theoretical_loss": 3.5445991523462457, + "tokens_seen": 1369763840 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029546639919759277, + "loss": 2.5469, + "theoretical_loss": 3.5445839383650797, + "tokens_seen": 1369829376 + }, + { + "epoch": 4.06, + "learning_rate": 0.000295456369107322, + "loss": 2.6845, + "theoretical_loss": 3.544568725315564, + "tokens_seen": 1369894912 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029544633901705113, + "loss": 2.4527, + "theoretical_loss": 3.544553513197597, + "tokens_seen": 1369960448 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029543630892678037, + "loss": 2.7605, + "theoretical_loss": 3.5445383020110772, + "tokens_seen": 1370025984 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029542627883650955, + "loss": 2.7853, + "theoretical_loss": 3.544523091755903, + "tokens_seen": 1370091520 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029541624874623873, + "loss": 2.4475, + "theoretical_loss": 3.544507882431973, + "tokens_seen": 1370157056 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002954062186559679, + "loss": 2.8883, + "theoretical_loss": 3.5444926740391853, + "tokens_seen": 1370222592 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002953961885656971, + "loss": 2.7648, + "theoretical_loss": 3.5444774665774395, + "tokens_seen": 1370288128 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029538615847542627, + "loss": 2.8952, + "theoretical_loss": 3.5444622600466325, + "tokens_seen": 1370353664 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002953761283851555, + "loss": 2.6118, + "theoretical_loss": 3.544447054446663, + "tokens_seen": 1370419200 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029536609829488463, + "loss": 2.7484, + "theoretical_loss": 3.5444318497774305, + "tokens_seen": 1370484736 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029535606820461387, + "loss": 2.6515, + "theoretical_loss": 3.544416646038833, + "tokens_seen": 1370550272 + }, + { + "epoch": 4.06, + "learning_rate": 0.000295346038114343, + "loss": 2.5422, + "theoretical_loss": 3.544401443230769, + "tokens_seen": 1370615808 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029533600802407223, + "loss": 2.587, + "theoretical_loss": 3.544386241353137, + "tokens_seen": 1370681344 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002953259779338014, + "loss": 2.7654, + "theoretical_loss": 3.544371040405836, + "tokens_seen": 1370746880 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002953159478435306, + "loss": 2.4942, + "theoretical_loss": 3.5443558403887643, + "tokens_seen": 1370812416 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002953059177532598, + "loss": 2.6671, + "theoretical_loss": 3.5443406413018197, + "tokens_seen": 1370877952 + }, + { + "epoch": 4.06, + "learning_rate": 0.000295295887662989, + "loss": 2.5489, + "theoretical_loss": 3.544325443144902, + "tokens_seen": 1370943488 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029528585757271814, + "loss": 2.5203, + "theoretical_loss": 3.5443102459179094, + "tokens_seen": 1371009024 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029527582748244737, + "loss": 2.9006, + "theoretical_loss": 3.5442950496207413, + "tokens_seen": 1371074560 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002952657973921765, + "loss": 2.9251, + "theoretical_loss": 3.5442798542532947, + "tokens_seen": 1371140096 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029525576730190573, + "loss": 2.6135, + "theoretical_loss": 3.54426465981547, + "tokens_seen": 1371205632 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002952457372116349, + "loss": 2.7752, + "theoretical_loss": 3.544249466307164, + "tokens_seen": 1371271168 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1525089, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.015775203704834, + "objective/train/theoretical_loss": 3.5442342737282777, + "objective/train/tokens_used": 1391796704, + "theoretical_loss": 3.5442342737282777, + "tokens_seen": 1371336704 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002952357071213641, + "loss": 2.9369, + "theoretical_loss": 3.5442342737282777, + "tokens_seen": 1371336704 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002952256770310933, + "loss": 2.5958, + "theoretical_loss": 3.5442190820787083, + "tokens_seen": 1371402240 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029521564694082246, + "loss": 2.8745, + "theoretical_loss": 3.544203891358355, + "tokens_seen": 1371467776 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029520561685055164, + "loss": 2.5521, + "theoretical_loss": 3.5441887015671165, + "tokens_seen": 1371533312 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002951955867602809, + "loss": 2.7852, + "theoretical_loss": 3.5441735127048917, + "tokens_seen": 1371598848 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029518555667001, + "loss": 2.6479, + "theoretical_loss": 3.544158324771579, + "tokens_seen": 1371664384 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029517552657973924, + "loss": 2.8104, + "theoretical_loss": 3.5441431377670773, + "tokens_seen": 1371729920 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029516549648946836, + "loss": 2.7465, + "theoretical_loss": 3.5441279516912862, + "tokens_seen": 1371795456 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002951554663991976, + "loss": 2.5285, + "theoretical_loss": 3.5441127665441035, + "tokens_seen": 1371860992 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002951454363089268, + "loss": 2.6205, + "theoretical_loss": 3.5440975823254286, + "tokens_seen": 1371926528 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029513540621865596, + "loss": 2.5794, + "theoretical_loss": 3.5440823990351604, + "tokens_seen": 1371992064 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029512537612838514, + "loss": 2.6045, + "theoretical_loss": 3.5440672166731977, + "tokens_seen": 1372057600 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002951153460381144, + "loss": 2.8003, + "theoretical_loss": 3.5440520352394396, + "tokens_seen": 1372123136 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002951053159478435, + "loss": 2.9421, + "theoretical_loss": 3.5440368547337844, + "tokens_seen": 1372188672 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029509528585757274, + "loss": 2.5751, + "theoretical_loss": 3.5440216751561318, + "tokens_seen": 1372254208 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029508525576730187, + "loss": 2.8673, + "theoretical_loss": 3.5440064965063804, + "tokens_seen": 1372319744 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002950752256770311, + "loss": 2.8663, + "theoretical_loss": 3.543991318784429, + "tokens_seen": 1372385280 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002950651955867603, + "loss": 2.6738, + "theoretical_loss": 3.5439761419901767, + "tokens_seen": 1372450816 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029505516549648946, + "loss": 2.6099, + "theoretical_loss": 3.5439609661235227, + "tokens_seen": 1372516352 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029504513540621865, + "loss": 2.6654, + "theoretical_loss": 3.543945791184366, + "tokens_seen": 1372581888 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002950351053159478, + "loss": 2.9022, + "theoretical_loss": 3.5439306171726055, + "tokens_seen": 1372647424 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029502507522567706, + "loss": 2.6579, + "theoretical_loss": 3.54391544408814, + "tokens_seen": 1372712960 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029501504513540624, + "loss": 2.8065, + "theoretical_loss": 3.5439002719308696, + "tokens_seen": 1372778496 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002950050150451354, + "loss": 2.8156, + "theoretical_loss": 3.5438851007006917, + "tokens_seen": 1372844032 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002949949849548646, + "loss": 2.6603, + "theoretical_loss": 3.543869930397507, + "tokens_seen": 1372909568 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1525961, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1202309131622314, + "objective/train/theoretical_loss": 3.5438547610212137, + "objective/train/tokens_used": 1393435104, + "theoretical_loss": 3.5438547610212137, + "tokens_seen": 1372975104 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002949849548645938, + "loss": 2.8852, + "theoretical_loss": 3.5438547610212137, + "tokens_seen": 1372975104 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029497492477432297, + "loss": 2.841, + "theoretical_loss": 3.543839592571711, + "tokens_seen": 1373040640 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002949648946840522, + "loss": 2.7847, + "theoretical_loss": 3.543824425048898, + "tokens_seen": 1373106176 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029495486459378133, + "loss": 2.6657, + "theoretical_loss": 3.543809258452675, + "tokens_seen": 1373171712 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029494483450351057, + "loss": 2.8445, + "theoretical_loss": 3.5437940927829388, + "tokens_seen": 1373237248 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029493480441323975, + "loss": 2.8022, + "theoretical_loss": 3.5437789280395915, + "tokens_seen": 1373302784 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029492477432296893, + "loss": 2.6647, + "theoretical_loss": 3.54376376422253, + "tokens_seen": 1373368320 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002949147442326981, + "loss": 2.7687, + "theoretical_loss": 3.5437486013316546, + "tokens_seen": 1373433856 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002949047141424273, + "loss": 2.8542, + "theoretical_loss": 3.5437334393668642, + "tokens_seen": 1373499392 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029489468405215647, + "loss": 2.4105, + "theoretical_loss": 3.543718278328058, + "tokens_seen": 1373564928 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002948846539618857, + "loss": 2.8407, + "theoretical_loss": 3.5437031182151357, + "tokens_seen": 1373630464 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029487462387161483, + "loss": 2.7031, + "theoretical_loss": 3.543687959027996, + "tokens_seen": 1373696000 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029486459378134407, + "loss": 2.8304, + "theoretical_loss": 3.5436728007665392, + "tokens_seen": 1373761536 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002948545636910732, + "loss": 2.7599, + "theoretical_loss": 3.543657643430663, + "tokens_seen": 1373827072 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029484453360080243, + "loss": 2.7486, + "theoretical_loss": 3.5436424870202683, + "tokens_seen": 1373892608 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002948345035105316, + "loss": 2.7511, + "theoretical_loss": 3.5436273315352538, + "tokens_seen": 1373958144 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002948244734202608, + "loss": 2.7855, + "theoretical_loss": 3.543612176975519, + "tokens_seen": 1374023680 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029481444332999, + "loss": 2.836, + "theoretical_loss": 3.543597023340963, + "tokens_seen": 1374089216 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002948044132397192, + "loss": 2.4758, + "theoretical_loss": 3.5435818706314848, + "tokens_seen": 1374154752 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029479438314944834, + "loss": 2.6613, + "theoretical_loss": 3.5435667188469853, + "tokens_seen": 1374220288 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029478435305917757, + "loss": 2.6733, + "theoretical_loss": 3.5435515679873624, + "tokens_seen": 1374285824 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002947743229689067, + "loss": 2.6872, + "theoretical_loss": 3.5435364180525166, + "tokens_seen": 1374351360 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029476429287863593, + "loss": 3.1353, + "theoretical_loss": 3.5435212690423463, + "tokens_seen": 1374416896 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002947542627883651, + "loss": 2.7879, + "theoretical_loss": 3.543506120956752, + "tokens_seen": 1374482432 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002947442326980943, + "loss": 2.5917, + "theoretical_loss": 3.543490973795633, + "tokens_seen": 1374547968 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1531216, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.648439407348633, + "objective/train/theoretical_loss": 3.543475827558888, + "objective/train/tokens_used": 1395073504, + "theoretical_loss": 3.543475827558888, + "tokens_seen": 1374613504 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002947342026078235, + "loss": 2.9129, + "theoretical_loss": 3.543475827558888, + "tokens_seen": 1374613504 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029472417251755266, + "loss": 2.5568, + "theoretical_loss": 3.543460682246417, + "tokens_seen": 1374679040 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029471414242728184, + "loss": 2.6088, + "theoretical_loss": 3.54344553785812, + "tokens_seen": 1374744576 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002947041123370111, + "loss": 2.8646, + "theoretical_loss": 3.5434303943938965, + "tokens_seen": 1374810112 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002946940822467402, + "loss": 2.8851, + "theoretical_loss": 3.543415251853645, + "tokens_seen": 1374875648 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029468405215646944, + "loss": 2.6553, + "theoretical_loss": 3.5434001102372665, + "tokens_seen": 1374941184 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029467402206619856, + "loss": 2.9477, + "theoretical_loss": 3.54338496954466, + "tokens_seen": 1375006720 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002946639919759278, + "loss": 2.605, + "theoretical_loss": 3.543369829775725, + "tokens_seen": 1375072256 + }, + { + "epoch": 4.06, + "learning_rate": 0.000294653961885657, + "loss": 2.9116, + "theoretical_loss": 3.543354690930361, + "tokens_seen": 1375137792 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029464393179538616, + "loss": 2.7982, + "theoretical_loss": 3.543339553008468, + "tokens_seen": 1375203328 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029463390170511534, + "loss": 2.6475, + "theoretical_loss": 3.543324416009945, + "tokens_seen": 1375268864 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002946238716148446, + "loss": 2.7576, + "theoretical_loss": 3.5433092799346926, + "tokens_seen": 1375334400 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002946138415245737, + "loss": 2.9175, + "theoretical_loss": 3.5432941447826103, + "tokens_seen": 1375399936 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029460381143430294, + "loss": 2.9014, + "theoretical_loss": 3.543279010553597, + "tokens_seen": 1375465472 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029459378134403207, + "loss": 2.6803, + "theoretical_loss": 3.5432638772475533, + "tokens_seen": 1375531008 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002945837512537613, + "loss": 2.816, + "theoretical_loss": 3.543248744864379, + "tokens_seen": 1375596544 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002945737211634905, + "loss": 2.6571, + "theoretical_loss": 3.5432336134039732, + "tokens_seen": 1375662080 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029456369107321966, + "loss": 2.6177, + "theoretical_loss": 3.5432184828662363, + "tokens_seen": 1375727616 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029455366098294885, + "loss": 2.7331, + "theoretical_loss": 3.543203353251067, + "tokens_seen": 1375793152 + }, + { + "epoch": 4.06, + "learning_rate": 0.000294543630892678, + "loss": 2.8117, + "theoretical_loss": 3.5431882245583664, + "tokens_seen": 1375858688 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002945336008024072, + "loss": 2.6867, + "theoretical_loss": 3.543173096788034, + "tokens_seen": 1375924224 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029452357071213644, + "loss": 2.5584, + "theoretical_loss": 3.543157969939969, + "tokens_seen": 1375989760 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029451354062186557, + "loss": 2.6233, + "theoretical_loss": 3.5431428440140724, + "tokens_seen": 1376055296 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002945035105315948, + "loss": 2.7276, + "theoretical_loss": 3.5431277190102426, + "tokens_seen": 1376120832 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029449348044132393, + "loss": 2.9551, + "theoretical_loss": 3.543112594928381, + "tokens_seen": 1376186368 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1536193, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.979797601699829, + "objective/train/theoretical_loss": 3.543097471768386, + "objective/train/tokens_used": 1396711904, + "theoretical_loss": 3.543097471768386, + "tokens_seen": 1376251904 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029448345035105317, + "loss": 2.8658, + "theoretical_loss": 3.543097471768386, + "tokens_seen": 1376251904 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029447342026078235, + "loss": 2.7535, + "theoretical_loss": 3.5430823495301587, + "tokens_seen": 1376317440 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029446339017051153, + "loss": 2.8096, + "theoretical_loss": 3.5430672282135984, + "tokens_seen": 1376382976 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002944533600802407, + "loss": 2.5962, + "theoretical_loss": 3.5430521078186055, + "tokens_seen": 1376448512 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029444332998996995, + "loss": 2.9369, + "theoretical_loss": 3.5430369883450794, + "tokens_seen": 1376514048 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002944332998996991, + "loss": 2.8077, + "theoretical_loss": 3.5430218697929208, + "tokens_seen": 1376579584 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002944232698094283, + "loss": 2.5626, + "theoretical_loss": 3.5430067521620288, + "tokens_seen": 1376645120 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029441323971915744, + "loss": 2.7621, + "theoretical_loss": 3.542991635452304, + "tokens_seen": 1376710656 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029440320962888667, + "loss": 2.5922, + "theoretical_loss": 3.5429765196636462, + "tokens_seen": 1376776192 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029439317953861585, + "loss": 2.659, + "theoretical_loss": 3.542961404795956, + "tokens_seen": 1376841728 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029438314944834503, + "loss": 2.6147, + "theoretical_loss": 3.542946290849132, + "tokens_seen": 1376907264 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002943731193580742, + "loss": 2.6262, + "theoretical_loss": 3.5429311778230765, + "tokens_seen": 1376972800 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002943630892678034, + "loss": 2.7149, + "theoretical_loss": 3.5429160657176877, + "tokens_seen": 1377038336 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002943530591775326, + "loss": 2.8174, + "theoretical_loss": 3.5429009545328665, + "tokens_seen": 1377103872 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002943430290872618, + "loss": 2.9529, + "theoretical_loss": 3.542885844268513, + "tokens_seen": 1377169408 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029433299899699094, + "loss": 2.9298, + "theoretical_loss": 3.5428707349245263, + "tokens_seen": 1377234944 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002943229689067202, + "loss": 2.846, + "theoretical_loss": 3.5428556265008084, + "tokens_seen": 1377300480 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002943129388164493, + "loss": 2.5321, + "theoretical_loss": 3.5428405189972585, + "tokens_seen": 1377366016 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029430290872617854, + "loss": 2.5836, + "theoretical_loss": 3.5428254124137766, + "tokens_seen": 1377431552 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029429287863590777, + "loss": 2.6707, + "theoretical_loss": 3.5428103067502628, + "tokens_seen": 1377497088 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002942828485456369, + "loss": 2.6525, + "theoretical_loss": 3.5427952020066176, + "tokens_seen": 1377562624 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029427281845536613, + "loss": 2.6044, + "theoretical_loss": 3.5427800981827415, + "tokens_seen": 1377628160 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002942627883650953, + "loss": 2.6824, + "theoretical_loss": 3.5427649952785343, + "tokens_seen": 1377693696 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002942527582748245, + "loss": 2.5064, + "theoretical_loss": 3.5427498932938972, + "tokens_seen": 1377759232 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002942427281845537, + "loss": 2.8199, + "theoretical_loss": 3.5427347922287287, + "tokens_seen": 1377824768 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1541285, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9072976112365723, + "objective/train/theoretical_loss": 3.5427196920829305, + "objective/train/tokens_used": 1398350304, + "theoretical_loss": 3.5427196920829305, + "tokens_seen": 1377890304 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029423269809428286, + "loss": 2.781, + "theoretical_loss": 3.5427196920829305, + "tokens_seen": 1377890304 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029422266800401204, + "loss": 2.6368, + "theoretical_loss": 3.5427045928564027, + "tokens_seen": 1377955840 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002942126379137413, + "loss": 2.9298, + "theoretical_loss": 3.5426894945490446, + "tokens_seen": 1378021376 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002942026078234704, + "loss": 2.6657, + "theoretical_loss": 3.5426743971607584, + "tokens_seen": 1378086912 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029419257773319964, + "loss": 2.7524, + "theoretical_loss": 3.542659300691443, + "tokens_seen": 1378152448 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029418254764292876, + "loss": 2.7688, + "theoretical_loss": 3.5426442051409985, + "tokens_seen": 1378217984 + }, + { + "epoch": 4.06, + "learning_rate": 0.000294172517552658, + "loss": 2.8809, + "theoretical_loss": 3.542629110509327, + "tokens_seen": 1378283520 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002941624874623872, + "loss": 2.8175, + "theoretical_loss": 3.542614016796327, + "tokens_seen": 1378349056 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029415245737211636, + "loss": 2.5861, + "theoretical_loss": 3.5425989240019, + "tokens_seen": 1378414592 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029414242728184554, + "loss": 2.547, + "theoretical_loss": 3.5425838321259464, + "tokens_seen": 1378480128 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002941323971915748, + "loss": 2.653, + "theoretical_loss": 3.542568741168366, + "tokens_seen": 1378545664 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002941223671013039, + "loss": 2.7472, + "theoretical_loss": 3.54255365112906, + "tokens_seen": 1378611200 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029411233701103314, + "loss": 2.5158, + "theoretical_loss": 3.5425385620079286, + "tokens_seen": 1378676736 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029410230692076227, + "loss": 2.7334, + "theoretical_loss": 3.5425234738048723, + "tokens_seen": 1378742272 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002940922768304915, + "loss": 2.7095, + "theoretical_loss": 3.5425083865197915, + "tokens_seen": 1378807808 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002940822467402207, + "loss": 2.6454, + "theoretical_loss": 3.5424933001525867, + "tokens_seen": 1378873344 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029407221664994986, + "loss": 2.6793, + "theoretical_loss": 3.5424782147031584, + "tokens_seen": 1378938880 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029406218655967905, + "loss": 2.7721, + "theoretical_loss": 3.5424631301714076, + "tokens_seen": 1379004416 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002940521564694082, + "loss": 2.8642, + "theoretical_loss": 3.5424480465572343, + "tokens_seen": 1379069952 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002940421263791374, + "loss": 2.719, + "theoretical_loss": 3.5424329638605396, + "tokens_seen": 1379135488 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029403209628886664, + "loss": 2.6414, + "theoretical_loss": 3.5424178820812235, + "tokens_seen": 1379201024 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029402206619859577, + "loss": 2.6062, + "theoretical_loss": 3.542402801219187, + "tokens_seen": 1379266560 + }, + { + "epoch": 4.06, + "learning_rate": 0.000294012036108325, + "loss": 2.721, + "theoretical_loss": 3.5423877212743307, + "tokens_seen": 1379332096 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029400200601805413, + "loss": 2.6475, + "theoretical_loss": 3.542372642246555, + "tokens_seen": 1379397632 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029399197592778337, + "loss": 2.6133, + "theoretical_loss": 3.542357564135761, + "tokens_seen": 1379463168 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1546362, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.725088119506836, + "objective/train/theoretical_loss": 3.542342486941849, + "objective/train/tokens_used": 1399988704, + "theoretical_loss": 3.542342486941849, + "tokens_seen": 1379528704 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029398194583751255, + "loss": 2.7855, + "theoretical_loss": 3.542342486941849, + "tokens_seen": 1379528704 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029397191574724173, + "loss": 2.7274, + "theoretical_loss": 3.5423274106647202, + "tokens_seen": 1379594240 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002939618856569709, + "loss": 2.7593, + "theoretical_loss": 3.542312335304275, + "tokens_seen": 1379659776 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029395185556670015, + "loss": 2.5319, + "theoretical_loss": 3.5422972608604133, + "tokens_seen": 1379725312 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002939418254764293, + "loss": 2.5069, + "theoretical_loss": 3.5422821873330372, + "tokens_seen": 1379790848 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002939317953861585, + "loss": 2.8197, + "theoretical_loss": 3.542267114722047, + "tokens_seen": 1379856384 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029392176529588764, + "loss": 2.6525, + "theoretical_loss": 3.542252043027343, + "tokens_seen": 1379921920 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029391173520561687, + "loss": 2.783, + "theoretical_loss": 3.5422369722488263, + "tokens_seen": 1379987456 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029390170511534605, + "loss": 2.641, + "theoretical_loss": 3.5422219023863977, + "tokens_seen": 1380052992 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029389167502507523, + "loss": 2.7446, + "theoretical_loss": 3.542206833439958, + "tokens_seen": 1380118528 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002938816449348044, + "loss": 2.8127, + "theoretical_loss": 3.5421917654094086, + "tokens_seen": 1380184064 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002938716148445336, + "loss": 2.7673, + "theoretical_loss": 3.542176698294649, + "tokens_seen": 1380249600 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002938615847542628, + "loss": 2.7564, + "theoretical_loss": 3.5421616320955813, + "tokens_seen": 1380315136 + }, + { + "epoch": 4.06, + "learning_rate": 0.000293851554663992, + "loss": 2.6335, + "theoretical_loss": 3.5421465668121064, + "tokens_seen": 1380380672 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029384152457372114, + "loss": 2.8381, + "theoretical_loss": 3.542131502444124, + "tokens_seen": 1380446208 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002938314944834504, + "loss": 2.9114, + "theoretical_loss": 3.542116438991536, + "tokens_seen": 1380511744 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002938214643931795, + "loss": 2.6381, + "theoretical_loss": 3.542101376454243, + "tokens_seen": 1380577280 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029381143430290874, + "loss": 2.6397, + "theoretical_loss": 3.5420863148321464, + "tokens_seen": 1380642816 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002938014042126379, + "loss": 2.5404, + "theoretical_loss": 3.5420712541251462, + "tokens_seen": 1380708352 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002937913741223671, + "loss": 2.5423, + "theoretical_loss": 3.5420561943331443, + "tokens_seen": 1380773888 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002937813440320963, + "loss": 2.795, + "theoretical_loss": 3.5420411354560413, + "tokens_seen": 1380839424 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002937713139418255, + "loss": 2.565, + "theoretical_loss": 3.542026077493738, + "tokens_seen": 1380904960 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029376128385155464, + "loss": 2.636, + "theoretical_loss": 3.542011020446136, + "tokens_seen": 1380970496 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002937512537612839, + "loss": 2.8553, + "theoretical_loss": 3.541995964313136, + "tokens_seen": 1381036032 + }, + { + "epoch": 4.06, + "learning_rate": 0.000293741223671013, + "loss": 2.5084, + "theoretical_loss": 3.5419809090946384, + "tokens_seen": 1381101568 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1551454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5593550205230713, + "objective/train/theoretical_loss": 3.5419658547905453, + "objective/train/tokens_used": 1401627104, + "theoretical_loss": 3.5419658547905453, + "tokens_seen": 1381167104 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029373119358074224, + "loss": 2.5186, + "theoretical_loss": 3.5419658547905453, + "tokens_seen": 1381167104 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002937211634904714, + "loss": 2.4587, + "theoretical_loss": 3.5419508014007572, + "tokens_seen": 1381232640 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002937111334002006, + "loss": 2.6895, + "theoretical_loss": 3.5419357489251757, + "tokens_seen": 1381298176 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002937011033099298, + "loss": 2.7143, + "theoretical_loss": 3.541920697363701, + "tokens_seen": 1381363712 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029369107321965896, + "loss": 2.7308, + "theoretical_loss": 3.5419056467162355, + "tokens_seen": 1381429248 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029368104312938815, + "loss": 2.8324, + "theoretical_loss": 3.5418905969826793, + "tokens_seen": 1381494784 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002936710130391174, + "loss": 2.6625, + "theoretical_loss": 3.541875548162934, + "tokens_seen": 1381560320 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002936609829488465, + "loss": 2.7212, + "theoretical_loss": 3.5418605002569006, + "tokens_seen": 1381625856 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029365095285857574, + "loss": 2.706, + "theoretical_loss": 3.5418454532644796, + "tokens_seen": 1381691392 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029364092276830487, + "loss": 2.4649, + "theoretical_loss": 3.5418304071855737, + "tokens_seen": 1381756928 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002936308926780341, + "loss": 2.7022, + "theoretical_loss": 3.5418153620200834, + "tokens_seen": 1381822464 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002936208625877633, + "loss": 2.6936, + "theoretical_loss": 3.5418003177679096, + "tokens_seen": 1381888000 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029361083249749247, + "loss": 2.9936, + "theoretical_loss": 3.541785274428954, + "tokens_seen": 1381953536 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029360080240722165, + "loss": 2.7206, + "theoretical_loss": 3.5417702320031177, + "tokens_seen": 1382019072 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002935907723169509, + "loss": 2.5303, + "theoretical_loss": 3.541755190490302, + "tokens_seen": 1382084608 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029358074222668, + "loss": 2.9346, + "theoretical_loss": 3.541740149890408, + "tokens_seen": 1382150144 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029357071213640925, + "loss": 2.3615, + "theoretical_loss": 3.541725110203337, + "tokens_seen": 1382215680 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002935606820461384, + "loss": 2.7336, + "theoretical_loss": 3.541710071428991, + "tokens_seen": 1382281216 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002935506519558676, + "loss": 2.5758, + "theoretical_loss": 3.5416950335672706, + "tokens_seen": 1382346752 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029354062186559684, + "loss": 2.6175, + "theoretical_loss": 3.541679996618077, + "tokens_seen": 1382412288 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029353059177532597, + "loss": 2.7783, + "theoretical_loss": 3.541664960581312, + "tokens_seen": 1382477824 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002935205616850552, + "loss": 2.6292, + "theoretical_loss": 3.541649925456878, + "tokens_seen": 1382543360 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029351053159478433, + "loss": 2.7404, + "theoretical_loss": 3.5416348912446742, + "tokens_seen": 1382608896 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029350050150451357, + "loss": 2.5828, + "theoretical_loss": 3.541619857944604, + "tokens_seen": 1382674432 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029349047141424275, + "loss": 2.6829, + "theoretical_loss": 3.5416048255565675, + "tokens_seen": 1382739968 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1552054, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0920627117156982, + "objective/train/theoretical_loss": 3.5415897940804664, + "objective/train/tokens_used": 1403265504, + "theoretical_loss": 3.5415897940804664, + "tokens_seen": 1382805504 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029348044132397193, + "loss": 2.8501, + "theoretical_loss": 3.5415897940804664, + "tokens_seen": 1382805504 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002934704112337011, + "loss": 2.7506, + "theoretical_loss": 3.541574763516203, + "tokens_seen": 1382871040 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029346038114343035, + "loss": 2.7521, + "theoretical_loss": 3.5415597338636777, + "tokens_seen": 1382936576 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002934503510531595, + "loss": 2.869, + "theoretical_loss": 3.5415447051227926, + "tokens_seen": 1383002112 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002934403209628887, + "loss": 2.6461, + "theoretical_loss": 3.5415296772934495, + "tokens_seen": 1383067648 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029343029087261784, + "loss": 2.7489, + "theoretical_loss": 3.541514650375549, + "tokens_seen": 1383133184 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029342026078234707, + "loss": 2.9143, + "theoretical_loss": 3.541499624368993, + "tokens_seen": 1383198720 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029341023069207625, + "loss": 2.7198, + "theoretical_loss": 3.5414845992736836, + "tokens_seen": 1383264256 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029340020060180543, + "loss": 2.4439, + "theoretical_loss": 3.541469575089522, + "tokens_seen": 1383329792 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002933901705115346, + "loss": 2.6594, + "theoretical_loss": 3.5414545518164093, + "tokens_seen": 1383395328 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002933801404212638, + "loss": 2.6616, + "theoretical_loss": 3.541439529454248, + "tokens_seen": 1383460864 + }, + { + "epoch": 4.06, + "learning_rate": 0.000293370110330993, + "loss": 2.879, + "theoretical_loss": 3.541424508002939, + "tokens_seen": 1383526400 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002933600802407222, + "loss": 2.6416, + "theoretical_loss": 3.541409487462384, + "tokens_seen": 1383591936 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029335005015045134, + "loss": 2.6155, + "theoretical_loss": 3.541394467832485, + "tokens_seen": 1383657472 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002933400200601806, + "loss": 2.6164, + "theoretical_loss": 3.541379449113144, + "tokens_seen": 1383723008 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002933299899699097, + "loss": 2.8243, + "theoretical_loss": 3.5413644313042614, + "tokens_seen": 1383788544 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029331995987963894, + "loss": 2.6085, + "theoretical_loss": 3.5413494144057402, + "tokens_seen": 1383854080 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002933099297893681, + "loss": 2.6678, + "theoretical_loss": 3.541334398417481, + "tokens_seen": 1383919616 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002932998996990973, + "loss": 2.6854, + "theoretical_loss": 3.541319383339387, + "tokens_seen": 1383985152 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002932898696088265, + "loss": 2.4524, + "theoretical_loss": 3.5413043691713586, + "tokens_seen": 1384050688 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002932798395185557, + "loss": 2.6441, + "theoretical_loss": 3.541289355913298, + "tokens_seen": 1384116224 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029326980942828484, + "loss": 2.7086, + "theoretical_loss": 3.5412743435651066, + "tokens_seen": 1384181760 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002932597793380141, + "loss": 2.6357, + "theoretical_loss": 3.541259332126687, + "tokens_seen": 1384247296 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002932497492477432, + "loss": 2.5468, + "theoretical_loss": 3.54124432159794, + "tokens_seen": 1384312832 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029323971915747244, + "loss": 2.7114, + "theoretical_loss": 3.5412293119787686, + "tokens_seen": 1384378368 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1553388, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.496983051300049, + "objective/train/theoretical_loss": 3.5412143032690735, + "objective/train/tokens_used": 1404903904, + "theoretical_loss": 3.5412143032690735, + "tokens_seen": 1384443904 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002932296890672016, + "loss": 2.5614, + "theoretical_loss": 3.5412143032690735, + "tokens_seen": 1384443904 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002932196589769308, + "loss": 2.6653, + "theoretical_loss": 3.5411992954687572, + "tokens_seen": 1384509440 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029320962888666, + "loss": 2.656, + "theoretical_loss": 3.541184288577721, + "tokens_seen": 1384574976 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029319959879638916, + "loss": 2.7702, + "theoretical_loss": 3.5411692825958676, + "tokens_seen": 1384640512 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029318956870611835, + "loss": 2.6418, + "theoretical_loss": 3.5411542775230984, + "tokens_seen": 1384706048 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002931795386158476, + "loss": 2.6088, + "theoretical_loss": 3.541139273359315, + "tokens_seen": 1384771584 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002931695085255767, + "loss": 2.9696, + "theoretical_loss": 3.54112427010442, + "tokens_seen": 1384837120 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029315947843530594, + "loss": 2.6929, + "theoretical_loss": 3.541109267758315, + "tokens_seen": 1384902656 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029314944834503507, + "loss": 2.9294, + "theoretical_loss": 3.541094266320902, + "tokens_seen": 1384968192 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002931394182547643, + "loss": 2.5998, + "theoretical_loss": 3.541079265792083, + "tokens_seen": 1385033728 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002931293881644935, + "loss": 2.5983, + "theoretical_loss": 3.5410642661717597, + "tokens_seen": 1385099264 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029311935807422267, + "loss": 2.7155, + "theoretical_loss": 3.541049267459834, + "tokens_seen": 1385164800 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029310932798395185, + "loss": 2.8903, + "theoretical_loss": 3.5410342696562083, + "tokens_seen": 1385230336 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002930992978936811, + "loss": 2.6472, + "theoretical_loss": 3.541019272760785, + "tokens_seen": 1385295872 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002930892678034102, + "loss": 2.8402, + "theoretical_loss": 3.5410042767734655, + "tokens_seen": 1385361408 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029307923771313945, + "loss": 2.6837, + "theoretical_loss": 3.5409892816941513, + "tokens_seen": 1385426944 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002930692076228686, + "loss": 2.6073, + "theoretical_loss": 3.5409742875227463, + "tokens_seen": 1385492480 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002930591775325978, + "loss": 2.6603, + "theoretical_loss": 3.540959294259151, + "tokens_seen": 1385558016 + }, + { + "epoch": 4.06, + "learning_rate": 0.000293049147442327, + "loss": 2.7129, + "theoretical_loss": 3.540944301903268, + "tokens_seen": 1385623552 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029303911735205617, + "loss": 2.4107, + "theoretical_loss": 3.540929310454999, + "tokens_seen": 1385689088 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029302908726178535, + "loss": 2.8114, + "theoretical_loss": 3.540914319914247, + "tokens_seen": 1385754624 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029301905717151453, + "loss": 2.68, + "theoretical_loss": 3.5408993302809133, + "tokens_seen": 1385820160 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002930090270812437, + "loss": 2.6964, + "theoretical_loss": 3.540884341554901, + "tokens_seen": 1385885696 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029299899699097295, + "loss": 2.6005, + "theoretical_loss": 3.5408693537361113, + "tokens_seen": 1385951232 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002929889669007021, + "loss": 2.7721, + "theoretical_loss": 3.540854366824447, + "tokens_seen": 1386016768 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1554184, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.679152250289917, + "objective/train/theoretical_loss": 3.5408393808198104, + "objective/train/tokens_used": 1406542304, + "theoretical_loss": 3.5408393808198104, + "tokens_seen": 1386082304 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002929789368104313, + "loss": 2.7258, + "theoretical_loss": 3.5408393808198104, + "tokens_seen": 1386082304 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002929689067201605, + "loss": 2.9701, + "theoretical_loss": 3.540824395722103, + "tokens_seen": 1386147840 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002929588766298897, + "loss": 2.6006, + "theoretical_loss": 3.540809411531228, + "tokens_seen": 1386213376 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029294884653961885, + "loss": 2.8074, + "theoretical_loss": 3.5407944282470867, + "tokens_seen": 1386278912 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029293881644934804, + "loss": 2.5889, + "theoretical_loss": 3.540779445869582, + "tokens_seen": 1386344448 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002929287863590772, + "loss": 2.7228, + "theoretical_loss": 3.540764464398616, + "tokens_seen": 1386409984 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029291875626880645, + "loss": 2.6788, + "theoretical_loss": 3.540749483834091, + "tokens_seen": 1386475520 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002929087261785356, + "loss": 2.8537, + "theoretical_loss": 3.540734504175909, + "tokens_seen": 1386541056 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002928986960882648, + "loss": 2.8571, + "theoretical_loss": 3.540719525423973, + "tokens_seen": 1386606592 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029288866599799394, + "loss": 2.8249, + "theoretical_loss": 3.540704547578185, + "tokens_seen": 1386672128 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002928786359077232, + "loss": 2.819, + "theoretical_loss": 3.540689570638447, + "tokens_seen": 1386737664 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029286860581745236, + "loss": 2.8209, + "theoretical_loss": 3.5406745946046616, + "tokens_seen": 1386803200 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029285857572718154, + "loss": 2.8291, + "theoretical_loss": 3.540659619476732, + "tokens_seen": 1386868736 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002928485456369107, + "loss": 2.6679, + "theoretical_loss": 3.5406446452545595, + "tokens_seen": 1386934272 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002928385155466399, + "loss": 2.8749, + "theoretical_loss": 3.5406296719380475, + "tokens_seen": 1386999808 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002928284854563691, + "loss": 2.7219, + "theoretical_loss": 3.5406146995270973, + "tokens_seen": 1387065344 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002928184553660983, + "loss": 2.7416, + "theoretical_loss": 3.540599728021612, + "tokens_seen": 1387130880 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029280842527582744, + "loss": 2.7386, + "theoretical_loss": 3.540584757421494, + "tokens_seen": 1387196416 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002927983951855567, + "loss": 2.4624, + "theoretical_loss": 3.5405697877266458, + "tokens_seen": 1387261952 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002927883650952859, + "loss": 2.6384, + "theoretical_loss": 3.54055481893697, + "tokens_seen": 1387327488 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029277833500501504, + "loss": 2.6868, + "theoretical_loss": 3.540539851052369, + "tokens_seen": 1387393024 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002927683049147443, + "loss": 2.5707, + "theoretical_loss": 3.5405248840727452, + "tokens_seen": 1387458560 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002927582748244734, + "loss": 2.7434, + "theoretical_loss": 3.5405099179980013, + "tokens_seen": 1387524096 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029274824473420264, + "loss": 2.6813, + "theoretical_loss": 3.5404949528280394, + "tokens_seen": 1387589632 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002927382146439318, + "loss": 2.73, + "theoretical_loss": 3.5404799885627627, + "tokens_seen": 1387655168 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1555725, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6399734020233154, + "objective/train/theoretical_loss": 3.540465025202074, + "objective/train/tokens_used": 1408180704, + "theoretical_loss": 3.540465025202074, + "tokens_seen": 1387720704 + }, + { + "epoch": 4.06, + "learning_rate": 0.000292728184553661, + "loss": 2.8601, + "theoretical_loss": 3.540465025202074, + "tokens_seen": 1387720704 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002927181544633902, + "loss": 2.7116, + "theoretical_loss": 3.540450062745875, + "tokens_seen": 1387786240 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029270812437311936, + "loss": 2.6407, + "theoretical_loss": 3.540435101194069, + "tokens_seen": 1387851776 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029269809428284855, + "loss": 2.7734, + "theoretical_loss": 3.5404201405465585, + "tokens_seen": 1387917312 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002926880641925778, + "loss": 2.8065, + "theoretical_loss": 3.540405180803246, + "tokens_seen": 1387982848 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002926780341023069, + "loss": 2.4892, + "theoretical_loss": 3.540390221964034, + "tokens_seen": 1388048384 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029266800401203614, + "loss": 2.615, + "theoretical_loss": 3.5403752640288255, + "tokens_seen": 1388113920 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029265797392176527, + "loss": 2.6055, + "theoretical_loss": 3.540360306997523, + "tokens_seen": 1388179456 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002926479438314945, + "loss": 2.7447, + "theoretical_loss": 3.5403453508700298, + "tokens_seen": 1388244992 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002926379137412237, + "loss": 2.6932, + "theoretical_loss": 3.5403303956462477, + "tokens_seen": 1388310528 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029262788365095287, + "loss": 2.5505, + "theoretical_loss": 3.5403154413260802, + "tokens_seen": 1388376064 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029261785356068205, + "loss": 2.877, + "theoretical_loss": 3.5403004879094295, + "tokens_seen": 1388441600 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002926078234704113, + "loss": 2.5885, + "theoretical_loss": 3.5402855353961984, + "tokens_seen": 1388507136 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002925977933801404, + "loss": 2.6713, + "theoretical_loss": 3.54027058378629, + "tokens_seen": 1388572672 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029258776328986965, + "loss": 2.496, + "theoretical_loss": 3.540255633079607, + "tokens_seen": 1388638208 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002925777331995988, + "loss": 2.5759, + "theoretical_loss": 3.5402406832760525, + "tokens_seen": 1388703744 + }, + { + "epoch": 4.06, + "learning_rate": 0.000292567703109328, + "loss": 2.5836, + "theoretical_loss": 3.540225734375529, + "tokens_seen": 1388769280 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002925576730190572, + "loss": 2.5895, + "theoretical_loss": 3.540210786377939, + "tokens_seen": 1388834816 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029254764292878637, + "loss": 2.6178, + "theoretical_loss": 3.5401958392831854, + "tokens_seen": 1388900352 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029253761283851555, + "loss": 2.4121, + "theoretical_loss": 3.540180893091172, + "tokens_seen": 1388965888 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029252758274824473, + "loss": 2.6844, + "theoretical_loss": 3.5401659478018006, + "tokens_seen": 1389031424 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002925175526579739, + "loss": 2.6171, + "theoretical_loss": 3.540151003414975, + "tokens_seen": 1389096960 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029250752256770315, + "loss": 2.6658, + "theoretical_loss": 3.5401360599305973, + "tokens_seen": 1389162496 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002924974924774323, + "loss": 2.774, + "theoretical_loss": 3.540121117348571, + "tokens_seen": 1389228032 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002924874623871615, + "loss": 2.7155, + "theoretical_loss": 3.540106175668799, + "tokens_seen": 1389293568 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1556470, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5523812770843506, + "objective/train/theoretical_loss": 3.540091234891184, + "objective/train/tokens_used": 1409819104, + "theoretical_loss": 3.540091234891184, + "tokens_seen": 1389359104 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002924774322968907, + "loss": 2.7275, + "theoretical_loss": 3.540091234891184, + "tokens_seen": 1389359104 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002924674022066199, + "loss": 2.6922, + "theoretical_loss": 3.540076295015629, + "tokens_seen": 1389424640 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029245737211634906, + "loss": 2.7791, + "theoretical_loss": 3.5400613560420373, + "tokens_seen": 1389490176 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029244734202607824, + "loss": 2.8218, + "theoretical_loss": 3.5400464179703115, + "tokens_seen": 1389555712 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002924373119358074, + "loss": 2.4633, + "theoretical_loss": 3.540031480800355, + "tokens_seen": 1389621248 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029242728184553665, + "loss": 2.8368, + "theoretical_loss": 3.5400165445320706, + "tokens_seen": 1389686784 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002924172517552658, + "loss": 2.6233, + "theoretical_loss": 3.540001609165362, + "tokens_seen": 1389752320 + }, + { + "epoch": 4.06, + "learning_rate": 0.000292407221664995, + "loss": 2.7833, + "theoretical_loss": 3.539986674700131, + "tokens_seen": 1389817856 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029239719157472414, + "loss": 2.6574, + "theoretical_loss": 3.5399717411362817, + "tokens_seen": 1389883392 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002923871614844534, + "loss": 2.7321, + "theoretical_loss": 3.5399568084737165, + "tokens_seen": 1389948928 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029237713139418256, + "loss": 2.7022, + "theoretical_loss": 3.539941876712339, + "tokens_seen": 1390014464 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029236710130391174, + "loss": 2.8063, + "theoretical_loss": 3.5399269458520526, + "tokens_seen": 1390080000 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002923570712136409, + "loss": 2.786, + "theoretical_loss": 3.53991201589276, + "tokens_seen": 1390145536 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002923470411233701, + "loss": 2.6066, + "theoretical_loss": 3.539897086834364, + "tokens_seen": 1390211072 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002923370110330993, + "loss": 2.6857, + "theoretical_loss": 3.539882158676768, + "tokens_seen": 1390276608 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002923269809428285, + "loss": 2.6082, + "theoretical_loss": 3.5398672314198762, + "tokens_seen": 1390342144 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029231695085255765, + "loss": 2.6024, + "theoretical_loss": 3.539852305063591, + "tokens_seen": 1390407680 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002923069207622869, + "loss": 2.6098, + "theoretical_loss": 3.539837379607815, + "tokens_seen": 1390473216 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029229689067201606, + "loss": 2.5219, + "theoretical_loss": 3.539822455052452, + "tokens_seen": 1390538752 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029228686058174524, + "loss": 2.6146, + "theoretical_loss": 3.539807531397406, + "tokens_seen": 1390604288 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002922768304914744, + "loss": 2.6021, + "theoretical_loss": 3.539792608642579, + "tokens_seen": 1390669824 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002922668004012036, + "loss": 2.8935, + "theoretical_loss": 3.539777686787875, + "tokens_seen": 1390735360 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002922567703109328, + "loss": 2.566, + "theoretical_loss": 3.539762765833197, + "tokens_seen": 1390800896 + }, + { + "epoch": 4.06, + "learning_rate": 0.000292246740220662, + "loss": 2.7801, + "theoretical_loss": 3.5397478457784484, + "tokens_seen": 1390866432 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029223671013039115, + "loss": 2.6456, + "theoretical_loss": 3.5397329266235324, + "tokens_seen": 1390931968 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1557630, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9702374935150146, + "objective/train/theoretical_loss": 3.539718008368353, + "objective/train/tokens_used": 1411457504, + "theoretical_loss": 3.539718008368353, + "tokens_seen": 1390997504 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002922266800401204, + "loss": 2.6979, + "theoretical_loss": 3.539718008368353, + "tokens_seen": 1390997504 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002922166499498495, + "loss": 2.7617, + "theoretical_loss": 3.5397030910128127, + "tokens_seen": 1391063040 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029220661985957875, + "loss": 2.7569, + "theoretical_loss": 3.539688174556815, + "tokens_seen": 1391128576 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002921965897693079, + "loss": 2.6994, + "theoretical_loss": 3.5396732590002635, + "tokens_seen": 1391194112 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002921865596790371, + "loss": 2.7891, + "theoretical_loss": 3.5396583443430623, + "tokens_seen": 1391259648 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002921765295887663, + "loss": 2.6115, + "theoretical_loss": 3.539643430585113, + "tokens_seen": 1391325184 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029216649949849547, + "loss": 2.9884, + "theoretical_loss": 3.5396285177263214, + "tokens_seen": 1391390720 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029215646940822465, + "loss": 2.7357, + "theoretical_loss": 3.539613605766589, + "tokens_seen": 1391456256 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002921464393179539, + "loss": 2.7335, + "theoretical_loss": 3.53959869470582, + "tokens_seen": 1391521792 + }, + { + "epoch": 4.06, + "learning_rate": 0.000292136409227683, + "loss": 2.7324, + "theoretical_loss": 3.5395837845439173, + "tokens_seen": 1391587328 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029212637913741225, + "loss": 2.8466, + "theoretical_loss": 3.5395688752807857, + "tokens_seen": 1391652864 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029211634904714143, + "loss": 2.8641, + "theoretical_loss": 3.539553966916327, + "tokens_seen": 1391718400 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002921063189568706, + "loss": 2.6256, + "theoretical_loss": 3.539539059450447, + "tokens_seen": 1391783936 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002920962888665998, + "loss": 2.7322, + "theoretical_loss": 3.539524152883047, + "tokens_seen": 1391849472 + }, + { + "epoch": 4.06, + "learning_rate": 0.000292086258776329, + "loss": 2.6218, + "theoretical_loss": 3.539509247214031, + "tokens_seen": 1391915008 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029207622868605815, + "loss": 2.6119, + "theoretical_loss": 3.539494342443303, + "tokens_seen": 1391980544 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002920661985957874, + "loss": 2.8052, + "theoretical_loss": 3.539479438570767, + "tokens_seen": 1392046080 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002920561685055165, + "loss": 2.4703, + "theoretical_loss": 3.5394645355963257, + "tokens_seen": 1392111616 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029204613841524575, + "loss": 2.838, + "theoretical_loss": 3.539449633519883, + "tokens_seen": 1392177152 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029203610832497493, + "loss": 2.8107, + "theoretical_loss": 3.539434732341343, + "tokens_seen": 1392242688 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002920260782347041, + "loss": 2.6331, + "theoretical_loss": 3.5394198320606094, + "tokens_seen": 1392308224 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029201604814443335, + "loss": 2.962, + "theoretical_loss": 3.5394049326775847, + "tokens_seen": 1392373760 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002920060180541625, + "loss": 2.6808, + "theoretical_loss": 3.5393900341921736, + "tokens_seen": 1392439296 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002919959879638917, + "loss": 2.746, + "theoretical_loss": 3.5393751366042796, + "tokens_seen": 1392504832 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002919859578736209, + "loss": 2.7237, + "theoretical_loss": 3.5393602399138056, + "tokens_seen": 1392570368 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1558283, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4765679836273193, + "objective/train/theoretical_loss": 3.5393453441206564, + "objective/train/tokens_used": 1413095904, + "theoretical_loss": 3.5393453441206564, + "tokens_seen": 1392635904 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002919759277833501, + "loss": 2.6099, + "theoretical_loss": 3.5393453441206564, + "tokens_seen": 1392635904 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029196589769307926, + "loss": 2.64, + "theoretical_loss": 3.5393304492247353, + "tokens_seen": 1392701440 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029195586760280844, + "loss": 2.7599, + "theoretical_loss": 3.5393155552259463, + "tokens_seen": 1392766976 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002919458375125376, + "loss": 2.5263, + "theoretical_loss": 3.5393006621241927, + "tokens_seen": 1392832512 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029193580742226685, + "loss": 2.4846, + "theoretical_loss": 3.5392857699193785, + "tokens_seen": 1392898048 + }, + { + "epoch": 4.06, + "learning_rate": 0.000291925777331996, + "loss": 2.7272, + "theoretical_loss": 3.539270878611407, + "tokens_seen": 1392963584 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002919157472417252, + "loss": 2.8948, + "theoretical_loss": 3.539255988200183, + "tokens_seen": 1393029120 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029190571715145434, + "loss": 2.6318, + "theoretical_loss": 3.5392410986856095, + "tokens_seen": 1393094656 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002918956870611836, + "loss": 2.6671, + "theoretical_loss": 3.5392262100675906, + "tokens_seen": 1393160192 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029188565697091276, + "loss": 2.6477, + "theoretical_loss": 3.53921132234603, + "tokens_seen": 1393225728 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029187562688064194, + "loss": 2.6761, + "theoretical_loss": 3.5391964355208323, + "tokens_seen": 1393291264 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002918655967903711, + "loss": 2.6871, + "theoretical_loss": 3.5391815495919, + "tokens_seen": 1393356800 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002918555667001003, + "loss": 2.7094, + "theoretical_loss": 3.539166664559138, + "tokens_seen": 1393422336 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002918455366098295, + "loss": 2.7393, + "theoretical_loss": 3.5391517804224506, + "tokens_seen": 1393487872 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002918355065195587, + "loss": 2.6149, + "theoretical_loss": 3.5391368971817405, + "tokens_seen": 1393553408 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029182547642928785, + "loss": 2.6424, + "theoretical_loss": 3.539122014836912, + "tokens_seen": 1393618944 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002918154463390171, + "loss": 2.7817, + "theoretical_loss": 3.539107133387869, + "tokens_seen": 1393684480 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029180541624874626, + "loss": 2.7797, + "theoretical_loss": 3.539092252834517, + "tokens_seen": 1393750016 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029179538615847544, + "loss": 2.4839, + "theoretical_loss": 3.5390773731767577, + "tokens_seen": 1393815552 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002917853560682046, + "loss": 2.8268, + "theoretical_loss": 3.5390624944144964, + "tokens_seen": 1393881088 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002917753259779338, + "loss": 2.6579, + "theoretical_loss": 3.5390476165476366, + "tokens_seen": 1393946624 + }, + { + "epoch": 4.06, + "learning_rate": 0.000291765295887663, + "loss": 2.5451, + "theoretical_loss": 3.5390327395760828, + "tokens_seen": 1394012160 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002917552657973922, + "loss": 2.6735, + "theoretical_loss": 3.5390178634997382, + "tokens_seen": 1394077696 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029174523570712135, + "loss": 2.6657, + "theoretical_loss": 3.539002988318508, + "tokens_seen": 1394143232 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002917352056168506, + "loss": 2.6527, + "theoretical_loss": 3.5389881140322954, + "tokens_seen": 1394208768 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1559602, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4316999912261963, + "objective/train/theoretical_loss": 3.5389732406410044, + "objective/train/tokens_used": 1414734304, + "theoretical_loss": 3.5389732406410044, + "tokens_seen": 1394274304 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002917251755265797, + "loss": 2.7982, + "theoretical_loss": 3.5389732406410044, + "tokens_seen": 1394274304 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029171514543630895, + "loss": 2.7828, + "theoretical_loss": 3.53895836814454, + "tokens_seen": 1394339840 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002917051153460381, + "loss": 2.6216, + "theoretical_loss": 3.5389434965428057, + "tokens_seen": 1394405376 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002916950852557673, + "loss": 2.6401, + "theoretical_loss": 3.5389286258357053, + "tokens_seen": 1394470912 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002916850551654965, + "loss": 2.5921, + "theoretical_loss": 3.5389137560231436, + "tokens_seen": 1394536448 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029167502507522567, + "loss": 2.5946, + "theoretical_loss": 3.5388988871050246, + "tokens_seen": 1394601984 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029166499498495485, + "loss": 2.7326, + "theoretical_loss": 3.538884019081252, + "tokens_seen": 1394667520 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002916549648946841, + "loss": 2.6635, + "theoretical_loss": 3.5388691519517304, + "tokens_seen": 1394733056 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002916449348044132, + "loss": 2.6518, + "theoretical_loss": 3.538854285716364, + "tokens_seen": 1394798592 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029163490471414245, + "loss": 2.7532, + "theoretical_loss": 3.5388394203750573, + "tokens_seen": 1394864128 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029162487462387163, + "loss": 2.6184, + "theoretical_loss": 3.5388245559277136, + "tokens_seen": 1394929664 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002916148445336008, + "loss": 2.7091, + "theoretical_loss": 3.5388096923742385, + "tokens_seen": 1394995200 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029160481444333, + "loss": 2.4223, + "theoretical_loss": 3.5387948297145346, + "tokens_seen": 1395060736 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002915947843530592, + "loss": 2.5134, + "theoretical_loss": 3.5387799679485075, + "tokens_seen": 1395126272 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029158475426278835, + "loss": 2.5809, + "theoretical_loss": 3.5387651070760606, + "tokens_seen": 1395191808 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002915747241725176, + "loss": 2.7848, + "theoretical_loss": 3.538750247097099, + "tokens_seen": 1395257344 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002915646940822467, + "loss": 2.6923, + "theoretical_loss": 3.538735388011527, + "tokens_seen": 1395322880 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029155466399197595, + "loss": 2.5315, + "theoretical_loss": 3.5387205298192477, + "tokens_seen": 1395388416 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002915446339017051, + "loss": 2.7831, + "theoretical_loss": 3.538705672520167, + "tokens_seen": 1395453952 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002915346038114343, + "loss": 2.5866, + "theoretical_loss": 3.5386908161141886, + "tokens_seen": 1395519488 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002915245737211635, + "loss": 2.5707, + "theoretical_loss": 3.538675960601217, + "tokens_seen": 1395585024 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002915145436308927, + "loss": 2.5683, + "theoretical_loss": 3.5386611059811557, + "tokens_seen": 1395650560 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029150451354062186, + "loss": 2.8669, + "theoretical_loss": 3.53864625225391, + "tokens_seen": 1395716096 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002914944834503511, + "loss": 2.4638, + "theoretical_loss": 3.5386313994193843, + "tokens_seen": 1395781632 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002914844533600802, + "loss": 2.4921, + "theoretical_loss": 3.538616547477483, + "tokens_seen": 1395847168 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1560364, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.397831916809082, + "objective/train/theoretical_loss": 3.5386016964281106, + "objective/train/tokens_used": 1416372704, + "theoretical_loss": 3.5386016964281106, + "tokens_seen": 1395912704 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029147442326980946, + "loss": 2.7208, + "theoretical_loss": 3.5386016964281106, + "tokens_seen": 1395912704 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002914643931795386, + "loss": 2.6569, + "theoretical_loss": 3.538586846271172, + "tokens_seen": 1395978240 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002914543630892678, + "loss": 2.8526, + "theoretical_loss": 3.5385719970065703, + "tokens_seen": 1396043776 + }, + { + "epoch": 4.06, + "learning_rate": 0.000291444332998997, + "loss": 2.7612, + "theoretical_loss": 3.5385571486342107, + "tokens_seen": 1396109312 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002914343029087262, + "loss": 2.7004, + "theoretical_loss": 3.5385423011539983, + "tokens_seen": 1396174848 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029142427281845536, + "loss": 2.6992, + "theoretical_loss": 3.5385274545658367, + "tokens_seen": 1396240384 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029141424272818454, + "loss": 2.6948, + "theoretical_loss": 3.5385126088696315, + "tokens_seen": 1396305920 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002914042126379137, + "loss": 2.4636, + "theoretical_loss": 3.538497764065286, + "tokens_seen": 1396371456 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029139418254764296, + "loss": 2.7728, + "theoretical_loss": 3.5384829201527057, + "tokens_seen": 1396436992 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002913841524573721, + "loss": 2.6798, + "theoretical_loss": 3.538468077131795, + "tokens_seen": 1396502528 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002913741223671013, + "loss": 2.813, + "theoretical_loss": 3.538453235002458, + "tokens_seen": 1396568064 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029136409227683045, + "loss": 2.5104, + "theoretical_loss": 3.5384383937646, + "tokens_seen": 1396633600 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002913540621865597, + "loss": 2.4528, + "theoretical_loss": 3.538423553418125, + "tokens_seen": 1396699136 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029134403209628886, + "loss": 2.546, + "theoretical_loss": 3.538408713962938, + "tokens_seen": 1396764672 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029133400200601805, + "loss": 2.4749, + "theoretical_loss": 3.5383938753989437, + "tokens_seen": 1396830208 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002913239719157472, + "loss": 2.5486, + "theoretical_loss": 3.538379037726047, + "tokens_seen": 1396895744 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029131394182547646, + "loss": 2.7272, + "theoretical_loss": 3.5383642009441516, + "tokens_seen": 1396961280 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002913039117352056, + "loss": 2.775, + "theoretical_loss": 3.5383493650531634, + "tokens_seen": 1397026816 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002912938816449348, + "loss": 2.7766, + "theoretical_loss": 3.5383345300529863, + "tokens_seen": 1397092352 + }, + { + "epoch": 4.06, + "learning_rate": 0.000291283851554664, + "loss": 2.5114, + "theoretical_loss": 3.5383196959435255, + "tokens_seen": 1397157888 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002912738214643932, + "loss": 2.717, + "theoretical_loss": 3.5383048627246856, + "tokens_seen": 1397223424 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002912637913741224, + "loss": 2.6179, + "theoretical_loss": 3.5382900303963707, + "tokens_seen": 1397288960 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029125376128385155, + "loss": 2.6235, + "theoretical_loss": 3.5382751989584866, + "tokens_seen": 1397354496 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002912437311935808, + "loss": 2.628, + "theoretical_loss": 3.538260368410938, + "tokens_seen": 1397420032 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002912337011033099, + "loss": 2.5759, + "theoretical_loss": 3.538245538753629, + "tokens_seen": 1397485568 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1561743, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9683220386505127, + "objective/train/theoretical_loss": 3.5382307099864647, + "objective/train/tokens_used": 1418011104, + "theoretical_loss": 3.5382307099864647, + "tokens_seen": 1397551104 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029122367101303915, + "loss": 2.7218, + "theoretical_loss": 3.5382307099864647, + "tokens_seen": 1397551104 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029121364092276833, + "loss": 2.5486, + "theoretical_loss": 3.53821588210935, + "tokens_seen": 1397616640 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002912036108324975, + "loss": 2.5438, + "theoretical_loss": 3.5382010551221903, + "tokens_seen": 1397682176 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002911935807422267, + "loss": 2.6942, + "theoretical_loss": 3.5381862290248893, + "tokens_seen": 1397747712 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029118355065195587, + "loss": 2.6858, + "theoretical_loss": 3.5381714038173526, + "tokens_seen": 1397813248 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029117352056168505, + "loss": 2.7571, + "theoretical_loss": 3.538156579499485, + "tokens_seen": 1397878784 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002911634904714143, + "loss": 2.7165, + "theoretical_loss": 3.538141756071192, + "tokens_seen": 1397944320 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002911534603811434, + "loss": 2.6838, + "theoretical_loss": 3.538126933532377, + "tokens_seen": 1398009856 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029114343029087265, + "loss": 2.4973, + "theoretical_loss": 3.5381121118829464, + "tokens_seen": 1398075392 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029113340020060183, + "loss": 2.541, + "theoretical_loss": 3.538097291122804, + "tokens_seen": 1398140928 + }, + { + "epoch": 4.06, + "learning_rate": 0.000291123370110331, + "loss": 2.6344, + "theoretical_loss": 3.538082471251856, + "tokens_seen": 1398206464 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002911133400200602, + "loss": 2.5838, + "theoretical_loss": 3.5380676522700067, + "tokens_seen": 1398272000 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002911033099297894, + "loss": 2.4584, + "theoretical_loss": 3.5380528341771607, + "tokens_seen": 1398337536 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029109327983951855, + "loss": 2.5209, + "theoretical_loss": 3.5380380169732235, + "tokens_seen": 1398403072 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002910832497492478, + "loss": 2.7778, + "theoretical_loss": 3.5380232006581003, + "tokens_seen": 1398468608 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002910732196589769, + "loss": 2.719, + "theoretical_loss": 3.538008385231696, + "tokens_seen": 1398534144 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029106318956870615, + "loss": 2.6171, + "theoretical_loss": 3.537993570693915, + "tokens_seen": 1398599680 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002910531594784353, + "loss": 2.5908, + "theoretical_loss": 3.5379787570446632, + "tokens_seen": 1398665216 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002910431293881645, + "loss": 2.6341, + "theoretical_loss": 3.5379639442838453, + "tokens_seen": 1398730752 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002910330992978937, + "loss": 2.7537, + "theoretical_loss": 3.5379491324113665, + "tokens_seen": 1398796288 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002910230692076229, + "loss": 2.8437, + "theoretical_loss": 3.537934321427132, + "tokens_seen": 1398861824 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029101303911735206, + "loss": 2.6106, + "theoretical_loss": 3.537919511331047, + "tokens_seen": 1398927360 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002910030090270813, + "loss": 2.6991, + "theoretical_loss": 3.537904702123016, + "tokens_seen": 1398992896 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002909929789368104, + "loss": 2.5194, + "theoretical_loss": 3.537889893802945, + "tokens_seen": 1399058432 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029098294884653966, + "loss": 2.6954, + "theoretical_loss": 3.537875086370738, + "tokens_seen": 1399123968 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1562396, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.784968137741089, + "objective/train/theoretical_loss": 3.537860279826302, + "objective/train/tokens_used": 1419649504, + "theoretical_loss": 3.537860279826302, + "tokens_seen": 1399189504 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002909729187562688, + "loss": 2.6959, + "theoretical_loss": 3.537860279826302, + "tokens_seen": 1399189504 + }, + { + "epoch": 4.06, + "learning_rate": 0.000290962888665998, + "loss": 2.7707, + "theoretical_loss": 3.53784547416954, + "tokens_seen": 1399255040 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002909528585757272, + "loss": 2.6713, + "theoretical_loss": 3.5378306694003596, + "tokens_seen": 1399320576 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002909428284854564, + "loss": 2.5541, + "theoretical_loss": 3.537815865518664, + "tokens_seen": 1399386112 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029093279839518556, + "loss": 2.7361, + "theoretical_loss": 3.537801062524359, + "tokens_seen": 1399451648 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029092276830491474, + "loss": 2.5931, + "theoretical_loss": 3.5377862604173504, + "tokens_seen": 1399517184 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002909127382146439, + "loss": 2.6053, + "theoretical_loss": 3.5377714591975433, + "tokens_seen": 1399582720 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029090270812437316, + "loss": 2.6946, + "theoretical_loss": 3.5377566588648426, + "tokens_seen": 1399648256 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002908926780341023, + "loss": 2.6109, + "theoretical_loss": 3.537741859419154, + "tokens_seen": 1399713792 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002908826479438315, + "loss": 2.5046, + "theoretical_loss": 3.537727060860383, + "tokens_seen": 1399779328 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029087261785356065, + "loss": 2.7201, + "theoretical_loss": 3.5377122631884337, + "tokens_seen": 1399844864 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002908625877632899, + "loss": 2.5966, + "theoretical_loss": 3.5376974664032126, + "tokens_seen": 1399910400 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029085255767301906, + "loss": 2.5142, + "theoretical_loss": 3.5376826705046254, + "tokens_seen": 1399975936 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029084252758274825, + "loss": 2.3679, + "theoretical_loss": 3.537667875492576, + "tokens_seen": 1400041472 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002908324974924774, + "loss": 2.6371, + "theoretical_loss": 3.537653081366971, + "tokens_seen": 1400107008 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029082246740220666, + "loss": 2.6135, + "theoretical_loss": 3.5376382881277153, + "tokens_seen": 1400172544 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002908124373119358, + "loss": 2.5081, + "theoretical_loss": 3.5376234957747146, + "tokens_seen": 1400238080 + }, + { + "epoch": 4.06, + "learning_rate": 0.000290802407221665, + "loss": 2.6574, + "theoretical_loss": 3.537608704307874, + "tokens_seen": 1400303616 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029079237713139415, + "loss": 2.4363, + "theoretical_loss": 3.537593913727099, + "tokens_seen": 1400369152 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002907823470411234, + "loss": 2.6265, + "theoretical_loss": 3.5375791240322956, + "tokens_seen": 1400434688 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029077231695085257, + "loss": 2.8114, + "theoretical_loss": 3.5375643352233683, + "tokens_seen": 1400500224 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029076228686058175, + "loss": 2.444, + "theoretical_loss": 3.537549547300223, + "tokens_seen": 1400565760 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029075225677031093, + "loss": 2.6148, + "theoretical_loss": 3.537534760262766, + "tokens_seen": 1400631296 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002907422266800401, + "loss": 2.5593, + "theoretical_loss": 3.537519974110902, + "tokens_seen": 1400696832 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002907321965897693, + "loss": 2.5544, + "theoretical_loss": 3.537505188844536, + "tokens_seen": 1400762368 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 1563023, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.582470655441284, + "objective/train/theoretical_loss": 3.5374904044635747, + "objective/train/tokens_used": 1421287904, + "theoretical_loss": 3.5374904044635747, + "tokens_seen": 1400827904 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029072216649949853, + "loss": 2.6132, + "theoretical_loss": 3.5374904044635747, + "tokens_seen": 1400827904 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029071213640922765, + "loss": 2.778, + "theoretical_loss": 3.537475620967923, + "tokens_seen": 1400893440 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002907021063189569, + "loss": 2.6992, + "theoretical_loss": 3.5374608383574873, + "tokens_seen": 1400958976 + }, + { + "epoch": 4.06, + "learning_rate": 0.000290692076228686, + "loss": 2.7268, + "theoretical_loss": 3.537446056632172, + "tokens_seen": 1401024512 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029068204613841525, + "loss": 2.3954, + "theoretical_loss": 3.5374312757918833, + "tokens_seen": 1401090048 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029067201604814443, + "loss": 2.7221, + "theoretical_loss": 3.537416495836527, + "tokens_seen": 1401155584 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002906619859578736, + "loss": 2.4826, + "theoretical_loss": 3.5374017167660075, + "tokens_seen": 1401221120 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002906519558676028, + "loss": 2.5533, + "theoretical_loss": 3.537386938580233, + "tokens_seen": 1401286656 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029064192577733203, + "loss": 2.7717, + "theoretical_loss": 3.537372161279106, + "tokens_seen": 1401352192 + }, + { + "epoch": 4.06, + "learning_rate": 0.00029063189568706116, + "loss": 2.566, + "theoretical_loss": 3.537357384862535, + "tokens_seen": 1401417728 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002906218655967904, + "loss": 2.6829, + "theoretical_loss": 3.5373426093304237, + "tokens_seen": 1401483264 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002906118355065195, + "loss": 2.7533, + "theoretical_loss": 3.537327834682679, + "tokens_seen": 1401548800 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029060180541624876, + "loss": 2.3787, + "theoretical_loss": 3.537313060919206, + "tokens_seen": 1401614336 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029059177532597794, + "loss": 2.3038, + "theoretical_loss": 3.5372982880399104, + "tokens_seen": 1401679872 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002905817452357071, + "loss": 2.4445, + "theoretical_loss": 3.537283516044698, + "tokens_seen": 1401745408 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002905717151454363, + "loss": 2.7568, + "theoretical_loss": 3.5372687449334754, + "tokens_seen": 1401810944 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002905616850551655, + "loss": 2.6059, + "theoretical_loss": 3.5372539747061476, + "tokens_seen": 1401876480 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029055165496489466, + "loss": 2.5998, + "theoretical_loss": 3.53723920536262, + "tokens_seen": 1401942016 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002905416248746239, + "loss": 2.6002, + "theoretical_loss": 3.5372244369027994, + "tokens_seen": 1402007552 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002905315947843531, + "loss": 2.7421, + "theoretical_loss": 3.537209669326591, + "tokens_seen": 1402073088 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029052156469408226, + "loss": 2.6018, + "theoretical_loss": 3.5371949026339005, + "tokens_seen": 1402138624 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002905115346038115, + "loss": 2.7309, + "theoretical_loss": 3.5371801368246345, + "tokens_seen": 1402204160 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002905015045135406, + "loss": 2.7116, + "theoretical_loss": 3.5371653718986975, + "tokens_seen": 1402269696 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029049147442326986, + "loss": 2.7406, + "theoretical_loss": 3.537150607855997, + "tokens_seen": 1402335232 + }, + { + "epoch": 4.07, + "learning_rate": 0.000290481444332999, + "loss": 2.6909, + "theoretical_loss": 3.537135844696438, + "tokens_seen": 1402400768 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1564219, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3950443267822266, + "objective/train/theoretical_loss": 3.5371210824199264, + "objective/train/tokens_used": 1422926304, + "theoretical_loss": 3.5371210824199264, + "tokens_seen": 1402466304 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002904714142427282, + "loss": 2.5619, + "theoretical_loss": 3.5371210824199264, + "tokens_seen": 1402466304 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002904613841524574, + "loss": 2.7336, + "theoretical_loss": 3.5371063210263682, + "tokens_seen": 1402531840 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002904513540621866, + "loss": 2.7499, + "theoretical_loss": 3.5370915605156696, + "tokens_seen": 1402597376 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029044132397191576, + "loss": 2.5718, + "theoretical_loss": 3.537076800887736, + "tokens_seen": 1402662912 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029043129388164494, + "loss": 2.5744, + "theoretical_loss": 3.537062042142474, + "tokens_seen": 1402728448 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002904212637913741, + "loss": 2.651, + "theoretical_loss": 3.5370472842797893, + "tokens_seen": 1402793984 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029041123370110336, + "loss": 2.6257, + "theoretical_loss": 3.537032527299588, + "tokens_seen": 1402859520 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002904012036108325, + "loss": 2.9672, + "theoretical_loss": 3.537017771201776, + "tokens_seen": 1402925056 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002903911735205617, + "loss": 2.7034, + "theoretical_loss": 3.537003015986259, + "tokens_seen": 1402990592 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029038114343029085, + "loss": 2.8006, + "theoretical_loss": 3.536988261652943, + "tokens_seen": 1403056128 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002903711133400201, + "loss": 2.7806, + "theoretical_loss": 3.536973508201735, + "tokens_seen": 1403121664 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029036108324974926, + "loss": 2.7626, + "theoretical_loss": 3.5369587556325404, + "tokens_seen": 1403187200 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029035105315947845, + "loss": 2.884, + "theoretical_loss": 3.536944003945265, + "tokens_seen": 1403252736 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002903410230692076, + "loss": 2.6588, + "theoretical_loss": 3.5369292531398155, + "tokens_seen": 1403318272 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029033099297893686, + "loss": 2.5644, + "theoretical_loss": 3.5369145032160976, + "tokens_seen": 1403383808 + }, + { + "epoch": 4.07, + "learning_rate": 0.000290320962888666, + "loss": 2.5827, + "theoretical_loss": 3.536899754174018, + "tokens_seen": 1403449344 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002903109327983952, + "loss": 2.5899, + "theoretical_loss": 3.536885006013482, + "tokens_seen": 1403514880 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029030090270812435, + "loss": 2.7309, + "theoretical_loss": 3.536870258734396, + "tokens_seen": 1403580416 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002902908726178536, + "loss": 2.5328, + "theoretical_loss": 3.5368555123366665, + "tokens_seen": 1403645952 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029028084252758277, + "loss": 2.8273, + "theoretical_loss": 3.5368407668202, + "tokens_seen": 1403711488 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029027081243731195, + "loss": 2.8104, + "theoretical_loss": 3.5368260221849015, + "tokens_seen": 1403777024 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029026078234704113, + "loss": 2.7349, + "theoretical_loss": 3.536811278430678, + "tokens_seen": 1403842560 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002902507522567703, + "loss": 2.7155, + "theoretical_loss": 3.536796535557436, + "tokens_seen": 1403908096 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002902407221664995, + "loss": 2.5073, + "theoretical_loss": 3.536781793565081, + "tokens_seen": 1403973632 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029023069207622873, + "loss": 2.5849, + "theoretical_loss": 3.53676705245352, + "tokens_seen": 1404039168 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1565469, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9763519763946533, + "objective/train/theoretical_loss": 3.5367523122226583, + "objective/train/tokens_used": 1424564704, + "theoretical_loss": 3.5367523122226583, + "tokens_seen": 1404104704 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029022066198595785, + "loss": 2.7733, + "theoretical_loss": 3.5367523122226583, + "tokens_seen": 1404104704 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002902106318956871, + "loss": 2.7111, + "theoretical_loss": 3.5367375728724033, + "tokens_seen": 1404170240 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002902006018054162, + "loss": 2.6443, + "theoretical_loss": 3.5367228344026604, + "tokens_seen": 1404235776 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029019057171514545, + "loss": 2.7371, + "theoretical_loss": 3.5367080968133364, + "tokens_seen": 1404301312 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029018054162487463, + "loss": 2.7286, + "theoretical_loss": 3.5366933601043375, + "tokens_seen": 1404366848 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002901705115346038, + "loss": 2.522, + "theoretical_loss": 3.53667862427557, + "tokens_seen": 1404432384 + }, + { + "epoch": 4.07, + "learning_rate": 0.000290160481444333, + "loss": 2.7689, + "theoretical_loss": 3.5366638893269404, + "tokens_seen": 1404497920 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029015045135406223, + "loss": 2.6427, + "theoretical_loss": 3.5366491552583548, + "tokens_seen": 1404563456 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029014042126379136, + "loss": 2.4538, + "theoretical_loss": 3.53663442206972, + "tokens_seen": 1404628992 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002901303911735206, + "loss": 2.6588, + "theoretical_loss": 3.5366196897609417, + "tokens_seen": 1404694528 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002901203610832497, + "loss": 2.7638, + "theoretical_loss": 3.536604958331927, + "tokens_seen": 1404760064 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029011033099297896, + "loss": 2.5228, + "theoretical_loss": 3.5365902277825816, + "tokens_seen": 1404825600 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029010030090270814, + "loss": 2.5804, + "theoretical_loss": 3.5365754981128132, + "tokens_seen": 1404891136 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002900902708124373, + "loss": 2.6405, + "theoretical_loss": 3.5365607693225267, + "tokens_seen": 1404956672 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002900802407221665, + "loss": 2.4659, + "theoretical_loss": 3.5365460414116296, + "tokens_seen": 1405022208 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002900702106318957, + "loss": 2.5059, + "theoretical_loss": 3.536531314380028, + "tokens_seen": 1405087744 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029006018054162486, + "loss": 2.4888, + "theoretical_loss": 3.5365165882276286, + "tokens_seen": 1405153280 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002900501504513541, + "loss": 2.6572, + "theoretical_loss": 3.5365018629543377, + "tokens_seen": 1405218816 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002900401203610832, + "loss": 2.7108, + "theoretical_loss": 3.5364871385600622, + "tokens_seen": 1405284352 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029003009027081246, + "loss": 2.6045, + "theoretical_loss": 3.5364724150447078, + "tokens_seen": 1405349888 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002900200601805416, + "loss": 2.7128, + "theoretical_loss": 3.5364576924081823, + "tokens_seen": 1405415424 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002900100300902708, + "loss": 2.6778, + "theoretical_loss": 3.5364429706503913, + "tokens_seen": 1405480960 + }, + { + "epoch": 4.07, + "learning_rate": 0.00029, + "loss": 2.6373, + "theoretical_loss": 3.5364282497712414, + "tokens_seen": 1405546496 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002899899699097292, + "loss": 2.5146, + "theoretical_loss": 3.5364135297706394, + "tokens_seen": 1405612032 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028997993981945836, + "loss": 2.2929, + "theoretical_loss": 3.5363988106484925, + "tokens_seen": 1405677568 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1565832, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.80586838722229, + "objective/train/theoretical_loss": 3.5363840924047065, + "objective/train/tokens_used": 1426203104, + "theoretical_loss": 3.5363840924047065, + "tokens_seen": 1405743104 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002899699097291876, + "loss": 2.7417, + "theoretical_loss": 3.5363840924047065, + "tokens_seen": 1405743104 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002899598796389167, + "loss": 2.5818, + "theoretical_loss": 3.5363693750391882, + "tokens_seen": 1405808640 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028994984954864596, + "loss": 2.6419, + "theoretical_loss": 3.5363546585518444, + "tokens_seen": 1405874176 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002899398194583751, + "loss": 2.5231, + "theoretical_loss": 3.5363399429425817, + "tokens_seen": 1405939712 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002899297893681043, + "loss": 2.6878, + "theoretical_loss": 3.5363252282113073, + "tokens_seen": 1406005248 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002899197592778335, + "loss": 2.837, + "theoretical_loss": 3.536310514357927, + "tokens_seen": 1406070784 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002899097291875627, + "loss": 2.7435, + "theoretical_loss": 3.536295801382348, + "tokens_seen": 1406136320 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028989969909729187, + "loss": 2.6983, + "theoretical_loss": 3.5362810892844774, + "tokens_seen": 1406201856 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028988966900702105, + "loss": 2.5337, + "theoretical_loss": 3.5362663780642207, + "tokens_seen": 1406267392 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028987963891675023, + "loss": 2.5224, + "theoretical_loss": 3.536251667721486, + "tokens_seen": 1406332928 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028986960882647946, + "loss": 2.5878, + "theoretical_loss": 3.536236958256179, + "tokens_seen": 1406398464 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002898595787362086, + "loss": 2.6069, + "theoretical_loss": 3.5362222496682074, + "tokens_seen": 1406464000 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002898495486459378, + "loss": 2.5348, + "theoretical_loss": 3.536207541957477, + "tokens_seen": 1406529536 + }, + { + "epoch": 4.07, + "learning_rate": 0.000289839518555667, + "loss": 2.7274, + "theoretical_loss": 3.536192835123896, + "tokens_seen": 1406595072 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002898294884653962, + "loss": 2.6032, + "theoretical_loss": 3.53617812916737, + "tokens_seen": 1406660608 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028981945837512537, + "loss": 2.5445, + "theoretical_loss": 3.536163424087807, + "tokens_seen": 1406726144 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028980942828485455, + "loss": 2.7627, + "theoretical_loss": 3.5361487198851123, + "tokens_seen": 1406791680 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028979939819458373, + "loss": 2.6414, + "theoretical_loss": 3.5361340165591937, + "tokens_seen": 1406857216 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028978936810431297, + "loss": 2.8794, + "theoretical_loss": 3.536119314109958, + "tokens_seen": 1406922752 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028977933801404215, + "loss": 2.6667, + "theoretical_loss": 3.5361046125373123, + "tokens_seen": 1406988288 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028976930792377133, + "loss": 2.5148, + "theoretical_loss": 3.536089911841163, + "tokens_seen": 1407053824 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002897592778335005, + "loss": 2.489, + "theoretical_loss": 3.5360752120214167, + "tokens_seen": 1407119360 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002897492477432297, + "loss": 2.8873, + "theoretical_loss": 3.536060513077982, + "tokens_seen": 1407184896 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028973921765295893, + "loss": 2.7641, + "theoretical_loss": 3.5360458150107643, + "tokens_seen": 1407250432 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028972918756268805, + "loss": 2.5708, + "theoretical_loss": 3.536031117819671, + "tokens_seen": 1407315968 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1567098, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.710190773010254, + "objective/train/theoretical_loss": 3.536016421504609, + "objective/train/tokens_used": 1427841504, + "theoretical_loss": 3.536016421504609, + "tokens_seen": 1407381504 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002897191574724173, + "loss": 2.6155, + "theoretical_loss": 3.536016421504609, + "tokens_seen": 1407381504 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002897091273821464, + "loss": 2.5796, + "theoretical_loss": 3.5360017260654857, + "tokens_seen": 1407447040 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028969909729187565, + "loss": 2.5956, + "theoretical_loss": 3.5359870315022075, + "tokens_seen": 1407512576 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028968906720160483, + "loss": 2.8037, + "theoretical_loss": 3.535972337814682, + "tokens_seen": 1407578112 + }, + { + "epoch": 4.07, + "learning_rate": 0.000289679037111334, + "loss": 2.4828, + "theoretical_loss": 3.5359576450028163, + "tokens_seen": 1407643648 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002896690070210632, + "loss": 2.8295, + "theoretical_loss": 3.5359429530665167, + "tokens_seen": 1407709184 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028965897693079243, + "loss": 2.5858, + "theoretical_loss": 3.5359282620056907, + "tokens_seen": 1407774720 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028964894684052156, + "loss": 2.9195, + "theoretical_loss": 3.5359135718202452, + "tokens_seen": 1407840256 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002896389167502508, + "loss": 2.532, + "theoretical_loss": 3.5358988825100877, + "tokens_seen": 1407905792 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002896288866599799, + "loss": 2.8376, + "theoretical_loss": 3.535884194075125, + "tokens_seen": 1407971328 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028961885656970916, + "loss": 2.7427, + "theoretical_loss": 3.5358695065152643, + "tokens_seen": 1408036864 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028960882647943834, + "loss": 2.674, + "theoretical_loss": 3.535854819830413, + "tokens_seen": 1408102400 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002895987963891675, + "loss": 2.6753, + "theoretical_loss": 3.535840134020478, + "tokens_seen": 1408167936 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002895887662988967, + "loss": 2.6783, + "theoretical_loss": 3.5358254490853662, + "tokens_seen": 1408233472 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002895787362086259, + "loss": 2.6066, + "theoretical_loss": 3.5358107650249853, + "tokens_seen": 1408299008 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028956870611835506, + "loss": 2.7882, + "theoretical_loss": 3.535796081839242, + "tokens_seen": 1408364544 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002895586760280843, + "loss": 2.7076, + "theoretical_loss": 3.5357813995280436, + "tokens_seen": 1408430080 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002895486459378134, + "loss": 2.5654, + "theoretical_loss": 3.5357667180912973, + "tokens_seen": 1408495616 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028953861584754266, + "loss": 2.5214, + "theoretical_loss": 3.535752037528911, + "tokens_seen": 1408561152 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002895285857572718, + "loss": 2.8717, + "theoretical_loss": 3.535737357840791, + "tokens_seen": 1408626688 + }, + { + "epoch": 4.07, + "learning_rate": 0.000289518555667001, + "loss": 2.5479, + "theoretical_loss": 3.5357226790268452, + "tokens_seen": 1408692224 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002895085255767302, + "loss": 2.418, + "theoretical_loss": 3.535708001086981, + "tokens_seen": 1408757760 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002894984954864594, + "loss": 2.6083, + "theoretical_loss": 3.5356933240211053, + "tokens_seen": 1408823296 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028948846539618856, + "loss": 2.6263, + "theoretical_loss": 3.5356786478291253, + "tokens_seen": 1408888832 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002894784353059178, + "loss": 2.653, + "theoretical_loss": 3.535663972510948, + "tokens_seen": 1408954368 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1567854, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5623223781585693, + "objective/train/theoretical_loss": 3.535649298066482, + "objective/train/tokens_used": 1429479904, + "theoretical_loss": 3.535649298066482, + "tokens_seen": 1409019904 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002894684052156469, + "loss": 2.599, + "theoretical_loss": 3.535649298066482, + "tokens_seen": 1409019904 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028945837512537616, + "loss": 2.7912, + "theoretical_loss": 3.5356346244956334, + "tokens_seen": 1409085440 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002894483450351053, + "loss": 2.6228, + "theoretical_loss": 3.53561995179831, + "tokens_seen": 1409150976 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002894383149448345, + "loss": 2.5917, + "theoretical_loss": 3.53560527997442, + "tokens_seen": 1409216512 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002894282848545637, + "loss": 2.544, + "theoretical_loss": 3.535590609023869, + "tokens_seen": 1409282048 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002894182547642929, + "loss": 2.4938, + "theoretical_loss": 3.535575938946566, + "tokens_seen": 1409347584 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028940822467402207, + "loss": 2.6501, + "theoretical_loss": 3.5355612697424177, + "tokens_seen": 1409413120 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028939819458375125, + "loss": 2.5533, + "theoretical_loss": 3.5355466014113315, + "tokens_seen": 1409478656 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028938816449348043, + "loss": 2.7157, + "theoretical_loss": 3.535531933953215, + "tokens_seen": 1409544192 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028937813440320966, + "loss": 2.5321, + "theoretical_loss": 3.5355172673679762, + "tokens_seen": 1409609728 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002893681043129388, + "loss": 2.5582, + "theoretical_loss": 3.5355026016555215, + "tokens_seen": 1409675264 + }, + { + "epoch": 4.07, + "learning_rate": 0.000289358074222668, + "loss": 2.6674, + "theoretical_loss": 3.535487936815759, + "tokens_seen": 1409740800 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002893480441323972, + "loss": 2.6364, + "theoretical_loss": 3.5354732728485967, + "tokens_seen": 1409806336 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002893380140421264, + "loss": 2.3849, + "theoretical_loss": 3.535458609753941, + "tokens_seen": 1409871872 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028932798395185557, + "loss": 2.6955, + "theoretical_loss": 3.5354439475317, + "tokens_seen": 1409937408 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028931795386158475, + "loss": 2.6259, + "theoretical_loss": 3.5354292861817815, + "tokens_seen": 1410002944 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028930792377131393, + "loss": 2.6021, + "theoretical_loss": 3.5354146257040924, + "tokens_seen": 1410068480 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028929789368104317, + "loss": 2.7806, + "theoretical_loss": 3.535399966098541, + "tokens_seen": 1410134016 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002892878635907723, + "loss": 2.5702, + "theoretical_loss": 3.5353853073650345, + "tokens_seen": 1410199552 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028927783350050153, + "loss": 2.6023, + "theoretical_loss": 3.5353706495034807, + "tokens_seen": 1410265088 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028926780341023066, + "loss": 2.4183, + "theoretical_loss": 3.5353559925137867, + "tokens_seen": 1410330624 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002892577733199599, + "loss": 2.4087, + "theoretical_loss": 3.5353413363958612, + "tokens_seen": 1410396160 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002892477432296891, + "loss": 2.7805, + "theoretical_loss": 3.535326681149611, + "tokens_seen": 1410461696 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028923771313941825, + "loss": 2.7112, + "theoretical_loss": 3.5353120267749434, + "tokens_seen": 1410527232 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028922768304914744, + "loss": 2.6916, + "theoretical_loss": 3.5352973732717667, + "tokens_seen": 1410592768 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1569260, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.391869068145752, + "objective/train/theoretical_loss": 3.5352827206399886, + "objective/train/tokens_used": 1431118304, + "theoretical_loss": 3.5352827206399886, + "tokens_seen": 1410658304 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002892176529588766, + "loss": 2.3331, + "theoretical_loss": 3.5352827206399886, + "tokens_seen": 1410658304 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002892076228686058, + "loss": 2.6157, + "theoretical_loss": 3.5352680688795166, + "tokens_seen": 1410723840 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028919759277833503, + "loss": 2.5459, + "theoretical_loss": 3.535253417990259, + "tokens_seen": 1410789376 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028918756268806416, + "loss": 2.6511, + "theoretical_loss": 3.5352387679721224, + "tokens_seen": 1410854912 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002891775325977934, + "loss": 2.5784, + "theoretical_loss": 3.535224118825015, + "tokens_seen": 1410920448 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002891675025075226, + "loss": 2.603, + "theoretical_loss": 3.5352094705488453, + "tokens_seen": 1410985984 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028915747241725176, + "loss": 2.7896, + "theoretical_loss": 3.53519482314352, + "tokens_seen": 1411051520 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028914744232698094, + "loss": 2.6002, + "theoretical_loss": 3.5351801766089483, + "tokens_seen": 1411117056 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002891374122367101, + "loss": 2.6282, + "theoretical_loss": 3.5351655309450365, + "tokens_seen": 1411182592 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002891273821464393, + "loss": 2.3283, + "theoretical_loss": 3.535150886151693, + "tokens_seen": 1411248128 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028911735205616854, + "loss": 2.6072, + "theoretical_loss": 3.5351362422288255, + "tokens_seen": 1411313664 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028910732196589766, + "loss": 2.5544, + "theoretical_loss": 3.5351215991763416, + "tokens_seen": 1411379200 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002890972918756269, + "loss": 2.6091, + "theoretical_loss": 3.5351069569941505, + "tokens_seen": 1411444736 + }, + { + "epoch": 4.07, + "learning_rate": 0.000289087261785356, + "loss": 2.6773, + "theoretical_loss": 3.5350923156821583, + "tokens_seen": 1411510272 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028907723169508526, + "loss": 2.5854, + "theoretical_loss": 3.5350776752402737, + "tokens_seen": 1411575808 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028906720160481444, + "loss": 2.7125, + "theoretical_loss": 3.535063035668405, + "tokens_seen": 1411641344 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002890571715145436, + "loss": 2.5343, + "theoretical_loss": 3.5350483969664594, + "tokens_seen": 1411706880 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002890471414242728, + "loss": 2.5468, + "theoretical_loss": 3.5350337591343455, + "tokens_seen": 1411772416 + }, + { + "epoch": 4.07, + "learning_rate": 0.000289037111334002, + "loss": 2.6072, + "theoretical_loss": 3.5350191221719705, + "tokens_seen": 1411837952 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002890270812437312, + "loss": 2.7006, + "theoretical_loss": 3.5350044860792424, + "tokens_seen": 1411903488 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002890170511534604, + "loss": 2.5986, + "theoretical_loss": 3.53498985085607, + "tokens_seen": 1411969024 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002890070210631896, + "loss": 2.6427, + "theoretical_loss": 3.5349752165023602, + "tokens_seen": 1412034560 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028899699097291876, + "loss": 2.6691, + "theoretical_loss": 3.534960583018022, + "tokens_seen": 1412100096 + }, + { + "epoch": 4.07, + "learning_rate": 0.000288986960882648, + "loss": 2.7271, + "theoretical_loss": 3.534945950402963, + "tokens_seen": 1412165632 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002889769307923771, + "loss": 2.7876, + "theoretical_loss": 3.534931318657091, + "tokens_seen": 1412231168 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1569919, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.615917921066284, + "objective/train/theoretical_loss": 3.5349166877803144, + "objective/train/tokens_used": 1432756704, + "theoretical_loss": 3.5349166877803144, + "tokens_seen": 1412296704 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028896690070210636, + "loss": 2.5592, + "theoretical_loss": 3.5349166877803144, + "tokens_seen": 1412296704 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002889568706118355, + "loss": 2.5736, + "theoretical_loss": 3.534902057772541, + "tokens_seen": 1412362240 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002889468405215647, + "loss": 2.8109, + "theoretical_loss": 3.5348874286336787, + "tokens_seen": 1412427776 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002889368104312939, + "loss": 2.6942, + "theoretical_loss": 3.534872800363636, + "tokens_seen": 1412493312 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002889267803410231, + "loss": 2.7169, + "theoretical_loss": 3.5348581729623207, + "tokens_seen": 1412558848 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028891675025075227, + "loss": 2.6482, + "theoretical_loss": 3.534843546429641, + "tokens_seen": 1412624384 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028890672016048145, + "loss": 2.917, + "theoretical_loss": 3.534828920765505, + "tokens_seen": 1412689920 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028889669007021063, + "loss": 2.4384, + "theoretical_loss": 3.534814295969821, + "tokens_seen": 1412755456 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028888665997993986, + "loss": 2.8092, + "theoretical_loss": 3.5347996720424972, + "tokens_seen": 1412820992 + }, + { + "epoch": 4.07, + "learning_rate": 0.000288876629889669, + "loss": 2.6139, + "theoretical_loss": 3.5347850489834416, + "tokens_seen": 1412886528 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028886659979939823, + "loss": 2.747, + "theoretical_loss": 3.534770426792562, + "tokens_seen": 1412952064 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002888565697091274, + "loss": 2.7182, + "theoretical_loss": 3.534755805469767, + "tokens_seen": 1413017600 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002888465396188566, + "loss": 2.7166, + "theoretical_loss": 3.534741185014965, + "tokens_seen": 1413083136 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028883650952858577, + "loss": 2.515, + "theoretical_loss": 3.5347265654280635, + "tokens_seen": 1413148672 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028882647943831495, + "loss": 2.7623, + "theoretical_loss": 3.534711946708972, + "tokens_seen": 1413214208 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028881644934804413, + "loss": 2.6792, + "theoretical_loss": 3.5346973288575976, + "tokens_seen": 1413279744 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028880641925777337, + "loss": 2.7728, + "theoretical_loss": 3.534682711873849, + "tokens_seen": 1413345280 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002887963891675025, + "loss": 2.6048, + "theoretical_loss": 3.5346680957576337, + "tokens_seen": 1413410816 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028878635907723173, + "loss": 2.7749, + "theoretical_loss": 3.5346534805088616, + "tokens_seen": 1413476352 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028877632898696086, + "loss": 2.5928, + "theoretical_loss": 3.5346388661274393, + "tokens_seen": 1413541888 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002887662988966901, + "loss": 2.4303, + "theoretical_loss": 3.5346242526132765, + "tokens_seen": 1413607424 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002887562688064193, + "loss": 2.727, + "theoretical_loss": 3.534609639966281, + "tokens_seen": 1413672960 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028874623871614845, + "loss": 2.5918, + "theoretical_loss": 3.5345950281863607, + "tokens_seen": 1413738496 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028873620862587764, + "loss": 2.6209, + "theoretical_loss": 3.5345804172734248, + "tokens_seen": 1413804032 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002887261785356068, + "loss": 2.5483, + "theoretical_loss": 3.5345658072273807, + "tokens_seen": 1413869568 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1570961, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.574770927429199, + "objective/train/theoretical_loss": 3.5345511980481374, + "objective/train/tokens_used": 1434395104, + "theoretical_loss": 3.5345511980481374, + "tokens_seen": 1413935104 + }, + { + "epoch": 4.07, + "learning_rate": 0.000288716148445336, + "loss": 2.646, + "theoretical_loss": 3.5345511980481374, + "tokens_seen": 1413935104 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028870611835506523, + "loss": 2.4373, + "theoretical_loss": 3.5345365897356036, + "tokens_seen": 1414000640 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028869608826479436, + "loss": 2.6903, + "theoretical_loss": 3.5345219822896867, + "tokens_seen": 1414066176 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002886860581745236, + "loss": 2.2652, + "theoretical_loss": 3.5345073757102963, + "tokens_seen": 1414131712 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002886760280842528, + "loss": 2.5962, + "theoretical_loss": 3.53449276999734, + "tokens_seen": 1414197248 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028866599799398196, + "loss": 2.4221, + "theoretical_loss": 3.5344781651507264, + "tokens_seen": 1414262784 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028865596790371114, + "loss": 2.4368, + "theoretical_loss": 3.5344635611703645, + "tokens_seen": 1414328320 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002886459378134403, + "loss": 2.5616, + "theoretical_loss": 3.5344489580561627, + "tokens_seen": 1414393856 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002886359077231695, + "loss": 2.5009, + "theoretical_loss": 3.5344343558080284, + "tokens_seen": 1414459392 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028862587763289874, + "loss": 2.7095, + "theoretical_loss": 3.534419754425871, + "tokens_seen": 1414524928 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028861584754262786, + "loss": 2.5479, + "theoretical_loss": 3.5344051539095993, + "tokens_seen": 1414590464 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002886058174523571, + "loss": 2.8498, + "theoretical_loss": 3.5343905542591214, + "tokens_seen": 1414656000 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002885957873620862, + "loss": 2.5168, + "theoretical_loss": 3.5343759554743457, + "tokens_seen": 1414721536 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028858575727181546, + "loss": 2.5852, + "theoretical_loss": 3.5343613575551815, + "tokens_seen": 1414787072 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028857572718154464, + "loss": 2.692, + "theoretical_loss": 3.5343467605015366, + "tokens_seen": 1414852608 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002885656970912738, + "loss": 2.4516, + "theoretical_loss": 3.53433216431332, + "tokens_seen": 1414918144 + }, + { + "epoch": 4.07, + "learning_rate": 0.000288555667001003, + "loss": 2.6683, + "theoretical_loss": 3.53431756899044, + "tokens_seen": 1414983680 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002885456369107322, + "loss": 2.6535, + "theoretical_loss": 3.534302974532805, + "tokens_seen": 1415049216 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028853560682046137, + "loss": 2.8792, + "theoretical_loss": 3.5342883809403247, + "tokens_seen": 1415114752 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002885255767301906, + "loss": 2.798, + "theoretical_loss": 3.534273788212907, + "tokens_seen": 1415180288 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028851554663991973, + "loss": 2.4001, + "theoretical_loss": 3.5342591963504604, + "tokens_seen": 1415245824 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028850551654964896, + "loss": 2.5954, + "theoretical_loss": 3.534244605352894, + "tokens_seen": 1415311360 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028849548645937815, + "loss": 2.586, + "theoretical_loss": 3.5342300152201167, + "tokens_seen": 1415376896 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002884854563691073, + "loss": 2.7573, + "theoretical_loss": 3.534215425952036, + "tokens_seen": 1415442432 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002884754262788365, + "loss": 2.7199, + "theoretical_loss": 3.5342008375485623, + "tokens_seen": 1415507968 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1571590, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.518846035003662, + "objective/train/theoretical_loss": 3.534186250009603, + "objective/train/tokens_used": 1436033504, + "theoretical_loss": 3.534186250009603, + "tokens_seen": 1415573504 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002884653961885657, + "loss": 2.686, + "theoretical_loss": 3.534186250009603, + "tokens_seen": 1415573504 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028845536609829487, + "loss": 2.5762, + "theoretical_loss": 3.5341716633350675, + "tokens_seen": 1415639040 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002884453360080241, + "loss": 2.6943, + "theoretical_loss": 3.5341570775248643, + "tokens_seen": 1415704576 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028843530591775323, + "loss": 2.5015, + "theoretical_loss": 3.5341424925789022, + "tokens_seen": 1415770112 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028842527582748247, + "loss": 2.6678, + "theoretical_loss": 3.5341279084970902, + "tokens_seen": 1415835648 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002884152457372116, + "loss": 2.7635, + "theoretical_loss": 3.5341133252793373, + "tokens_seen": 1415901184 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028840521564694083, + "loss": 2.7631, + "theoretical_loss": 3.5340987429255515, + "tokens_seen": 1415966720 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028839518555667, + "loss": 2.6395, + "theoretical_loss": 3.534084161435642, + "tokens_seen": 1416032256 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002883851554663992, + "loss": 2.5964, + "theoretical_loss": 3.5340695808095184, + "tokens_seen": 1416097792 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002883751253761284, + "loss": 2.6531, + "theoretical_loss": 3.5340550010470886, + "tokens_seen": 1416163328 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002883650952858576, + "loss": 2.6817, + "theoretical_loss": 3.5340404221482618, + "tokens_seen": 1416228864 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028835506519558674, + "loss": 2.4982, + "theoretical_loss": 3.534025844112947, + "tokens_seen": 1416294400 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028834503510531597, + "loss": 2.6608, + "theoretical_loss": 3.534011266941053, + "tokens_seen": 1416359936 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002883350050150451, + "loss": 2.5268, + "theoretical_loss": 3.5339966906324882, + "tokens_seen": 1416425472 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028832497492477433, + "loss": 2.6619, + "theoretical_loss": 3.533982115187162, + "tokens_seen": 1416491008 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002883149448345035, + "loss": 2.6603, + "theoretical_loss": 3.5339675406049844, + "tokens_seen": 1416556544 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002883049147442327, + "loss": 2.6605, + "theoretical_loss": 3.5339529668858622, + "tokens_seen": 1416622080 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002882948846539619, + "loss": 2.7202, + "theoretical_loss": 3.533938394029706, + "tokens_seen": 1416687616 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028828485456369106, + "loss": 2.4932, + "theoretical_loss": 3.5339238220364244, + "tokens_seen": 1416753152 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002882748244734203, + "loss": 2.659, + "theoretical_loss": 3.533909250905926, + "tokens_seen": 1416818688 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002882647943831495, + "loss": 2.6021, + "theoretical_loss": 3.53389468063812, + "tokens_seen": 1416884224 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028825476429287866, + "loss": 2.7558, + "theoretical_loss": 3.533880111232916, + "tokens_seen": 1416949760 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028824473420260784, + "loss": 2.7942, + "theoretical_loss": 3.533865542690222, + "tokens_seen": 1417015296 + }, + { + "epoch": 4.07, + "learning_rate": 0.000288234704112337, + "loss": 2.7587, + "theoretical_loss": 3.5338509750099485, + "tokens_seen": 1417080832 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002882246740220662, + "loss": 2.6476, + "theoretical_loss": 3.533836408192003, + "tokens_seen": 1417146368 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1572254, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7383265495300293, + "objective/train/theoretical_loss": 3.533821842236295, + "objective/train/tokens_used": 1437671904, + "theoretical_loss": 3.533821842236295, + "tokens_seen": 1417211904 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028821464393179543, + "loss": 2.904, + "theoretical_loss": 3.533821842236295, + "tokens_seen": 1417211904 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028820461384152456, + "loss": 2.7954, + "theoretical_loss": 3.533807277142734, + "tokens_seen": 1417277440 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002881945837512538, + "loss": 2.466, + "theoretical_loss": 3.533792712911229, + "tokens_seen": 1417342976 + }, + { + "epoch": 4.07, + "learning_rate": 0.000288184553660983, + "loss": 2.723, + "theoretical_loss": 3.533778149541689, + "tokens_seen": 1417408512 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028817452357071216, + "loss": 2.7506, + "theoretical_loss": 3.5337635870340236, + "tokens_seen": 1417474048 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028816449348044134, + "loss": 2.5223, + "theoretical_loss": 3.5337490253881416, + "tokens_seen": 1417539584 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002881544633901705, + "loss": 2.5958, + "theoretical_loss": 3.5337344646039517, + "tokens_seen": 1417605120 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002881444332998997, + "loss": 2.6868, + "theoretical_loss": 3.533719904681363, + "tokens_seen": 1417670656 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028813440320962894, + "loss": 2.5505, + "theoretical_loss": 3.5337053456202865, + "tokens_seen": 1417736192 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028812437311935806, + "loss": 2.5364, + "theoretical_loss": 3.533690787420629, + "tokens_seen": 1417801728 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002881143430290873, + "loss": 2.5286, + "theoretical_loss": 3.5336762300823015, + "tokens_seen": 1417867264 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002881043129388164, + "loss": 2.84, + "theoretical_loss": 3.533661673605212, + "tokens_seen": 1417932800 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028809428284854566, + "loss": 2.6731, + "theoretical_loss": 3.5336471179892706, + "tokens_seen": 1417998336 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028808425275827484, + "loss": 2.6108, + "theoretical_loss": 3.533632563234386, + "tokens_seen": 1418063872 + }, + { + "epoch": 4.07, + "learning_rate": 0.000288074222668004, + "loss": 2.8358, + "theoretical_loss": 3.5336180093404677, + "tokens_seen": 1418129408 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002880641925777332, + "loss": 2.8047, + "theoretical_loss": 3.533603456307425, + "tokens_seen": 1418194944 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002880541624874624, + "loss": 2.6638, + "theoretical_loss": 3.5335889041351676, + "tokens_seen": 1418260480 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028804413239719157, + "loss": 2.7179, + "theoretical_loss": 3.533574352823604, + "tokens_seen": 1418326016 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002880341023069208, + "loss": 2.6641, + "theoretical_loss": 3.533559802372644, + "tokens_seen": 1418391552 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028802407221664993, + "loss": 2.6614, + "theoretical_loss": 3.533545252782197, + "tokens_seen": 1418457088 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028801404212637916, + "loss": 2.8433, + "theoretical_loss": 3.5335307040521724, + "tokens_seen": 1418522624 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028800401203610835, + "loss": 2.282, + "theoretical_loss": 3.533516156182479, + "tokens_seen": 1418588160 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002879939819458375, + "loss": 2.5477, + "theoretical_loss": 3.533501609173027, + "tokens_seen": 1418653696 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002879839518555667, + "loss": 2.6269, + "theoretical_loss": 3.533487063023725, + "tokens_seen": 1418719232 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002879739217652959, + "loss": 2.5457, + "theoretical_loss": 3.5334725177344835, + "tokens_seen": 1418784768 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1573335, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.203737497329712, + "objective/train/theoretical_loss": 3.53345797330521, + "objective/train/tokens_used": 1439310304, + "theoretical_loss": 3.53345797330521, + "tokens_seen": 1418850304 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028796389167502507, + "loss": 2.6313, + "theoretical_loss": 3.53345797330521, + "tokens_seen": 1418850304 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002879538615847543, + "loss": 2.4921, + "theoretical_loss": 3.533443429735816, + "tokens_seen": 1418915840 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028794383149448343, + "loss": 2.6774, + "theoretical_loss": 3.53342888702621, + "tokens_seen": 1418981376 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028793380140421267, + "loss": 2.7072, + "theoretical_loss": 3.5334143451763014, + "tokens_seen": 1419046912 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002879237713139418, + "loss": 2.6758, + "theoretical_loss": 3.533399804186, + "tokens_seen": 1419112448 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028791374122367103, + "loss": 2.6, + "theoretical_loss": 3.5333852640552155, + "tokens_seen": 1419177984 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002879037111334002, + "loss": 2.538, + "theoretical_loss": 3.5333707247838566, + "tokens_seen": 1419243520 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002878936810431294, + "loss": 2.5949, + "theoretical_loss": 3.533356186371833, + "tokens_seen": 1419309056 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002878836509528586, + "loss": 2.7255, + "theoretical_loss": 3.5333416488190554, + "tokens_seen": 1419374592 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002878736208625878, + "loss": 2.6536, + "theoretical_loss": 3.5333271121254315, + "tokens_seen": 1419440128 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028786359077231694, + "loss": 2.7465, + "theoretical_loss": 3.5333125762908724, + "tokens_seen": 1419505664 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028785356068204617, + "loss": 2.581, + "theoretical_loss": 3.5332980413152866, + "tokens_seen": 1419571200 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002878435305917753, + "loss": 2.615, + "theoretical_loss": 3.5332835071985844, + "tokens_seen": 1419636736 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028783350050150453, + "loss": 2.7341, + "theoretical_loss": 3.5332689739406753, + "tokens_seen": 1419702272 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002878234704112337, + "loss": 2.5138, + "theoretical_loss": 3.5332544415414686, + "tokens_seen": 1419767808 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002878134403209629, + "loss": 2.6365, + "theoretical_loss": 3.5332399100008747, + "tokens_seen": 1419833344 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002878034102306921, + "loss": 2.8449, + "theoretical_loss": 3.533225379318802, + "tokens_seen": 1419898880 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028779338014042126, + "loss": 2.6908, + "theoretical_loss": 3.533210849495161, + "tokens_seen": 1419964416 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028778335005015044, + "loss": 2.3565, + "theoretical_loss": 3.5331963205298615, + "tokens_seen": 1420029952 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002877733199598797, + "loss": 2.7702, + "theoretical_loss": 3.5331817924228126, + "tokens_seen": 1420095488 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002877632898696088, + "loss": 2.5993, + "theoretical_loss": 3.5331672651739243, + "tokens_seen": 1420161024 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028775325977933804, + "loss": 2.6276, + "theoretical_loss": 3.533152738783106, + "tokens_seen": 1420226560 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028774322968906716, + "loss": 2.9462, + "theoretical_loss": 3.5331382132502682, + "tokens_seen": 1420292096 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002877331995987964, + "loss": 2.6313, + "theoretical_loss": 3.5331236885753197, + "tokens_seen": 1420357632 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002877231695085256, + "loss": 2.7853, + "theoretical_loss": 3.533109164758171, + "tokens_seen": 1420423168 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1574634, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9888744354248047, + "objective/train/theoretical_loss": 3.5330946417987312, + "objective/train/tokens_used": 1440948704, + "theoretical_loss": 3.5330946417987312, + "tokens_seen": 1420488704 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028771313941825476, + "loss": 2.6085, + "theoretical_loss": 3.5330946417987312, + "tokens_seen": 1420488704 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028770310932798394, + "loss": 2.5191, + "theoretical_loss": 3.533080119696911, + "tokens_seen": 1420554240 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002876930792377132, + "loss": 2.7856, + "theoretical_loss": 3.5330655984526196, + "tokens_seen": 1420619776 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002876830491474423, + "loss": 2.586, + "theoretical_loss": 3.5330510780657662, + "tokens_seen": 1420685312 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028767301905717154, + "loss": 2.7563, + "theoretical_loss": 3.5330365585362618, + "tokens_seen": 1420750848 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028766298896690067, + "loss": 2.8421, + "theoretical_loss": 3.5330220398640155, + "tokens_seen": 1420816384 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002876529588766299, + "loss": 2.6803, + "theoretical_loss": 3.5330075220489374, + "tokens_seen": 1420881920 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002876429287863591, + "loss": 2.5614, + "theoretical_loss": 3.5329930050909373, + "tokens_seen": 1420947456 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028763289869608826, + "loss": 2.654, + "theoretical_loss": 3.532978488989925, + "tokens_seen": 1421012992 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028762286860581745, + "loss": 2.5684, + "theoretical_loss": 3.5329639737458107, + "tokens_seen": 1421078528 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002876128385155466, + "loss": 2.6544, + "theoretical_loss": 3.5329494593585036, + "tokens_seen": 1421144064 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002876028084252758, + "loss": 2.7579, + "theoretical_loss": 3.5329349458279147, + "tokens_seen": 1421209600 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028759277833500504, + "loss": 2.7276, + "theoretical_loss": 3.5329204331539534, + "tokens_seen": 1421275136 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028758274824473417, + "loss": 2.6584, + "theoretical_loss": 3.5329059213365293, + "tokens_seen": 1421340672 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002875727181544634, + "loss": 2.7329, + "theoretical_loss": 3.5328914103755524, + "tokens_seen": 1421406208 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028756268806419253, + "loss": 2.8148, + "theoretical_loss": 3.5328769002709333, + "tokens_seen": 1421471744 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028755265797392177, + "loss": 2.7957, + "theoretical_loss": 3.532862391022581, + "tokens_seen": 1421537280 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028754262788365095, + "loss": 2.9081, + "theoretical_loss": 3.5328478826304064, + "tokens_seen": 1421602816 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028753259779338013, + "loss": 2.7681, + "theoretical_loss": 3.5328333750943193, + "tokens_seen": 1421668352 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028752256770310936, + "loss": 2.7856, + "theoretical_loss": 3.532818868414229, + "tokens_seen": 1421733888 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028751253761283855, + "loss": 2.744, + "theoretical_loss": 3.5328043625900474, + "tokens_seen": 1421799424 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002875025075225677, + "loss": 2.7367, + "theoretical_loss": 3.5327898576216823, + "tokens_seen": 1421864960 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002874924774322969, + "loss": 2.4577, + "theoretical_loss": 3.532775353509045, + "tokens_seen": 1421930496 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002874824473420261, + "loss": 2.7095, + "theoretical_loss": 3.5327608502520453, + "tokens_seen": 1421996032 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028747241725175527, + "loss": 2.8389, + "theoretical_loss": 3.5327463478505936, + "tokens_seen": 1422061568 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1575381, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.637848377227783, + "objective/train/theoretical_loss": 3.5327318463046, + "objective/train/tokens_used": 1442587104, + "theoretical_loss": 3.5327318463046, + "tokens_seen": 1422127104 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002874623871614845, + "loss": 2.8563, + "theoretical_loss": 3.5327318463046, + "tokens_seen": 1422127104 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028745235707121363, + "loss": 2.8276, + "theoretical_loss": 3.5327173456139733, + "tokens_seen": 1422192640 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028744232698094287, + "loss": 2.7305, + "theoretical_loss": 3.5327028457786254, + "tokens_seen": 1422258176 + }, + { + "epoch": 4.07, + "learning_rate": 0.000287432296890672, + "loss": 2.5748, + "theoretical_loss": 3.5326883467984658, + "tokens_seen": 1422323712 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028742226680040123, + "loss": 2.6212, + "theoretical_loss": 3.5326738486734044, + "tokens_seen": 1422389248 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002874122367101304, + "loss": 2.5636, + "theoretical_loss": 3.5326593514033515, + "tokens_seen": 1422454784 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002874022066198596, + "loss": 2.6832, + "theoretical_loss": 3.532644854988218, + "tokens_seen": 1422520320 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002873921765295888, + "loss": 2.6347, + "theoretical_loss": 3.532630359427913, + "tokens_seen": 1422585856 + }, + { + "epoch": 4.07, + "learning_rate": 0.000287382146439318, + "loss": 2.5502, + "theoretical_loss": 3.532615864722347, + "tokens_seen": 1422651392 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028737211634904714, + "loss": 2.4627, + "theoretical_loss": 3.532601370871431, + "tokens_seen": 1422716928 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028736208625877637, + "loss": 2.6478, + "theoretical_loss": 3.532586877875074, + "tokens_seen": 1422782464 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002873520561685055, + "loss": 2.6382, + "theoretical_loss": 3.532572385733187, + "tokens_seen": 1422848000 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028734202607823473, + "loss": 2.736, + "theoretical_loss": 3.532557894445681, + "tokens_seen": 1422913536 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002873319959879639, + "loss": 2.6704, + "theoretical_loss": 3.532543404012465, + "tokens_seen": 1422979072 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002873219658976931, + "loss": 2.7003, + "theoretical_loss": 3.532528914433449, + "tokens_seen": 1423044608 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002873119358074223, + "loss": 2.6309, + "theoretical_loss": 3.532514425708545, + "tokens_seen": 1423110144 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028730190571715146, + "loss": 2.5853, + "theoretical_loss": 3.5324999378376623, + "tokens_seen": 1423175680 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028729187562688064, + "loss": 2.7507, + "theoretical_loss": 3.532485450820711, + "tokens_seen": 1423241216 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002872818455366099, + "loss": 2.6948, + "theoretical_loss": 3.532470964657602, + "tokens_seen": 1423306752 + }, + { + "epoch": 4.07, + "learning_rate": 0.000287271815446339, + "loss": 2.5424, + "theoretical_loss": 3.5324564793482454, + "tokens_seen": 1423372288 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028726178535606824, + "loss": 2.5131, + "theoretical_loss": 3.5324419948925514, + "tokens_seen": 1423437824 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028725175526579736, + "loss": 2.636, + "theoretical_loss": 3.5324275112904306, + "tokens_seen": 1423503360 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002872417251755266, + "loss": 2.7088, + "theoretical_loss": 3.532413028541794, + "tokens_seen": 1423568896 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002872316950852558, + "loss": 2.6311, + "theoretical_loss": 3.532398546646551, + "tokens_seen": 1423634432 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028722166499498496, + "loss": 2.9132, + "theoretical_loss": 3.532384065604613, + "tokens_seen": 1423699968 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1576738, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3536036014556885, + "objective/train/theoretical_loss": 3.532369585415889, + "objective/train/tokens_used": 1444225504, + "theoretical_loss": 3.532369585415889, + "tokens_seen": 1423765504 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028721163490471414, + "loss": 2.5959, + "theoretical_loss": 3.532369585415889, + "tokens_seen": 1423765504 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002872016048144434, + "loss": 2.4618, + "theoretical_loss": 3.532355106080291, + "tokens_seen": 1423831040 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002871915747241725, + "loss": 2.8228, + "theoretical_loss": 3.532340627597729, + "tokens_seen": 1423896576 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028718154463390174, + "loss": 2.927, + "theoretical_loss": 3.532326149968113, + "tokens_seen": 1423962112 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028717151454363087, + "loss": 2.5715, + "theoretical_loss": 3.5323116731913538, + "tokens_seen": 1424027648 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002871614844533601, + "loss": 2.7448, + "theoretical_loss": 3.5322971972673622, + "tokens_seen": 1424093184 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002871514543630893, + "loss": 2.6791, + "theoretical_loss": 3.532282722196048, + "tokens_seen": 1424158720 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028714142427281846, + "loss": 2.5363, + "theoretical_loss": 3.5322682479773224, + "tokens_seen": 1424224256 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028713139418254765, + "loss": 2.5642, + "theoretical_loss": 3.532253774611096, + "tokens_seen": 1424289792 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002871213640922768, + "loss": 2.6663, + "theoretical_loss": 3.5322393020972793, + "tokens_seen": 1424355328 + }, + { + "epoch": 4.07, + "learning_rate": 0.000287111334002006, + "loss": 2.8504, + "theoretical_loss": 3.5322248304357826, + "tokens_seen": 1424420864 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028710130391173524, + "loss": 2.8866, + "theoretical_loss": 3.532210359626516, + "tokens_seen": 1424486400 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028709127382146437, + "loss": 2.8101, + "theoretical_loss": 3.5321958896693917, + "tokens_seen": 1424551936 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002870812437311936, + "loss": 2.6613, + "theoretical_loss": 3.532181420564319, + "tokens_seen": 1424617472 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028707121364092273, + "loss": 2.6923, + "theoretical_loss": 3.5321669523112087, + "tokens_seen": 1424683008 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028706118355065197, + "loss": 2.7143, + "theoretical_loss": 3.5321524849099717, + "tokens_seen": 1424748544 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028705115346038115, + "loss": 2.7368, + "theoretical_loss": 3.532138018360518, + "tokens_seen": 1424814080 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028704112337011033, + "loss": 2.709, + "theoretical_loss": 3.53212355266276, + "tokens_seen": 1424879616 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002870310932798395, + "loss": 2.624, + "theoretical_loss": 3.5321090878166066, + "tokens_seen": 1424945152 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028702106318956875, + "loss": 2.7142, + "theoretical_loss": 3.532094623821969, + "tokens_seen": 1425010688 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002870110330992979, + "loss": 2.9406, + "theoretical_loss": 3.5320801606787584, + "tokens_seen": 1425076224 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002870010030090271, + "loss": 2.578, + "theoretical_loss": 3.5320656983868854, + "tokens_seen": 1425141760 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028699097291875624, + "loss": 2.6162, + "theoretical_loss": 3.53205123694626, + "tokens_seen": 1425207296 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028698094282848547, + "loss": 2.6059, + "theoretical_loss": 3.5320367763567937, + "tokens_seen": 1425272832 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028697091273821465, + "loss": 2.5249, + "theoretical_loss": 3.532022316618397, + "tokens_seen": 1425338368 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1577330, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.346597671508789, + "objective/train/theoretical_loss": 3.5320078577309806, + "objective/train/tokens_used": 1445863904, + "theoretical_loss": 3.5320078577309806, + "tokens_seen": 1425403904 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028696088264794383, + "loss": 2.605, + "theoretical_loss": 3.5320078577309806, + "tokens_seen": 1425403904 + }, + { + "epoch": 4.07, + "learning_rate": 0.000286950852557673, + "loss": 2.644, + "theoretical_loss": 3.531993399694456, + "tokens_seen": 1425469440 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002869408224674022, + "loss": 2.7256, + "theoretical_loss": 3.531978942508733, + "tokens_seen": 1425534976 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002869307923771314, + "loss": 2.6636, + "theoretical_loss": 3.531964486173723, + "tokens_seen": 1425600512 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002869207622868606, + "loss": 2.9186, + "theoretical_loss": 3.5319500306893366, + "tokens_seen": 1425666048 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028691073219658974, + "loss": 2.8108, + "theoretical_loss": 3.531935576055485, + "tokens_seen": 1425731584 + }, + { + "epoch": 4.07, + "learning_rate": 0.000286900702106319, + "loss": 2.8785, + "theoretical_loss": 3.5319211222720788, + "tokens_seen": 1425797120 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002868906720160481, + "loss": 2.8474, + "theoretical_loss": 3.531906669339029, + "tokens_seen": 1425862656 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028688064192577734, + "loss": 2.6304, + "theoretical_loss": 3.531892217256246, + "tokens_seen": 1425928192 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002868706118355065, + "loss": 2.753, + "theoretical_loss": 3.531877766023641, + "tokens_seen": 1425993728 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002868605817452357, + "loss": 2.8028, + "theoretical_loss": 3.531863315641125, + "tokens_seen": 1426059264 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002868505516549649, + "loss": 2.5905, + "theoretical_loss": 3.53184886610861, + "tokens_seen": 1426124800 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002868405215646941, + "loss": 3.0052, + "theoretical_loss": 3.531834417426005, + "tokens_seen": 1426190336 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028683049147442324, + "loss": 2.3945, + "theoretical_loss": 3.5318199695932218, + "tokens_seen": 1426255872 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002868204613841525, + "loss": 2.5216, + "theoretical_loss": 3.5318055226101714, + "tokens_seen": 1426321408 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002868104312938816, + "loss": 2.8529, + "theoretical_loss": 3.5317910764767655, + "tokens_seen": 1426386944 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028680040120361084, + "loss": 2.6891, + "theoretical_loss": 3.5317766311929133, + "tokens_seen": 1426452480 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028679037111334, + "loss": 2.5532, + "theoretical_loss": 3.5317621867585274, + "tokens_seen": 1426518016 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002867803410230692, + "loss": 2.6447, + "theoretical_loss": 3.5317477431735185, + "tokens_seen": 1426583552 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028677031093279844, + "loss": 2.5323, + "theoretical_loss": 3.5317333004377973, + "tokens_seen": 1426649088 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028676028084252756, + "loss": 2.6454, + "theoretical_loss": 3.531718858551275, + "tokens_seen": 1426714624 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002867502507522568, + "loss": 2.8039, + "theoretical_loss": 3.531704417513863, + "tokens_seen": 1426780160 + }, + { + "epoch": 4.07, + "learning_rate": 0.000286740220661986, + "loss": 2.3451, + "theoretical_loss": 3.531689977325472, + "tokens_seen": 1426845696 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028673019057171516, + "loss": 2.9817, + "theoretical_loss": 3.531675537986013, + "tokens_seen": 1426911232 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028672016048144434, + "loss": 2.749, + "theoretical_loss": 3.5316610994953974, + "tokens_seen": 1426976768 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1578940, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.657034158706665, + "objective/train/theoretical_loss": 3.531646661853536, + "objective/train/tokens_used": 1447502304, + "theoretical_loss": 3.531646661853536, + "tokens_seen": 1427042304 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002867101303911736, + "loss": 2.8357, + "theoretical_loss": 3.531646661853536, + "tokens_seen": 1427042304 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002867001003009027, + "loss": 2.7306, + "theoretical_loss": 3.5316322250603402, + "tokens_seen": 1427107840 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028669007021063194, + "loss": 2.5669, + "theoretical_loss": 3.531617789115721, + "tokens_seen": 1427173376 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028668004012036107, + "loss": 2.8655, + "theoretical_loss": 3.5316033540195892, + "tokens_seen": 1427238912 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002866700100300903, + "loss": 2.6255, + "theoretical_loss": 3.531588919771857, + "tokens_seen": 1427304448 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002866599799398195, + "loss": 2.7666, + "theoretical_loss": 3.5315744863724348, + "tokens_seen": 1427369984 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028664994984954866, + "loss": 2.7119, + "theoretical_loss": 3.531560053821234, + "tokens_seen": 1427435520 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028663991975927785, + "loss": 2.5825, + "theoretical_loss": 3.531545622118166, + "tokens_seen": 1427501056 + }, + { + "epoch": 4.07, + "learning_rate": 0.000286629889669007, + "loss": 2.3998, + "theoretical_loss": 3.5315311912631415, + "tokens_seen": 1427566592 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002866198595787362, + "loss": 2.5511, + "theoretical_loss": 3.531516761256072, + "tokens_seen": 1427632128 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028660982948846544, + "loss": 2.8859, + "theoretical_loss": 3.531502332096869, + "tokens_seen": 1427697664 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028659979939819457, + "loss": 2.5624, + "theoretical_loss": 3.531487903785443, + "tokens_seen": 1427763200 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002865897693079238, + "loss": 2.8676, + "theoretical_loss": 3.5314734763217066, + "tokens_seen": 1427828736 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028657973921765293, + "loss": 2.8014, + "theoretical_loss": 3.5314590497055702, + "tokens_seen": 1427894272 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028656970912738217, + "loss": 2.6093, + "theoretical_loss": 3.531444623936945, + "tokens_seen": 1427959808 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028655967903711135, + "loss": 2.5359, + "theoretical_loss": 3.5314301990157424, + "tokens_seen": 1428025344 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028654964894684053, + "loss": 2.6843, + "theoretical_loss": 3.531415774941874, + "tokens_seen": 1428090880 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002865396188565697, + "loss": 2.7487, + "theoretical_loss": 3.5314013517152514, + "tokens_seen": 1428156416 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028652958876629895, + "loss": 2.4328, + "theoretical_loss": 3.531386929335785, + "tokens_seen": 1428221952 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002865195586760281, + "loss": 2.7507, + "theoretical_loss": 3.531372507803387, + "tokens_seen": 1428287488 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002865095285857573, + "loss": 2.7066, + "theoretical_loss": 3.531358087117969, + "tokens_seen": 1428353024 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028649949849548644, + "loss": 2.5695, + "theoretical_loss": 3.531343667279441, + "tokens_seen": 1428418560 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028648946840521567, + "loss": 2.377, + "theoretical_loss": 3.531329248287716, + "tokens_seen": 1428484096 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028647943831494485, + "loss": 2.8074, + "theoretical_loss": 3.531314830142705, + "tokens_seen": 1428549632 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028646940822467403, + "loss": 2.5772, + "theoretical_loss": 3.5313004128443195, + "tokens_seen": 1428615168 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1579734, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7796366214752197, + "objective/train/theoretical_loss": 3.53128599639247, + "objective/train/tokens_used": 1449140704, + "theoretical_loss": 3.53128599639247, + "tokens_seen": 1428680704 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002864593781344032, + "loss": 2.912, + "theoretical_loss": 3.53128599639247, + "tokens_seen": 1428680704 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002864493480441324, + "loss": 2.6375, + "theoretical_loss": 3.5312715807870685, + "tokens_seen": 1428746240 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002864393179538616, + "loss": 2.8322, + "theoretical_loss": 3.531257166028027, + "tokens_seen": 1428811776 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002864292878635908, + "loss": 2.5146, + "theoretical_loss": 3.5312427521152565, + "tokens_seen": 1428877312 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028641925777331994, + "loss": 2.7135, + "theoretical_loss": 3.5312283390486687, + "tokens_seen": 1428942848 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002864092276830492, + "loss": 2.7155, + "theoretical_loss": 3.5312139268281753, + "tokens_seen": 1429008384 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002863991975927783, + "loss": 2.5024, + "theoretical_loss": 3.531199515453687, + "tokens_seen": 1429073920 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028638916750250754, + "loss": 2.736, + "theoretical_loss": 3.5311851049251164, + "tokens_seen": 1429139456 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002863791374122367, + "loss": 2.7296, + "theoretical_loss": 3.5311706952423743, + "tokens_seen": 1429204992 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002863691073219659, + "loss": 2.68, + "theoretical_loss": 3.5311562864053725, + "tokens_seen": 1429270528 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002863590772316951, + "loss": 2.5365, + "theoretical_loss": 3.5311418784140227, + "tokens_seen": 1429336064 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002863490471414243, + "loss": 2.7918, + "theoretical_loss": 3.5311274712682366, + "tokens_seen": 1429401600 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028633901705115344, + "loss": 2.5645, + "theoretical_loss": 3.531113064967925, + "tokens_seen": 1429467136 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002863289869608827, + "loss": 2.7413, + "theoretical_loss": 3.5310986595130007, + "tokens_seen": 1429532672 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002863189568706118, + "loss": 2.554, + "theoretical_loss": 3.5310842549033747, + "tokens_seen": 1429598208 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028630892678034104, + "loss": 2.643, + "theoretical_loss": 3.5310698511389593, + "tokens_seen": 1429663744 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002862988966900702, + "loss": 2.5413, + "theoretical_loss": 3.531055448219665, + "tokens_seen": 1429729280 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002862888665997994, + "loss": 2.6201, + "theoretical_loss": 3.5310410461454045, + "tokens_seen": 1429794816 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002862788365095286, + "loss": 2.6887, + "theoretical_loss": 3.5310266449160888, + "tokens_seen": 1429860352 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028626880641925776, + "loss": 2.6714, + "theoretical_loss": 3.53101224453163, + "tokens_seen": 1429925888 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028625877632898694, + "loss": 2.5562, + "theoretical_loss": 3.5309978449919397, + "tokens_seen": 1429991424 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002862487462387162, + "loss": 2.7693, + "theoretical_loss": 3.5309834462969296, + "tokens_seen": 1430056960 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002862387161484453, + "loss": 2.8589, + "theoretical_loss": 3.5309690484465115, + "tokens_seen": 1430122496 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028622868605817454, + "loss": 2.706, + "theoretical_loss": 3.530954651440597, + "tokens_seen": 1430188032 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028621865596790367, + "loss": 2.8354, + "theoretical_loss": 3.5309402552790985, + "tokens_seen": 1430253568 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1580976, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8712832927703857, + "objective/train/theoretical_loss": 3.530925859961927, + "objective/train/tokens_used": 1450779104, + "theoretical_loss": 3.530925859961927, + "tokens_seen": 1430319104 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002862086258776329, + "loss": 2.7676, + "theoretical_loss": 3.530925859961927, + "tokens_seen": 1430319104 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002861985957873621, + "loss": 2.5262, + "theoretical_loss": 3.5309114654889946, + "tokens_seen": 1430384640 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028618856569709127, + "loss": 2.659, + "theoretical_loss": 3.530897071860213, + "tokens_seen": 1430450176 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028617853560682045, + "loss": 2.7417, + "theoretical_loss": 3.530882679075494, + "tokens_seen": 1430515712 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002861685055165497, + "loss": 2.7481, + "theoretical_loss": 3.53086828713475, + "tokens_seen": 1430581248 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002861584754262788, + "loss": 2.532, + "theoretical_loss": 3.5308538960378923, + "tokens_seen": 1430646784 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028614844533600805, + "loss": 2.6953, + "theoretical_loss": 3.530839505784833, + "tokens_seen": 1430712320 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028613841524573717, + "loss": 2.6791, + "theoretical_loss": 3.5308251163754836, + "tokens_seen": 1430777856 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002861283851554664, + "loss": 2.5014, + "theoretical_loss": 3.5308107278097562, + "tokens_seen": 1430843392 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002861183550651956, + "loss": 2.7834, + "theoretical_loss": 3.530796340087563, + "tokens_seen": 1430908928 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028610832497492477, + "loss": 2.782, + "theoretical_loss": 3.530781953208816, + "tokens_seen": 1430974464 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028609829488465395, + "loss": 2.7377, + "theoretical_loss": 3.5307675671734264, + "tokens_seen": 1431040000 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028608826479438313, + "loss": 2.4967, + "theoretical_loss": 3.5307531819813063, + "tokens_seen": 1431105536 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002860782347041123, + "loss": 2.6931, + "theoretical_loss": 3.5307387976323685, + "tokens_seen": 1431171072 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028606820461384155, + "loss": 2.6167, + "theoretical_loss": 3.5307244141265235, + "tokens_seen": 1431236608 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002860581745235707, + "loss": 2.6194, + "theoretical_loss": 3.530710031463685, + "tokens_seen": 1431302144 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002860481444332999, + "loss": 2.6382, + "theoretical_loss": 3.5306956496437643, + "tokens_seen": 1431367680 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002860381143430291, + "loss": 2.6777, + "theoretical_loss": 3.5306812686666724, + "tokens_seen": 1431433216 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002860280842527583, + "loss": 2.7881, + "theoretical_loss": 3.5306668885323225, + "tokens_seen": 1431498752 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002860180541624875, + "loss": 2.5622, + "theoretical_loss": 3.5306525092406265, + "tokens_seen": 1431564288 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028600802407221664, + "loss": 2.7507, + "theoretical_loss": 3.5306381307914965, + "tokens_seen": 1431629824 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028599799398194587, + "loss": 2.7967, + "theoretical_loss": 3.530623753184844, + "tokens_seen": 1431695360 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028598796389167505, + "loss": 2.6421, + "theoretical_loss": 3.530609376420582, + "tokens_seen": 1431760896 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028597793380140423, + "loss": 2.7102, + "theoretical_loss": 3.530595000498621, + "tokens_seen": 1431826432 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002859679037111334, + "loss": 2.5797, + "theoretical_loss": 3.5305806254188745, + "tokens_seen": 1431891968 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1581614, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.419712543487549, + "objective/train/theoretical_loss": 3.5305662511812548, + "objective/train/tokens_used": 1452417504, + "theoretical_loss": 3.5305662511812548, + "tokens_seen": 1431957504 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002859578736208626, + "loss": 2.4994, + "theoretical_loss": 3.5305662511812548, + "tokens_seen": 1431957504 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002859478435305918, + "loss": 2.7283, + "theoretical_loss": 3.5305518777856726, + "tokens_seen": 1432023040 + }, + { + "epoch": 4.07, + "learning_rate": 0.000285937813440321, + "loss": 2.7827, + "theoretical_loss": 3.530537505232041, + "tokens_seen": 1432088576 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028592778335005014, + "loss": 2.7673, + "theoretical_loss": 3.5305231335202727, + "tokens_seen": 1432154112 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002859177532597794, + "loss": 2.849, + "theoretical_loss": 3.5305087626502787, + "tokens_seen": 1432219648 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002859077231695085, + "loss": 2.8395, + "theoretical_loss": 3.530494392621972, + "tokens_seen": 1432285184 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028589769307923774, + "loss": 2.6592, + "theoretical_loss": 3.5304800234352642, + "tokens_seen": 1432350720 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002858876629889669, + "loss": 2.6934, + "theoretical_loss": 3.530465655090068, + "tokens_seen": 1432416256 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002858776328986961, + "loss": 2.6282, + "theoretical_loss": 3.530451287586295, + "tokens_seen": 1432481792 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002858676028084253, + "loss": 2.6504, + "theoretical_loss": 3.530436920923858, + "tokens_seen": 1432547328 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002858575727181545, + "loss": 2.4934, + "theoretical_loss": 3.5304225551026693, + "tokens_seen": 1432612864 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028584754262788364, + "loss": 2.621, + "theoretical_loss": 3.5304081901226407, + "tokens_seen": 1432678400 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002858375125376129, + "loss": 2.5828, + "theoretical_loss": 3.530393825983685, + "tokens_seen": 1432743936 + }, + { + "epoch": 4.07, + "learning_rate": 0.000285827482447342, + "loss": 2.852, + "theoretical_loss": 3.5303794626857146, + "tokens_seen": 1432809472 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028581745235707124, + "loss": 2.5021, + "theoretical_loss": 3.5303651002286407, + "tokens_seen": 1432875008 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002858074222668004, + "loss": 2.7758, + "theoretical_loss": 3.530350738612377, + "tokens_seen": 1432940544 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002857973921765296, + "loss": 2.4004, + "theoretical_loss": 3.5303363778368344, + "tokens_seen": 1433006080 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002857873620862588, + "loss": 2.4702, + "theoretical_loss": 3.530322017901926, + "tokens_seen": 1433071616 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028577733199598796, + "loss": 2.6284, + "theoretical_loss": 3.5303076588075646, + "tokens_seen": 1433137152 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028576730190571714, + "loss": 2.9295, + "theoretical_loss": 3.5302933005536623, + "tokens_seen": 1433202688 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002857572718154464, + "loss": 2.5359, + "theoretical_loss": 3.530278943140131, + "tokens_seen": 1433268224 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002857472417251755, + "loss": 2.5897, + "theoretical_loss": 3.530264586566883, + "tokens_seen": 1433333760 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028573721163490474, + "loss": 2.6742, + "theoretical_loss": 3.5302502308338317, + "tokens_seen": 1433399296 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028572718154463387, + "loss": 2.7224, + "theoretical_loss": 3.5302358759408885, + "tokens_seen": 1433464832 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002857171514543631, + "loss": 2.5141, + "theoretical_loss": 3.5302215218879667, + "tokens_seen": 1433530368 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 1582179, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.372399091720581, + "objective/train/theoretical_loss": 3.530207168674978, + "objective/train/tokens_used": 1454055904, + "theoretical_loss": 3.530207168674978, + "tokens_seen": 1433595904 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002857071213640923, + "loss": 2.5721, + "theoretical_loss": 3.530207168674978, + "tokens_seen": 1433595904 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028569709127382147, + "loss": 2.4279, + "theoretical_loss": 3.5301928163018346, + "tokens_seen": 1433661440 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028568706118355065, + "loss": 2.593, + "theoretical_loss": 3.5301784647684498, + "tokens_seen": 1433726976 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002856770310932799, + "loss": 2.7227, + "theoretical_loss": 3.5301641140747364, + "tokens_seen": 1433792512 + }, + { + "epoch": 4.07, + "learning_rate": 0.000285667001003009, + "loss": 2.9401, + "theoretical_loss": 3.5301497642206057, + "tokens_seen": 1433858048 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028565697091273825, + "loss": 2.4208, + "theoretical_loss": 3.530135415205971, + "tokens_seen": 1433923584 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028564694082246737, + "loss": 2.3774, + "theoretical_loss": 3.530121067030745, + "tokens_seen": 1433989120 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002856369107321966, + "loss": 2.7481, + "theoretical_loss": 3.530106719694839, + "tokens_seen": 1434054656 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002856268806419258, + "loss": 2.5669, + "theoretical_loss": 3.5300923731981664, + "tokens_seen": 1434120192 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028561685055165497, + "loss": 2.6281, + "theoretical_loss": 3.5300780275406405, + "tokens_seen": 1434185728 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028560682046138415, + "loss": 2.5687, + "theoretical_loss": 3.5300636827221727, + "tokens_seen": 1434251264 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028559679037111333, + "loss": 2.5335, + "theoretical_loss": 3.530049338742676, + "tokens_seen": 1434316800 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002855867602808425, + "loss": 2.568, + "theoretical_loss": 3.5300349956020636, + "tokens_seen": 1434382336 + }, + { + "epoch": 4.07, + "learning_rate": 0.00028557673019057175, + "loss": 2.7643, + "theoretical_loss": 3.530020653300247, + "tokens_seen": 1434447872 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002855667001003009, + "loss": 2.5505, + "theoretical_loss": 3.5300063118371394, + "tokens_seen": 1434513408 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002855566700100301, + "loss": 2.7626, + "theoretical_loss": 3.5299919712126533, + "tokens_seen": 1434578944 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002855466399197593, + "loss": 2.6743, + "theoretical_loss": 3.5299776314267017, + "tokens_seen": 1434644480 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002855366098294885, + "loss": 2.7768, + "theoretical_loss": 3.5299632924791973, + "tokens_seen": 1434710016 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028552657973921765, + "loss": 2.8286, + "theoretical_loss": 3.5299489543700524, + "tokens_seen": 1434775552 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028551654964894684, + "loss": 2.5388, + "theoretical_loss": 3.529934617099179, + "tokens_seen": 1434841088 + }, + { + "epoch": 4.08, + "learning_rate": 0.000285506519558676, + "loss": 2.6986, + "theoretical_loss": 3.5299202806664915, + "tokens_seen": 1434906624 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028549648946840525, + "loss": 2.6987, + "theoretical_loss": 3.5299059450719015, + "tokens_seen": 1434972160 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002854864593781344, + "loss": 2.9928, + "theoretical_loss": 3.529891610315322, + "tokens_seen": 1435037696 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002854764292878636, + "loss": 2.8229, + "theoretical_loss": 3.5298772763966655, + "tokens_seen": 1435103232 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028546639919759274, + "loss": 2.7209, + "theoretical_loss": 3.5298629433158455, + "tokens_seen": 1435168768 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1583288, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.874298572540283, + "objective/train/theoretical_loss": 3.5298486110727736, + "objective/train/tokens_used": 1455694304, + "theoretical_loss": 3.5298486110727736, + "tokens_seen": 1435234304 + }, + { + "epoch": 4.08, + "learning_rate": 0.000285456369107322, + "loss": 2.8018, + "theoretical_loss": 3.5298486110727736, + "tokens_seen": 1435234304 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028544633901705116, + "loss": 2.578, + "theoretical_loss": 3.529834279667363, + "tokens_seen": 1435299840 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028543630892678034, + "loss": 2.7333, + "theoretical_loss": 3.5298199490995277, + "tokens_seen": 1435365376 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002854262788365095, + "loss": 2.7832, + "theoretical_loss": 3.529805619369179, + "tokens_seen": 1435430912 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002854162487462387, + "loss": 2.7618, + "theoretical_loss": 3.52979129047623, + "tokens_seen": 1435496448 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002854062186559679, + "loss": 2.5082, + "theoretical_loss": 3.5297769624205944, + "tokens_seen": 1435561984 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002853961885656971, + "loss": 2.6455, + "theoretical_loss": 3.529762635202184, + "tokens_seen": 1435627520 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028538615847542624, + "loss": 2.5146, + "theoretical_loss": 3.5297483088209125, + "tokens_seen": 1435693056 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002853761283851555, + "loss": 2.7314, + "theoretical_loss": 3.529733983276692, + "tokens_seen": 1435758592 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028536609829488466, + "loss": 2.7992, + "theoretical_loss": 3.529719658569436, + "tokens_seen": 1435824128 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028535606820461384, + "loss": 2.7624, + "theoretical_loss": 3.529705334699057, + "tokens_seen": 1435889664 + }, + { + "epoch": 4.08, + "learning_rate": 0.000285346038114343, + "loss": 2.5357, + "theoretical_loss": 3.5296910116654683, + "tokens_seen": 1435955200 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002853360080240722, + "loss": 2.587, + "theoretical_loss": 3.529676689468582, + "tokens_seen": 1436020736 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002853259779338014, + "loss": 2.6797, + "theoretical_loss": 3.5296623681083124, + "tokens_seen": 1436086272 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002853159478435306, + "loss": 2.6747, + "theoretical_loss": 3.5296480475845713, + "tokens_seen": 1436151808 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028530591775325975, + "loss": 2.9066, + "theoretical_loss": 3.5296337278972723, + "tokens_seen": 1436217344 + }, + { + "epoch": 4.08, + "learning_rate": 0.000285295887662989, + "loss": 2.9173, + "theoretical_loss": 3.5296194090463278, + "tokens_seen": 1436282880 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002852858575727181, + "loss": 2.7209, + "theoretical_loss": 3.5296050910316517, + "tokens_seen": 1436348416 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028527582748244735, + "loss": 2.6102, + "theoretical_loss": 3.5295907738531564, + "tokens_seen": 1436413952 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002852657973921766, + "loss": 2.6939, + "theoretical_loss": 3.529576457510754, + "tokens_seen": 1436479488 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002852557673019057, + "loss": 2.8535, + "theoretical_loss": 3.5295621420043597, + "tokens_seen": 1436545024 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028524573721163494, + "loss": 2.7347, + "theoretical_loss": 3.529547827333885, + "tokens_seen": 1436610560 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028523570712136407, + "loss": 2.7182, + "theoretical_loss": 3.5295335134992434, + "tokens_seen": 1436676096 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002852256770310933, + "loss": 2.6752, + "theoretical_loss": 3.5295192005003475, + "tokens_seen": 1436741632 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002852156469408225, + "loss": 2.9172, + "theoretical_loss": 3.529504888337111, + "tokens_seen": 1436807168 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1583910, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5404577255249023, + "objective/train/theoretical_loss": 3.529490577009447, + "objective/train/tokens_used": 1457332704, + "theoretical_loss": 3.529490577009447, + "tokens_seen": 1436872704 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028520561685055167, + "loss": 2.4372, + "theoretical_loss": 3.529490577009447, + "tokens_seen": 1436872704 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028519558676028085, + "loss": 2.4868, + "theoretical_loss": 3.529476266517268, + "tokens_seen": 1436938240 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002851855566700101, + "loss": 2.4531, + "theoretical_loss": 3.5294619568604877, + "tokens_seen": 1437003776 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002851755265797392, + "loss": 2.7788, + "theoretical_loss": 3.529447648039019, + "tokens_seen": 1437069312 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028516549648946845, + "loss": 2.7561, + "theoretical_loss": 3.5294333400527753, + "tokens_seen": 1437134848 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028515546639919757, + "loss": 2.8554, + "theoretical_loss": 3.529419032901669, + "tokens_seen": 1437200384 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002851454363089268, + "loss": 2.7275, + "theoretical_loss": 3.5294047265856143, + "tokens_seen": 1437265920 + }, + { + "epoch": 4.08, + "learning_rate": 0.000285135406218656, + "loss": 2.5726, + "theoretical_loss": 3.529390421104524, + "tokens_seen": 1437331456 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028512537612838517, + "loss": 2.8757, + "theoretical_loss": 3.5293761164583106, + "tokens_seen": 1437396992 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028511534603811435, + "loss": 2.6895, + "theoretical_loss": 3.5293618126468886, + "tokens_seen": 1437462528 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028510531594784353, + "loss": 2.6364, + "theoretical_loss": 3.52934750967017, + "tokens_seen": 1437528064 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002850952858575727, + "loss": 2.6436, + "theoretical_loss": 3.529333207528069, + "tokens_seen": 1437593600 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028508525576730195, + "loss": 2.4625, + "theoretical_loss": 3.5293189062204986, + "tokens_seen": 1437659136 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002850752256770311, + "loss": 2.7444, + "theoretical_loss": 3.529304605747371, + "tokens_seen": 1437724672 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002850651955867603, + "loss": 2.6947, + "theoretical_loss": 3.529290306108601, + "tokens_seen": 1437790208 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002850551654964895, + "loss": 2.6257, + "theoretical_loss": 3.5292760073041016, + "tokens_seen": 1437855744 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002850451354062187, + "loss": 2.544, + "theoretical_loss": 3.5292617093337855, + "tokens_seen": 1437921280 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028503510531594785, + "loss": 2.9978, + "theoretical_loss": 3.5292474121975665, + "tokens_seen": 1437986816 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028502507522567704, + "loss": 2.4569, + "theoretical_loss": 3.5292331158953574, + "tokens_seen": 1438052352 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002850150451354062, + "loss": 2.5917, + "theoretical_loss": 3.5292188204270722, + "tokens_seen": 1438117888 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028500501504513545, + "loss": 2.7325, + "theoretical_loss": 3.5292045257926237, + "tokens_seen": 1438183424 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002849949849548646, + "loss": 2.8818, + "theoretical_loss": 3.5291902319919255, + "tokens_seen": 1438248960 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002849849548645938, + "loss": 2.8042, + "theoretical_loss": 3.529175939024891, + "tokens_seen": 1438314496 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028497492477432294, + "loss": 2.7066, + "theoretical_loss": 3.529161646891434, + "tokens_seen": 1438380032 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002849648946840522, + "loss": 2.6888, + "theoretical_loss": 3.529147355591467, + "tokens_seen": 1438445568 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1585264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9627788066864014, + "objective/train/theoretical_loss": 3.5291330651249044, + "objective/train/tokens_used": 1458971104, + "theoretical_loss": 3.5291330651249044, + "tokens_seen": 1438511104 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028495486459378136, + "loss": 2.6728, + "theoretical_loss": 3.5291330651249044, + "tokens_seen": 1438511104 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028494483450351054, + "loss": 2.6711, + "theoretical_loss": 3.529118775491659, + "tokens_seen": 1438576640 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002849348044132397, + "loss": 2.7203, + "theoretical_loss": 3.5291044866916437, + "tokens_seen": 1438642176 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002849247743229689, + "loss": 2.835, + "theoretical_loss": 3.5290901987247736, + "tokens_seen": 1438707712 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002849147442326981, + "loss": 2.7568, + "theoretical_loss": 3.5290759115909607, + "tokens_seen": 1438773248 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002849047141424273, + "loss": 2.7452, + "theoretical_loss": 3.5290616252901192, + "tokens_seen": 1438838784 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028489468405215644, + "loss": 2.5437, + "theoretical_loss": 3.5290473398221627, + "tokens_seen": 1438904320 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002848846539618857, + "loss": 2.9406, + "theoretical_loss": 3.529033055187004, + "tokens_seen": 1438969856 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028487462387161486, + "loss": 2.7922, + "theoretical_loss": 3.529018771384557, + "tokens_seen": 1439035392 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028486459378134404, + "loss": 2.6864, + "theoretical_loss": 3.529004488414736, + "tokens_seen": 1439100928 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002848545636910732, + "loss": 2.873, + "theoretical_loss": 3.528990206277453, + "tokens_seen": 1439166464 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002848445336008024, + "loss": 2.725, + "theoretical_loss": 3.528975924972623, + "tokens_seen": 1439232000 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002848345035105316, + "loss": 2.6527, + "theoretical_loss": 3.528961644500159, + "tokens_seen": 1439297536 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002848244734202608, + "loss": 2.5969, + "theoretical_loss": 3.528947364859974, + "tokens_seen": 1439363072 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028481444332998995, + "loss": 2.6177, + "theoretical_loss": 3.5289330860519823, + "tokens_seen": 1439428608 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002848044132397192, + "loss": 2.7198, + "theoretical_loss": 3.5289188080760976, + "tokens_seen": 1439494144 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002847943831494483, + "loss": 2.4022, + "theoretical_loss": 3.5289045309322336, + "tokens_seen": 1439559680 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028478435305917755, + "loss": 2.7359, + "theoretical_loss": 3.528890254620303, + "tokens_seen": 1439625216 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002847743229689067, + "loss": 2.6777, + "theoretical_loss": 3.5288759791402207, + "tokens_seen": 1439690752 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002847642928786359, + "loss": 2.518, + "theoretical_loss": 3.5288617044918995, + "tokens_seen": 1439756288 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002847542627883651, + "loss": 2.7267, + "theoretical_loss": 3.5288474306752535, + "tokens_seen": 1439821824 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028474423269809427, + "loss": 2.708, + "theoretical_loss": 3.5288331576901966, + "tokens_seen": 1439887360 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028473420260782345, + "loss": 2.6584, + "theoretical_loss": 3.5288188855366416, + "tokens_seen": 1439952896 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002847241725175527, + "loss": 2.5356, + "theoretical_loss": 3.5288046142145033, + "tokens_seen": 1440018432 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002847141424272818, + "loss": 2.7895, + "theoretical_loss": 3.5287903437236947, + "tokens_seen": 1440083968 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1585991, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4762494564056396, + "objective/train/theoretical_loss": 3.5287760740641296, + "objective/train/tokens_used": 1460609504, + "theoretical_loss": 3.5287760740641296, + "tokens_seen": 1440149504 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028470411233701105, + "loss": 2.4325, + "theoretical_loss": 3.5287760740641296, + "tokens_seen": 1440149504 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028469408224674023, + "loss": 2.9182, + "theoretical_loss": 3.528761805235722, + "tokens_seen": 1440215040 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002846840521564694, + "loss": 2.6841, + "theoretical_loss": 3.5287475372383854, + "tokens_seen": 1440280576 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002846740220661986, + "loss": 2.6416, + "theoretical_loss": 3.528733270072034, + "tokens_seen": 1440346112 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002846639919759278, + "loss": 2.7276, + "theoretical_loss": 3.5287190037365814, + "tokens_seen": 1440411648 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028465396188565695, + "loss": 2.7666, + "theoretical_loss": 3.528704738231941, + "tokens_seen": 1440477184 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002846439317953862, + "loss": 2.4176, + "theoretical_loss": 3.5286904735580276, + "tokens_seen": 1440542720 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002846339017051153, + "loss": 2.6206, + "theoretical_loss": 3.528676209714754, + "tokens_seen": 1440608256 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028462387161484455, + "loss": 2.676, + "theoretical_loss": 3.5286619467020346, + "tokens_seen": 1440673792 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002846138415245737, + "loss": 2.9246, + "theoretical_loss": 3.5286476845197834, + "tokens_seen": 1440739328 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002846038114343029, + "loss": 2.565, + "theoretical_loss": 3.5286334231679133, + "tokens_seen": 1440804864 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002845937813440321, + "loss": 2.5963, + "theoretical_loss": 3.5286191626463395, + "tokens_seen": 1440870400 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002845837512537613, + "loss": 2.309, + "theoretical_loss": 3.5286049029549753, + "tokens_seen": 1440935936 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028457372116349046, + "loss": 2.7321, + "theoretical_loss": 3.5285906440937342, + "tokens_seen": 1441001472 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002845636910732197, + "loss": 2.9154, + "theoretical_loss": 3.528576386062531, + "tokens_seen": 1441067008 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002845536609829488, + "loss": 2.791, + "theoretical_loss": 3.528562128861279, + "tokens_seen": 1441132544 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028454363089267805, + "loss": 2.5997, + "theoretical_loss": 3.5285478724898924, + "tokens_seen": 1441198080 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028453360080240724, + "loss": 2.6299, + "theoretical_loss": 3.528533616948285, + "tokens_seen": 1441263616 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002845235707121364, + "loss": 2.5887, + "theoretical_loss": 3.5285193622363704, + "tokens_seen": 1441329152 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028451354062186565, + "loss": 2.6146, + "theoretical_loss": 3.5285051083540635, + "tokens_seen": 1441394688 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002845035105315948, + "loss": 2.6975, + "theoretical_loss": 3.528490855301278, + "tokens_seen": 1441460224 + }, + { + "epoch": 4.08, + "learning_rate": 0.000284493480441324, + "loss": 2.5712, + "theoretical_loss": 3.5284766030779275, + "tokens_seen": 1441525760 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028448345035105314, + "loss": 2.7834, + "theoretical_loss": 3.528462351683926, + "tokens_seen": 1441591296 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002844734202607824, + "loss": 2.5477, + "theoretical_loss": 3.5284481011191886, + "tokens_seen": 1441656832 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028446339017051156, + "loss": 2.693, + "theoretical_loss": 3.528433851383628, + "tokens_seen": 1441722368 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1587458, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6036527156829834, + "objective/train/theoretical_loss": 3.5284196024771592, + "objective/train/tokens_used": 1462247904, + "theoretical_loss": 3.5284196024771592, + "tokens_seen": 1441787904 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028445336008024074, + "loss": 2.7933, + "theoretical_loss": 3.5284196024771592, + "tokens_seen": 1441787904 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002844433299899699, + "loss": 2.87, + "theoretical_loss": 3.5284053543996956, + "tokens_seen": 1441853440 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002844332998996991, + "loss": 2.5991, + "theoretical_loss": 3.528391107151152, + "tokens_seen": 1441918976 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002844232698094283, + "loss": 2.6916, + "theoretical_loss": 3.5283768607314414, + "tokens_seen": 1441984512 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002844132397191575, + "loss": 2.7467, + "theoretical_loss": 3.5283626151404794, + "tokens_seen": 1442050048 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028440320962888664, + "loss": 2.6921, + "theoretical_loss": 3.528348370378179, + "tokens_seen": 1442115584 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002843931795386159, + "loss": 2.8085, + "theoretical_loss": 3.528334126444455, + "tokens_seen": 1442181120 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028438314944834506, + "loss": 2.7245, + "theoretical_loss": 3.528319883339221, + "tokens_seen": 1442246656 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028437311935807424, + "loss": 2.7377, + "theoretical_loss": 3.528305641062391, + "tokens_seen": 1442312192 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002843630892678034, + "loss": 2.6306, + "theoretical_loss": 3.5282913996138805, + "tokens_seen": 1442377728 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002843530591775326, + "loss": 2.6701, + "theoretical_loss": 3.5282771589936024, + "tokens_seen": 1442443264 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002843430290872618, + "loss": 2.6255, + "theoretical_loss": 3.528262919201471, + "tokens_seen": 1442508800 + }, + { + "epoch": 4.08, + "learning_rate": 0.000284332998996991, + "loss": 2.5292, + "theoretical_loss": 3.528248680237401, + "tokens_seen": 1442574336 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028432296890672015, + "loss": 2.5836, + "theoretical_loss": 3.528234442101307, + "tokens_seen": 1442639872 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002843129388164494, + "loss": 2.5171, + "theoretical_loss": 3.5282202047931017, + "tokens_seen": 1442705408 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002843029087261785, + "loss": 2.6853, + "theoretical_loss": 3.5282059683127014, + "tokens_seen": 1442770944 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028429287863590775, + "loss": 2.6028, + "theoretical_loss": 3.528191732660019, + "tokens_seen": 1442836480 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002842828485456369, + "loss": 2.6122, + "theoretical_loss": 3.528177497834969, + "tokens_seen": 1442902016 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002842728184553661, + "loss": 2.7187, + "theoretical_loss": 3.528163263837466, + "tokens_seen": 1442967552 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002842627883650953, + "loss": 2.6089, + "theoretical_loss": 3.5281490306674237, + "tokens_seen": 1443033088 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028425275827482447, + "loss": 2.5862, + "theoretical_loss": 3.528134798324757, + "tokens_seen": 1443098624 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028424272818455365, + "loss": 2.5758, + "theoretical_loss": 3.5281205668093802, + "tokens_seen": 1443164160 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002842326980942829, + "loss": 2.8418, + "theoretical_loss": 3.5281063361212075, + "tokens_seen": 1443229696 + }, + { + "epoch": 4.08, + "learning_rate": 0.000284222668004012, + "loss": 2.6395, + "theoretical_loss": 3.5280921062601536, + "tokens_seen": 1443295232 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028421263791374125, + "loss": 2.7947, + "theoretical_loss": 3.5280778772261314, + "tokens_seen": 1443360768 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1588273, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.190006971359253, + "objective/train/theoretical_loss": 3.5280636490190576, + "objective/train/tokens_used": 1463886304, + "theoretical_loss": 3.5280636490190576, + "tokens_seen": 1443426304 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028420260782347043, + "loss": 2.4971, + "theoretical_loss": 3.5280636490190576, + "tokens_seen": 1443426304 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002841925777331996, + "loss": 2.6572, + "theoretical_loss": 3.528049421638845, + "tokens_seen": 1443491840 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002841825476429288, + "loss": 2.7693, + "theoretical_loss": 3.5280351950854087, + "tokens_seen": 1443557376 + }, + { + "epoch": 4.08, + "learning_rate": 0.000284172517552658, + "loss": 2.6632, + "theoretical_loss": 3.5280209693586624, + "tokens_seen": 1443622912 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028416248746238715, + "loss": 2.5126, + "theoretical_loss": 3.528006744458522, + "tokens_seen": 1443688448 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002841524573721164, + "loss": 2.466, + "theoretical_loss": 3.5279925203849, + "tokens_seen": 1443753984 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002841424272818455, + "loss": 2.7694, + "theoretical_loss": 3.527978297137712, + "tokens_seen": 1443819520 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028413239719157475, + "loss": 2.7568, + "theoretical_loss": 3.5279640747168726, + "tokens_seen": 1443885056 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002841223671013039, + "loss": 2.8102, + "theoretical_loss": 3.5279498531222955, + "tokens_seen": 1443950592 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002841123370110331, + "loss": 2.8483, + "theoretical_loss": 3.5279356323538957, + "tokens_seen": 1444016128 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002841023069207623, + "loss": 2.6838, + "theoretical_loss": 3.527921412411588, + "tokens_seen": 1444081664 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002840922768304915, + "loss": 2.4802, + "theoretical_loss": 3.527907193295287, + "tokens_seen": 1444147200 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028408224674022066, + "loss": 2.6184, + "theoretical_loss": 3.5278929750049057, + "tokens_seen": 1444212736 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002840722166499499, + "loss": 2.8073, + "theoretical_loss": 3.527878757540361, + "tokens_seen": 1444278272 + }, + { + "epoch": 4.08, + "learning_rate": 0.000284062186559679, + "loss": 2.8705, + "theoretical_loss": 3.5278645409015654, + "tokens_seen": 1444343808 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028405215646940825, + "loss": 2.7107, + "theoretical_loss": 3.527850325088435, + "tokens_seen": 1444409344 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002840421263791374, + "loss": 2.6718, + "theoretical_loss": 3.5278361101008833, + "tokens_seen": 1444474880 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002840320962888666, + "loss": 2.7017, + "theoretical_loss": 3.5278218959388257, + "tokens_seen": 1444540416 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002840220661985958, + "loss": 2.7548, + "theoretical_loss": 3.527807682602176, + "tokens_seen": 1444605952 + }, + { + "epoch": 4.08, + "learning_rate": 0.000284012036108325, + "loss": 2.4647, + "theoretical_loss": 3.5277934700908498, + "tokens_seen": 1444671488 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028400200601805416, + "loss": 2.52, + "theoretical_loss": 3.5277792584047605, + "tokens_seen": 1444737024 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028399197592778334, + "loss": 2.7247, + "theoretical_loss": 3.5277650475438236, + "tokens_seen": 1444802560 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002839819458375125, + "loss": 2.6323, + "theoretical_loss": 3.5277508375079543, + "tokens_seen": 1444868096 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028397191574724176, + "loss": 2.763, + "theoretical_loss": 3.527736628297066, + "tokens_seen": 1444933632 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002839618856569709, + "loss": 2.557, + "theoretical_loss": 3.5277224199110746, + "tokens_seen": 1444999168 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1589516, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.031261682510376, + "objective/train/theoretical_loss": 3.5277082123498937, + "objective/train/tokens_used": 1465524704, + "theoretical_loss": 3.5277082123498937, + "tokens_seen": 1445064704 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002839518555667001, + "loss": 2.8223, + "theoretical_loss": 3.5277082123498937, + "tokens_seen": 1445064704 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028394182547642925, + "loss": 2.7678, + "theoretical_loss": 3.5276940056134385, + "tokens_seen": 1445130240 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002839317953861585, + "loss": 2.5075, + "theoretical_loss": 3.527679799701624, + "tokens_seen": 1445195776 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028392176529588766, + "loss": 2.5564, + "theoretical_loss": 3.5276655946143642, + "tokens_seen": 1445261312 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028391173520561684, + "loss": 2.7625, + "theoretical_loss": 3.5276513903515747, + "tokens_seen": 1445326848 + }, + { + "epoch": 4.08, + "learning_rate": 0.000283901705115346, + "loss": 2.6866, + "theoretical_loss": 3.5276371869131697, + "tokens_seen": 1445392384 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028389167502507526, + "loss": 2.5275, + "theoretical_loss": 3.5276229842990645, + "tokens_seen": 1445457920 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002838816449348044, + "loss": 2.4997, + "theoretical_loss": 3.5276087825091738, + "tokens_seen": 1445523456 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002838716148445336, + "loss": 2.8864, + "theoretical_loss": 3.5275945815434113, + "tokens_seen": 1445588992 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028386158475426275, + "loss": 2.5567, + "theoretical_loss": 3.5275803814016937, + "tokens_seen": 1445654528 + }, + { + "epoch": 4.08, + "learning_rate": 0.000283851554663992, + "loss": 2.5588, + "theoretical_loss": 3.527566182083934, + "tokens_seen": 1445720064 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028384152457372117, + "loss": 2.6456, + "theoretical_loss": 3.5275519835900484, + "tokens_seen": 1445785600 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028383149448345035, + "loss": 2.6197, + "theoretical_loss": 3.527537785919951, + "tokens_seen": 1445851136 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028382146439317953, + "loss": 2.7781, + "theoretical_loss": 3.527523589073557, + "tokens_seen": 1445916672 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002838114343029087, + "loss": 2.4368, + "theoretical_loss": 3.527509393050781, + "tokens_seen": 1445982208 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002838014042126379, + "loss": 2.6069, + "theoretical_loss": 3.527495197851538, + "tokens_seen": 1446047744 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002837913741223671, + "loss": 2.5526, + "theoretical_loss": 3.527481003475743, + "tokens_seen": 1446113280 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002837813440320963, + "loss": 2.7014, + "theoretical_loss": 3.527466809923311, + "tokens_seen": 1446178816 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002837713139418255, + "loss": 2.5824, + "theoretical_loss": 3.5274526171941574, + "tokens_seen": 1446244352 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028376128385155467, + "loss": 2.5125, + "theoretical_loss": 3.527438425288196, + "tokens_seen": 1446309888 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028375125376128385, + "loss": 2.6314, + "theoretical_loss": 3.527424234205342, + "tokens_seen": 1446375424 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002837412236710131, + "loss": 2.3199, + "theoretical_loss": 3.527410043945511, + "tokens_seen": 1446440960 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002837311935807422, + "loss": 2.8419, + "theoretical_loss": 3.527395854508618, + "tokens_seen": 1446506496 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028372116349047145, + "loss": 2.8332, + "theoretical_loss": 3.527381665894577, + "tokens_seen": 1446572032 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028371113340020063, + "loss": 2.5837, + "theoretical_loss": 3.5273674781033044, + "tokens_seen": 1446637568 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1590115, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.46427321434021, + "objective/train/theoretical_loss": 3.5273532911347143, + "objective/train/tokens_used": 1467163104, + "theoretical_loss": 3.5273532911347143, + "tokens_seen": 1446703104 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002837011033099298, + "loss": 2.628, + "theoretical_loss": 3.5273532911347143, + "tokens_seen": 1446703104 + }, + { + "epoch": 4.08, + "learning_rate": 0.000283691073219659, + "loss": 2.4291, + "theoretical_loss": 3.5273391049887213, + "tokens_seen": 1446768640 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002836810431293882, + "loss": 2.6043, + "theoretical_loss": 3.527324919665242, + "tokens_seen": 1446834176 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028367101303911735, + "loss": 2.4816, + "theoretical_loss": 3.52731073516419, + "tokens_seen": 1446899712 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002836609829488466, + "loss": 2.534, + "theoretical_loss": 3.527296551485481, + "tokens_seen": 1446965248 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002836509528585757, + "loss": 2.5662, + "theoretical_loss": 3.5272823686290296, + "tokens_seen": 1447030784 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028364092276830495, + "loss": 2.6041, + "theoretical_loss": 3.5272681865947515, + "tokens_seen": 1447096320 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002836308926780341, + "loss": 2.6108, + "theoretical_loss": 3.527254005382562, + "tokens_seen": 1447161856 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002836208625877633, + "loss": 2.2632, + "theoretical_loss": 3.5272398249923755, + "tokens_seen": 1447227392 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002836108324974925, + "loss": 2.6465, + "theoretical_loss": 3.5272256454241075, + "tokens_seen": 1447292928 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002836008024072217, + "loss": 2.7067, + "theoretical_loss": 3.527211466677673, + "tokens_seen": 1447358464 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028359077231695086, + "loss": 2.3787, + "theoretical_loss": 3.527197288752987, + "tokens_seen": 1447424000 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002835807422266801, + "loss": 2.5942, + "theoretical_loss": 3.527183111649965, + "tokens_seen": 1447489536 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002835707121364092, + "loss": 2.8501, + "theoretical_loss": 3.5271689353685223, + "tokens_seen": 1447555072 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028356068204613846, + "loss": 2.8219, + "theoretical_loss": 3.5271547599085737, + "tokens_seen": 1447620608 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002835506519558676, + "loss": 2.7338, + "theoretical_loss": 3.527140585270035, + "tokens_seen": 1447686144 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002835406218655968, + "loss": 2.8349, + "theoretical_loss": 3.5271264114528202, + "tokens_seen": 1447751680 + }, + { + "epoch": 4.08, + "learning_rate": 0.000283530591775326, + "loss": 2.5583, + "theoretical_loss": 3.527112238456846, + "tokens_seen": 1447817216 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002835205616850552, + "loss": 2.5847, + "theoretical_loss": 3.527098066282027, + "tokens_seen": 1447882752 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028351053159478436, + "loss": 2.5197, + "theoretical_loss": 3.5270838949282783, + "tokens_seen": 1447948288 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028350050150451354, + "loss": 2.5117, + "theoretical_loss": 3.5270697243955156, + "tokens_seen": 1448013824 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002834904714142427, + "loss": 2.6788, + "theoretical_loss": 3.5270555546836535, + "tokens_seen": 1448079360 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028348044132397196, + "loss": 2.8404, + "theoretical_loss": 3.527041385792608, + "tokens_seen": 1448144896 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002834704112337011, + "loss": 2.6377, + "theoretical_loss": 3.527027217722294, + "tokens_seen": 1448210432 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002834603811434303, + "loss": 2.5585, + "theoretical_loss": 3.527013050472627, + "tokens_seen": 1448275968 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1591064, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.98321795463562, + "objective/train/theoretical_loss": 3.5269988840435222, + "objective/train/tokens_used": 1468801504, + "theoretical_loss": 3.5269988840435222, + "tokens_seen": 1448341504 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028345035105315945, + "loss": 2.9032, + "theoretical_loss": 3.5269988840435222, + "tokens_seen": 1448341504 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002834403209628887, + "loss": 2.719, + "theoretical_loss": 3.5269847184348952, + "tokens_seen": 1448407040 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028343029087261786, + "loss": 2.6729, + "theoretical_loss": 3.526970553646661, + "tokens_seen": 1448472576 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028342026078234705, + "loss": 2.483, + "theoretical_loss": 3.526956389678735, + "tokens_seen": 1448538112 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002834102306920762, + "loss": 2.5919, + "theoretical_loss": 3.526942226531033, + "tokens_seen": 1448603648 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028340020060180546, + "loss": 2.6517, + "theoretical_loss": 3.52692806420347, + "tokens_seen": 1448669184 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002833901705115346, + "loss": 2.5691, + "theoretical_loss": 3.526913902695962, + "tokens_seen": 1448734720 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002833801404212638, + "loss": 2.924, + "theoretical_loss": 3.526899742008423, + "tokens_seen": 1448800256 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028337011033099295, + "loss": 2.8697, + "theoretical_loss": 3.5268855821407703, + "tokens_seen": 1448865792 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002833600802407222, + "loss": 2.475, + "theoretical_loss": 3.526871423092918, + "tokens_seen": 1448931328 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028335005015045137, + "loss": 2.416, + "theoretical_loss": 3.5268572648647822, + "tokens_seen": 1448996864 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028334002006018055, + "loss": 2.7226, + "theoretical_loss": 3.526843107456278, + "tokens_seen": 1449062400 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028332998996990973, + "loss": 2.537, + "theoretical_loss": 3.526828950867321, + "tokens_seen": 1449127936 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002833199598796389, + "loss": 2.5673, + "theoretical_loss": 3.526814795097827, + "tokens_seen": 1449193472 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002833099297893681, + "loss": 2.6473, + "theoretical_loss": 3.5268006401477114, + "tokens_seen": 1449259008 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002832998996990973, + "loss": 2.5081, + "theoretical_loss": 3.5267864860168894, + "tokens_seen": 1449324544 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028328986960882645, + "loss": 2.6657, + "theoretical_loss": 3.526772332705277, + "tokens_seen": 1449390080 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002832798395185557, + "loss": 2.6562, + "theoretical_loss": 3.5267581802127888, + "tokens_seen": 1449455616 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002832698094282848, + "loss": 2.634, + "theoretical_loss": 3.5267440285393414, + "tokens_seen": 1449521152 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028325977933801405, + "loss": 2.645, + "theoretical_loss": 3.52672987768485, + "tokens_seen": 1449586688 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028324974924774323, + "loss": 2.623, + "theoretical_loss": 3.5267157276492305, + "tokens_seen": 1449652224 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002832397191574724, + "loss": 2.714, + "theoretical_loss": 3.526701578432398, + "tokens_seen": 1449717760 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002832296890672016, + "loss": 2.6843, + "theoretical_loss": 3.5266874300342685, + "tokens_seen": 1449783296 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028321965897693083, + "loss": 2.6982, + "theoretical_loss": 3.526673282454757, + "tokens_seen": 1449848832 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028320962888665996, + "loss": 2.6372, + "theoretical_loss": 3.5266591356937793, + "tokens_seen": 1449914368 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1591697, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.471153736114502, + "objective/train/theoretical_loss": 3.5266449897512517, + "objective/train/tokens_used": 1470439904, + "theoretical_loss": 3.5266449897512517, + "tokens_seen": 1449979904 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002831995987963892, + "loss": 2.5672, + "theoretical_loss": 3.5266449897512517, + "tokens_seen": 1449979904 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002831895687061183, + "loss": 2.8299, + "theoretical_loss": 3.52663084462709, + "tokens_seen": 1450045440 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028317953861584755, + "loss": 2.8545, + "theoretical_loss": 3.5266167003212088, + "tokens_seen": 1450110976 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028316950852557674, + "loss": 2.716, + "theoretical_loss": 3.5266025568335246, + "tokens_seen": 1450176512 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002831594784353059, + "loss": 2.8401, + "theoretical_loss": 3.5265884141639523, + "tokens_seen": 1450242048 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002831494483450351, + "loss": 2.4752, + "theoretical_loss": 3.5265742723124087, + "tokens_seen": 1450307584 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002831394182547643, + "loss": 2.6708, + "theoretical_loss": 3.5265601312788086, + "tokens_seen": 1450373120 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028312938816449346, + "loss": 2.5739, + "theoretical_loss": 3.5265459910630685, + "tokens_seen": 1450438656 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002831193580742227, + "loss": 2.3183, + "theoretical_loss": 3.526531851665103, + "tokens_seen": 1450504192 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002831093279839518, + "loss": 2.6056, + "theoretical_loss": 3.5265177130848295, + "tokens_seen": 1450569728 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028309929789368106, + "loss": 2.3986, + "theoretical_loss": 3.5265035753221623, + "tokens_seen": 1450635264 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002830892678034102, + "loss": 2.254, + "theoretical_loss": 3.526489438377018, + "tokens_seen": 1450700800 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002830792377131394, + "loss": 2.3774, + "theoretical_loss": 3.5264753022493123, + "tokens_seen": 1450766336 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002830692076228686, + "loss": 2.6581, + "theoretical_loss": 3.5264611669389603, + "tokens_seen": 1450831872 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002830591775325978, + "loss": 2.7144, + "theoretical_loss": 3.5264470324458785, + "tokens_seen": 1450897408 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028304914744232696, + "loss": 2.722, + "theoretical_loss": 3.5264328987699827, + "tokens_seen": 1450962944 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002830391173520562, + "loss": 2.7777, + "theoretical_loss": 3.526418765911189, + "tokens_seen": 1451028480 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002830290872617854, + "loss": 2.5871, + "theoretical_loss": 3.526404633869413, + "tokens_seen": 1451094016 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028301905717151456, + "loss": 2.676, + "theoretical_loss": 3.5263905026445705, + "tokens_seen": 1451159552 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028300902708124374, + "loss": 2.4169, + "theoretical_loss": 3.5263763722365766, + "tokens_seen": 1451225088 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002829989969909729, + "loss": 2.6727, + "theoretical_loss": 3.526362242645349, + "tokens_seen": 1451290624 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028298896690070216, + "loss": 2.5742, + "theoretical_loss": 3.526348113870802, + "tokens_seen": 1451356160 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002829789368104313, + "loss": 2.6444, + "theoretical_loss": 3.526333985912852, + "tokens_seen": 1451421696 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002829689067201605, + "loss": 2.5713, + "theoretical_loss": 3.5263198587714157, + "tokens_seen": 1451487232 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028295887662988965, + "loss": 2.6759, + "theoretical_loss": 3.5263057324464078, + "tokens_seen": 1451552768 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1592735, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.422314405441284, + "objective/train/theoretical_loss": 3.526291606937745, + "objective/train/tokens_used": 1472078304, + "theoretical_loss": 3.526291606937745, + "tokens_seen": 1451618304 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002829488465396189, + "loss": 2.6869, + "theoretical_loss": 3.526291606937745, + "tokens_seen": 1451618304 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028293881644934806, + "loss": 2.4801, + "theoretical_loss": 3.5262774822453435, + "tokens_seen": 1451683840 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028292878635907725, + "loss": 2.5524, + "theoretical_loss": 3.5262633583691185, + "tokens_seen": 1451749376 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002829187562688064, + "loss": 2.6287, + "theoretical_loss": 3.5262492353089865, + "tokens_seen": 1451814912 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028290872617853566, + "loss": 2.9081, + "theoretical_loss": 3.5262351130648635, + "tokens_seen": 1451880448 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002828986960882648, + "loss": 2.4654, + "theoretical_loss": 3.5262209916366656, + "tokens_seen": 1451945984 + }, + { + "epoch": 4.08, + "learning_rate": 0.000282888665997994, + "loss": 2.6926, + "theoretical_loss": 3.5262068710243084, + "tokens_seen": 1452011520 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028287863590772315, + "loss": 2.866, + "theoretical_loss": 3.5261927512277085, + "tokens_seen": 1452077056 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002828686058174524, + "loss": 2.7792, + "theoretical_loss": 3.5261786322467814, + "tokens_seen": 1452142592 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028285857572718157, + "loss": 2.631, + "theoretical_loss": 3.5261645140814437, + "tokens_seen": 1452208128 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028284854563691075, + "loss": 2.7531, + "theoretical_loss": 3.526150396731611, + "tokens_seen": 1452273664 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028283851554663993, + "loss": 2.6413, + "theoretical_loss": 3.5261362801972, + "tokens_seen": 1452339200 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002828284854563691, + "loss": 2.46, + "theoretical_loss": 3.526122164478126, + "tokens_seen": 1452404736 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002828184553660983, + "loss": 2.7587, + "theoretical_loss": 3.526108049574306, + "tokens_seen": 1452470272 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002828084252758275, + "loss": 2.7158, + "theoretical_loss": 3.5260939354856555, + "tokens_seen": 1452535808 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028279839518555665, + "loss": 2.7085, + "theoretical_loss": 3.5260798222120906, + "tokens_seen": 1452601344 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002827883650952859, + "loss": 2.5908, + "theoretical_loss": 3.526065709753528, + "tokens_seen": 1452666880 + }, + { + "epoch": 4.08, + "learning_rate": 0.000282778335005015, + "loss": 2.9158, + "theoretical_loss": 3.5260515981098832, + "tokens_seen": 1452732416 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028276830491474425, + "loss": 2.6908, + "theoretical_loss": 3.5260374872810734, + "tokens_seen": 1452797952 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028275827482447343, + "loss": 2.6313, + "theoretical_loss": 3.5260233772670135, + "tokens_seen": 1452863488 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002827482447342026, + "loss": 2.5825, + "theoretical_loss": 3.5260092680676207, + "tokens_seen": 1452929024 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002827382146439318, + "loss": 2.6314, + "theoretical_loss": 3.525995159682811, + "tokens_seen": 1452994560 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028272818455366103, + "loss": 2.7025, + "theoretical_loss": 3.525981052112501, + "tokens_seen": 1453060096 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028271815446339016, + "loss": 2.6858, + "theoretical_loss": 3.5259669453566054, + "tokens_seen": 1453125632 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002827081243731194, + "loss": 2.5014, + "theoretical_loss": 3.525952839415042, + "tokens_seen": 1453191168 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1593470, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.433061122894287, + "objective/train/theoretical_loss": 3.5259387342877266, + "objective/train/tokens_used": 1473716704, + "theoretical_loss": 3.5259387342877266, + "tokens_seen": 1453256704 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002826980942828485, + "loss": 2.6465, + "theoretical_loss": 3.5259387342877266, + "tokens_seen": 1453256704 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028268806419257775, + "loss": 2.4855, + "theoretical_loss": 3.5259246299745755, + "tokens_seen": 1453322240 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028267803410230694, + "loss": 2.6825, + "theoretical_loss": 3.525910526475505, + "tokens_seen": 1453387776 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002826680040120361, + "loss": 2.7234, + "theoretical_loss": 3.5258964237904316, + "tokens_seen": 1453453312 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002826579739217653, + "loss": 2.2644, + "theoretical_loss": 3.525882321919271, + "tokens_seen": 1453518848 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002826479438314945, + "loss": 2.8498, + "theoretical_loss": 3.5258682208619403, + "tokens_seen": 1453584384 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028263791374122366, + "loss": 2.4475, + "theoretical_loss": 3.525854120618355, + "tokens_seen": 1453649920 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002826278836509529, + "loss": 2.6162, + "theoretical_loss": 3.525840021188432, + "tokens_seen": 1453715456 + }, + { + "epoch": 4.08, + "learning_rate": 0.000282617853560682, + "loss": 2.4738, + "theoretical_loss": 3.525825922572088, + "tokens_seen": 1453780992 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028260782347041126, + "loss": 2.4866, + "theoretical_loss": 3.5258118247692387, + "tokens_seen": 1453846528 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002825977933801404, + "loss": 2.4781, + "theoretical_loss": 3.5257977277798007, + "tokens_seen": 1453912064 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002825877632898696, + "loss": 2.5941, + "theoretical_loss": 3.5257836316036903, + "tokens_seen": 1453977600 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002825777331995988, + "loss": 2.4183, + "theoretical_loss": 3.5257695362408246, + "tokens_seen": 1454043136 + }, + { + "epoch": 4.08, + "learning_rate": 0.000282567703109328, + "loss": 2.6699, + "theoretical_loss": 3.5257554416911194, + "tokens_seen": 1454108672 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028255767301905716, + "loss": 2.4834, + "theoretical_loss": 3.525741347954491, + "tokens_seen": 1454174208 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002825476429287864, + "loss": 2.6883, + "theoretical_loss": 3.5257272550308563, + "tokens_seen": 1454239744 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002825376128385155, + "loss": 2.677, + "theoretical_loss": 3.5257131629201313, + "tokens_seen": 1454305280 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028252758274824476, + "loss": 2.5145, + "theoretical_loss": 3.5256990716222334, + "tokens_seen": 1454370816 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002825175526579739, + "loss": 2.429, + "theoretical_loss": 3.5256849811370783, + "tokens_seen": 1454436352 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002825075225677031, + "loss": 2.6504, + "theoretical_loss": 3.5256708914645825, + "tokens_seen": 1454501888 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002824974924774323, + "loss": 2.5089, + "theoretical_loss": 3.5256568026046624, + "tokens_seen": 1454567424 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002824874623871615, + "loss": 2.8249, + "theoretical_loss": 3.525642714557235, + "tokens_seen": 1454632960 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028247743229689067, + "loss": 2.4911, + "theoretical_loss": 3.525628627322217, + "tokens_seen": 1454698496 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028246740220661985, + "loss": 2.5383, + "theoretical_loss": 3.5256145408995243, + "tokens_seen": 1454764032 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028245737211634903, + "loss": 2.5971, + "theoretical_loss": 3.525600455289074, + "tokens_seen": 1454829568 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1594935, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5903680324554443, + "objective/train/theoretical_loss": 3.5255863704907826, + "objective/train/tokens_used": 1475355104, + "theoretical_loss": 3.5255863704907826, + "tokens_seen": 1454895104 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028244734202607826, + "loss": 2.7843, + "theoretical_loss": 3.5255863704907826, + "tokens_seen": 1454895104 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002824373119358074, + "loss": 2.6224, + "theoretical_loss": 3.525572286504566, + "tokens_seen": 1454960640 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002824272818455366, + "loss": 2.5467, + "theoretical_loss": 3.525558203330342, + "tokens_seen": 1455026176 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002824172517552658, + "loss": 2.4688, + "theoretical_loss": 3.525544120968026, + "tokens_seen": 1455091712 + }, + { + "epoch": 4.08, + "learning_rate": 0.000282407221664995, + "loss": 2.5837, + "theoretical_loss": 3.5255300394175357, + "tokens_seen": 1455157248 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028239719157472417, + "loss": 2.6213, + "theoretical_loss": 3.5255159586787865, + "tokens_seen": 1455222784 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028238716148445335, + "loss": 2.6744, + "theoretical_loss": 3.5255018787516965, + "tokens_seen": 1455288320 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028237713139418253, + "loss": 2.8926, + "theoretical_loss": 3.5254877996361813, + "tokens_seen": 1455353856 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028236710130391177, + "loss": 2.6875, + "theoretical_loss": 3.5254737213321583, + "tokens_seen": 1455419392 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002823570712136409, + "loss": 2.7662, + "theoretical_loss": 3.525459643839543, + "tokens_seen": 1455484928 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028234704112337013, + "loss": 2.8077, + "theoretical_loss": 3.525445567158254, + "tokens_seen": 1455550464 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028233701103309926, + "loss": 2.5677, + "theoretical_loss": 3.5254314912882063, + "tokens_seen": 1455616000 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002823269809428285, + "loss": 2.7994, + "theoretical_loss": 3.5254174162293177, + "tokens_seen": 1455681536 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002823169508525577, + "loss": 2.6619, + "theoretical_loss": 3.5254033419815043, + "tokens_seen": 1455747072 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028230692076228685, + "loss": 2.8364, + "theoretical_loss": 3.525389268544683, + "tokens_seen": 1455812608 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028229689067201604, + "loss": 2.9088, + "theoretical_loss": 3.5253751959187705, + "tokens_seen": 1455878144 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002822868605817452, + "loss": 2.4792, + "theoretical_loss": 3.525361124103684, + "tokens_seen": 1455943680 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028227683049147445, + "loss": 2.486, + "theoretical_loss": 3.5253470530993396, + "tokens_seen": 1456009216 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028226680040120363, + "loss": 2.7525, + "theoretical_loss": 3.5253329829056543, + "tokens_seen": 1456074752 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002822567703109328, + "loss": 2.7045, + "theoretical_loss": 3.525318913522546, + "tokens_seen": 1456140288 + }, + { + "epoch": 4.08, + "learning_rate": 0.000282246740220662, + "loss": 2.6482, + "theoretical_loss": 3.52530484494993, + "tokens_seen": 1456205824 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028223671013039123, + "loss": 2.884, + "theoretical_loss": 3.525290777187724, + "tokens_seen": 1456271360 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028222668004012036, + "loss": 2.6727, + "theoretical_loss": 3.525276710235844, + "tokens_seen": 1456336896 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002822166499498496, + "loss": 2.4325, + "theoretical_loss": 3.5252626440942088, + "tokens_seen": 1456402432 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002822066198595787, + "loss": 2.681, + "theoretical_loss": 3.525248578762733, + "tokens_seen": 1456467968 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1595425, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1949830055236816, + "objective/train/theoretical_loss": 3.525234514241334, + "objective/train/tokens_used": 1476993504, + "theoretical_loss": 3.525234514241334, + "tokens_seen": 1456533504 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028219658976930795, + "loss": 2.3501, + "theoretical_loss": 3.525234514241334, + "tokens_seen": 1456533504 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028218655967903714, + "loss": 2.496, + "theoretical_loss": 3.52522045052993, + "tokens_seen": 1456599040 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002821765295887663, + "loss": 2.7187, + "theoretical_loss": 3.5252063876284367, + "tokens_seen": 1456664576 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002821664994984955, + "loss": 2.5332, + "theoretical_loss": 3.5251923255367714, + "tokens_seen": 1456730112 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002821564694082247, + "loss": 2.4664, + "theoretical_loss": 3.5251782642548513, + "tokens_seen": 1456795648 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028214643931795386, + "loss": 2.754, + "theoretical_loss": 3.5251642037825928, + "tokens_seen": 1456861184 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002821364092276831, + "loss": 2.6015, + "theoretical_loss": 3.525150144119913, + "tokens_seen": 1456926720 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002821263791374122, + "loss": 2.8958, + "theoretical_loss": 3.5251360852667286, + "tokens_seen": 1456992256 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028211634904714146, + "loss": 2.6523, + "theoretical_loss": 3.5251220272229578, + "tokens_seen": 1457057792 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002821063189568706, + "loss": 2.6786, + "theoretical_loss": 3.5251079699885164, + "tokens_seen": 1457123328 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002820962888665998, + "loss": 2.7648, + "theoretical_loss": 3.525093913563322, + "tokens_seen": 1457188864 + }, + { + "epoch": 4.08, + "learning_rate": 0.000282086258776329, + "loss": 2.8179, + "theoretical_loss": 3.525079857947291, + "tokens_seen": 1457254400 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002820762286860582, + "loss": 2.5025, + "theoretical_loss": 3.5250658031403406, + "tokens_seen": 1457319936 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028206619859578736, + "loss": 2.604, + "theoretical_loss": 3.5250517491423885, + "tokens_seen": 1457385472 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002820561685055166, + "loss": 2.5271, + "theoretical_loss": 3.5250376959533516, + "tokens_seen": 1457451008 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002820461384152457, + "loss": 2.5158, + "theoretical_loss": 3.5250236435731463, + "tokens_seen": 1457516544 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028203610832497496, + "loss": 2.5655, + "theoretical_loss": 3.52500959200169, + "tokens_seen": 1457582080 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002820260782347041, + "loss": 2.622, + "theoretical_loss": 3.5249955412389005, + "tokens_seen": 1457647616 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002820160481444333, + "loss": 2.4582, + "theoretical_loss": 3.5249814912846933, + "tokens_seen": 1457713152 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002820060180541625, + "loss": 2.8293, + "theoretical_loss": 3.5249674421389874, + "tokens_seen": 1457778688 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002819959879638917, + "loss": 2.8033, + "theoretical_loss": 3.524953393801699, + "tokens_seen": 1457844224 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028198595787362087, + "loss": 2.7273, + "theoretical_loss": 3.5249393462727445, + "tokens_seen": 1457909760 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028197592778335005, + "loss": 2.7569, + "theoretical_loss": 3.5249252995520424, + "tokens_seen": 1457975296 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028196589769307923, + "loss": 2.706, + "theoretical_loss": 3.5249112536395093, + "tokens_seen": 1458040832 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028195586760280846, + "loss": 2.5933, + "theoretical_loss": 3.5248972085350623, + "tokens_seen": 1458106368 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1596012, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.374768018722534, + "objective/train/theoretical_loss": 3.5248831642386182, + "objective/train/tokens_used": 1478631904, + "theoretical_loss": 3.5248831642386182, + "tokens_seen": 1458171904 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002819458375125376, + "loss": 2.6555, + "theoretical_loss": 3.5248831642386182, + "tokens_seen": 1458171904 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002819358074222668, + "loss": 2.6308, + "theoretical_loss": 3.5248691207500955, + "tokens_seen": 1458237440 + }, + { + "epoch": 4.08, + "learning_rate": 0.000281925777331996, + "loss": 2.3286, + "theoretical_loss": 3.52485507806941, + "tokens_seen": 1458302976 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002819157472417252, + "loss": 2.4707, + "theoretical_loss": 3.52484103619648, + "tokens_seen": 1458368512 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028190571715145437, + "loss": 2.646, + "theoretical_loss": 3.524826995131222, + "tokens_seen": 1458434048 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028189568706118355, + "loss": 2.6066, + "theoretical_loss": 3.5248129548735534, + "tokens_seen": 1458499584 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028188565697091273, + "loss": 2.616, + "theoretical_loss": 3.524798915423392, + "tokens_seen": 1458565120 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028187562688064197, + "loss": 2.5804, + "theoretical_loss": 3.5247848767806538, + "tokens_seen": 1458630656 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002818655967903711, + "loss": 2.7479, + "theoretical_loss": 3.524770838945258, + "tokens_seen": 1458696192 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028185556670010033, + "loss": 2.4494, + "theoretical_loss": 3.5247568019171203, + "tokens_seen": 1458761728 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028184553660982946, + "loss": 2.7117, + "theoretical_loss": 3.5247427656961587, + "tokens_seen": 1458827264 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002818355065195587, + "loss": 2.6518, + "theoretical_loss": 3.5247287302822903, + "tokens_seen": 1458892800 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002818254764292879, + "loss": 2.5601, + "theoretical_loss": 3.5247146956754323, + "tokens_seen": 1458958336 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028181544633901705, + "loss": 2.1958, + "theoretical_loss": 3.524700661875503, + "tokens_seen": 1459023872 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028180541624874624, + "loss": 2.5015, + "theoretical_loss": 3.524686628882418, + "tokens_seen": 1459089408 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002817953861584754, + "loss": 2.5364, + "theoretical_loss": 3.524672596696097, + "tokens_seen": 1459154944 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002817853560682046, + "loss": 2.6715, + "theoretical_loss": 3.524658565316455, + "tokens_seen": 1459220480 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028177532597793383, + "loss": 2.4276, + "theoretical_loss": 3.524644534743411, + "tokens_seen": 1459286016 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028176529588766296, + "loss": 2.3955, + "theoretical_loss": 3.524630504976882, + "tokens_seen": 1459351552 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002817552657973922, + "loss": 2.4853, + "theoretical_loss": 3.524616476016785, + "tokens_seen": 1459417088 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002817452357071214, + "loss": 2.2738, + "theoretical_loss": 3.524602447863038, + "tokens_seen": 1459482624 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028173520561685056, + "loss": 2.4406, + "theoretical_loss": 3.524588420515559, + "tokens_seen": 1459548160 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028172517552657974, + "loss": 2.3551, + "theoretical_loss": 3.5245743939742633, + "tokens_seen": 1459613696 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002817151454363089, + "loss": 2.4789, + "theoretical_loss": 3.5245603682390705, + "tokens_seen": 1459679232 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002817051153460381, + "loss": 2.8512, + "theoretical_loss": 3.524546343309897, + "tokens_seen": 1459744768 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1597246, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.754894256591797, + "objective/train/theoretical_loss": 3.524532319186661, + "objective/train/tokens_used": 1480270304, + "theoretical_loss": 3.524532319186661, + "tokens_seen": 1459810304 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028169508525576734, + "loss": 2.6619, + "theoretical_loss": 3.524532319186661, + "tokens_seen": 1459810304 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028168505516549646, + "loss": 2.5364, + "theoretical_loss": 3.52451829586928, + "tokens_seen": 1459875840 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002816750250752257, + "loss": 2.5277, + "theoretical_loss": 3.52450427335767, + "tokens_seen": 1459941376 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002816649949849548, + "loss": 2.5471, + "theoretical_loss": 3.5244902516517507, + "tokens_seen": 1460006912 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028165496489468406, + "loss": 2.6017, + "theoretical_loss": 3.5244762307514383, + "tokens_seen": 1460072448 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028164493480441324, + "loss": 2.6548, + "theoretical_loss": 3.524462210656651, + "tokens_seen": 1460137984 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002816349047141424, + "loss": 2.3979, + "theoretical_loss": 3.5244481913673056, + "tokens_seen": 1460203520 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002816248746238716, + "loss": 2.5545, + "theoretical_loss": 3.5244341728833204, + "tokens_seen": 1460269056 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002816148445336008, + "loss": 2.6222, + "theoretical_loss": 3.524420155204613, + "tokens_seen": 1460334592 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028160481444332997, + "loss": 2.2757, + "theoretical_loss": 3.5244061383311003, + "tokens_seen": 1460400128 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002815947843530592, + "loss": 2.5474, + "theoretical_loss": 3.5243921222627006, + "tokens_seen": 1460465664 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028158475426278833, + "loss": 2.491, + "theoretical_loss": 3.524378106999331, + "tokens_seen": 1460531200 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028157472417251756, + "loss": 2.7228, + "theoretical_loss": 3.5243640925409103, + "tokens_seen": 1460596736 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028156469408224674, + "loss": 2.6113, + "theoretical_loss": 3.5243500788873545, + "tokens_seen": 1460662272 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002815546639919759, + "loss": 2.8053, + "theoretical_loss": 3.5243360660385825, + "tokens_seen": 1460727808 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002815446339017051, + "loss": 2.6721, + "theoretical_loss": 3.5243220539945113, + "tokens_seen": 1460793344 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002815346038114343, + "loss": 2.3533, + "theoretical_loss": 3.5243080427550586, + "tokens_seen": 1460858880 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002815245737211635, + "loss": 2.5673, + "theoretical_loss": 3.524294032320143, + "tokens_seen": 1460924416 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002815145436308927, + "loss": 2.4373, + "theoretical_loss": 3.5242800226896813, + "tokens_seen": 1460989952 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002815045135406219, + "loss": 2.5728, + "theoretical_loss": 3.524266013863591, + "tokens_seen": 1461055488 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028149448345035107, + "loss": 2.708, + "theoretical_loss": 3.5242520058417908, + "tokens_seen": 1461121024 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028148445336008025, + "loss": 2.3159, + "theoretical_loss": 3.524237998624198, + "tokens_seen": 1461186560 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028147442326980943, + "loss": 2.5539, + "theoretical_loss": 3.5242239922107306, + "tokens_seen": 1461252096 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028146439317953866, + "loss": 2.1944, + "theoretical_loss": 3.5242099866013055, + "tokens_seen": 1461317632 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002814543630892678, + "loss": 2.387, + "theoretical_loss": 3.5241959817958413, + "tokens_seen": 1461383168 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1598035, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9441568851470947, + "objective/train/theoretical_loss": 3.524181977794256, + "objective/train/tokens_used": 1481908704, + "theoretical_loss": 3.524181977794256, + "tokens_seen": 1461448704 + }, + { + "epoch": 4.08, + "learning_rate": 0.000281444332998997, + "loss": 2.757, + "theoretical_loss": 3.524181977794256, + "tokens_seen": 1461448704 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002814343029087262, + "loss": 2.4984, + "theoretical_loss": 3.5241679745964665, + "tokens_seen": 1461514240 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002814242728184554, + "loss": 2.6137, + "theoretical_loss": 3.524153972202391, + "tokens_seen": 1461579776 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028141424272818457, + "loss": 2.5324, + "theoretical_loss": 3.524139970611948, + "tokens_seen": 1461645312 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028140421263791375, + "loss": 2.6002, + "theoretical_loss": 3.5241259698250547, + "tokens_seen": 1461710848 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028139418254764293, + "loss": 2.7401, + "theoretical_loss": 3.5241119698416288, + "tokens_seen": 1461776384 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028138415245737217, + "loss": 2.7361, + "theoretical_loss": 3.5240979706615887, + "tokens_seen": 1461841920 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002813741223671013, + "loss": 2.6632, + "theoretical_loss": 3.5240839722848523, + "tokens_seen": 1461907456 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028136409227683053, + "loss": 2.4013, + "theoretical_loss": 3.5240699747113364, + "tokens_seen": 1461972992 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028135406218655966, + "loss": 2.5163, + "theoretical_loss": 3.524055977940961, + "tokens_seen": 1462038528 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002813440320962889, + "loss": 2.5318, + "theoretical_loss": 3.5240419819736415, + "tokens_seen": 1462104064 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002813340020060181, + "loss": 2.6787, + "theoretical_loss": 3.5240279868092976, + "tokens_seen": 1462169600 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028132397191574725, + "loss": 2.6692, + "theoretical_loss": 3.524013992447847, + "tokens_seen": 1462235136 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028131394182547644, + "loss": 2.4938, + "theoretical_loss": 3.5239999988892077, + "tokens_seen": 1462300672 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002813039117352056, + "loss": 2.5738, + "theoretical_loss": 3.5239860061332964, + "tokens_seen": 1462366208 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002812938816449348, + "loss": 2.5126, + "theoretical_loss": 3.523972014180033, + "tokens_seen": 1462431744 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028128385155466403, + "loss": 2.4717, + "theoretical_loss": 3.5239580230293335, + "tokens_seen": 1462497280 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028127382146439316, + "loss": 2.6733, + "theoretical_loss": 3.5239440326811176, + "tokens_seen": 1462562816 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002812637913741224, + "loss": 2.6682, + "theoretical_loss": 3.5239300431353024, + "tokens_seen": 1462628352 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002812537612838516, + "loss": 2.5902, + "theoretical_loss": 3.5239160543918064, + "tokens_seen": 1462693888 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028124373119358076, + "loss": 2.415, + "theoretical_loss": 3.523902066450548, + "tokens_seen": 1462759424 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028123370110330994, + "loss": 2.5721, + "theoretical_loss": 3.523888079311444, + "tokens_seen": 1462824960 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002812236710130391, + "loss": 2.4526, + "theoretical_loss": 3.5238740929744132, + "tokens_seen": 1462890496 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002812136409227683, + "loss": 2.51, + "theoretical_loss": 3.523860107439374, + "tokens_seen": 1462956032 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028120361083249754, + "loss": 2.5591, + "theoretical_loss": 3.523846122706243, + "tokens_seen": 1463021568 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7301270961761475, + "objective/train/theoretical_loss": 3.5238321387749405, + "objective/train/tokens_used": 1483547104, + "theoretical_loss": 3.5238321387749405, + "tokens_seen": 1463087104 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028119358074222666, + "loss": 2.5086, + "theoretical_loss": 3.5238321387749405, + "tokens_seen": 1463087104 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002811835506519559, + "loss": 2.6898, + "theoretical_loss": 3.523818155645383, + "tokens_seen": 1463152640 + }, + { + "epoch": 4.08, + "learning_rate": 0.000281173520561685, + "loss": 2.3801, + "theoretical_loss": 3.5238041733174894, + "tokens_seen": 1463218176 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028116349047141426, + "loss": 2.4418, + "theoretical_loss": 3.523790191791178, + "tokens_seen": 1463283712 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028115346038114344, + "loss": 2.4984, + "theoretical_loss": 3.523776211066366, + "tokens_seen": 1463349248 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002811434302908726, + "loss": 2.51, + "theoretical_loss": 3.5237622311429724, + "tokens_seen": 1463414784 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002811334002006018, + "loss": 2.6372, + "theoretical_loss": 3.5237482520209147, + "tokens_seen": 1463480320 + }, + { + "epoch": 4.08, + "learning_rate": 0.000281123370110331, + "loss": 2.7331, + "theoretical_loss": 3.523734273700112, + "tokens_seen": 1463545856 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028111334002006017, + "loss": 2.3877, + "theoretical_loss": 3.5237202961804814, + "tokens_seen": 1463611392 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002811033099297894, + "loss": 2.3393, + "theoretical_loss": 3.523706319461942, + "tokens_seen": 1463676928 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028109327983951853, + "loss": 2.6241, + "theoretical_loss": 3.523692343544412, + "tokens_seen": 1463742464 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028108324974924776, + "loss": 2.5906, + "theoretical_loss": 3.5236783684278086, + "tokens_seen": 1463808000 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028107321965897695, + "loss": 2.1827, + "theoretical_loss": 3.5236643941120516, + "tokens_seen": 1463873536 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002810631895687061, + "loss": 2.8091, + "theoretical_loss": 3.5236504205970576, + "tokens_seen": 1463939072 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002810531594784353, + "loss": 2.621, + "theoretical_loss": 3.523636447882746, + "tokens_seen": 1464004608 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002810431293881645, + "loss": 2.3639, + "theoretical_loss": 3.5236224759690353, + "tokens_seen": 1464070144 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028103309929789367, + "loss": 2.708, + "theoretical_loss": 3.5236085048558428, + "tokens_seen": 1464135680 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002810230692076229, + "loss": 2.3876, + "theoretical_loss": 3.5235945345430872, + "tokens_seen": 1464201216 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028101303911735203, + "loss": 2.7517, + "theoretical_loss": 3.523580565030687, + "tokens_seen": 1464266752 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028100300902708127, + "loss": 2.5516, + "theoretical_loss": 3.5235665963185605, + "tokens_seen": 1464332288 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002809929789368104, + "loss": 2.5998, + "theoretical_loss": 3.523552628406626, + "tokens_seen": 1464397824 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028098294884653963, + "loss": 2.6177, + "theoretical_loss": 3.5235386612948014, + "tokens_seen": 1464463360 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002809729187562688, + "loss": 2.5494, + "theoretical_loss": 3.5235246949830064, + "tokens_seen": 1464528896 + }, + { + "epoch": 4.08, + "learning_rate": 0.000280962888665998, + "loss": 2.74, + "theoretical_loss": 3.5235107294711576, + "tokens_seen": 1464594432 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028095285857572717, + "loss": 2.7292, + "theoretical_loss": 3.523496764759175, + "tokens_seen": 1464659968 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.211653470993042, + "objective/train/theoretical_loss": 3.523482800846976, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.523482800846976, + "tokens_seen": 1464725504 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028094282848545635, + "loss": 2.2415, + "theoretical_loss": 3.523482800846976, + "tokens_seen": 1464725504 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028093279839518553, + "loss": 2.4344, + "theoretical_loss": 3.5234688377344785, + "tokens_seen": 1464791040 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028092276830491477, + "loss": 2.6155, + "theoretical_loss": 3.5234548754216024, + "tokens_seen": 1464856576 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002809127382146439, + "loss": 2.6634, + "theoretical_loss": 3.5234409139082654, + "tokens_seen": 1464922112 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028090270812437313, + "loss": 2.5183, + "theoretical_loss": 3.5234269531943863, + "tokens_seen": 1464987648 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002808926780341023, + "loss": 2.6951, + "theoretical_loss": 3.5234129932798828, + "tokens_seen": 1465053184 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002808826479438315, + "loss": 2.7359, + "theoretical_loss": 3.5233990341646737, + "tokens_seen": 1465118720 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002808726178535607, + "loss": 2.5248, + "theoretical_loss": 3.5233850758486778, + "tokens_seen": 1465184256 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028086258776328986, + "loss": 2.6991, + "theoretical_loss": 3.5233711183318137, + "tokens_seen": 1465249792 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028085255767301904, + "loss": 2.4007, + "theoretical_loss": 3.523357161613999, + "tokens_seen": 1465315328 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002808425275827483, + "loss": 2.5959, + "theoretical_loss": 3.523343205695153, + "tokens_seen": 1465380864 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002808324974924774, + "loss": 2.7473, + "theoretical_loss": 3.5233292505751943, + "tokens_seen": 1465446400 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028082246740220664, + "loss": 2.5896, + "theoretical_loss": 3.523315296254041, + "tokens_seen": 1465511936 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028081243731193576, + "loss": 2.8018, + "theoretical_loss": 3.523301342731612, + "tokens_seen": 1465577472 + }, + { + "epoch": 4.08, + "learning_rate": 0.000280802407221665, + "loss": 2.252, + "theoretical_loss": 3.523287390007826, + "tokens_seen": 1465643008 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002807923771313942, + "loss": 2.6345, + "theoretical_loss": 3.523273438082601, + "tokens_seen": 1465708544 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028078234704112336, + "loss": 2.6385, + "theoretical_loss": 3.5232594869558556, + "tokens_seen": 1465774080 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002807723169508526, + "loss": 2.603, + "theoretical_loss": 3.5232455366275093, + "tokens_seen": 1465839616 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002807622868605818, + "loss": 2.5463, + "theoretical_loss": 3.52323158709748, + "tokens_seen": 1465905152 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028075225677031096, + "loss": 2.6577, + "theoretical_loss": 3.5232176383656864, + "tokens_seen": 1465970688 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028074222668004014, + "loss": 2.6319, + "theoretical_loss": 3.523203690432047, + "tokens_seen": 1466036224 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002807321965897693, + "loss": 2.1931, + "theoretical_loss": 3.5231897432964807, + "tokens_seen": 1466101760 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002807221664994985, + "loss": 2.6428, + "theoretical_loss": 3.523175796958906, + "tokens_seen": 1466167296 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028071213640922774, + "loss": 2.5598, + "theoretical_loss": 3.5231618514192427, + "tokens_seen": 1466232832 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028070210631895686, + "loss": 2.7383, + "theoretical_loss": 3.5231479066774076, + "tokens_seen": 1466298368 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4567415714263916, + "objective/train/theoretical_loss": 3.5231339627333202, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.5231339627333202, + "tokens_seen": 1466363904 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002806920762286861, + "loss": 2.4262, + "theoretical_loss": 3.5231339627333202, + "tokens_seen": 1466363904 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002806820461384152, + "loss": 2.5823, + "theoretical_loss": 3.5231200195868997, + "tokens_seen": 1466429440 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028067201604814446, + "loss": 2.5947, + "theoretical_loss": 3.523106077238064, + "tokens_seen": 1466494976 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028066198595787364, + "loss": 2.4495, + "theoretical_loss": 3.523092135686732, + "tokens_seen": 1466560512 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002806519558676028, + "loss": 2.3593, + "theoretical_loss": 3.5230781949328236, + "tokens_seen": 1466626048 + }, + { + "epoch": 4.08, + "learning_rate": 0.000280641925777332, + "loss": 2.6452, + "theoretical_loss": 3.523064254976256, + "tokens_seen": 1466691584 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002806318956870612, + "loss": 2.5566, + "theoretical_loss": 3.523050315816949, + "tokens_seen": 1466757120 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028062186559679037, + "loss": 2.4694, + "theoretical_loss": 3.5230363774548206, + "tokens_seen": 1466822656 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002806118355065196, + "loss": 2.4054, + "theoretical_loss": 3.5230224398897905, + "tokens_seen": 1466888192 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028060180541624873, + "loss": 2.8066, + "theoretical_loss": 3.5230085031217766, + "tokens_seen": 1466953728 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028059177532597796, + "loss": 2.6842, + "theoretical_loss": 3.5229945671506986, + "tokens_seen": 1467019264 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028058174523570715, + "loss": 2.6991, + "theoretical_loss": 3.5229806319764743, + "tokens_seen": 1467084800 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002805717151454363, + "loss": 2.5365, + "theoretical_loss": 3.5229666975990233, + "tokens_seen": 1467150336 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002805616850551655, + "loss": 2.4685, + "theoretical_loss": 3.5229527640182647, + "tokens_seen": 1467215872 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002805516549648947, + "loss": 2.8453, + "theoretical_loss": 3.522938831234116, + "tokens_seen": 1467281408 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028054162487462387, + "loss": 2.7806, + "theoretical_loss": 3.522924899246498, + "tokens_seen": 1467346944 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002805315947843531, + "loss": 2.5334, + "theoretical_loss": 3.5229109680553283, + "tokens_seen": 1467412480 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028052156469408223, + "loss": 2.609, + "theoretical_loss": 3.5228970376605258, + "tokens_seen": 1467478016 + }, + { + "epoch": 4.08, + "learning_rate": 0.00028051153460381147, + "loss": 2.3968, + "theoretical_loss": 3.52288310806201, + "tokens_seen": 1467543552 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002805015045135406, + "loss": 2.8006, + "theoretical_loss": 3.5228691792596996, + "tokens_seen": 1467609088 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028049147442326983, + "loss": 2.5682, + "theoretical_loss": 3.5228552512535134, + "tokens_seen": 1467674624 + }, + { + "epoch": 4.09, + "learning_rate": 0.000280481444332999, + "loss": 2.7127, + "theoretical_loss": 3.5228413240433705, + "tokens_seen": 1467740160 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002804714142427282, + "loss": 2.2954, + "theoretical_loss": 3.5228273976291895, + "tokens_seen": 1467805696 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028046138415245737, + "loss": 2.8381, + "theoretical_loss": 3.52281347201089, + "tokens_seen": 1467871232 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028045135406218655, + "loss": 2.4445, + "theoretical_loss": 3.5227995471883906, + "tokens_seen": 1467936768 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5689315795898438, + "objective/train/theoretical_loss": 3.52278562316161, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.52278562316161, + "tokens_seen": 1468002304 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028044132397191574, + "loss": 2.6411, + "theoretical_loss": 3.52278562316161, + "tokens_seen": 1468002304 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028043129388164497, + "loss": 2.5643, + "theoretical_loss": 3.522771699930468, + "tokens_seen": 1468067840 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002804212637913741, + "loss": 2.7135, + "theoretical_loss": 3.522757777494883, + "tokens_seen": 1468133376 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028041123370110333, + "loss": 2.7565, + "theoretical_loss": 3.522743855854774, + "tokens_seen": 1468198912 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002804012036108325, + "loss": 2.6198, + "theoretical_loss": 3.5227299350100605, + "tokens_seen": 1468264448 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002803911735205617, + "loss": 2.6975, + "theoretical_loss": 3.522716014960661, + "tokens_seen": 1468329984 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002803811434302909, + "loss": 2.4973, + "theoretical_loss": 3.522702095706496, + "tokens_seen": 1468395520 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028037111334002006, + "loss": 2.5945, + "theoretical_loss": 3.522688177247482, + "tokens_seen": 1468461056 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028036108324974924, + "loss": 2.6434, + "theoretical_loss": 3.5226742595835407, + "tokens_seen": 1468526592 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002803510531594785, + "loss": 2.4662, + "theoretical_loss": 3.5226603427145893, + "tokens_seen": 1468592128 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002803410230692076, + "loss": 2.6721, + "theoretical_loss": 3.5226464266405477, + "tokens_seen": 1468657664 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028033099297893684, + "loss": 2.4547, + "theoretical_loss": 3.5226325113613353, + "tokens_seen": 1468723200 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028032096288866596, + "loss": 2.518, + "theoretical_loss": 3.522618596876871, + "tokens_seen": 1468788736 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002803109327983952, + "loss": 2.4428, + "theoretical_loss": 3.522604683187074, + "tokens_seen": 1468854272 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002803009027081244, + "loss": 2.8113, + "theoretical_loss": 3.5225907702918633, + "tokens_seen": 1468919808 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028029087261785356, + "loss": 2.336, + "theoretical_loss": 3.5225768581911576, + "tokens_seen": 1468985344 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028028084252758274, + "loss": 2.7046, + "theoretical_loss": 3.522562946884877, + "tokens_seen": 1469050880 + }, + { + "epoch": 4.09, + "learning_rate": 0.000280270812437312, + "loss": 2.4133, + "theoretical_loss": 3.5225490363729404, + "tokens_seen": 1469116416 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002802607823470411, + "loss": 2.4705, + "theoretical_loss": 3.522535126655267, + "tokens_seen": 1469181952 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028025075225677034, + "loss": 2.4143, + "theoretical_loss": 3.522521217731776, + "tokens_seen": 1469247488 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028024072216649947, + "loss": 2.7378, + "theoretical_loss": 3.5225073096023864, + "tokens_seen": 1469313024 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002802306920762287, + "loss": 2.5421, + "theoretical_loss": 3.5224934022670173, + "tokens_seen": 1469378560 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002802206619859579, + "loss": 2.4663, + "theoretical_loss": 3.5224794957255887, + "tokens_seen": 1469444096 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028021063189568706, + "loss": 2.6527, + "theoretical_loss": 3.5224655899780197, + "tokens_seen": 1469509632 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028020060180541624, + "loss": 2.5792, + "theoretical_loss": 3.5224516850242287, + "tokens_seen": 1469575168 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.003769636154175, + "objective/train/theoretical_loss": 3.522437780864136, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.522437780864136, + "tokens_seen": 1469640704 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002801905717151454, + "loss": 2.6305, + "theoretical_loss": 3.522437780864136, + "tokens_seen": 1469640704 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002801805416248746, + "loss": 2.6325, + "theoretical_loss": 3.5224238774976606, + "tokens_seen": 1469706240 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028017051153460384, + "loss": 2.5469, + "theoretical_loss": 3.5224099749247215, + "tokens_seen": 1469771776 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028016048144433297, + "loss": 2.7021, + "theoretical_loss": 3.5223960731452384, + "tokens_seen": 1469837312 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002801504513540622, + "loss": 2.4954, + "theoretical_loss": 3.522382172159131, + "tokens_seen": 1469902848 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028014042126379133, + "loss": 2.6064, + "theoretical_loss": 3.522368271966317, + "tokens_seen": 1469968384 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028013039117352057, + "loss": 2.6808, + "theoretical_loss": 3.522354372566718, + "tokens_seen": 1470033920 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028012036108324975, + "loss": 2.8744, + "theoretical_loss": 3.522340473960252, + "tokens_seen": 1470099456 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028011033099297893, + "loss": 2.5823, + "theoretical_loss": 3.5223265761468388, + "tokens_seen": 1470164992 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002801003009027081, + "loss": 2.361, + "theoretical_loss": 3.5223126791263977, + "tokens_seen": 1470230528 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028009027081243735, + "loss": 2.6317, + "theoretical_loss": 3.522298782898848, + "tokens_seen": 1470296064 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028008024072216647, + "loss": 2.5516, + "theoretical_loss": 3.5222848874641093, + "tokens_seen": 1470361600 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002800702106318957, + "loss": 2.6678, + "theoretical_loss": 3.5222709928221008, + "tokens_seen": 1470427136 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028006018054162483, + "loss": 2.4722, + "theoretical_loss": 3.522257098972742, + "tokens_seen": 1470492672 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028005015045135407, + "loss": 2.779, + "theoretical_loss": 3.522243205915953, + "tokens_seen": 1470558208 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028004012036108325, + "loss": 2.6158, + "theoretical_loss": 3.5222293136516525, + "tokens_seen": 1470623744 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028003009027081243, + "loss": 2.41, + "theoretical_loss": 3.52221542217976, + "tokens_seen": 1470689280 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028002006018054167, + "loss": 2.4115, + "theoretical_loss": 3.5222015315001958, + "tokens_seen": 1470754816 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002800100300902708, + "loss": 2.6044, + "theoretical_loss": 3.522187641612878, + "tokens_seen": 1470820352 + }, + { + "epoch": 4.09, + "learning_rate": 0.00028000000000000003, + "loss": 2.3933, + "theoretical_loss": 3.5221737525177277, + "tokens_seen": 1470885888 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002799899699097292, + "loss": 2.5452, + "theoretical_loss": 3.5221598642146636, + "tokens_seen": 1470951424 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002799799398194584, + "loss": 2.4104, + "theoretical_loss": 3.522145976703605, + "tokens_seen": 1471016960 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002799699097291876, + "loss": 2.4964, + "theoretical_loss": 3.522132089984472, + "tokens_seen": 1471082496 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027995987963891675, + "loss": 2.5139, + "theoretical_loss": 3.522118204057184, + "tokens_seen": 1471148032 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027994984954864594, + "loss": 2.5464, + "theoretical_loss": 3.5221043189216603, + "tokens_seen": 1471213568 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.143112897872925, + "objective/train/theoretical_loss": 3.5220904345778203, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.5220904345778203, + "tokens_seen": 1471279104 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027993981945837517, + "loss": 2.7808, + "theoretical_loss": 3.5220904345778203, + "tokens_seen": 1471279104 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002799297893681043, + "loss": 2.6289, + "theoretical_loss": 3.5220765510255845, + "tokens_seen": 1471344640 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027991975927783353, + "loss": 2.6422, + "theoretical_loss": 3.522062668264872, + "tokens_seen": 1471410176 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002799097291875627, + "loss": 2.5013, + "theoretical_loss": 3.5220487862956027, + "tokens_seen": 1471475712 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002798996990972919, + "loss": 2.5409, + "theoretical_loss": 3.522034905117695, + "tokens_seen": 1471541248 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002798896690070211, + "loss": 2.5736, + "theoretical_loss": 3.52202102473107, + "tokens_seen": 1471606784 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027987963891675026, + "loss": 2.7528, + "theoretical_loss": 3.5220071451356474, + "tokens_seen": 1471672320 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027986960882647944, + "loss": 2.2941, + "theoretical_loss": 3.5219932663313456, + "tokens_seen": 1471737856 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002798595787362087, + "loss": 2.6457, + "theoretical_loss": 3.5219793883180857, + "tokens_seen": 1471803392 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002798495486459378, + "loss": 2.5519, + "theoretical_loss": 3.5219655110957864, + "tokens_seen": 1471868928 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027983951855566704, + "loss": 2.6568, + "theoretical_loss": 3.521951634664368, + "tokens_seen": 1471934464 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027982948846539616, + "loss": 2.4972, + "theoretical_loss": 3.521937759023749, + "tokens_seen": 1472000000 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002798194583751254, + "loss": 2.4951, + "theoretical_loss": 3.5219238841738507, + "tokens_seen": 1472065536 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002798094282848546, + "loss": 2.6323, + "theoretical_loss": 3.521910010114593, + "tokens_seen": 1472131072 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027979939819458376, + "loss": 2.7124, + "theoretical_loss": 3.5218961368458936, + "tokens_seen": 1472196608 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027978936810431294, + "loss": 2.4036, + "theoretical_loss": 3.521882264367674, + "tokens_seen": 1472262144 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002797793380140422, + "loss": 2.3475, + "theoretical_loss": 3.5218683926798535, + "tokens_seen": 1472327680 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002797693079237713, + "loss": 2.6737, + "theoretical_loss": 3.521854521782352, + "tokens_seen": 1472393216 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027975927783350054, + "loss": 2.61, + "theoretical_loss": 3.5218406516750886, + "tokens_seen": 1472458752 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027974924774322967, + "loss": 2.4702, + "theoretical_loss": 3.5218267823579845, + "tokens_seen": 1472524288 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002797392176529589, + "loss": 2.5781, + "theoretical_loss": 3.521812913830958, + "tokens_seen": 1472589824 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002797291875626881, + "loss": 2.704, + "theoretical_loss": 3.52179904609393, + "tokens_seen": 1472655360 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027971915747241726, + "loss": 2.5015, + "theoretical_loss": 3.52178517914682, + "tokens_seen": 1472720896 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027970912738214644, + "loss": 2.4233, + "theoretical_loss": 3.521771312989548, + "tokens_seen": 1472786432 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002796990972918756, + "loss": 2.7242, + "theoretical_loss": 3.5217574476220337, + "tokens_seen": 1472851968 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5638537406921387, + "objective/train/theoretical_loss": 3.5217435830441968, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.5217435830441968, + "tokens_seen": 1472917504 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002796890672016048, + "loss": 2.5489, + "theoretical_loss": 3.5217435830441968, + "tokens_seen": 1472917504 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027967903711133404, + "loss": 2.5476, + "theoretical_loss": 3.521729719255957, + "tokens_seen": 1472983040 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027966900702106317, + "loss": 2.3642, + "theoretical_loss": 3.5217158562572353, + "tokens_seen": 1473048576 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002796589769307924, + "loss": 2.4547, + "theoretical_loss": 3.521701994047951, + "tokens_seen": 1473114112 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027964894684052153, + "loss": 2.5487, + "theoretical_loss": 3.5216881326280234, + "tokens_seen": 1473179648 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027963891675025077, + "loss": 2.819, + "theoretical_loss": 3.521674271997373, + "tokens_seen": 1473245184 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027962888665997995, + "loss": 2.6273, + "theoretical_loss": 3.52166041215592, + "tokens_seen": 1473310720 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027961885656970913, + "loss": 2.664, + "theoretical_loss": 3.521646553103584, + "tokens_seen": 1473376256 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002796088264794383, + "loss": 2.4287, + "theoretical_loss": 3.521632694840285, + "tokens_seen": 1473441792 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027959879638916755, + "loss": 2.3851, + "theoretical_loss": 3.521618837365943, + "tokens_seen": 1473507328 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027958876629889667, + "loss": 2.4027, + "theoretical_loss": 3.521604980680478, + "tokens_seen": 1473572864 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002795787362086259, + "loss": 2.6463, + "theoretical_loss": 3.5215911247838103, + "tokens_seen": 1473638400 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027956870611835503, + "loss": 2.3324, + "theoretical_loss": 3.5215772696758596, + "tokens_seen": 1473703936 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027955867602808427, + "loss": 2.2111, + "theoretical_loss": 3.521563415356546, + "tokens_seen": 1473769472 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027954864593781345, + "loss": 2.3678, + "theoretical_loss": 3.5215495618257897, + "tokens_seen": 1473835008 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027953861584754263, + "loss": 2.5052, + "theoretical_loss": 3.52153570908351, + "tokens_seen": 1473900544 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002795285857572718, + "loss": 2.6186, + "theoretical_loss": 3.5215218571296285, + "tokens_seen": 1473966080 + }, + { + "epoch": 4.09, + "learning_rate": 0.000279518555667001, + "loss": 2.4767, + "theoretical_loss": 3.5215080059640638, + "tokens_seen": 1474031616 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002795085255767302, + "loss": 2.3363, + "theoretical_loss": 3.5214941555867365, + "tokens_seen": 1474097152 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002794984954864594, + "loss": 2.7143, + "theoretical_loss": 3.5214803059975672, + "tokens_seen": 1474162688 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027948846539618854, + "loss": 2.4726, + "theoretical_loss": 3.521466457196475, + "tokens_seen": 1474228224 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002794784353059178, + "loss": 2.7101, + "theoretical_loss": 3.521452609183381, + "tokens_seen": 1474293760 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002794684052156469, + "loss": 2.31, + "theoretical_loss": 3.521438761958205, + "tokens_seen": 1474359296 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027945837512537614, + "loss": 2.6026, + "theoretical_loss": 3.521424915520867, + "tokens_seen": 1474424832 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002794483450351053, + "loss": 2.5403, + "theoretical_loss": 3.521411069871287, + "tokens_seen": 1474490368 + }, + { + "debugging/Self-BLEU-5": 0.40022323577174246, + "debugging/distinct-1-grams": 0.792449235368197, + "debugging/distinct-2-grams": 0.964039917418843, + "debugging/entropy-1-grams": 5.830051148996791, + "debugging/entropy-2-grams": 6.574183943619101, + "debugging/length": 506.7, + "debugging/num_segments": 10, + "debugging/score": 0.007128608811555618, + "debugging/score_std": 0.008951280671415335, + "epoch": 4.09, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.455482244491577, + "objective/train/theoretical_loss": 3.5213972250093857, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.5213972250093857, + "tokens_seen": 1474555904 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002794383149448345, + "loss": 2.5027, + "theoretical_loss": 3.5213972250093857, + "tokens_seen": 1474555904 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002794282848545637, + "loss": 2.409, + "theoretical_loss": 3.5213833809350827, + "tokens_seen": 1474621440 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002794182547642929, + "loss": 2.7487, + "theoretical_loss": 3.5213695376482983, + "tokens_seen": 1474686976 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027940822467402204, + "loss": 2.3847, + "theoretical_loss": 3.5213556951489533, + "tokens_seen": 1474752512 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002793981945837513, + "loss": 2.4294, + "theoretical_loss": 3.521341853436968, + "tokens_seen": 1474818048 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002793881644934804, + "loss": 2.7022, + "theoretical_loss": 3.521328012512261, + "tokens_seen": 1474883584 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027937813440320964, + "loss": 2.4663, + "theoretical_loss": 3.5213141723747543, + "tokens_seen": 1474949120 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002793681043129388, + "loss": 2.4916, + "theoretical_loss": 3.5213003330243673, + "tokens_seen": 1475014656 + }, + { + "epoch": 4.09, + "learning_rate": 0.000279358074222668, + "loss": 2.5263, + "theoretical_loss": 3.521286494461021, + "tokens_seen": 1475080192 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002793480441323972, + "loss": 2.5372, + "theoretical_loss": 3.521272656684635, + "tokens_seen": 1475145728 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027933801404212636, + "loss": 2.4316, + "theoretical_loss": 3.5212588196951295, + "tokens_seen": 1475211264 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027932798395185554, + "loss": 2.3126, + "theoretical_loss": 3.5212449834924255, + "tokens_seen": 1475276800 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002793179538615848, + "loss": 2.3813, + "theoretical_loss": 3.5212311480764424, + "tokens_seen": 1475342336 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002793079237713139, + "loss": 2.8092, + "theoretical_loss": 3.521217313447101, + "tokens_seen": 1475407872 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027929789368104314, + "loss": 2.5193, + "theoretical_loss": 3.5212034796043223, + "tokens_seen": 1475473408 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027928786359077227, + "loss": 2.5998, + "theoretical_loss": 3.5211896465480255, + "tokens_seen": 1475538944 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002792778335005015, + "loss": 2.4218, + "theoretical_loss": 3.5211758142781315, + "tokens_seen": 1475604480 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027926780341023074, + "loss": 2.5907, + "theoretical_loss": 3.5211619827945606, + "tokens_seen": 1475670016 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027925777331995987, + "loss": 2.3143, + "theoretical_loss": 3.5211481520972336, + "tokens_seen": 1475735552 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002792477432296891, + "loss": 2.8273, + "theoretical_loss": 3.52113432218607, + "tokens_seen": 1475801088 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002792377131394183, + "loss": 2.5817, + "theoretical_loss": 3.521120493060991, + "tokens_seen": 1475866624 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027922768304914746, + "loss": 2.3576, + "theoretical_loss": 3.5211066647219162, + "tokens_seen": 1475932160 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027921765295887664, + "loss": 2.3519, + "theoretical_loss": 3.521092837168767, + "tokens_seen": 1475997696 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002792076228686058, + "loss": 2.62, + "theoretical_loss": 3.521079010401463, + "tokens_seen": 1476063232 + }, + { + "epoch": 4.09, + "learning_rate": 0.000279197592778335, + "loss": 2.4679, + "theoretical_loss": 3.5210651844199257, + "tokens_seen": 1476128768 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.234206199645996, + "objective/train/theoretical_loss": 3.5210513592240744, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.5210513592240744, + "tokens_seen": 1476194304 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027918756268806424, + "loss": 2.6296, + "theoretical_loss": 3.5210513592240744, + "tokens_seen": 1476194304 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027917753259779337, + "loss": 2.5854, + "theoretical_loss": 3.5210375348138303, + "tokens_seen": 1476259840 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002791675025075226, + "loss": 2.4903, + "theoretical_loss": 3.5210237111891134, + "tokens_seen": 1476325376 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027915747241725173, + "loss": 2.7127, + "theoretical_loss": 3.5210098883498446, + "tokens_seen": 1476390912 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027914744232698097, + "loss": 2.8816, + "theoretical_loss": 3.520996066295944, + "tokens_seen": 1476456448 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027913741223671015, + "loss": 2.6881, + "theoretical_loss": 3.520982245027333, + "tokens_seen": 1476521984 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027912738214643933, + "loss": 2.446, + "theoretical_loss": 3.5209684245439314, + "tokens_seen": 1476587520 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002791173520561685, + "loss": 2.7502, + "theoretical_loss": 3.5209546048456595, + "tokens_seen": 1476653056 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027910732196589775, + "loss": 2.6565, + "theoretical_loss": 3.520940785932438, + "tokens_seen": 1476718592 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027909729187562687, + "loss": 2.3348, + "theoretical_loss": 3.520926967804188, + "tokens_seen": 1476784128 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002790872617853561, + "loss": 2.6025, + "theoretical_loss": 3.52091315046083, + "tokens_seen": 1476849664 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027907723169508523, + "loss": 2.6091, + "theoretical_loss": 3.5208993339022845, + "tokens_seen": 1476915200 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027906720160481447, + "loss": 2.4213, + "theoretical_loss": 3.5208855181284715, + "tokens_seen": 1476980736 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027905717151454365, + "loss": 2.6044, + "theoretical_loss": 3.5208717031393117, + "tokens_seen": 1477046272 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027904714142427283, + "loss": 2.4268, + "theoretical_loss": 3.520857888934727, + "tokens_seen": 1477111808 + }, + { + "epoch": 4.09, + "learning_rate": 0.000279037111334002, + "loss": 2.4377, + "theoretical_loss": 3.5208440755146366, + "tokens_seen": 1477177344 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002790270812437312, + "loss": 2.4946, + "theoretical_loss": 3.5208302628789614, + "tokens_seen": 1477242880 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002790170511534604, + "loss": 2.3929, + "theoretical_loss": 3.520816451027623, + "tokens_seen": 1477308416 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002790070210631896, + "loss": 2.605, + "theoretical_loss": 3.520802639960541, + "tokens_seen": 1477373952 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027899699097291874, + "loss": 2.2421, + "theoretical_loss": 3.5207888296776364, + "tokens_seen": 1477439488 + }, + { + "epoch": 4.09, + "learning_rate": 0.000278986960882648, + "loss": 2.5311, + "theoretical_loss": 3.52077502017883, + "tokens_seen": 1477505024 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002789769307923771, + "loss": 2.6109, + "theoretical_loss": 3.520761211464043, + "tokens_seen": 1477570560 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027896690070210634, + "loss": 2.516, + "theoretical_loss": 3.520747403533195, + "tokens_seen": 1477636096 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002789568706118355, + "loss": 2.4437, + "theoretical_loss": 3.5207335963862074, + "tokens_seen": 1477701632 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002789468405215647, + "loss": 2.8298, + "theoretical_loss": 3.5207197900230014, + "tokens_seen": 1477767168 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0272233486175537, + "objective/train/theoretical_loss": 3.5207059844434965, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.5207059844434965, + "tokens_seen": 1477832704 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002789368104312939, + "loss": 2.803, + "theoretical_loss": 3.5207059844434965, + "tokens_seen": 1477832704 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002789267803410231, + "loss": 2.6148, + "theoretical_loss": 3.5206921796476145, + "tokens_seen": 1477898240 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027891675025075224, + "loss": 2.6943, + "theoretical_loss": 3.520678375635276, + "tokens_seen": 1477963776 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002789067201604815, + "loss": 2.4597, + "theoretical_loss": 3.5206645724064014, + "tokens_seen": 1478029312 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002788966900702106, + "loss": 2.5187, + "theoretical_loss": 3.5206507699609118, + "tokens_seen": 1478094848 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027888665997993984, + "loss": 2.762, + "theoretical_loss": 3.520636968298728, + "tokens_seen": 1478160384 + }, + { + "epoch": 4.09, + "learning_rate": 0.000278876629889669, + "loss": 2.8494, + "theoretical_loss": 3.5206231674197705, + "tokens_seen": 1478225920 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002788665997993982, + "loss": 2.6576, + "theoretical_loss": 3.520609367323961, + "tokens_seen": 1478291456 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002788565697091274, + "loss": 2.6144, + "theoretical_loss": 3.520595568011219, + "tokens_seen": 1478356992 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027884653961885656, + "loss": 2.4924, + "theoretical_loss": 3.520581769481467, + "tokens_seen": 1478422528 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027883650952858574, + "loss": 2.482, + "theoretical_loss": 3.520567971734624, + "tokens_seen": 1478488064 + }, + { + "epoch": 4.09, + "learning_rate": 0.000278826479438315, + "loss": 2.7568, + "theoretical_loss": 3.5205541747706124, + "tokens_seen": 1478553600 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002788164493480441, + "loss": 2.5784, + "theoretical_loss": 3.5205403785893523, + "tokens_seen": 1478619136 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027880641925777334, + "loss": 2.6064, + "theoretical_loss": 3.5205265831907653, + "tokens_seen": 1478684672 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027879638916750247, + "loss": 2.2802, + "theoretical_loss": 3.520512788574772, + "tokens_seen": 1478750208 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002787863590772317, + "loss": 2.5806, + "theoretical_loss": 3.5204989947412924, + "tokens_seen": 1478815744 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002787763289869609, + "loss": 2.5576, + "theoretical_loss": 3.5204852016902484, + "tokens_seen": 1478881280 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027876629889669007, + "loss": 2.5985, + "theoretical_loss": 3.520471409421561, + "tokens_seen": 1478946816 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027875626880641925, + "loss": 2.5239, + "theoretical_loss": 3.5204576179351506, + "tokens_seen": 1479012352 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002787462387161485, + "loss": 2.4756, + "theoretical_loss": 3.520443827230939, + "tokens_seen": 1479077888 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002787362086258776, + "loss": 2.3017, + "theoretical_loss": 3.520430037308846, + "tokens_seen": 1479143424 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027872617853560685, + "loss": 2.4728, + "theoretical_loss": 3.520416248168794, + "tokens_seen": 1479208960 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027871614844533597, + "loss": 2.5853, + "theoretical_loss": 3.5204024598107027, + "tokens_seen": 1479274496 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002787061183550652, + "loss": 2.6454, + "theoretical_loss": 3.5203886722344935, + "tokens_seen": 1479340032 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002786960882647944, + "loss": 2.5003, + "theoretical_loss": 3.5203748854400883, + "tokens_seen": 1479405568 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.675086259841919, + "objective/train/theoretical_loss": 3.520361099427407, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.520361099427407, + "tokens_seen": 1479471104 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027868605817452357, + "loss": 2.5011, + "theoretical_loss": 3.520361099427407, + "tokens_seen": 1479471104 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027867602808425275, + "loss": 2.553, + "theoretical_loss": 3.5203473141963713, + "tokens_seen": 1479536640 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027866599799398193, + "loss": 2.6432, + "theoretical_loss": 3.5203335297469014, + "tokens_seen": 1479602176 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002786559679037111, + "loss": 2.724, + "theoretical_loss": 3.52031974607892, + "tokens_seen": 1479667712 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027864593781344035, + "loss": 2.5352, + "theoretical_loss": 3.520305963192347, + "tokens_seen": 1479733248 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002786359077231695, + "loss": 2.5688, + "theoretical_loss": 3.520292181087103, + "tokens_seen": 1479798784 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002786258776328987, + "loss": 2.5477, + "theoretical_loss": 3.520278399763111, + "tokens_seen": 1479864320 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002786158475426279, + "loss": 2.4729, + "theoretical_loss": 3.5202646192202898, + "tokens_seen": 1479929856 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027860581745235707, + "loss": 2.7487, + "theoretical_loss": 3.520250839458562, + "tokens_seen": 1479995392 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027859578736208625, + "loss": 2.4694, + "theoretical_loss": 3.520237060477849, + "tokens_seen": 1480060928 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027858575727181544, + "loss": 2.6762, + "theoretical_loss": 3.520223282278071, + "tokens_seen": 1480126464 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002785757271815446, + "loss": 2.7235, + "theoretical_loss": 3.520209504859149, + "tokens_seen": 1480192000 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027856569709127385, + "loss": 2.3901, + "theoretical_loss": 3.5201957282210055, + "tokens_seen": 1480257536 + }, + { + "epoch": 4.09, + "learning_rate": 0.000278555667001003, + "loss": 2.3777, + "theoretical_loss": 3.5201819523635605, + "tokens_seen": 1480323072 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002785456369107322, + "loss": 2.2904, + "theoretical_loss": 3.5201681772867355, + "tokens_seen": 1480388608 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027853560682046134, + "loss": 2.3948, + "theoretical_loss": 3.5201544029904523, + "tokens_seen": 1480454144 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002785255767301906, + "loss": 2.6037, + "theoretical_loss": 3.520140629474631, + "tokens_seen": 1480519680 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002785155466399198, + "loss": 2.7134, + "theoretical_loss": 3.5201268567391937, + "tokens_seen": 1480585216 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027850551654964894, + "loss": 2.8329, + "theoretical_loss": 3.520113084784062, + "tokens_seen": 1480650752 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002784954864593782, + "loss": 2.4178, + "theoretical_loss": 3.5200993136091556, + "tokens_seen": 1480716288 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002784854563691073, + "loss": 2.4676, + "theoretical_loss": 3.520085543214397, + "tokens_seen": 1480781824 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027847542627883654, + "loss": 2.1964, + "theoretical_loss": 3.5200717735997076, + "tokens_seen": 1480847360 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002784653961885657, + "loss": 2.7747, + "theoretical_loss": 3.5200580047650076, + "tokens_seen": 1480912896 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002784553660982949, + "loss": 2.5658, + "theoretical_loss": 3.52004423671022, + "tokens_seen": 1480978432 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002784453360080241, + "loss": 2.6829, + "theoretical_loss": 3.520030469435264, + "tokens_seen": 1481043968 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.576517105102539, + "objective/train/theoretical_loss": 3.520016702940063, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.520016702940063, + "tokens_seen": 1481109504 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002784353059177533, + "loss": 2.6277, + "theoretical_loss": 3.520016702940063, + "tokens_seen": 1481109504 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027842527582748244, + "loss": 2.4534, + "theoretical_loss": 3.520002937224537, + "tokens_seen": 1481175040 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002784152457372117, + "loss": 2.5497, + "theoretical_loss": 3.519989172288608, + "tokens_seen": 1481240576 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002784052156469408, + "loss": 2.5595, + "theoretical_loss": 3.519975408132197, + "tokens_seen": 1481306112 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027839518555667004, + "loss": 2.4755, + "theoretical_loss": 3.519961644755225, + "tokens_seen": 1481371648 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002783851554663992, + "loss": 2.5273, + "theoretical_loss": 3.5199478821576142, + "tokens_seen": 1481437184 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002783751253761284, + "loss": 2.6981, + "theoretical_loss": 3.5199341203392853, + "tokens_seen": 1481502720 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002783650952858576, + "loss": 2.6444, + "theoretical_loss": 3.5199203593001602, + "tokens_seen": 1481568256 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027835506519558676, + "loss": 2.3927, + "theoretical_loss": 3.5199065990401603, + "tokens_seen": 1481633792 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027834503510531594, + "loss": 2.4405, + "theoretical_loss": 3.5198928395592075, + "tokens_seen": 1481699328 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002783350050150452, + "loss": 2.536, + "theoretical_loss": 3.519879080857222, + "tokens_seen": 1481764864 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002783249749247743, + "loss": 2.4936, + "theoretical_loss": 3.5198653229341255, + "tokens_seen": 1481830400 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027831494483450354, + "loss": 2.2565, + "theoretical_loss": 3.51985156578984, + "tokens_seen": 1481895936 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027830491474423267, + "loss": 2.3814, + "theoretical_loss": 3.5198378094242875, + "tokens_seen": 1481961472 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002782948846539619, + "loss": 2.4476, + "theoretical_loss": 3.5198240538373877, + "tokens_seen": 1482027008 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002782848545636911, + "loss": 2.4944, + "theoretical_loss": 3.5198102990290643, + "tokens_seen": 1482092544 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027827482447342027, + "loss": 2.4673, + "theoretical_loss": 3.519796544999237, + "tokens_seen": 1482158080 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027826479438314945, + "loss": 2.3833, + "theoretical_loss": 3.5197827917478284, + "tokens_seen": 1482223616 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002782547642928787, + "loss": 2.6754, + "theoretical_loss": 3.5197690392747596, + "tokens_seen": 1482289152 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002782447342026078, + "loss": 2.2025, + "theoretical_loss": 3.519755287579952, + "tokens_seen": 1482354688 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027823470411233705, + "loss": 2.522, + "theoretical_loss": 3.519741536663327, + "tokens_seen": 1482420224 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027822467402206617, + "loss": 2.5283, + "theoretical_loss": 3.519727786524807, + "tokens_seen": 1482485760 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002782146439317954, + "loss": 2.3373, + "theoretical_loss": 3.519714037164313, + "tokens_seen": 1482551296 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002782046138415246, + "loss": 2.4523, + "theoretical_loss": 3.5197002885817668, + "tokens_seen": 1482616832 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027819458375125377, + "loss": 2.8188, + "theoretical_loss": 3.5196865407770894, + "tokens_seen": 1482682368 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 1603030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.83261775970459, + "objective/train/theoretical_loss": 3.5196727937502033, + "objective/train/tokens_used": 1483791840, + "theoretical_loss": 3.5196727937502033, + "tokens_seen": 1482747904 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027818455366098295, + "loss": 2.7207, + "theoretical_loss": 3.5196727937502033, + "tokens_seen": 1482747904 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027817452357071213, + "loss": 2.5233, + "theoretical_loss": 3.5196590475010296, + "tokens_seen": 1482813440 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002781644934804413, + "loss": 2.5227, + "theoretical_loss": 3.51964530202949, + "tokens_seen": 1482878976 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027815446339017055, + "loss": 2.5866, + "theoretical_loss": 3.519631557335506, + "tokens_seen": 1482944512 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002781444332998997, + "loss": 2.5737, + "theoretical_loss": 3.519617813419, + "tokens_seen": 1483010048 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002781344032096289, + "loss": 2.4936, + "theoretical_loss": 3.5196040702798923, + "tokens_seen": 1483075584 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002781243731193581, + "loss": 2.74, + "theoretical_loss": 3.519590327918106, + "tokens_seen": 1483141120 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002781143430290873, + "loss": 2.5662, + "theoretical_loss": 3.519576586333562, + "tokens_seen": 1483206656 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027810431293881645, + "loss": 2.6936, + "theoretical_loss": 3.519562845526182, + "tokens_seen": 1483272192 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027809428284854564, + "loss": 2.5848, + "theoretical_loss": 3.519549105495888, + "tokens_seen": 1483337728 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002780842527582748, + "loss": 2.735, + "theoretical_loss": 3.5195353662426014, + "tokens_seen": 1483403264 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027807422266800405, + "loss": 2.4056, + "theoretical_loss": 3.5195216277662444, + "tokens_seen": 1483468800 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002780641925777332, + "loss": 2.3115, + "theoretical_loss": 3.5195078900667385, + "tokens_seen": 1483534336 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002780541624874624, + "loss": 2.5639, + "theoretical_loss": 3.5194941531440054, + "tokens_seen": 1483599872 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027804413239719154, + "loss": 2.5111, + "theoretical_loss": 3.5194804169979665, + "tokens_seen": 1483665408 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002780341023069208, + "loss": 2.6556, + "theoretical_loss": 3.5194666816285447, + "tokens_seen": 1483730944 + }, + { + "epoch": 4.09, + "learning_rate": 0.00027802407221664996, + "loss": 2.8688, + "theoretical_loss": 3.519452947035661, + "tokens_seen": 1483796480 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027801404212637914, + "loss": 3.5425, + "theoretical_loss": 3.5194359944683216, + "tokens_seen": 1483877376 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002780040120361083, + "loss": 2.8675, + "theoretical_loss": 3.5194222616102326, + "tokens_seen": 1483942912 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002779939819458375, + "loss": 2.6917, + "theoretical_loss": 3.5194085295284285, + "tokens_seen": 1484008448 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002779839518555667, + "loss": 2.8906, + "theoretical_loss": 3.519394798222832, + "tokens_seen": 1484073984 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002779739217652959, + "loss": 2.6199, + "theoretical_loss": 3.519381067693365, + "tokens_seen": 1484139520 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027796389167502504, + "loss": 2.701, + "theoretical_loss": 3.519367337939948, + "tokens_seen": 1484205056 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002779538615847543, + "loss": 2.5548, + "theoretical_loss": 3.519353608962504, + "tokens_seen": 1484270592 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027794383149448346, + "loss": 2.709, + "theoretical_loss": 3.5193398807609544, + "tokens_seen": 1484336128 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1667839, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.698960781097412, + "objective/train/theoretical_loss": 3.5193295851189266, + "objective/train/tokens_used": 1504845280, + "theoretical_loss": 3.5193295851189266, + "tokens_seen": 1484385280 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027793380140421264, + "loss": 2.6594, + "theoretical_loss": 3.519326153335222, + "tokens_seen": 1484401664 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002779237713139418, + "loss": 2.679, + "theoretical_loss": 3.5193124266852274, + "tokens_seen": 1484467200 + }, + { + "epoch": 5.0, + "learning_rate": 0.000277913741223671, + "loss": 2.6059, + "theoretical_loss": 3.519298700810893, + "tokens_seen": 1484532736 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002779037111334002, + "loss": 2.5911, + "theoretical_loss": 3.519284975712141, + "tokens_seen": 1484598272 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002778936810431294, + "loss": 2.542, + "theoretical_loss": 3.5192712513888935, + "tokens_seen": 1484663808 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027788365095285855, + "loss": 2.634, + "theoretical_loss": 3.5192575278410723, + "tokens_seen": 1484729344 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002778736208625878, + "loss": 2.5404, + "theoretical_loss": 3.519243805068599, + "tokens_seen": 1484794880 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002778635907723169, + "loss": 2.4077, + "theoretical_loss": 3.5192300830713954, + "tokens_seen": 1484860416 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027785356068204614, + "loss": 2.8491, + "theoretical_loss": 3.5192163618493844, + "tokens_seen": 1484925952 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002778435305917753, + "loss": 2.6214, + "theoretical_loss": 3.5192026414024875, + "tokens_seen": 1484991488 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002778335005015045, + "loss": 2.8468, + "theoretical_loss": 3.5191889217306267, + "tokens_seen": 1485057024 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002778234704112337, + "loss": 2.8204, + "theoretical_loss": 3.5191752028337238, + "tokens_seen": 1485122560 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027781344032096287, + "loss": 2.645, + "theoretical_loss": 3.5191614847117005, + "tokens_seen": 1485188096 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027780341023069205, + "loss": 2.8198, + "theoretical_loss": 3.51914776736448, + "tokens_seen": 1485253632 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002777933801404213, + "loss": 2.6427, + "theoretical_loss": 3.519134050791984, + "tokens_seen": 1485319168 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002777833500501504, + "loss": 2.6776, + "theoretical_loss": 3.519120334994134, + "tokens_seen": 1485384704 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027777331995987965, + "loss": 2.641, + "theoretical_loss": 3.5191066199708523, + "tokens_seen": 1485450240 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002777632898696089, + "loss": 2.4597, + "theoretical_loss": 3.5190929057220615, + "tokens_seen": 1485515776 + }, + { + "epoch": 5.0, + "learning_rate": 0.000277753259779338, + "loss": 2.6771, + "theoretical_loss": 3.5190791922476823, + "tokens_seen": 1485581312 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027774322968906725, + "loss": 2.6651, + "theoretical_loss": 3.5190654795476384, + "tokens_seen": 1485646848 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027773319959879637, + "loss": 2.4468, + "theoretical_loss": 3.5190517676218516, + "tokens_seen": 1485712384 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002777231695085256, + "loss": 2.6605, + "theoretical_loss": 3.5190380564702433, + "tokens_seen": 1485777920 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002777131394182548, + "loss": 2.7455, + "theoretical_loss": 3.519024346092736, + "tokens_seen": 1485843456 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027770310932798397, + "loss": 2.6425, + "theoretical_loss": 3.519010636489252, + "tokens_seen": 1485908992 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027769307923771315, + "loss": 2.6573, + "theoretical_loss": 3.518996927659714, + "tokens_seen": 1485974528 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1672845, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.550705909729004, + "objective/train/theoretical_loss": 3.5189866465454154, + "objective/train/tokens_used": 1506483680, + "theoretical_loss": 3.5189866465454154, + "tokens_seen": 1486023680 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027768304914744233, + "loss": 2.6352, + "theoretical_loss": 3.518983219604043, + "tokens_seen": 1486040064 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002776730190571715, + "loss": 2.702, + "theoretical_loss": 3.518969512322162, + "tokens_seen": 1486105600 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027766298896690075, + "loss": 2.739, + "theoretical_loss": 3.518955805813993, + "tokens_seen": 1486171136 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002776529588766299, + "loss": 2.5802, + "theoretical_loss": 3.518942100079458, + "tokens_seen": 1486236672 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002776429287863591, + "loss": 2.5728, + "theoretical_loss": 3.5189283951184795, + "tokens_seen": 1486302208 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002776328986960883, + "loss": 2.5793, + "theoretical_loss": 3.5189146909309796, + "tokens_seen": 1486367744 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002776228686058175, + "loss": 2.6295, + "theoretical_loss": 3.5189009875168806, + "tokens_seen": 1486433280 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027761283851554665, + "loss": 2.6874, + "theoretical_loss": 3.518887284876105, + "tokens_seen": 1486498816 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027760280842527584, + "loss": 2.6939, + "theoretical_loss": 3.518873583008574, + "tokens_seen": 1486564352 + }, + { + "epoch": 5.0, + "learning_rate": 0.000277592778335005, + "loss": 2.7765, + "theoretical_loss": 3.518859881914212, + "tokens_seen": 1486629888 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027758274824473425, + "loss": 2.6764, + "theoretical_loss": 3.518846181592939, + "tokens_seen": 1486695424 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002775727181544634, + "loss": 2.7328, + "theoretical_loss": 3.518832482044678, + "tokens_seen": 1486760960 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002775626880641926, + "loss": 2.4887, + "theoretical_loss": 3.518818783269352, + "tokens_seen": 1486826496 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027755265797392174, + "loss": 2.6795, + "theoretical_loss": 3.5188050852668833, + "tokens_seen": 1486892032 + }, + { + "epoch": 5.0, + "learning_rate": 0.000277542627883651, + "loss": 2.791, + "theoretical_loss": 3.5187913880371937, + "tokens_seen": 1486957568 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027753259779338016, + "loss": 2.6841, + "theoretical_loss": 3.5187776915802056, + "tokens_seen": 1487023104 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027752256770310934, + "loss": 2.5651, + "theoretical_loss": 3.518763995895841, + "tokens_seen": 1487088640 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002775125376128385, + "loss": 2.7094, + "theoretical_loss": 3.5187503009840233, + "tokens_seen": 1487154176 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002775025075225677, + "loss": 2.6081, + "theoretical_loss": 3.5187366068446746, + "tokens_seen": 1487219712 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002774924774322969, + "loss": 2.62, + "theoretical_loss": 3.518722913477716, + "tokens_seen": 1487285248 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002774824473420261, + "loss": 2.5542, + "theoretical_loss": 3.518709220883072, + "tokens_seen": 1487350784 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027747241725175524, + "loss": 2.6896, + "theoretical_loss": 3.518695529060663, + "tokens_seen": 1487416320 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002774623871614845, + "loss": 2.788, + "theoretical_loss": 3.518681838010413, + "tokens_seen": 1487481856 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027745235707121366, + "loss": 2.6174, + "theoretical_loss": 3.5186681477322437, + "tokens_seen": 1487547392 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027744232698094284, + "loss": 2.8734, + "theoretical_loss": 3.5186544582260773, + "tokens_seen": 1487612928 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1677864, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8672678470611572, + "objective/train/theoretical_loss": 3.518644191603033, + "objective/train/tokens_used": 1508122080, + "theoretical_loss": 3.518644191603033, + "tokens_seen": 1487662080 + }, + { + "epoch": 5.0, + "learning_rate": 0.000277432296890672, + "loss": 2.636, + "theoretical_loss": 3.518640769491837, + "tokens_seen": 1487678464 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002774222668004012, + "loss": 2.7245, + "theoretical_loss": 3.5186270815294445, + "tokens_seen": 1487744000 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002774122367101304, + "loss": 2.6811, + "theoretical_loss": 3.5186133943388227, + "tokens_seen": 1487809536 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002774022066198596, + "loss": 2.6277, + "theoretical_loss": 3.5185997079198943, + "tokens_seen": 1487875072 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027739217652958875, + "loss": 2.4533, + "theoretical_loss": 3.518586022272581, + "tokens_seen": 1487940608 + }, + { + "epoch": 5.0, + "learning_rate": 0.000277382146439318, + "loss": 2.6915, + "theoretical_loss": 3.5185723373968063, + "tokens_seen": 1488006144 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002773721163490471, + "loss": 2.6598, + "theoretical_loss": 3.5185586532924917, + "tokens_seen": 1488071680 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027736208625877634, + "loss": 2.5223, + "theoretical_loss": 3.518544969959561, + "tokens_seen": 1488137216 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002773520561685055, + "loss": 2.6451, + "theoretical_loss": 3.518531287397936, + "tokens_seen": 1488202752 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002773420260782347, + "loss": 2.6403, + "theoretical_loss": 3.5185176056075393, + "tokens_seen": 1488268288 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002773319959879639, + "loss": 2.7058, + "theoretical_loss": 3.518503924588293, + "tokens_seen": 1488333824 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027732196589769307, + "loss": 2.7885, + "theoretical_loss": 3.5184902443401205, + "tokens_seen": 1488399360 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027731193580742225, + "loss": 2.7033, + "theoretical_loss": 3.518476564862944, + "tokens_seen": 1488464896 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002773019057171515, + "loss": 2.6601, + "theoretical_loss": 3.5184628861566867, + "tokens_seen": 1488530432 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002772918756268806, + "loss": 2.7296, + "theoretical_loss": 3.51844920822127, + "tokens_seen": 1488595968 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027728184553660985, + "loss": 2.7617, + "theoretical_loss": 3.5184355310566175, + "tokens_seen": 1488661504 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027727181544633903, + "loss": 2.736, + "theoretical_loss": 3.518421854662652, + "tokens_seen": 1488727040 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002772617853560682, + "loss": 2.7205, + "theoretical_loss": 3.5184081790392945, + "tokens_seen": 1488792576 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002772517552657974, + "loss": 2.8174, + "theoretical_loss": 3.51839450418647, + "tokens_seen": 1488858112 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027724172517552657, + "loss": 2.6738, + "theoretical_loss": 3.5183808301040997, + "tokens_seen": 1488923648 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027723169508525575, + "loss": 2.6079, + "theoretical_loss": 3.518367156792106, + "tokens_seen": 1488989184 + }, + { + "epoch": 5.0, + "learning_rate": 0.000277221664994985, + "loss": 2.7517, + "theoretical_loss": 3.518353484250413, + "tokens_seen": 1489054720 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002772116349047141, + "loss": 2.6961, + "theoretical_loss": 3.518339812478942, + "tokens_seen": 1489120256 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027720160481444335, + "loss": 2.6361, + "theoretical_loss": 3.5183261414776172, + "tokens_seen": 1489185792 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002771915747241725, + "loss": 2.7121, + "theoretical_loss": 3.51831247124636, + "tokens_seen": 1489251328 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1682798, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.633384943008423, + "objective/train/theoretical_loss": 3.5183022190782274, + "objective/train/tokens_used": 1509760480, + "theoretical_loss": 3.5183022190782274, + "tokens_seen": 1489300480 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002771815446339017, + "loss": 2.6501, + "theoretical_loss": 3.518298801785093, + "tokens_seen": 1489316864 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002771715145436309, + "loss": 2.6853, + "theoretical_loss": 3.51828513309374, + "tokens_seen": 1489382400 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002771614844533601, + "loss": 2.7524, + "theoretical_loss": 3.5182714651722238, + "tokens_seen": 1489447936 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027715145436308926, + "loss": 2.6166, + "theoretical_loss": 3.5182577980204663, + "tokens_seen": 1489513472 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002771414242728185, + "loss": 2.7881, + "theoretical_loss": 3.5182441316383906, + "tokens_seen": 1489579008 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002771313941825476, + "loss": 2.5965, + "theoretical_loss": 3.5182304660259196, + "tokens_seen": 1489644544 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027712136409227685, + "loss": 2.7613, + "theoretical_loss": 3.518216801182976, + "tokens_seen": 1489710080 + }, + { + "epoch": 5.0, + "learning_rate": 0.000277111334002006, + "loss": 2.791, + "theoretical_loss": 3.5182031371094826, + "tokens_seen": 1489775616 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002771013039117352, + "loss": 2.5291, + "theoretical_loss": 3.5181894738053625, + "tokens_seen": 1489841152 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002770912738214644, + "loss": 2.6356, + "theoretical_loss": 3.518175811270538, + "tokens_seen": 1489906688 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002770812437311936, + "loss": 2.7813, + "theoretical_loss": 3.518162149504933, + "tokens_seen": 1489972224 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027707121364092276, + "loss": 2.685, + "theoretical_loss": 3.5181484885084693, + "tokens_seen": 1490037760 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027706118355065194, + "loss": 2.5715, + "theoretical_loss": 3.5181348282810703, + "tokens_seen": 1490103296 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002770511534603811, + "loss": 2.4107, + "theoretical_loss": 3.518121168822658, + "tokens_seen": 1490168832 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027704112337011036, + "loss": 2.5857, + "theoretical_loss": 3.5181075101331567, + "tokens_seen": 1490234368 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002770310932798395, + "loss": 2.7753, + "theoretical_loss": 3.5180938522124885, + "tokens_seen": 1490299904 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002770210631895687, + "loss": 2.6862, + "theoretical_loss": 3.518080195060576, + "tokens_seen": 1490365440 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002770110330992979, + "loss": 2.7125, + "theoretical_loss": 3.518066538677343, + "tokens_seen": 1490430976 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002770010030090271, + "loss": 2.6126, + "theoretical_loss": 3.518052883062712, + "tokens_seen": 1490496512 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002769909729187563, + "loss": 2.5852, + "theoretical_loss": 3.5180392282166055, + "tokens_seen": 1490562048 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027698094282848544, + "loss": 2.5653, + "theoretical_loss": 3.5180255741389477, + "tokens_seen": 1490627584 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002769709127382147, + "loss": 2.7681, + "theoretical_loss": 3.51801192082966, + "tokens_seen": 1490693120 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027696088264794386, + "loss": 2.6062, + "theoretical_loss": 3.5179982682886664, + "tokens_seen": 1490758656 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027695085255767304, + "loss": 2.831, + "theoretical_loss": 3.5179846165158897, + "tokens_seen": 1490824192 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002769408224674022, + "loss": 2.7401, + "theoretical_loss": 3.5179709655112528, + "tokens_seen": 1490889728 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1687806, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.763873815536499, + "objective/train/theoretical_loss": 3.517960727761821, + "objective/train/tokens_used": 1511398880, + "theoretical_loss": 3.517960727761821, + "tokens_seen": 1490938880 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002769307923771314, + "loss": 2.7617, + "theoretical_loss": 3.5179573152746793, + "tokens_seen": 1490955264 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002769207622868606, + "loss": 2.5945, + "theoretical_loss": 3.517943665806091, + "tokens_seen": 1491020800 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002769107321965898, + "loss": 2.783, + "theoretical_loss": 3.517930017105412, + "tokens_seen": 1491086336 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027690070210631895, + "loss": 2.6221, + "theoretical_loss": 3.5179163691725646, + "tokens_seen": 1491151872 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002768906720160482, + "loss": 2.5724, + "theoretical_loss": 3.517902722007473, + "tokens_seen": 1491217408 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002768806419257773, + "loss": 2.7295, + "theoretical_loss": 3.517889075610059, + "tokens_seen": 1491282944 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027687061183550655, + "loss": 2.7662, + "theoretical_loss": 3.5178754299802457, + "tokens_seen": 1491348480 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002768605817452357, + "loss": 2.7277, + "theoretical_loss": 3.517861785117957, + "tokens_seen": 1491414016 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002768505516549649, + "loss": 2.7318, + "theoretical_loss": 3.5178481410231166, + "tokens_seen": 1491479552 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002768405215646941, + "loss": 2.7809, + "theoretical_loss": 3.517834497695646, + "tokens_seen": 1491545088 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027683049147442327, + "loss": 2.6275, + "theoretical_loss": 3.517820855135469, + "tokens_seen": 1491610624 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027682046138415245, + "loss": 2.5591, + "theoretical_loss": 3.5178072133425093, + "tokens_seen": 1491676160 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002768104312938817, + "loss": 2.908, + "theoretical_loss": 3.517793572316689, + "tokens_seen": 1491741696 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002768004012036108, + "loss": 2.5793, + "theoretical_loss": 3.517779932057932, + "tokens_seen": 1491807232 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027679037111334005, + "loss": 2.6962, + "theoretical_loss": 3.5177662925661615, + "tokens_seen": 1491872768 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027678034102306923, + "loss": 2.4658, + "theoretical_loss": 3.5177526538413, + "tokens_seen": 1491938304 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002767703109327984, + "loss": 2.5116, + "theoretical_loss": 3.5177390158832713, + "tokens_seen": 1492003840 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002767602808425276, + "loss": 2.6111, + "theoretical_loss": 3.517725378691998, + "tokens_seen": 1492069376 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027675025075225677, + "loss": 2.6248, + "theoretical_loss": 3.5177117422674047, + "tokens_seen": 1492134912 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027674022066198595, + "loss": 2.7078, + "theoretical_loss": 3.517698106609413, + "tokens_seen": 1492200448 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002767301905717152, + "loss": 2.6051, + "theoretical_loss": 3.5176844717179474, + "tokens_seen": 1492265984 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002767201604814443, + "loss": 2.6811, + "theoretical_loss": 3.51767083759293, + "tokens_seen": 1492331520 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027671013039117355, + "loss": 2.5908, + "theoretical_loss": 3.5176572042342853, + "tokens_seen": 1492397056 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002767001003009027, + "loss": 2.817, + "theoretical_loss": 3.517643571641935, + "tokens_seen": 1492462592 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002766900702106319, + "loss": 2.8072, + "theoretical_loss": 3.517629939815804, + "tokens_seen": 1492528128 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1692905, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.619441032409668, + "objective/train/theoretical_loss": 3.5176197164489906, + "objective/train/tokens_used": 1513037280, + "theoretical_loss": 3.5176197164489906, + "tokens_seen": 1492577280 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002766800401203611, + "loss": 2.6709, + "theoretical_loss": 3.517616308755815, + "tokens_seen": 1492593664 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002766700100300903, + "loss": 2.7228, + "theoretical_loss": 3.5176026784618912, + "tokens_seen": 1492659200 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027665997993981946, + "loss": 2.8684, + "theoretical_loss": 3.5175890489339556, + "tokens_seen": 1492724736 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002766499498495487, + "loss": 2.6712, + "theoretical_loss": 3.517575420171932, + "tokens_seen": 1492790272 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002766399197592778, + "loss": 2.5039, + "theoretical_loss": 3.5175617921757434, + "tokens_seen": 1492855808 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027662988966900705, + "loss": 2.7792, + "theoretical_loss": 3.517548164945313, + "tokens_seen": 1492921344 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002766198595787362, + "loss": 2.8762, + "theoretical_loss": 3.517534538480565, + "tokens_seen": 1492986880 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002766098294884654, + "loss": 2.7011, + "theoretical_loss": 3.517520912781422, + "tokens_seen": 1493052416 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002765997993981946, + "loss": 2.6816, + "theoretical_loss": 3.517507287847808, + "tokens_seen": 1493117952 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002765897693079238, + "loss": 2.6194, + "theoretical_loss": 3.517493663679646, + "tokens_seen": 1493183488 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027657973921765296, + "loss": 2.918, + "theoretical_loss": 3.517480040276859, + "tokens_seen": 1493249024 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027656970912738214, + "loss": 2.5906, + "theoretical_loss": 3.517466417639371, + "tokens_seen": 1493314560 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002765596790371113, + "loss": 2.7555, + "theoretical_loss": 3.517452795767105, + "tokens_seen": 1493380096 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027654964894684056, + "loss": 2.8005, + "theoretical_loss": 3.5174391746599856, + "tokens_seen": 1493445632 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002765396188565697, + "loss": 2.5952, + "theoretical_loss": 3.517425554317935, + "tokens_seen": 1493511168 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002765295887662989, + "loss": 2.6816, + "theoretical_loss": 3.5174119347408768, + "tokens_seen": 1493576704 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027651955867602805, + "loss": 2.6175, + "theoretical_loss": 3.5173983159287348, + "tokens_seen": 1493642240 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002765095285857573, + "loss": 2.6861, + "theoretical_loss": 3.5173846978814316, + "tokens_seen": 1493707776 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027649949849548646, + "loss": 2.6073, + "theoretical_loss": 3.5173710805988927, + "tokens_seen": 1493773312 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027648946840521564, + "loss": 2.5486, + "theoretical_loss": 3.51735746408104, + "tokens_seen": 1493838848 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002764794383149448, + "loss": 2.7331, + "theoretical_loss": 3.517343848327797, + "tokens_seen": 1493904384 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027646940822467406, + "loss": 2.6075, + "theoretical_loss": 3.517330233339088, + "tokens_seen": 1493969920 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002764593781344032, + "loss": 2.7378, + "theoretical_loss": 3.517316619114836, + "tokens_seen": 1494035456 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002764493480441324, + "loss": 2.7399, + "theoretical_loss": 3.517303005654964, + "tokens_seen": 1494100992 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027643931795386155, + "loss": 2.465, + "theoretical_loss": 3.517289392959397, + "tokens_seen": 1494166528 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1697937, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4400620460510254, + "objective/train/theoretical_loss": 3.5172791839392508, + "objective/train/tokens_used": 1514675680, + "theoretical_loss": 3.5172791839392508, + "tokens_seen": 1494215680 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002764292878635908, + "loss": 2.6827, + "theoretical_loss": 3.517275781028058, + "tokens_seen": 1494232064 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027641925777331997, + "loss": 2.6565, + "theoretical_loss": 3.51726216986087, + "tokens_seen": 1494297600 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027640922768304915, + "loss": 2.4964, + "theoretical_loss": 3.517248559457757, + "tokens_seen": 1494363136 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027639919759277833, + "loss": 2.8277, + "theoretical_loss": 3.5172349498186426, + "tokens_seen": 1494428672 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002763891675025075, + "loss": 2.545, + "theoretical_loss": 3.5172213409434505, + "tokens_seen": 1494494208 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002763791374122367, + "loss": 2.6756, + "theoretical_loss": 3.517207732832104, + "tokens_seen": 1494559744 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002763691073219659, + "loss": 2.6207, + "theoretical_loss": 3.5171941254845267, + "tokens_seen": 1494625280 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027635907723169505, + "loss": 2.6007, + "theoretical_loss": 3.517180518900643, + "tokens_seen": 1494690816 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002763490471414243, + "loss": 2.6077, + "theoretical_loss": 3.517166913080376, + "tokens_seen": 1494756352 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002763390170511534, + "loss": 2.7704, + "theoretical_loss": 3.5171533080236497, + "tokens_seen": 1494821888 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027632898696088265, + "loss": 2.7405, + "theoretical_loss": 3.517139703730387, + "tokens_seen": 1494887424 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027631895687061183, + "loss": 2.606, + "theoretical_loss": 3.5171261002005125, + "tokens_seen": 1494952960 + }, + { + "epoch": 5.0, + "learning_rate": 0.000276308926780341, + "loss": 2.5175, + "theoretical_loss": 3.517112497433949, + "tokens_seen": 1495018496 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002762988966900702, + "loss": 2.661, + "theoretical_loss": 3.517098895430621, + "tokens_seen": 1495084032 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027628886659979943, + "loss": 2.6579, + "theoretical_loss": 3.5170852941904522, + "tokens_seen": 1495149568 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027627883650952856, + "loss": 2.5546, + "theoretical_loss": 3.5170716937133655, + "tokens_seen": 1495215104 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002762688064192578, + "loss": 2.6988, + "theoretical_loss": 3.5170580939992853, + "tokens_seen": 1495280640 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027625877632898697, + "loss": 2.7503, + "theoretical_loss": 3.5170444950481357, + "tokens_seen": 1495346176 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027624874623871615, + "loss": 2.5672, + "theoretical_loss": 3.51703089685984, + "tokens_seen": 1495411712 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002762387161484454, + "loss": 2.8523, + "theoretical_loss": 3.517017299434322, + "tokens_seen": 1495477248 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002762286860581745, + "loss": 2.598, + "theoretical_loss": 3.517003702771505, + "tokens_seen": 1495542784 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027621865596790375, + "loss": 2.5702, + "theoretical_loss": 3.5169901068713134, + "tokens_seen": 1495608320 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002762086258776329, + "loss": 2.7542, + "theoretical_loss": 3.516976511733671, + "tokens_seen": 1495673856 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002761985957873621, + "loss": 2.7126, + "theoretical_loss": 3.516962917358502, + "tokens_seen": 1495739392 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002761885656970913, + "loss": 2.6053, + "theoretical_loss": 3.516949323745729, + "tokens_seen": 1495804928 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1700816, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9013986587524414, + "objective/train/theoretical_loss": 3.5169391290364267, + "objective/train/tokens_used": 1516314080, + "theoretical_loss": 3.5169391290364267, + "tokens_seen": 1495854080 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002761785356068205, + "loss": 2.8365, + "theoretical_loss": 3.516935730895277, + "tokens_seen": 1495870464 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027616850551654966, + "loss": 2.5379, + "theoretical_loss": 3.51692213880707, + "tokens_seen": 1495936000 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002761584754262789, + "loss": 2.7247, + "theoretical_loss": 3.51690854748103, + "tokens_seen": 1496001536 + }, + { + "epoch": 5.0, + "learning_rate": 0.000276148445336008, + "loss": 2.5445, + "theoretical_loss": 3.516894956917083, + "tokens_seen": 1496067072 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027613841524573725, + "loss": 2.6178, + "theoretical_loss": 3.516881367115152, + "tokens_seen": 1496132608 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002761283851554664, + "loss": 2.7579, + "theoretical_loss": 3.5168677780751616, + "tokens_seen": 1496198144 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002761183550651956, + "loss": 2.7081, + "theoretical_loss": 3.5168541897970345, + "tokens_seen": 1496263680 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002761083249749248, + "loss": 2.5498, + "theoretical_loss": 3.516840602280695, + "tokens_seen": 1496329216 + }, + { + "epoch": 5.0, + "learning_rate": 0.000276098294884654, + "loss": 2.5387, + "theoretical_loss": 3.5168270155260677, + "tokens_seen": 1496394752 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027608826479438316, + "loss": 2.6254, + "theoretical_loss": 3.5168134295330757, + "tokens_seen": 1496460288 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027607823470411234, + "loss": 2.6312, + "theoretical_loss": 3.5167998443016435, + "tokens_seen": 1496525824 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002760682046138415, + "loss": 2.6842, + "theoretical_loss": 3.516786259831695, + "tokens_seen": 1496591360 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027605817452357076, + "loss": 2.6796, + "theoretical_loss": 3.516772676123154, + "tokens_seen": 1496656896 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002760481444332999, + "loss": 2.6596, + "theoretical_loss": 3.5167590931759447, + "tokens_seen": 1496722432 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002760381143430291, + "loss": 2.7718, + "theoretical_loss": 3.5167455109899906, + "tokens_seen": 1496787968 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027602808425275825, + "loss": 2.7429, + "theoretical_loss": 3.516731929565216, + "tokens_seen": 1496853504 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002760180541624875, + "loss": 2.7515, + "theoretical_loss": 3.5167183489015454, + "tokens_seen": 1496919040 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027600802407221666, + "loss": 2.7387, + "theoretical_loss": 3.5167047689989026, + "tokens_seen": 1496984576 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027599799398194584, + "loss": 2.6507, + "theoretical_loss": 3.5166911898572106, + "tokens_seen": 1497050112 + }, + { + "epoch": 5.0, + "learning_rate": 0.000275987963891675, + "loss": 2.7042, + "theoretical_loss": 3.5166776114763953, + "tokens_seen": 1497115648 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027597793380140426, + "loss": 2.6221, + "theoretical_loss": 3.516664033856379, + "tokens_seen": 1497181184 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002759679037111334, + "loss": 2.7729, + "theoretical_loss": 3.5166504569970867, + "tokens_seen": 1497246720 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002759578736208626, + "loss": 2.7716, + "theoretical_loss": 3.5166368808984423, + "tokens_seen": 1497312256 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027594784353059175, + "loss": 2.6233, + "theoretical_loss": 3.51662330556037, + "tokens_seen": 1497377792 + }, + { + "epoch": 5.0, + "learning_rate": 0.000275937813440321, + "loss": 2.721, + "theoretical_loss": 3.516609730982794, + "tokens_seen": 1497443328 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1707842, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.416391372680664, + "objective/train/theoretical_loss": 3.5165995505486416, + "objective/train/tokens_used": 1517952480, + "theoretical_loss": 3.5165995505486416, + "tokens_seen": 1497492480 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027592778335005017, + "loss": 2.5181, + "theoretical_loss": 3.516596157165638, + "tokens_seen": 1497508864 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027591775325977935, + "loss": 2.5366, + "theoretical_loss": 3.5165825841088267, + "tokens_seen": 1497574400 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027590772316950853, + "loss": 2.6365, + "theoretical_loss": 3.5165690118122837, + "tokens_seen": 1497639936 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002758976930792377, + "loss": 2.67, + "theoretical_loss": 3.5165554402759334, + "tokens_seen": 1497705472 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002758876629889669, + "loss": 2.5694, + "theoretical_loss": 3.5165418694996995, + "tokens_seen": 1497771008 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002758776328986961, + "loss": 2.6292, + "theoretical_loss": 3.5165282994835065, + "tokens_seen": 1497836544 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027586760280842525, + "loss": 2.4334, + "theoretical_loss": 3.5165147302272795, + "tokens_seen": 1497902080 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002758575727181545, + "loss": 2.6236, + "theoretical_loss": 3.5165011617309414, + "tokens_seen": 1497967616 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002758475426278836, + "loss": 2.4739, + "theoretical_loss": 3.516487593994417, + "tokens_seen": 1498033152 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027583751253761285, + "loss": 2.5641, + "theoretical_loss": 3.51647402701763, + "tokens_seen": 1498098688 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027582748244734203, + "loss": 2.6901, + "theoretical_loss": 3.516460460800505, + "tokens_seen": 1498164224 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002758174523570712, + "loss": 2.6487, + "theoretical_loss": 3.5164468953429666, + "tokens_seen": 1498229760 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002758074222668004, + "loss": 2.5901, + "theoretical_loss": 3.5164333306449382, + "tokens_seen": 1498295296 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027579739217652963, + "loss": 2.5624, + "theoretical_loss": 3.5164197667063455, + "tokens_seen": 1498360832 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027578736208625876, + "loss": 2.7026, + "theoretical_loss": 3.5164062035271106, + "tokens_seen": 1498426368 + }, + { + "epoch": 5.0, + "learning_rate": 0.000275777331995988, + "loss": 2.7093, + "theoretical_loss": 3.51639264110716, + "tokens_seen": 1498491904 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002757673019057171, + "loss": 2.5889, + "theoretical_loss": 3.5163790794464163, + "tokens_seen": 1498557440 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027575727181544635, + "loss": 2.5409, + "theoretical_loss": 3.516365518544805, + "tokens_seen": 1498622976 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027574724172517554, + "loss": 2.592, + "theoretical_loss": 3.5163519584022493, + "tokens_seen": 1498688512 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002757372116349047, + "loss": 2.6117, + "theoretical_loss": 3.5163383990186743, + "tokens_seen": 1498754048 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002757271815446339, + "loss": 2.6196, + "theoretical_loss": 3.516324840394004, + "tokens_seen": 1498819584 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002757171514543631, + "loss": 2.6298, + "theoretical_loss": 3.5163112825281635, + "tokens_seen": 1498885120 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027570712136409226, + "loss": 2.6609, + "theoretical_loss": 3.516297725421076, + "tokens_seen": 1498950656 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002756970912738215, + "loss": 2.7129, + "theoretical_loss": 3.5162841690726667, + "tokens_seen": 1499016192 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002756870611835506, + "loss": 2.6601, + "theoretical_loss": 3.5162706134828596, + "tokens_seen": 1499081728 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1710663, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6549386978149414, + "objective/train/theoretical_loss": 3.5162604472882912, + "objective/train/tokens_used": 1519590880, + "theoretical_loss": 3.5162604472882912, + "tokens_seen": 1499130880 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027567703109327986, + "loss": 2.6568, + "theoretical_loss": 3.516257058651579, + "tokens_seen": 1499147264 + }, + { + "epoch": 5.0, + "learning_rate": 0.000275667001003009, + "loss": 2.7675, + "theoretical_loss": 3.5162435045787497, + "tokens_seen": 1499212800 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002756569709127382, + "loss": 2.6526, + "theoretical_loss": 3.5162299512642954, + "tokens_seen": 1499278336 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002756469408224674, + "loss": 2.6224, + "theoretical_loss": 3.516216398708141, + "tokens_seen": 1499343872 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002756369107321966, + "loss": 2.682, + "theoretical_loss": 3.516202846910212, + "tokens_seen": 1499409408 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027562688064192576, + "loss": 2.7775, + "theoretical_loss": 3.516189295870431, + "tokens_seen": 1499474944 + }, + { + "epoch": 5.0, + "learning_rate": 0.000275616850551655, + "loss": 2.6704, + "theoretical_loss": 3.5161757455887233, + "tokens_seen": 1499540480 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002756068204613841, + "loss": 2.8034, + "theoretical_loss": 3.5161621960650136, + "tokens_seen": 1499606016 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027559679037111336, + "loss": 2.5323, + "theoretical_loss": 3.516148647299225, + "tokens_seen": 1499671552 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002755867602808425, + "loss": 2.7179, + "theoretical_loss": 3.5161350992912843, + "tokens_seen": 1499737088 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002755767301905717, + "loss": 2.5213, + "theoretical_loss": 3.516121552041114, + "tokens_seen": 1499802624 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002755667001003009, + "loss": 2.5912, + "theoretical_loss": 3.51610800554864, + "tokens_seen": 1499868160 + }, + { + "epoch": 5.0, + "learning_rate": 0.0002755566700100301, + "loss": 2.6746, + "theoretical_loss": 3.5160944598137855, + "tokens_seen": 1499933696 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027554663991975927, + "loss": 2.6959, + "theoretical_loss": 3.5160809148364764, + "tokens_seen": 1499999232 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027553660982948845, + "loss": 2.5214, + "theoretical_loss": 3.5160673706166357, + "tokens_seen": 1500064768 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027552657973921763, + "loss": 2.7854, + "theoretical_loss": 3.5160538271541895, + "tokens_seen": 1500130304 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027551654964894686, + "loss": 2.6988, + "theoretical_loss": 3.516040284449061, + "tokens_seen": 1500195840 + }, + { + "epoch": 5.0, + "learning_rate": 0.00027550651955867604, + "loss": 2.6114, + "theoretical_loss": 3.516026742501176, + "tokens_seen": 1500261376 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002754964894684052, + "loss": 2.5612, + "theoretical_loss": 3.516013201310458, + "tokens_seen": 1500326912 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027548645937813446, + "loss": 2.5483, + "theoretical_loss": 3.515999660876833, + "tokens_seen": 1500392448 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002754764292878636, + "loss": 2.7099, + "theoretical_loss": 3.515986121200224, + "tokens_seen": 1500457984 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002754663991975928, + "loss": 2.6436, + "theoretical_loss": 3.5159725822805568, + "tokens_seen": 1500523520 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027545636910732195, + "loss": 2.7573, + "theoretical_loss": 3.5159590441177544, + "tokens_seen": 1500589056 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002754463390170512, + "loss": 2.7131, + "theoretical_loss": 3.515945506711744, + "tokens_seen": 1500654592 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027543630892678037, + "loss": 2.5178, + "theoretical_loss": 3.5159319700624483, + "tokens_seen": 1500720128 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1715522, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.640793800354004, + "objective/train/theoretical_loss": 3.515921818072025, + "objective/train/tokens_used": 1521229280, + "theoretical_loss": 3.515921818072025, + "tokens_seen": 1500769280 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027542627883650955, + "loss": 2.7359, + "theoretical_loss": 3.5159184341697918, + "tokens_seen": 1500785664 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027541624874623873, + "loss": 2.7044, + "theoretical_loss": 3.5159048990337007, + "tokens_seen": 1500851200 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002754062186559679, + "loss": 2.5357, + "theoretical_loss": 3.515891364654099, + "tokens_seen": 1500916736 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002753961885656971, + "loss": 2.7109, + "theoretical_loss": 3.5158778310309104, + "tokens_seen": 1500982272 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002753861584754263, + "loss": 2.6688, + "theoretical_loss": 3.5158642981640607, + "tokens_seen": 1501047808 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027537612838515545, + "loss": 2.5025, + "theoretical_loss": 3.5158507660534744, + "tokens_seen": 1501113344 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002753660982948847, + "loss": 2.4948, + "theoretical_loss": 3.5158372346990765, + "tokens_seen": 1501178880 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002753560682046138, + "loss": 2.6206, + "theoretical_loss": 3.515823704100791, + "tokens_seen": 1501244416 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027534603811434305, + "loss": 2.5264, + "theoretical_loss": 3.5158101742585437, + "tokens_seen": 1501309952 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027533600802407223, + "loss": 2.7406, + "theoretical_loss": 3.5157966451722578, + "tokens_seen": 1501375488 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002753259779338014, + "loss": 2.6902, + "theoretical_loss": 3.515783116841859, + "tokens_seen": 1501441024 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002753159478435306, + "loss": 2.6181, + "theoretical_loss": 3.515769589267273, + "tokens_seen": 1501506560 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027530591775325983, + "loss": 2.6366, + "theoretical_loss": 3.515756062448423, + "tokens_seen": 1501572096 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027529588766298896, + "loss": 2.6837, + "theoretical_loss": 3.5157425363852344, + "tokens_seen": 1501637632 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002752858575727182, + "loss": 2.7563, + "theoretical_loss": 3.515729011077632, + "tokens_seen": 1501703168 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002752758274824473, + "loss": 2.6408, + "theoretical_loss": 3.515715486525541, + "tokens_seen": 1501768704 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027526579739217655, + "loss": 2.487, + "theoretical_loss": 3.515701962728885, + "tokens_seen": 1501834240 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027525576730190574, + "loss": 2.7103, + "theoretical_loss": 3.5156884396875907, + "tokens_seen": 1501899776 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002752457372116349, + "loss": 2.5855, + "theoretical_loss": 3.5156749174015816, + "tokens_seen": 1501965312 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002752357071213641, + "loss": 2.6207, + "theoretical_loss": 3.515661395870783, + "tokens_seen": 1502030848 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002752256770310933, + "loss": 2.7456, + "theoretical_loss": 3.5156478750951194, + "tokens_seen": 1502096384 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027521564694082246, + "loss": 2.5779, + "theoretical_loss": 3.515634355074517, + "tokens_seen": 1502161920 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002752056168505517, + "loss": 2.5976, + "theoretical_loss": 3.5156208358088987, + "tokens_seen": 1502227456 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002751955867602808, + "loss": 2.5067, + "theoretical_loss": 3.5156073172981905, + "tokens_seen": 1502292992 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027518555667001006, + "loss": 2.6524, + "theoretical_loss": 3.5155937995423177, + "tokens_seen": 1502358528 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1720743, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.869342803955078, + "objective/train/theoretical_loss": 3.515583661720728, + "objective/train/tokens_used": 1522867680, + "theoretical_loss": 3.515583661720728, + "tokens_seen": 1502407680 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002751755265797392, + "loss": 2.6944, + "theoretical_loss": 3.5155802825412046, + "tokens_seen": 1502424064 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002751654964894684, + "loss": 2.6483, + "theoretical_loss": 3.515566766294776, + "tokens_seen": 1502489600 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002751554663991976, + "loss": 2.5766, + "theoretical_loss": 3.515553250802957, + "tokens_seen": 1502555136 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002751454363089268, + "loss": 2.8034, + "theoretical_loss": 3.5155397360656733, + "tokens_seen": 1502620672 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027513540621865596, + "loss": 2.5673, + "theoretical_loss": 3.5155262220828485, + "tokens_seen": 1502686208 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002751253761283852, + "loss": 2.6488, + "theoretical_loss": 3.515512708854409, + "tokens_seen": 1502751744 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002751153460381143, + "loss": 2.5111, + "theoretical_loss": 3.5154991963802793, + "tokens_seen": 1502817280 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027510531594784356, + "loss": 2.7109, + "theoretical_loss": 3.5154856846603835, + "tokens_seen": 1502882816 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002750952858575727, + "loss": 2.5817, + "theoretical_loss": 3.515472173694648, + "tokens_seen": 1502948352 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002750852557673019, + "loss": 2.5877, + "theoretical_loss": 3.5154586634829967, + "tokens_seen": 1503013888 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002750752256770311, + "loss": 2.6702, + "theoretical_loss": 3.5154451540253553, + "tokens_seen": 1503079424 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002750651955867603, + "loss": 2.7882, + "theoretical_loss": 3.515431645321649, + "tokens_seen": 1503144960 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027505516549648947, + "loss": 2.5627, + "theoretical_loss": 3.5154181373718023, + "tokens_seen": 1503210496 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027504513540621865, + "loss": 2.7719, + "theoretical_loss": 3.5154046301757402, + "tokens_seen": 1503276032 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027503510531594783, + "loss": 2.8695, + "theoretical_loss": 3.515391123733388, + "tokens_seen": 1503341568 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027502507522567706, + "loss": 2.4746, + "theoretical_loss": 3.5153776180446714, + "tokens_seen": 1503407104 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002750150451354062, + "loss": 2.664, + "theoretical_loss": 3.5153641131095146, + "tokens_seen": 1503472640 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002750050150451354, + "loss": 2.6918, + "theoretical_loss": 3.515350608927843, + "tokens_seen": 1503538176 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027499498495486455, + "loss": 2.8506, + "theoretical_loss": 3.515337105499582, + "tokens_seen": 1503603712 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002749849548645938, + "loss": 2.6836, + "theoretical_loss": 3.5153236028246564, + "tokens_seen": 1503669248 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027497492477432297, + "loss": 2.752, + "theoretical_loss": 3.5153101009029912, + "tokens_seen": 1503734784 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027496489468405215, + "loss": 2.6325, + "theoretical_loss": 3.515296599734512, + "tokens_seen": 1503800320 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027495486459378133, + "loss": 2.7755, + "theoretical_loss": 3.515283099319144, + "tokens_seen": 1503865856 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027494483450351057, + "loss": 2.7143, + "theoretical_loss": 3.5152695996568117, + "tokens_seen": 1503931392 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002749348044132397, + "loss": 2.6559, + "theoretical_loss": 3.515256100747441, + "tokens_seen": 1503996928 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1725747, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6323916912078857, + "objective/train/theoretical_loss": 3.515245977059499, + "objective/train/tokens_used": 1524506080, + "theoretical_loss": 3.515245977059499, + "tokens_seen": 1504046080 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027492477432296893, + "loss": 2.7248, + "theoretical_loss": 3.515242602590957, + "tokens_seen": 1504062464 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027491474423269806, + "loss": 2.5724, + "theoretical_loss": 3.5152291051872844, + "tokens_seen": 1504128000 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002749047141424273, + "loss": 2.6799, + "theoretical_loss": 3.515215608536349, + "tokens_seen": 1504193536 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027489468405215647, + "loss": 2.515, + "theoretical_loss": 3.5152021126380752, + "tokens_seen": 1504259072 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027488465396188565, + "loss": 2.4422, + "theoretical_loss": 3.5151886174923894, + "tokens_seen": 1504324608 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027487462387161483, + "loss": 2.5653, + "theoretical_loss": 3.5151751230992163, + "tokens_seen": 1504390144 + }, + { + "epoch": 5.01, + "learning_rate": 0.000274864593781344, + "loss": 2.668, + "theoretical_loss": 3.515161629458481, + "tokens_seen": 1504455680 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002748545636910732, + "loss": 2.6074, + "theoretical_loss": 3.5151481365701085, + "tokens_seen": 1504521216 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027484453360080243, + "loss": 2.7049, + "theoretical_loss": 3.515134644434025, + "tokens_seen": 1504586752 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027483450351053156, + "loss": 2.7778, + "theoretical_loss": 3.5151211530501545, + "tokens_seen": 1504652288 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002748244734202608, + "loss": 2.4842, + "theoretical_loss": 3.5151076624184237, + "tokens_seen": 1504717824 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027481444332999, + "loss": 2.7311, + "theoretical_loss": 3.515094172538757, + "tokens_seen": 1504783360 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027480441323971916, + "loss": 2.7872, + "theoretical_loss": 3.515080683411081, + "tokens_seen": 1504848896 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027479438314944834, + "loss": 2.5982, + "theoretical_loss": 3.5150671950353187, + "tokens_seen": 1504914432 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002747843530591775, + "loss": 2.6789, + "theoretical_loss": 3.515053707411397, + "tokens_seen": 1504979968 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027477432296890675, + "loss": 2.6325, + "theoretical_loss": 3.5150402205392415, + "tokens_seen": 1505045504 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027476429287863594, + "loss": 2.5831, + "theoretical_loss": 3.515026734418777, + "tokens_seen": 1505111040 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002747542627883651, + "loss": 2.6323, + "theoretical_loss": 3.515013249049929, + "tokens_seen": 1505176576 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002747442326980943, + "loss": 2.5551, + "theoretical_loss": 3.5149997644326225, + "tokens_seen": 1505242112 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002747342026078235, + "loss": 2.6337, + "theoretical_loss": 3.5149862805667835, + "tokens_seen": 1505307648 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027472417251755266, + "loss": 2.5774, + "theoretical_loss": 3.5149727974523373, + "tokens_seen": 1505373184 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002747141424272819, + "loss": 2.5681, + "theoretical_loss": 3.514959315089209, + "tokens_seen": 1505438720 + }, + { + "epoch": 5.01, + "learning_rate": 0.000274704112337011, + "loss": 2.6553, + "theoretical_loss": 3.5149458334773245, + "tokens_seen": 1505504256 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027469408224674026, + "loss": 2.5902, + "theoretical_loss": 3.5149323526166087, + "tokens_seen": 1505569792 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002746840521564694, + "loss": 2.5925, + "theoretical_loss": 3.5149188725069873, + "tokens_seen": 1505635328 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1730984, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6221024990081787, + "objective/train/theoretical_loss": 3.514908762917632, + "objective/train/tokens_used": 1526144480, + "theoretical_loss": 3.514908762917632, + "tokens_seen": 1505684480 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002746740220661986, + "loss": 2.5532, + "theoretical_loss": 3.5149053931483856, + "tokens_seen": 1505700864 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002746639919759278, + "loss": 2.5331, + "theoretical_loss": 3.51489191454073, + "tokens_seen": 1505766400 + }, + { + "epoch": 5.01, + "learning_rate": 0.000274653961885657, + "loss": 2.6332, + "theoretical_loss": 3.5148784366839445, + "tokens_seen": 1505831936 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027464393179538616, + "loss": 2.7405, + "theoretical_loss": 3.514864959577955, + "tokens_seen": 1505897472 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002746339017051154, + "loss": 2.7854, + "theoretical_loss": 3.514851483222688, + "tokens_seen": 1505963008 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002746238716148445, + "loss": 2.6165, + "theoretical_loss": 3.514838007618068, + "tokens_seen": 1506028544 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027461384152457376, + "loss": 2.7394, + "theoretical_loss": 3.5148245327640213, + "tokens_seen": 1506094080 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002746038114343029, + "loss": 2.6271, + "theoretical_loss": 3.514811058660473, + "tokens_seen": 1506159616 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002745937813440321, + "loss": 2.6569, + "theoretical_loss": 3.514797585307348, + "tokens_seen": 1506225152 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002745837512537613, + "loss": 2.7429, + "theoretical_loss": 3.514784112704573, + "tokens_seen": 1506290688 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002745737211634905, + "loss": 2.7054, + "theoretical_loss": 3.514770640852073, + "tokens_seen": 1506356224 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027456369107321967, + "loss": 2.7931, + "theoretical_loss": 3.5147571697497737, + "tokens_seen": 1506421760 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027455366098294885, + "loss": 2.6221, + "theoretical_loss": 3.5147436993976005, + "tokens_seen": 1506487296 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027454363089267803, + "loss": 2.5807, + "theoretical_loss": 3.5147302297954797, + "tokens_seen": 1506552832 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027453360080240726, + "loss": 2.5029, + "theoretical_loss": 3.514716760943336, + "tokens_seen": 1506618368 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002745235707121364, + "loss": 2.5789, + "theoretical_loss": 3.5147032928410953, + "tokens_seen": 1506683904 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002745135406218656, + "loss": 2.564, + "theoretical_loss": 3.514689825488683, + "tokens_seen": 1506749440 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027450351053159475, + "loss": 2.6247, + "theoretical_loss": 3.5146763588860255, + "tokens_seen": 1506814976 + }, + { + "epoch": 5.01, + "learning_rate": 0.000274493480441324, + "loss": 2.6367, + "theoretical_loss": 3.514662893033048, + "tokens_seen": 1506880512 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027448345035105317, + "loss": 2.6014, + "theoretical_loss": 3.514649427929676, + "tokens_seen": 1506946048 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027447342026078235, + "loss": 2.6573, + "theoretical_loss": 3.5146359635758353, + "tokens_seen": 1507011584 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027446339017051153, + "loss": 2.7415, + "theoretical_loss": 3.514622499971452, + "tokens_seen": 1507077120 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027445336008024077, + "loss": 2.7173, + "theoretical_loss": 3.5146090371164505, + "tokens_seen": 1507142656 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002744433299899699, + "loss": 2.6439, + "theoretical_loss": 3.5145955750107585, + "tokens_seen": 1507208192 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027443329989969913, + "loss": 2.534, + "theoretical_loss": 3.5145821136543, + "tokens_seen": 1507273728 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1733768, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.598609209060669, + "objective/train/theoretical_loss": 3.5145720181285967, + "objective/train/tokens_used": 1527782880, + "theoretical_loss": 3.5145720181285967, + "tokens_seen": 1507322880 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027442326980942826, + "loss": 2.6034, + "theoretical_loss": 3.514568653047002, + "tokens_seen": 1507339264 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002744132397191575, + "loss": 2.5776, + "theoretical_loss": 3.514555193188789, + "tokens_seen": 1507404800 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027440320962888667, + "loss": 2.5075, + "theoretical_loss": 3.5145417340795877, + "tokens_seen": 1507470336 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027439317953861585, + "loss": 2.6137, + "theoretical_loss": 3.5145282757193232, + "tokens_seen": 1507535872 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027438314944834503, + "loss": 2.6294, + "theoretical_loss": 3.514514818107922, + "tokens_seen": 1507601408 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002743731193580742, + "loss": 2.5912, + "theoretical_loss": 3.5145013612453093, + "tokens_seen": 1507666944 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002743630892678034, + "loss": 2.4158, + "theoretical_loss": 3.514487905131411, + "tokens_seen": 1507732480 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027435305917753263, + "loss": 2.4401, + "theoretical_loss": 3.514474449766153, + "tokens_seen": 1507798016 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027434302908726176, + "loss": 2.6716, + "theoretical_loss": 3.514460995149461, + "tokens_seen": 1507863552 + }, + { + "epoch": 5.01, + "learning_rate": 0.000274332998996991, + "loss": 2.51, + "theoretical_loss": 3.5144475412812612, + "tokens_seen": 1507929088 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002743229689067202, + "loss": 2.5596, + "theoretical_loss": 3.514434088161479, + "tokens_seen": 1507994624 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027431293881644936, + "loss": 2.5836, + "theoretical_loss": 3.5144206357900405, + "tokens_seen": 1508060160 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027430290872617854, + "loss": 2.562, + "theoretical_loss": 3.5144071841668714, + "tokens_seen": 1508125696 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002742928786359077, + "loss": 2.6443, + "theoretical_loss": 3.514393733291897, + "tokens_seen": 1508191232 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002742828485456369, + "loss": 2.6106, + "theoretical_loss": 3.5143802831650444, + "tokens_seen": 1508256768 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027427281845536614, + "loss": 2.4577, + "theoretical_loss": 3.5143668337862386, + "tokens_seen": 1508322304 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027426278836509526, + "loss": 2.6082, + "theoretical_loss": 3.514353385155406, + "tokens_seen": 1508387840 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002742527582748245, + "loss": 2.6183, + "theoretical_loss": 3.514339937272472, + "tokens_seen": 1508453376 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002742427281845536, + "loss": 2.7128, + "theoretical_loss": 3.514326490137363, + "tokens_seen": 1508518912 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027423269809428286, + "loss": 2.7367, + "theoretical_loss": 3.514313043750005, + "tokens_seen": 1508584448 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027422266800401204, + "loss": 2.6525, + "theoretical_loss": 3.514299598110323, + "tokens_seen": 1508649984 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002742126379137412, + "loss": 2.6169, + "theoretical_loss": 3.5142861532182437, + "tokens_seen": 1508715520 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002742026078234704, + "loss": 2.7413, + "theoretical_loss": 3.514272709073693, + "tokens_seen": 1508781056 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002741925777331996, + "loss": 2.5877, + "theoretical_loss": 3.514259265676597, + "tokens_seen": 1508846592 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027418254764292877, + "loss": 2.5007, + "theoretical_loss": 3.5142458230268816, + "tokens_seen": 1508912128 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1734888, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6094367504119873, + "objective/train/theoretical_loss": 3.514235741530019, + "objective/train/tokens_used": 1529421280, + "theoretical_loss": 3.514235741530019, + "tokens_seen": 1508961280 + }, + { + "epoch": 5.01, + "learning_rate": 0.000274172517552658, + "loss": 2.504, + "theoretical_loss": 3.5142323811244722, + "tokens_seen": 1508977664 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027416248746238713, + "loss": 2.7591, + "theoretical_loss": 3.514218939969296, + "tokens_seen": 1509043200 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027415245737211636, + "loss": 2.4556, + "theoretical_loss": 3.514205499561278, + "tokens_seen": 1509108736 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027414242728184554, + "loss": 2.7308, + "theoretical_loss": 3.5141920599003447, + "tokens_seen": 1509174272 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002741323971915747, + "loss": 2.7473, + "theoretical_loss": 3.5141786209864216, + "tokens_seen": 1509239808 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002741223671013039, + "loss": 2.4884, + "theoretical_loss": 3.5141651828194354, + "tokens_seen": 1509305344 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002741123370110331, + "loss": 2.702, + "theoretical_loss": 3.514151745399312, + "tokens_seen": 1509370880 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027410230692076227, + "loss": 2.7508, + "theoretical_loss": 3.514138308725977, + "tokens_seen": 1509436416 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002740922768304915, + "loss": 2.5619, + "theoretical_loss": 3.514124872799357, + "tokens_seen": 1509501952 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027408224674022063, + "loss": 2.8769, + "theoretical_loss": 3.514111437619378, + "tokens_seen": 1509567488 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027407221664994987, + "loss": 2.7444, + "theoretical_loss": 3.514098003185966, + "tokens_seen": 1509633024 + }, + { + "epoch": 5.01, + "learning_rate": 0.000274062186559679, + "loss": 2.481, + "theoretical_loss": 3.5140845694990466, + "tokens_seen": 1509698560 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027405215646940823, + "loss": 2.6672, + "theoretical_loss": 3.5140711365585466, + "tokens_seen": 1509764096 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002740421263791374, + "loss": 2.6577, + "theoretical_loss": 3.5140577043643924, + "tokens_seen": 1509829632 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002740320962888666, + "loss": 2.5711, + "theoretical_loss": 3.514044272916509, + "tokens_seen": 1509895168 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002740220661985958, + "loss": 2.5343, + "theoretical_loss": 3.514030842214824, + "tokens_seen": 1509960704 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027401203610832495, + "loss": 2.4917, + "theoretical_loss": 3.5140174122592627, + "tokens_seen": 1510026240 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002740020060180542, + "loss": 2.7321, + "theoretical_loss": 3.5140039830497507, + "tokens_seen": 1510091776 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027399197592778337, + "loss": 2.5898, + "theoretical_loss": 3.513990554586216, + "tokens_seen": 1510157312 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027398194583751255, + "loss": 2.7621, + "theoretical_loss": 3.513977126868583, + "tokens_seen": 1510222848 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027397191574724173, + "loss": 2.4087, + "theoretical_loss": 3.5139636998967783, + "tokens_seen": 1510288384 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027396188565697097, + "loss": 2.6989, + "theoretical_loss": 3.513950273670728, + "tokens_seen": 1510353920 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002739518555667001, + "loss": 2.6959, + "theoretical_loss": 3.51393684819036, + "tokens_seen": 1510419456 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027394182547642933, + "loss": 2.5562, + "theoretical_loss": 3.513923423455598, + "tokens_seen": 1510484992 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027393179538615846, + "loss": 2.4551, + "theoretical_loss": 3.51390999946637, + "tokens_seen": 1510550528 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1735676, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.696056842803955, + "objective/train/theoretical_loss": 3.513899931963661, + "objective/train/tokens_used": 1531059680, + "theoretical_loss": 3.513899931963661, + "tokens_seen": 1510599680 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002739217652958877, + "loss": 2.5213, + "theoretical_loss": 3.5138965762226015, + "tokens_seen": 1510616064 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027391173520561687, + "loss": 2.6409, + "theoretical_loss": 3.513883153724219, + "tokens_seen": 1510681600 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027390170511534605, + "loss": 2.7535, + "theoretical_loss": 3.513869731971149, + "tokens_seen": 1510747136 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027389167502507524, + "loss": 2.6657, + "theoretical_loss": 3.5138563109633174, + "tokens_seen": 1510812672 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002738816449348044, + "loss": 2.759, + "theoretical_loss": 3.5138428907006505, + "tokens_seen": 1510878208 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002738716148445336, + "loss": 2.4819, + "theoretical_loss": 3.513829471183075, + "tokens_seen": 1510943744 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027386158475426283, + "loss": 2.7243, + "theoretical_loss": 3.513816052410517, + "tokens_seen": 1511009280 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027385155466399196, + "loss": 2.7477, + "theoretical_loss": 3.5138026343829027, + "tokens_seen": 1511074816 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002738415245737212, + "loss": 2.6833, + "theoretical_loss": 3.5137892171001583, + "tokens_seen": 1511140352 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002738314944834504, + "loss": 2.567, + "theoretical_loss": 3.5137758005622106, + "tokens_seen": 1511205888 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027382146439317956, + "loss": 2.689, + "theoretical_loss": 3.513762384768986, + "tokens_seen": 1511271424 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027381143430290874, + "loss": 2.7233, + "theoretical_loss": 3.5137489697204103, + "tokens_seen": 1511336960 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002738014042126379, + "loss": 2.6076, + "theoretical_loss": 3.5137355554164103, + "tokens_seen": 1511402496 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002737913741223671, + "loss": 2.8242, + "theoretical_loss": 3.513722141856912, + "tokens_seen": 1511468032 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027378134403209634, + "loss": 2.7138, + "theoretical_loss": 3.5137087290418423, + "tokens_seen": 1511533568 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027377131394182546, + "loss": 2.6239, + "theoretical_loss": 3.5136953169711274, + "tokens_seen": 1511599104 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002737612838515547, + "loss": 2.5643, + "theoretical_loss": 3.5136819056446935, + "tokens_seen": 1511664640 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002737512537612838, + "loss": 2.8376, + "theoretical_loss": 3.5136684950624675, + "tokens_seen": 1511730176 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027374122367101306, + "loss": 2.4982, + "theoretical_loss": 3.5136550852243755, + "tokens_seen": 1511795712 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027373119358074224, + "loss": 2.5081, + "theoretical_loss": 3.5136416761303444, + "tokens_seen": 1511861248 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002737211634904714, + "loss": 2.5411, + "theoretical_loss": 3.5136282677803, + "tokens_seen": 1511926784 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002737111334002006, + "loss": 2.689, + "theoretical_loss": 3.513614860174169, + "tokens_seen": 1511992320 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002737011033099298, + "loss": 2.6541, + "theoretical_loss": 3.513601453311878, + "tokens_seen": 1512057856 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027369107321965897, + "loss": 2.6243, + "theoretical_loss": 3.5135880471933536, + "tokens_seen": 1512123392 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002736810431293882, + "loss": 2.6046, + "theoretical_loss": 3.513574641818522, + "tokens_seen": 1512188928 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1736838, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.042192220687866, + "objective/train/theoretical_loss": 3.5135645882754023, + "objective/train/tokens_used": 1532698080, + "theoretical_loss": 3.5135645882754023, + "tokens_seen": 1512238080 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027367101303911733, + "loss": 2.7713, + "theoretical_loss": 3.51356123718731, + "tokens_seen": 1512254464 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027366098294884656, + "loss": 2.7605, + "theoretical_loss": 3.513547833299644, + "tokens_seen": 1512320000 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027365095285857574, + "loss": 2.6, + "theoretical_loss": 3.5135344301554503, + "tokens_seen": 1512385536 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002736409227683049, + "loss": 2.5412, + "theoretical_loss": 3.5135210277546562, + "tokens_seen": 1512451072 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002736308926780341, + "loss": 2.7291, + "theoretical_loss": 3.513507626097187, + "tokens_seen": 1512516608 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002736208625877633, + "loss": 2.4899, + "theoretical_loss": 3.5134942251829706, + "tokens_seen": 1512582144 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027361083249749247, + "loss": 2.5796, + "theoretical_loss": 3.513480825011933, + "tokens_seen": 1512647680 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002736008024072217, + "loss": 2.5228, + "theoretical_loss": 3.513467425584001, + "tokens_seen": 1512713216 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027359077231695083, + "loss": 2.7755, + "theoretical_loss": 3.5134540268991006, + "tokens_seen": 1512778752 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027358074222668007, + "loss": 2.5736, + "theoretical_loss": 3.5134406289571585, + "tokens_seen": 1512844288 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002735707121364092, + "loss": 2.7156, + "theoretical_loss": 3.5134272317581026, + "tokens_seen": 1512909824 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027356068204613843, + "loss": 2.702, + "theoretical_loss": 3.513413835301858, + "tokens_seen": 1512975360 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002735506519558676, + "loss": 2.7033, + "theoretical_loss": 3.513400439588352, + "tokens_seen": 1513040896 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002735406218655968, + "loss": 2.6876, + "theoretical_loss": 3.513387044617511, + "tokens_seen": 1513106432 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027353059177532597, + "loss": 2.7163, + "theoretical_loss": 3.5133736503892616, + "tokens_seen": 1513171968 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027352056168505515, + "loss": 2.7879, + "theoretical_loss": 3.513360256903531, + "tokens_seen": 1513237504 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027351053159478433, + "loss": 2.6548, + "theoretical_loss": 3.5133468641602454, + "tokens_seen": 1513303040 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027350050150451357, + "loss": 2.7286, + "theoretical_loss": 3.513333472159332, + "tokens_seen": 1513368576 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002734904714142427, + "loss": 2.7595, + "theoretical_loss": 3.5133200809007166, + "tokens_seen": 1513434112 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027348044132397193, + "loss": 2.5187, + "theoretical_loss": 3.513306690384327, + "tokens_seen": 1513499648 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002734704112337011, + "loss": 2.6689, + "theoretical_loss": 3.5132933006100893, + "tokens_seen": 1513565184 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002734603811434303, + "loss": 2.5274, + "theoretical_loss": 3.5132799115779303, + "tokens_seen": 1513630720 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002734503510531595, + "loss": 2.8102, + "theoretical_loss": 3.5132665232877764, + "tokens_seen": 1513696256 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027344032096288866, + "loss": 2.6089, + "theoretical_loss": 3.5132531357395553, + "tokens_seen": 1513761792 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027343029087261784, + "loss": 2.6788, + "theoretical_loss": 3.5132397489331932, + "tokens_seen": 1513827328 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1737452, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.892695426940918, + "objective/train/theoretical_loss": 3.5132297093152225, + "objective/train/tokens_used": 1534336480, + "theoretical_loss": 3.5132297093152225, + "tokens_seen": 1513876480 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002734202607823471, + "loss": 2.5524, + "theoretical_loss": 3.5132263628686164, + "tokens_seen": 1513892864 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002734102306920762, + "loss": 2.61, + "theoretical_loss": 3.513212977545753, + "tokens_seen": 1513958400 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027340020060180544, + "loss": 2.6958, + "theoretical_loss": 3.5131995929645283, + "tokens_seen": 1514023936 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027339017051153456, + "loss": 2.6809, + "theoretical_loss": 3.5131862091248696, + "tokens_seen": 1514089472 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002733801404212638, + "loss": 2.7049, + "theoretical_loss": 3.5131728260267048, + "tokens_seen": 1514155008 + }, + { + "epoch": 5.01, + "learning_rate": 0.000273370110330993, + "loss": 2.6297, + "theoretical_loss": 3.513159443669959, + "tokens_seen": 1514220544 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027336008024072216, + "loss": 2.7736, + "theoretical_loss": 3.51314606205456, + "tokens_seen": 1514286080 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027335005015045134, + "loss": 2.6571, + "theoretical_loss": 3.5131326811804344, + "tokens_seen": 1514351616 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002733400200601806, + "loss": 2.559, + "theoretical_loss": 3.5131193010475097, + "tokens_seen": 1514417152 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002733299899699097, + "loss": 2.8363, + "theoretical_loss": 3.513105921655712, + "tokens_seen": 1514482688 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027331995987963894, + "loss": 2.7014, + "theoretical_loss": 3.5130925430049684, + "tokens_seen": 1514548224 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027330992978936807, + "loss": 2.586, + "theoretical_loss": 3.513079165095206, + "tokens_seen": 1514613760 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002732998996990973, + "loss": 2.4239, + "theoretical_loss": 3.513065787926351, + "tokens_seen": 1514679296 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002732898696088265, + "loss": 2.4609, + "theoretical_loss": 3.5130524114983315, + "tokens_seen": 1514744832 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027327983951855566, + "loss": 2.5722, + "theoretical_loss": 3.5130390358110732, + "tokens_seen": 1514810368 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002732698094282849, + "loss": 2.5675, + "theoretical_loss": 3.513025660864504, + "tokens_seen": 1514875904 + }, + { + "epoch": 5.01, + "learning_rate": 0.000273259779338014, + "loss": 2.4711, + "theoretical_loss": 3.51301228665855, + "tokens_seen": 1514941440 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027324974924774326, + "loss": 2.494, + "theoretical_loss": 3.5129989131931394, + "tokens_seen": 1515006976 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027323971915747244, + "loss": 2.4075, + "theoretical_loss": 3.5129855404681978, + "tokens_seen": 1515072512 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002732296890672016, + "loss": 2.7047, + "theoretical_loss": 3.5129721684836523, + "tokens_seen": 1515138048 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002732196589769308, + "loss": 2.5743, + "theoretical_loss": 3.5129587972394307, + "tokens_seen": 1515203584 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027320962888666, + "loss": 2.5521, + "theoretical_loss": 3.5129454267354596, + "tokens_seen": 1515269120 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027319959879638917, + "loss": 2.7014, + "theoretical_loss": 3.512932056971666, + "tokens_seen": 1515334656 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002731895687061184, + "loss": 2.6108, + "theoretical_loss": 3.5129186879479772, + "tokens_seen": 1515400192 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027317953861584753, + "loss": 2.618, + "theoretical_loss": 3.51290531966432, + "tokens_seen": 1515465728 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1739078, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.715820789337158, + "objective/train/theoretical_loss": 3.5128952939371785, + "objective/train/tokens_used": 1535974880, + "theoretical_loss": 3.5128952939371785, + "tokens_seen": 1515514880 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027316950852557676, + "loss": 2.6964, + "theoretical_loss": 3.5128919521206208, + "tokens_seen": 1515531264 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027315947843530594, + "loss": 2.8865, + "theoretical_loss": 3.512878585316807, + "tokens_seen": 1515596800 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002731494483450351, + "loss": 2.5913, + "theoretical_loss": 3.5128652192528067, + "tokens_seen": 1515662336 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002731394182547643, + "loss": 2.4339, + "theoretical_loss": 3.512851853928546, + "tokens_seen": 1515727872 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002731293881644935, + "loss": 2.5569, + "theoretical_loss": 3.5128384893439524, + "tokens_seen": 1515793408 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027311935807422267, + "loss": 2.6283, + "theoretical_loss": 3.512825125498952, + "tokens_seen": 1515858944 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002731093279839519, + "loss": 2.6183, + "theoretical_loss": 3.512811762393473, + "tokens_seen": 1515924480 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027309929789368103, + "loss": 2.5518, + "theoretical_loss": 3.512798400027442, + "tokens_seen": 1515990016 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027308926780341027, + "loss": 2.6761, + "theoretical_loss": 3.5127850384007866, + "tokens_seen": 1516055552 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002730792377131394, + "loss": 2.5924, + "theoretical_loss": 3.5127716775134328, + "tokens_seen": 1516121088 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027306920762286863, + "loss": 2.7086, + "theoretical_loss": 3.5127583173653094, + "tokens_seen": 1516186624 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002730591775325978, + "loss": 2.7092, + "theoretical_loss": 3.5127449579563423, + "tokens_seen": 1516252160 + }, + { + "epoch": 5.01, + "learning_rate": 0.000273049147442327, + "loss": 2.6952, + "theoretical_loss": 3.5127315992864587, + "tokens_seen": 1516317696 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027303911735205617, + "loss": 2.5468, + "theoretical_loss": 3.5127182413555866, + "tokens_seen": 1516383232 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027302908726178535, + "loss": 2.581, + "theoretical_loss": 3.5127048841636523, + "tokens_seen": 1516448768 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027301905717151453, + "loss": 2.3842, + "theoretical_loss": 3.512691527710584, + "tokens_seen": 1516514304 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027300902708124377, + "loss": 2.5223, + "theoretical_loss": 3.512678171996307, + "tokens_seen": 1516579840 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002729989969909729, + "loss": 2.5641, + "theoretical_loss": 3.5126648170207515, + "tokens_seen": 1516645376 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027298896690070213, + "loss": 2.8428, + "theoretical_loss": 3.512651462783842, + "tokens_seen": 1516710912 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002729789368104313, + "loss": 2.4741, + "theoretical_loss": 3.512638109285507, + "tokens_seen": 1516776448 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002729689067201605, + "loss": 2.6423, + "theoretical_loss": 3.512624756525673, + "tokens_seen": 1516841984 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002729588766298897, + "loss": 2.5588, + "theoretical_loss": 3.512611404504268, + "tokens_seen": 1516907520 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027294884653961886, + "loss": 2.7271, + "theoretical_loss": 3.512598053221219, + "tokens_seen": 1516973056 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027293881644934804, + "loss": 2.5997, + "theoretical_loss": 3.5125847026764534, + "tokens_seen": 1517038592 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002729287863590773, + "loss": 2.5554, + "theoretical_loss": 3.5125713528698985, + "tokens_seen": 1517104128 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1739851, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.781825542449951, + "objective/train/theoretical_loss": 3.5125613409993894, + "objective/train/tokens_used": 1537613280, + "theoretical_loss": 3.5125613409993894, + "tokens_seen": 1517153280 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002729187562688064, + "loss": 2.5735, + "theoretical_loss": 3.512558003801481, + "tokens_seen": 1517169664 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027290872617853564, + "loss": 2.8424, + "theoretical_loss": 3.512544655471129, + "tokens_seen": 1517235200 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027289869608826476, + "loss": 2.7819, + "theoretical_loss": 3.5125313078787697, + "tokens_seen": 1517300736 + }, + { + "epoch": 5.01, + "learning_rate": 0.000272888665997994, + "loss": 2.8765, + "theoretical_loss": 3.51251796102433, + "tokens_seen": 1517366272 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002728786359077232, + "loss": 2.6052, + "theoretical_loss": 3.512504614907737, + "tokens_seen": 1517431808 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027286860581745236, + "loss": 2.6188, + "theoretical_loss": 3.512491269528919, + "tokens_seen": 1517497344 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027285857572718154, + "loss": 2.6225, + "theoretical_loss": 3.5124779248878024, + "tokens_seen": 1517562880 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002728485456369108, + "loss": 2.6682, + "theoretical_loss": 3.5124645809843154, + "tokens_seen": 1517628416 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002728385155466399, + "loss": 2.7252, + "theoretical_loss": 3.512451237818385, + "tokens_seen": 1517693952 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027282848545636914, + "loss": 2.5136, + "theoretical_loss": 3.512437895389938, + "tokens_seen": 1517759488 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027281845536609827, + "loss": 2.5185, + "theoretical_loss": 3.512424553698903, + "tokens_seen": 1517825024 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002728084252758275, + "loss": 2.5597, + "theoretical_loss": 3.5124112127452065, + "tokens_seen": 1517890560 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002727983951855567, + "loss": 2.5773, + "theoretical_loss": 3.5123978725287763, + "tokens_seen": 1517956096 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027278836509528586, + "loss": 2.4945, + "theoretical_loss": 3.5123845330495396, + "tokens_seen": 1518021632 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027277833500501504, + "loss": 2.591, + "theoretical_loss": 3.5123711943074234, + "tokens_seen": 1518087168 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002727683049147442, + "loss": 2.6178, + "theoretical_loss": 3.5123578563023568, + "tokens_seen": 1518152704 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002727582748244734, + "loss": 2.6907, + "theoretical_loss": 3.5123445190342655, + "tokens_seen": 1518218240 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027274824473420264, + "loss": 2.376, + "theoretical_loss": 3.5123311825030776, + "tokens_seen": 1518283776 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027273821464393177, + "loss": 2.5858, + "theoretical_loss": 3.512317846708721, + "tokens_seen": 1518349312 + }, + { + "epoch": 5.01, + "learning_rate": 0.000272728184553661, + "loss": 2.5271, + "theoretical_loss": 3.5123045116511227, + "tokens_seen": 1518414848 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027271815446339013, + "loss": 2.6194, + "theoretical_loss": 3.51229117733021, + "tokens_seen": 1518480384 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027270812437311937, + "loss": 2.5939, + "theoretical_loss": 3.5122778437459106, + "tokens_seen": 1518545920 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027269809428284855, + "loss": 2.6146, + "theoretical_loss": 3.5122645108981523, + "tokens_seen": 1518611456 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027268806419257773, + "loss": 2.6375, + "theoretical_loss": 3.5122511787868627, + "tokens_seen": 1518676992 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002726780341023069, + "loss": 2.6602, + "theoretical_loss": 3.512237847411969, + "tokens_seen": 1518742528 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1741412, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7110002040863037, + "objective/train/theoretical_loss": 3.512227849364015, + "objective/train/tokens_used": 1539251680, + "theoretical_loss": 3.512227849364015, + "tokens_seen": 1518791680 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027266800401203614, + "loss": 2.5932, + "theoretical_loss": 3.512224516773399, + "tokens_seen": 1518808064 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027265797392176527, + "loss": 2.7002, + "theoretical_loss": 3.51221118687108, + "tokens_seen": 1518873600 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002726479438314945, + "loss": 2.6501, + "theoretical_loss": 3.5121978577049395, + "tokens_seen": 1518939136 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027263791374122363, + "loss": 2.6066, + "theoretical_loss": 3.512184529274905, + "tokens_seen": 1519004672 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027262788365095287, + "loss": 2.6338, + "theoretical_loss": 3.5121712015809052, + "tokens_seen": 1519070208 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027261785356068205, + "loss": 2.7052, + "theoretical_loss": 3.5121578746228668, + "tokens_seen": 1519135744 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027260782347041123, + "loss": 2.6389, + "theoretical_loss": 3.512144548400717, + "tokens_seen": 1519201280 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002725977933801404, + "loss": 2.7032, + "theoretical_loss": 3.512131222914384, + "tokens_seen": 1519266816 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002725877632898696, + "loss": 2.6396, + "theoretical_loss": 3.512117898163795, + "tokens_seen": 1519332352 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002725777331995988, + "loss": 2.6893, + "theoretical_loss": 3.5121045741488786, + "tokens_seen": 1519397888 + }, + { + "epoch": 5.01, + "learning_rate": 0.000272567703109328, + "loss": 2.7178, + "theoretical_loss": 3.5120912508695614, + "tokens_seen": 1519463424 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027255767301905714, + "loss": 2.8325, + "theoretical_loss": 3.5120779283257715, + "tokens_seen": 1519528960 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027254764292878637, + "loss": 2.5731, + "theoretical_loss": 3.512064606517437, + "tokens_seen": 1519594496 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002725376128385155, + "loss": 2.5434, + "theoretical_loss": 3.5120512854444845, + "tokens_seen": 1519660032 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027252758274824473, + "loss": 2.5804, + "theoretical_loss": 3.512037965106843, + "tokens_seen": 1519725568 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027251755265797397, + "loss": 2.5707, + "theoretical_loss": 3.5120246455044395, + "tokens_seen": 1519791104 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002725075225677031, + "loss": 2.6185, + "theoretical_loss": 3.512011326637201, + "tokens_seen": 1519856640 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027249749247743233, + "loss": 2.5685, + "theoretical_loss": 3.5119980085050573, + "tokens_seen": 1519922176 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002724874623871615, + "loss": 2.6181, + "theoretical_loss": 3.5119846911079335, + "tokens_seen": 1519987712 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002724774322968907, + "loss": 2.5499, + "theoretical_loss": 3.5119713744457597, + "tokens_seen": 1520053248 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002724674022066199, + "loss": 2.4947, + "theoretical_loss": 3.511958058518462, + "tokens_seen": 1520118784 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027245737211634906, + "loss": 2.6411, + "theoretical_loss": 3.511944743325969, + "tokens_seen": 1520184320 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027244734202607824, + "loss": 2.5907, + "theoretical_loss": 3.511931428868208, + "tokens_seen": 1520249856 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002724373119358075, + "loss": 2.5607, + "theoretical_loss": 3.5119181151451078, + "tokens_seen": 1520315392 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002724272818455366, + "loss": 2.4661, + "theoretical_loss": 3.511904802156595, + "tokens_seen": 1520380928 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1741916, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7754528522491455, + "objective/train/theoretical_loss": 3.51189481789724, + "objective/train/tokens_used": 1540890080, + "theoretical_loss": 3.51189481789724, + "tokens_seen": 1520430080 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027241725175526584, + "loss": 2.504, + "theoretical_loss": 3.511891489902598, + "tokens_seen": 1520446464 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027240722166499496, + "loss": 2.7549, + "theoretical_loss": 3.511878178383044, + "tokens_seen": 1520512000 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002723971915747242, + "loss": 2.7554, + "theoretical_loss": 3.511864867597862, + "tokens_seen": 1520577536 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002723871614844534, + "loss": 2.7106, + "theoretical_loss": 3.5118515575469784, + "tokens_seen": 1520643072 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027237713139418256, + "loss": 2.6407, + "theoretical_loss": 3.5118382482303225, + "tokens_seen": 1520708608 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027236710130391174, + "loss": 2.5378, + "theoretical_loss": 3.511824939647821, + "tokens_seen": 1520774144 + }, + { + "epoch": 5.01, + "learning_rate": 0.000272357071213641, + "loss": 2.7233, + "theoretical_loss": 3.5118116317994024, + "tokens_seen": 1520839680 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002723470411233701, + "loss": 2.9172, + "theoretical_loss": 3.5117983246849946, + "tokens_seen": 1520905216 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027233701103309934, + "loss": 2.4646, + "theoretical_loss": 3.5117850183045247, + "tokens_seen": 1520970752 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027232698094282847, + "loss": 2.6007, + "theoretical_loss": 3.511771712657922, + "tokens_seen": 1521036288 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002723169508525577, + "loss": 2.4467, + "theoretical_loss": 3.5117584077451127, + "tokens_seen": 1521101824 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002723069207622869, + "loss": 2.5762, + "theoretical_loss": 3.5117451035660263, + "tokens_seen": 1521167360 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027229689067201606, + "loss": 2.7585, + "theoretical_loss": 3.51173180012059, + "tokens_seen": 1521232896 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027228686058174524, + "loss": 2.8096, + "theoretical_loss": 3.5117184974087317, + "tokens_seen": 1521298432 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002722768304914744, + "loss": 2.6325, + "theoretical_loss": 3.5117051954303795, + "tokens_seen": 1521363968 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002722668004012036, + "loss": 2.6678, + "theoretical_loss": 3.511691894185461, + "tokens_seen": 1521429504 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027225677031093284, + "loss": 2.7784, + "theoretical_loss": 3.5116785936739046, + "tokens_seen": 1521495040 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027224674022066197, + "loss": 2.8446, + "theoretical_loss": 3.5116652938956383, + "tokens_seen": 1521560576 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002722367101303912, + "loss": 2.9045, + "theoretical_loss": 3.51165199485059, + "tokens_seen": 1521626112 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027222668004012033, + "loss": 2.5861, + "theoretical_loss": 3.511638696538687, + "tokens_seen": 1521691648 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027221664994984957, + "loss": 2.6447, + "theoretical_loss": 3.5116253989598585, + "tokens_seen": 1521757184 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027220661985957875, + "loss": 2.5343, + "theoretical_loss": 3.5116121021140323, + "tokens_seen": 1521822720 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027219658976930793, + "loss": 2.7088, + "theoretical_loss": 3.5115988060011354, + "tokens_seen": 1521888256 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002721865596790371, + "loss": 2.4863, + "theoretical_loss": 3.5115855106210967, + "tokens_seen": 1521953792 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027217652958876635, + "loss": 2.5216, + "theoretical_loss": 3.5115722159738443, + "tokens_seen": 1522019328 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1743369, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8488197326660156, + "objective/train/theoretical_loss": 3.5115622454692526, + "objective/train/tokens_used": 1542528480, + "theoretical_loss": 3.5115622454692526, + "tokens_seen": 1522068480 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027216649949849547, + "loss": 2.6903, + "theoretical_loss": 3.511558922059306, + "tokens_seen": 1522084864 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002721564694082247, + "loss": 2.5928, + "theoretical_loss": 3.51154562887741, + "tokens_seen": 1522150400 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027214643931795383, + "loss": 2.532, + "theoretical_loss": 3.5115323364280844, + "tokens_seen": 1522215936 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027213640922768307, + "loss": 2.502, + "theoretical_loss": 3.5115190447112568, + "tokens_seen": 1522281472 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027212637913741225, + "loss": 2.5598, + "theoretical_loss": 3.511505753726856, + "tokens_seen": 1522347008 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027211634904714143, + "loss": 2.5905, + "theoretical_loss": 3.51149246347481, + "tokens_seen": 1522412544 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002721063189568706, + "loss": 2.7652, + "theoretical_loss": 3.5114791739550464, + "tokens_seen": 1522478080 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002720962888665998, + "loss": 2.6179, + "theoretical_loss": 3.511465885167494, + "tokens_seen": 1522543616 + }, + { + "epoch": 5.01, + "learning_rate": 0.000272086258776329, + "loss": 2.6407, + "theoretical_loss": 3.51145259711208, + "tokens_seen": 1522609152 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002720762286860582, + "loss": 2.5974, + "theoretical_loss": 3.511439309788734, + "tokens_seen": 1522674688 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027206619859578734, + "loss": 2.6676, + "theoretical_loss": 3.511426023197383, + "tokens_seen": 1522740224 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027205616850551657, + "loss": 2.4657, + "theoretical_loss": 3.511412737337955, + "tokens_seen": 1522805760 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002720461384152457, + "loss": 2.5989, + "theoretical_loss": 3.5113994522103793, + "tokens_seen": 1522871296 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027203610832497493, + "loss": 2.4463, + "theoretical_loss": 3.5113861678145835, + "tokens_seen": 1522936832 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002720260782347041, + "loss": 2.5057, + "theoretical_loss": 3.511372884150495, + "tokens_seen": 1523002368 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002720160481444333, + "loss": 2.6699, + "theoretical_loss": 3.511359601218044, + "tokens_seen": 1523067904 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002720060180541625, + "loss": 2.723, + "theoretical_loss": 3.5113463190171568, + "tokens_seen": 1523133440 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002719959879638917, + "loss": 2.7831, + "theoretical_loss": 3.5113330375477623, + "tokens_seen": 1523198976 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027198595787362084, + "loss": 2.7578, + "theoretical_loss": 3.5113197568097894, + "tokens_seen": 1523264512 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002719759277833501, + "loss": 2.4327, + "theoretical_loss": 3.511306476803165, + "tokens_seen": 1523330048 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002719658976930792, + "loss": 2.6328, + "theoretical_loss": 3.5112931975278183, + "tokens_seen": 1523395584 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027195586760280844, + "loss": 2.3377, + "theoretical_loss": 3.5112799189836776, + "tokens_seen": 1523461120 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002719458375125376, + "loss": 2.5108, + "theoretical_loss": 3.511266641170671, + "tokens_seen": 1523526656 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002719358074222668, + "loss": 2.4412, + "theoretical_loss": 3.511253364088727, + "tokens_seen": 1523592192 + }, + { + "epoch": 5.01, + "learning_rate": 0.000271925777331996, + "loss": 2.7561, + "theoretical_loss": 3.5112400877377734, + "tokens_seen": 1523657728 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1744115, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8299615383148193, + "objective/train/theoretical_loss": 3.5112301309542273, + "objective/train/tokens_used": 1544166880, + "theoretical_loss": 3.5112301309542273, + "tokens_seen": 1523706880 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027191574724172516, + "loss": 2.7278, + "theoretical_loss": 3.5112268121177386, + "tokens_seen": 1523723264 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027190571715145434, + "loss": 2.9239, + "theoretical_loss": 3.511213537228551, + "tokens_seen": 1523788800 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002718956870611836, + "loss": 2.5451, + "theoretical_loss": 3.5112002630701395, + "tokens_seen": 1523854336 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002718856569709127, + "loss": 2.5095, + "theoretical_loss": 3.5111869896424324, + "tokens_seen": 1523919872 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027187562688064194, + "loss": 2.7109, + "theoretical_loss": 3.5111737169453567, + "tokens_seen": 1523985408 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027186559679037107, + "loss": 2.753, + "theoretical_loss": 3.5111604449788425, + "tokens_seen": 1524050944 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002718555667001003, + "loss": 2.6246, + "theoretical_loss": 3.511147173742817, + "tokens_seen": 1524116480 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002718455366098295, + "loss": 2.4407, + "theoretical_loss": 3.511133903237209, + "tokens_seen": 1524182016 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027183550651955867, + "loss": 2.674, + "theoretical_loss": 3.511120633461947, + "tokens_seen": 1524247552 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027182547642928785, + "loss": 2.5761, + "theoretical_loss": 3.5111073644169593, + "tokens_seen": 1524313088 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002718154463390171, + "loss": 2.6593, + "theoretical_loss": 3.511094096102174, + "tokens_seen": 1524378624 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002718054162487462, + "loss": 2.5198, + "theoretical_loss": 3.51108082851752, + "tokens_seen": 1524444160 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027179538615847544, + "loss": 2.6014, + "theoretical_loss": 3.511067561662926, + "tokens_seen": 1524509696 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027178535606820457, + "loss": 2.7626, + "theoretical_loss": 3.5110542955383197, + "tokens_seen": 1524575232 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002717753259779338, + "loss": 2.3878, + "theoretical_loss": 3.51104103014363, + "tokens_seen": 1524640768 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027176529588766304, + "loss": 2.4231, + "theoretical_loss": 3.5110277654787847, + "tokens_seen": 1524706304 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027175526579739217, + "loss": 2.6618, + "theoretical_loss": 3.5110145015437135, + "tokens_seen": 1524771840 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002717452357071214, + "loss": 2.7225, + "theoretical_loss": 3.511001238338344, + "tokens_seen": 1524837376 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027173520561685053, + "loss": 2.6813, + "theoretical_loss": 3.5109879758626046, + "tokens_seen": 1524902912 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027172517552657977, + "loss": 2.6589, + "theoretical_loss": 3.5109747141164243, + "tokens_seen": 1524968448 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027171514543630895, + "loss": 2.7244, + "theoretical_loss": 3.5109614530997315, + "tokens_seen": 1525033984 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027170511534603813, + "loss": 2.5411, + "theoretical_loss": 3.510948192812455, + "tokens_seen": 1525099520 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002716950852557673, + "loss": 2.6788, + "theoretical_loss": 3.5109349332545223, + "tokens_seen": 1525165056 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027168505516549655, + "loss": 2.607, + "theoretical_loss": 3.510921674425863, + "tokens_seen": 1525230592 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027167502507522567, + "loss": 2.4647, + "theoretical_loss": 3.510908416326405, + "tokens_seen": 1525296128 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1744854, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9327380657196045, + "objective/train/theoretical_loss": 3.5108984732303075, + "objective/train/tokens_used": 1545805280, + "theoretical_loss": 3.5108984732303075, + "tokens_seen": 1525345280 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002716649949849549, + "loss": 2.6041, + "theoretical_loss": 3.5108951589560773, + "tokens_seen": 1525361664 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027165496489468403, + "loss": 2.6702, + "theoretical_loss": 3.5108819023148086, + "tokens_seen": 1525427200 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027164493480441327, + "loss": 2.725, + "theoretical_loss": 3.5108686464025274, + "tokens_seen": 1525492736 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027163490471414245, + "loss": 2.749, + "theoretical_loss": 3.5108553912191613, + "tokens_seen": 1525558272 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027162487462387163, + "loss": 2.7232, + "theoretical_loss": 3.5108421367646403, + "tokens_seen": 1525623808 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002716148445336008, + "loss": 2.596, + "theoretical_loss": 3.5108288830388923, + "tokens_seen": 1525689344 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027160481444333, + "loss": 2.5888, + "theoretical_loss": 3.5108156300418463, + "tokens_seen": 1525754880 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002715947843530592, + "loss": 2.5965, + "theoretical_loss": 3.5108023777734303, + "tokens_seen": 1525820416 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002715847542627884, + "loss": 2.5714, + "theoretical_loss": 3.5107891262335738, + "tokens_seen": 1525885952 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027157472417251754, + "loss": 2.7426, + "theoretical_loss": 3.5107758754222047, + "tokens_seen": 1525951488 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002715646940822468, + "loss": 2.4285, + "theoretical_loss": 3.510762625339252, + "tokens_seen": 1526017024 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002715546639919759, + "loss": 2.4279, + "theoretical_loss": 3.5107493759846444, + "tokens_seen": 1526082560 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027154463390170514, + "loss": 2.433, + "theoretical_loss": 3.5107361273583106, + "tokens_seen": 1526148096 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002715346038114343, + "loss": 2.8518, + "theoretical_loss": 3.5107228794601797, + "tokens_seen": 1526213632 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002715245737211635, + "loss": 2.5255, + "theoretical_loss": 3.510709632290179, + "tokens_seen": 1526279168 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002715145436308927, + "loss": 2.759, + "theoretical_loss": 3.510696385848239, + "tokens_seen": 1526344704 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002715045135406219, + "loss": 2.4789, + "theoretical_loss": 3.5106831401342875, + "tokens_seen": 1526410240 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027149448345035104, + "loss": 2.3398, + "theoretical_loss": 3.510669895148253, + "tokens_seen": 1526475776 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002714844533600803, + "loss": 2.5167, + "theoretical_loss": 3.5106566508900645, + "tokens_seen": 1526541312 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002714744232698094, + "loss": 2.5327, + "theoretical_loss": 3.510643407359651, + "tokens_seen": 1526606848 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027146439317953864, + "loss": 2.558, + "theoretical_loss": 3.5106301645569413, + "tokens_seen": 1526672384 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002714543630892678, + "loss": 2.6568, + "theoretical_loss": 3.510616922481864, + "tokens_seen": 1526737920 + }, + { + "epoch": 5.01, + "learning_rate": 0.000271444332998997, + "loss": 2.6549, + "theoretical_loss": 3.5106036811343477, + "tokens_seen": 1526803456 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002714343029087262, + "loss": 2.7285, + "theoretical_loss": 3.5105904405143216, + "tokens_seen": 1526868992 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027142427281845536, + "loss": 2.4567, + "theoretical_loss": 3.5105772006217144, + "tokens_seen": 1526934528 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1746475, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7063658237457275, + "objective/train/theoretical_loss": 3.5105672711795846, + "objective/train/tokens_used": 1547443680, + "theoretical_loss": 3.5105672711795846, + "tokens_seen": 1526983680 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027141424272818454, + "loss": 2.7586, + "theoretical_loss": 3.5105639614564543, + "tokens_seen": 1527000064 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002714042126379138, + "loss": 2.5778, + "theoretical_loss": 3.510550723018471, + "tokens_seen": 1527065600 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002713941825476429, + "loss": 2.6325, + "theoretical_loss": 3.510537485307693, + "tokens_seen": 1527131136 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027138415245737214, + "loss": 2.6391, + "theoretical_loss": 3.5105242483240486, + "tokens_seen": 1527196672 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027137412236710127, + "loss": 2.6166, + "theoretical_loss": 3.510511012067468, + "tokens_seen": 1527262208 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002713640922768305, + "loss": 2.8506, + "theoretical_loss": 3.510497776537879, + "tokens_seen": 1527327744 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002713540621865597, + "loss": 2.5764, + "theoretical_loss": 3.5104845417352104, + "tokens_seen": 1527393280 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027134403209628887, + "loss": 2.6689, + "theoretical_loss": 3.510471307659392, + "tokens_seen": 1527458816 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027133400200601805, + "loss": 2.7366, + "theoretical_loss": 3.5104580743103515, + "tokens_seen": 1527524352 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002713239719157473, + "loss": 2.6548, + "theoretical_loss": 3.5104448416880185, + "tokens_seen": 1527589888 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002713139418254764, + "loss": 2.5055, + "theoretical_loss": 3.510431609792322, + "tokens_seen": 1527655424 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027130391173520564, + "loss": 2.6919, + "theoretical_loss": 3.510418378623191, + "tokens_seen": 1527720960 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027129388164493477, + "loss": 2.437, + "theoretical_loss": 3.510405148180554, + "tokens_seen": 1527786496 + }, + { + "epoch": 5.01, + "learning_rate": 0.000271283851554664, + "loss": 2.5164, + "theoretical_loss": 3.5103919184643404, + "tokens_seen": 1527852032 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002712738214643932, + "loss": 2.5583, + "theoretical_loss": 3.5103786894744786, + "tokens_seen": 1527917568 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027126379137412237, + "loss": 2.5135, + "theoretical_loss": 3.510365461210898, + "tokens_seen": 1527983104 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027125376128385155, + "loss": 2.7525, + "theoretical_loss": 3.5103522336735278, + "tokens_seen": 1528048640 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027124373119358073, + "loss": 2.6378, + "theoretical_loss": 3.510339006862296, + "tokens_seen": 1528114176 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002712337011033099, + "loss": 2.7063, + "theoretical_loss": 3.5103257807771326, + "tokens_seen": 1528179712 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027122367101303915, + "loss": 2.4968, + "theoretical_loss": 3.5103125554179666, + "tokens_seen": 1528245248 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002712136409227683, + "loss": 2.6722, + "theoretical_loss": 3.5102993307847266, + "tokens_seen": 1528310784 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002712036108324975, + "loss": 2.5445, + "theoretical_loss": 3.510286106877341, + "tokens_seen": 1528376320 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002711935807422267, + "loss": 2.4946, + "theoretical_loss": 3.51027288369574, + "tokens_seen": 1528441856 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027118355065195587, + "loss": 2.6322, + "theoretical_loss": 3.5102596612398522, + "tokens_seen": 1528507392 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027117352056168505, + "loss": 2.7352, + "theoretical_loss": 3.510246439509607, + "tokens_seen": 1528572928 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1747177, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3864831924438477, + "objective/train/theoretical_loss": 3.5102365236880826, + "objective/train/tokens_used": 1549082080, + "theoretical_loss": 3.5102365236880826, + "tokens_seen": 1528622080 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027116349047141423, + "loss": 2.586, + "theoretical_loss": 3.5102332185049328, + "tokens_seen": 1528638464 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002711534603811434, + "loss": 2.5084, + "theoretical_loss": 3.510219998225759, + "tokens_seen": 1528704000 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027114343029087265, + "loss": 2.6257, + "theoretical_loss": 3.5102067786720146, + "tokens_seen": 1528769536 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002711334002006018, + "loss": 2.5572, + "theoretical_loss": 3.510193559843629, + "tokens_seen": 1528835072 + }, + { + "epoch": 5.01, + "learning_rate": 0.000271123370110331, + "loss": 2.5593, + "theoretical_loss": 3.510180341740531, + "tokens_seen": 1528900608 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027111334002006014, + "loss": 2.6829, + "theoretical_loss": 3.51016712436265, + "tokens_seen": 1528966144 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002711033099297894, + "loss": 2.7308, + "theoretical_loss": 3.510153907709914, + "tokens_seen": 1529031680 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027109327983951856, + "loss": 2.76, + "theoretical_loss": 3.5101406917822544, + "tokens_seen": 1529097216 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027108324974924774, + "loss": 2.4291, + "theoretical_loss": 3.5101274765795987, + "tokens_seen": 1529162752 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002710732196589769, + "loss": 2.5715, + "theoretical_loss": 3.510114262101876, + "tokens_seen": 1529228288 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002710631895687061, + "loss": 2.822, + "theoretical_loss": 3.510101048349016, + "tokens_seen": 1529293824 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002710531594784353, + "loss": 2.5231, + "theoretical_loss": 3.510087835320948, + "tokens_seen": 1529359360 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002710431293881645, + "loss": 2.6624, + "theoretical_loss": 3.5100746230176005, + "tokens_seen": 1529424896 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027103309929789364, + "loss": 2.5751, + "theoretical_loss": 3.5100614114389033, + "tokens_seen": 1529490432 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002710230692076229, + "loss": 2.7269, + "theoretical_loss": 3.5100482005847855, + "tokens_seen": 1529555968 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002710130391173521, + "loss": 2.7465, + "theoretical_loss": 3.5100349904551766, + "tokens_seen": 1529621504 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027100300902708124, + "loss": 2.4731, + "theoretical_loss": 3.5100217810500047, + "tokens_seen": 1529687040 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002709929789368105, + "loss": 2.578, + "theoretical_loss": 3.5100085723692, + "tokens_seen": 1529752576 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002709829488465396, + "loss": 2.4907, + "theoretical_loss": 3.509995364412692, + "tokens_seen": 1529818112 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027097291875626884, + "loss": 2.592, + "theoretical_loss": 3.509982157180409, + "tokens_seen": 1529883648 + }, + { + "epoch": 5.01, + "learning_rate": 0.000270962888665998, + "loss": 2.54, + "theoretical_loss": 3.5099689506722815, + "tokens_seen": 1529949184 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002709528585757272, + "loss": 2.6529, + "theoretical_loss": 3.5099557448882375, + "tokens_seen": 1530014720 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002709428284854564, + "loss": 2.7304, + "theoretical_loss": 3.509942539828207, + "tokens_seen": 1530080256 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027093279839518556, + "loss": 2.5196, + "theoretical_loss": 3.509929335492119, + "tokens_seen": 1530145792 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027092276830491474, + "loss": 2.6669, + "theoretical_loss": 3.5099161318799035, + "tokens_seen": 1530211328 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1748466, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8011348247528076, + "objective/train/theoretical_loss": 3.5099062296457397, + "objective/train/tokens_used": 1550720480, + "theoretical_loss": 3.5099062296457397, + "tokens_seen": 1530260480 + }, + { + "epoch": 5.01, + "learning_rate": 0.000270912738214644, + "loss": 2.5529, + "theoretical_loss": 3.5099029289914885, + "tokens_seen": 1530276864 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002709027081243731, + "loss": 2.4209, + "theoretical_loss": 3.509889726826805, + "tokens_seen": 1530342400 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027089267803410234, + "loss": 2.7938, + "theoretical_loss": 3.509876525385781, + "tokens_seen": 1530407936 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027088264794383147, + "loss": 2.5396, + "theoretical_loss": 3.5098633246683457, + "tokens_seen": 1530473472 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002708726178535607, + "loss": 2.5344, + "theoretical_loss": 3.50985012467443, + "tokens_seen": 1530539008 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002708625877632899, + "loss": 2.6562, + "theoretical_loss": 3.509836925403962, + "tokens_seen": 1530604544 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027085255767301907, + "loss": 2.6576, + "theoretical_loss": 3.509823726856871, + "tokens_seen": 1530670080 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027084252758274825, + "loss": 2.6448, + "theoretical_loss": 3.5098105290330874, + "tokens_seen": 1530735616 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002708324974924775, + "loss": 2.7434, + "theoretical_loss": 3.509797331932539, + "tokens_seen": 1530801152 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002708224674022066, + "loss": 2.6026, + "theoretical_loss": 3.509784135555157, + "tokens_seen": 1530866688 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027081243731193584, + "loss": 2.6652, + "theoretical_loss": 3.5097709399008696, + "tokens_seen": 1530932224 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027080240722166497, + "loss": 2.4217, + "theoretical_loss": 3.5097577449696074, + "tokens_seen": 1530997760 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002707923771313942, + "loss": 2.6055, + "theoretical_loss": 3.5097445507612983, + "tokens_seen": 1531063296 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002707823470411234, + "loss": 2.6107, + "theoretical_loss": 3.5097313572758724, + "tokens_seen": 1531128832 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027077231695085257, + "loss": 2.6486, + "theoretical_loss": 3.50971816451326, + "tokens_seen": 1531194368 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027076228686058175, + "loss": 2.6052, + "theoretical_loss": 3.509704972473389, + "tokens_seen": 1531259904 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027075225677031093, + "loss": 2.7182, + "theoretical_loss": 3.5096917811561905, + "tokens_seen": 1531325440 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002707422266800401, + "loss": 2.6964, + "theoretical_loss": 3.5096785905615926, + "tokens_seen": 1531390976 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027073219658976935, + "loss": 2.5845, + "theoretical_loss": 3.509665400689525, + "tokens_seen": 1531456512 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002707221664994985, + "loss": 2.6935, + "theoretical_loss": 3.5096522115399185, + "tokens_seen": 1531522048 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002707121364092277, + "loss": 2.6935, + "theoretical_loss": 3.509639023112701, + "tokens_seen": 1531587584 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002707021063189569, + "loss": 2.4785, + "theoretical_loss": 3.5096258354078036, + "tokens_seen": 1531653120 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027069207622868607, + "loss": 2.4478, + "theoretical_loss": 3.509612648425154, + "tokens_seen": 1531718656 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027068204613841525, + "loss": 2.5772, + "theoretical_loss": 3.509599462164683, + "tokens_seen": 1531784192 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027067201604814443, + "loss": 2.6304, + "theoretical_loss": 3.5095862766263206, + "tokens_seen": 1531849728 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1749169, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.604491949081421, + "objective/train/theoretical_loss": 3.509576387946389, + "objective/train/tokens_used": 1552358880, + "theoretical_loss": 3.509576387946389, + "tokens_seen": 1531898880 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002706619859578736, + "loss": 2.7063, + "theoretical_loss": 3.5095730918099948, + "tokens_seen": 1531915264 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027065195586760285, + "loss": 2.7013, + "theoretical_loss": 3.5095599077156363, + "tokens_seen": 1531980800 + }, + { + "epoch": 5.01, + "learning_rate": 0.000270641925777332, + "loss": 2.6335, + "theoretical_loss": 3.509546724343174, + "tokens_seen": 1532046336 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002706318956870612, + "loss": 2.4774, + "theoretical_loss": 3.5095335416925386, + "tokens_seen": 1532111872 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027062186559679034, + "loss": 2.5814, + "theoretical_loss": 3.5095203597636586, + "tokens_seen": 1532177408 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002706118355065196, + "loss": 2.6218, + "theoretical_loss": 3.509507178556464, + "tokens_seen": 1532242944 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027060180541624876, + "loss": 2.5315, + "theoretical_loss": 3.5094939980708846, + "tokens_seen": 1532308480 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027059177532597794, + "loss": 2.7335, + "theoretical_loss": 3.5094808183068498, + "tokens_seen": 1532374016 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002705817452357071, + "loss": 2.503, + "theoretical_loss": 3.509467639264289, + "tokens_seen": 1532439552 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002705717151454363, + "loss": 2.6337, + "theoretical_loss": 3.5094544609431324, + "tokens_seen": 1532505088 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002705616850551655, + "loss": 2.5151, + "theoretical_loss": 3.5094412833433095, + "tokens_seen": 1532570624 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002705516549648947, + "loss": 2.4761, + "theoretical_loss": 3.5094281064647497, + "tokens_seen": 1532636160 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027054162487462384, + "loss": 2.6397, + "theoretical_loss": 3.5094149303073827, + "tokens_seen": 1532701696 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002705315947843531, + "loss": 2.796, + "theoretical_loss": 3.509401754871139, + "tokens_seen": 1532767232 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027052156469408226, + "loss": 2.5567, + "theoretical_loss": 3.5093885801559472, + "tokens_seen": 1532832768 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027051153460381144, + "loss": 2.4442, + "theoretical_loss": 3.5093754061617375, + "tokens_seen": 1532898304 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002705015045135406, + "loss": 2.3743, + "theoretical_loss": 3.50936223288844, + "tokens_seen": 1532963840 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002704914744232698, + "loss": 2.8448, + "theoretical_loss": 3.5093490603359836, + "tokens_seen": 1533029376 + }, + { + "epoch": 5.01, + "learning_rate": 0.000270481444332999, + "loss": 2.6492, + "theoretical_loss": 3.5093358885042987, + "tokens_seen": 1533094912 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002704714142427282, + "loss": 2.6417, + "theoretical_loss": 3.5093227173933146, + "tokens_seen": 1533160448 + }, + { + "epoch": 5.01, + "learning_rate": 0.00027046138415245735, + "loss": 2.6674, + "theoretical_loss": 3.5093095470029616, + "tokens_seen": 1533225984 + }, + { + "epoch": 5.01, + "learning_rate": 0.0002704513540621866, + "loss": 2.7446, + "theoretical_loss": 3.509296377333169, + "tokens_seen": 1533291520 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002704413239719157, + "loss": 2.5301, + "theoretical_loss": 3.509283208383867, + "tokens_seen": 1533357056 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027043129388164494, + "loss": 2.2968, + "theoretical_loss": 3.509270040154985, + "tokens_seen": 1533422592 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002704212637913741, + "loss": 2.4793, + "theoretical_loss": 3.5092568726464526, + "tokens_seen": 1533488128 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1750709, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6057729721069336, + "objective/train/theoretical_loss": 3.5092469974877414, + "objective/train/tokens_used": 1553997280, + "theoretical_loss": 3.5092469974877414, + "tokens_seen": 1533537280 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002704112337011033, + "loss": 2.634, + "theoretical_loss": 3.509243705858201, + "tokens_seen": 1533553664 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002704012036108325, + "loss": 2.6113, + "theoretical_loss": 3.509230539790158, + "tokens_seen": 1533619200 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027039117352056167, + "loss": 2.5485, + "theoretical_loss": 3.5092173744422546, + "tokens_seen": 1533684736 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027038114343029085, + "loss": 2.6423, + "theoretical_loss": 3.5092042098144205, + "tokens_seen": 1533750272 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002703711133400201, + "loss": 2.7032, + "theoretical_loss": 3.5091910459065856, + "tokens_seen": 1533815808 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002703610832497492, + "loss": 2.5698, + "theoretical_loss": 3.5091778827186797, + "tokens_seen": 1533881344 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027035105315947845, + "loss": 2.6178, + "theoretical_loss": 3.5091647202506326, + "tokens_seen": 1533946880 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027034102306920763, + "loss": 2.5169, + "theoretical_loss": 3.5091515585023743, + "tokens_seen": 1534012416 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002703309929789368, + "loss": 2.6137, + "theoretical_loss": 3.5091383974738344, + "tokens_seen": 1534077952 + }, + { + "epoch": 5.02, + "learning_rate": 0.000270320962888666, + "loss": 2.7768, + "theoretical_loss": 3.509125237164943, + "tokens_seen": 1534143488 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027031093279839517, + "loss": 2.5983, + "theoretical_loss": 3.5091120775756304, + "tokens_seen": 1534209024 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027030090270812435, + "loss": 2.6226, + "theoretical_loss": 3.5090989187058255, + "tokens_seen": 1534274560 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002702908726178536, + "loss": 2.7032, + "theoretical_loss": 3.5090857605554593, + "tokens_seen": 1534340096 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002702808425275827, + "loss": 2.5462, + "theoretical_loss": 3.5090726031244612, + "tokens_seen": 1534405632 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027027081243731195, + "loss": 2.5581, + "theoretical_loss": 3.509059446412761, + "tokens_seen": 1534471168 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027026078234704113, + "loss": 2.6566, + "theoretical_loss": 3.5090462904202893, + "tokens_seen": 1534536704 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002702507522567703, + "loss": 2.4762, + "theoretical_loss": 3.5090331351469755, + "tokens_seen": 1534602240 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027024072216649955, + "loss": 2.6938, + "theoretical_loss": 3.50901998059275, + "tokens_seen": 1534667776 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002702306920762287, + "loss": 2.3536, + "theoretical_loss": 3.5090068267575423, + "tokens_seen": 1534733312 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002702206619859579, + "loss": 2.7465, + "theoretical_loss": 3.5089936736412826, + "tokens_seen": 1534798848 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002702106318956871, + "loss": 2.3321, + "theoretical_loss": 3.508980521243901, + "tokens_seen": 1534864384 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027020060180541627, + "loss": 2.6597, + "theoretical_loss": 3.508967369565328, + "tokens_seen": 1534929920 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027019057171514545, + "loss": 2.4596, + "theoretical_loss": 3.5089542186054925, + "tokens_seen": 1534995456 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027018054162487463, + "loss": 2.597, + "theoretical_loss": 3.508941068364325, + "tokens_seen": 1535060992 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002701705115346038, + "loss": 2.7156, + "theoretical_loss": 3.5089279188417564, + "tokens_seen": 1535126528 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1751481, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.573998212814331, + "objective/train/theoretical_loss": 3.5089180571713676, + "objective/train/tokens_used": 1555635680, + "theoretical_loss": 3.5089180571713676, + "tokens_seen": 1535175680 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027016048144433305, + "loss": 2.7034, + "theoretical_loss": 3.5089147700377152, + "tokens_seen": 1535192064 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002701504513540622, + "loss": 2.6163, + "theoretical_loss": 3.508901621952133, + "tokens_seen": 1535257600 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002701404212637914, + "loss": 2.6168, + "theoretical_loss": 3.508888474584939, + "tokens_seen": 1535323136 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027013039117352054, + "loss": 2.6139, + "theoretical_loss": 3.508875327936063, + "tokens_seen": 1535388672 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002701203610832498, + "loss": 2.7076, + "theoretical_loss": 3.508862182005436, + "tokens_seen": 1535454208 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027011033099297896, + "loss": 2.5746, + "theoretical_loss": 3.508849036792988, + "tokens_seen": 1535519744 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027010030090270814, + "loss": 2.8796, + "theoretical_loss": 3.5088358922986482, + "tokens_seen": 1535585280 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002700902708124373, + "loss": 2.5963, + "theoretical_loss": 3.5088227485223475, + "tokens_seen": 1535650816 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002700802407221665, + "loss": 2.4503, + "theoretical_loss": 3.508809605464016, + "tokens_seen": 1535716352 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002700702106318957, + "loss": 2.7169, + "theoretical_loss": 3.5087964631235833, + "tokens_seen": 1535781888 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002700601805416249, + "loss": 2.8176, + "theoretical_loss": 3.5087833215009807, + "tokens_seen": 1535847424 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027005015045135404, + "loss": 2.5217, + "theoretical_loss": 3.508770180596137, + "tokens_seen": 1535912960 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002700401203610833, + "loss": 2.5807, + "theoretical_loss": 3.508757040408983, + "tokens_seen": 1535978496 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027003009027081246, + "loss": 2.8575, + "theoretical_loss": 3.508743900939449, + "tokens_seen": 1536044032 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027002006018054164, + "loss": 2.7623, + "theoretical_loss": 3.508730762187465, + "tokens_seen": 1536109568 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002700100300902708, + "loss": 2.7278, + "theoretical_loss": 3.508717624152961, + "tokens_seen": 1536175104 + }, + { + "epoch": 5.02, + "learning_rate": 0.00027, + "loss": 2.656, + "theoretical_loss": 3.5087044868358683, + "tokens_seen": 1536240640 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002699899699097292, + "loss": 2.6187, + "theoretical_loss": 3.5086913502361154, + "tokens_seen": 1536306176 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002699799398194584, + "loss": 2.4959, + "theoretical_loss": 3.508678214353634, + "tokens_seen": 1536371712 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026996990972918755, + "loss": 2.5832, + "theoretical_loss": 3.508665079188354, + "tokens_seen": 1536437248 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002699598796389168, + "loss": 2.5339, + "theoretical_loss": 3.5086519447402047, + "tokens_seen": 1536502784 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002699498495486459, + "loss": 2.4782, + "theoretical_loss": 3.508638811009118, + "tokens_seen": 1536568320 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026993981945837514, + "loss": 2.7155, + "theoretical_loss": 3.5086256779950222, + "tokens_seen": 1536633856 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002699297893681043, + "loss": 2.7312, + "theoretical_loss": 3.5086125456978494, + "tokens_seen": 1536699392 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002699197592778335, + "loss": 2.7479, + "theoretical_loss": 3.508599414117529, + "tokens_seen": 1536764928 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1752901, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.562196969985962, + "objective/train/theoretical_loss": 3.508589565902681, + "objective/train/tokens_used": 1557274080, + "theoretical_loss": 3.508589565902681, + "tokens_seen": 1536814080 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002699097291875627, + "loss": 2.6855, + "theoretical_loss": 3.508586283253991, + "tokens_seen": 1536830464 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026989969909729187, + "loss": 2.5581, + "theoretical_loss": 3.508573153107167, + "tokens_seen": 1536896000 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026988966900702105, + "loss": 2.6018, + "theoretical_loss": 3.508560023676986, + "tokens_seen": 1536961536 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002698796389167503, + "loss": 2.5101, + "theoretical_loss": 3.508546894963379, + "tokens_seen": 1537027072 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002698696088264794, + "loss": 2.5575, + "theoretical_loss": 3.5085337669662753, + "tokens_seen": 1537092608 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026985957873620865, + "loss": 2.6392, + "theoretical_loss": 3.5085206396856075, + "tokens_seen": 1537158144 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026984954864593783, + "loss": 2.6987, + "theoretical_loss": 3.5085075131213035, + "tokens_seen": 1537223680 + }, + { + "epoch": 5.02, + "learning_rate": 0.000269839518555667, + "loss": 2.6348, + "theoretical_loss": 3.508494387273295, + "tokens_seen": 1537289216 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002698294884653962, + "loss": 2.6593, + "theoretical_loss": 3.508481262141512, + "tokens_seen": 1537354752 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026981945837512537, + "loss": 2.7099, + "theoretical_loss": 3.508468137725885, + "tokens_seen": 1537420288 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026980942828485455, + "loss": 2.5047, + "theoretical_loss": 3.508455014026345, + "tokens_seen": 1537485824 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002697993981945838, + "loss": 2.6675, + "theoretical_loss": 3.5084418910428212, + "tokens_seen": 1537551360 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002697893681043129, + "loss": 2.6193, + "theoretical_loss": 3.5084287687752447, + "tokens_seen": 1537616896 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026977933801404215, + "loss": 2.7267, + "theoretical_loss": 3.5084156472235457, + "tokens_seen": 1537682432 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002697693079237713, + "loss": 2.6067, + "theoretical_loss": 3.508402526387655, + "tokens_seen": 1537747968 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002697592778335005, + "loss": 2.54, + "theoretical_loss": 3.5083894062675025, + "tokens_seen": 1537813504 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002697492477432297, + "loss": 2.7133, + "theoretical_loss": 3.5083762868630197, + "tokens_seen": 1537879040 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002697392176529589, + "loss": 2.3966, + "theoretical_loss": 3.508363168174136, + "tokens_seen": 1537944576 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026972918756268806, + "loss": 2.5644, + "theoretical_loss": 3.508350050200782, + "tokens_seen": 1538010112 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002697191574724173, + "loss": 2.6717, + "theoretical_loss": 3.508336932942888, + "tokens_seen": 1538075648 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002697091273821464, + "loss": 2.7196, + "theoretical_loss": 3.5083238164003863, + "tokens_seen": 1538141184 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026969909729187565, + "loss": 2.482, + "theoretical_loss": 3.5083107005732046, + "tokens_seen": 1538206720 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002696890672016048, + "loss": 2.6231, + "theoretical_loss": 3.5082975854612757, + "tokens_seen": 1538272256 + }, + { + "epoch": 5.02, + "learning_rate": 0.000269679037111334, + "loss": 2.6503, + "theoretical_loss": 3.508284471064529, + "tokens_seen": 1538337792 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002696690070210632, + "loss": 2.5515, + "theoretical_loss": 3.5082713573828954, + "tokens_seen": 1538403328 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1753471, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3333654403686523, + "objective/train/theoretical_loss": 3.508261522590921, + "objective/train/tokens_used": 1558912480, + "theoretical_loss": 3.508261522590921, + "tokens_seen": 1538452480 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002696589769307924, + "loss": 2.6811, + "theoretical_loss": 3.508258244416305, + "tokens_seen": 1538468864 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026964894684052156, + "loss": 2.8215, + "theoretical_loss": 3.508245132164689, + "tokens_seen": 1538534400 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026963891675025074, + "loss": 2.5912, + "theoretical_loss": 3.5082320206279776, + "tokens_seen": 1538599936 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002696288866599799, + "loss": 2.5781, + "theoretical_loss": 3.508218909806101, + "tokens_seen": 1538665472 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026961885656970916, + "loss": 2.7099, + "theoretical_loss": 3.508205799698991, + "tokens_seen": 1538731008 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002696088264794383, + "loss": 2.8744, + "theoretical_loss": 3.508192690306577, + "tokens_seen": 1538796544 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002695987963891675, + "loss": 2.599, + "theoretical_loss": 3.5081795816287897, + "tokens_seen": 1538862080 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026958876629889665, + "loss": 2.6964, + "theoretical_loss": 3.5081664736655602, + "tokens_seen": 1538927616 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002695787362086259, + "loss": 2.7133, + "theoretical_loss": 3.5081533664168196, + "tokens_seen": 1538993152 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026956870611835506, + "loss": 2.7044, + "theoretical_loss": 3.508140259882497, + "tokens_seen": 1539058688 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026955867602808424, + "loss": 2.4334, + "theoretical_loss": 3.5081271540625245, + "tokens_seen": 1539124224 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002695486459378134, + "loss": 2.7716, + "theoretical_loss": 3.5081140489568314, + "tokens_seen": 1539189760 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026953861584754266, + "loss": 2.6196, + "theoretical_loss": 3.50810094456535, + "tokens_seen": 1539255296 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002695285857572718, + "loss": 2.683, + "theoretical_loss": 3.5080878408880096, + "tokens_seen": 1539320832 + }, + { + "epoch": 5.02, + "learning_rate": 0.000269518555667001, + "loss": 2.5391, + "theoretical_loss": 3.5080747379247414, + "tokens_seen": 1539386368 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002695085255767302, + "loss": 2.3732, + "theoretical_loss": 3.5080616356754764, + "tokens_seen": 1539451904 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002694984954864594, + "loss": 2.4932, + "theoretical_loss": 3.5080485341401446, + "tokens_seen": 1539517440 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002694884653961886, + "loss": 2.5774, + "theoretical_loss": 3.5080354333186774, + "tokens_seen": 1539582976 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026947843530591775, + "loss": 2.5516, + "theoretical_loss": 3.508022333211005, + "tokens_seen": 1539648512 + }, + { + "epoch": 5.02, + "learning_rate": 0.000269468405215647, + "loss": 2.6029, + "theoretical_loss": 3.5080092338170585, + "tokens_seen": 1539714048 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002694583751253761, + "loss": 2.5132, + "theoretical_loss": 3.507996135136768, + "tokens_seen": 1539779584 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026944834503510534, + "loss": 2.6247, + "theoretical_loss": 3.507983037170065, + "tokens_seen": 1539845120 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002694383149448345, + "loss": 2.4717, + "theoretical_loss": 3.50796993991688, + "tokens_seen": 1539910656 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002694282848545637, + "loss": 2.6097, + "theoretical_loss": 3.5079568433771438, + "tokens_seen": 1539976192 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002694182547642929, + "loss": 2.463, + "theoretical_loss": 3.5079437475507866, + "tokens_seen": 1540041728 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1754850, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4801652431488037, + "objective/train/theoretical_loss": 3.507933926149133, + "objective/train/tokens_used": 1560550880, + "theoretical_loss": 3.507933926149133, + "tokens_seen": 1540090880 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026940822467402207, + "loss": 2.5182, + "theoretical_loss": 3.5079306524377403, + "tokens_seen": 1540107264 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026939819458375125, + "loss": 2.5174, + "theoretical_loss": 3.507917558037935, + "tokens_seen": 1540172800 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002693881644934805, + "loss": 2.5887, + "theoretical_loss": 3.5079044643513013, + "tokens_seen": 1540238336 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002693781344032096, + "loss": 2.5088, + "theoretical_loss": 3.507891371377771, + "tokens_seen": 1540303872 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026936810431293885, + "loss": 2.6441, + "theoretical_loss": 3.507878279117273, + "tokens_seen": 1540369408 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026935807422266803, + "loss": 2.6603, + "theoretical_loss": 3.507865187569741, + "tokens_seen": 1540434944 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002693480441323972, + "loss": 2.4486, + "theoretical_loss": 3.507852096735103, + "tokens_seen": 1540500480 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002693380140421264, + "loss": 2.4665, + "theoretical_loss": 3.507839006613291, + "tokens_seen": 1540566016 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026932798395185557, + "loss": 2.7573, + "theoretical_loss": 3.5078259172042365, + "tokens_seen": 1540631552 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026931795386158475, + "loss": 2.4961, + "theoretical_loss": 3.50781282850787, + "tokens_seen": 1540697088 + }, + { + "epoch": 5.02, + "learning_rate": 0.000269307923771314, + "loss": 2.6341, + "theoretical_loss": 3.5077997405241215, + "tokens_seen": 1540762624 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002692978936810431, + "loss": 2.4809, + "theoretical_loss": 3.507786653252923, + "tokens_seen": 1540828160 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026928786359077235, + "loss": 2.6053, + "theoretical_loss": 3.5077735666942047, + "tokens_seen": 1540893696 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002692778335005015, + "loss": 2.6354, + "theoretical_loss": 3.507760480847898, + "tokens_seen": 1540959232 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002692678034102307, + "loss": 2.5585, + "theoretical_loss": 3.507747395713934, + "tokens_seen": 1541024768 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002692577733199599, + "loss": 2.4733, + "theoretical_loss": 3.507734311292243, + "tokens_seen": 1541090304 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002692477432296891, + "loss": 2.6158, + "theoretical_loss": 3.507721227582756, + "tokens_seen": 1541155840 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026923771313941826, + "loss": 2.6618, + "theoretical_loss": 3.507708144585404, + "tokens_seen": 1541221376 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002692276830491475, + "loss": 2.8516, + "theoretical_loss": 3.507695062300118, + "tokens_seen": 1541286912 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002692176529588766, + "loss": 2.5671, + "theoretical_loss": 3.5076819807268294, + "tokens_seen": 1541352448 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026920762286860585, + "loss": 2.5019, + "theoretical_loss": 3.507668899865469, + "tokens_seen": 1541417984 + }, + { + "epoch": 5.02, + "learning_rate": 0.000269197592778335, + "loss": 2.6017, + "theoretical_loss": 3.5076558197159673, + "tokens_seen": 1541483520 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002691875626880642, + "loss": 2.7025, + "theoretical_loss": 3.507642740278256, + "tokens_seen": 1541549056 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002691775325977934, + "loss": 2.6308, + "theoretical_loss": 3.5076296615522655, + "tokens_seen": 1541614592 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002691675025075226, + "loss": 2.4931, + "theoretical_loss": 3.507616583537927, + "tokens_seen": 1541680128 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1755645, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.596935987472534, + "objective/train/theoretical_loss": 3.5076067754941533, + "objective/train/tokens_used": 1562189280, + "theoretical_loss": 3.5076067754941533, + "tokens_seen": 1541729280 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026915747241725176, + "loss": 2.7508, + "theoretical_loss": 3.5076035062351716, + "tokens_seen": 1541745664 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026914744232698094, + "loss": 2.6642, + "theoretical_loss": 3.5075904296439306, + "tokens_seen": 1541811200 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002691374122367101, + "loss": 2.749, + "theoretical_loss": 3.507577353764135, + "tokens_seen": 1541876736 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026912738214643936, + "loss": 2.6091, + "theoretical_loss": 3.507564278595715, + "tokens_seen": 1541942272 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002691173520561685, + "loss": 2.549, + "theoretical_loss": 3.5075512041386023, + "tokens_seen": 1542007808 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002691073219658977, + "loss": 2.5448, + "theoretical_loss": 3.507538130392728, + "tokens_seen": 1542073344 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026909729187562685, + "loss": 2.4752, + "theoretical_loss": 3.5075250573580234, + "tokens_seen": 1542138880 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002690872617853561, + "loss": 2.6303, + "theoretical_loss": 3.5075119850344194, + "tokens_seen": 1542204416 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026907723169508526, + "loss": 2.5021, + "theoretical_loss": 3.507498913421847, + "tokens_seen": 1542269952 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026906720160481444, + "loss": 2.6581, + "theoretical_loss": 3.5074858425202375, + "tokens_seen": 1542335488 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002690571715145436, + "loss": 2.6053, + "theoretical_loss": 3.5074727723295216, + "tokens_seen": 1542401024 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026904714142427286, + "loss": 2.5677, + "theoretical_loss": 3.507459702849631, + "tokens_seen": 1542466560 + }, + { + "epoch": 5.02, + "learning_rate": 0.000269037111334002, + "loss": 2.7589, + "theoretical_loss": 3.5074466340804964, + "tokens_seen": 1542532096 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002690270812437312, + "loss": 2.6401, + "theoretical_loss": 3.507433566022049, + "tokens_seen": 1542597632 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026901705115346035, + "loss": 2.4273, + "theoretical_loss": 3.5074204986742203, + "tokens_seen": 1542663168 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002690070210631896, + "loss": 2.7618, + "theoretical_loss": 3.507407432036941, + "tokens_seen": 1542728704 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026899699097291877, + "loss": 2.7275, + "theoretical_loss": 3.5073943661101428, + "tokens_seen": 1542794240 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026898696088264795, + "loss": 2.6308, + "theoretical_loss": 3.5073813008937567, + "tokens_seen": 1542859776 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026897693079237713, + "loss": 2.3812, + "theoretical_loss": 3.5073682363877134, + "tokens_seen": 1542925312 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002689669007021063, + "loss": 2.7549, + "theoretical_loss": 3.507355172591945, + "tokens_seen": 1542990848 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002689568706118355, + "loss": 2.3385, + "theoretical_loss": 3.507342109506382, + "tokens_seen": 1543056384 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002689468405215647, + "loss": 2.6484, + "theoretical_loss": 3.5073290471309564, + "tokens_seen": 1543121920 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026893681043129385, + "loss": 2.4296, + "theoretical_loss": 3.507315985465598, + "tokens_seen": 1543187456 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002689267803410231, + "loss": 2.732, + "theoretical_loss": 3.5073029245102396, + "tokens_seen": 1543252992 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002689167502507522, + "loss": 2.429, + "theoretical_loss": 3.507289864264812, + "tokens_seen": 1543318528 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1757071, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.122141122817993, + "objective/train/theoretical_loss": 3.5072800695465913, + "objective/train/tokens_used": 1563827680, + "theoretical_loss": 3.5072800695465913, + "tokens_seen": 1543367680 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026890672016048145, + "loss": 2.5328, + "theoretical_loss": 3.507276804729246, + "tokens_seen": 1543384064 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026889669007021063, + "loss": 2.6986, + "theoretical_loss": 3.5072637459034732, + "tokens_seen": 1543449600 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002688866599799398, + "loss": 2.3406, + "theoretical_loss": 3.5072506877874243, + "tokens_seen": 1543515136 + }, + { + "epoch": 5.02, + "learning_rate": 0.000268876629889669, + "loss": 2.4533, + "theoretical_loss": 3.5072376303810318, + "tokens_seen": 1543580672 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026886659979939823, + "loss": 2.6335, + "theoretical_loss": 3.5072245736842262, + "tokens_seen": 1543646208 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026885656970912736, + "loss": 2.7103, + "theoretical_loss": 3.5072115176969394, + "tokens_seen": 1543711744 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002688465396188566, + "loss": 2.771, + "theoretical_loss": 3.507198462419102, + "tokens_seen": 1543777280 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002688365095285857, + "loss": 2.6098, + "theoretical_loss": 3.5071854078506455, + "tokens_seen": 1543842816 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026882647943831495, + "loss": 2.6712, + "theoretical_loss": 3.507172353991501, + "tokens_seen": 1543908352 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026881644934804413, + "loss": 2.5959, + "theoretical_loss": 3.507159300841601, + "tokens_seen": 1543973888 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002688064192577733, + "loss": 2.8207, + "theoretical_loss": 3.5071462484008755, + "tokens_seen": 1544039424 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002687963891675025, + "loss": 2.3538, + "theoretical_loss": 3.507133196669257, + "tokens_seen": 1544104960 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002687863590772317, + "loss": 2.6198, + "theoretical_loss": 3.507120145646676, + "tokens_seen": 1544170496 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026877632898696086, + "loss": 2.6675, + "theoretical_loss": 3.5071070953330645, + "tokens_seen": 1544236032 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002687662988966901, + "loss": 2.5703, + "theoretical_loss": 3.507094045728353, + "tokens_seen": 1544301568 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002687562688064193, + "loss": 2.8102, + "theoretical_loss": 3.5070809968324745, + "tokens_seen": 1544367104 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026874623871614846, + "loss": 2.4871, + "theoretical_loss": 3.507067948645359, + "tokens_seen": 1544432640 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002687362086258777, + "loss": 2.674, + "theoretical_loss": 3.5070549011669385, + "tokens_seen": 1544498176 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002687261785356068, + "loss": 2.6122, + "theoretical_loss": 3.507041854397144, + "tokens_seen": 1544563712 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026871614844533605, + "loss": 2.4205, + "theoretical_loss": 3.5070288083359076, + "tokens_seen": 1544629248 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002687061183550652, + "loss": 2.5357, + "theoretical_loss": 3.50701576298316, + "tokens_seen": 1544694784 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002686960882647944, + "loss": 2.6311, + "theoretical_loss": 3.507002718338834, + "tokens_seen": 1544760320 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002686860581745236, + "loss": 2.7446, + "theoretical_loss": 3.5069896744028597, + "tokens_seen": 1544825856 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002686760280842528, + "loss": 2.7055, + "theoretical_loss": 3.506976631175169, + "tokens_seen": 1544891392 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026866599799398196, + "loss": 2.8652, + "theoretical_loss": 3.506963588655694, + "tokens_seen": 1544956928 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1757632, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.365391969680786, + "objective/train/theoretical_loss": 3.506953807230812, + "objective/train/tokens_used": 1565466080, + "theoretical_loss": 3.506953807230812, + "tokens_seen": 1545006080 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026865596790371114, + "loss": 2.5397, + "theoretical_loss": 3.506950546844365, + "tokens_seen": 1545022464 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002686459378134403, + "loss": 2.7858, + "theoretical_loss": 3.5069375057411145, + "tokens_seen": 1545088000 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026863590772316956, + "loss": 2.6465, + "theoretical_loss": 3.5069244653458735, + "tokens_seen": 1545153536 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002686258776328987, + "loss": 2.4181, + "theoretical_loss": 3.506911425658574, + "tokens_seen": 1545219072 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002686158475426279, + "loss": 2.6752, + "theoretical_loss": 3.5068983866791474, + "tokens_seen": 1545284608 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026860581745235705, + "loss": 2.8851, + "theoretical_loss": 3.506885348407525, + "tokens_seen": 1545350144 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002685957873620863, + "loss": 2.6663, + "theoretical_loss": 3.5068723108436384, + "tokens_seen": 1545415680 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026858575727181546, + "loss": 2.6977, + "theoretical_loss": 3.50685927398742, + "tokens_seen": 1545481216 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026857572718154464, + "loss": 2.7299, + "theoretical_loss": 3.5068462378387997, + "tokens_seen": 1545546752 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002685656970912738, + "loss": 2.7172, + "theoretical_loss": 3.5068332023977105, + "tokens_seen": 1545612288 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026855566700100306, + "loss": 2.5429, + "theoretical_loss": 3.5068201676640838, + "tokens_seen": 1545677824 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002685456369107322, + "loss": 2.7161, + "theoretical_loss": 3.506807133637851, + "tokens_seen": 1545743360 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002685356068204614, + "loss": 2.6667, + "theoretical_loss": 3.506794100318943, + "tokens_seen": 1545808896 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026852557673019055, + "loss": 2.8401, + "theoretical_loss": 3.506781067707293, + "tokens_seen": 1545874432 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002685155466399198, + "loss": 2.5245, + "theoretical_loss": 3.506768035802831, + "tokens_seen": 1545939968 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026850551654964897, + "loss": 2.7198, + "theoretical_loss": 3.5067550046054903, + "tokens_seen": 1546005504 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026849548645937815, + "loss": 2.5475, + "theoretical_loss": 3.506741974115201, + "tokens_seen": 1546071040 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026848545636910733, + "loss": 2.363, + "theoretical_loss": 3.5067289443318956, + "tokens_seen": 1546136576 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002684754262788365, + "loss": 2.755, + "theoretical_loss": 3.506715915255506, + "tokens_seen": 1546202112 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002684653961885657, + "loss": 2.4602, + "theoretical_loss": 3.5067028868859627, + "tokens_seen": 1546267648 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002684553660982949, + "loss": 2.4851, + "theoretical_loss": 3.506689859223199, + "tokens_seen": 1546333184 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026844533600802405, + "loss": 2.4482, + "theoretical_loss": 3.5066768322671455, + "tokens_seen": 1546398720 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002684353059177533, + "loss": 2.4997, + "theoretical_loss": 3.506663806017734, + "tokens_seen": 1546464256 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002684252758274824, + "loss": 2.5183, + "theoretical_loss": 3.5066507804748968, + "tokens_seen": 1546529792 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026841524573721165, + "loss": 2.7061, + "theoretical_loss": 3.506637755638565, + "tokens_seen": 1546595328 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1758939, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.418158769607544, + "objective/train/theoretical_loss": 3.5066279874749195, + "objective/train/tokens_used": 1567104480, + "theoretical_loss": 3.5066279874749195, + "tokens_seen": 1546644480 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026840521564694083, + "loss": 2.5213, + "theoretical_loss": 3.506624731508671, + "tokens_seen": 1546660864 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026839518555667, + "loss": 2.4823, + "theoretical_loss": 3.5066117080851456, + "tokens_seen": 1546726400 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002683851554663992, + "loss": 2.4184, + "theoretical_loss": 3.506598685367922, + "tokens_seen": 1546791936 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026837512537612843, + "loss": 2.6234, + "theoretical_loss": 3.5065856633569306, + "tokens_seen": 1546857472 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026836509528585756, + "loss": 2.7849, + "theoretical_loss": 3.5065726420521033, + "tokens_seen": 1546923008 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002683550651955868, + "loss": 2.7529, + "theoretical_loss": 3.506559621453373, + "tokens_seen": 1546988544 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002683450351053159, + "loss": 2.6108, + "theoretical_loss": 3.5065466015606708, + "tokens_seen": 1547054080 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026833500501504515, + "loss": 2.7479, + "theoretical_loss": 3.506533582373928, + "tokens_seen": 1547119616 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026832497492477433, + "loss": 2.6387, + "theoretical_loss": 3.506520563893077, + "tokens_seen": 1547185152 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002683149448345035, + "loss": 2.6088, + "theoretical_loss": 3.5065075461180495, + "tokens_seen": 1547250688 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002683049147442327, + "loss": 2.7851, + "theoretical_loss": 3.506494529048778, + "tokens_seen": 1547316224 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002682948846539619, + "loss": 2.5244, + "theoretical_loss": 3.506481512685193, + "tokens_seen": 1547381760 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026828485456369106, + "loss": 2.6853, + "theoretical_loss": 3.5064684970272277, + "tokens_seen": 1547447296 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002682748244734203, + "loss": 2.7179, + "theoretical_loss": 3.506455482074813, + "tokens_seen": 1547512832 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002682647943831494, + "loss": 2.4258, + "theoretical_loss": 3.506442467827881, + "tokens_seen": 1547578368 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026825476429287866, + "loss": 2.7019, + "theoretical_loss": 3.5064294542863643, + "tokens_seen": 1547643904 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002682447342026078, + "loss": 2.7355, + "theoretical_loss": 3.506416441450194, + "tokens_seen": 1547709440 + }, + { + "epoch": 5.02, + "learning_rate": 0.000268234704112337, + "loss": 2.5778, + "theoretical_loss": 3.506403429319302, + "tokens_seen": 1547774976 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002682246740220662, + "loss": 2.4957, + "theoretical_loss": 3.5063904178936207, + "tokens_seen": 1547840512 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002682146439317954, + "loss": 2.7598, + "theoretical_loss": 3.5063774071730816, + "tokens_seen": 1547906048 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026820461384152456, + "loss": 2.859, + "theoretical_loss": 3.5063643971576166, + "tokens_seen": 1547971584 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002681945837512538, + "loss": 2.73, + "theoretical_loss": 3.5063513878471584, + "tokens_seen": 1548037120 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002681845536609829, + "loss": 2.6316, + "theoretical_loss": 3.506338379241638, + "tokens_seen": 1548102656 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026817452357071216, + "loss": 2.4397, + "theoretical_loss": 3.5063253713409877, + "tokens_seen": 1548168192 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002681644934804413, + "loss": 2.6663, + "theoretical_loss": 3.5063123641451397, + "tokens_seen": 1548233728 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1759677, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.676605463027954, + "objective/train/theoretical_loss": 3.5063026092107394, + "objective/train/tokens_used": 1568742880, + "theoretical_loss": 3.5063026092107394, + "tokens_seen": 1548282880 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002681544633901705, + "loss": 2.6747, + "theoretical_loss": 3.506299357654026, + "tokens_seen": 1548299264 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002681444332998997, + "loss": 2.5841, + "theoretical_loss": 3.506286351867578, + "tokens_seen": 1548364800 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002681344032096289, + "loss": 2.6078, + "theoretical_loss": 3.5062733467857283, + "tokens_seen": 1548430336 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026812437311935807, + "loss": 2.7631, + "theoretical_loss": 3.5062603424084084, + "tokens_seen": 1548495872 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026811434302908725, + "loss": 2.5097, + "theoretical_loss": 3.506247338735551, + "tokens_seen": 1548561408 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026810431293881643, + "loss": 2.6734, + "theoretical_loss": 3.506234335767088, + "tokens_seen": 1548626944 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026809428284854566, + "loss": 2.2408, + "theoretical_loss": 3.5062213335029506, + "tokens_seen": 1548692480 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002680842527582748, + "loss": 2.6834, + "theoretical_loss": 3.5062083319430717, + "tokens_seen": 1548758016 + }, + { + "epoch": 5.02, + "learning_rate": 0.000268074222668004, + "loss": 2.9695, + "theoretical_loss": 3.5061953310873832, + "tokens_seen": 1548823552 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026806419257773315, + "loss": 2.544, + "theoretical_loss": 3.5061823309358164, + "tokens_seen": 1548889088 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002680541624874624, + "loss": 2.6153, + "theoretical_loss": 3.506169331488305, + "tokens_seen": 1548954624 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026804413239719157, + "loss": 2.7174, + "theoretical_loss": 3.506156332744779, + "tokens_seen": 1549020160 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026803410230692075, + "loss": 2.7422, + "theoretical_loss": 3.5061433347051727, + "tokens_seen": 1549085696 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026802407221664993, + "loss": 2.7367, + "theoretical_loss": 3.5061303373694166, + "tokens_seen": 1549151232 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026801404212637917, + "loss": 2.5945, + "theoretical_loss": 3.5061173407374433, + "tokens_seen": 1549216768 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026800401203610835, + "loss": 2.4936, + "theoretical_loss": 3.5061043448091853, + "tokens_seen": 1549282304 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026799398194583753, + "loss": 2.7071, + "theoretical_loss": 3.5060913495845742, + "tokens_seen": 1549347840 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002679839518555667, + "loss": 2.6455, + "theoretical_loss": 3.506078355063542, + "tokens_seen": 1549413376 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002679739217652959, + "loss": 2.599, + "theoretical_loss": 3.506065361246021, + "tokens_seen": 1549478912 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002679638916750251, + "loss": 2.7328, + "theoretical_loss": 3.5060523681319444, + "tokens_seen": 1549544448 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026795386158475425, + "loss": 2.6709, + "theoretical_loss": 3.506039375721243, + "tokens_seen": 1549609984 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002679438314944835, + "loss": 2.8018, + "theoretical_loss": 3.5060263840138495, + "tokens_seen": 1549675520 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002679338014042126, + "loss": 2.733, + "theoretical_loss": 3.506013393009696, + "tokens_seen": 1549741056 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026792377131394185, + "loss": 2.5778, + "theoretical_loss": 3.506000402708715, + "tokens_seen": 1549806592 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026791374122367103, + "loss": 2.4503, + "theoretical_loss": 3.505987413110838, + "tokens_seen": 1549872128 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1761272, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8500523567199707, + "objective/train/theoretical_loss": 3.5059776713738025, + "objective/train/tokens_used": 1570381280, + "theoretical_loss": 3.5059776713738025, + "tokens_seen": 1549921280 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002679037111334002, + "loss": 2.5811, + "theoretical_loss": 3.505974424215998, + "tokens_seen": 1549937664 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002678936810431294, + "loss": 2.7321, + "theoretical_loss": 3.505961436024127, + "tokens_seen": 1550003200 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026788365095285863, + "loss": 2.4762, + "theoretical_loss": 3.505948448535157, + "tokens_seen": 1550068736 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026787362086258776, + "loss": 2.72, + "theoretical_loss": 3.5059354617490204, + "tokens_seen": 1550134272 + }, + { + "epoch": 5.02, + "learning_rate": 0.000267863590772317, + "loss": 2.6949, + "theoretical_loss": 3.5059224756656495, + "tokens_seen": 1550199808 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002678535606820461, + "loss": 2.7632, + "theoretical_loss": 3.505909490284976, + "tokens_seen": 1550265344 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026784353059177535, + "loss": 2.6267, + "theoretical_loss": 3.5058965056069336, + "tokens_seen": 1550330880 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026783350050150453, + "loss": 2.6218, + "theoretical_loss": 3.5058835216314534, + "tokens_seen": 1550396416 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002678234704112337, + "loss": 2.7226, + "theoretical_loss": 3.5058705383584674, + "tokens_seen": 1550461952 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002678134403209629, + "loss": 2.5492, + "theoretical_loss": 3.505857555787909, + "tokens_seen": 1550527488 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002678034102306921, + "loss": 2.5225, + "theoretical_loss": 3.5058445739197097, + "tokens_seen": 1550593024 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026779338014042126, + "loss": 2.6679, + "theoretical_loss": 3.505831592753802, + "tokens_seen": 1550658560 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026779338014042126, + "loss": 2.7302, + "theoretical_loss": 3.5058186122901187, + "tokens_seen": 1550724096 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002677833500501505, + "loss": 2.5314, + "theoretical_loss": 3.5058056325285913, + "tokens_seen": 1550789632 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002677733199598796, + "loss": 2.5831, + "theoretical_loss": 3.5057926534691526, + "tokens_seen": 1550855168 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026776328986960886, + "loss": 2.685, + "theoretical_loss": 3.505779675111735, + "tokens_seen": 1550920704 + }, + { + "epoch": 5.02, + "learning_rate": 0.000267753259779338, + "loss": 2.6247, + "theoretical_loss": 3.505766697456271, + "tokens_seen": 1550986240 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002677432296890672, + "loss": 2.6779, + "theoretical_loss": 3.505753720502693, + "tokens_seen": 1551051776 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002677331995987964, + "loss": 2.5999, + "theoretical_loss": 3.5057407442509327, + "tokens_seen": 1551117312 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002677231695085256, + "loss": 2.4158, + "theoretical_loss": 3.5057277687009236, + "tokens_seen": 1551182848 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026771313941825476, + "loss": 2.4457, + "theoretical_loss": 3.5057147938525968, + "tokens_seen": 1551248384 + }, + { + "epoch": 5.02, + "learning_rate": 0.000267703109327984, + "loss": 2.3523, + "theoretical_loss": 3.5057018197058856, + "tokens_seen": 1551313920 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002676930792377131, + "loss": 2.6527, + "theoretical_loss": 3.505688846260722, + "tokens_seen": 1551379456 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026768304914744236, + "loss": 2.7381, + "theoretical_loss": 3.5056758735170392, + "tokens_seen": 1551444992 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002676730190571715, + "loss": 2.6779, + "theoretical_loss": 3.5056629014747687, + "tokens_seen": 1551510528 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1761875, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.585963010787964, + "objective/train/theoretical_loss": 3.5056531729033273, + "objective/train/tokens_used": 1572019680, + "theoretical_loss": 3.5056531729033273, + "tokens_seen": 1551559680 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002676629889669007, + "loss": 2.6818, + "theoretical_loss": 3.5056499301338433, + "tokens_seen": 1551576064 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002676529588766299, + "loss": 2.6806, + "theoretical_loss": 3.5056369594941956, + "tokens_seen": 1551641600 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002676429287863591, + "loss": 2.4555, + "theoretical_loss": 3.505623989555758, + "tokens_seen": 1551707136 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026763289869608827, + "loss": 2.4361, + "theoretical_loss": 3.505611020318463, + "tokens_seen": 1551772672 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026762286860581745, + "loss": 2.6085, + "theoretical_loss": 3.505598051782243, + "tokens_seen": 1551838208 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026761283851554663, + "loss": 2.5242, + "theoretical_loss": 3.5055850839470306, + "tokens_seen": 1551903744 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026760280842527586, + "loss": 2.6377, + "theoretical_loss": 3.5055721168127576, + "tokens_seen": 1551969280 + }, + { + "epoch": 5.02, + "learning_rate": 0.000267592778335005, + "loss": 2.6599, + "theoretical_loss": 3.5055591503793577, + "tokens_seen": 1552034816 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002675827482447342, + "loss": 2.6179, + "theoretical_loss": 3.505546184646763, + "tokens_seen": 1552100352 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026757271815446335, + "loss": 2.6306, + "theoretical_loss": 3.5055332196149056, + "tokens_seen": 1552165888 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002675626880641926, + "loss": 2.6899, + "theoretical_loss": 3.505520255283719, + "tokens_seen": 1552231424 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026755265797392177, + "loss": 2.6717, + "theoretical_loss": 3.5055072916531342, + "tokens_seen": 1552296960 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026754262788365095, + "loss": 2.8465, + "theoretical_loss": 3.505494328723085, + "tokens_seen": 1552362496 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026753259779338013, + "loss": 2.7475, + "theoretical_loss": 3.5054813664935036, + "tokens_seen": 1552428032 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026752256770310937, + "loss": 2.6695, + "theoretical_loss": 3.505468404964323, + "tokens_seen": 1552493568 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002675125376128385, + "loss": 2.5977, + "theoretical_loss": 3.5054554441354755, + "tokens_seen": 1552559104 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026750250752256773, + "loss": 2.4163, + "theoretical_loss": 3.5054424840068927, + "tokens_seen": 1552624640 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026749247743229686, + "loss": 2.7349, + "theoretical_loss": 3.505429524578509, + "tokens_seen": 1552690176 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002674824473420261, + "loss": 2.5672, + "theoretical_loss": 3.505416565850256, + "tokens_seen": 1552755712 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026747241725175527, + "loss": 2.6153, + "theoretical_loss": 3.505403607822066, + "tokens_seen": 1552821248 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026746238716148445, + "loss": 2.814, + "theoretical_loss": 3.5053906504938723, + "tokens_seen": 1552886784 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026745235707121363, + "loss": 2.6414, + "theoretical_loss": 3.505377693865608, + "tokens_seen": 1552952320 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002674423269809428, + "loss": 2.7226, + "theoretical_loss": 3.505364737937204, + "tokens_seen": 1553017856 + }, + { + "epoch": 5.02, + "learning_rate": 0.000267432296890672, + "loss": 2.7976, + "theoretical_loss": 3.505351782708595, + "tokens_seen": 1553083392 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026742226680040123, + "loss": 2.9329, + "theoretical_loss": 3.5053388281797124, + "tokens_seen": 1553148928 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1763200, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.717268466949463, + "objective/train/theoretical_loss": 3.5053291127422055, + "objective/train/tokens_used": 1573658080, + "theoretical_loss": 3.5053291127422055, + "tokens_seen": 1553198080 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026741223671013036, + "loss": 2.7241, + "theoretical_loss": 3.505325874350489, + "tokens_seen": 1553214464 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002674022066198596, + "loss": 2.6349, + "theoretical_loss": 3.505312921220858, + "tokens_seen": 1553280000 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002673921765295888, + "loss": 2.6627, + "theoretical_loss": 3.5052999687907516, + "tokens_seen": 1553345536 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026738214643931796, + "loss": 2.6706, + "theoretical_loss": 3.505287017060103, + "tokens_seen": 1553411072 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026737211634904714, + "loss": 2.7722, + "theoretical_loss": 3.5052740660288446, + "tokens_seen": 1553476608 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002673620862587763, + "loss": 2.7066, + "theoretical_loss": 3.5052611156969093, + "tokens_seen": 1553542144 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002673520561685055, + "loss": 2.5324, + "theoretical_loss": 3.50524816606423, + "tokens_seen": 1553607680 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026734202607823474, + "loss": 2.6319, + "theoretical_loss": 3.505235217130738, + "tokens_seen": 1553673216 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026733199598796386, + "loss": 2.7366, + "theoretical_loss": 3.5052222688963686, + "tokens_seen": 1553738752 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002673219658976931, + "loss": 2.8268, + "theoretical_loss": 3.5052093213610522, + "tokens_seen": 1553804288 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002673119358074222, + "loss": 2.4775, + "theoretical_loss": 3.5051963745247234, + "tokens_seen": 1553869824 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026730190571715146, + "loss": 2.6987, + "theoretical_loss": 3.505183428387314, + "tokens_seen": 1553935360 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026729187562688064, + "loss": 2.7447, + "theoretical_loss": 3.5051704829487567, + "tokens_seen": 1554000896 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002672818455366098, + "loss": 2.7616, + "theoretical_loss": 3.5051575382089846, + "tokens_seen": 1554066432 + }, + { + "epoch": 5.02, + "learning_rate": 0.000267271815446339, + "loss": 2.8065, + "theoretical_loss": 3.5051445941679304, + "tokens_seen": 1554131968 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002672617853560682, + "loss": 2.7473, + "theoretical_loss": 3.5051316508255272, + "tokens_seen": 1554197504 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002672517552657974, + "loss": 2.6068, + "theoretical_loss": 3.5051187081817075, + "tokens_seen": 1554263040 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002672417251755266, + "loss": 2.6967, + "theoretical_loss": 3.5051057662364045, + "tokens_seen": 1554328576 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002672316950852558, + "loss": 2.6442, + "theoretical_loss": 3.5050928249895508, + "tokens_seen": 1554394112 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026722166499498496, + "loss": 2.6032, + "theoretical_loss": 3.5050798844410793, + "tokens_seen": 1554459648 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002672116349047142, + "loss": 2.603, + "theoretical_loss": 3.5050669445909226, + "tokens_seen": 1554525184 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002672016048144433, + "loss": 2.5763, + "theoretical_loss": 3.505054005439014, + "tokens_seen": 1554590720 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026719157472417256, + "loss": 2.8347, + "theoretical_loss": 3.505041066985286, + "tokens_seen": 1554656256 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002671815446339017, + "loss": 2.6721, + "theoretical_loss": 3.5050281292296726, + "tokens_seen": 1554721792 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002671715145436309, + "loss": 2.6613, + "theoretical_loss": 3.505015192172105, + "tokens_seen": 1554787328 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1763875, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4189612865448, + "objective/train/theoretical_loss": 3.5050054898369822, + "objective/train/tokens_used": 1575296480, + "theoretical_loss": 3.5050054898369822, + "tokens_seen": 1554836480 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002671614844533601, + "loss": 2.6215, + "theoretical_loss": 3.505002255812517, + "tokens_seen": 1554852864 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002671514543630893, + "loss": 2.4481, + "theoretical_loss": 3.504989320150842, + "tokens_seen": 1554918400 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026714142427281847, + "loss": 2.583, + "theoretical_loss": 3.504976385187012, + "tokens_seen": 1554983936 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026713139418254765, + "loss": 2.4658, + "theoretical_loss": 3.5049634509209606, + "tokens_seen": 1555049472 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026712136409227683, + "loss": 2.6812, + "theoretical_loss": 3.5049505173526203, + "tokens_seen": 1555115008 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026711133400200606, + "loss": 2.7396, + "theoretical_loss": 3.504937584481924, + "tokens_seen": 1555180544 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002671013039117352, + "loss": 2.8398, + "theoretical_loss": 3.5049246523088056, + "tokens_seen": 1555246080 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002670912738214644, + "loss": 2.6925, + "theoretical_loss": 3.5049117208331975, + "tokens_seen": 1555311616 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026708124373119355, + "loss": 2.7556, + "theoretical_loss": 3.504898790055032, + "tokens_seen": 1555377152 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002670712136409228, + "loss": 2.7088, + "theoretical_loss": 3.504885859974243, + "tokens_seen": 1555442688 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026706118355065197, + "loss": 2.6861, + "theoretical_loss": 3.5048729305907633, + "tokens_seen": 1555508224 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026705115346038115, + "loss": 2.5036, + "theoretical_loss": 3.5048600019045257, + "tokens_seen": 1555573760 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026704112337011033, + "loss": 2.6393, + "theoretical_loss": 3.5048470739154634, + "tokens_seen": 1555639296 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026703109327983957, + "loss": 2.4422, + "theoretical_loss": 3.5048341466235096, + "tokens_seen": 1555704832 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002670210631895687, + "loss": 2.7981, + "theoretical_loss": 3.504821220028597, + "tokens_seen": 1555770368 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026701103309929793, + "loss": 2.5328, + "theoretical_loss": 3.504808294130659, + "tokens_seen": 1555835904 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026700100300902706, + "loss": 2.7095, + "theoretical_loss": 3.504795368929628, + "tokens_seen": 1555901440 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002669909729187563, + "loss": 2.6313, + "theoretical_loss": 3.504782444425438, + "tokens_seen": 1555966976 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026698094282848547, + "loss": 2.54, + "theoretical_loss": 3.504769520618021, + "tokens_seen": 1556032512 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026697091273821465, + "loss": 2.7396, + "theoretical_loss": 3.5047565975073116, + "tokens_seen": 1556098048 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026696088264794383, + "loss": 2.5316, + "theoretical_loss": 3.5047436750932417, + "tokens_seen": 1556163584 + }, + { + "epoch": 5.02, + "learning_rate": 0.000266950852557673, + "loss": 2.624, + "theoretical_loss": 3.504730753375745, + "tokens_seen": 1556229120 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002669408224674022, + "loss": 2.7467, + "theoretical_loss": 3.504717832354754, + "tokens_seen": 1556294656 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026693079237713143, + "loss": 2.5787, + "theoretical_loss": 3.504704912030202, + "tokens_seen": 1556360192 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026692076228686056, + "loss": 2.6299, + "theoretical_loss": 3.504691992402023, + "tokens_seen": 1556425728 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1765110, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9499735832214355, + "objective/train/theoretical_loss": 3.5046823031378422, + "objective/train/tokens_used": 1576934880, + "theoretical_loss": 3.5046823031378422, + "tokens_seen": 1556474880 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002669107321965898, + "loss": 2.9113, + "theoretical_loss": 3.5046790734701485, + "tokens_seen": 1556491264 + }, + { + "epoch": 5.02, + "learning_rate": 0.000266900702106319, + "loss": 2.4546, + "theoretical_loss": 3.5046661552345135, + "tokens_seen": 1556556800 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026689067201604816, + "loss": 2.8173, + "theoretical_loss": 3.5046532376950497, + "tokens_seen": 1556622336 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026688064192577734, + "loss": 2.6191, + "theoretical_loss": 3.504640320851691, + "tokens_seen": 1556687872 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002668706118355065, + "loss": 2.6093, + "theoretical_loss": 3.504627404704371, + "tokens_seen": 1556753408 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002668605817452357, + "loss": 2.7305, + "theoretical_loss": 3.504614489253022, + "tokens_seen": 1556818944 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026685055165496494, + "loss": 2.5957, + "theoretical_loss": 3.5046015744975776, + "tokens_seen": 1556884480 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026684052156469406, + "loss": 2.744, + "theoretical_loss": 3.504588660437971, + "tokens_seen": 1556950016 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002668304914744233, + "loss": 2.8159, + "theoretical_loss": 3.504575747074135, + "tokens_seen": 1557015552 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002668204613841524, + "loss": 2.5389, + "theoretical_loss": 3.504562834406004, + "tokens_seen": 1557081088 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026681043129388166, + "loss": 2.4884, + "theoretical_loss": 3.50454992243351, + "tokens_seen": 1557146624 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026680040120361084, + "loss": 2.7264, + "theoretical_loss": 3.5045370111565863, + "tokens_seen": 1557212160 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026679037111334, + "loss": 2.6224, + "theoretical_loss": 3.504524100575167, + "tokens_seen": 1557277696 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002667803410230692, + "loss": 2.5575, + "theoretical_loss": 3.504511190689185, + "tokens_seen": 1557343232 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002667703109327984, + "loss": 2.5781, + "theoretical_loss": 3.5044982814985737, + "tokens_seen": 1557408768 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026676028084252757, + "loss": 2.6302, + "theoretical_loss": 3.5044853730032655, + "tokens_seen": 1557474304 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002667502507522568, + "loss": 2.5893, + "theoretical_loss": 3.504472465203195, + "tokens_seen": 1557539840 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026674022066198593, + "loss": 2.5045, + "theoretical_loss": 3.5044595580982945, + "tokens_seen": 1557605376 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026673019057171516, + "loss": 2.556, + "theoretical_loss": 3.504446651688498, + "tokens_seen": 1557670912 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026672016048144434, + "loss": 2.8052, + "theoretical_loss": 3.5044337459737385, + "tokens_seen": 1557736448 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002667101303911735, + "loss": 2.6902, + "theoretical_loss": 3.5044208409539492, + "tokens_seen": 1557801984 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002667001003009027, + "loss": 2.7238, + "theoretical_loss": 3.5044079366290637, + "tokens_seen": 1557867520 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002666900702106319, + "loss": 2.6548, + "theoretical_loss": 3.5043950329990152, + "tokens_seen": 1557933056 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026668004012036107, + "loss": 2.6523, + "theoretical_loss": 3.504382130063737, + "tokens_seen": 1557998592 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002666700100300903, + "loss": 2.5946, + "theoretical_loss": 3.504369227823163, + "tokens_seen": 1558064128 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1766105, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.555666446685791, + "objective/train/theoretical_loss": 3.5043595515985917, + "objective/train/tokens_used": 1578573280, + "theoretical_loss": 3.5043595515985917, + "tokens_seen": 1558113280 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026665997993981943, + "loss": 2.6693, + "theoretical_loss": 3.504356326277226, + "tokens_seen": 1558129664 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026664994984954867, + "loss": 2.6442, + "theoretical_loss": 3.5043434254258594, + "tokens_seen": 1558195200 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002666399197592778, + "loss": 2.6077, + "theoretical_loss": 3.504330525268997, + "tokens_seen": 1558260736 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026662988966900703, + "loss": 2.8557, + "theoretical_loss": 3.504317625806572, + "tokens_seen": 1558326272 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002666198595787362, + "loss": 2.6912, + "theoretical_loss": 3.5043047270385177, + "tokens_seen": 1558391808 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002666098294884654, + "loss": 2.5286, + "theoretical_loss": 3.5042918289647673, + "tokens_seen": 1558457344 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026659979939819457, + "loss": 2.6179, + "theoretical_loss": 3.5042789315852545, + "tokens_seen": 1558522880 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026658976930792375, + "loss": 2.7925, + "theoretical_loss": 3.504266034899913, + "tokens_seen": 1558588416 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026657973921765293, + "loss": 2.6147, + "theoretical_loss": 3.504253138908676, + "tokens_seen": 1558653952 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026656970912738217, + "loss": 2.64, + "theoretical_loss": 3.504240243611477, + "tokens_seen": 1558719488 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002665596790371113, + "loss": 2.7022, + "theoretical_loss": 3.504227349008249, + "tokens_seen": 1558785024 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026654964894684053, + "loss": 2.422, + "theoretical_loss": 3.5042144550989267, + "tokens_seen": 1558850560 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002665396188565697, + "loss": 2.7155, + "theoretical_loss": 3.5042015618834426, + "tokens_seen": 1558916096 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002665295887662989, + "loss": 2.6079, + "theoretical_loss": 3.50418866936173, + "tokens_seen": 1558981632 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002665195586760281, + "loss": 2.6836, + "theoretical_loss": 3.5041757775337237, + "tokens_seen": 1559047168 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026650952858575726, + "loss": 2.4562, + "theoretical_loss": 3.5041628863993557, + "tokens_seen": 1559112704 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002664994984954865, + "loss": 2.7825, + "theoretical_loss": 3.50414999595856, + "tokens_seen": 1559178240 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026648946840521567, + "loss": 2.8305, + "theoretical_loss": 3.5041371062112705, + "tokens_seen": 1559243776 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026647943831494485, + "loss": 2.5331, + "theoretical_loss": 3.504124217157421, + "tokens_seen": 1559309312 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026646940822467403, + "loss": 2.619, + "theoretical_loss": 3.5041113287969434, + "tokens_seen": 1559374848 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002664593781344032, + "loss": 2.7494, + "theoretical_loss": 3.5040984411297735, + "tokens_seen": 1559440384 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002664493480441324, + "loss": 2.7947, + "theoretical_loss": 3.5040855541558438, + "tokens_seen": 1559505920 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026643931795386163, + "loss": 2.5748, + "theoretical_loss": 3.5040726678750875, + "tokens_seen": 1559571456 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026642928786359076, + "loss": 2.9177, + "theoretical_loss": 3.5040597822874386, + "tokens_seen": 1559636992 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026641925777332, + "loss": 2.75, + "theoretical_loss": 3.504046897392831, + "tokens_seen": 1559702528 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1766788, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9355313777923584, + "objective/train/theoretical_loss": 3.5040372341766433, + "objective/train/tokens_used": 1580211680, + "theoretical_loss": 3.5040372341766433, + "tokens_seen": 1559751680 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002664092276830492, + "loss": 2.9388, + "theoretical_loss": 3.5040340131911973, + "tokens_seen": 1559768064 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026639919759277836, + "loss": 2.8814, + "theoretical_loss": 3.5040211296824726, + "tokens_seen": 1559833600 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026638916750250754, + "loss": 2.5502, + "theoretical_loss": 3.5040082468665896, + "tokens_seen": 1559899136 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002663791374122367, + "loss": 2.7294, + "theoretical_loss": 3.503995364743482, + "tokens_seen": 1559964672 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002663691073219659, + "loss": 2.781, + "theoretical_loss": 3.5039824833130835, + "tokens_seen": 1560030208 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026635907723169514, + "loss": 2.7887, + "theoretical_loss": 3.503969602575328, + "tokens_seen": 1560095744 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026634904714142426, + "loss": 2.626, + "theoretical_loss": 3.5039567225301482, + "tokens_seen": 1560161280 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002663390170511535, + "loss": 2.7191, + "theoretical_loss": 3.5039438431774794, + "tokens_seen": 1560226816 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002663289869608826, + "loss": 2.5363, + "theoretical_loss": 3.5039309645172536, + "tokens_seen": 1560292352 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026631895687061186, + "loss": 2.8227, + "theoretical_loss": 3.5039180865494055, + "tokens_seen": 1560357888 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026630892678034104, + "loss": 2.8357, + "theoretical_loss": 3.503905209273869, + "tokens_seen": 1560423424 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002662988966900702, + "loss": 2.599, + "theoretical_loss": 3.503892332690577, + "tokens_seen": 1560488960 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002662888665997994, + "loss": 2.6265, + "theoretical_loss": 3.5038794567994636, + "tokens_seen": 1560554496 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002662788365095286, + "loss": 2.6774, + "theoretical_loss": 3.503866581600463, + "tokens_seen": 1560620032 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026626880641925777, + "loss": 2.6937, + "theoretical_loss": 3.503853707093508, + "tokens_seen": 1560685568 + }, + { + "epoch": 5.02, + "learning_rate": 0.000266258776328987, + "loss": 2.4141, + "theoretical_loss": 3.5038408332785327, + "tokens_seen": 1560751104 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026624874623871613, + "loss": 2.4629, + "theoretical_loss": 3.5038279601554714, + "tokens_seen": 1560816640 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026623871614844536, + "loss": 2.7788, + "theoretical_loss": 3.503815087724257, + "tokens_seen": 1560882176 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026622868605817454, + "loss": 2.4715, + "theoretical_loss": 3.503802215984824, + "tokens_seen": 1560947712 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002662186559679037, + "loss": 2.6418, + "theoretical_loss": 3.503789344937106, + "tokens_seen": 1561013248 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002662086258776329, + "loss": 2.796, + "theoretical_loss": 3.503776474581036, + "tokens_seen": 1561078784 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002661985957873621, + "loss": 2.5447, + "theoretical_loss": 3.503763604916549, + "tokens_seen": 1561144320 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026618856569709127, + "loss": 2.6333, + "theoretical_loss": 3.503750735943578, + "tokens_seen": 1561209856 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002661785356068205, + "loss": 2.657, + "theoretical_loss": 3.5037378676620574, + "tokens_seen": 1561275392 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026616850551654963, + "loss": 2.8328, + "theoretical_loss": 3.5037250000719204, + "tokens_seen": 1561340928 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1767472, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.634784460067749, + "objective/train/theoretical_loss": 3.5037153498329987, + "objective/train/tokens_used": 1581850080, + "theoretical_loss": 3.5037153498329987, + "tokens_seen": 1561390080 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026615847542627887, + "loss": 2.4677, + "theoretical_loss": 3.5037121331731016, + "tokens_seen": 1561406464 + }, + { + "epoch": 5.02, + "learning_rate": 0.000266148445336008, + "loss": 2.7243, + "theoretical_loss": 3.5036992669655342, + "tokens_seen": 1561472000 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026613841524573723, + "loss": 2.814, + "theoretical_loss": 3.503686401449152, + "tokens_seen": 1561537536 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002661283851554664, + "loss": 2.7473, + "theoretical_loss": 3.5036735366238894, + "tokens_seen": 1561603072 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002661183550651956, + "loss": 2.5054, + "theoretical_loss": 3.50366067248968, + "tokens_seen": 1561668608 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026610832497492477, + "loss": 2.3481, + "theoretical_loss": 3.5036478090464573, + "tokens_seen": 1561734144 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026609829488465395, + "loss": 2.5554, + "theoretical_loss": 3.503634946294156, + "tokens_seen": 1561799680 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026608826479438313, + "loss": 2.5268, + "theoretical_loss": 3.503622084232709, + "tokens_seen": 1561865216 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026607823470411237, + "loss": 2.5213, + "theoretical_loss": 3.5036092228620515, + "tokens_seen": 1561930752 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002660682046138415, + "loss": 2.7919, + "theoretical_loss": 3.5035963621821162, + "tokens_seen": 1561996288 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026605817452357073, + "loss": 2.657, + "theoretical_loss": 3.5035835021928374, + "tokens_seen": 1562061824 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002660481444332999, + "loss": 2.4765, + "theoretical_loss": 3.5035706428941493, + "tokens_seen": 1562127360 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002660381143430291, + "loss": 2.5715, + "theoretical_loss": 3.5035577842859857, + "tokens_seen": 1562192896 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002660280842527583, + "loss": 2.3602, + "theoretical_loss": 3.5035449263682805, + "tokens_seen": 1562258432 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026601805416248746, + "loss": 2.741, + "theoretical_loss": 3.503532069140968, + "tokens_seen": 1562323968 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026600802407221664, + "loss": 2.6975, + "theoretical_loss": 3.5035192126039814, + "tokens_seen": 1562389504 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026599799398194587, + "loss": 2.6357, + "theoretical_loss": 3.5035063567572555, + "tokens_seen": 1562455040 + }, + { + "epoch": 5.02, + "learning_rate": 0.000265987963891675, + "loss": 2.7784, + "theoretical_loss": 3.5034935016007243, + "tokens_seen": 1562520576 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026597793380140423, + "loss": 2.7628, + "theoretical_loss": 3.503480647134321, + "tokens_seen": 1562586112 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026596790371113336, + "loss": 2.7268, + "theoretical_loss": 3.50346779335798, + "tokens_seen": 1562651648 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002659578736208626, + "loss": 2.5596, + "theoretical_loss": 3.5034549402716353, + "tokens_seen": 1562717184 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002659478435305918, + "loss": 2.7433, + "theoretical_loss": 3.5034420878752206, + "tokens_seen": 1562782720 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026593781344032096, + "loss": 2.7509, + "theoretical_loss": 3.503429236168671, + "tokens_seen": 1562848256 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026592778335005014, + "loss": 2.7708, + "theoretical_loss": 3.5034163851519198, + "tokens_seen": 1562913792 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002659177532597794, + "loss": 2.6534, + "theoretical_loss": 3.5034035348249013, + "tokens_seen": 1562979328 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1768239, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8608767986297607, + "objective/train/theoretical_loss": 3.5033938975322343, + "objective/train/tokens_used": 1583488480, + "theoretical_loss": 3.5033938975322343, + "tokens_seen": 1563028480 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002659077231695085, + "loss": 2.7638, + "theoretical_loss": 3.503390685187549, + "tokens_seen": 1563044864 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026589769307923774, + "loss": 2.6592, + "theoretical_loss": 3.503377836239798, + "tokens_seen": 1563110400 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026588766298896686, + "loss": 2.7249, + "theoretical_loss": 3.5033649879815814, + "tokens_seen": 1563175936 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002658776328986961, + "loss": 2.6386, + "theoretical_loss": 3.503352140412833, + "tokens_seen": 1563241472 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002658676028084253, + "loss": 2.5453, + "theoretical_loss": 3.503339293533488, + "tokens_seen": 1563307008 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026585757271815446, + "loss": 2.7011, + "theoretical_loss": 3.5033264473434804, + "tokens_seen": 1563372544 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026584754262788364, + "loss": 2.805, + "theoretical_loss": 3.503313601842744, + "tokens_seen": 1563438080 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002658375125376128, + "loss": 2.708, + "theoretical_loss": 3.5033007570312122, + "tokens_seen": 1563503616 + }, + { + "epoch": 5.02, + "learning_rate": 0.000265827482447342, + "loss": 2.8063, + "theoretical_loss": 3.50328791290882, + "tokens_seen": 1563569152 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026581745235707124, + "loss": 2.6561, + "theoretical_loss": 3.5032750694755017, + "tokens_seen": 1563634688 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026580742226680037, + "loss": 2.6712, + "theoretical_loss": 3.5032622267311915, + "tokens_seen": 1563700224 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002657973921765296, + "loss": 2.639, + "theoretical_loss": 3.503249384675822, + "tokens_seen": 1563765760 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026578736208625873, + "loss": 2.7569, + "theoretical_loss": 3.5032365433093293, + "tokens_seen": 1563831296 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026577733199598797, + "loss": 2.5356, + "theoretical_loss": 3.5032237026316473, + "tokens_seen": 1563896832 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026576730190571715, + "loss": 2.5125, + "theoretical_loss": 3.503210862642709, + "tokens_seen": 1563962368 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026575727181544633, + "loss": 2.7183, + "theoretical_loss": 3.503198023342449, + "tokens_seen": 1564027904 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026574724172517556, + "loss": 2.4693, + "theoretical_loss": 3.5031851847308024, + "tokens_seen": 1564093440 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026573721163490474, + "loss": 2.8912, + "theoretical_loss": 3.503172346807703, + "tokens_seen": 1564158976 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002657271815446339, + "loss": 2.3473, + "theoretical_loss": 3.503159509573085, + "tokens_seen": 1564224512 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002657171514543631, + "loss": 2.6239, + "theoretical_loss": 3.5031466730268823, + "tokens_seen": 1564290048 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002657071213640923, + "loss": 2.5645, + "theoretical_loss": 3.5031338371690293, + "tokens_seen": 1564355584 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026569709127382147, + "loss": 2.6451, + "theoretical_loss": 3.5031210019994603, + "tokens_seen": 1564421120 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002656870611835507, + "loss": 2.6027, + "theoretical_loss": 3.5031081675181093, + "tokens_seen": 1564486656 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026567703109327983, + "loss": 2.6622, + "theoretical_loss": 3.503095333724911, + "tokens_seen": 1564552192 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026566700100300907, + "loss": 2.5073, + "theoretical_loss": 3.5030825006198, + "tokens_seen": 1564617728 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1769768, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.596804141998291, + "objective/train/theoretical_loss": 3.5030728762424834, + "objective/train/tokens_used": 1585126880, + "theoretical_loss": 3.5030728762424834, + "tokens_seen": 1564666880 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002656569709127382, + "loss": 2.5982, + "theoretical_loss": 3.503069668202709, + "tokens_seen": 1564683264 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026564694082246743, + "loss": 2.6338, + "theoretical_loss": 3.5030568364735744, + "tokens_seen": 1564748800 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002656369107321966, + "loss": 2.7805, + "theoretical_loss": 3.5030440054323293, + "tokens_seen": 1564814336 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002656268806419258, + "loss": 2.7623, + "theoretical_loss": 3.503031175078908, + "tokens_seen": 1564879872 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026561685055165497, + "loss": 2.4328, + "theoretical_loss": 3.503018345413245, + "tokens_seen": 1564945408 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026560682046138415, + "loss": 2.615, + "theoretical_loss": 3.503005516435275, + "tokens_seen": 1565010944 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026559679037111333, + "loss": 2.6627, + "theoretical_loss": 3.502992688144932, + "tokens_seen": 1565076480 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026558676028084257, + "loss": 2.5018, + "theoretical_loss": 3.5029798605421503, + "tokens_seen": 1565142016 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002655767301905717, + "loss": 2.3945, + "theoretical_loss": 3.5029670336268643, + "tokens_seen": 1565207552 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026556670010030093, + "loss": 2.6775, + "theoretical_loss": 3.5029542073990085, + "tokens_seen": 1565273088 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002655566700100301, + "loss": 2.7513, + "theoretical_loss": 3.502941381858517, + "tokens_seen": 1565338624 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002655466399197593, + "loss": 2.4477, + "theoretical_loss": 3.502928557005325, + "tokens_seen": 1565404160 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002655366098294885, + "loss": 2.8699, + "theoretical_loss": 3.502915732839366, + "tokens_seen": 1565469696 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026552657973921766, + "loss": 2.6521, + "theoretical_loss": 3.502902909360574, + "tokens_seen": 1565535232 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026551654964894684, + "loss": 2.6694, + "theoretical_loss": 3.502890086568885, + "tokens_seen": 1565600768 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026550651955867607, + "loss": 2.5778, + "theoretical_loss": 3.5028772644642316, + "tokens_seen": 1565666304 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002654964894684052, + "loss": 2.7916, + "theoretical_loss": 3.5028644430465503, + "tokens_seen": 1565731840 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026548645937813443, + "loss": 2.5472, + "theoretical_loss": 3.5028516223157737, + "tokens_seen": 1565797376 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026547642928786356, + "loss": 2.6669, + "theoretical_loss": 3.502838802271837, + "tokens_seen": 1565862912 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002654663991975928, + "loss": 2.6802, + "theoretical_loss": 3.5028259829146746, + "tokens_seen": 1565928448 + }, + { + "epoch": 5.02, + "learning_rate": 0.000265456369107322, + "loss": 2.548, + "theoretical_loss": 3.502813164244221, + "tokens_seen": 1565993984 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026544633901705116, + "loss": 2.7317, + "theoretical_loss": 3.5028003462604107, + "tokens_seen": 1566059520 + }, + { + "epoch": 5.02, + "learning_rate": 0.00026543630892678034, + "loss": 2.7592, + "theoretical_loss": 3.5027875289631782, + "tokens_seen": 1566125056 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002654262788365096, + "loss": 2.2943, + "theoretical_loss": 3.502774712352458, + "tokens_seen": 1566190592 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002654162487462387, + "loss": 2.5666, + "theoretical_loss": 3.5027618964281846, + "tokens_seen": 1566256128 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1770273, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4401655197143555, + "objective/train/theoretical_loss": 3.5027522849354202, + "objective/train/tokens_used": 1586765280, + "theoretical_loss": 3.5027522849354202, + "tokens_seen": 1566305280 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026540621865596794, + "loss": 2.5743, + "theoretical_loss": 3.5027490811902924, + "tokens_seen": 1566321664 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026539618856569707, + "loss": 2.7278, + "theoretical_loss": 3.5027362666387156, + "tokens_seen": 1566387200 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002653861584754263, + "loss": 2.7307, + "theoretical_loss": 3.5027234527733895, + "tokens_seen": 1566452736 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002653761283851555, + "loss": 2.7276, + "theoretical_loss": 3.502710639594248, + "tokens_seen": 1566518272 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026536609829488466, + "loss": 2.4546, + "theoretical_loss": 3.502697827101226, + "tokens_seen": 1566583808 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026535606820461384, + "loss": 2.4774, + "theoretical_loss": 3.5026850152942584, + "tokens_seen": 1566649344 + }, + { + "epoch": 5.03, + "learning_rate": 0.000265346038114343, + "loss": 2.6686, + "theoretical_loss": 3.5026722041732787, + "tokens_seen": 1566714880 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002653360080240722, + "loss": 2.5979, + "theoretical_loss": 3.502659393738223, + "tokens_seen": 1566780416 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026532597793380144, + "loss": 2.6913, + "theoretical_loss": 3.5026465839890237, + "tokens_seen": 1566845952 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026531594784353057, + "loss": 2.5873, + "theoretical_loss": 3.5026337749256182, + "tokens_seen": 1566911488 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002653059177532598, + "loss": 2.4014, + "theoretical_loss": 3.5026209665479384, + "tokens_seen": 1566977024 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026529588766298893, + "loss": 2.7904, + "theoretical_loss": 3.502608158855921, + "tokens_seen": 1567042560 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026528585757271817, + "loss": 2.5611, + "theoretical_loss": 3.5025953518494988, + "tokens_seen": 1567108096 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026527582748244735, + "loss": 2.5719, + "theoretical_loss": 3.502582545528608, + "tokens_seen": 1567173632 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026526579739217653, + "loss": 2.5116, + "theoretical_loss": 3.5025697398931825, + "tokens_seen": 1567239168 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002652557673019057, + "loss": 2.4293, + "theoretical_loss": 3.502556934943157, + "tokens_seen": 1567304704 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026524573721163494, + "loss": 2.7416, + "theoretical_loss": 3.502544130678466, + "tokens_seen": 1567370240 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026523570712136407, + "loss": 2.5996, + "theoretical_loss": 3.5025313270990446, + "tokens_seen": 1567435776 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002652256770310933, + "loss": 2.5542, + "theoretical_loss": 3.5025185242048273, + "tokens_seen": 1567501312 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026521564694082243, + "loss": 2.6636, + "theoretical_loss": 3.5025057219957487, + "tokens_seen": 1567566848 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026520561685055167, + "loss": 2.7647, + "theoretical_loss": 3.5024929204717434, + "tokens_seen": 1567632384 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026519558676028085, + "loss": 2.7057, + "theoretical_loss": 3.502480119632746, + "tokens_seen": 1567697920 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026518555667001003, + "loss": 2.5071, + "theoretical_loss": 3.5024673194786917, + "tokens_seen": 1567763456 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002651755265797392, + "loss": 2.8567, + "theoretical_loss": 3.5024545200095156, + "tokens_seen": 1567828992 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002651654964894684, + "loss": 2.7552, + "theoretical_loss": 3.502441721225151, + "tokens_seen": 1567894528 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1771462, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3107831478118896, + "objective/train/theoretical_loss": 3.5024321225862467, + "objective/train/tokens_used": 1588403680, + "theoretical_loss": 3.5024321225862467, + "tokens_seen": 1567943680 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002651554663991976, + "loss": 2.8068, + "theoretical_loss": 3.502428923125534, + "tokens_seen": 1567960064 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002651454363089268, + "loss": 2.6769, + "theoretical_loss": 3.5024161257105986, + "tokens_seen": 1568025600 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026513540621865594, + "loss": 2.4889, + "theoretical_loss": 3.5024033289802796, + "tokens_seen": 1568091136 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026512537612838517, + "loss": 2.7356, + "theoretical_loss": 3.5023905329345117, + "tokens_seen": 1568156672 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002651153460381143, + "loss": 2.5249, + "theoretical_loss": 3.50237773757323, + "tokens_seen": 1568222208 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026510531594784353, + "loss": 2.4725, + "theoretical_loss": 3.502364942896369, + "tokens_seen": 1568287744 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002650952858575727, + "loss": 2.6477, + "theoretical_loss": 3.5023521489038645, + "tokens_seen": 1568353280 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002650852557673019, + "loss": 2.5498, + "theoretical_loss": 3.5023393555956495, + "tokens_seen": 1568418816 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002650752256770311, + "loss": 2.7988, + "theoretical_loss": 3.50232656297166, + "tokens_seen": 1568484352 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002650651955867603, + "loss": 2.7056, + "theoretical_loss": 3.5023137710318304, + "tokens_seen": 1568549888 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026505516549648944, + "loss": 2.723, + "theoretical_loss": 3.5023009797760962, + "tokens_seen": 1568615424 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002650451354062187, + "loss": 2.8434, + "theoretical_loss": 3.5022881892043918, + "tokens_seen": 1568680960 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002650351053159478, + "loss": 2.6988, + "theoretical_loss": 3.5022753993166518, + "tokens_seen": 1568746496 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026502507522567704, + "loss": 2.5269, + "theoretical_loss": 3.502262610112811, + "tokens_seen": 1568812032 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002650150451354062, + "loss": 2.7634, + "theoretical_loss": 3.502249821592805, + "tokens_seen": 1568877568 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002650050150451354, + "loss": 2.457, + "theoretical_loss": 3.502237033756568, + "tokens_seen": 1568943104 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026499498495486464, + "loss": 2.5449, + "theoretical_loss": 3.5022242466040345, + "tokens_seen": 1569008640 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026498495486459376, + "loss": 2.5428, + "theoretical_loss": 3.5022114601351406, + "tokens_seen": 1569074176 + }, + { + "epoch": 5.03, + "learning_rate": 0.000264974924774323, + "loss": 2.5365, + "theoretical_loss": 3.5021986743498204, + "tokens_seen": 1569139712 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002649648946840522, + "loss": 2.7373, + "theoretical_loss": 3.502185889248009, + "tokens_seen": 1569205248 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026495486459378136, + "loss": 2.6758, + "theoretical_loss": 3.502173104829641, + "tokens_seen": 1569270784 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026494483450351054, + "loss": 2.9497, + "theoretical_loss": 3.5021603210946517, + "tokens_seen": 1569336320 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002649348044132398, + "loss": 2.7386, + "theoretical_loss": 3.5021475380429763, + "tokens_seen": 1569401856 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002649247743229689, + "loss": 2.3083, + "theoretical_loss": 3.502134755674549, + "tokens_seen": 1569467392 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026491474423269814, + "loss": 2.6491, + "theoretical_loss": 3.502121973989305, + "tokens_seen": 1569532928 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1772261, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.672778844833374, + "objective/train/theoretical_loss": 3.5021123881736727, + "objective/train/tokens_used": 1590042080, + "theoretical_loss": 3.5021123881736727, + "tokens_seen": 1569582080 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026490471414242727, + "loss": 2.5048, + "theoretical_loss": 3.50210919298718, + "tokens_seen": 1569598464 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002648946840521565, + "loss": 2.5785, + "theoretical_loss": 3.502096412668108, + "tokens_seen": 1569664000 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002648846539618857, + "loss": 2.6309, + "theoretical_loss": 3.5020836330320244, + "tokens_seen": 1569729536 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026487462387161486, + "loss": 2.6183, + "theoretical_loss": 3.5020708540788643, + "tokens_seen": 1569795072 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026486459378134404, + "loss": 2.7797, + "theoretical_loss": 3.502058075808562, + "tokens_seen": 1569860608 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002648545636910732, + "loss": 2.5406, + "theoretical_loss": 3.5020452982210535, + "tokens_seen": 1569926144 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002648445336008024, + "loss": 2.641, + "theoretical_loss": 3.5020325213162735, + "tokens_seen": 1569991680 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026483450351053164, + "loss": 2.7582, + "theoretical_loss": 3.5020197450941564, + "tokens_seen": 1570057216 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026482447342026077, + "loss": 2.8391, + "theoretical_loss": 3.502006969554638, + "tokens_seen": 1570122752 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026481444332999, + "loss": 2.5771, + "theoretical_loss": 3.501994194697653, + "tokens_seen": 1570188288 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026480441323971913, + "loss": 2.791, + "theoretical_loss": 3.5019814205231365, + "tokens_seen": 1570253824 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026479438314944837, + "loss": 2.7583, + "theoretical_loss": 3.501968647031023, + "tokens_seen": 1570319360 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026478435305917755, + "loss": 2.6849, + "theoretical_loss": 3.5019558742212493, + "tokens_seen": 1570384896 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026477432296890673, + "loss": 2.4884, + "theoretical_loss": 3.5019431020937484, + "tokens_seen": 1570450432 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002647642928786359, + "loss": 2.7081, + "theoretical_loss": 3.501930330648457, + "tokens_seen": 1570515968 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026475426278836514, + "loss": 2.5406, + "theoretical_loss": 3.501917559885309, + "tokens_seen": 1570581504 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026474423269809427, + "loss": 2.7217, + "theoretical_loss": 3.50190478980424, + "tokens_seen": 1570647040 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002647342026078235, + "loss": 2.8213, + "theoretical_loss": 3.501892020405185, + "tokens_seen": 1570712576 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026472417251755263, + "loss": 2.8029, + "theoretical_loss": 3.5018792516880795, + "tokens_seen": 1570778112 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026471414242728187, + "loss": 2.7009, + "theoretical_loss": 3.501866483652858, + "tokens_seen": 1570843648 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026470411233701105, + "loss": 2.5567, + "theoretical_loss": 3.5018537162994567, + "tokens_seen": 1570909184 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026469408224674023, + "loss": 2.6985, + "theoretical_loss": 3.5018409496278093, + "tokens_seen": 1570974720 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002646840521564694, + "loss": 2.6412, + "theoretical_loss": 3.501828183637852, + "tokens_seen": 1571040256 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002646740220661986, + "loss": 2.6996, + "theoretical_loss": 3.50181541832952, + "tokens_seen": 1571105792 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002646639919759278, + "loss": 2.4996, + "theoretical_loss": 3.5018026537027476, + "tokens_seen": 1571171328 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1773327, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5414953231811523, + "objective/train/theoretical_loss": 3.5017930806799034, + "objective/train/tokens_used": 1591680480, + "theoretical_loss": 3.5017930806799034, + "tokens_seen": 1571220480 + }, + { + "epoch": 5.03, + "learning_rate": 0.000264653961885657, + "loss": 2.4725, + "theoretical_loss": 3.501789889757471, + "tokens_seen": 1571236864 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026464393179538614, + "loss": 2.4687, + "theoretical_loss": 3.5017771264936246, + "tokens_seen": 1571302400 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026463390170511537, + "loss": 2.6159, + "theoretical_loss": 3.5017643639111435, + "tokens_seen": 1571367936 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002646238716148445, + "loss": 2.6949, + "theoretical_loss": 3.501751602009964, + "tokens_seen": 1571433472 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026461384152457373, + "loss": 2.6275, + "theoretical_loss": 3.5017388407900207, + "tokens_seen": 1571499008 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002646038114343029, + "loss": 2.6047, + "theoretical_loss": 3.5017260802512484, + "tokens_seen": 1571564544 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002645937813440321, + "loss": 2.6284, + "theoretical_loss": 3.501713320393583, + "tokens_seen": 1571630080 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002645837512537613, + "loss": 2.8202, + "theoretical_loss": 3.5017005612169596, + "tokens_seen": 1571695616 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002645737211634905, + "loss": 2.5874, + "theoretical_loss": 3.501687802721313, + "tokens_seen": 1571761152 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026456369107321964, + "loss": 2.7119, + "theoretical_loss": 3.5016750449065785, + "tokens_seen": 1571826688 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002645536609829489, + "loss": 2.5645, + "theoretical_loss": 3.501662287772692, + "tokens_seen": 1571892224 + }, + { + "epoch": 5.03, + "learning_rate": 0.000264543630892678, + "loss": 2.6312, + "theoretical_loss": 3.5016495313195883, + "tokens_seen": 1571957760 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026453360080240724, + "loss": 2.5627, + "theoretical_loss": 3.501636775547203, + "tokens_seen": 1572023296 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002645235707121364, + "loss": 2.8078, + "theoretical_loss": 3.5016240204554716, + "tokens_seen": 1572088832 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002645135406218656, + "loss": 2.4745, + "theoretical_loss": 3.5016112660443284, + "tokens_seen": 1572154368 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002645035105315948, + "loss": 2.4327, + "theoretical_loss": 3.501598512313709, + "tokens_seen": 1572219904 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026449348044132396, + "loss": 2.6913, + "theoretical_loss": 3.50158575926355, + "tokens_seen": 1572285440 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026448345035105314, + "loss": 2.6209, + "theoretical_loss": 3.5015730068937856, + "tokens_seen": 1572350976 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002644734202607824, + "loss": 2.6321, + "theoretical_loss": 3.501560255204351, + "tokens_seen": 1572416512 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002644633901705115, + "loss": 2.8599, + "theoretical_loss": 3.501547504195182, + "tokens_seen": 1572482048 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026445336008024074, + "loss": 2.852, + "theoretical_loss": 3.5015347538662143, + "tokens_seen": 1572547584 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026444332998996987, + "loss": 2.7477, + "theoretical_loss": 3.501522004217382, + "tokens_seen": 1572613120 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002644332998996991, + "loss": 2.7597, + "theoretical_loss": 3.5015092552486218, + "tokens_seen": 1572678656 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002644232698094283, + "loss": 2.8664, + "theoretical_loss": 3.5014965069598687, + "tokens_seen": 1572744192 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026441323971915747, + "loss": 2.6851, + "theoretical_loss": 3.501483759351058, + "tokens_seen": 1572809728 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1773816, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.9573558568954468, + "objective/train/theoretical_loss": 3.501474199090623, + "objective/train/tokens_used": 1593318880, + "theoretical_loss": 3.501474199090623, + "tokens_seen": 1572858880 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026440320962888665, + "loss": 2.6914, + "theoretical_loss": 3.501471012422125, + "tokens_seen": 1572875264 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002643931795386159, + "loss": 2.5659, + "theoretical_loss": 3.501458266173005, + "tokens_seen": 1572940800 + }, + { + "epoch": 5.03, + "learning_rate": 0.000264383149448345, + "loss": 2.8077, + "theoretical_loss": 3.501445520603634, + "tokens_seen": 1573006336 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026437311935807424, + "loss": 2.7236, + "theoretical_loss": 3.5014327757139467, + "tokens_seen": 1573071872 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026436308926780337, + "loss": 2.6091, + "theoretical_loss": 3.501420031503879, + "tokens_seen": 1573137408 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002643530591775326, + "loss": 2.5839, + "theoretical_loss": 3.5014072879733664, + "tokens_seen": 1573202944 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002643430290872618, + "loss": 2.5391, + "theoretical_loss": 3.501394545122344, + "tokens_seen": 1573268480 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026433299899699097, + "loss": 2.8725, + "theoretical_loss": 3.501381802950748, + "tokens_seen": 1573334016 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026432296890672015, + "loss": 2.803, + "theoretical_loss": 3.501369061458513, + "tokens_seen": 1573399552 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026431293881644933, + "loss": 2.682, + "theoretical_loss": 3.5013563206455744, + "tokens_seen": 1573465088 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002643029087261785, + "loss": 2.5005, + "theoretical_loss": 3.501343580511869, + "tokens_seen": 1573530624 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026429287863590775, + "loss": 2.9047, + "theoretical_loss": 3.5013308410573307, + "tokens_seen": 1573596160 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002642828485456369, + "loss": 2.7758, + "theoretical_loss": 3.501318102281896, + "tokens_seen": 1573661696 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002642728184553661, + "loss": 2.6322, + "theoretical_loss": 3.5013053641855008, + "tokens_seen": 1573727232 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026426278836509534, + "loss": 2.8375, + "theoretical_loss": 3.501292626768079, + "tokens_seen": 1573792768 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026425275827482447, + "loss": 2.714, + "theoretical_loss": 3.5012798900295676, + "tokens_seen": 1573858304 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002642427281845537, + "loss": 2.743, + "theoretical_loss": 3.501267153969902, + "tokens_seen": 1573923840 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026423269809428283, + "loss": 2.6854, + "theoretical_loss": 3.5012544185890166, + "tokens_seen": 1573989376 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026422266800401207, + "loss": 2.5074, + "theoretical_loss": 3.5012416838868483, + "tokens_seen": 1574054912 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026421263791374125, + "loss": 2.6326, + "theoretical_loss": 3.501228949863332, + "tokens_seen": 1574120448 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026420260782347043, + "loss": 2.7372, + "theoretical_loss": 3.5012162165184035, + "tokens_seen": 1574185984 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002641925777331996, + "loss": 2.724, + "theoretical_loss": 3.5012034838519988, + "tokens_seen": 1574251520 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002641825476429288, + "loss": 2.444, + "theoretical_loss": 3.5011907518640526, + "tokens_seen": 1574317056 + }, + { + "epoch": 5.03, + "learning_rate": 0.000264172517552658, + "loss": 2.8405, + "theoretical_loss": 3.5011780205545007, + "tokens_seen": 1574382592 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002641624874623872, + "loss": 2.5441, + "theoretical_loss": 3.5011652899232795, + "tokens_seen": 1574448128 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1774770, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.815545082092285, + "objective/train/theoretical_loss": 3.501155742394979, + "objective/train/tokens_used": 1594957280, + "theoretical_loss": 3.501155742394979, + "tokens_seen": 1574497280 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026415245737211634, + "loss": 2.7969, + "theoretical_loss": 3.5011525599703237, + "tokens_seen": 1574513664 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026414242728184557, + "loss": 2.6354, + "theoretical_loss": 3.5011398306955694, + "tokens_seen": 1574579200 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002641323971915747, + "loss": 2.7739, + "theoretical_loss": 3.5011271020989527, + "tokens_seen": 1574644736 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026412236710130393, + "loss": 2.8288, + "theoretical_loss": 3.501114374180408, + "tokens_seen": 1574710272 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002641123370110331, + "loss": 2.5933, + "theoretical_loss": 3.5011016469398717, + "tokens_seen": 1574775808 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002641023069207623, + "loss": 2.6361, + "theoretical_loss": 3.5010889203772804, + "tokens_seen": 1574841344 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002640922768304915, + "loss": 2.5667, + "theoretical_loss": 3.5010761944925677, + "tokens_seen": 1574906880 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002640822467402207, + "loss": 2.7329, + "theoretical_loss": 3.501063469285671, + "tokens_seen": 1574972416 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026407221664994984, + "loss": 2.7016, + "theoretical_loss": 3.501050744756525, + "tokens_seen": 1575037952 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002640621865596791, + "loss": 2.7043, + "theoretical_loss": 3.501038020905066, + "tokens_seen": 1575103488 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002640521564694082, + "loss": 2.8215, + "theoretical_loss": 3.5010252977312297, + "tokens_seen": 1575169024 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026404212637913744, + "loss": 2.5045, + "theoretical_loss": 3.5010125752349515, + "tokens_seen": 1575234560 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002640320962888666, + "loss": 2.7202, + "theoretical_loss": 3.5009998534161673, + "tokens_seen": 1575300096 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002640220661985958, + "loss": 2.7227, + "theoretical_loss": 3.5009871322748127, + "tokens_seen": 1575365632 + }, + { + "epoch": 5.03, + "learning_rate": 0.000264012036108325, + "loss": 2.6589, + "theoretical_loss": 3.500974411810824, + "tokens_seen": 1575431168 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026400200601805416, + "loss": 2.7051, + "theoretical_loss": 3.500961692024136, + "tokens_seen": 1575496704 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026399197592778334, + "loss": 2.717, + "theoretical_loss": 3.500948972914685, + "tokens_seen": 1575562240 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002639819458375126, + "loss": 2.7733, + "theoretical_loss": 3.500936254482407, + "tokens_seen": 1575627776 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002639719157472417, + "loss": 2.8105, + "theoretical_loss": 3.500923536727237, + "tokens_seen": 1575693312 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026396188565697094, + "loss": 2.8266, + "theoretical_loss": 3.5009108196491123, + "tokens_seen": 1575758848 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026395185556670007, + "loss": 2.8239, + "theoretical_loss": 3.500898103247967, + "tokens_seen": 1575824384 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002639418254764293, + "loss": 2.6971, + "theoretical_loss": 3.500885387523738, + "tokens_seen": 1575889920 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002639317953861585, + "loss": 2.7925, + "theoretical_loss": 3.500872672476361, + "tokens_seen": 1575955456 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026392176529588767, + "loss": 2.7641, + "theoretical_loss": 3.500859958105771, + "tokens_seen": 1576020992 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026391173520561685, + "loss": 2.9513, + "theoretical_loss": 3.5008472444119048, + "tokens_seen": 1576086528 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1774770, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.41874098777771, + "objective/train/theoretical_loss": 3.500837709585566, + "objective/train/tokens_used": 1596595680, + "theoretical_loss": 3.500837709585566, + "tokens_seen": 1576135680 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002639017051153461, + "loss": 2.8032, + "theoretical_loss": 3.5008345313946974, + "tokens_seen": 1576152064 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002638916750250752, + "loss": 2.5515, + "theoretical_loss": 3.5008218190540856, + "tokens_seen": 1576217600 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026388164493480444, + "loss": 2.8856, + "theoretical_loss": 3.5008091073900047, + "tokens_seen": 1576283136 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026387161484453357, + "loss": 2.8323, + "theoretical_loss": 3.5007963964023903, + "tokens_seen": 1576348672 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002638615847542628, + "loss": 2.6877, + "theoretical_loss": 3.500783686091179, + "tokens_seen": 1576414208 + }, + { + "epoch": 5.03, + "learning_rate": 0.000263851554663992, + "loss": 2.5179, + "theoretical_loss": 3.500770976456306, + "tokens_seen": 1576479744 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026384152457372117, + "loss": 2.9558, + "theoretical_loss": 3.5007582674977082, + "tokens_seen": 1576545280 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026383149448345035, + "loss": 2.8958, + "theoretical_loss": 3.5007455592153205, + "tokens_seen": 1576610816 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026382146439317953, + "loss": 2.7204, + "theoretical_loss": 3.500732851609079, + "tokens_seen": 1576676352 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002638114343029087, + "loss": 2.9708, + "theoretical_loss": 3.50072014467892, + "tokens_seen": 1576741888 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026380140421263795, + "loss": 2.8209, + "theoretical_loss": 3.5007074384247785, + "tokens_seen": 1576807424 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002637913741223671, + "loss": 3.039, + "theoretical_loss": 3.5006947328465916, + "tokens_seen": 1576872960 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002637813440320963, + "loss": 2.7014, + "theoretical_loss": 3.500682027944295, + "tokens_seen": 1576938496 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002637713139418255, + "loss": 2.9119, + "theoretical_loss": 3.5006693237178244, + "tokens_seen": 1577004032 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026376128385155467, + "loss": 2.7541, + "theoretical_loss": 3.5006566201671157, + "tokens_seen": 1577069568 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026375125376128385, + "loss": 3.0195, + "theoretical_loss": 3.5006439172921047, + "tokens_seen": 1577135104 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026374122367101303, + "loss": 2.8242, + "theoretical_loss": 3.500631215092728, + "tokens_seen": 1577200640 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002637311935807422, + "loss": 2.9552, + "theoretical_loss": 3.5006185135689214, + "tokens_seen": 1577266176 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026372116349047145, + "loss": 2.8054, + "theoretical_loss": 3.5006058127206208, + "tokens_seen": 1577331712 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002637111334002006, + "loss": 2.9058, + "theoretical_loss": 3.5005931125477616, + "tokens_seen": 1577397248 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002637011033099298, + "loss": 2.7677, + "theoretical_loss": 3.5005804130502804, + "tokens_seen": 1577462784 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026369107321965894, + "loss": 2.8287, + "theoretical_loss": 3.500567714228114, + "tokens_seen": 1577528320 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002636810431293882, + "loss": 2.7696, + "theoretical_loss": 3.500555016081197, + "tokens_seen": 1577593856 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026367101303911736, + "loss": 2.9985, + "theoretical_loss": 3.5005423186094666, + "tokens_seen": 1577659392 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026366098294884654, + "loss": 2.96, + "theoretical_loss": 3.5005296218128583, + "tokens_seen": 1577724928 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1775526, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7164270877838135, + "objective/train/theoretical_loss": 3.500520099658412, + "objective/train/tokens_used": 1598234080, + "theoretical_loss": 3.500520099658412, + "tokens_seen": 1577774080 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002636509528585757, + "loss": 2.664, + "theoretical_loss": 3.500516925691308, + "tokens_seen": 1577790464 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002636409227683049, + "loss": 2.6874, + "theoretical_loss": 3.5005042302447515, + "tokens_seen": 1577856000 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002636308926780341, + "loss": 2.6584, + "theoretical_loss": 3.500491535473126, + "tokens_seen": 1577921536 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002636208625877633, + "loss": 2.9151, + "theoretical_loss": 3.500478841376367, + "tokens_seen": 1577987072 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026361083249749244, + "loss": 3.0229, + "theoretical_loss": 3.50046614795441, + "tokens_seen": 1578052608 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002636008024072217, + "loss": 2.8641, + "theoretical_loss": 3.500453455207192, + "tokens_seen": 1578118144 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026359077231695086, + "loss": 2.692, + "theoretical_loss": 3.500440763134649, + "tokens_seen": 1578183680 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026358074222668004, + "loss": 2.9396, + "theoretical_loss": 3.5004280717367164, + "tokens_seen": 1578249216 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002635707121364092, + "loss": 2.8203, + "theoretical_loss": 3.5004153810133314, + "tokens_seen": 1578314752 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002635606820461384, + "loss": 2.5741, + "theoretical_loss": 3.5004026909644295, + "tokens_seen": 1578380288 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002635506519558676, + "loss": 2.9397, + "theoretical_loss": 3.5003900015899467, + "tokens_seen": 1578445824 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002635406218655968, + "loss": 2.8295, + "theoretical_loss": 3.5003773128898192, + "tokens_seen": 1578511360 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026353059177532595, + "loss": 2.7902, + "theoretical_loss": 3.500364624863984, + "tokens_seen": 1578576896 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002635205616850552, + "loss": 2.9844, + "theoretical_loss": 3.5003519375123764, + "tokens_seen": 1578642432 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026351053159478436, + "loss": 2.8751, + "theoretical_loss": 3.5003392508349327, + "tokens_seen": 1578707968 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026350050150451354, + "loss": 2.7126, + "theoretical_loss": 3.500326564831589, + "tokens_seen": 1578773504 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002634904714142428, + "loss": 2.8797, + "theoretical_loss": 3.500313879502282, + "tokens_seen": 1578839040 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002634804413239719, + "loss": 2.9553, + "theoretical_loss": 3.500301194846947, + "tokens_seen": 1578904576 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026347041123370114, + "loss": 2.9701, + "theoretical_loss": 3.5002885108655217, + "tokens_seen": 1578970112 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026346038114343027, + "loss": 2.9054, + "theoretical_loss": 3.5002758275579406, + "tokens_seen": 1579035648 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002634503510531595, + "loss": 2.67, + "theoretical_loss": 3.5002631449241415, + "tokens_seen": 1579101184 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002634403209628887, + "loss": 2.7871, + "theoretical_loss": 3.5002504629640594, + "tokens_seen": 1579166720 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026343029087261787, + "loss": 2.5263, + "theoretical_loss": 3.5002377816776318, + "tokens_seen": 1579232256 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026342026078234705, + "loss": 2.8631, + "theoretical_loss": 3.500225101064794, + "tokens_seen": 1579297792 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002634102306920763, + "loss": 2.9627, + "theoretical_loss": 3.5002124211254824, + "tokens_seen": 1579363328 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1776340, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.49752140045166, + "objective/train/theoretical_loss": 3.500202911612962, + "objective/train/tokens_used": 1599872480, + "theoretical_loss": 3.500202911612962, + "tokens_seen": 1579412480 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002634002006018054, + "loss": 2.5422, + "theoretical_loss": 3.500199741859633, + "tokens_seen": 1579428864 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026339017051153464, + "loss": 2.7841, + "theoretical_loss": 3.500187063267183, + "tokens_seen": 1579494400 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026338014042126377, + "loss": 2.7422, + "theoretical_loss": 3.5001743853480685, + "tokens_seen": 1579559936 + }, + { + "epoch": 5.03, + "learning_rate": 0.000263370110330993, + "loss": 2.7585, + "theoretical_loss": 3.500161708102225, + "tokens_seen": 1579625472 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002633600802407222, + "loss": 2.6565, + "theoretical_loss": 3.5001490315295896, + "tokens_seen": 1579691008 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026335005015045137, + "loss": 3.0777, + "theoretical_loss": 3.500136355630098, + "tokens_seen": 1579756544 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026334002006018055, + "loss": 2.8242, + "theoretical_loss": 3.5001236804036875, + "tokens_seen": 1579822080 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026332998996990973, + "loss": 2.898, + "theoretical_loss": 3.500111005850293, + "tokens_seen": 1579887616 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002633199598796389, + "loss": 2.5527, + "theoretical_loss": 3.500098331969852, + "tokens_seen": 1579953152 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026330992978936815, + "loss": 2.9106, + "theoretical_loss": 3.5000856587623006, + "tokens_seen": 1580018688 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002632998996990973, + "loss": 2.6539, + "theoretical_loss": 3.5000729862275755, + "tokens_seen": 1580084224 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002632898696088265, + "loss": 2.8476, + "theoretical_loss": 3.500060314365612, + "tokens_seen": 1580149760 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002632798395185557, + "loss": 2.7165, + "theoretical_loss": 3.500047643176347, + "tokens_seen": 1580215296 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026326980942828487, + "loss": 2.874, + "theoretical_loss": 3.5000349726597175, + "tokens_seen": 1580280832 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026325977933801405, + "loss": 2.8665, + "theoretical_loss": 3.5000223028156596, + "tokens_seen": 1580346368 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026324974924774323, + "loss": 3.0691, + "theoretical_loss": 3.5000096336441096, + "tokens_seen": 1580411904 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002632397191574724, + "loss": 2.854, + "theoretical_loss": 3.4999969651450034, + "tokens_seen": 1580477440 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026322968906720165, + "loss": 2.6651, + "theoretical_loss": 3.499984297318278, + "tokens_seen": 1580542976 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002632196589769308, + "loss": 2.9626, + "theoretical_loss": 3.49997163016387, + "tokens_seen": 1580608512 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026320962888666, + "loss": 2.6678, + "theoretical_loss": 3.499958963681715, + "tokens_seen": 1580674048 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026319959879638914, + "loss": 2.7437, + "theoretical_loss": 3.499946297871751, + "tokens_seen": 1580739584 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002631895687061184, + "loss": 2.8165, + "theoretical_loss": 3.499933632733913, + "tokens_seen": 1580805120 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026317953861584756, + "loss": 2.8247, + "theoretical_loss": 3.499920968268138, + "tokens_seen": 1580870656 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026316950852557674, + "loss": 2.7621, + "theoretical_loss": 3.499908304474362, + "tokens_seen": 1580936192 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002631594784353059, + "loss": 2.7327, + "theoretical_loss": 3.4998956413525226, + "tokens_seen": 1581001728 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1777599, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8258018493652344, + "objective/train/theoretical_loss": 3.499886144452063, + "objective/train/tokens_used": 1601510880, + "theoretical_loss": 3.499886144452063, + "tokens_seen": 1581050880 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002631494483450351, + "loss": 2.9461, + "theoretical_loss": 3.499882978902555, + "tokens_seen": 1581067264 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002631394182547643, + "loss": 2.7952, + "theoretical_loss": 3.499870317124397, + "tokens_seen": 1581132800 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002631293881644935, + "loss": 2.8345, + "theoretical_loss": 3.499857656017984, + "tokens_seen": 1581198336 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026311935807422264, + "loss": 2.544, + "theoretical_loss": 3.499844995583253, + "tokens_seen": 1581263872 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002631093279839519, + "loss": 2.7987, + "theoretical_loss": 3.499832335820141, + "tokens_seen": 1581329408 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026309929789368106, + "loss": 2.6151, + "theoretical_loss": 3.4998196767285834, + "tokens_seen": 1581394944 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026308926780341024, + "loss": 2.4125, + "theoretical_loss": 3.499807018308518, + "tokens_seen": 1581460480 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002630792377131394, + "loss": 2.914, + "theoretical_loss": 3.4997943605598802, + "tokens_seen": 1581526016 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002630692076228686, + "loss": 2.8157, + "theoretical_loss": 3.499781703482607, + "tokens_seen": 1581591552 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002630591775325978, + "loss": 3.023, + "theoretical_loss": 3.4997690470766356, + "tokens_seen": 1581657088 + }, + { + "epoch": 5.03, + "learning_rate": 0.000263049147442327, + "loss": 2.708, + "theoretical_loss": 3.4997563913419016, + "tokens_seen": 1581722624 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026303911735205615, + "loss": 2.8942, + "theoretical_loss": 3.4997437362783423, + "tokens_seen": 1581788160 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002630290872617854, + "loss": 2.7689, + "theoretical_loss": 3.499731081885894, + "tokens_seen": 1581853696 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002630190571715145, + "loss": 2.7761, + "theoretical_loss": 3.4997184281644937, + "tokens_seen": 1581919232 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026300902708124374, + "loss": 3.1327, + "theoretical_loss": 3.4997057751140774, + "tokens_seen": 1581984768 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002629989969909729, + "loss": 2.7557, + "theoretical_loss": 3.499693122734582, + "tokens_seen": 1582050304 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002629889669007021, + "loss": 2.7901, + "theoretical_loss": 3.499680471025944, + "tokens_seen": 1582115840 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002629789368104313, + "loss": 2.7257, + "theoretical_loss": 3.4996678199881, + "tokens_seen": 1582181376 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026296890672016047, + "loss": 2.7572, + "theoretical_loss": 3.499655169620987, + "tokens_seen": 1582246912 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026295887662988965, + "loss": 2.7508, + "theoretical_loss": 3.4996425199245422, + "tokens_seen": 1582312448 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002629488465396189, + "loss": 2.8917, + "theoretical_loss": 3.4996298708987013, + "tokens_seen": 1582377984 + }, + { + "epoch": 5.03, + "learning_rate": 0.000262938816449348, + "loss": 3.0054, + "theoretical_loss": 3.499617222543401, + "tokens_seen": 1582443520 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026292878635907725, + "loss": 2.4528, + "theoretical_loss": 3.499604574858578, + "tokens_seen": 1582509056 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026291875626880643, + "loss": 2.8008, + "theoretical_loss": 3.499591927844169, + "tokens_seen": 1582574592 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002629087261785356, + "loss": 2.8668, + "theoretical_loss": 3.499579281500112, + "tokens_seen": 1582640128 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1778167, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9653608798980713, + "objective/train/theoretical_loss": 3.499569797181948, + "objective/train/tokens_used": 1603149280, + "theoretical_loss": 3.499569797181948, + "tokens_seen": 1582689280 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002628986960882648, + "loss": 2.8982, + "theoretical_loss": 3.4995666358263415, + "tokens_seen": 1582705664 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026288866599799397, + "loss": 2.8606, + "theoretical_loss": 3.499553990822796, + "tokens_seen": 1582771200 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026287863590772315, + "loss": 2.8867, + "theoretical_loss": 3.499541346489411, + "tokens_seen": 1582836736 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002628686058174524, + "loss": 2.7481, + "theoretical_loss": 3.499528702826125, + "tokens_seen": 1582902272 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002628585757271815, + "loss": 2.69, + "theoretical_loss": 3.4995160598328727, + "tokens_seen": 1582967808 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026284854563691075, + "loss": 2.939, + "theoretical_loss": 3.499503417509592, + "tokens_seen": 1583033344 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002628385155466399, + "loss": 2.9496, + "theoretical_loss": 3.499490775856219, + "tokens_seen": 1583098880 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002628284854563691, + "loss": 3.0367, + "theoretical_loss": 3.4994781348726915, + "tokens_seen": 1583164416 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002628184553660983, + "loss": 2.8122, + "theoretical_loss": 3.4994654945589447, + "tokens_seen": 1583229952 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002628084252758275, + "loss": 2.8815, + "theoretical_loss": 3.4994528549149173, + "tokens_seen": 1583295488 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026279839518555666, + "loss": 2.6773, + "theoretical_loss": 3.499440215940545, + "tokens_seen": 1583361024 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002627883650952859, + "loss": 2.7158, + "theoretical_loss": 3.499427577635764, + "tokens_seen": 1583426560 + }, + { + "epoch": 5.03, + "learning_rate": 0.000262778335005015, + "loss": 2.6024, + "theoretical_loss": 3.499414940000513, + "tokens_seen": 1583492096 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026276830491474425, + "loss": 2.7819, + "theoretical_loss": 3.4994023030347265, + "tokens_seen": 1583557632 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026275827482447343, + "loss": 2.7682, + "theoretical_loss": 3.499389666738343, + "tokens_seen": 1583623168 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002627482447342026, + "loss": 2.922, + "theoretical_loss": 3.499377031111299, + "tokens_seen": 1583688704 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026273821464393185, + "loss": 2.8512, + "theoretical_loss": 3.4993643961535312, + "tokens_seen": 1583754240 + }, + { + "epoch": 5.03, + "learning_rate": 0.000262728184553661, + "loss": 2.9017, + "theoretical_loss": 3.4993517618649763, + "tokens_seen": 1583819776 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002627181544633902, + "loss": 2.6698, + "theoretical_loss": 3.499339128245571, + "tokens_seen": 1583885312 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026270812437311934, + "loss": 2.8969, + "theoretical_loss": 3.4993264952952527, + "tokens_seen": 1583950848 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002626980942828486, + "loss": 2.7105, + "theoretical_loss": 3.4993138630139584, + "tokens_seen": 1584016384 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026268806419257776, + "loss": 2.7529, + "theoretical_loss": 3.4993012314016245, + "tokens_seen": 1584081920 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026267803410230694, + "loss": 2.9912, + "theoretical_loss": 3.499288600458188, + "tokens_seen": 1584147456 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002626680040120361, + "loss": 2.6269, + "theoretical_loss": 3.499275970183586, + "tokens_seen": 1584212992 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002626579739217653, + "loss": 2.8153, + "theoretical_loss": 3.499263340577755, + "tokens_seen": 1584278528 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1779618, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2659285068511963, + "objective/train/theoretical_loss": 3.499253868812225, + "objective/train/tokens_used": 1604787680, + "theoretical_loss": 3.499253868812225, + "tokens_seen": 1584327680 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002626479438314945, + "loss": 2.7086, + "theoretical_loss": 3.4992507116406326, + "tokens_seen": 1584344064 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002626379137412237, + "loss": 2.8713, + "theoretical_loss": 3.499238083372155, + "tokens_seen": 1584409600 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026262788365095284, + "loss": 2.7341, + "theoretical_loss": 3.49922545577226, + "tokens_seen": 1584475136 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002626178535606821, + "loss": 2.5213, + "theoretical_loss": 3.4992128288408835, + "tokens_seen": 1584540672 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026260782347041126, + "loss": 3.0435, + "theoretical_loss": 3.4992002025779634, + "tokens_seen": 1584606208 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026259779338014044, + "loss": 2.5818, + "theoretical_loss": 3.499187576983436, + "tokens_seen": 1584671744 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002625877632898696, + "loss": 2.8307, + "theoretical_loss": 3.4991749520572384, + "tokens_seen": 1584737280 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002625777331995988, + "loss": 2.7998, + "theoretical_loss": 3.499162327799308, + "tokens_seen": 1584802816 + }, + { + "epoch": 5.03, + "learning_rate": 0.000262567703109328, + "loss": 2.754, + "theoretical_loss": 3.4991497042095814, + "tokens_seen": 1584868352 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002625576730190572, + "loss": 2.781, + "theoretical_loss": 3.499137081287996, + "tokens_seen": 1584933888 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026254764292878635, + "loss": 2.876, + "theoretical_loss": 3.4991244590344888, + "tokens_seen": 1584999424 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002625376128385156, + "loss": 2.8324, + "theoretical_loss": 3.4991118374489965, + "tokens_seen": 1585064960 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002625275827482447, + "loss": 2.6642, + "theoretical_loss": 3.4990992165314556, + "tokens_seen": 1585130496 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026251755265797394, + "loss": 2.7775, + "theoretical_loss": 3.499086596281804, + "tokens_seen": 1585196032 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002625075225677031, + "loss": 2.9426, + "theoretical_loss": 3.4990739766999788, + "tokens_seen": 1585261568 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002624974924774323, + "loss": 2.7238, + "theoretical_loss": 3.4990613577859166, + "tokens_seen": 1585327104 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002624874623871615, + "loss": 2.7153, + "theoretical_loss": 3.499048739539554, + "tokens_seen": 1585392640 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026247743229689067, + "loss": 2.6911, + "theoretical_loss": 3.4990361219608292, + "tokens_seen": 1585458176 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026246740220661985, + "loss": 2.6232, + "theoretical_loss": 3.4990235050496787, + "tokens_seen": 1585523712 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002624573721163491, + "loss": 2.8488, + "theoretical_loss": 3.49901088880604, + "tokens_seen": 1585589248 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002624473420260782, + "loss": 2.6884, + "theoretical_loss": 3.4989982732298497, + "tokens_seen": 1585654784 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026243731193580745, + "loss": 2.7215, + "theoretical_loss": 3.4989856583210446, + "tokens_seen": 1585720320 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026242728184553663, + "loss": 2.5312, + "theoretical_loss": 3.4989730440795626, + "tokens_seen": 1585785856 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002624172517552658, + "loss": 2.8339, + "theoretical_loss": 3.49896043050534, + "tokens_seen": 1585851392 + }, + { + "epoch": 5.03, + "learning_rate": 0.000262407221664995, + "loss": 2.8494, + "theoretical_loss": 3.498947817598315, + "tokens_seen": 1585916928 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1782479, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7037224769592285, + "objective/train/theoretical_loss": 3.4989383583558564, + "objective/train/tokens_used": 1606426080, + "theoretical_loss": 3.4989383583558564, + "tokens_seen": 1585966080 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026239719157472417, + "loss": 2.5762, + "theoretical_loss": 3.498935205358424, + "tokens_seen": 1585982464 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026238716148445335, + "loss": 2.5875, + "theoretical_loss": 3.4989225937856046, + "tokens_seen": 1586048000 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002623771313941826, + "loss": 2.8368, + "theoretical_loss": 3.498909982879793, + "tokens_seen": 1586113536 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002623671013039117, + "loss": 2.8362, + "theoretical_loss": 3.4988973726409274, + "tokens_seen": 1586179072 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026235707121364095, + "loss": 2.8293, + "theoretical_loss": 3.4988847630689444, + "tokens_seen": 1586244608 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002623470411233701, + "loss": 2.7497, + "theoretical_loss": 3.4988721541637817, + "tokens_seen": 1586310144 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002623370110330993, + "loss": 2.6262, + "theoretical_loss": 3.498859545925376, + "tokens_seen": 1586375680 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002623269809428285, + "loss": 2.7631, + "theoretical_loss": 3.4988469383536644, + "tokens_seen": 1586441216 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002623169508525577, + "loss": 2.7859, + "theoretical_loss": 3.498834331448585, + "tokens_seen": 1586506752 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026230692076228686, + "loss": 2.6273, + "theoretical_loss": 3.4988217252100737, + "tokens_seen": 1586572288 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002622968906720161, + "loss": 2.9191, + "theoretical_loss": 3.498809119638069, + "tokens_seen": 1586637824 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002622868605817452, + "loss": 2.5803, + "theoretical_loss": 3.4987965147325073, + "tokens_seen": 1586703360 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026227683049147445, + "loss": 2.6403, + "theoretical_loss": 3.4987839104933265, + "tokens_seen": 1586768896 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002622668004012036, + "loss": 2.7257, + "theoretical_loss": 3.498771306920463, + "tokens_seen": 1586834432 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002622567703109328, + "loss": 2.9422, + "theoretical_loss": 3.4987587040138544, + "tokens_seen": 1586899968 + }, + { + "epoch": 5.03, + "learning_rate": 0.000262246740220662, + "loss": 2.6503, + "theoretical_loss": 3.4987461017734383, + "tokens_seen": 1586965504 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002622367101303912, + "loss": 2.7249, + "theoretical_loss": 3.4987335001991515, + "tokens_seen": 1587031040 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026222668004012036, + "loss": 2.7806, + "theoretical_loss": 3.4987208992909316, + "tokens_seen": 1587096576 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026221664994984954, + "loss": 2.6487, + "theoretical_loss": 3.498708299048716, + "tokens_seen": 1587162112 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002622066198595787, + "loss": 2.6574, + "theoretical_loss": 3.4986956994724414, + "tokens_seen": 1587227648 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026219658976930796, + "loss": 2.7044, + "theoretical_loss": 3.498683100562046, + "tokens_seen": 1587293184 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002621865596790371, + "loss": 2.8528, + "theoretical_loss": 3.4986705023174665, + "tokens_seen": 1587358720 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002621765295887663, + "loss": 2.8764, + "theoretical_loss": 3.4986579047386406, + "tokens_seen": 1587424256 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026216649949849545, + "loss": 2.6997, + "theoretical_loss": 3.4986453078255053, + "tokens_seen": 1587489792 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002621564694082247, + "loss": 2.639, + "theoretical_loss": 3.498632711577998, + "tokens_seen": 1587555328 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1787756, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.56807279586792, + "objective/train/theoretical_loss": 3.498623264829148, + "objective/train/tokens_used": 1608064480, + "theoretical_loss": 3.498623264829148, + "tokens_seen": 1587604480 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026214643931795386, + "loss": 2.8254, + "theoretical_loss": 3.498620115996056, + "tokens_seen": 1587620864 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026213640922768304, + "loss": 2.7024, + "theoretical_loss": 3.4986075210796166, + "tokens_seen": 1587686400 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002621263791374122, + "loss": 2.7606, + "theoretical_loss": 3.498594926828618, + "tokens_seen": 1587751936 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026211634904714146, + "loss": 2.6963, + "theoretical_loss": 3.498582333242996, + "tokens_seen": 1587817472 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002621063189568706, + "loss": 2.6815, + "theoretical_loss": 3.498569740322689, + "tokens_seen": 1587883008 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002620962888665998, + "loss": 2.7735, + "theoretical_loss": 3.4985571480676354, + "tokens_seen": 1587948544 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026208625877632895, + "loss": 2.8927, + "theoretical_loss": 3.4985445564777704, + "tokens_seen": 1588014080 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002620762286860582, + "loss": 2.7365, + "theoretical_loss": 3.498531965553033, + "tokens_seen": 1588079616 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026206619859578737, + "loss": 2.8605, + "theoretical_loss": 3.49851937529336, + "tokens_seen": 1588145152 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026205616850551655, + "loss": 2.768, + "theoretical_loss": 3.498506785698689, + "tokens_seen": 1588210688 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026204613841524573, + "loss": 2.7571, + "theoretical_loss": 3.4984941967689576, + "tokens_seen": 1588276224 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002620361083249749, + "loss": 2.9175, + "theoretical_loss": 3.498481608504103, + "tokens_seen": 1588341760 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002620260782347041, + "loss": 2.8245, + "theoretical_loss": 3.4984690209040625, + "tokens_seen": 1588407296 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002620160481444333, + "loss": 2.7586, + "theoretical_loss": 3.498456433968774, + "tokens_seen": 1588472832 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002620060180541625, + "loss": 2.6923, + "theoretical_loss": 3.498443847698174, + "tokens_seen": 1588538368 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002619959879638917, + "loss": 2.4443, + "theoretical_loss": 3.4984312620922013, + "tokens_seen": 1588603904 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026198595787362087, + "loss": 2.8155, + "theoretical_loss": 3.498418677150793, + "tokens_seen": 1588669440 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026197592778335005, + "loss": 2.9109, + "theoretical_loss": 3.4984060928738856, + "tokens_seen": 1588734976 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002619658976930793, + "loss": 2.6826, + "theoretical_loss": 3.498393509261418, + "tokens_seen": 1588800512 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002619558676028084, + "loss": 2.8529, + "theoretical_loss": 3.4983809263133274, + "tokens_seen": 1588866048 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026194583751253765, + "loss": 2.6624, + "theoretical_loss": 3.4983683440295503, + "tokens_seen": 1588931584 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026193580742226683, + "loss": 2.7856, + "theoretical_loss": 3.4983557624100254, + "tokens_seen": 1588997120 + }, + { + "epoch": 5.03, + "learning_rate": 0.000261925777331996, + "loss": 2.7946, + "theoretical_loss": 3.4983431814546897, + "tokens_seen": 1589062656 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002619157472417252, + "loss": 2.487, + "theoretical_loss": 3.498330601163481, + "tokens_seen": 1589128192 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026190571715145437, + "loss": 2.6616, + "theoretical_loss": 3.4983180215363365, + "tokens_seen": 1589193728 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1792626, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8070321083068848, + "objective/train/theoretical_loss": 3.4983085872517328, + "objective/train/tokens_used": 1609702880, + "theoretical_loss": 3.4983085872517328, + "tokens_seen": 1589242880 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026189568706118355, + "loss": 2.7286, + "theoretical_loss": 3.498305442573194, + "tokens_seen": 1589259264 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002618856569709128, + "loss": 2.8236, + "theoretical_loss": 3.498292864273991, + "tokens_seen": 1589324800 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002618756268806419, + "loss": 2.7265, + "theoretical_loss": 3.498280286638665, + "tokens_seen": 1589390336 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026186559679037115, + "loss": 2.6342, + "theoretical_loss": 3.498267709667154, + "tokens_seen": 1589455872 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002618555667001003, + "loss": 2.5883, + "theoretical_loss": 3.498255133359395, + "tokens_seen": 1589521408 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002618455366098295, + "loss": 2.8205, + "theoretical_loss": 3.4982425577153267, + "tokens_seen": 1589586944 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002618355065195587, + "loss": 2.4463, + "theoretical_loss": 3.498229982734885, + "tokens_seen": 1589652480 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002618254764292879, + "loss": 2.73, + "theoretical_loss": 3.498217408418009, + "tokens_seen": 1589718016 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026181544633901706, + "loss": 2.6927, + "theoretical_loss": 3.4982048347646355, + "tokens_seen": 1589783552 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002618054162487463, + "loss": 2.8462, + "theoretical_loss": 3.4981922617747023, + "tokens_seen": 1589849088 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002617953861584754, + "loss": 2.648, + "theoretical_loss": 3.4981796894481474, + "tokens_seen": 1589914624 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026178535606820465, + "loss": 2.6182, + "theoretical_loss": 3.498167117784908, + "tokens_seen": 1589980160 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002617753259779338, + "loss": 2.7333, + "theoretical_loss": 3.498154546784922, + "tokens_seen": 1590045696 + }, + { + "epoch": 5.03, + "learning_rate": 0.000261765295887663, + "loss": 2.7277, + "theoretical_loss": 3.498141976448127, + "tokens_seen": 1590111232 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002617552657973922, + "loss": 2.7771, + "theoretical_loss": 3.4981294067744617, + "tokens_seen": 1590176768 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002617452357071214, + "loss": 2.7176, + "theoretical_loss": 3.498116837763862, + "tokens_seen": 1590242304 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026173520561685056, + "loss": 2.5995, + "theoretical_loss": 3.4981042694162663, + "tokens_seen": 1590307840 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026172517552657974, + "loss": 2.8918, + "theoretical_loss": 3.498091701731613, + "tokens_seen": 1590373376 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002617151454363089, + "loss": 2.9067, + "theoretical_loss": 3.4980791347098386, + "tokens_seen": 1590438912 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026170511534603816, + "loss": 2.637, + "theoretical_loss": 3.498066568350882, + "tokens_seen": 1590504448 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002616950852557673, + "loss": 2.973, + "theoretical_loss": 3.49805400265468, + "tokens_seen": 1590569984 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002616850551654965, + "loss": 2.7506, + "theoretical_loss": 3.498041437621171, + "tokens_seen": 1590635520 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026167502507522565, + "loss": 2.6337, + "theoretical_loss": 3.498028873250292, + "tokens_seen": 1590701056 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002616649949849549, + "loss": 2.7196, + "theoretical_loss": 3.498016309541982, + "tokens_seen": 1590766592 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026165496489468406, + "loss": 2.7316, + "theoretical_loss": 3.498003746496178, + "tokens_seen": 1590832128 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1797804, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7346248626708984, + "objective/train/theoretical_loss": 3.497994324646557, + "objective/train/tokens_used": 1611341280, + "theoretical_loss": 3.497994324646557, + "tokens_seen": 1590881280 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026164493480441324, + "loss": 2.7022, + "theoretical_loss": 3.4979911841128173, + "tokens_seen": 1590897664 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002616349047141424, + "loss": 2.4981, + "theoretical_loss": 3.4979786223918388, + "tokens_seen": 1590963200 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026162487462387166, + "loss": 2.7815, + "theoretical_loss": 3.4979660613331793, + "tokens_seen": 1591028736 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002616148445336008, + "loss": 2.7909, + "theoretical_loss": 3.4979535009367773, + "tokens_seen": 1591094272 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026160481444333, + "loss": 2.8896, + "theoretical_loss": 3.4979409412025695, + "tokens_seen": 1591159808 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026159478435305915, + "loss": 2.8332, + "theoretical_loss": 3.497928382130495, + "tokens_seen": 1591225344 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002615847542627884, + "loss": 2.6551, + "theoretical_loss": 3.4979158237204913, + "tokens_seen": 1591290880 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026157472417251757, + "loss": 2.8388, + "theoretical_loss": 3.4979032659724956, + "tokens_seen": 1591356416 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026156469408224675, + "loss": 2.8684, + "theoretical_loss": 3.4978907088864464, + "tokens_seen": 1591421952 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026155466399197593, + "loss": 2.8727, + "theoretical_loss": 3.4978781524622815, + "tokens_seen": 1591487488 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002615446339017051, + "loss": 2.7349, + "theoretical_loss": 3.4978655966999384, + "tokens_seen": 1591553024 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002615346038114343, + "loss": 2.8585, + "theoretical_loss": 3.497853041599355, + "tokens_seen": 1591618560 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002615245737211635, + "loss": 2.9453, + "theoretical_loss": 3.4978404871604702, + "tokens_seen": 1591684096 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026151454363089265, + "loss": 2.53, + "theoretical_loss": 3.4978279333832205, + "tokens_seen": 1591749632 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002615045135406219, + "loss": 2.663, + "theoretical_loss": 3.497815380267544, + "tokens_seen": 1591815168 + }, + { + "epoch": 5.03, + "learning_rate": 0.000261494483450351, + "loss": 2.6538, + "theoretical_loss": 3.4978028278133797, + "tokens_seen": 1591880704 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026148445336008025, + "loss": 2.9559, + "theoretical_loss": 3.497790276020664, + "tokens_seen": 1591946240 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026147442326980943, + "loss": 2.7134, + "theoretical_loss": 3.4977777248893354, + "tokens_seen": 1592011776 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002614643931795386, + "loss": 2.4606, + "theoretical_loss": 3.4977651744193325, + "tokens_seen": 1592077312 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002614543630892678, + "loss": 2.692, + "theoretical_loss": 3.4977526246105928, + "tokens_seen": 1592142848 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026144433299899703, + "loss": 2.8035, + "theoretical_loss": 3.497740075463054, + "tokens_seen": 1592208384 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026143430290872616, + "loss": 2.7924, + "theoretical_loss": 3.4977275269766537, + "tokens_seen": 1592273920 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002614242728184554, + "loss": 2.8615, + "theoretical_loss": 3.497714979151331, + "tokens_seen": 1592339456 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002614142427281845, + "loss": 2.8544, + "theoretical_loss": 3.4977024319870234, + "tokens_seen": 1592404992 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026140421263791375, + "loss": 2.6875, + "theoretical_loss": 3.497689885483668, + "tokens_seen": 1592470528 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1800873, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.813413143157959, + "objective/train/theoretical_loss": 3.4976804760398648, + "objective/train/tokens_used": 1612979680, + "theoretical_loss": 3.4976804760398648, + "tokens_seen": 1592519680 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026139418254764293, + "loss": 2.5509, + "theoretical_loss": 3.497677339641204, + "tokens_seen": 1592536064 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002613841524573721, + "loss": 2.6167, + "theoretical_loss": 3.4976647944595687, + "tokens_seen": 1592601600 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002613741223671013, + "loss": 2.8014, + "theoretical_loss": 3.4976522499387004, + "tokens_seen": 1592667136 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002613640922768305, + "loss": 2.6458, + "theoretical_loss": 3.4976397060785365, + "tokens_seen": 1592732672 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026135406218655966, + "loss": 2.7171, + "theoretical_loss": 3.497627162879016, + "tokens_seen": 1592798208 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002613440320962889, + "loss": 2.8442, + "theoretical_loss": 3.4976146203400766, + "tokens_seen": 1592863744 + }, + { + "epoch": 5.03, + "learning_rate": 0.000261334002006018, + "loss": 2.9746, + "theoretical_loss": 3.4976020784616555, + "tokens_seen": 1592929280 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026132397191574726, + "loss": 2.8905, + "theoretical_loss": 3.497589537243692, + "tokens_seen": 1592994816 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002613139418254764, + "loss": 2.7806, + "theoretical_loss": 3.4975769966861234, + "tokens_seen": 1593060352 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002613039117352056, + "loss": 2.7803, + "theoretical_loss": 3.497564456788888, + "tokens_seen": 1593125888 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002612938816449348, + "loss": 2.7665, + "theoretical_loss": 3.497551917551924, + "tokens_seen": 1593191424 + }, + { + "epoch": 5.03, + "learning_rate": 0.000261283851554664, + "loss": 2.6711, + "theoretical_loss": 3.4975393789751688, + "tokens_seen": 1593256960 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026127382146439316, + "loss": 2.6177, + "theoretical_loss": 3.497526841058561, + "tokens_seen": 1593322496 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002612637913741224, + "loss": 2.5415, + "theoretical_loss": 3.4975143038020384, + "tokens_seen": 1593388032 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002612537612838516, + "loss": 2.8254, + "theoretical_loss": 3.4975017672055397, + "tokens_seen": 1593453568 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026124373119358076, + "loss": 2.6499, + "theoretical_loss": 3.497489231269003, + "tokens_seen": 1593519104 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026123370110330994, + "loss": 2.8149, + "theoretical_loss": 3.4974766959923658, + "tokens_seen": 1593584640 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002612236710130391, + "loss": 2.6921, + "theoretical_loss": 3.497464161375566, + "tokens_seen": 1593650176 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026121364092276836, + "loss": 2.878, + "theoretical_loss": 3.4974516274185428, + "tokens_seen": 1593715712 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002612036108324975, + "loss": 2.6749, + "theoretical_loss": 3.497439094121234, + "tokens_seen": 1593781248 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002611935807422267, + "loss": 2.6689, + "theoretical_loss": 3.497426561483577, + "tokens_seen": 1593846784 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026118355065195585, + "loss": 2.6218, + "theoretical_loss": 3.497414029505511, + "tokens_seen": 1593912320 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002611735205616851, + "loss": 2.685, + "theoretical_loss": 3.4974014981869734, + "tokens_seen": 1593977856 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026116349047141426, + "loss": 2.7086, + "theoretical_loss": 3.497388967527902, + "tokens_seen": 1594043392 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026115346038114344, + "loss": 2.5245, + "theoretical_loss": 3.4973764375282363, + "tokens_seen": 1594108928 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1801349, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.83113694190979, + "objective/train/theoretical_loss": 3.4973670404611843, + "objective/train/tokens_used": 1614618080, + "theoretical_loss": 3.4973670404611843, + "tokens_seen": 1594158080 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002611434302908726, + "loss": 2.5249, + "theoretical_loss": 3.4973639081879138, + "tokens_seen": 1594174464 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026113340020060186, + "loss": 2.8712, + "theoretical_loss": 3.4973513795068727, + "tokens_seen": 1594240000 + }, + { + "epoch": 5.03, + "learning_rate": 0.000261123370110331, + "loss": 2.8316, + "theoretical_loss": 3.4973388514850514, + "tokens_seen": 1594305536 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002611133400200602, + "loss": 2.7948, + "theoretical_loss": 3.4973263241223873, + "tokens_seen": 1594371072 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026110330992978935, + "loss": 2.8875, + "theoretical_loss": 3.49731379741882, + "tokens_seen": 1594436608 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002610932798395186, + "loss": 2.7071, + "theoretical_loss": 3.4973012713742864, + "tokens_seen": 1594502144 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026108324974924777, + "loss": 2.6609, + "theoretical_loss": 3.4972887459887256, + "tokens_seen": 1594567680 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026107321965897695, + "loss": 2.9401, + "theoretical_loss": 3.4972762212620756, + "tokens_seen": 1594633216 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026106318956870613, + "loss": 2.6306, + "theoretical_loss": 3.4972636971942745, + "tokens_seen": 1594698752 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002610531594784353, + "loss": 2.9314, + "theoretical_loss": 3.497251173785261, + "tokens_seen": 1594764288 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002610431293881645, + "loss": 2.7628, + "theoretical_loss": 3.497238651034973, + "tokens_seen": 1594829824 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002610330992978937, + "loss": 2.6542, + "theoretical_loss": 3.497226128943349, + "tokens_seen": 1594895360 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026102306920762285, + "loss": 2.7527, + "theoretical_loss": 3.497213607510327, + "tokens_seen": 1594960896 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002610130391173521, + "loss": 2.9548, + "theoretical_loss": 3.4972010867358456, + "tokens_seen": 1595026432 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002610030090270812, + "loss": 2.7296, + "theoretical_loss": 3.4971885666198426, + "tokens_seen": 1595091968 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026099297893681045, + "loss": 2.8855, + "theoretical_loss": 3.497176047162257, + "tokens_seen": 1595157504 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026098294884653963, + "loss": 2.5206, + "theoretical_loss": 3.4971635283630267, + "tokens_seen": 1595223040 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002609729187562688, + "loss": 2.8073, + "theoretical_loss": 3.4971510102220904, + "tokens_seen": 1595288576 + }, + { + "epoch": 5.03, + "learning_rate": 0.000260962888665998, + "loss": 2.5685, + "theoretical_loss": 3.497138492739386, + "tokens_seen": 1595354112 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026095285857572723, + "loss": 2.7139, + "theoretical_loss": 3.4971259759148525, + "tokens_seen": 1595419648 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026094282848545636, + "loss": 2.6919, + "theoretical_loss": 3.4971134597484275, + "tokens_seen": 1595485184 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002609327983951856, + "loss": 2.7325, + "theoretical_loss": 3.4971009442400494, + "tokens_seen": 1595550720 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002609227683049147, + "loss": 2.6534, + "theoretical_loss": 3.497088429389657, + "tokens_seen": 1595616256 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026091273821464395, + "loss": 2.6995, + "theoretical_loss": 3.497075915197189, + "tokens_seen": 1595681792 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026090270812437313, + "loss": 2.8369, + "theoretical_loss": 3.497063401662583, + "tokens_seen": 1595747328 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1802636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.037900686264038, + "objective/train/theoretical_loss": 3.4970540169433133, + "objective/train/tokens_used": 1616256480, + "theoretical_loss": 3.4970540169433133, + "tokens_seen": 1595796480 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002608926780341023, + "loss": 2.5637, + "theoretical_loss": 3.4970508887857776, + "tokens_seen": 1595812864 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002608826479438315, + "loss": 2.7449, + "theoretical_loss": 3.4970383765667115, + "tokens_seen": 1595878400 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002608726178535607, + "loss": 2.7603, + "theoretical_loss": 3.497025865005323, + "tokens_seen": 1595943936 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026086258776328986, + "loss": 2.7163, + "theoretical_loss": 3.4970133541015507, + "tokens_seen": 1596009472 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002608525576730191, + "loss": 2.7163, + "theoretical_loss": 3.497000843855332, + "tokens_seen": 1596075008 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002608425275827482, + "loss": 2.876, + "theoretical_loss": 3.4969883342666073, + "tokens_seen": 1596140544 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026083249749247746, + "loss": 2.9052, + "theoretical_loss": 3.496975825335313, + "tokens_seen": 1596206080 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002608224674022066, + "loss": 2.751, + "theoretical_loss": 3.496963317061389, + "tokens_seen": 1596271616 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002608124373119358, + "loss": 2.7262, + "theoretical_loss": 3.496950809444773, + "tokens_seen": 1596337152 + }, + { + "epoch": 5.03, + "learning_rate": 0.000260802407221665, + "loss": 2.8612, + "theoretical_loss": 3.496938302485404, + "tokens_seen": 1596402688 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002607923771313942, + "loss": 2.6211, + "theoretical_loss": 3.49692579618322, + "tokens_seen": 1596468224 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026078234704112336, + "loss": 2.7479, + "theoretical_loss": 3.49691329053816, + "tokens_seen": 1596533760 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002607723169508526, + "loss": 2.7243, + "theoretical_loss": 3.496900785550162, + "tokens_seen": 1596599296 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002607622868605817, + "loss": 2.7206, + "theoretical_loss": 3.4968882812191646, + "tokens_seen": 1596664832 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026075225677031096, + "loss": 2.637, + "theoretical_loss": 3.4968757775451063, + "tokens_seen": 1596730368 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002607422266800401, + "loss": 2.6786, + "theoretical_loss": 3.496863274527926, + "tokens_seen": 1596795904 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002607321965897693, + "loss": 2.7166, + "theoretical_loss": 3.4968507721675617, + "tokens_seen": 1596861440 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002607221664994985, + "loss": 2.801, + "theoretical_loss": 3.4968382704639525, + "tokens_seen": 1596926976 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002607121364092277, + "loss": 2.6867, + "theoretical_loss": 3.4968257694170366, + "tokens_seen": 1596992512 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026070210631895687, + "loss": 2.7219, + "theoretical_loss": 3.4968132690267524, + "tokens_seen": 1597058048 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026069207622868605, + "loss": 2.7463, + "theoretical_loss": 3.4968007692930394, + "tokens_seen": 1597123584 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026068204613841523, + "loss": 2.6852, + "theoretical_loss": 3.496788270215835, + "tokens_seen": 1597189120 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026067201604814446, + "loss": 2.6713, + "theoretical_loss": 3.496775771795078, + "tokens_seen": 1597254656 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002606619859578736, + "loss": 2.618, + "theoretical_loss": 3.496763274030708, + "tokens_seen": 1597320192 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002606519558676028, + "loss": 2.7221, + "theoretical_loss": 3.496750776922662, + "tokens_seen": 1597385728 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1803304, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7198164463043213, + "objective/train/theoretical_loss": 3.4967414045223038, + "objective/train/tokens_used": 1617894880, + "theoretical_loss": 3.4967414045223038, + "tokens_seen": 1597434880 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026064192577733195, + "loss": 2.8111, + "theoretical_loss": 3.49673828047088, + "tokens_seen": 1597451264 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002606318956870612, + "loss": 2.7815, + "theoretical_loss": 3.4967257846752995, + "tokens_seen": 1597516800 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026062186559679037, + "loss": 2.6373, + "theoretical_loss": 3.4967132895358604, + "tokens_seen": 1597582336 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026061183550651955, + "loss": 2.8427, + "theoretical_loss": 3.4967007950525, + "tokens_seen": 1597647872 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026060180541624873, + "loss": 2.7914, + "theoretical_loss": 3.4966883012251584, + "tokens_seen": 1597713408 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026059177532597797, + "loss": 2.6316, + "theoretical_loss": 3.496675808053773, + "tokens_seen": 1597778944 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002605817452357071, + "loss": 2.8277, + "theoretical_loss": 3.4966633155382825, + "tokens_seen": 1597844480 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026057171514543633, + "loss": 2.7669, + "theoretical_loss": 3.4966508236786265, + "tokens_seen": 1597910016 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026056168505516546, + "loss": 2.4671, + "theoretical_loss": 3.4966383324747428, + "tokens_seen": 1597975552 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002605516549648947, + "loss": 2.7547, + "theoretical_loss": 3.49662584192657, + "tokens_seen": 1598041088 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026054162487462387, + "loss": 2.4883, + "theoretical_loss": 3.496613352034048, + "tokens_seen": 1598106624 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026053159478435305, + "loss": 2.676, + "theoretical_loss": 3.496600862797114, + "tokens_seen": 1598172160 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026052156469408223, + "loss": 2.6167, + "theoretical_loss": 3.4965883742157082, + "tokens_seen": 1598237696 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002605115346038114, + "loss": 2.9133, + "theoretical_loss": 3.496575886289768, + "tokens_seen": 1598303232 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026050150451354065, + "loss": 2.7645, + "theoretical_loss": 3.4965633990192324, + "tokens_seen": 1598368768 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026049147442326983, + "loss": 2.6988, + "theoretical_loss": 3.496550912404041, + "tokens_seen": 1598434304 + }, + { + "epoch": 5.03, + "learning_rate": 0.000260481444332999, + "loss": 2.688, + "theoretical_loss": 3.4965384264441313, + "tokens_seen": 1598499840 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002604714142427282, + "loss": 2.9453, + "theoretical_loss": 3.496525941139443, + "tokens_seen": 1598565376 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026046138415245743, + "loss": 2.7754, + "theoretical_loss": 3.4965134564899145, + "tokens_seen": 1598630912 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026045135406218656, + "loss": 2.8114, + "theoretical_loss": 3.4965009724954843, + "tokens_seen": 1598696448 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002604413239719158, + "loss": 2.7888, + "theoretical_loss": 3.496488489156092, + "tokens_seen": 1598761984 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002604312938816449, + "loss": 3.0041, + "theoretical_loss": 3.496476006471675, + "tokens_seen": 1598827520 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026042126379137415, + "loss": 2.6466, + "theoretical_loss": 3.4964635244421736, + "tokens_seen": 1598893056 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026041123370110333, + "loss": 2.5899, + "theoretical_loss": 3.496451043067526, + "tokens_seen": 1598958592 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002604012036108325, + "loss": 2.84, + "theoretical_loss": 3.49643856234767, + "tokens_seen": 1599024128 + }, + { + "epoch": 5.03, + "objective/train/docs_used": 1804585, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1171164512634277, + "objective/train/theoretical_loss": 3.4964292022374495, + "objective/train/tokens_used": 1619533280, + "theoretical_loss": 3.4964292022374495, + "tokens_seen": 1599073280 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002603911735205617, + "loss": 2.8474, + "theoretical_loss": 3.496426082282546, + "tokens_seen": 1599089664 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002603811434302909, + "loss": 2.5638, + "theoretical_loss": 3.4964136028720922, + "tokens_seen": 1599155200 + }, + { + "epoch": 5.03, + "learning_rate": 0.00026037111334002006, + "loss": 2.6904, + "theoretical_loss": 3.4964011241162476, + "tokens_seen": 1599220736 + }, + { + "epoch": 5.03, + "learning_rate": 0.0002603610832497493, + "loss": 2.6742, + "theoretical_loss": 3.4963886460149505, + "tokens_seen": 1599286272 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002603510531594784, + "loss": 2.7654, + "theoretical_loss": 3.49637616856814, + "tokens_seen": 1599351808 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026034102306920766, + "loss": 2.6412, + "theoretical_loss": 3.4963636917757555, + "tokens_seen": 1599417344 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002603309929789368, + "loss": 2.8438, + "theoretical_loss": 3.4963512156377345, + "tokens_seen": 1599482880 + }, + { + "epoch": 5.04, + "learning_rate": 0.000260320962888666, + "loss": 2.7242, + "theoretical_loss": 3.4963387401540174, + "tokens_seen": 1599548416 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002603109327983952, + "loss": 2.8938, + "theoretical_loss": 3.4963262653245426, + "tokens_seen": 1599613952 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002603009027081244, + "loss": 2.7322, + "theoretical_loss": 3.496313791149248, + "tokens_seen": 1599679488 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026029087261785356, + "loss": 2.489, + "theoretical_loss": 3.4963013176280744, + "tokens_seen": 1599745024 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002602808425275828, + "loss": 2.7086, + "theoretical_loss": 3.496288844760959, + "tokens_seen": 1599810560 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002602708124373119, + "loss": 2.6735, + "theoretical_loss": 3.4962763725478414, + "tokens_seen": 1599876096 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026026078234704116, + "loss": 2.527, + "theoretical_loss": 3.4962639009886605, + "tokens_seen": 1599941632 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002602507522567703, + "loss": 2.7704, + "theoretical_loss": 3.4962514300833556, + "tokens_seen": 1600007168 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002602407221664995, + "loss": 2.7558, + "theoretical_loss": 3.4962389598318646, + "tokens_seen": 1600072704 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002602306920762287, + "loss": 2.4681, + "theoretical_loss": 3.496226490234127, + "tokens_seen": 1600138240 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002602206619859579, + "loss": 2.437, + "theoretical_loss": 3.4962140212900827, + "tokens_seen": 1600203776 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026021063189568707, + "loss": 2.6458, + "theoretical_loss": 3.4962015529996693, + "tokens_seen": 1600269312 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026020060180541625, + "loss": 2.7163, + "theoretical_loss": 3.4961890853628264, + "tokens_seen": 1600334848 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026019057171514543, + "loss": 2.6421, + "theoretical_loss": 3.4961766183794922, + "tokens_seen": 1600400384 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026018054162487466, + "loss": 2.5906, + "theoretical_loss": 3.496164152049607, + "tokens_seen": 1600465920 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002601705115346038, + "loss": 2.7296, + "theoretical_loss": 3.496151686373109, + "tokens_seen": 1600531456 + }, + { + "epoch": 5.04, + "learning_rate": 0.000260160481444333, + "loss": 2.9529, + "theoretical_loss": 3.496139221349937, + "tokens_seen": 1600596992 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026015045135406215, + "loss": 2.7484, + "theoretical_loss": 3.4961267569800305, + "tokens_seen": 1600662528 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1805307, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9074008464813232, + "objective/train/theoretical_loss": 3.4961174091312692, + "objective/train/tokens_used": 1621171680, + "theoretical_loss": 3.4961174091312692, + "tokens_seen": 1600711680 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002601404212637914, + "loss": 2.8461, + "theoretical_loss": 3.4961142932633287, + "tokens_seen": 1600728064 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026013039117352057, + "loss": 2.701, + "theoretical_loss": 3.4961018301997697, + "tokens_seen": 1600793600 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026012036108324975, + "loss": 2.7185, + "theoretical_loss": 3.4960893677892937, + "tokens_seen": 1600859136 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026011033099297893, + "loss": 2.4665, + "theoretical_loss": 3.496076906031839, + "tokens_seen": 1600924672 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026010030090270817, + "loss": 2.6753, + "theoretical_loss": 3.4960644449273444, + "tokens_seen": 1600990208 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002600902708124373, + "loss": 2.8114, + "theoretical_loss": 3.49605198447575, + "tokens_seen": 1601055744 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026008024072216653, + "loss": 2.7058, + "theoretical_loss": 3.496039524676994, + "tokens_seen": 1601121280 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026007021063189566, + "loss": 2.7671, + "theoretical_loss": 3.496027065531015, + "tokens_seen": 1601186816 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002600601805416249, + "loss": 2.688, + "theoretical_loss": 3.496014607037754, + "tokens_seen": 1601252352 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026005015045135407, + "loss": 2.7581, + "theoretical_loss": 3.496002149197148, + "tokens_seen": 1601317888 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026004012036108325, + "loss": 2.8447, + "theoretical_loss": 3.4959896920091373, + "tokens_seen": 1601383424 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026003009027081243, + "loss": 2.6624, + "theoretical_loss": 3.495977235473661, + "tokens_seen": 1601448960 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002600200601805416, + "loss": 2.6325, + "theoretical_loss": 3.4959647795906577, + "tokens_seen": 1601514496 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002600100300902708, + "loss": 2.5836, + "theoretical_loss": 3.4959523243600668, + "tokens_seen": 1601580032 + }, + { + "epoch": 5.04, + "learning_rate": 0.00026000000000000003, + "loss": 2.7603, + "theoretical_loss": 3.495939869781827, + "tokens_seen": 1601645568 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025998996990972916, + "loss": 2.6747, + "theoretical_loss": 3.495927415855878, + "tokens_seen": 1601711104 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002599799398194584, + "loss": 2.5385, + "theoretical_loss": 3.495914962582159, + "tokens_seen": 1601776640 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002599699097291876, + "loss": 2.6931, + "theoretical_loss": 3.495902509960609, + "tokens_seen": 1601842176 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025995987963891676, + "loss": 2.5442, + "theoretical_loss": 3.4958900579911667, + "tokens_seen": 1601907712 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025994984954864594, + "loss": 2.7225, + "theoretical_loss": 3.495877606673772, + "tokens_seen": 1601973248 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002599398194583751, + "loss": 2.8537, + "theoretical_loss": 3.495865156008364, + "tokens_seen": 1602038784 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002599297893681043, + "loss": 2.4171, + "theoretical_loss": 3.495852705994881, + "tokens_seen": 1602104320 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025991975927783353, + "loss": 2.6795, + "theoretical_loss": 3.4958402566332634, + "tokens_seen": 1602169856 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025990972918756266, + "loss": 2.7819, + "theoretical_loss": 3.4958278079234493, + "tokens_seen": 1602235392 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002598996990972919, + "loss": 2.6898, + "theoretical_loss": 3.4958153598653787, + "tokens_seen": 1602300928 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1806486, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2253284454345703, + "objective/train/theoretical_loss": 3.4958060242494957, + "objective/train/tokens_used": 1622810080, + "theoretical_loss": 3.4958060242494957, + "tokens_seen": 1602350080 + }, + { + "epoch": 5.04, + "learning_rate": 0.000259889669007021, + "loss": 2.7106, + "theoretical_loss": 3.495802912458991, + "tokens_seen": 1602366464 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025987963891675026, + "loss": 2.9012, + "theoretical_loss": 3.4957904657042245, + "tokens_seen": 1602432000 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025986960882647944, + "loss": 2.6426, + "theoretical_loss": 3.495778019601019, + "tokens_seen": 1602497536 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002598595787362086, + "loss": 2.8334, + "theoretical_loss": 3.495765574149314, + "tokens_seen": 1602563072 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002598495486459378, + "loss": 2.8165, + "theoretical_loss": 3.4957531293490485, + "tokens_seen": 1602628608 + }, + { + "epoch": 5.04, + "learning_rate": 0.000259839518555667, + "loss": 2.6539, + "theoretical_loss": 3.495740685200161, + "tokens_seen": 1602694144 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025982948846539616, + "loss": 2.5846, + "theoretical_loss": 3.4957282417025928, + "tokens_seen": 1602759680 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002598194583751254, + "loss": 2.8119, + "theoretical_loss": 3.495715798856281, + "tokens_seen": 1602825216 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002598094282848545, + "loss": 2.7783, + "theoretical_loss": 3.4957033566611657, + "tokens_seen": 1602890752 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025979939819458376, + "loss": 2.4276, + "theoretical_loss": 3.4956909151171867, + "tokens_seen": 1602956288 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025978936810431294, + "loss": 2.6884, + "theoretical_loss": 3.495678474224283, + "tokens_seen": 1603021824 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002597793380140421, + "loss": 2.5727, + "theoretical_loss": 3.495666033982393, + "tokens_seen": 1603087360 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002597693079237713, + "loss": 2.6541, + "theoretical_loss": 3.4956535943914573, + "tokens_seen": 1603152896 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002597592778335005, + "loss": 2.813, + "theoretical_loss": 3.4956411554514144, + "tokens_seen": 1603218432 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002597492477432297, + "loss": 2.6882, + "theoretical_loss": 3.4956287171622047, + "tokens_seen": 1603283968 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002597392176529589, + "loss": 2.6274, + "theoretical_loss": 3.495616279523766, + "tokens_seen": 1603349504 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002597291875626881, + "loss": 2.5218, + "theoretical_loss": 3.495603842536039, + "tokens_seen": 1603415040 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025971915747241727, + "loss": 2.8809, + "theoretical_loss": 3.4955914061989626, + "tokens_seen": 1603480576 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025970912738214645, + "loss": 2.6464, + "theoretical_loss": 3.495578970512476, + "tokens_seen": 1603546112 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025969909729187563, + "loss": 2.7924, + "theoretical_loss": 3.4955665354765184, + "tokens_seen": 1603611648 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025968906720160486, + "loss": 2.6967, + "theoretical_loss": 3.49555410109103, + "tokens_seen": 1603677184 + }, + { + "epoch": 5.04, + "learning_rate": 0.000259679037111334, + "loss": 2.6436, + "theoretical_loss": 3.4955416673559494, + "tokens_seen": 1603742720 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002596690070210632, + "loss": 2.6897, + "theoretical_loss": 3.495529234271216, + "tokens_seen": 1603808256 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025965897693079235, + "loss": 2.7027, + "theoretical_loss": 3.4955168018367697, + "tokens_seen": 1603873792 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002596489468405216, + "loss": 2.8437, + "theoretical_loss": 3.49550437005255, + "tokens_seen": 1603939328 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1807150, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3969147205352783, + "objective/train/theoretical_loss": 3.495495046641059, + "objective/train/tokens_used": 1624448480, + "theoretical_loss": 3.495495046641059, + "tokens_seen": 1603988480 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025963891675025077, + "loss": 2.6088, + "theoretical_loss": 3.4954919389184953, + "tokens_seen": 1604004864 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025962888665997995, + "loss": 2.5807, + "theoretical_loss": 3.4954795084345465, + "tokens_seen": 1604070400 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025961885656970913, + "loss": 2.5547, + "theoretical_loss": 3.4954670786006417, + "tokens_seen": 1604135936 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025960882647943837, + "loss": 2.7026, + "theoretical_loss": 3.495454649416721, + "tokens_seen": 1604201472 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002595987963891675, + "loss": 2.704, + "theoretical_loss": 3.4954422208827243, + "tokens_seen": 1604267008 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025958876629889673, + "loss": 2.7917, + "theoretical_loss": 3.4954297929985905, + "tokens_seen": 1604332544 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025957873620862586, + "loss": 2.8454, + "theoretical_loss": 3.4954173657642587, + "tokens_seen": 1604398080 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002595687061183551, + "loss": 2.7291, + "theoretical_loss": 3.4954049391796693, + "tokens_seen": 1604463616 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025955867602808427, + "loss": 2.582, + "theoretical_loss": 3.495392513244761, + "tokens_seen": 1604529152 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025954864593781345, + "loss": 2.6793, + "theoretical_loss": 3.4953800879594743, + "tokens_seen": 1604594688 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025953861584754263, + "loss": 2.5043, + "theoretical_loss": 3.495367663323747, + "tokens_seen": 1604660224 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002595285857572718, + "loss": 2.5954, + "theoretical_loss": 3.4953552393375205, + "tokens_seen": 1604725760 + }, + { + "epoch": 5.04, + "learning_rate": 0.000259518555667001, + "loss": 2.6704, + "theoretical_loss": 3.495342816000733, + "tokens_seen": 1604791296 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025950852557673023, + "loss": 2.5959, + "theoretical_loss": 3.4953303933133246, + "tokens_seen": 1604856832 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025949849548645936, + "loss": 2.6485, + "theoretical_loss": 3.495317971275235, + "tokens_seen": 1604922368 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002594884653961886, + "loss": 2.6488, + "theoretical_loss": 3.495305549886403, + "tokens_seen": 1604987904 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002594784353059178, + "loss": 2.664, + "theoretical_loss": 3.495293129146769, + "tokens_seen": 1605053440 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025946840521564696, + "loss": 2.5702, + "theoretical_loss": 3.4952807090562725, + "tokens_seen": 1605118976 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025945837512537614, + "loss": 2.8248, + "theoretical_loss": 3.4952682896148524, + "tokens_seen": 1605184512 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002594483450351053, + "loss": 2.5666, + "theoretical_loss": 3.495255870822449, + "tokens_seen": 1605250048 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002594383149448345, + "loss": 2.5176, + "theoretical_loss": 3.4952434526790013, + "tokens_seen": 1605315584 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025942828485456373, + "loss": 2.7384, + "theoretical_loss": 3.495231035184449, + "tokens_seen": 1605381120 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025941825476429286, + "loss": 2.4549, + "theoretical_loss": 3.4952186183387326, + "tokens_seen": 1605446656 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002594082246740221, + "loss": 2.6885, + "theoretical_loss": 3.49520620214179, + "tokens_seen": 1605512192 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002593981945837512, + "loss": 2.7188, + "theoretical_loss": 3.4951937865935623, + "tokens_seen": 1605577728 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1808698, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.928004503250122, + "objective/train/theoretical_loss": 3.495184475358074, + "objective/train/tokens_used": 1626086880, + "theoretical_loss": 3.495184475358074, + "tokens_seen": 1605626880 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025938816449348046, + "loss": 2.8332, + "theoretical_loss": 3.495181371693989, + "tokens_seen": 1605643264 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025937813440320964, + "loss": 2.6397, + "theoretical_loss": 3.4951689574430085, + "tokens_seen": 1605708800 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002593681043129388, + "loss": 2.628, + "theoretical_loss": 3.4951565438405616, + "tokens_seen": 1605774336 + }, + { + "epoch": 5.04, + "learning_rate": 0.000259358074222668, + "loss": 2.8962, + "theoretical_loss": 3.4951441308865885, + "tokens_seen": 1605839872 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002593480441323972, + "loss": 2.7003, + "theoretical_loss": 3.495131718581027, + "tokens_seen": 1605905408 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025933801404212636, + "loss": 2.779, + "theoretical_loss": 3.4951193069238182, + "tokens_seen": 1605970944 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002593279839518556, + "loss": 2.77, + "theoretical_loss": 3.495106895914901, + "tokens_seen": 1606036480 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002593179538615847, + "loss": 2.6084, + "theoretical_loss": 3.4950944855542163, + "tokens_seen": 1606102016 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025930792377131396, + "loss": 2.7552, + "theoretical_loss": 3.4950820758417023, + "tokens_seen": 1606167552 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025929789368104314, + "loss": 2.5836, + "theoretical_loss": 3.4950696667772996, + "tokens_seen": 1606233088 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002592878635907723, + "loss": 2.63, + "theoretical_loss": 3.495057258360948, + "tokens_seen": 1606298624 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002592778335005015, + "loss": 2.6871, + "theoretical_loss": 3.495044850592586, + "tokens_seen": 1606364160 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002592678034102307, + "loss": 2.5204, + "theoretical_loss": 3.4950324434721547, + "tokens_seen": 1606429696 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025925777331995987, + "loss": 2.8965, + "theoretical_loss": 3.4950200369995934, + "tokens_seen": 1606495232 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002592477432296891, + "loss": 2.6848, + "theoretical_loss": 3.4950076311748415, + "tokens_seen": 1606560768 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025923771313941823, + "loss": 2.5228, + "theoretical_loss": 3.4949952259978394, + "tokens_seen": 1606626304 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025922768304914747, + "loss": 2.6486, + "theoretical_loss": 3.4949828214685263, + "tokens_seen": 1606691840 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002592176529588766, + "loss": 2.7272, + "theoretical_loss": 3.4949704175868423, + "tokens_seen": 1606757376 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025920762286860583, + "loss": 2.7516, + "theoretical_loss": 3.494958014352727, + "tokens_seen": 1606822912 + }, + { + "epoch": 5.04, + "learning_rate": 0.000259197592778335, + "loss": 2.7038, + "theoretical_loss": 3.4949456117661204, + "tokens_seen": 1606888448 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002591875626880642, + "loss": 2.6016, + "theoretical_loss": 3.4949332098269617, + "tokens_seen": 1606953984 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025917753259779337, + "loss": 2.7287, + "theoretical_loss": 3.4949208085351913, + "tokens_seen": 1607019520 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025916750250752255, + "loss": 2.6791, + "theoretical_loss": 3.4949084078907493, + "tokens_seen": 1607085056 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025915747241725173, + "loss": 2.9492, + "theoretical_loss": 3.4948960078935745, + "tokens_seen": 1607150592 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025914744232698097, + "loss": 2.7316, + "theoretical_loss": 3.4948836085436072, + "tokens_seen": 1607216128 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1809362, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.568077564239502, + "objective/train/theoretical_loss": 3.4948743094558257, + "objective/train/tokens_used": 1627725280, + "theoretical_loss": 3.4948743094558257, + "tokens_seen": 1607265280 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002591374122367101, + "loss": 2.7754, + "theoretical_loss": 3.4948712098407873, + "tokens_seen": 1607281664 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025912738214643933, + "loss": 2.8811, + "theoretical_loss": 3.494858811785055, + "tokens_seen": 1607347200 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002591173520561685, + "loss": 2.8683, + "theoretical_loss": 3.494846414376349, + "tokens_seen": 1607412736 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002591073219658977, + "loss": 2.8006, + "theoretical_loss": 3.494834017614611, + "tokens_seen": 1607478272 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002590972918756269, + "loss": 2.6925, + "theoretical_loss": 3.494821621499779, + "tokens_seen": 1607543808 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025908726178535606, + "loss": 2.5741, + "theoretical_loss": 3.4948092260317942, + "tokens_seen": 1607609344 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025907723169508524, + "loss": 2.6539, + "theoretical_loss": 3.4947968312105955, + "tokens_seen": 1607674880 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025906720160481447, + "loss": 2.5719, + "theoretical_loss": 3.4947844370361234, + "tokens_seen": 1607740416 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002590571715145436, + "loss": 2.818, + "theoretical_loss": 3.4947720435083176, + "tokens_seen": 1607805952 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025904714142427283, + "loss": 2.7345, + "theoretical_loss": 3.494759650627118, + "tokens_seen": 1607871488 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025903711133400196, + "loss": 2.6123, + "theoretical_loss": 3.494747258392465, + "tokens_seen": 1607937024 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002590270812437312, + "loss": 2.5474, + "theoretical_loss": 3.4947348668042975, + "tokens_seen": 1608002560 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002590170511534604, + "loss": 2.691, + "theoretical_loss": 3.494722475862556, + "tokens_seen": 1608068096 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025900702106318956, + "loss": 2.5017, + "theoretical_loss": 3.4947100855671804, + "tokens_seen": 1608133632 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002589969909729188, + "loss": 2.5006, + "theoretical_loss": 3.494697695918111, + "tokens_seen": 1608199168 + }, + { + "epoch": 5.04, + "learning_rate": 0.000258986960882648, + "loss": 2.4911, + "theoretical_loss": 3.4946853069152874, + "tokens_seen": 1608264704 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025897693079237716, + "loss": 2.7563, + "theoretical_loss": 3.494672918558649, + "tokens_seen": 1608330240 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025896690070210634, + "loss": 2.8289, + "theoretical_loss": 3.494660530848137, + "tokens_seen": 1608395776 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002589568706118355, + "loss": 2.7974, + "theoretical_loss": 3.4946481437836905, + "tokens_seen": 1608461312 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002589468405215647, + "loss": 2.6105, + "theoretical_loss": 3.49463575736525, + "tokens_seen": 1608526848 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025893681043129393, + "loss": 2.6397, + "theoretical_loss": 3.4946233715927546, + "tokens_seen": 1608592384 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025892678034102306, + "loss": 2.7687, + "theoretical_loss": 3.494610986466145, + "tokens_seen": 1608657920 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002589167502507523, + "loss": 2.7845, + "theoretical_loss": 3.494598601985362, + "tokens_seen": 1608723456 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002589067201604814, + "loss": 2.6956, + "theoretical_loss": 3.4945862181503435, + "tokens_seen": 1608788992 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025889669007021066, + "loss": 2.8939, + "theoretical_loss": 3.4945738349610314, + "tokens_seen": 1608854528 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1810830, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7473294734954834, + "objective/train/theoretical_loss": 3.4945645479927556, + "objective/train/tokens_used": 1629363680, + "theoretical_loss": 3.4945645479927556, + "tokens_seen": 1608903680 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025888665997993984, + "loss": 2.6125, + "theoretical_loss": 3.494561452417365, + "tokens_seen": 1608920064 + }, + { + "epoch": 5.04, + "learning_rate": 0.000258876629889669, + "loss": 2.8013, + "theoretical_loss": 3.4945490705192843, + "tokens_seen": 1608985600 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002588665997993982, + "loss": 2.5571, + "theoretical_loss": 3.4945366892667296, + "tokens_seen": 1609051136 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002588565697091274, + "loss": 2.6408, + "theoretical_loss": 3.4945243086596407, + "tokens_seen": 1609116672 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025884653961885657, + "loss": 2.5487, + "theoretical_loss": 3.494511928697958, + "tokens_seen": 1609182208 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002588365095285858, + "loss": 2.8844, + "theoretical_loss": 3.4944995493816213, + "tokens_seen": 1609247744 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025882647943831493, + "loss": 2.5929, + "theoretical_loss": 3.4944871707105705, + "tokens_seen": 1609313280 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025881644934804416, + "loss": 2.7201, + "theoretical_loss": 3.4944747926847457, + "tokens_seen": 1609378816 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025880641925777334, + "loss": 2.591, + "theoretical_loss": 3.4944624153040875, + "tokens_seen": 1609444352 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002587963891675025, + "loss": 2.6937, + "theoretical_loss": 3.494450038568536, + "tokens_seen": 1609509888 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002587863590772317, + "loss": 2.7347, + "theoretical_loss": 3.4944376624780307, + "tokens_seen": 1609575424 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002587763289869609, + "loss": 2.5359, + "theoretical_loss": 3.494425287032512, + "tokens_seen": 1609640960 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025876629889669007, + "loss": 2.771, + "theoretical_loss": 3.4944129122319207, + "tokens_seen": 1609706496 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002587562688064193, + "loss": 2.6761, + "theoretical_loss": 3.4944005380761958, + "tokens_seen": 1609772032 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025874623871614843, + "loss": 2.6478, + "theoretical_loss": 3.4943881645652777, + "tokens_seen": 1609837568 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025873620862587767, + "loss": 2.5769, + "theoretical_loss": 3.494375791699107, + "tokens_seen": 1609903104 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002587261785356068, + "loss": 2.5977, + "theoretical_loss": 3.494363419477624, + "tokens_seen": 1609968640 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025871614844533603, + "loss": 2.528, + "theoretical_loss": 3.4943510479007687, + "tokens_seen": 1610034176 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002587061183550652, + "loss": 2.6623, + "theoretical_loss": 3.4943386769684803, + "tokens_seen": 1610099712 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002586960882647944, + "loss": 2.8623, + "theoretical_loss": 3.4943263066807004, + "tokens_seen": 1610165248 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025868605817452357, + "loss": 2.5631, + "theoretical_loss": 3.4943139370373686, + "tokens_seen": 1610230784 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025867602808425275, + "loss": 2.7814, + "theoretical_loss": 3.494301568038425, + "tokens_seen": 1610296320 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025866599799398193, + "loss": 2.2406, + "theoretical_loss": 3.4942891996838097, + "tokens_seen": 1610361856 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025865596790371117, + "loss": 2.6263, + "theoretical_loss": 3.494276831973463, + "tokens_seen": 1610427392 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002586459378134403, + "loss": 2.7764, + "theoretical_loss": 3.494264464907326, + "tokens_seen": 1610492928 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1811590, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.983952760696411, + "objective/train/theoretical_loss": 3.494255190030449, + "objective/train/tokens_used": 1631002080, + "theoretical_loss": 3.494255190030449, + "tokens_seen": 1610542080 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025863590772316953, + "loss": 2.7173, + "theoretical_loss": 3.4942520984853376, + "tokens_seen": 1610558464 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002586258776328987, + "loss": 2.7323, + "theoretical_loss": 3.494239732707438, + "tokens_seen": 1610624000 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002586158475426279, + "loss": 2.8016, + "theoretical_loss": 3.4942273675735693, + "tokens_seen": 1610689536 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002586058174523571, + "loss": 2.7218, + "theoretical_loss": 3.49421500308367, + "tokens_seen": 1610755072 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025859578736208626, + "loss": 2.6106, + "theoretical_loss": 3.4942026392376806, + "tokens_seen": 1610820608 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025858575727181544, + "loss": 2.6007, + "theoretical_loss": 3.4941902760355417, + "tokens_seen": 1610886144 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025857572718154467, + "loss": 2.7377, + "theoretical_loss": 3.494177913477194, + "tokens_seen": 1610951680 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002585656970912738, + "loss": 2.5021, + "theoretical_loss": 3.4941655515625767, + "tokens_seen": 1611017216 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025855566700100303, + "loss": 2.7546, + "theoretical_loss": 3.494153190291631, + "tokens_seen": 1611082752 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025854563691073216, + "loss": 2.6762, + "theoretical_loss": 3.494140829664297, + "tokens_seen": 1611148288 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002585356068204614, + "loss": 2.6792, + "theoretical_loss": 3.494128469680515, + "tokens_seen": 1611213824 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002585255767301906, + "loss": 2.6197, + "theoretical_loss": 3.494116110340225, + "tokens_seen": 1611279360 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025851554663991976, + "loss": 2.4191, + "theoretical_loss": 3.4941037516433675, + "tokens_seen": 1611344896 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025850551654964894, + "loss": 2.7003, + "theoretical_loss": 3.4940913935898834, + "tokens_seen": 1611410432 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002584954864593782, + "loss": 2.8108, + "theoretical_loss": 3.494079036179712, + "tokens_seen": 1611475968 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002584854563691073, + "loss": 2.676, + "theoretical_loss": 3.4940666794127946, + "tokens_seen": 1611541504 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025847542627883654, + "loss": 2.7139, + "theoretical_loss": 3.494054323289071, + "tokens_seen": 1611607040 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025846539618856566, + "loss": 2.7514, + "theoretical_loss": 3.4940419678084815, + "tokens_seen": 1611672576 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002584553660982949, + "loss": 2.6642, + "theoretical_loss": 3.4940296129709667, + "tokens_seen": 1611738112 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002584453360080241, + "loss": 2.6856, + "theoretical_loss": 3.4940172587764673, + "tokens_seen": 1611803648 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025843530591775326, + "loss": 2.4908, + "theoretical_loss": 3.4940049052249234, + "tokens_seen": 1611869184 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025842527582748244, + "loss": 2.5908, + "theoretical_loss": 3.4939925523162754, + "tokens_seen": 1611934720 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002584152457372116, + "loss": 2.4254, + "theoretical_loss": 3.4939802000504634, + "tokens_seen": 1612000256 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002584052156469408, + "loss": 2.5605, + "theoretical_loss": 3.4939678484274284, + "tokens_seen": 1612065792 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025839518555667004, + "loss": 2.5772, + "theoretical_loss": 3.49395549744711, + "tokens_seen": 1612131328 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1813183, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.669978380203247, + "objective/train/theoretical_loss": 3.4939462346336185, + "objective/train/tokens_used": 1632640480, + "theoretical_loss": 3.4939462346336185, + "tokens_seen": 1612180480 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025838515546639917, + "loss": 2.6425, + "theoretical_loss": 3.4939431471094498, + "tokens_seen": 1612196864 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002583751253761284, + "loss": 2.664, + "theoretical_loss": 3.4939307974143867, + "tokens_seen": 1612262400 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025836509528585753, + "loss": 2.7776, + "theoretical_loss": 3.493918448361863, + "tokens_seen": 1612327936 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025835506519558677, + "loss": 2.6187, + "theoretical_loss": 3.4939060999518174, + "tokens_seen": 1612393472 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025834503510531595, + "loss": 2.6924, + "theoretical_loss": 3.493893752184192, + "tokens_seen": 1612459008 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025833500501504513, + "loss": 2.572, + "theoretical_loss": 3.493881405058926, + "tokens_seen": 1612524544 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002583249749247743, + "loss": 2.5499, + "theoretical_loss": 3.49386905857596, + "tokens_seen": 1612590080 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025831494483450354, + "loss": 2.7041, + "theoretical_loss": 3.4938567127352353, + "tokens_seen": 1612655616 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025830491474423267, + "loss": 2.5322, + "theoretical_loss": 3.4938443675366915, + "tokens_seen": 1612721152 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002582948846539619, + "loss": 2.6045, + "theoretical_loss": 3.4938320229802695, + "tokens_seen": 1612786688 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025828485456369103, + "loss": 2.9335, + "theoretical_loss": 3.49381967906591, + "tokens_seen": 1612852224 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025827482447342027, + "loss": 2.556, + "theoretical_loss": 3.4938073357935533, + "tokens_seen": 1612917760 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025826479438314945, + "loss": 2.5735, + "theoretical_loss": 3.4937949931631396, + "tokens_seen": 1612983296 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025825476429287863, + "loss": 2.7174, + "theoretical_loss": 3.49378265117461, + "tokens_seen": 1613048832 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025824473420260787, + "loss": 2.7566, + "theoretical_loss": 3.493770309827905, + "tokens_seen": 1613114368 + }, + { + "epoch": 5.04, + "learning_rate": 0.000258234704112337, + "loss": 2.668, + "theoretical_loss": 3.493757969122965, + "tokens_seen": 1613179904 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025822467402206623, + "loss": 2.6568, + "theoretical_loss": 3.49374562905973, + "tokens_seen": 1613245440 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002582146439317954, + "loss": 2.5037, + "theoretical_loss": 3.4937332896381417, + "tokens_seen": 1613310976 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002582046138415246, + "loss": 2.7663, + "theoretical_loss": 3.4937209508581395, + "tokens_seen": 1613376512 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025819458375125377, + "loss": 2.7212, + "theoretical_loss": 3.493708612719665, + "tokens_seen": 1613442048 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025818455366098295, + "loss": 2.6807, + "theoretical_loss": 3.493696275222658, + "tokens_seen": 1613507584 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025817452357071213, + "loss": 2.5853, + "theoretical_loss": 3.4936839383670595, + "tokens_seen": 1613573120 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025816449348044137, + "loss": 2.5835, + "theoretical_loss": 3.4936716021528103, + "tokens_seen": 1613638656 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002581544633901705, + "loss": 2.6914, + "theoretical_loss": 3.4936592665798507, + "tokens_seen": 1613704192 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025814443329989973, + "loss": 2.6472, + "theoretical_loss": 3.493646931648121, + "tokens_seen": 1613769728 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1814061, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6768484115600586, + "objective/train/theoretical_loss": 3.493637680870095, + "objective/train/tokens_used": 1634278880, + "theoretical_loss": 3.493637680870095, + "tokens_seen": 1613818880 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002581344032096289, + "loss": 2.5848, + "theoretical_loss": 3.493634597357562, + "tokens_seen": 1613835264 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002581243731193581, + "loss": 2.6934, + "theoretical_loss": 3.493622263708115, + "tokens_seen": 1613900800 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002581143430290873, + "loss": 2.5939, + "theoretical_loss": 3.49360993069972, + "tokens_seen": 1613966336 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025810431293881646, + "loss": 2.3703, + "theoretical_loss": 3.493597598332318, + "tokens_seen": 1614031872 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025809428284854564, + "loss": 2.7793, + "theoretical_loss": 3.49358526660585, + "tokens_seen": 1614097408 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025808425275827487, + "loss": 2.7513, + "theoretical_loss": 3.4935729355202554, + "tokens_seen": 1614162944 + }, + { + "epoch": 5.04, + "learning_rate": 0.000258074222668004, + "loss": 2.7046, + "theoretical_loss": 3.4935606050754755, + "tokens_seen": 1614228480 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025806419257773323, + "loss": 2.6072, + "theoretical_loss": 3.4935482752714515, + "tokens_seen": 1614294016 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025805416248746236, + "loss": 2.6282, + "theoretical_loss": 3.4935359461081235, + "tokens_seen": 1614359552 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002580441323971916, + "loss": 2.5873, + "theoretical_loss": 3.4935236175854323, + "tokens_seen": 1614425088 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002580341023069208, + "loss": 2.7102, + "theoretical_loss": 3.493511289703319, + "tokens_seen": 1614490624 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025802407221664996, + "loss": 2.6728, + "theoretical_loss": 3.4934989624617243, + "tokens_seen": 1614556160 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025801404212637914, + "loss": 2.7576, + "theoretical_loss": 3.493486635860588, + "tokens_seen": 1614621696 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002580040120361084, + "loss": 2.7063, + "theoretical_loss": 3.493474309899852, + "tokens_seen": 1614687232 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002579939819458375, + "loss": 2.6767, + "theoretical_loss": 3.493461984579456, + "tokens_seen": 1614752768 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025798395185556674, + "loss": 2.5628, + "theoretical_loss": 3.493449659899342, + "tokens_seen": 1614818304 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025797392176529586, + "loss": 2.701, + "theoretical_loss": 3.4934373358594497, + "tokens_seen": 1614883840 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002579638916750251, + "loss": 2.6278, + "theoretical_loss": 3.4934250124597197, + "tokens_seen": 1614949376 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002579538615847543, + "loss": 2.565, + "theoretical_loss": 3.4934126897000937, + "tokens_seen": 1615014912 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025794383149448346, + "loss": 2.6736, + "theoretical_loss": 3.493400367580512, + "tokens_seen": 1615080448 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025793380140421264, + "loss": 2.7948, + "theoretical_loss": 3.4933880461009155, + "tokens_seen": 1615145984 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002579237713139418, + "loss": 2.6424, + "theoretical_loss": 3.493375725261245, + "tokens_seen": 1615211520 + }, + { + "epoch": 5.04, + "learning_rate": 0.000257913741223671, + "loss": 2.597, + "theoretical_loss": 3.4933634050614413, + "tokens_seen": 1615277056 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025790371113340024, + "loss": 2.5277, + "theoretical_loss": 3.4933510855014447, + "tokens_seen": 1615342592 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025789368104312937, + "loss": 2.7266, + "theoretical_loss": 3.493338766581197, + "tokens_seen": 1615408128 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1814677, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.616262197494507, + "objective/train/theoretical_loss": 3.4933295278108103, + "objective/train/tokens_used": 1635917280, + "theoretical_loss": 3.4933295278108103, + "tokens_seen": 1615457280 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002578836509528586, + "loss": 2.6068, + "theoretical_loss": 3.493326448300638, + "tokens_seen": 1615473664 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025787362086258773, + "loss": 2.5708, + "theoretical_loss": 3.493314130659709, + "tokens_seen": 1615539200 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025786359077231697, + "loss": 2.6691, + "theoretical_loss": 3.493301813658351, + "tokens_seen": 1615604736 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025785356068204615, + "loss": 2.6395, + "theoretical_loss": 3.493289497296505, + "tokens_seen": 1615670272 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025784353059177533, + "loss": 2.6006, + "theoretical_loss": 3.4932771815741113, + "tokens_seen": 1615735808 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002578335005015045, + "loss": 2.5032, + "theoretical_loss": 3.493264866491111, + "tokens_seen": 1615801344 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025782347041123374, + "loss": 2.67, + "theoretical_loss": 3.493252552047445, + "tokens_seen": 1615866880 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025781344032096287, + "loss": 2.7613, + "theoretical_loss": 3.493240238243054, + "tokens_seen": 1615932416 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002578034102306921, + "loss": 2.6414, + "theoretical_loss": 3.4932279250778793, + "tokens_seen": 1615997952 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025779338014042123, + "loss": 2.7801, + "theoretical_loss": 3.493215612551862, + "tokens_seen": 1616063488 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025778335005015047, + "loss": 2.6191, + "theoretical_loss": 3.4932033006649417, + "tokens_seen": 1616129024 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025777331995987965, + "loss": 2.4602, + "theoretical_loss": 3.4931909894170605, + "tokens_seen": 1616194560 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025776328986960883, + "loss": 2.7383, + "theoretical_loss": 3.4931786788081594, + "tokens_seen": 1616260096 + }, + { + "epoch": 5.04, + "learning_rate": 0.000257753259779338, + "loss": 2.7904, + "theoretical_loss": 3.493166368838178, + "tokens_seen": 1616325632 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002577432296890672, + "loss": 2.7796, + "theoretical_loss": 3.4931540595070594, + "tokens_seen": 1616391168 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002577331995987964, + "loss": 2.615, + "theoretical_loss": 3.4931417508147424, + "tokens_seen": 1616456704 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002577231695085256, + "loss": 2.3553, + "theoretical_loss": 3.4931294427611697, + "tokens_seen": 1616522240 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025771313941825474, + "loss": 2.7692, + "theoretical_loss": 3.4931171353462807, + "tokens_seen": 1616587776 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025770310932798397, + "loss": 2.5657, + "theoretical_loss": 3.4931048285700177, + "tokens_seen": 1616653312 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002576930792377131, + "loss": 2.8594, + "theoretical_loss": 3.4930925224323204, + "tokens_seen": 1616718848 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025768304914744233, + "loss": 2.7142, + "theoretical_loss": 3.493080216933131, + "tokens_seen": 1616784384 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002576730190571715, + "loss": 2.5003, + "theoretical_loss": 3.493067912072389, + "tokens_seen": 1616849920 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002576629889669007, + "loss": 2.7273, + "theoretical_loss": 3.493055607850037, + "tokens_seen": 1616915456 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002576529588766299, + "loss": 2.7189, + "theoretical_loss": 3.4930433042660156, + "tokens_seen": 1616980992 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002576429287863591, + "loss": 2.5459, + "theoretical_loss": 3.4930310013202654, + "tokens_seen": 1617046528 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1816051, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6610658168792725, + "objective/train/theoretical_loss": 3.493021774529783, + "objective/train/tokens_used": 1637555680, + "theoretical_loss": 3.493021774529783, + "tokens_seen": 1617095680 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025763289869608824, + "loss": 2.6226, + "theoretical_loss": 3.493018699012728, + "tokens_seen": 1617112064 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002576228686058175, + "loss": 2.4937, + "theoretical_loss": 3.493006397343343, + "tokens_seen": 1617177600 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002576128385155466, + "loss": 2.5907, + "theoretical_loss": 3.492994096312053, + "tokens_seen": 1617243136 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025760280842527584, + "loss": 2.4163, + "theoretical_loss": 3.4929817959187988, + "tokens_seen": 1617308672 + }, + { + "epoch": 5.04, + "learning_rate": 0.000257592778335005, + "loss": 2.4671, + "theoretical_loss": 3.4929694961635205, + "tokens_seen": 1617374208 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002575827482447342, + "loss": 2.5927, + "theoretical_loss": 3.4929571970461604, + "tokens_seen": 1617439744 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002575727181544634, + "loss": 2.5462, + "theoretical_loss": 3.492944898566659, + "tokens_seen": 1617505280 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025756268806419256, + "loss": 2.4217, + "theoretical_loss": 3.4929326007249566, + "tokens_seen": 1617570816 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025755265797392174, + "loss": 2.6032, + "theoretical_loss": 3.4929203035209957, + "tokens_seen": 1617636352 + }, + { + "epoch": 5.04, + "learning_rate": 0.000257542627883651, + "loss": 2.7347, + "theoretical_loss": 3.4929080069547167, + "tokens_seen": 1617701888 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002575325977933801, + "loss": 2.5122, + "theoretical_loss": 3.4928957110260606, + "tokens_seen": 1617767424 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025752256770310934, + "loss": 2.7135, + "theoretical_loss": 3.4928834157349686, + "tokens_seen": 1617832960 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025751253761283847, + "loss": 2.6551, + "theoretical_loss": 3.4928711210813823, + "tokens_seen": 1617898496 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002575025075225677, + "loss": 2.9374, + "theoretical_loss": 3.4928588270652416, + "tokens_seen": 1617964032 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025749247743229694, + "loss": 2.5136, + "theoretical_loss": 3.492846533686489, + "tokens_seen": 1618029568 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025748244734202606, + "loss": 2.5835, + "theoretical_loss": 3.492834240945065, + "tokens_seen": 1618095104 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002574724172517553, + "loss": 2.5203, + "theoretical_loss": 3.492821948840911, + "tokens_seen": 1618160640 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002574623871614845, + "loss": 2.6176, + "theoretical_loss": 3.4928096573739675, + "tokens_seen": 1618226176 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025745235707121366, + "loss": 2.6967, + "theoretical_loss": 3.4927973665441763, + "tokens_seen": 1618291712 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025744232698094284, + "loss": 2.3947, + "theoretical_loss": 3.4927850763514785, + "tokens_seen": 1618357248 + }, + { + "epoch": 5.04, + "learning_rate": 0.000257432296890672, + "loss": 2.8494, + "theoretical_loss": 3.4927727867958147, + "tokens_seen": 1618422784 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002574222668004012, + "loss": 2.5314, + "theoretical_loss": 3.4927604978771276, + "tokens_seen": 1618488320 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025741223671013044, + "loss": 2.4884, + "theoretical_loss": 3.492748209595357, + "tokens_seen": 1618553856 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025740220661985957, + "loss": 2.6668, + "theoretical_loss": 3.4927359219504437, + "tokens_seen": 1618619392 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002573921765295888, + "loss": 2.5161, + "theoretical_loss": 3.4927236349423305, + "tokens_seen": 1618684928 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1816712, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6617588996887207, + "objective/train/theoretical_loss": 3.492714420104109, + "objective/train/tokens_used": 1639194080, + "theoretical_loss": 3.492714420104109, + "tokens_seen": 1618734080 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025738214643931793, + "loss": 2.4688, + "theoretical_loss": 3.492711348570957, + "tokens_seen": 1618750464 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025737211634904717, + "loss": 2.6443, + "theoretical_loss": 3.492699062836266, + "tokens_seen": 1618816000 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025736208625877635, + "loss": 2.8617, + "theoretical_loss": 3.4926867777381974, + "tokens_seen": 1618881536 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025735205616850553, + "loss": 2.5455, + "theoretical_loss": 3.4926744932766933, + "tokens_seen": 1618947072 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002573420260782347, + "loss": 2.6395, + "theoretical_loss": 3.4926622094516944, + "tokens_seen": 1619012608 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025733199598796394, + "loss": 2.5923, + "theoretical_loss": 3.4926499262631427, + "tokens_seen": 1619078144 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025732196589769307, + "loss": 2.6676, + "theoretical_loss": 3.4926376437109785, + "tokens_seen": 1619143680 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002573119358074223, + "loss": 2.7692, + "theoretical_loss": 3.492625361795144, + "tokens_seen": 1619209216 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025730190571715143, + "loss": 2.8743, + "theoretical_loss": 3.4926130805155795, + "tokens_seen": 1619274752 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025729187562688067, + "loss": 2.8537, + "theoretical_loss": 3.492600799872227, + "tokens_seen": 1619340288 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025728184553660985, + "loss": 2.6658, + "theoretical_loss": 3.492588519865028, + "tokens_seen": 1619405824 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025727181544633903, + "loss": 2.6737, + "theoretical_loss": 3.4925762404939227, + "tokens_seen": 1619471360 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002572617853560682, + "loss": 2.8957, + "theoretical_loss": 3.4925639617588535, + "tokens_seen": 1619536896 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002572517552657974, + "loss": 2.6378, + "theoretical_loss": 3.4925516836597614, + "tokens_seen": 1619602432 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002572417251755266, + "loss": 2.6016, + "theoretical_loss": 3.4925394061965873, + "tokens_seen": 1619667968 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002572316950852558, + "loss": 2.5372, + "theoretical_loss": 3.4925271293692735, + "tokens_seen": 1619733504 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025722166499498494, + "loss": 2.428, + "theoretical_loss": 3.4925148531777603, + "tokens_seen": 1619799040 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025721163490471417, + "loss": 2.5885, + "theoretical_loss": 3.4925025776219893, + "tokens_seen": 1619864576 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002572016048144433, + "loss": 2.6786, + "theoretical_loss": 3.4924903027019027, + "tokens_seen": 1619930112 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025719157472417253, + "loss": 2.5506, + "theoretical_loss": 3.492478028417441, + "tokens_seen": 1619995648 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002571815446339017, + "loss": 2.4361, + "theoretical_loss": 3.4924657547685456, + "tokens_seen": 1620061184 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002571715145436309, + "loss": 2.6449, + "theoretical_loss": 3.492453481755158, + "tokens_seen": 1620126720 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002571614844533601, + "loss": 2.548, + "theoretical_loss": 3.49244120937722, + "tokens_seen": 1620192256 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002571514543630893, + "loss": 2.6244, + "theoretical_loss": 3.4924289376346724, + "tokens_seen": 1620257792 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025714142427281844, + "loss": 2.7454, + "theoretical_loss": 3.4924166665274567, + "tokens_seen": 1620323328 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1817999, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.959540843963623, + "objective/train/theoretical_loss": 3.4924074636139464, + "objective/train/tokens_used": 1640832480, + "theoretical_loss": 3.4924074636139464, + "tokens_seen": 1620372480 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002571313941825477, + "loss": 2.6683, + "theoretical_loss": 3.4924043960555147, + "tokens_seen": 1620388864 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002571213640922768, + "loss": 2.671, + "theoretical_loss": 3.4923921262187876, + "tokens_seen": 1620454400 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025711133400200604, + "loss": 2.8518, + "theoretical_loss": 3.492379857017217, + "tokens_seen": 1620519936 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002571013039117352, + "loss": 2.3591, + "theoretical_loss": 3.492367588450744, + "tokens_seen": 1620585472 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002570912738214644, + "loss": 2.4134, + "theoretical_loss": 3.49235532051931, + "tokens_seen": 1620651008 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002570812437311936, + "loss": 2.6232, + "theoretical_loss": 3.492343053222857, + "tokens_seen": 1620716544 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025707121364092276, + "loss": 2.6938, + "theoretical_loss": 3.4923307865613262, + "tokens_seen": 1620782080 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025706118355065194, + "loss": 2.4861, + "theoretical_loss": 3.492318520534658, + "tokens_seen": 1620847616 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002570511534603812, + "loss": 2.7885, + "theoretical_loss": 3.492306255142796, + "tokens_seen": 1620913152 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002570411233701103, + "loss": 2.6434, + "theoretical_loss": 3.4922939903856802, + "tokens_seen": 1620978688 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025703109327983954, + "loss": 2.5004, + "theoretical_loss": 3.4922817262632524, + "tokens_seen": 1621044224 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025702106318956867, + "loss": 2.7335, + "theoretical_loss": 3.4922694627754542, + "tokens_seen": 1621109760 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002570110330992979, + "loss": 2.734, + "theoretical_loss": 3.4922571999222267, + "tokens_seen": 1621175296 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002570010030090271, + "loss": 2.7062, + "theoretical_loss": 3.4922449377035125, + "tokens_seen": 1621240832 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025699097291875626, + "loss": 2.5759, + "theoretical_loss": 3.4922326761192517, + "tokens_seen": 1621306368 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025698094282848545, + "loss": 2.4875, + "theoretical_loss": 3.4922204151693865, + "tokens_seen": 1621371904 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002569709127382147, + "loss": 2.8393, + "theoretical_loss": 3.492208154853859, + "tokens_seen": 1621437440 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002569608826479438, + "loss": 2.6645, + "theoretical_loss": 3.4921958951726095, + "tokens_seen": 1621502976 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025695085255767304, + "loss": 2.8177, + "theoretical_loss": 3.4921836361255805, + "tokens_seen": 1621568512 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025694082246740217, + "loss": 2.7412, + "theoretical_loss": 3.4921713777127135, + "tokens_seen": 1621634048 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002569307923771314, + "loss": 2.534, + "theoretical_loss": 3.4921591199339495, + "tokens_seen": 1621699584 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002569207622868606, + "loss": 2.5425, + "theoretical_loss": 3.4921468627892307, + "tokens_seen": 1621765120 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025691073219658977, + "loss": 2.6161, + "theoretical_loss": 3.4921346062784986, + "tokens_seen": 1621830656 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025690070210631895, + "loss": 2.3787, + "theoretical_loss": 3.4921223504016945, + "tokens_seen": 1621896192 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025689067201604813, + "loss": 2.6759, + "theoretical_loss": 3.49211009515876, + "tokens_seen": 1621961728 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1818775, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1677393913269043, + "objective/train/theoretical_loss": 3.492100904142501, + "objective/train/tokens_used": 1642470880, + "theoretical_loss": 3.492100904142501, + "tokens_seen": 1622010880 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002568806419257773, + "loss": 2.469, + "theoretical_loss": 3.4920978405496363, + "tokens_seen": 1622027264 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025687061183550655, + "loss": 2.6083, + "theoretical_loss": 3.4920855865742664, + "tokens_seen": 1622092800 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002568605817452357, + "loss": 2.5079, + "theoretical_loss": 3.4920733332325904, + "tokens_seen": 1622158336 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002568505516549649, + "loss": 2.6879, + "theoretical_loss": 3.492061080524551, + "tokens_seen": 1622223872 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025684052156469404, + "loss": 2.5899, + "theoretical_loss": 3.4920488284500895, + "tokens_seen": 1622289408 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025683049147442327, + "loss": 2.4714, + "theoretical_loss": 3.4920365770091473, + "tokens_seen": 1622354944 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025682046138415245, + "loss": 2.527, + "theoretical_loss": 3.4920243262016664, + "tokens_seen": 1622420480 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025681043129388163, + "loss": 2.808, + "theoretical_loss": 3.4920120760275877, + "tokens_seen": 1622486016 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002568004012036108, + "loss": 2.5658, + "theoretical_loss": 3.491999826486854, + "tokens_seen": 1622551552 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025679037111334005, + "loss": 2.596, + "theoretical_loss": 3.4919875775794065, + "tokens_seen": 1622617088 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002567803410230692, + "loss": 2.5577, + "theoretical_loss": 3.4919753293051867, + "tokens_seen": 1622682624 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002567703109327984, + "loss": 2.8376, + "theoretical_loss": 3.491963081664136, + "tokens_seen": 1622748160 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025676028084252754, + "loss": 2.6855, + "theoretical_loss": 3.4919508346561967, + "tokens_seen": 1622813696 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002567502507522568, + "loss": 2.5951, + "theoretical_loss": 3.491938588281311, + "tokens_seen": 1622879232 + }, + { + "epoch": 5.04, + "learning_rate": 0.000256740220661986, + "loss": 2.5642, + "theoretical_loss": 3.491926342539419, + "tokens_seen": 1622944768 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025673019057171514, + "loss": 2.6147, + "theoretical_loss": 3.491914097430464, + "tokens_seen": 1623010304 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025672016048144437, + "loss": 2.5273, + "theoretical_loss": 3.4919018529543866, + "tokens_seen": 1623075840 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002567101303911735, + "loss": 2.6065, + "theoretical_loss": 3.491889609111129, + "tokens_seen": 1623141376 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025670010030090273, + "loss": 2.5336, + "theoretical_loss": 3.491877365900633, + "tokens_seen": 1623206912 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002566900702106319, + "loss": 2.6442, + "theoretical_loss": 3.4918651233228406, + "tokens_seen": 1623272448 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002566800401203611, + "loss": 2.6089, + "theoretical_loss": 3.4918528813776932, + "tokens_seen": 1623337984 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002566700100300903, + "loss": 2.3904, + "theoretical_loss": 3.4918406400651327, + "tokens_seen": 1623403520 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002566599799398195, + "loss": 2.7396, + "theoretical_loss": 3.491828399385101, + "tokens_seen": 1623469056 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025664994984954864, + "loss": 2.7331, + "theoretical_loss": 3.491816159337539, + "tokens_seen": 1623534592 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002566399197592779, + "loss": 2.762, + "theoretical_loss": 3.49180391992239, + "tokens_seen": 1623600128 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1820300, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7725954055786133, + "objective/train/theoretical_loss": 3.4917947407760135, + "objective/train/tokens_used": 1644109280, + "theoretical_loss": 3.4917947407760135, + "tokens_seen": 1623649280 + }, + { + "epoch": 5.04, + "learning_rate": 0.000256629889669007, + "loss": 2.5771, + "theoretical_loss": 3.4917916811395946, + "tokens_seen": 1623665664 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025661985957873624, + "loss": 2.6068, + "theoretical_loss": 3.491779442989095, + "tokens_seen": 1623731200 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002566098294884654, + "loss": 2.5716, + "theoretical_loss": 3.491767205470833, + "tokens_seen": 1623796736 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002565997993981946, + "loss": 2.5825, + "theoretical_loss": 3.4917549685847504, + "tokens_seen": 1623862272 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002565897693079238, + "loss": 2.7518, + "theoretical_loss": 3.4917427323307892, + "tokens_seen": 1623927808 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025657973921765296, + "loss": 2.507, + "theoretical_loss": 3.4917304967088914, + "tokens_seen": 1623993344 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025656970912738214, + "loss": 2.7368, + "theoretical_loss": 3.491718261718998, + "tokens_seen": 1624058880 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002565596790371114, + "loss": 2.7704, + "theoretical_loss": 3.491706027361052, + "tokens_seen": 1624124416 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002565496489468405, + "loss": 2.5858, + "theoretical_loss": 3.4916937936349943, + "tokens_seen": 1624189952 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025653961885656974, + "loss": 2.8094, + "theoretical_loss": 3.4916815605407674, + "tokens_seen": 1624255488 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025652958876629887, + "loss": 2.7634, + "theoretical_loss": 3.4916693280783124, + "tokens_seen": 1624321024 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002565195586760281, + "loss": 2.5955, + "theoretical_loss": 3.491657096247572, + "tokens_seen": 1624386560 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002565095285857573, + "loss": 2.5747, + "theoretical_loss": 3.491644865048488, + "tokens_seen": 1624452096 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025649949849548647, + "loss": 2.7707, + "theoretical_loss": 3.491632634481002, + "tokens_seen": 1624517632 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025648946840521565, + "loss": 2.6173, + "theoretical_loss": 3.491620404545056, + "tokens_seen": 1624583168 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002564794383149449, + "loss": 2.6951, + "theoretical_loss": 3.4916081752405916, + "tokens_seen": 1624648704 + }, + { + "epoch": 5.04, + "learning_rate": 0.000256469408224674, + "loss": 2.7188, + "theoretical_loss": 3.4915959465675517, + "tokens_seen": 1624714240 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025645937813440324, + "loss": 2.4266, + "theoretical_loss": 3.4915837185258765, + "tokens_seen": 1624779776 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025644934804413237, + "loss": 2.6004, + "theoretical_loss": 3.49157149111551, + "tokens_seen": 1624845312 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002564393179538616, + "loss": 2.5292, + "theoretical_loss": 3.491559264336393, + "tokens_seen": 1624910848 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002564292878635908, + "loss": 2.4791, + "theoretical_loss": 3.4915470381884672, + "tokens_seen": 1624976384 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025641925777331997, + "loss": 2.6726, + "theoretical_loss": 3.4915348126716754, + "tokens_seen": 1625041920 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025640922768304915, + "loss": 2.8152, + "theoretical_loss": 3.4915225877859593, + "tokens_seen": 1625107456 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025639919759277833, + "loss": 2.5362, + "theoretical_loss": 3.49151036353126, + "tokens_seen": 1625172992 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002563891675025075, + "loss": 2.5838, + "theoretical_loss": 3.4914981399075207, + "tokens_seen": 1625238528 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1820966, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0666143894195557, + "objective/train/theoretical_loss": 3.491488972603748, + "objective/train/tokens_used": 1645747680, + "theoretical_loss": 3.491488972603748, + "tokens_seen": 1625287680 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025637913741223675, + "loss": 2.6085, + "theoretical_loss": 3.491485916914683, + "tokens_seen": 1625304064 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002563691073219659, + "loss": 2.7686, + "theoretical_loss": 3.4914736945526883, + "tokens_seen": 1625369600 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002563590772316951, + "loss": 2.6789, + "theoretical_loss": 3.491461472821479, + "tokens_seen": 1625435136 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025634904714142424, + "loss": 2.4061, + "theoretical_loss": 3.491449251720998, + "tokens_seen": 1625500672 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025633901705115347, + "loss": 2.522, + "theoretical_loss": 3.491437031251186, + "tokens_seen": 1625566208 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025632898696088265, + "loss": 2.5905, + "theoretical_loss": 3.491424811411986, + "tokens_seen": 1625631744 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025631895687061183, + "loss": 2.5403, + "theoretical_loss": 3.491412592203339, + "tokens_seen": 1625697280 + }, + { + "epoch": 5.04, + "learning_rate": 0.000256308926780341, + "loss": 2.7574, + "theoretical_loss": 3.491400373625188, + "tokens_seen": 1625762816 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025629889669007025, + "loss": 2.646, + "theoretical_loss": 3.491388155677475, + "tokens_seen": 1625828352 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002562888665997994, + "loss": 2.9001, + "theoretical_loss": 3.4913759383601413, + "tokens_seen": 1625893888 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002562788365095286, + "loss": 2.7254, + "theoretical_loss": 3.4913637216731295, + "tokens_seen": 1625959424 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025626880641925774, + "loss": 2.7043, + "theoretical_loss": 3.4913515056163815, + "tokens_seen": 1626024960 + }, + { + "epoch": 5.04, + "learning_rate": 0.000256258776328987, + "loss": 2.6255, + "theoretical_loss": 3.49133929018984, + "tokens_seen": 1626090496 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025624874623871616, + "loss": 2.5757, + "theoretical_loss": 3.4913270753934462, + "tokens_seen": 1626156032 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025623871614844534, + "loss": 2.5782, + "theoretical_loss": 3.491314861227143, + "tokens_seen": 1626221568 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002562286860581745, + "loss": 2.4161, + "theoretical_loss": 3.4913026476908717, + "tokens_seen": 1626287104 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002562186559679037, + "loss": 2.7689, + "theoretical_loss": 3.491290434784575, + "tokens_seen": 1626352640 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002562086258776329, + "loss": 2.7879, + "theoretical_loss": 3.4912782225081944, + "tokens_seen": 1626418176 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002561985957873621, + "loss": 2.5714, + "theoretical_loss": 3.491266010861673, + "tokens_seen": 1626483712 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025618856569709124, + "loss": 2.4168, + "theoretical_loss": 3.4912537998449515, + "tokens_seen": 1626549248 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002561785356068205, + "loss": 2.7801, + "theoretical_loss": 3.491241589457974, + "tokens_seen": 1626614784 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025616850551654966, + "loss": 2.4912, + "theoretical_loss": 3.4912293797006813, + "tokens_seen": 1626680320 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025615847542627884, + "loss": 2.6434, + "theoretical_loss": 3.491217170573016, + "tokens_seen": 1626745856 + }, + { + "epoch": 5.04, + "learning_rate": 0.000256148445336008, + "loss": 2.7688, + "theoretical_loss": 3.49120496207492, + "tokens_seen": 1626811392 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002561384152457372, + "loss": 2.777, + "theoretical_loss": 3.4911927542063355, + "tokens_seen": 1626876928 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1822305, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.456148147583008, + "objective/train/theoretical_loss": 3.4911835987179787, + "objective/train/tokens_used": 1647386080, + "theoretical_loss": 3.4911835987179787, + "tokens_seen": 1626926080 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002561283851554664, + "loss": 2.8612, + "theoretical_loss": 3.4911805469672044, + "tokens_seen": 1626942464 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002561183550651956, + "loss": 2.6333, + "theoretical_loss": 3.49116834035747, + "tokens_seen": 1627008000 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025610832497492475, + "loss": 2.8309, + "theoretical_loss": 3.491156134377073, + "tokens_seen": 1627073536 + }, + { + "epoch": 5.04, + "learning_rate": 0.000256098294884654, + "loss": 2.5449, + "theoretical_loss": 3.4911439290259567, + "tokens_seen": 1627139072 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002560882647943831, + "loss": 2.7455, + "theoretical_loss": 3.491131724304063, + "tokens_seen": 1627204608 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025607823470411234, + "loss": 2.7514, + "theoretical_loss": 3.4911195202113348, + "tokens_seen": 1627270144 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002560682046138415, + "loss": 2.5495, + "theoretical_loss": 3.491107316747713, + "tokens_seen": 1627335680 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002560581745235707, + "loss": 2.5578, + "theoretical_loss": 3.4910951139131408, + "tokens_seen": 1627401216 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002560481444332999, + "loss": 2.5765, + "theoretical_loss": 3.4910829117075597, + "tokens_seen": 1627466752 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025603811434302907, + "loss": 2.5857, + "theoretical_loss": 3.4910707101309124, + "tokens_seen": 1627532288 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025602808425275825, + "loss": 2.7842, + "theoretical_loss": 3.491058509183141, + "tokens_seen": 1627597824 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002560180541624875, + "loss": 2.6268, + "theoretical_loss": 3.4910463088641883, + "tokens_seen": 1627663360 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002560080240722166, + "loss": 2.7106, + "theoretical_loss": 3.491034109173996, + "tokens_seen": 1627728896 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025599799398194585, + "loss": 2.7822, + "theoretical_loss": 3.491021910112507, + "tokens_seen": 1627794432 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002559879638916751, + "loss": 2.4951, + "theoretical_loss": 3.491009711679663, + "tokens_seen": 1627859968 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002559779338014042, + "loss": 2.7125, + "theoretical_loss": 3.490997513875406, + "tokens_seen": 1627925504 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025596790371113344, + "loss": 2.4878, + "theoretical_loss": 3.490985316699679, + "tokens_seen": 1627991040 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025595787362086257, + "loss": 2.5812, + "theoretical_loss": 3.4909731201524243, + "tokens_seen": 1628056576 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002559478435305918, + "loss": 2.5068, + "theoretical_loss": 3.4909609242335833, + "tokens_seen": 1628122112 + }, + { + "epoch": 5.04, + "learning_rate": 0.000255937813440321, + "loss": 2.596, + "theoretical_loss": 3.4909487289430996, + "tokens_seen": 1628187648 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025592778335005017, + "loss": 2.5647, + "theoretical_loss": 3.490936534280915, + "tokens_seen": 1628253184 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025591775325977935, + "loss": 2.7733, + "theoretical_loss": 3.4909243402469716, + "tokens_seen": 1628318720 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025590772316950853, + "loss": 2.5484, + "theoretical_loss": 3.490912146841212, + "tokens_seen": 1628384256 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002558976930792377, + "loss": 2.493, + "theoretical_loss": 3.490899954063578, + "tokens_seen": 1628449792 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025588766298896695, + "loss": 2.6368, + "theoretical_loss": 3.4908877619140135, + "tokens_seen": 1628515328 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1822948, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1905274391174316, + "objective/train/theoretical_loss": 3.4908786182139746, + "objective/train/tokens_used": 1649024480, + "theoretical_loss": 3.4908786182139746, + "tokens_seen": 1628564480 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002558776328986961, + "loss": 2.5279, + "theoretical_loss": 3.4908755703924594, + "tokens_seen": 1628580864 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002558676028084253, + "loss": 2.841, + "theoretical_loss": 3.490863379498858, + "tokens_seen": 1628646400 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025585757271815444, + "loss": 2.432, + "theoretical_loss": 3.490851189233153, + "tokens_seen": 1628711936 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025584754262788367, + "loss": 2.8448, + "theoretical_loss": 3.4908389995952858, + "tokens_seen": 1628777472 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025583751253761285, + "loss": 2.4533, + "theoretical_loss": 3.4908268105851987, + "tokens_seen": 1628843008 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025582748244734203, + "loss": 2.639, + "theoretical_loss": 3.4908146222028344, + "tokens_seen": 1628908544 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002558174523570712, + "loss": 2.7056, + "theoretical_loss": 3.4908024344481356, + "tokens_seen": 1628974080 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025580742226680045, + "loss": 2.5056, + "theoretical_loss": 3.490790247321045, + "tokens_seen": 1629039616 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002557973921765296, + "loss": 2.5828, + "theoretical_loss": 3.490778060821504, + "tokens_seen": 1629105152 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002557873620862588, + "loss": 2.7358, + "theoretical_loss": 3.4907658749494552, + "tokens_seen": 1629170688 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025577733199598794, + "loss": 2.3925, + "theoretical_loss": 3.490753689704842, + "tokens_seen": 1629236224 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002557673019057172, + "loss": 2.5133, + "theoretical_loss": 3.490741505087606, + "tokens_seen": 1629301760 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025575727181544636, + "loss": 2.7957, + "theoretical_loss": 3.4907293210976897, + "tokens_seen": 1629367296 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025574724172517554, + "loss": 2.4953, + "theoretical_loss": 3.490717137735036, + "tokens_seen": 1629432832 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002557372116349047, + "loss": 2.4671, + "theoretical_loss": 3.4907049549995874, + "tokens_seen": 1629498368 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002557271815446339, + "loss": 2.3538, + "theoretical_loss": 3.490692772891286, + "tokens_seen": 1629563904 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002557171514543631, + "loss": 2.63, + "theoretical_loss": 3.4906805914100745, + "tokens_seen": 1629629440 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002557071213640923, + "loss": 2.5732, + "theoretical_loss": 3.4906684105558954, + "tokens_seen": 1629694976 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025569709127382144, + "loss": 2.575, + "theoretical_loss": 3.4906562303286908, + "tokens_seen": 1629760512 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002556870611835507, + "loss": 2.9597, + "theoretical_loss": 3.4906440507284042, + "tokens_seen": 1629826048 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025567703109327986, + "loss": 2.5434, + "theoretical_loss": 3.490631871754977, + "tokens_seen": 1629891584 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025566700100300904, + "loss": 2.3853, + "theoretical_loss": 3.4906196934083527, + "tokens_seen": 1629957120 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002556569709127382, + "loss": 2.6846, + "theoretical_loss": 3.490607515688473, + "tokens_seen": 1630022656 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002556469408224674, + "loss": 2.4938, + "theoretical_loss": 3.490595338595281, + "tokens_seen": 1630088192 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002556369107321966, + "loss": 2.77, + "theoretical_loss": 3.4905831621287193, + "tokens_seen": 1630153728 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1824579, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.721491575241089, + "objective/train/theoretical_loss": 3.4905740301899892, + "objective/train/tokens_used": 1650662880, + "theoretical_loss": 3.4905740301899892, + "tokens_seen": 1630202880 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002556268806419258, + "loss": 2.7009, + "theoretical_loss": 3.4905709862887297, + "tokens_seen": 1630219264 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025561685055165495, + "loss": 2.9586, + "theoretical_loss": 3.490558811075256, + "tokens_seen": 1630284800 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002556068204613842, + "loss": 2.7171, + "theoretical_loss": 3.49054663648824, + "tokens_seen": 1630350336 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002555967903711133, + "loss": 2.6523, + "theoretical_loss": 3.4905344625276244, + "tokens_seen": 1630415872 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025558676028084254, + "loss": 2.5221, + "theoretical_loss": 3.4905222891933514, + "tokens_seen": 1630481408 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002555767301905717, + "loss": 2.6267, + "theoretical_loss": 3.4905101164853645, + "tokens_seen": 1630546944 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002555667001003009, + "loss": 2.6661, + "theoretical_loss": 3.4904979444036055, + "tokens_seen": 1630612480 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002555566700100301, + "loss": 2.7051, + "theoretical_loss": 3.490485772948017, + "tokens_seen": 1630678016 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025554663991975927, + "loss": 2.7112, + "theoretical_loss": 3.4904736021185427, + "tokens_seen": 1630743552 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025553660982948845, + "loss": 2.7162, + "theoretical_loss": 3.490461431915124, + "tokens_seen": 1630809088 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002555265797392177, + "loss": 2.6602, + "theoretical_loss": 3.4904492623377044, + "tokens_seen": 1630874624 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002555165496489468, + "loss": 2.6757, + "theoretical_loss": 3.490437093386226, + "tokens_seen": 1630940160 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025550651955867605, + "loss": 2.5184, + "theoretical_loss": 3.4904249250606316, + "tokens_seen": 1631005696 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025549648946840523, + "loss": 2.65, + "theoretical_loss": 3.4904127573608634, + "tokens_seen": 1631071232 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002554864593781344, + "loss": 2.6598, + "theoretical_loss": 3.490400590286865, + "tokens_seen": 1631136768 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002554764292878636, + "loss": 2.4051, + "theoretical_loss": 3.4903884238385787, + "tokens_seen": 1631202304 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025546639919759277, + "loss": 2.7635, + "theoretical_loss": 3.4903762580159468, + "tokens_seen": 1631267840 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025545636910732195, + "loss": 2.6505, + "theoretical_loss": 3.4903640928189126, + "tokens_seen": 1631333376 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002554463390170512, + "loss": 2.6143, + "theoretical_loss": 3.4903519282474185, + "tokens_seen": 1631398912 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002554363089267803, + "loss": 2.8442, + "theoretical_loss": 3.490339764301407, + "tokens_seen": 1631464448 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025542627883650955, + "loss": 2.6405, + "theoretical_loss": 3.4903276009808213, + "tokens_seen": 1631529984 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002554162487462387, + "loss": 2.4992, + "theoretical_loss": 3.4903154382856036, + "tokens_seen": 1631595520 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002554062186559679, + "loss": 2.5556, + "theoretical_loss": 3.4903032762156974, + "tokens_seen": 1631661056 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002553961885656971, + "loss": 2.648, + "theoretical_loss": 3.4902911147710443, + "tokens_seen": 1631726592 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002553861584754263, + "loss": 2.6144, + "theoretical_loss": 3.490278953951588, + "tokens_seen": 1631792128 + }, + { + "epoch": 5.04, + "objective/train/docs_used": 1825363, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6945481300354004, + "objective/train/theoretical_loss": 3.490269833747246, + "objective/train/tokens_used": 1652301280, + "theoretical_loss": 3.490269833747246, + "tokens_seen": 1631841280 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025537612838515546, + "loss": 2.5007, + "theoretical_loss": 3.4902667937572707, + "tokens_seen": 1631857664 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025536609829488464, + "loss": 2.9637, + "theoretical_loss": 3.4902546341880356, + "tokens_seen": 1631923200 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002553560682046138, + "loss": 2.6689, + "theoretical_loss": 3.490242475243825, + "tokens_seen": 1631988736 + }, + { + "epoch": 5.04, + "learning_rate": 0.00025534603811434305, + "loss": 2.5679, + "theoretical_loss": 3.490230316924582, + "tokens_seen": 1632054272 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002553360080240722, + "loss": 2.5961, + "theoretical_loss": 3.490218159230249, + "tokens_seen": 1632119808 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002553259779338014, + "loss": 2.5681, + "theoretical_loss": 3.49020600216077, + "tokens_seen": 1632185344 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002553159478435306, + "loss": 2.5314, + "theoretical_loss": 3.490193845716086, + "tokens_seen": 1632250880 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002553059177532598, + "loss": 2.8687, + "theoretical_loss": 3.490181689896141, + "tokens_seen": 1632316416 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025529588766298896, + "loss": 2.5809, + "theoretical_loss": 3.4901695347008777, + "tokens_seen": 1632381952 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025528585757271814, + "loss": 2.4463, + "theoretical_loss": 3.4901573801302384, + "tokens_seen": 1632447488 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002552758274824473, + "loss": 2.5499, + "theoretical_loss": 3.4901452261841666, + "tokens_seen": 1632513024 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025526579739217656, + "loss": 2.5645, + "theoretical_loss": 3.4901330728626045, + "tokens_seen": 1632578560 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002552557673019057, + "loss": 2.341, + "theoretical_loss": 3.490120920165496, + "tokens_seen": 1632644096 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002552457372116349, + "loss": 2.5379, + "theoretical_loss": 3.490108768092782, + "tokens_seen": 1632709632 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002552357071213641, + "loss": 2.6763, + "theoretical_loss": 3.490096616644407, + "tokens_seen": 1632775168 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002552256770310933, + "loss": 2.8082, + "theoretical_loss": 3.4900844658203134, + "tokens_seen": 1632840704 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002552156469408225, + "loss": 2.6397, + "theoretical_loss": 3.4900723156204445, + "tokens_seen": 1632906240 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025520561685055164, + "loss": 2.4946, + "theoretical_loss": 3.4900601660447426, + "tokens_seen": 1632971776 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002551955867602809, + "loss": 2.6199, + "theoretical_loss": 3.4900480170931507, + "tokens_seen": 1633037312 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025518555667001006, + "loss": 2.5152, + "theoretical_loss": 3.4900358687656112, + "tokens_seen": 1633102848 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025517552657973924, + "loss": 2.6778, + "theoretical_loss": 3.4900237210620686, + "tokens_seen": 1633168384 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002551654964894684, + "loss": 2.4758, + "theoretical_loss": 3.4900115739824638, + "tokens_seen": 1633233920 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002551554663991976, + "loss": 2.4492, + "theoretical_loss": 3.489999427526741, + "tokens_seen": 1633299456 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002551454363089268, + "loss": 2.8651, + "theoretical_loss": 3.4899872816948427, + "tokens_seen": 1633364992 + }, + { + "epoch": 5.05, + "learning_rate": 0.000255135406218656, + "loss": 2.7476, + "theoretical_loss": 3.4899751364867124, + "tokens_seen": 1633430528 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1826734, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7838215827941895, + "objective/train/theoretical_loss": 3.4899660279899276, + "objective/train/tokens_used": 1653939680, + "theoretical_loss": 3.4899660279899276, + "tokens_seen": 1633479680 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025512537612838515, + "loss": 2.687, + "theoretical_loss": 3.489962991902292, + "tokens_seen": 1633496064 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002551153460381144, + "loss": 2.6393, + "theoretical_loss": 3.4899508479415253, + "tokens_seen": 1633561600 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002551053159478435, + "loss": 2.8947, + "theoretical_loss": 3.489938704604355, + "tokens_seen": 1633627136 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025509528585757274, + "loss": 2.7382, + "theoretical_loss": 3.4899265618907243, + "tokens_seen": 1633692672 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002550852557673019, + "loss": 2.7037, + "theoretical_loss": 3.4899144198005754, + "tokens_seen": 1633758208 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002550752256770311, + "loss": 2.8429, + "theoretical_loss": 3.4899022783338522, + "tokens_seen": 1633823744 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002550651955867603, + "loss": 2.5956, + "theoretical_loss": 3.489890137490497, + "tokens_seen": 1633889280 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025505516549648947, + "loss": 2.5744, + "theoretical_loss": 3.4898779972704537, + "tokens_seen": 1633954816 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025504513540621865, + "loss": 2.344, + "theoretical_loss": 3.489865857673664, + "tokens_seen": 1634020352 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002550351053159479, + "loss": 2.459, + "theoretical_loss": 3.489853718700072, + "tokens_seen": 1634085888 + }, + { + "epoch": 5.05, + "learning_rate": 0.000255025075225677, + "loss": 2.4011, + "theoretical_loss": 3.48984158034962, + "tokens_seen": 1634151424 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025501504513540625, + "loss": 2.6465, + "theoretical_loss": 3.489829442622251, + "tokens_seen": 1634216960 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025500501504513543, + "loss": 2.6639, + "theoretical_loss": 3.489817305517909, + "tokens_seen": 1634282496 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002549949849548646, + "loss": 2.5077, + "theoretical_loss": 3.4898051690365364, + "tokens_seen": 1634348032 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002549849548645938, + "loss": 2.3277, + "theoretical_loss": 3.489793033178076, + "tokens_seen": 1634413568 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025497492477432297, + "loss": 2.8964, + "theoretical_loss": 3.489780897942471, + "tokens_seen": 1634479104 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025496489468405215, + "loss": 2.5993, + "theoretical_loss": 3.4897687633296646, + "tokens_seen": 1634544640 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002549548645937814, + "loss": 2.5346, + "theoretical_loss": 3.4897566293396, + "tokens_seen": 1634610176 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002549448345035105, + "loss": 2.6303, + "theoretical_loss": 3.4897444959722197, + "tokens_seen": 1634675712 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025493480441323975, + "loss": 2.3833, + "theoretical_loss": 3.4897323632274677, + "tokens_seen": 1634741248 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002549247743229689, + "loss": 2.7033, + "theoretical_loss": 3.489720231105286, + "tokens_seen": 1634806784 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002549147442326981, + "loss": 2.5677, + "theoretical_loss": 3.489708099605619, + "tokens_seen": 1634872320 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002549047141424273, + "loss": 2.3206, + "theoretical_loss": 3.489695968728409, + "tokens_seen": 1634937856 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002548946840521565, + "loss": 2.5654, + "theoretical_loss": 3.4896838384735984, + "tokens_seen": 1635003392 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025488465396188566, + "loss": 2.5729, + "theoretical_loss": 3.4896717088411315, + "tokens_seen": 1635068928 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1827286, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7116994857788086, + "objective/train/theoretical_loss": 3.4896626120251604, + "objective/train/tokens_used": 1655578080, + "theoretical_loss": 3.4896626120251604, + "tokens_seen": 1635118080 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025487462387161484, + "loss": 2.5476, + "theoretical_loss": 3.4896595798309513, + "tokens_seen": 1635134464 + }, + { + "epoch": 5.05, + "learning_rate": 0.000254864593781344, + "loss": 2.3753, + "theoretical_loss": 3.4896474514430005, + "tokens_seen": 1635200000 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025485456369107325, + "loss": 2.6086, + "theoretical_loss": 3.4896353236772226, + "tokens_seen": 1635265536 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002548445336008024, + "loss": 2.5868, + "theoretical_loss": 3.48962319653356, + "tokens_seen": 1635331072 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002548345035105316, + "loss": 2.7026, + "theoretical_loss": 3.489611070011957, + "tokens_seen": 1635396608 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002548244734202608, + "loss": 2.662, + "theoretical_loss": 3.489598944112356, + "tokens_seen": 1635462144 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025481444332999, + "loss": 2.7653, + "theoretical_loss": 3.4895868188347, + "tokens_seen": 1635527680 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025480441323971916, + "loss": 2.5967, + "theoretical_loss": 3.489574694178933, + "tokens_seen": 1635593216 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025479438314944834, + "loss": 2.6639, + "theoretical_loss": 3.4895625701449977, + "tokens_seen": 1635658752 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002547843530591775, + "loss": 2.5723, + "theoretical_loss": 3.4895504467328373, + "tokens_seen": 1635724288 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025477432296890676, + "loss": 2.4181, + "theoretical_loss": 3.489538323942395, + "tokens_seen": 1635789824 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002547642928786359, + "loss": 2.5984, + "theoretical_loss": 3.489526201773614, + "tokens_seen": 1635855360 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002547542627883651, + "loss": 2.7379, + "theoretical_loss": 3.4895140802264377, + "tokens_seen": 1635920896 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025474423269809425, + "loss": 2.6937, + "theoretical_loss": 3.489501959300809, + "tokens_seen": 1635986432 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002547342026078235, + "loss": 2.6661, + "theoretical_loss": 3.489489838996671, + "tokens_seen": 1636051968 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025472417251755266, + "loss": 2.8623, + "theoretical_loss": 3.4894777193139683, + "tokens_seen": 1636117504 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025471414242728184, + "loss": 2.8822, + "theoretical_loss": 3.4894656002526423, + "tokens_seen": 1636183040 + }, + { + "epoch": 5.05, + "learning_rate": 0.000254704112337011, + "loss": 2.4939, + "theoretical_loss": 3.489453481812637, + "tokens_seen": 1636248576 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025469408224674026, + "loss": 2.9008, + "theoretical_loss": 3.4894413639938957, + "tokens_seen": 1636314112 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002546840521564694, + "loss": 2.754, + "theoretical_loss": 3.4894292467963623, + "tokens_seen": 1636379648 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002546740220661986, + "loss": 2.4276, + "theoretical_loss": 3.4894171302199792, + "tokens_seen": 1636445184 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025466399197592775, + "loss": 2.67, + "theoretical_loss": 3.4894050142646895, + "tokens_seen": 1636510720 + }, + { + "epoch": 5.05, + "learning_rate": 0.000254653961885657, + "loss": 2.4721, + "theoretical_loss": 3.4893928989304372, + "tokens_seen": 1636576256 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025464393179538616, + "loss": 2.9586, + "theoretical_loss": 3.4893807842171656, + "tokens_seen": 1636641792 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025463390170511535, + "loss": 2.6099, + "theoretical_loss": 3.4893686701248177, + "tokens_seen": 1636707328 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1827288, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.952646493911743, + "objective/train/theoretical_loss": 3.4893595849630037, + "objective/train/tokens_used": 1657216480, + "theoretical_loss": 3.4893595849630037, + "tokens_seen": 1636756480 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025462387161484453, + "loss": 2.7477, + "theoretical_loss": 3.4893565566533367, + "tokens_seen": 1636772864 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002546138415245737, + "loss": 2.6074, + "theoretical_loss": 3.489344443802666, + "tokens_seen": 1636838400 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002546038114343029, + "loss": 2.8813, + "theoretical_loss": 3.4893323315727494, + "tokens_seen": 1636903936 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002545937813440321, + "loss": 2.5295, + "theoretical_loss": 3.4893202199635294, + "tokens_seen": 1636969472 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025458375125376125, + "loss": 2.7456, + "theoretical_loss": 3.4893081089749503, + "tokens_seen": 1637035008 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002545737211634905, + "loss": 2.6556, + "theoretical_loss": 3.4892959986069547, + "tokens_seen": 1637100544 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002545636910732196, + "loss": 2.7824, + "theoretical_loss": 3.489283888859486, + "tokens_seen": 1637166080 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025455366098294885, + "loss": 2.7233, + "theoretical_loss": 3.489271779732488, + "tokens_seen": 1637231616 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025454363089267803, + "loss": 2.649, + "theoretical_loss": 3.489259671225904, + "tokens_seen": 1637297152 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002545336008024072, + "loss": 2.5388, + "theoretical_loss": 3.489247563339677, + "tokens_seen": 1637362688 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002545235707121364, + "loss": 2.5585, + "theoretical_loss": 3.48923545607375, + "tokens_seen": 1637428224 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025451354062186563, + "loss": 2.5326, + "theoretical_loss": 3.489223349428068, + "tokens_seen": 1637493760 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002545035105315948, + "loss": 2.5587, + "theoretical_loss": 3.4892112434025733, + "tokens_seen": 1637559296 + }, + { + "epoch": 5.05, + "learning_rate": 0.000254493480441324, + "loss": 2.8104, + "theoretical_loss": 3.4891991379972094, + "tokens_seen": 1637624832 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025448345035105317, + "loss": 2.6764, + "theoretical_loss": 3.4891870332119193, + "tokens_seen": 1637690368 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025447342026078235, + "loss": 2.5624, + "theoretical_loss": 3.489174929046647, + "tokens_seen": 1637755904 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002544633901705116, + "loss": 2.5065, + "theoretical_loss": 3.489162825501336, + "tokens_seen": 1637821440 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002544533600802407, + "loss": 2.4074, + "theoretical_loss": 3.4891507225759297, + "tokens_seen": 1637886976 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025444332998996995, + "loss": 2.6044, + "theoretical_loss": 3.4891386202703716, + "tokens_seen": 1637952512 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002544332998996991, + "loss": 2.516, + "theoretical_loss": 3.4891265185846043, + "tokens_seen": 1638018048 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002544232698094283, + "loss": 2.4602, + "theoretical_loss": 3.4891144175185724, + "tokens_seen": 1638083584 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002544132397191575, + "loss": 2.6896, + "theoretical_loss": 3.4891023170722186, + "tokens_seen": 1638149120 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002544032096288867, + "loss": 2.7171, + "theoretical_loss": 3.4890902172454865, + "tokens_seen": 1638214656 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025439317953861586, + "loss": 2.6357, + "theoretical_loss": 3.4890781180383206, + "tokens_seen": 1638280192 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025438314944834504, + "loss": 2.5777, + "theoretical_loss": 3.4890660194506626, + "tokens_seen": 1638345728 + }, + { + "debugging/Self-BLEU-5": 0.28035211984762975, + "debugging/distinct-1-grams": 0.8180070273723525, + "debugging/distinct-2-grams": 0.9661205773003526, + "debugging/entropy-1-grams": 5.418256584988372, + "debugging/entropy-2-grams": 5.950145975632809, + "debugging/length": 475.1666666666667, + "debugging/num_segments": 6, + "debugging/score": 0.006501734263504407, + "debugging/score_std": 0.00880767366589508, + "epoch": 5.05, + "objective/train/docs_used": 1828052, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.91074538230896, + "objective/train/theoretical_loss": 3.4890569459164382, + "objective/train/tokens_used": 1658854880, + "theoretical_loss": 3.4890569459164382, + "tokens_seen": 1638394880 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002543731193580742, + "loss": 2.7252, + "theoretical_loss": 3.4890539214824576, + "tokens_seen": 1638411264 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025436308926780345, + "loss": 2.8064, + "theoretical_loss": 3.489041824133648, + "tokens_seen": 1638476800 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002543530591775326, + "loss": 2.5308, + "theoretical_loss": 3.489029727404178, + "tokens_seen": 1638542336 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002543430290872618, + "loss": 2.651, + "theoretical_loss": 3.489017631293991, + "tokens_seen": 1638607872 + }, + { + "epoch": 5.05, + "learning_rate": 0.000254332998996991, + "loss": 2.3064, + "theoretical_loss": 3.48900553580303, + "tokens_seen": 1638673408 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002543229689067202, + "loss": 2.8929, + "theoretical_loss": 3.4889934409312398, + "tokens_seen": 1638738944 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025431293881644936, + "loss": 2.516, + "theoretical_loss": 3.488981346678562, + "tokens_seen": 1638804480 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025430290872617854, + "loss": 2.5252, + "theoretical_loss": 3.4889692530449423, + "tokens_seen": 1638870016 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002542928786359077, + "loss": 2.4958, + "theoretical_loss": 3.4889571600303224, + "tokens_seen": 1638935552 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025428284854563696, + "loss": 2.6536, + "theoretical_loss": 3.4889450676346474, + "tokens_seen": 1639001088 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002542728184553661, + "loss": 2.7862, + "theoretical_loss": 3.4889329758578596, + "tokens_seen": 1639066624 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002542627883650953, + "loss": 2.6614, + "theoretical_loss": 3.4889208846999034, + "tokens_seen": 1639132160 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025425275827482445, + "loss": 2.631, + "theoretical_loss": 3.488908794160722, + "tokens_seen": 1639197696 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002542427281845537, + "loss": 2.7136, + "theoretical_loss": 3.4888967042402594, + "tokens_seen": 1639263232 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025423269809428286, + "loss": 2.5916, + "theoretical_loss": 3.4888846149384585, + "tokens_seen": 1639328768 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025422266800401204, + "loss": 2.4989, + "theoretical_loss": 3.4888725262552636, + "tokens_seen": 1639394304 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002542126379137412, + "loss": 2.7752, + "theoretical_loss": 3.488860438190618, + "tokens_seen": 1639459840 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025420260782347046, + "loss": 2.7072, + "theoretical_loss": 3.4888483507444654, + "tokens_seen": 1639525376 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002541925777331996, + "loss": 2.6852, + "theoretical_loss": 3.4888362639167494, + "tokens_seen": 1639590912 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002541825476429288, + "loss": 2.4922, + "theoretical_loss": 3.488824177707414, + "tokens_seen": 1639656448 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025417251755265795, + "loss": 2.611, + "theoretical_loss": 3.488812092116402, + "tokens_seen": 1639721984 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002541624874623872, + "loss": 2.8802, + "theoretical_loss": 3.488800007143658, + "tokens_seen": 1639787520 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025415245737211637, + "loss": 2.7579, + "theoretical_loss": 3.488787922789125, + "tokens_seen": 1639853056 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025414242728184555, + "loss": 2.868, + "theoretical_loss": 3.488775839052747, + "tokens_seen": 1639918592 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025413239719157473, + "loss": 2.8075, + "theoretical_loss": 3.4887637559344675, + "tokens_seen": 1639984128 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1828793, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.076061725616455, + "objective/train/theoretical_loss": 3.4887546940013507, + "objective/train/tokens_used": 1660493280, + "theoretical_loss": 3.4887546940013507, + "tokens_seen": 1640033280 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002541223671013039, + "loss": 2.544, + "theoretical_loss": 3.48875167343423, + "tokens_seen": 1640049664 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002541123370110331, + "loss": 2.5412, + "theoretical_loss": 3.4887395915519788, + "tokens_seen": 1640115200 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002541023069207623, + "loss": 2.5361, + "theoretical_loss": 3.4887275102876574, + "tokens_seen": 1640180736 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025409227683049145, + "loss": 2.9203, + "theoretical_loss": 3.488715429641209, + "tokens_seen": 1640246272 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002540822467402207, + "loss": 2.5023, + "theoretical_loss": 3.488703349612578, + "tokens_seen": 1640311808 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002540722166499498, + "loss": 2.7101, + "theoretical_loss": 3.4886912702017074, + "tokens_seen": 1640377344 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025406218655967905, + "loss": 2.7177, + "theoretical_loss": 3.4886791914085418, + "tokens_seen": 1640442880 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025405215646940823, + "loss": 2.6426, + "theoretical_loss": 3.488667113233024, + "tokens_seen": 1640508416 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002540421263791374, + "loss": 2.8069, + "theoretical_loss": 3.488655035675098, + "tokens_seen": 1640573952 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002540320962888666, + "loss": 2.714, + "theoretical_loss": 3.4886429587347085, + "tokens_seen": 1640639488 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025402206619859583, + "loss": 2.6145, + "theoretical_loss": 3.4886308824117984, + "tokens_seen": 1640705024 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025401203610832495, + "loss": 2.6574, + "theoretical_loss": 3.4886188067063113, + "tokens_seen": 1640770560 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002540020060180542, + "loss": 2.4431, + "theoretical_loss": 3.4886067316181917, + "tokens_seen": 1640836096 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002539919759277833, + "loss": 2.7695, + "theoretical_loss": 3.4885946571473827, + "tokens_seen": 1640901632 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025398194583751255, + "loss": 2.7228, + "theoretical_loss": 3.488582583293828, + "tokens_seen": 1640967168 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025397191574724173, + "loss": 2.6124, + "theoretical_loss": 3.488570510057472, + "tokens_seen": 1641032704 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002539618856569709, + "loss": 2.6302, + "theoretical_loss": 3.4885584374382583, + "tokens_seen": 1641098240 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002539518555667001, + "loss": 2.6826, + "theoretical_loss": 3.4885463654361306, + "tokens_seen": 1641163776 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002539418254764293, + "loss": 2.5072, + "theoretical_loss": 3.488534294051033, + "tokens_seen": 1641229312 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025393179538615846, + "loss": 2.5015, + "theoretical_loss": 3.4885222232829083, + "tokens_seen": 1641294848 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002539217652958877, + "loss": 2.7234, + "theoretical_loss": 3.4885101531317018, + "tokens_seen": 1641360384 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002539117352056168, + "loss": 2.8709, + "theoretical_loss": 3.4884980835973565, + "tokens_seen": 1641425920 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025390170511534606, + "loss": 2.6931, + "theoretical_loss": 3.4884860146798164, + "tokens_seen": 1641491456 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002538916750250752, + "loss": 2.4736, + "theoretical_loss": 3.488473946379025, + "tokens_seen": 1641556992 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002538816449348044, + "loss": 2.6352, + "theoretical_loss": 3.488461878694927, + "tokens_seen": 1641622528 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1830032, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0165646076202393, + "objective/train/theoretical_loss": 3.4884528283365244, + "objective/train/tokens_used": 1662131680, + "theoretical_loss": 3.4884528283365244, + "tokens_seen": 1641671680 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002538716148445336, + "loss": 2.8231, + "theoretical_loss": 3.488449811627466, + "tokens_seen": 1641688064 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002538615847542628, + "loss": 2.7286, + "theoretical_loss": 3.4884377451765847, + "tokens_seen": 1641753600 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025385155466399196, + "loss": 2.6162, + "theoretical_loss": 3.488425679342229, + "tokens_seen": 1641819136 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002538415245737212, + "loss": 2.6865, + "theoretical_loss": 3.488413614124341, + "tokens_seen": 1641884672 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002538314944834503, + "loss": 2.6419, + "theoretical_loss": 3.488401549522866, + "tokens_seen": 1641950208 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025382146439317956, + "loss": 2.6924, + "theoretical_loss": 3.4883894855377466, + "tokens_seen": 1642015744 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002538114343029087, + "loss": 2.498, + "theoretical_loss": 3.488377422168928, + "tokens_seen": 1642081280 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002538014042126379, + "loss": 2.7008, + "theoretical_loss": 3.4883653594163535, + "tokens_seen": 1642146816 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002537913741223671, + "loss": 2.6494, + "theoretical_loss": 3.4883532972799665, + "tokens_seen": 1642212352 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002537813440320963, + "loss": 2.5609, + "theoretical_loss": 3.4883412357597114, + "tokens_seen": 1642277888 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025377131394182546, + "loss": 2.6344, + "theoretical_loss": 3.4883291748555325, + "tokens_seen": 1642343424 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025376128385155465, + "loss": 2.6554, + "theoretical_loss": 3.4883171145673737, + "tokens_seen": 1642408960 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002537512537612839, + "loss": 2.6871, + "theoretical_loss": 3.4883050548951777, + "tokens_seen": 1642474496 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025374122367101306, + "loss": 2.554, + "theoretical_loss": 3.488292995838891, + "tokens_seen": 1642540032 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025373119358074224, + "loss": 2.6946, + "theoretical_loss": 3.4882809373984545, + "tokens_seen": 1642605568 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002537211634904714, + "loss": 2.7, + "theoretical_loss": 3.4882688795738144, + "tokens_seen": 1642671104 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025371113340020066, + "loss": 2.8572, + "theoretical_loss": 3.488256822364914, + "tokens_seen": 1642736640 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002537011033099298, + "loss": 2.7233, + "theoretical_loss": 3.4882447657716975, + "tokens_seen": 1642802176 + }, + { + "epoch": 5.05, + "learning_rate": 0.000253691073219659, + "loss": 2.5799, + "theoretical_loss": 3.4882327097941084, + "tokens_seen": 1642867712 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025368104312938815, + "loss": 2.6587, + "theoretical_loss": 3.4882206544320913, + "tokens_seen": 1642933248 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002536710130391174, + "loss": 2.5166, + "theoretical_loss": 3.4882085996855894, + "tokens_seen": 1642998784 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025366098294884657, + "loss": 2.5442, + "theoretical_loss": 3.4881965455545476, + "tokens_seen": 1643064320 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025365095285857575, + "loss": 2.7119, + "theoretical_loss": 3.4881844920389096, + "tokens_seen": 1643129856 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025364092276830493, + "loss": 2.648, + "theoretical_loss": 3.4881724391386193, + "tokens_seen": 1643195392 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002536308926780341, + "loss": 2.7163, + "theoretical_loss": 3.488160386853621, + "tokens_seen": 1643260928 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1830619, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8538637161254883, + "objective/train/theoretical_loss": 3.488151348043624, + "objective/train/tokens_used": 1663770080, + "theoretical_loss": 3.488151348043624, + "tokens_seen": 1643310080 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002536208625877633, + "loss": 2.7363, + "theoretical_loss": 3.4881483351838583, + "tokens_seen": 1643326464 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002536108324974925, + "loss": 2.5418, + "theoretical_loss": 3.488136284129276, + "tokens_seen": 1643392000 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025360080240722165, + "loss": 2.5203, + "theoretical_loss": 3.4881242336898177, + "tokens_seen": 1643457536 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002535907723169509, + "loss": 2.7345, + "theoretical_loss": 3.488112183865427, + "tokens_seen": 1643523072 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025358074222668, + "loss": 2.5824, + "theoretical_loss": 3.488100134656049, + "tokens_seen": 1643588608 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025357071213640925, + "loss": 2.6666, + "theoretical_loss": 3.4880880860616275, + "tokens_seen": 1643654144 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025356068204613843, + "loss": 2.7623, + "theoretical_loss": 3.488076038082106, + "tokens_seen": 1643719680 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002535506519558676, + "loss": 2.6642, + "theoretical_loss": 3.4880639907174285, + "tokens_seen": 1643785216 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002535406218655968, + "loss": 2.5737, + "theoretical_loss": 3.48805194396754, + "tokens_seen": 1643850752 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025353059177532603, + "loss": 2.5808, + "theoretical_loss": 3.488039897832385, + "tokens_seen": 1643916288 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025352056168505516, + "loss": 2.6431, + "theoretical_loss": 3.488027852311906, + "tokens_seen": 1643981824 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002535105315947844, + "loss": 2.4755, + "theoretical_loss": 3.488015807406048, + "tokens_seen": 1644047360 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002535005015045135, + "loss": 2.6564, + "theoretical_loss": 3.4880037631147554, + "tokens_seen": 1644112896 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025349047141424275, + "loss": 2.6924, + "theoretical_loss": 3.487991719437972, + "tokens_seen": 1644178432 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025348044132397193, + "loss": 2.6345, + "theoretical_loss": 3.487979676375642, + "tokens_seen": 1644243968 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002534704112337011, + "loss": 2.4965, + "theoretical_loss": 3.4879676339277097, + "tokens_seen": 1644309504 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002534603811434303, + "loss": 2.6393, + "theoretical_loss": 3.487955592094119, + "tokens_seen": 1644375040 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002534503510531595, + "loss": 2.8411, + "theoretical_loss": 3.487943550874814, + "tokens_seen": 1644440576 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025344032096288866, + "loss": 2.7036, + "theoretical_loss": 3.487931510269739, + "tokens_seen": 1644506112 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002534302908726179, + "loss": 2.6083, + "theoretical_loss": 3.4879194702788388, + "tokens_seen": 1644571648 + }, + { + "epoch": 5.05, + "learning_rate": 0.000253420260782347, + "loss": 2.8604, + "theoretical_loss": 3.487907430902057, + "tokens_seen": 1644637184 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025341023069207626, + "loss": 2.5739, + "theoretical_loss": 3.487895392139338, + "tokens_seen": 1644702720 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002534002006018054, + "loss": 2.6247, + "theoretical_loss": 3.4878833539906258, + "tokens_seen": 1644768256 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002533901705115346, + "loss": 2.6973, + "theoretical_loss": 3.4878713164558643, + "tokens_seen": 1644833792 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002533801404212638, + "loss": 2.6411, + "theoretical_loss": 3.487859279534999, + "tokens_seen": 1644899328 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1831835, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8763232231140137, + "objective/train/theoretical_loss": 3.487850252247185, + "objective/train/tokens_used": 1665408480, + "theoretical_loss": 3.487850252247185, + "tokens_seen": 1644948480 + }, + { + "epoch": 5.05, + "learning_rate": 0.000253370110330993, + "loss": 2.8589, + "theoretical_loss": 3.487847243227973, + "tokens_seen": 1644964864 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025336008024072216, + "loss": 2.5824, + "theoretical_loss": 3.4878352075347308, + "tokens_seen": 1645030400 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002533500501504514, + "loss": 2.7122, + "theoretical_loss": 3.487823172455217, + "tokens_seen": 1645095936 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002533400200601805, + "loss": 2.7071, + "theoretical_loss": 3.487811137989375, + "tokens_seen": 1645161472 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025332998996990976, + "loss": 2.5932, + "theoretical_loss": 3.4877991041371494, + "tokens_seen": 1645227008 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002533199598796389, + "loss": 2.7684, + "theoretical_loss": 3.4877870708984853, + "tokens_seen": 1645292544 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002533099297893681, + "loss": 2.6536, + "theoretical_loss": 3.4877750382733264, + "tokens_seen": 1645358080 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002532998996990973, + "loss": 2.5601, + "theoretical_loss": 3.4877630062616167, + "tokens_seen": 1645423616 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002532898696088265, + "loss": 2.557, + "theoretical_loss": 3.487750974863301, + "tokens_seen": 1645489152 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025327983951855566, + "loss": 2.7465, + "theoretical_loss": 3.4877389440783233, + "tokens_seen": 1645554688 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025326980942828485, + "loss": 2.2815, + "theoretical_loss": 3.4877269139066276, + "tokens_seen": 1645620224 + }, + { + "epoch": 5.05, + "learning_rate": 0.000253259779338014, + "loss": 2.5423, + "theoretical_loss": 3.4877148843481587, + "tokens_seen": 1645685760 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025324974924774326, + "loss": 2.7766, + "theoretical_loss": 3.487702855402861, + "tokens_seen": 1645751296 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002532397191574724, + "loss": 2.895, + "theoretical_loss": 3.4876908270706783, + "tokens_seen": 1645816832 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002532296890672016, + "loss": 2.8808, + "theoretical_loss": 3.487678799351556, + "tokens_seen": 1645882368 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025321965897693075, + "loss": 2.5823, + "theoretical_loss": 3.4876667722454373, + "tokens_seen": 1645947904 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025320962888666, + "loss": 2.641, + "theoretical_loss": 3.487654745752267, + "tokens_seen": 1646013440 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025319959879638917, + "loss": 2.5569, + "theoretical_loss": 3.487642719871989, + "tokens_seen": 1646078976 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025318956870611835, + "loss": 2.7729, + "theoretical_loss": 3.4876306946045483, + "tokens_seen": 1646144512 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025317953861584753, + "loss": 2.7548, + "theoretical_loss": 3.4876186699498897, + "tokens_seen": 1646210048 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025316950852557677, + "loss": 2.6345, + "theoretical_loss": 3.487606645907956, + "tokens_seen": 1646275584 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002531594784353059, + "loss": 2.5346, + "theoretical_loss": 3.487594622478693, + "tokens_seen": 1646341120 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025314944834503513, + "loss": 2.7141, + "theoretical_loss": 3.487582599662045, + "tokens_seen": 1646406656 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025313941825476425, + "loss": 2.7156, + "theoretical_loss": 3.4875705774579555, + "tokens_seen": 1646472192 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002531293881644935, + "loss": 2.6339, + "theoretical_loss": 3.4875585558663698, + "tokens_seen": 1646537728 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1832689, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.549023389816284, + "objective/train/theoretical_loss": 3.4875495400746024, + "objective/train/tokens_used": 1667046880, + "theoretical_loss": 3.4875495400746024, + "tokens_seen": 1646586880 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025311935807422267, + "loss": 2.5987, + "theoretical_loss": 3.4875465348872314, + "tokens_seen": 1646603264 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025310932798395185, + "loss": 2.8123, + "theoretical_loss": 3.487534514520486, + "tokens_seen": 1646668800 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025309929789368103, + "loss": 2.6352, + "theoretical_loss": 3.487522494766077, + "tokens_seen": 1646734336 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002530892678034102, + "loss": 2.5625, + "theoretical_loss": 3.4875104756239486, + "tokens_seen": 1646799872 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002530792377131394, + "loss": 2.5651, + "theoretical_loss": 3.4874984570940466, + "tokens_seen": 1646865408 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025306920762286863, + "loss": 2.6599, + "theoretical_loss": 3.4874864391763145, + "tokens_seen": 1646930944 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025305917753259776, + "loss": 2.5127, + "theoretical_loss": 3.4874744218706963, + "tokens_seen": 1646996480 + }, + { + "epoch": 5.05, + "learning_rate": 0.000253049147442327, + "loss": 2.4157, + "theoretical_loss": 3.487462405177138, + "tokens_seen": 1647062016 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002530391173520562, + "loss": 2.5339, + "theoretical_loss": 3.4874503890955824, + "tokens_seen": 1647127552 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025302908726178536, + "loss": 2.5977, + "theoretical_loss": 3.487438373625975, + "tokens_seen": 1647193088 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025301905717151454, + "loss": 2.7327, + "theoretical_loss": 3.48742635876826, + "tokens_seen": 1647258624 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002530090270812437, + "loss": 2.7865, + "theoretical_loss": 3.4874143445223824, + "tokens_seen": 1647324160 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025299899699097295, + "loss": 2.6948, + "theoretical_loss": 3.4874023308882856, + "tokens_seen": 1647389696 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025298896690070213, + "loss": 2.7051, + "theoretical_loss": 3.487390317865915, + "tokens_seen": 1647455232 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002529789368104313, + "loss": 2.6647, + "theoretical_loss": 3.4873783054552145, + "tokens_seen": 1647520768 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002529689067201605, + "loss": 2.564, + "theoretical_loss": 3.4873662936561294, + "tokens_seen": 1647586304 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002529588766298897, + "loss": 2.7542, + "theoretical_loss": 3.487354282468604, + "tokens_seen": 1647651840 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025294884653961886, + "loss": 2.3254, + "theoretical_loss": 3.487342271892582, + "tokens_seen": 1647717376 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002529388164493481, + "loss": 2.5738, + "theoretical_loss": 3.487330261928009, + "tokens_seen": 1647782912 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002529287863590772, + "loss": 2.679, + "theoretical_loss": 3.4873182525748296, + "tokens_seen": 1647848448 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025291875626880646, + "loss": 2.4659, + "theoretical_loss": 3.4873062438329874, + "tokens_seen": 1647913984 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002529087261785356, + "loss": 2.5778, + "theoretical_loss": 3.4872942357024277, + "tokens_seen": 1647979520 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002528986960882648, + "loss": 2.4391, + "theoretical_loss": 3.4872822281830946, + "tokens_seen": 1648045056 + }, + { + "epoch": 5.05, + "learning_rate": 0.000252888665997994, + "loss": 2.6824, + "theoretical_loss": 3.487270221274933, + "tokens_seen": 1648110592 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002528786359077232, + "loss": 2.7971, + "theoretical_loss": 3.4872582149778877, + "tokens_seen": 1648176128 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1834029, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5318849086761475, + "objective/train/theoretical_loss": 3.487249210656115, + "objective/train/tokens_used": 1668685280, + "theoretical_loss": 3.487249210656115, + "tokens_seen": 1648225280 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025286860581745236, + "loss": 2.652, + "theoretical_loss": 3.4872462092919028, + "tokens_seen": 1648241664 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002528585757271816, + "loss": 2.6296, + "theoretical_loss": 3.4872342042169233, + "tokens_seen": 1648307200 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002528485456369107, + "loss": 2.4941, + "theoretical_loss": 3.4872221997528934, + "tokens_seen": 1648372736 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025283851554663996, + "loss": 2.4465, + "theoretical_loss": 3.4872101958997583, + "tokens_seen": 1648438272 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002528284854563691, + "loss": 2.3638, + "theoretical_loss": 3.4871981926574622, + "tokens_seen": 1648503808 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002528184553660983, + "loss": 2.703, + "theoretical_loss": 3.48718619002595, + "tokens_seen": 1648569344 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002528084252758275, + "loss": 2.6302, + "theoretical_loss": 3.4871741880051657, + "tokens_seen": 1648634880 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002527983951855567, + "loss": 2.5522, + "theoretical_loss": 3.487162186595055, + "tokens_seen": 1648700416 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025278836509528586, + "loss": 2.687, + "theoretical_loss": 3.487150185795562, + "tokens_seen": 1648765952 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025277833500501505, + "loss": 2.5756, + "theoretical_loss": 3.487138185606631, + "tokens_seen": 1648831488 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002527683049147442, + "loss": 2.7657, + "theoretical_loss": 3.487126186028207, + "tokens_seen": 1648897024 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025275827482447346, + "loss": 2.6197, + "theoretical_loss": 3.487114187060235, + "tokens_seen": 1648962560 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002527482447342026, + "loss": 2.3855, + "theoretical_loss": 3.487102188702659, + "tokens_seen": 1649028096 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002527382146439318, + "loss": 2.6836, + "theoretical_loss": 3.487090190955424, + "tokens_seen": 1649093632 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025272818455366095, + "loss": 2.6625, + "theoretical_loss": 3.4870781938184754, + "tokens_seen": 1649159168 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002527181544633902, + "loss": 2.639, + "theoretical_loss": 3.4870661972917567, + "tokens_seen": 1649224704 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025270812437311937, + "loss": 2.6155, + "theoretical_loss": 3.4870542013752135, + "tokens_seen": 1649290240 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025269809428284855, + "loss": 2.753, + "theoretical_loss": 3.4870422060687902, + "tokens_seen": 1649355776 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025268806419257773, + "loss": 2.4928, + "theoretical_loss": 3.4870302113724314, + "tokens_seen": 1649421312 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025267803410230697, + "loss": 2.6719, + "theoretical_loss": 3.4870182172860824, + "tokens_seen": 1649486848 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002526680040120361, + "loss": 2.5814, + "theoretical_loss": 3.4870062238096873, + "tokens_seen": 1649552384 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025265797392176533, + "loss": 2.3619, + "theoretical_loss": 3.486994230943191, + "tokens_seen": 1649617920 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025264794383149445, + "loss": 2.5583, + "theoretical_loss": 3.4869822386865383, + "tokens_seen": 1649683456 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002526379137412237, + "loss": 2.5248, + "theoretical_loss": 3.4869702470396735, + "tokens_seen": 1649748992 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025262788365095287, + "loss": 2.7687, + "theoretical_loss": 3.4869582560025423, + "tokens_seen": 1649814528 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1834679, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2152130603790283, + "objective/train/theoretical_loss": 3.4869492631247985, + "objective/train/tokens_used": 1670323680, + "theoretical_loss": 3.4869492631247985, + "tokens_seen": 1649863680 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025261785356068205, + "loss": 2.7612, + "theoretical_loss": 3.4869462655750896, + "tokens_seen": 1649880064 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025260782347041123, + "loss": 2.6469, + "theoretical_loss": 3.486934275757259, + "tokens_seen": 1649945600 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002525977933801404, + "loss": 2.5645, + "theoretical_loss": 3.486922286548996, + "tokens_seen": 1650011136 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002525877632898696, + "loss": 2.5006, + "theoretical_loss": 3.4869102979502458, + "tokens_seen": 1650076672 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025257773319959883, + "loss": 2.5498, + "theoretical_loss": 3.486898309960952, + "tokens_seen": 1650142208 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025256770310932796, + "loss": 2.622, + "theoretical_loss": 3.4868863225810602, + "tokens_seen": 1650207744 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002525576730190572, + "loss": 2.6909, + "theoretical_loss": 3.4868743358105156, + "tokens_seen": 1650273280 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002525476429287864, + "loss": 2.6871, + "theoretical_loss": 3.4868623496492623, + "tokens_seen": 1650338816 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025253761283851556, + "loss": 2.8011, + "theoretical_loss": 3.4868503640972452, + "tokens_seen": 1650404352 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025252758274824474, + "loss": 2.6863, + "theoretical_loss": 3.48683837915441, + "tokens_seen": 1650469888 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002525175526579739, + "loss": 2.5185, + "theoretical_loss": 3.486826394820701, + "tokens_seen": 1650535424 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002525075225677031, + "loss": 2.578, + "theoretical_loss": 3.4868144110960624, + "tokens_seen": 1650600960 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025249749247743233, + "loss": 2.729, + "theoretical_loss": 3.4868024279804395, + "tokens_seen": 1650666496 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025248746238716146, + "loss": 2.5299, + "theoretical_loss": 3.486790445473778, + "tokens_seen": 1650732032 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002524774322968907, + "loss": 2.5645, + "theoretical_loss": 3.4867784635760217, + "tokens_seen": 1650797568 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002524674022066198, + "loss": 2.6888, + "theoretical_loss": 3.4867664822871163, + "tokens_seen": 1650863104 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025245737211634906, + "loss": 2.6336, + "theoretical_loss": 3.4867545016070056, + "tokens_seen": 1650928640 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025244734202607824, + "loss": 2.8631, + "theoretical_loss": 3.4867425215356356, + "tokens_seen": 1650994176 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002524373119358074, + "loss": 2.7158, + "theoretical_loss": 3.4867305420729506, + "tokens_seen": 1651059712 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002524272818455366, + "loss": 2.5847, + "theoretical_loss": 3.486718563218896, + "tokens_seen": 1651125248 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002524172517552658, + "loss": 2.7721, + "theoretical_loss": 3.4867065849734162, + "tokens_seen": 1651190784 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025240722166499496, + "loss": 2.7662, + "theoretical_loss": 3.4866946073364566, + "tokens_seen": 1651256320 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002523971915747242, + "loss": 2.7732, + "theoretical_loss": 3.4866826303079614, + "tokens_seen": 1651321856 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002523871614844533, + "loss": 2.7181, + "theoretical_loss": 3.4866706538878764, + "tokens_seen": 1651387392 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025237713139418256, + "loss": 2.5816, + "theoretical_loss": 3.486658678076146, + "tokens_seen": 1651452928 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1835937, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8342537879943848, + "objective/train/theoretical_loss": 3.486649696616548, + "objective/train/tokens_used": 1671962080, + "theoretical_loss": 3.486649696616548, + "tokens_seen": 1651502080 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025236710130391174, + "loss": 2.5049, + "theoretical_loss": 3.486646702872715, + "tokens_seen": 1651518464 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002523570712136409, + "loss": 2.6865, + "theoretical_loss": 3.4866347282775294, + "tokens_seen": 1651584000 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002523470411233701, + "loss": 2.6319, + "theoretical_loss": 3.4866227542905333, + "tokens_seen": 1651649536 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002523370110330993, + "loss": 2.6945, + "theoretical_loss": 3.486610780911672, + "tokens_seen": 1651715072 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025232698094282847, + "loss": 2.573, + "theoretical_loss": 3.48659880814089, + "tokens_seen": 1651780608 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002523169508525577, + "loss": 2.5833, + "theoretical_loss": 3.486586835978133, + "tokens_seen": 1651846144 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025230692076228683, + "loss": 2.7136, + "theoretical_loss": 3.486574864423345, + "tokens_seen": 1651911680 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025229689067201606, + "loss": 2.599, + "theoretical_loss": 3.486562893476472, + "tokens_seen": 1651977216 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002522868605817452, + "loss": 2.7669, + "theoretical_loss": 3.4865509231374587, + "tokens_seen": 1652042752 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025227683049147443, + "loss": 2.6754, + "theoretical_loss": 3.4865389534062503, + "tokens_seen": 1652108288 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002522668004012036, + "loss": 2.5622, + "theoretical_loss": 3.4865269842827913, + "tokens_seen": 1652173824 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002522567703109328, + "loss": 2.6537, + "theoretical_loss": 3.486515015767027, + "tokens_seen": 1652239360 + }, + { + "epoch": 5.05, + "learning_rate": 0.000252246740220662, + "loss": 2.6916, + "theoretical_loss": 3.4865030478589025, + "tokens_seen": 1652304896 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025223671013039115, + "loss": 2.8411, + "theoretical_loss": 3.486491080558363, + "tokens_seen": 1652370432 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002522266800401204, + "loss": 2.5618, + "theoretical_loss": 3.4864791138653537, + "tokens_seen": 1652435968 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025221664994984957, + "loss": 2.7389, + "theoretical_loss": 3.486467147779819, + "tokens_seen": 1652501504 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025220661985957875, + "loss": 2.5624, + "theoretical_loss": 3.4864551823017043, + "tokens_seen": 1652567040 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025219658976930793, + "loss": 2.6085, + "theoretical_loss": 3.4864432174309545, + "tokens_seen": 1652632576 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025218655967903717, + "loss": 2.7121, + "theoretical_loss": 3.4864312531675155, + "tokens_seen": 1652698112 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002521765295887663, + "loss": 2.8367, + "theoretical_loss": 3.4864192895113315, + "tokens_seen": 1652763648 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025216649949849553, + "loss": 2.6695, + "theoretical_loss": 3.486407326462348, + "tokens_seen": 1652829184 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025215646940822465, + "loss": 2.5184, + "theoretical_loss": 3.4863953640205096, + "tokens_seen": 1652894720 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002521464393179539, + "loss": 2.9462, + "theoretical_loss": 3.4863834021857625, + "tokens_seen": 1652960256 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025213640922768307, + "loss": 2.7201, + "theoretical_loss": 3.4863714409580506, + "tokens_seen": 1653025792 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025212637913741225, + "loss": 2.8356, + "theoretical_loss": 3.48635948033732, + "tokens_seen": 1653091328 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1836665, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.477010488510132, + "objective/train/theoretical_loss": 3.48635051027007, + "objective/train/tokens_used": 1673600480, + "theoretical_loss": 3.48635051027007, + "tokens_seen": 1653140480 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025211634904714143, + "loss": 2.5731, + "theoretical_loss": 3.4863475203235152, + "tokens_seen": 1653156864 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002521063189568706, + "loss": 2.686, + "theoretical_loss": 3.486335560916581, + "tokens_seen": 1653222400 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002520962888665998, + "loss": 2.6894, + "theoretical_loss": 3.486323602116464, + "tokens_seen": 1653287936 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025208625877632903, + "loss": 2.5485, + "theoretical_loss": 3.4863116439231083, + "tokens_seen": 1653353472 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025207622868605816, + "loss": 2.8246, + "theoretical_loss": 3.486299686336459, + "tokens_seen": 1653419008 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002520661985957874, + "loss": 2.6584, + "theoretical_loss": 3.4862877293564614, + "tokens_seen": 1653484544 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002520561685055166, + "loss": 2.7343, + "theoretical_loss": 3.486275772983061, + "tokens_seen": 1653550080 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025204613841524576, + "loss": 2.7907, + "theoretical_loss": 3.486263817216203, + "tokens_seen": 1653615616 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025203610832497494, + "loss": 2.5279, + "theoretical_loss": 3.486251862055832, + "tokens_seen": 1653681152 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002520260782347041, + "loss": 2.6601, + "theoretical_loss": 3.4862399075018935, + "tokens_seen": 1653746688 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002520160481444333, + "loss": 2.3958, + "theoretical_loss": 3.486227953554333, + "tokens_seen": 1653812224 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025200601805416253, + "loss": 2.5827, + "theoretical_loss": 3.4862160002130955, + "tokens_seen": 1653877760 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025199598796389166, + "loss": 2.6702, + "theoretical_loss": 3.486204047478126, + "tokens_seen": 1653943296 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002519859578736209, + "loss": 2.7266, + "theoretical_loss": 3.48619209534937, + "tokens_seen": 1654008832 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025197592778335, + "loss": 2.446, + "theoretical_loss": 3.4861801438267728, + "tokens_seen": 1654074368 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025196589769307926, + "loss": 2.6127, + "theoretical_loss": 3.4861681929102795, + "tokens_seen": 1654139904 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025195586760280844, + "loss": 2.7389, + "theoretical_loss": 3.4861562425998356, + "tokens_seen": 1654205440 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002519458375125376, + "loss": 2.5911, + "theoretical_loss": 3.4861442928953856, + "tokens_seen": 1654270976 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002519358074222668, + "loss": 2.6139, + "theoretical_loss": 3.486132343796876, + "tokens_seen": 1654336512 + }, + { + "epoch": 5.05, + "learning_rate": 0.000251925777331996, + "loss": 2.538, + "theoretical_loss": 3.4861203953042503, + "tokens_seen": 1654402048 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025191574724172516, + "loss": 2.6243, + "theoretical_loss": 3.4861084474174557, + "tokens_seen": 1654467584 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002519057171514544, + "loss": 2.8816, + "theoretical_loss": 3.486096500136436, + "tokens_seen": 1654533120 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002518956870611835, + "loss": 2.3886, + "theoretical_loss": 3.486084553461138, + "tokens_seen": 1654598656 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025188565697091276, + "loss": 2.7295, + "theoretical_loss": 3.4860726073915056, + "tokens_seen": 1654664192 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025187562688064194, + "loss": 2.5841, + "theoretical_loss": 3.4860606619274845, + "tokens_seen": 1654729728 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1838119, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.717348575592041, + "objective/train/theoretical_loss": 3.4860517032268685, + "objective/train/tokens_used": 1675238880, + "theoretical_loss": 3.4860517032268685, + "tokens_seen": 1654778880 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002518655967903711, + "loss": 2.7718, + "theoretical_loss": 3.4860487170690204, + "tokens_seen": 1654795264 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002518555667001003, + "loss": 2.723, + "theoretical_loss": 3.4860367728160586, + "tokens_seen": 1654860800 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002518455366098295, + "loss": 2.5466, + "theoretical_loss": 3.4860248291685436, + "tokens_seen": 1654926336 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025183550651955867, + "loss": 2.7116, + "theoretical_loss": 3.4860128861264217, + "tokens_seen": 1654991872 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002518254764292879, + "loss": 2.6604, + "theoretical_loss": 3.486000943689638, + "tokens_seen": 1655057408 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025181544633901703, + "loss": 2.7034, + "theoretical_loss": 3.4859890018581376, + "tokens_seen": 1655122944 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025180541624874627, + "loss": 2.7358, + "theoretical_loss": 3.485977060631866, + "tokens_seen": 1655188480 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002517953861584754, + "loss": 2.8637, + "theoretical_loss": 3.485965120010768, + "tokens_seen": 1655254016 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025178535606820463, + "loss": 2.5282, + "theoretical_loss": 3.48595317999479, + "tokens_seen": 1655319552 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002517753259779338, + "loss": 2.7165, + "theoretical_loss": 3.4859412405838768, + "tokens_seen": 1655385088 + }, + { + "epoch": 5.05, + "learning_rate": 0.000251765295887663, + "loss": 2.6338, + "theoretical_loss": 3.485929301777974, + "tokens_seen": 1655450624 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025175526579739217, + "loss": 2.7675, + "theoretical_loss": 3.485917363577027, + "tokens_seen": 1655516160 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025174523570712135, + "loss": 2.5491, + "theoretical_loss": 3.485905425980981, + "tokens_seen": 1655581696 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025173520561685053, + "loss": 2.7313, + "theoretical_loss": 3.4858934889897815, + "tokens_seen": 1655647232 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025172517552657977, + "loss": 2.7039, + "theoretical_loss": 3.4858815526033737, + "tokens_seen": 1655712768 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002517151454363089, + "loss": 2.67, + "theoretical_loss": 3.485869616821703, + "tokens_seen": 1655778304 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025170511534603813, + "loss": 2.7161, + "theoretical_loss": 3.485857681644716, + "tokens_seen": 1655843840 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002516950852557673, + "loss": 2.6704, + "theoretical_loss": 3.4858457470723563, + "tokens_seen": 1655909376 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002516850551654965, + "loss": 2.5977, + "theoretical_loss": 3.4858338131045707, + "tokens_seen": 1655974912 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002516750250752257, + "loss": 2.5878, + "theoretical_loss": 3.485821879741304, + "tokens_seen": 1656040448 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025166499498495486, + "loss": 2.7017, + "theoretical_loss": 3.485809946982502, + "tokens_seen": 1656105984 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025165496489468404, + "loss": 2.5361, + "theoretical_loss": 3.4857980148281094, + "tokens_seen": 1656171520 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025164493480441327, + "loss": 2.8714, + "theoretical_loss": 3.4857860832780725, + "tokens_seen": 1656237056 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002516349047141424, + "loss": 2.7463, + "theoretical_loss": 3.485774152332337, + "tokens_seen": 1656302592 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025162487462387163, + "loss": 2.7296, + "theoretical_loss": 3.4857622219908473, + "tokens_seen": 1656368128 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1838677, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7587366104125977, + "objective/train/theoretical_loss": 3.4857532746312345, + "objective/train/tokens_used": 1676877280, + "theoretical_loss": 3.4857532746312345, + "tokens_seen": 1656417280 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025161484453360076, + "loss": 2.742, + "theoretical_loss": 3.48575029225355, + "tokens_seen": 1656433664 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025160481444333, + "loss": 2.5303, + "theoretical_loss": 3.48573836312039, + "tokens_seen": 1656499200 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002515947843530592, + "loss": 2.6139, + "theoretical_loss": 3.485726434591313, + "tokens_seen": 1656564736 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025158475426278836, + "loss": 2.6626, + "theoretical_loss": 3.4857145066662643, + "tokens_seen": 1656630272 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025157472417251754, + "loss": 2.7339, + "theoretical_loss": 3.4857025793451895, + "tokens_seen": 1656695808 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002515646940822468, + "loss": 2.5899, + "theoretical_loss": 3.4856906526280342, + "tokens_seen": 1656761344 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002515546639919759, + "loss": 2.7294, + "theoretical_loss": 3.4856787265147444, + "tokens_seen": 1656826880 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025154463390170514, + "loss": 2.7747, + "theoretical_loss": 3.485666801005264, + "tokens_seen": 1656892416 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025153460381143426, + "loss": 2.675, + "theoretical_loss": 3.4856548760995407, + "tokens_seen": 1656957952 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002515245737211635, + "loss": 2.6092, + "theoretical_loss": 3.485642951797519, + "tokens_seen": 1657023488 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002515145436308927, + "loss": 2.7799, + "theoretical_loss": 3.485631028099144, + "tokens_seen": 1657089024 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025150451354062186, + "loss": 2.6345, + "theoretical_loss": 3.4856191050043623, + "tokens_seen": 1657154560 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002514944834503511, + "loss": 2.5774, + "theoretical_loss": 3.485607182513119, + "tokens_seen": 1657220096 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002514844533600802, + "loss": 2.593, + "theoretical_loss": 3.4855952606253595, + "tokens_seen": 1657285632 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025147442326980946, + "loss": 2.564, + "theoretical_loss": 3.4855833393410296, + "tokens_seen": 1657351168 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025146439317953864, + "loss": 2.9311, + "theoretical_loss": 3.485571418660075, + "tokens_seen": 1657416704 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002514543630892678, + "loss": 2.5349, + "theoretical_loss": 3.4855594985824405, + "tokens_seen": 1657482240 + }, + { + "epoch": 5.05, + "learning_rate": 0.000251444332998997, + "loss": 2.7704, + "theoretical_loss": 3.485547579108073, + "tokens_seen": 1657547776 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002514343029087262, + "loss": 2.5155, + "theoretical_loss": 3.485535660236917, + "tokens_seen": 1657613312 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025142427281845536, + "loss": 2.5951, + "theoretical_loss": 3.485523741968919, + "tokens_seen": 1657678848 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002514142427281846, + "loss": 2.708, + "theoretical_loss": 3.485511824304024, + "tokens_seen": 1657744384 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002514042126379137, + "loss": 2.6505, + "theoretical_loss": 3.485499907242178, + "tokens_seen": 1657809920 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025139418254764296, + "loss": 2.5925, + "theoretical_loss": 3.4854879907833265, + "tokens_seen": 1657875456 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025138415245737214, + "loss": 2.8217, + "theoretical_loss": 3.4854760749274156, + "tokens_seen": 1657940992 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002513741223671013, + "loss": 2.6513, + "theoretical_loss": 3.48546415967439, + "tokens_seen": 1658006528 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1839915, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.636464834213257, + "objective/train/theoretical_loss": 3.485455223630232, + "objective/train/tokens_used": 1678515680, + "theoretical_loss": 3.485455223630232, + "tokens_seen": 1658055680 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002513640922768305, + "loss": 2.8804, + "theoretical_loss": 3.485452245024196, + "tokens_seen": 1658072064 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002513540621865597, + "loss": 2.5245, + "theoretical_loss": 3.4854403309767794, + "tokens_seen": 1658137600 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025134403209628887, + "loss": 2.5206, + "theoretical_loss": 3.4854284175320855, + "tokens_seen": 1658203136 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002513340020060181, + "loss": 2.8221, + "theoretical_loss": 3.4854165046900603, + "tokens_seen": 1658268672 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025132397191574723, + "loss": 2.7443, + "theoretical_loss": 3.4854045924506494, + "tokens_seen": 1658334208 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025131394182547647, + "loss": 2.5616, + "theoretical_loss": 3.4853926808137983, + "tokens_seen": 1658399744 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002513039117352056, + "loss": 2.7239, + "theoretical_loss": 3.4853807697794528, + "tokens_seen": 1658465280 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025129388164493483, + "loss": 2.6063, + "theoretical_loss": 3.4853688593475587, + "tokens_seen": 1658530816 + }, + { + "epoch": 5.05, + "learning_rate": 0.000251283851554664, + "loss": 2.7393, + "theoretical_loss": 3.485356949518062, + "tokens_seen": 1658596352 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002512738214643932, + "loss": 2.6428, + "theoretical_loss": 3.4853450402909076, + "tokens_seen": 1658661888 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025126379137412237, + "loss": 2.6324, + "theoretical_loss": 3.4853331316660423, + "tokens_seen": 1658727424 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025125376128385155, + "loss": 2.7628, + "theoretical_loss": 3.4853212236434112, + "tokens_seen": 1658792960 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025124373119358073, + "loss": 2.5693, + "theoretical_loss": 3.48530931622296, + "tokens_seen": 1658858496 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025123370110330997, + "loss": 2.6916, + "theoretical_loss": 3.4852974094046347, + "tokens_seen": 1658924032 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002512236710130391, + "loss": 2.5149, + "theoretical_loss": 3.485285503188381, + "tokens_seen": 1658989568 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025121364092276833, + "loss": 2.5957, + "theoretical_loss": 3.485273597574145, + "tokens_seen": 1659055104 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002512036108324975, + "loss": 2.5215, + "theoretical_loss": 3.485261692561872, + "tokens_seen": 1659120640 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002511935807422267, + "loss": 2.6081, + "theoretical_loss": 3.4852497881515077, + "tokens_seen": 1659186176 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002511835506519559, + "loss": 2.6398, + "theoretical_loss": 3.4852378843429985, + "tokens_seen": 1659251712 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025117352056168506, + "loss": 2.5746, + "theoretical_loss": 3.4852259811362893, + "tokens_seen": 1659317248 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025116349047141424, + "loss": 2.5554, + "theoretical_loss": 3.4852140785313273, + "tokens_seen": 1659382784 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025115346038114347, + "loss": 2.6804, + "theoretical_loss": 3.4852021765280568, + "tokens_seen": 1659448320 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002511434302908726, + "loss": 2.7468, + "theoretical_loss": 3.4851902751264245, + "tokens_seen": 1659513856 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025113340020060183, + "loss": 2.7423, + "theoretical_loss": 3.485178374326376, + "tokens_seen": 1659579392 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025112337011033096, + "loss": 2.5687, + "theoretical_loss": 3.4851664741278574, + "tokens_seen": 1659644928 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1840518, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6859548091888428, + "objective/train/theoretical_loss": 3.4851575493736897, + "objective/train/tokens_used": 1680154080, + "theoretical_loss": 3.4851575493736897, + "tokens_seen": 1659694080 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002511133400200602, + "loss": 2.5917, + "theoretical_loss": 3.4851545745308137, + "tokens_seen": 1659710464 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002511033099297894, + "loss": 2.4357, + "theoretical_loss": 3.485142675535192, + "tokens_seen": 1659776000 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025109327983951856, + "loss": 2.7973, + "theoretical_loss": 3.485130777140937, + "tokens_seen": 1659841536 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025108324974924774, + "loss": 2.3946, + "theoretical_loss": 3.4851188793479957, + "tokens_seen": 1659907072 + }, + { + "epoch": 5.05, + "learning_rate": 0.000251073219658977, + "loss": 2.5769, + "theoretical_loss": 3.4851069821563128, + "tokens_seen": 1659972608 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002510631895687061, + "loss": 2.638, + "theoretical_loss": 3.485095085565835, + "tokens_seen": 1660038144 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025105315947843534, + "loss": 2.7827, + "theoretical_loss": 3.485083189576508, + "tokens_seen": 1660103680 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025104312938816446, + "loss": 2.4849, + "theoretical_loss": 3.4850712941882778, + "tokens_seen": 1660169216 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002510330992978937, + "loss": 2.7166, + "theoretical_loss": 3.4850593994010897, + "tokens_seen": 1660234752 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002510230692076229, + "loss": 2.9174, + "theoretical_loss": 3.48504750521489, + "tokens_seen": 1660300288 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025101303911735206, + "loss": 2.9626, + "theoretical_loss": 3.485035611629625, + "tokens_seen": 1660365824 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025100300902708124, + "loss": 2.7942, + "theoretical_loss": 3.4850237186452397, + "tokens_seen": 1660431360 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002509929789368104, + "loss": 2.6116, + "theoretical_loss": 3.485011826261681, + "tokens_seen": 1660496896 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002509829488465396, + "loss": 2.7138, + "theoretical_loss": 3.4849999344788944, + "tokens_seen": 1660562432 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025097291875626884, + "loss": 2.5579, + "theoretical_loss": 3.4849880432968257, + "tokens_seen": 1660627968 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025096288866599797, + "loss": 2.7703, + "theoretical_loss": 3.4849761527154213, + "tokens_seen": 1660693504 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002509528585757272, + "loss": 2.574, + "theoretical_loss": 3.4849642627346267, + "tokens_seen": 1660759040 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025094282848545633, + "loss": 2.7389, + "theoretical_loss": 3.484952373354388, + "tokens_seen": 1660824576 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025093279839518556, + "loss": 2.5107, + "theoretical_loss": 3.484940484574651, + "tokens_seen": 1660890112 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025092276830491475, + "loss": 2.7075, + "theoretical_loss": 3.484928596395362, + "tokens_seen": 1660955648 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002509127382146439, + "loss": 2.6275, + "theoretical_loss": 3.484916708816467, + "tokens_seen": 1661021184 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002509027081243731, + "loss": 2.6991, + "theoretical_loss": 3.4849048218379117, + "tokens_seen": 1661086720 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025089267803410234, + "loss": 2.6848, + "theoretical_loss": 3.4848929354596425, + "tokens_seen": 1661152256 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025088264794383147, + "loss": 2.8059, + "theoretical_loss": 3.484881049681605, + "tokens_seen": 1661217792 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002508726178535607, + "loss": 2.7089, + "theoretical_loss": 3.4848691645037455, + "tokens_seen": 1661283328 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1841911, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.809731960296631, + "objective/train/theoretical_loss": 3.484860251014185, + "objective/train/tokens_used": 1681792480, + "theoretical_loss": 3.484860251014185, + "tokens_seen": 1661332480 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025086258776328983, + "loss": 2.712, + "theoretical_loss": 3.4848572799260094, + "tokens_seen": 1661348864 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025085255767301907, + "loss": 2.8215, + "theoretical_loss": 3.484845395948344, + "tokens_seen": 1661414400 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025084252758274825, + "loss": 2.4925, + "theoretical_loss": 3.484833512570694, + "tokens_seen": 1661479936 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025083249749247743, + "loss": 2.7134, + "theoretical_loss": 3.484821629793006, + "tokens_seen": 1661545472 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002508224674022066, + "loss": 2.8041, + "theoretical_loss": 3.484809747615226, + "tokens_seen": 1661611008 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002508124373119358, + "loss": 2.5556, + "theoretical_loss": 3.4847978660373005, + "tokens_seen": 1661676544 + }, + { + "epoch": 5.05, + "learning_rate": 0.000250802407221665, + "loss": 2.7569, + "theoretical_loss": 3.484785985059175, + "tokens_seen": 1661742080 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002507923771313942, + "loss": 2.5615, + "theoretical_loss": 3.4847741046807954, + "tokens_seen": 1661807616 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025078234704112334, + "loss": 2.6096, + "theoretical_loss": 3.4847622249021084, + "tokens_seen": 1661873152 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025077231695085257, + "loss": 2.4842, + "theoretical_loss": 3.4847503457230595, + "tokens_seen": 1661938688 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002507622868605817, + "loss": 2.7101, + "theoretical_loss": 3.484738467143595, + "tokens_seen": 1662004224 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025075225677031093, + "loss": 2.5674, + "theoretical_loss": 3.4847265891636616, + "tokens_seen": 1662069760 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025074222668004017, + "loss": 2.5887, + "theoretical_loss": 3.4847147117832042, + "tokens_seen": 1662135296 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002507321965897693, + "loss": 2.6562, + "theoretical_loss": 3.48470283500217, + "tokens_seen": 1662200832 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025072216649949853, + "loss": 2.7877, + "theoretical_loss": 3.484690958820505, + "tokens_seen": 1662266368 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002507121364092277, + "loss": 2.6638, + "theoretical_loss": 3.4846790832381545, + "tokens_seen": 1662331904 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002507021063189569, + "loss": 2.8784, + "theoretical_loss": 3.4846672082550656, + "tokens_seen": 1662397440 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002506920762286861, + "loss": 2.789, + "theoretical_loss": 3.484655333871183, + "tokens_seen": 1662462976 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025068204613841526, + "loss": 2.6966, + "theoretical_loss": 3.484643460086455, + "tokens_seen": 1662528512 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025067201604814444, + "loss": 2.7616, + "theoretical_loss": 3.484631586900826, + "tokens_seen": 1662594048 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025066198595787367, + "loss": 2.5012, + "theoretical_loss": 3.4846197143142428, + "tokens_seen": 1662659584 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002506519558676028, + "loss": 2.6553, + "theoretical_loss": 3.4846078423266516, + "tokens_seen": 1662725120 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025064192577733203, + "loss": 2.8815, + "theoretical_loss": 3.484595970937998, + "tokens_seen": 1662790656 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025063189568706116, + "loss": 2.6849, + "theoretical_loss": 3.484584100148229, + "tokens_seen": 1662856192 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002506218655967904, + "loss": 2.7489, + "theoretical_loss": 3.4845722299572905, + "tokens_seen": 1662921728 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1842483, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9067416191101074, + "objective/train/theoretical_loss": 3.4845633277070367, + "objective/train/tokens_used": 1683430880, + "theoretical_loss": 3.4845633277070367, + "tokens_seen": 1662970880 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002506118355065196, + "loss": 2.7631, + "theoretical_loss": 3.484560360365129, + "tokens_seen": 1662987264 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025060180541624876, + "loss": 2.4925, + "theoretical_loss": 3.4845484913716893, + "tokens_seen": 1663052800 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025059177532597794, + "loss": 2.7454, + "theoretical_loss": 3.4845366229769192, + "tokens_seen": 1663118336 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002505817452357072, + "loss": 2.5536, + "theoretical_loss": 3.4845247551807645, + "tokens_seen": 1663183872 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002505717151454363, + "loss": 2.6713, + "theoretical_loss": 3.4845128879831715, + "tokens_seen": 1663249408 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025056168505516554, + "loss": 2.785, + "theoretical_loss": 3.4845010213840855, + "tokens_seen": 1663314944 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025055165496489466, + "loss": 2.6075, + "theoretical_loss": 3.484489155383454, + "tokens_seen": 1663380480 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002505416248746239, + "loss": 2.7303, + "theoretical_loss": 3.4844772899812226, + "tokens_seen": 1663446016 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002505315947843531, + "loss": 2.7277, + "theoretical_loss": 3.4844654251773375, + "tokens_seen": 1663511552 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025052156469408226, + "loss": 2.5706, + "theoretical_loss": 3.484453560971745, + "tokens_seen": 1663577088 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025051153460381144, + "loss": 2.7104, + "theoretical_loss": 3.4844416973643915, + "tokens_seen": 1663642624 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002505015045135406, + "loss": 2.7243, + "theoretical_loss": 3.4844298343552236, + "tokens_seen": 1663708160 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002504914744232698, + "loss": 2.4113, + "theoretical_loss": 3.484417971944187, + "tokens_seen": 1663773696 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025048144433299904, + "loss": 2.799, + "theoretical_loss": 3.4844061101312276, + "tokens_seen": 1663839232 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025047141424272817, + "loss": 2.6084, + "theoretical_loss": 3.484394248916293, + "tokens_seen": 1663904768 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002504613841524574, + "loss": 2.5572, + "theoretical_loss": 3.4843823882993283, + "tokens_seen": 1663970304 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025045135406218653, + "loss": 2.7367, + "theoretical_loss": 3.4843705282802806, + "tokens_seen": 1664035840 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025044132397191576, + "loss": 2.6933, + "theoretical_loss": 3.484358668859096, + "tokens_seen": 1664101376 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025043129388164495, + "loss": 2.5641, + "theoretical_loss": 3.48434681003572, + "tokens_seen": 1664166912 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002504212637913741, + "loss": 2.7017, + "theoretical_loss": 3.4843349518101006, + "tokens_seen": 1664232448 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002504112337011033, + "loss": 2.4421, + "theoretical_loss": 3.4843230941821823, + "tokens_seen": 1664297984 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025040120361083254, + "loss": 2.6537, + "theoretical_loss": 3.4843112371519127, + "tokens_seen": 1664363520 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025039117352056167, + "loss": 2.4368, + "theoretical_loss": 3.4842993807192375, + "tokens_seen": 1664429056 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002503811434302909, + "loss": 2.8247, + "theoretical_loss": 3.4842875248841034, + "tokens_seen": 1664494592 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025037111334002003, + "loss": 2.7471, + "theoretical_loss": 3.484275669646457, + "tokens_seen": 1664560128 + }, + { + "epoch": 5.05, + "objective/train/docs_used": 1843846, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8670899868011475, + "objective/train/theoretical_loss": 3.4842667786102908, + "objective/train/tokens_used": 1685069280, + "theoretical_loss": 3.4842667786102908, + "tokens_seen": 1664609280 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025036108324974927, + "loss": 2.5737, + "theoretical_loss": 3.484263815006244, + "tokens_seen": 1664625664 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025035105315947845, + "loss": 2.5695, + "theoretical_loss": 3.4842519609634115, + "tokens_seen": 1664691200 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025034102306920763, + "loss": 2.6554, + "theoretical_loss": 3.4842401075179046, + "tokens_seen": 1664756736 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002503309929789368, + "loss": 2.8879, + "theoretical_loss": 3.484228254669671, + "tokens_seen": 1664822272 + }, + { + "epoch": 5.05, + "learning_rate": 0.000250320962888666, + "loss": 2.7967, + "theoretical_loss": 3.484216402418657, + "tokens_seen": 1664887808 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002503109327983952, + "loss": 2.6284, + "theoretical_loss": 3.484204550764809, + "tokens_seen": 1664953344 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002503009027081244, + "loss": 2.6012, + "theoretical_loss": 3.484192699708072, + "tokens_seen": 1665018880 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025029087261785354, + "loss": 2.8306, + "theoretical_loss": 3.484180849248394, + "tokens_seen": 1665084416 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025028084252758277, + "loss": 2.5193, + "theoretical_loss": 3.484168999385721, + "tokens_seen": 1665149952 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002502708124373119, + "loss": 2.7555, + "theoretical_loss": 3.4841571501199993, + "tokens_seen": 1665215488 + }, + { + "epoch": 5.05, + "learning_rate": 0.00025026078234704113, + "loss": 2.6078, + "theoretical_loss": 3.484145301451175, + "tokens_seen": 1665281024 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002502507522567703, + "loss": 2.7567, + "theoretical_loss": 3.4841334533791954, + "tokens_seen": 1665346560 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002502407221664995, + "loss": 2.7793, + "theoretical_loss": 3.484121605904006, + "tokens_seen": 1665412096 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002502306920762287, + "loss": 2.7018, + "theoretical_loss": 3.4841097590255545, + "tokens_seen": 1665477632 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002502206619859579, + "loss": 2.6769, + "theoretical_loss": 3.4840979127437857, + "tokens_seen": 1665543168 + }, + { + "epoch": 5.06, + "learning_rate": 0.00025021063189568704, + "loss": 2.6593, + "theoretical_loss": 3.4840860670586475, + "tokens_seen": 1665608704 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002502006018054163, + "loss": 2.519, + "theoretical_loss": 3.4840742219700855, + "tokens_seen": 1665674240 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002501905717151454, + "loss": 2.8133, + "theoretical_loss": 3.484062377478047, + "tokens_seen": 1665739776 + }, + { + "epoch": 5.06, + "learning_rate": 0.00025018054162487464, + "loss": 2.7593, + "theoretical_loss": 3.4840505335824776, + "tokens_seen": 1665805312 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002501705115346038, + "loss": 2.4849, + "theoretical_loss": 3.4840386902833242, + "tokens_seen": 1665870848 + }, + { + "epoch": 5.06, + "learning_rate": 0.000250160481444333, + "loss": 2.6172, + "theoretical_loss": 3.4840268475805334, + "tokens_seen": 1665936384 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002501504513540622, + "loss": 2.6403, + "theoretical_loss": 3.4840150054740517, + "tokens_seen": 1666001920 + }, + { + "epoch": 5.06, + "learning_rate": 0.00025014042126379136, + "loss": 2.4497, + "theoretical_loss": 3.4840031639638256, + "tokens_seen": 1666067456 + }, + { + "epoch": 5.06, + "learning_rate": 0.00025013039117352054, + "loss": 2.5176, + "theoretical_loss": 3.4839913230498016, + "tokens_seen": 1666132992 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002501203610832498, + "loss": 2.8501, + "theoretical_loss": 3.483979482731926, + "tokens_seen": 1666198528 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1844442, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7664921283721924, + "objective/train/theoretical_loss": 3.4839706028847095, + "objective/train/tokens_used": 1686707680, + "theoretical_loss": 3.4839706028847095, + "tokens_seen": 1666247680 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002501103309929789, + "loss": 2.6349, + "theoretical_loss": 3.4839676430101454, + "tokens_seen": 1666264064 + }, + { + "epoch": 5.06, + "learning_rate": 0.00025010030090270814, + "loss": 2.837, + "theoretical_loss": 3.483955803884407, + "tokens_seen": 1666329600 + }, + { + "epoch": 5.06, + "learning_rate": 0.00025009027081243727, + "loss": 2.5724, + "theoretical_loss": 3.4839439653546567, + "tokens_seen": 1666395136 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002500802407221665, + "loss": 2.7924, + "theoretical_loss": 3.483932127420841, + "tokens_seen": 1666460672 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002500702106318957, + "loss": 2.8922, + "theoretical_loss": 3.4839202900829065, + "tokens_seen": 1666526208 + }, + { + "epoch": 5.06, + "learning_rate": 0.00025006018054162486, + "loss": 2.7352, + "theoretical_loss": 3.4839084533408, + "tokens_seen": 1666591744 + }, + { + "epoch": 5.06, + "learning_rate": 0.00025005015045135405, + "loss": 2.7045, + "theoretical_loss": 3.4838966171944685, + "tokens_seen": 1666657280 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002500401203610833, + "loss": 2.7117, + "theoretical_loss": 3.483884781643858, + "tokens_seen": 1666722816 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002500300902708124, + "loss": 2.732, + "theoretical_loss": 3.4838729466889147, + "tokens_seen": 1666788352 + }, + { + "epoch": 5.06, + "learning_rate": 0.00025002006018054164, + "loss": 2.4954, + "theoretical_loss": 3.483861112329586, + "tokens_seen": 1666853888 + }, + { + "epoch": 5.06, + "learning_rate": 0.00025001003009027077, + "loss": 2.6166, + "theoretical_loss": 3.4838492785658186, + "tokens_seen": 1666919424 + }, + { + "epoch": 5.06, + "learning_rate": 0.00025, + "loss": 2.623, + "theoretical_loss": 3.4838374453975582, + "tokens_seen": 1666984960 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002499899699097292, + "loss": 2.4604, + "theoretical_loss": 3.4838256128247522, + "tokens_seen": 1667050496 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024997993981945837, + "loss": 2.3494, + "theoretical_loss": 3.483813780847347, + "tokens_seen": 1667116032 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024996990972918755, + "loss": 2.7256, + "theoretical_loss": 3.483801949465289, + "tokens_seen": 1667181568 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024995987963891673, + "loss": 2.5823, + "theoretical_loss": 3.483790118678525, + "tokens_seen": 1667247104 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024994984954864597, + "loss": 2.7191, + "theoretical_loss": 3.4837782884870023, + "tokens_seen": 1667312640 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024993981945837515, + "loss": 2.8548, + "theoretical_loss": 3.4837664588906665, + "tokens_seen": 1667378176 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024992978936810433, + "loss": 2.4082, + "theoretical_loss": 3.483754629889465, + "tokens_seen": 1667443712 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002499197592778335, + "loss": 2.6523, + "theoretical_loss": 3.483742801483344, + "tokens_seen": 1667509248 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002499097291875627, + "loss": 2.436, + "theoretical_loss": 3.4837309736722504, + "tokens_seen": 1667574784 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024989969909729187, + "loss": 2.4825, + "theoretical_loss": 3.4837191464561306, + "tokens_seen": 1667640320 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024988966900702105, + "loss": 2.7399, + "theoretical_loss": 3.4837073198349318, + "tokens_seen": 1667705856 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024987963891675023, + "loss": 2.5604, + "theoretical_loss": 3.483695493808601, + "tokens_seen": 1667771392 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002498696088264794, + "loss": 2.8503, + "theoretical_loss": 3.4836836683770835, + "tokens_seen": 1667836928 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1845695, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.797067880630493, + "objective/train/theoretical_loss": 3.4836747996937607, + "objective/train/tokens_used": 1688346080, + "theoretical_loss": 3.4836747996937607, + "tokens_seen": 1667886080 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024985957873620865, + "loss": 2.6675, + "theoretical_loss": 3.4836718435403276, + "tokens_seen": 1667902464 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024984954864593783, + "loss": 2.8577, + "theoretical_loss": 3.483660019298279, + "tokens_seen": 1667968000 + }, + { + "epoch": 5.06, + "learning_rate": 0.000249839518555667, + "loss": 2.5806, + "theoretical_loss": 3.4836481956508845, + "tokens_seen": 1668033536 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002498294884653962, + "loss": 2.6147, + "theoretical_loss": 3.4836363725980912, + "tokens_seen": 1668099072 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002498194583751254, + "loss": 2.8213, + "theoretical_loss": 3.4836245501398455, + "tokens_seen": 1668164608 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024980942828485455, + "loss": 2.6303, + "theoretical_loss": 3.4836127282760945, + "tokens_seen": 1668230144 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024979939819458374, + "loss": 2.8056, + "theoretical_loss": 3.4836009070067853, + "tokens_seen": 1668295680 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002497893681043129, + "loss": 2.4516, + "theoretical_loss": 3.4835890863318637, + "tokens_seen": 1668361216 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002497793380140421, + "loss": 2.3488, + "theoretical_loss": 3.4835772662512765, + "tokens_seen": 1668426752 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024976930792377133, + "loss": 2.7373, + "theoretical_loss": 3.4835654467649713, + "tokens_seen": 1668492288 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002497592778335005, + "loss": 2.6946, + "theoretical_loss": 3.4835536278728947, + "tokens_seen": 1668557824 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002497492477432297, + "loss": 2.6341, + "theoretical_loss": 3.4835418095749926, + "tokens_seen": 1668623360 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002497392176529589, + "loss": 2.6251, + "theoretical_loss": 3.4835299918712135, + "tokens_seen": 1668688896 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024972918756268806, + "loss": 2.6928, + "theoretical_loss": 3.4835181747615023, + "tokens_seen": 1668754432 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024971915747241724, + "loss": 2.6506, + "theoretical_loss": 3.4835063582458066, + "tokens_seen": 1668819968 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002497091273821464, + "loss": 2.7688, + "theoretical_loss": 3.483494542324074, + "tokens_seen": 1668885504 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002496990972918756, + "loss": 2.6239, + "theoretical_loss": 3.48348272699625, + "tokens_seen": 1668951040 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002496890672016048, + "loss": 2.6258, + "theoretical_loss": 3.4834709122622822, + "tokens_seen": 1669016576 + }, + { + "epoch": 5.06, + "learning_rate": 0.000249679037111334, + "loss": 2.6748, + "theoretical_loss": 3.483459098122117, + "tokens_seen": 1669082112 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002496690070210632, + "loss": 2.8101, + "theoretical_loss": 3.4834472845757016, + "tokens_seen": 1669147648 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002496589769307924, + "loss": 2.3921, + "theoretical_loss": 3.483435471622983, + "tokens_seen": 1669213184 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024964894684052156, + "loss": 2.7437, + "theoretical_loss": 3.4834236592639076, + "tokens_seen": 1669278720 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024963891675025074, + "loss": 2.7621, + "theoretical_loss": 3.483411847498423, + "tokens_seen": 1669344256 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002496288866599799, + "loss": 2.7915, + "theoretical_loss": 3.4834000363264748, + "tokens_seen": 1669409792 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024961885656970916, + "loss": 2.7576, + "theoretical_loss": 3.4833882257480107, + "tokens_seen": 1669475328 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1846567, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6542444229125977, + "objective/train/theoretical_loss": 3.4833793682036047, + "objective/train/tokens_used": 1689984480, + "theoretical_loss": 3.4833793682036047, + "tokens_seen": 1669524480 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024960882647943834, + "loss": 2.6263, + "theoretical_loss": 3.4833764157629776, + "tokens_seen": 1669540864 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002495987963891675, + "loss": 2.6988, + "theoretical_loss": 3.4833646063713224, + "tokens_seen": 1669606400 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002495887662988967, + "loss": 2.5219, + "theoretical_loss": 3.4833527975729917, + "tokens_seen": 1669671936 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002495787362086259, + "loss": 2.747, + "theoretical_loss": 3.483340989367933, + "tokens_seen": 1669737472 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024956870611835506, + "loss": 2.8014, + "theoretical_loss": 3.4833291817560923, + "tokens_seen": 1669803008 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024955867602808425, + "loss": 2.6533, + "theoretical_loss": 3.4833173747374175, + "tokens_seen": 1669868544 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002495486459378135, + "loss": 2.5929, + "theoretical_loss": 3.4833055683118546, + "tokens_seen": 1669934080 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024953861584754266, + "loss": 2.7315, + "theoretical_loss": 3.4832937624793514, + "tokens_seen": 1669999616 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024952858575727184, + "loss": 2.6349, + "theoretical_loss": 3.483281957239854, + "tokens_seen": 1670065152 + }, + { + "epoch": 5.06, + "learning_rate": 0.000249518555667001, + "loss": 2.5844, + "theoretical_loss": 3.4832701525933096, + "tokens_seen": 1670130688 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002495085255767302, + "loss": 2.6783, + "theoretical_loss": 3.4832583485396658, + "tokens_seen": 1670196224 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002494984954864594, + "loss": 2.7229, + "theoretical_loss": 3.483246545078869, + "tokens_seen": 1670261760 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024948846539618857, + "loss": 2.7989, + "theoretical_loss": 3.4832347422108665, + "tokens_seen": 1670327296 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024947843530591775, + "loss": 2.6835, + "theoretical_loss": 3.4832229399356045, + "tokens_seen": 1670392832 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024946840521564693, + "loss": 2.5618, + "theoretical_loss": 3.48321113825303, + "tokens_seen": 1670458368 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024945837512537617, + "loss": 2.5663, + "theoretical_loss": 3.483199337163091, + "tokens_seen": 1670523904 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024944834503510535, + "loss": 2.5753, + "theoretical_loss": 3.483187536665734, + "tokens_seen": 1670589440 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024943831494483453, + "loss": 2.5417, + "theoretical_loss": 3.483175736760906, + "tokens_seen": 1670654976 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002494282848545637, + "loss": 2.4416, + "theoretical_loss": 3.4831639374485546, + "tokens_seen": 1670720512 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002494182547642929, + "loss": 2.7506, + "theoretical_loss": 3.483152138728625, + "tokens_seen": 1670786048 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024940822467402207, + "loss": 2.579, + "theoretical_loss": 3.483140340601066, + "tokens_seen": 1670851584 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024939819458375125, + "loss": 2.6252, + "theoretical_loss": 3.483128543065824, + "tokens_seen": 1670917120 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024938816449348043, + "loss": 2.6801, + "theoretical_loss": 3.483116746122846, + "tokens_seen": 1670982656 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002493781344032096, + "loss": 2.591, + "theoretical_loss": 3.4831049497720787, + "tokens_seen": 1671048192 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024936810431293885, + "loss": 2.5669, + "theoretical_loss": 3.4830931540134697, + "tokens_seen": 1671113728 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1851822, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3846969604492188, + "objective/train/theoretical_loss": 3.483084307583085, + "objective/train/tokens_used": 1691622880, + "theoretical_loss": 3.483084307583085, + "tokens_seen": 1671162880 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024935807422266803, + "loss": 2.5773, + "theoretical_loss": 3.4830813588469667, + "tokens_seen": 1671179264 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002493480441323972, + "loss": 2.497, + "theoretical_loss": 3.4830695642725154, + "tokens_seen": 1671244800 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002493380140421264, + "loss": 2.4311, + "theoretical_loss": 3.483057770290063, + "tokens_seen": 1671310336 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002493279839518556, + "loss": 2.7017, + "theoretical_loss": 3.4830459768995574, + "tokens_seen": 1671375872 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024931795386158476, + "loss": 2.6477, + "theoretical_loss": 3.483034184100945, + "tokens_seen": 1671441408 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024930792377131394, + "loss": 2.9452, + "theoretical_loss": 3.483022391894173, + "tokens_seen": 1671506944 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002492978936810431, + "loss": 2.8465, + "theoretical_loss": 3.483010600279189, + "tokens_seen": 1671572480 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002492878635907723, + "loss": 2.9101, + "theoretical_loss": 3.4829988092559403, + "tokens_seen": 1671638016 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024927783350050153, + "loss": 2.6013, + "theoretical_loss": 3.482987018824373, + "tokens_seen": 1671703552 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002492678034102307, + "loss": 2.32, + "theoretical_loss": 3.4829752289844347, + "tokens_seen": 1671769088 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002492577733199599, + "loss": 2.6812, + "theoretical_loss": 3.4829634397360723, + "tokens_seen": 1671834624 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002492477432296891, + "loss": 2.8012, + "theoretical_loss": 3.482951651079233, + "tokens_seen": 1671900160 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024923771313941826, + "loss": 2.7656, + "theoretical_loss": 3.482939863013865, + "tokens_seen": 1671965696 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024922768304914744, + "loss": 2.6971, + "theoretical_loss": 3.4829280755399132, + "tokens_seen": 1672031232 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002492176529588766, + "loss": 2.8746, + "theoretical_loss": 3.482916288657327, + "tokens_seen": 1672096768 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002492076228686058, + "loss": 2.7422, + "theoretical_loss": 3.4829045023660523, + "tokens_seen": 1672162304 + }, + { + "epoch": 5.06, + "learning_rate": 0.000249197592778335, + "loss": 2.672, + "theoretical_loss": 3.4828927166660364, + "tokens_seen": 1672227840 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002491875626880642, + "loss": 2.4782, + "theoretical_loss": 3.482880931557227, + "tokens_seen": 1672293376 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002491775325977934, + "loss": 2.7754, + "theoretical_loss": 3.4828691470395707, + "tokens_seen": 1672358912 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002491675025075226, + "loss": 2.564, + "theoretical_loss": 3.4828573631130153, + "tokens_seen": 1672424448 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024915747241725176, + "loss": 2.6771, + "theoretical_loss": 3.482845579777507, + "tokens_seen": 1672489984 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024914744232698094, + "loss": 2.4914, + "theoretical_loss": 3.4828337970329937, + "tokens_seen": 1672555520 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002491374122367101, + "loss": 2.6356, + "theoretical_loss": 3.4828220148794227, + "tokens_seen": 1672621056 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002491273821464393, + "loss": 2.6977, + "theoretical_loss": 3.482810233316741, + "tokens_seen": 1672686592 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002491173520561685, + "loss": 2.4544, + "theoretical_loss": 3.482798452344896, + "tokens_seen": 1672752128 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1856799, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7923433780670166, + "objective/train/theoretical_loss": 3.482789617003717, + "objective/train/tokens_used": 1693261280, + "theoretical_loss": 3.482789617003717, + "tokens_seen": 1672801280 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024910732196589767, + "loss": 2.6336, + "theoretical_loss": 3.4827866719638347, + "tokens_seen": 1672817664 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002490972918756269, + "loss": 2.5704, + "theoretical_loss": 3.482774892173504, + "tokens_seen": 1672883200 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002490872617853561, + "loss": 2.8311, + "theoretical_loss": 3.482763112973852, + "tokens_seen": 1672948736 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024907723169508526, + "loss": 2.4595, + "theoretical_loss": 3.482751334364825, + "tokens_seen": 1673014272 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024906720160481445, + "loss": 2.5905, + "theoretical_loss": 3.482739556346371, + "tokens_seen": 1673079808 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002490571715145436, + "loss": 2.5621, + "theoretical_loss": 3.4827277789184365, + "tokens_seen": 1673145344 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002490471414242728, + "loss": 2.551, + "theoretical_loss": 3.48271600208097, + "tokens_seen": 1673210880 + }, + { + "epoch": 5.06, + "learning_rate": 0.000249037111334002, + "loss": 2.6624, + "theoretical_loss": 3.4827042258339174, + "tokens_seen": 1673276416 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024902708124373117, + "loss": 2.7077, + "theoretical_loss": 3.4826924501772267, + "tokens_seen": 1673341952 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024901705115346035, + "loss": 2.6516, + "theoretical_loss": 3.4826806751108452, + "tokens_seen": 1673407488 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002490070210631896, + "loss": 2.6348, + "theoretical_loss": 3.48266890063472, + "tokens_seen": 1673473024 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024899699097291877, + "loss": 2.5727, + "theoretical_loss": 3.4826571267487982, + "tokens_seen": 1673538560 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024898696088264795, + "loss": 2.6742, + "theoretical_loss": 3.4826453534530275, + "tokens_seen": 1673604096 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024897693079237713, + "loss": 2.6143, + "theoretical_loss": 3.4826335807473554, + "tokens_seen": 1673669632 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002489669007021063, + "loss": 2.6825, + "theoretical_loss": 3.4826218086317287, + "tokens_seen": 1673735168 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002489568706118355, + "loss": 2.649, + "theoretical_loss": 3.482610037106095, + "tokens_seen": 1673800704 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002489468405215647, + "loss": 2.7613, + "theoretical_loss": 3.482598266170401, + "tokens_seen": 1673866240 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024893681043129385, + "loss": 2.8634, + "theoretical_loss": 3.482586495824595, + "tokens_seen": 1673931776 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024892678034102304, + "loss": 2.6895, + "theoretical_loss": 3.4825747260686244, + "tokens_seen": 1673997312 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024891675025075227, + "loss": 2.8027, + "theoretical_loss": 3.4825629569024352, + "tokens_seen": 1674062848 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024890672016048145, + "loss": 2.6478, + "theoretical_loss": 3.482551188325976, + "tokens_seen": 1674128384 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024889669007021063, + "loss": 2.6996, + "theoretical_loss": 3.4825394203391937, + "tokens_seen": 1674193920 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002488866599799398, + "loss": 2.5905, + "theoretical_loss": 3.482527652942036, + "tokens_seen": 1674259456 + }, + { + "epoch": 5.06, + "learning_rate": 0.000248876629889669, + "loss": 2.6693, + "theoretical_loss": 3.4825158861344505, + "tokens_seen": 1674324992 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024886659979939823, + "loss": 2.6493, + "theoretical_loss": 3.482504119916383, + "tokens_seen": 1674390528 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1861891, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5499491691589355, + "objective/train/theoretical_loss": 3.4824952956396737, + "objective/train/tokens_used": 1694899680, + "theoretical_loss": 3.4824952956396737, + "tokens_seen": 1674439680 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002488565697091274, + "loss": 2.8284, + "theoretical_loss": 3.482492354287783, + "tokens_seen": 1674456064 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002488465396188566, + "loss": 2.7191, + "theoretical_loss": 3.4824805892485964, + "tokens_seen": 1674521600 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002488365095285858, + "loss": 2.554, + "theoretical_loss": 3.4824688247987714, + "tokens_seen": 1674587136 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024882647943831496, + "loss": 2.4408, + "theoretical_loss": 3.4824570609382555, + "tokens_seen": 1674652672 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024881644934804414, + "loss": 2.7672, + "theoretical_loss": 3.4824452976669953, + "tokens_seen": 1674718208 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002488064192577733, + "loss": 2.8057, + "theoretical_loss": 3.4824335349849385, + "tokens_seen": 1674783744 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002487963891675025, + "loss": 2.6273, + "theoretical_loss": 3.4824217728920335, + "tokens_seen": 1674849280 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024878635907723173, + "loss": 2.6652, + "theoretical_loss": 3.482410011388226, + "tokens_seen": 1674914816 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002487763289869609, + "loss": 2.7675, + "theoretical_loss": 3.482398250473465, + "tokens_seen": 1674980352 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002487662988966901, + "loss": 2.6411, + "theoretical_loss": 3.4823864901476975, + "tokens_seen": 1675045888 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002487562688064193, + "loss": 2.7621, + "theoretical_loss": 3.4823747304108705, + "tokens_seen": 1675111424 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024874623871614846, + "loss": 2.6126, + "theoretical_loss": 3.4823629712629316, + "tokens_seen": 1675176960 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024873620862587764, + "loss": 2.7368, + "theoretical_loss": 3.482351212703829, + "tokens_seen": 1675242496 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002487261785356068, + "loss": 2.5371, + "theoretical_loss": 3.4823394547335096, + "tokens_seen": 1675308032 + }, + { + "epoch": 5.06, + "learning_rate": 0.000248716148445336, + "loss": 2.7732, + "theoretical_loss": 3.482327697351921, + "tokens_seen": 1675373568 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002487061183550652, + "loss": 2.5345, + "theoretical_loss": 3.48231594055901, + "tokens_seen": 1675439104 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002486960882647944, + "loss": 2.705, + "theoretical_loss": 3.4823041843547253, + "tokens_seen": 1675504640 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002486860581745236, + "loss": 2.635, + "theoretical_loss": 3.482292428739014, + "tokens_seen": 1675570176 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002486760280842528, + "loss": 2.6765, + "theoretical_loss": 3.4822806737118235, + "tokens_seen": 1675635712 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024866599799398196, + "loss": 2.6615, + "theoretical_loss": 3.4822689192731007, + "tokens_seen": 1675701248 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024865596790371114, + "loss": 2.5132, + "theoretical_loss": 3.482257165422794, + "tokens_seen": 1675766784 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002486459378134403, + "loss": 2.6114, + "theoretical_loss": 3.4822454121608506, + "tokens_seen": 1675832320 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002486359077231695, + "loss": 2.5448, + "theoretical_loss": 3.482233659487218, + "tokens_seen": 1675897856 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002486258776328987, + "loss": 2.7996, + "theoretical_loss": 3.482221907401844, + "tokens_seen": 1675963392 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024861584754262787, + "loss": 2.7123, + "theoretical_loss": 3.482210155904676, + "tokens_seen": 1676028928 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1866968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4882490634918213, + "objective/train/theoretical_loss": 3.4822013426677785, + "objective/train/tokens_used": 1696538080, + "theoretical_loss": 3.4822013426677785, + "tokens_seen": 1676078080 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002486058174523571, + "loss": 2.5256, + "theoretical_loss": 3.4821984049956614, + "tokens_seen": 1676094464 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002485957873620863, + "loss": 2.5873, + "theoretical_loss": 3.4821866546747477, + "tokens_seen": 1676160000 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024858575727181546, + "loss": 2.8106, + "theoretical_loss": 3.4821749049418833, + "tokens_seen": 1676225536 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024857572718154465, + "loss": 2.8845, + "theoretical_loss": 3.4821631557970147, + "tokens_seen": 1676291072 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002485656970912738, + "loss": 2.665, + "theoretical_loss": 3.4821514072400905, + "tokens_seen": 1676356608 + }, + { + "epoch": 5.06, + "learning_rate": 0.000248555667001003, + "loss": 2.6054, + "theoretical_loss": 3.4821396592710574, + "tokens_seen": 1676422144 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002485456369107322, + "loss": 2.82, + "theoretical_loss": 3.4821279118898634, + "tokens_seen": 1676487680 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024853560682046137, + "loss": 2.7213, + "theoretical_loss": 3.482116165096456, + "tokens_seen": 1676553216 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024852557673019055, + "loss": 2.848, + "theoretical_loss": 3.482104418890783, + "tokens_seen": 1676618752 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002485155466399198, + "loss": 2.8316, + "theoretical_loss": 3.4820926732727915, + "tokens_seen": 1676684288 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024850551654964897, + "loss": 2.6367, + "theoretical_loss": 3.48208092824243, + "tokens_seen": 1676749824 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024849548645937815, + "loss": 2.8121, + "theoretical_loss": 3.4820691837996454, + "tokens_seen": 1676815360 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024848545636910733, + "loss": 2.6977, + "theoretical_loss": 3.4820574399443855, + "tokens_seen": 1676880896 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002484754262788365, + "loss": 2.769, + "theoretical_loss": 3.4820456966765985, + "tokens_seen": 1676946432 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002484653961885657, + "loss": 2.9141, + "theoretical_loss": 3.4820339539962313, + "tokens_seen": 1677011968 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002484553660982949, + "loss": 2.5645, + "theoretical_loss": 3.482022211903232, + "tokens_seen": 1677077504 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024844533600802405, + "loss": 2.6541, + "theoretical_loss": 3.4820104703975483, + "tokens_seen": 1677143040 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024843530591775324, + "loss": 2.5289, + "theoretical_loss": 3.4819987294791273, + "tokens_seen": 1677208576 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024842527582748247, + "loss": 2.5894, + "theoretical_loss": 3.4819869891479174, + "tokens_seen": 1677274112 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024841524573721165, + "loss": 2.7741, + "theoretical_loss": 3.481975249403866, + "tokens_seen": 1677339648 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024840521564694083, + "loss": 2.7429, + "theoretical_loss": 3.4819635102469206, + "tokens_seen": 1677405184 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024839518555667, + "loss": 2.7622, + "theoretical_loss": 3.4819517716770294, + "tokens_seen": 1677470720 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002483851554663992, + "loss": 2.485, + "theoretical_loss": 3.4819400336941393, + "tokens_seen": 1677536256 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002483751253761284, + "loss": 2.4139, + "theoretical_loss": 3.4819282962981988, + "tokens_seen": 1677601792 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024836509528585756, + "loss": 2.495, + "theoretical_loss": 3.4819165594891555, + "tokens_seen": 1677667328 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1869719, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.232656478881836, + "objective/train/theoretical_loss": 3.481907757267493, + "objective/train/tokens_used": 1698176480, + "theoretical_loss": 3.481907757267493, + "tokens_seen": 1677716480 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024835506519558674, + "loss": 2.6169, + "theoretical_loss": 3.481904823266957, + "tokens_seen": 1677732864 + }, + { + "epoch": 5.06, + "learning_rate": 0.000248345035105316, + "loss": 2.4468, + "theoretical_loss": 3.4818930876315504, + "tokens_seen": 1677798400 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024833500501504516, + "loss": 2.7499, + "theoretical_loss": 3.4818813525828842, + "tokens_seen": 1677863936 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024832497492477434, + "loss": 2.6404, + "theoretical_loss": 3.4818696181209066, + "tokens_seen": 1677929472 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002483149448345035, + "loss": 2.6785, + "theoretical_loss": 3.4818578842455645, + "tokens_seen": 1677995008 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002483049147442327, + "loss": 2.7903, + "theoretical_loss": 3.4818461509568057, + "tokens_seen": 1678060544 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002482948846539619, + "loss": 2.6728, + "theoretical_loss": 3.481834418254578, + "tokens_seen": 1678126080 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024828485456369106, + "loss": 2.5731, + "theoretical_loss": 3.48182268613883, + "tokens_seen": 1678191616 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024827482447342024, + "loss": 2.6582, + "theoretical_loss": 3.481810954609508, + "tokens_seen": 1678257152 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002482647943831494, + "loss": 2.4655, + "theoretical_loss": 3.4817992236665614, + "tokens_seen": 1678322688 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024825476429287866, + "loss": 2.8264, + "theoretical_loss": 3.481787493309937, + "tokens_seen": 1678388224 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024824473420260784, + "loss": 2.7262, + "theoretical_loss": 3.4817757635395825, + "tokens_seen": 1678453760 + }, + { + "epoch": 5.06, + "learning_rate": 0.000248234704112337, + "loss": 2.7272, + "theoretical_loss": 3.4817640343554466, + "tokens_seen": 1678519296 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002482246740220662, + "loss": 2.6597, + "theoretical_loss": 3.4817523057574764, + "tokens_seen": 1678584832 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002482146439317954, + "loss": 2.9039, + "theoretical_loss": 3.4817405777456196, + "tokens_seen": 1678650368 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024820461384152456, + "loss": 2.7093, + "theoretical_loss": 3.4817288503198243, + "tokens_seen": 1678715904 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024819458375125375, + "loss": 2.6062, + "theoretical_loss": 3.4817171234800384, + "tokens_seen": 1678781440 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002481845536609829, + "loss": 2.6626, + "theoretical_loss": 3.48170539722621, + "tokens_seen": 1678846976 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002481745235707121, + "loss": 2.6459, + "theoretical_loss": 3.4816936715582862, + "tokens_seen": 1678912512 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024816449348044134, + "loss": 2.9435, + "theoretical_loss": 3.4816819464762156, + "tokens_seen": 1678978048 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002481544633901705, + "loss": 2.599, + "theoretical_loss": 3.4816702219799454, + "tokens_seen": 1679043584 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002481444332998997, + "loss": 2.5981, + "theoretical_loss": 3.4816584980694243, + "tokens_seen": 1679109120 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002481344032096289, + "loss": 2.7337, + "theoretical_loss": 3.481646774744599, + "tokens_seen": 1679174656 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024812437311935807, + "loss": 2.5465, + "theoretical_loss": 3.4816350520054185, + "tokens_seen": 1679240192 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002481143430290873, + "loss": 2.6617, + "theoretical_loss": 3.4816233298518307, + "tokens_seen": 1679305728 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1872660, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.565626382827759, + "objective/train/theoretical_loss": 3.481614538620903, + "objective/train/tokens_used": 1699814880, + "theoretical_loss": 3.481614538620903, + "tokens_seen": 1679354880 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002481043129388165, + "loss": 2.6087, + "theoretical_loss": 3.4816116082837825, + "tokens_seen": 1679371264 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024809428284854566, + "loss": 2.6264, + "theoretical_loss": 3.4815998873012224, + "tokens_seen": 1679436800 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024808425275827485, + "loss": 2.5967, + "theoretical_loss": 3.4815881669040984, + "tokens_seen": 1679502336 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024807422266800403, + "loss": 2.6033, + "theoretical_loss": 3.4815764470923583, + "tokens_seen": 1679567872 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002480641925777332, + "loss": 2.7472, + "theoretical_loss": 3.4815647278659503, + "tokens_seen": 1679633408 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002480541624874624, + "loss": 2.7379, + "theoretical_loss": 3.4815530092248217, + "tokens_seen": 1679698944 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024804413239719157, + "loss": 2.6267, + "theoretical_loss": 3.4815412911689205, + "tokens_seen": 1679764480 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024803410230692075, + "loss": 2.5733, + "theoretical_loss": 3.4815295736981953, + "tokens_seen": 1679830016 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024802407221665, + "loss": 2.7833, + "theoretical_loss": 3.4815178568125935, + "tokens_seen": 1679895552 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024801404212637917, + "loss": 2.5866, + "theoretical_loss": 3.4815061405120638, + "tokens_seen": 1679961088 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024800401203610835, + "loss": 2.5243, + "theoretical_loss": 3.481494424796553, + "tokens_seen": 1680026624 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024799398194583753, + "loss": 2.643, + "theoretical_loss": 3.4814827096660097, + "tokens_seen": 1680092160 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002479839518555667, + "loss": 2.3232, + "theoretical_loss": 3.481470995120382, + "tokens_seen": 1680157696 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002479739217652959, + "loss": 2.944, + "theoretical_loss": 3.4814592811596174, + "tokens_seen": 1680223232 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002479638916750251, + "loss": 2.6646, + "theoretical_loss": 3.4814475677836647, + "tokens_seen": 1680288768 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024795386158475425, + "loss": 2.6038, + "theoretical_loss": 3.4814358549924713, + "tokens_seen": 1680354304 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024794383149448344, + "loss": 2.3074, + "theoretical_loss": 3.4814241427859853, + "tokens_seen": 1680419840 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024793380140421267, + "loss": 2.4174, + "theoretical_loss": 3.4814124311641548, + "tokens_seen": 1680485376 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024792377131394185, + "loss": 2.7789, + "theoretical_loss": 3.4814007201269277, + "tokens_seen": 1680550912 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024791374122367103, + "loss": 2.6737, + "theoretical_loss": 3.4813890096742517, + "tokens_seen": 1680616448 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002479037111334002, + "loss": 2.4805, + "theoretical_loss": 3.4813772998060752, + "tokens_seen": 1680681984 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002478936810431294, + "loss": 2.6141, + "theoretical_loss": 3.481365590522347, + "tokens_seen": 1680747520 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002478836509528586, + "loss": 2.6539, + "theoretical_loss": 3.4813538818230136, + "tokens_seen": 1680813056 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024787362086258776, + "loss": 2.7768, + "theoretical_loss": 3.4813421737080237, + "tokens_seen": 1680878592 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024786359077231694, + "loss": 2.7976, + "theoretical_loss": 3.481330466177326, + "tokens_seen": 1680944128 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1873994, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5642471313476562, + "objective/train/theoretical_loss": 3.4813216859127127, + "objective/train/tokens_used": 1701453280, + "theoretical_loss": 3.4813216859127127, + "tokens_seen": 1680993280 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002478535606820462, + "loss": 2.6382, + "theoretical_loss": 3.481318759230868, + "tokens_seen": 1681009664 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024784353059177536, + "loss": 2.6304, + "theoretical_loss": 3.4813070528685977, + "tokens_seen": 1681075200 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024783350050150454, + "loss": 2.6522, + "theoretical_loss": 3.4812953470904633, + "tokens_seen": 1681140736 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002478234704112337, + "loss": 2.5529, + "theoretical_loss": 3.481283641896413, + "tokens_seen": 1681206272 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002478134403209629, + "loss": 2.7005, + "theoretical_loss": 3.481271937286394, + "tokens_seen": 1681271808 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002478034102306921, + "loss": 2.7119, + "theoretical_loss": 3.4812602332603557, + "tokens_seen": 1681337344 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024779338014042126, + "loss": 2.8493, + "theoretical_loss": 3.4812485298182456, + "tokens_seen": 1681402880 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024778335005015044, + "loss": 2.6648, + "theoretical_loss": 3.481236826960012, + "tokens_seen": 1681468416 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002477733199598796, + "loss": 2.6848, + "theoretical_loss": 3.4812251246856025, + "tokens_seen": 1681533952 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024776328986960886, + "loss": 2.6678, + "theoretical_loss": 3.4812134229949665, + "tokens_seen": 1681599488 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024775325977933804, + "loss": 2.5974, + "theoretical_loss": 3.48120172188805, + "tokens_seen": 1681665024 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002477432296890672, + "loss": 2.7596, + "theoretical_loss": 3.481190021364803, + "tokens_seen": 1681730560 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002477331995987964, + "loss": 2.6662, + "theoretical_loss": 3.4811783214251735, + "tokens_seen": 1681796096 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002477231695085256, + "loss": 2.5193, + "theoretical_loss": 3.481166622069108, + "tokens_seen": 1681861632 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024771313941825476, + "loss": 2.6158, + "theoretical_loss": 3.4811549232965566, + "tokens_seen": 1681927168 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024770310932798395, + "loss": 2.6327, + "theoretical_loss": 3.4811432251074668, + "tokens_seen": 1681992704 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002476930792377131, + "loss": 2.7649, + "theoretical_loss": 3.4811315275017862, + "tokens_seen": 1682058240 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002476830491474423, + "loss": 2.7189, + "theoretical_loss": 3.4811198304794635, + "tokens_seen": 1682123776 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024767301905717154, + "loss": 2.8346, + "theoretical_loss": 3.481108134040447, + "tokens_seen": 1682189312 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002476629889669007, + "loss": 2.6257, + "theoretical_loss": 3.481096438184684, + "tokens_seen": 1682254848 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002476529588766299, + "loss": 2.6779, + "theoretical_loss": 3.4810847429121243, + "tokens_seen": 1682320384 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002476429287863591, + "loss": 2.4946, + "theoretical_loss": 3.481073048222715, + "tokens_seen": 1682385920 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024763289869608827, + "loss": 2.7373, + "theoretical_loss": 3.481061354116404, + "tokens_seen": 1682451456 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024762286860581745, + "loss": 2.7842, + "theoretical_loss": 3.4810496605931407, + "tokens_seen": 1682516992 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024761283851554663, + "loss": 2.6967, + "theoretical_loss": 3.481037967652872, + "tokens_seen": 1682582528 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1874790, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5453438758850098, + "objective/train/theoretical_loss": 3.48102919833023, + "objective/train/tokens_used": 1703091680, + "theoretical_loss": 3.48102919833023, + "tokens_seen": 1682631680 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002476028084252758, + "loss": 2.7356, + "theoretical_loss": 3.4810262752955468, + "tokens_seen": 1682648064 + }, + { + "epoch": 5.06, + "learning_rate": 0.000247592778335005, + "loss": 2.5842, + "theoretical_loss": 3.4810145835211133, + "tokens_seen": 1682713600 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024758274824473423, + "loss": 2.7985, + "theoretical_loss": 3.48100289232952, + "tokens_seen": 1682779136 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002475727181544634, + "loss": 2.9402, + "theoretical_loss": 3.4809912017207147, + "tokens_seen": 1682844672 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002475626880641926, + "loss": 2.6934, + "theoretical_loss": 3.4809795116946463, + "tokens_seen": 1682910208 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024755265797392177, + "loss": 2.4446, + "theoretical_loss": 3.4809678222512623, + "tokens_seen": 1682975744 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024754262788365095, + "loss": 2.6008, + "theoretical_loss": 3.480956133390511, + "tokens_seen": 1683041280 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024753259779338013, + "loss": 2.6511, + "theoretical_loss": 3.480944445112341, + "tokens_seen": 1683106816 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002475225677031093, + "loss": 2.6042, + "theoretical_loss": 3.4809327574167006, + "tokens_seen": 1683172352 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002475125376128385, + "loss": 2.5817, + "theoretical_loss": 3.4809210703035376, + "tokens_seen": 1683237888 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002475025075225677, + "loss": 2.6429, + "theoretical_loss": 3.4809093837728016, + "tokens_seen": 1683303424 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002474924774322969, + "loss": 2.7739, + "theoretical_loss": 3.4808976978244397, + "tokens_seen": 1683368960 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002474824473420261, + "loss": 2.4947, + "theoretical_loss": 3.4808860124584005, + "tokens_seen": 1683434496 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002474724172517553, + "loss": 2.6598, + "theoretical_loss": 3.4808743276746315, + "tokens_seen": 1683500032 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024746238716148445, + "loss": 2.5955, + "theoretical_loss": 3.480862643473083, + "tokens_seen": 1683565568 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024745235707121364, + "loss": 2.4825, + "theoretical_loss": 3.4808509598537016, + "tokens_seen": 1683631104 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002474423269809428, + "loss": 2.7375, + "theoretical_loss": 3.480839276816436, + "tokens_seen": 1683696640 + }, + { + "epoch": 5.06, + "learning_rate": 0.000247432296890672, + "loss": 2.6268, + "theoretical_loss": 3.4808275943612355, + "tokens_seen": 1683762176 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002474222668004012, + "loss": 2.5124, + "theoretical_loss": 3.4808159124880467, + "tokens_seen": 1683827712 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024741223671013036, + "loss": 2.5939, + "theoretical_loss": 3.48080423119682, + "tokens_seen": 1683893248 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002474022066198596, + "loss": 2.5684, + "theoretical_loss": 3.480792550487502, + "tokens_seen": 1683958784 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002473921765295888, + "loss": 2.6595, + "theoretical_loss": 3.480780870360042, + "tokens_seen": 1684024320 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024738214643931796, + "loss": 2.4989, + "theoretical_loss": 3.480769190814388, + "tokens_seen": 1684089856 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024737211634904714, + "loss": 2.655, + "theoretical_loss": 3.4807575118504888, + "tokens_seen": 1684155392 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002473620862587764, + "loss": 2.8025, + "theoretical_loss": 3.4807458334682924, + "tokens_seen": 1684220928 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1876331, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8225319385528564, + "objective/train/theoretical_loss": 3.4807370750633564, + "objective/train/tokens_used": 1704730080, + "theoretical_loss": 3.4807370750633564, + "tokens_seen": 1684270080 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024735205616850556, + "loss": 2.4744, + "theoretical_loss": 3.480734155667747, + "tokens_seen": 1684286464 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024734202607823474, + "loss": 2.8002, + "theoretical_loss": 3.4807224784488016, + "tokens_seen": 1684352000 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002473319959879639, + "loss": 2.5085, + "theoretical_loss": 3.480710801811404, + "tokens_seen": 1684417536 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002473219658976931, + "loss": 2.7414, + "theoretical_loss": 3.4806991257555033, + "tokens_seen": 1684483072 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002473119358074223, + "loss": 2.5254, + "theoretical_loss": 3.4806874502810476, + "tokens_seen": 1684548608 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024730190571715146, + "loss": 2.8898, + "theoretical_loss": 3.480675775387985, + "tokens_seen": 1684614144 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024729187562688064, + "loss": 2.6767, + "theoretical_loss": 3.4806641010762647, + "tokens_seen": 1684679680 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002472818455366098, + "loss": 2.5005, + "theoretical_loss": 3.4806524273458344, + "tokens_seen": 1684745216 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024727181544633906, + "loss": 2.6453, + "theoretical_loss": 3.480640754196642, + "tokens_seen": 1684810752 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024726178535606824, + "loss": 2.6126, + "theoretical_loss": 3.480629081628638, + "tokens_seen": 1684876288 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002472517552657974, + "loss": 2.7029, + "theoretical_loss": 3.480617409641769, + "tokens_seen": 1684941824 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002472417251755266, + "loss": 2.5718, + "theoretical_loss": 3.480605738235984, + "tokens_seen": 1685007360 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002472316950852558, + "loss": 2.6484, + "theoretical_loss": 3.4805940674112317, + "tokens_seen": 1685072896 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024722166499498496, + "loss": 2.5785, + "theoretical_loss": 3.4805823971674603, + "tokens_seen": 1685138432 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024721163490471415, + "loss": 2.631, + "theoretical_loss": 3.4805707275046185, + "tokens_seen": 1685203968 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002472016048144433, + "loss": 2.7035, + "theoretical_loss": 3.4805590584226547, + "tokens_seen": 1685269504 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002471915747241725, + "loss": 2.8303, + "theoretical_loss": 3.4805473899215174, + "tokens_seen": 1685335040 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024718154463390174, + "loss": 2.7087, + "theoretical_loss": 3.480535722001155, + "tokens_seen": 1685400576 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002471715145436309, + "loss": 2.6553, + "theoretical_loss": 3.480524054661516, + "tokens_seen": 1685466112 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002471614844533601, + "loss": 2.7302, + "theoretical_loss": 3.4805123879025492, + "tokens_seen": 1685531648 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002471514543630893, + "loss": 2.7144, + "theoretical_loss": 3.480500721724203, + "tokens_seen": 1685597184 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024714142427281847, + "loss": 2.5567, + "theoretical_loss": 3.4804890561264257, + "tokens_seen": 1685662720 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024713139418254765, + "loss": 2.5962, + "theoretical_loss": 3.4804773911091664, + "tokens_seen": 1685728256 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024712136409227683, + "loss": 2.5234, + "theoretical_loss": 3.480465726672373, + "tokens_seen": 1685793792 + }, + { + "epoch": 5.06, + "learning_rate": 0.000247111334002006, + "loss": 2.7882, + "theoretical_loss": 3.4804540628159946, + "tokens_seen": 1685859328 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1877076, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.837002992630005, + "objective/train/theoretical_loss": 3.4804453153045767, + "objective/train/tokens_used": 1706368480, + "theoretical_loss": 3.4804453153045767, + "tokens_seen": 1685908480 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002471013039117352, + "loss": 2.7338, + "theoretical_loss": 3.4804423995399794, + "tokens_seen": 1685924864 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024709127382146443, + "loss": 2.6317, + "theoretical_loss": 3.4804307368442755, + "tokens_seen": 1685990400 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002470812437311936, + "loss": 2.7775, + "theoretical_loss": 3.4804190747288324, + "tokens_seen": 1686055936 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002470712136409228, + "loss": 2.5035, + "theoretical_loss": 3.4804074131935985, + "tokens_seen": 1686121472 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024706118355065197, + "loss": 2.5781, + "theoretical_loss": 3.480395752238522, + "tokens_seen": 1686187008 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024705115346038115, + "loss": 2.6215, + "theoretical_loss": 3.4803840918635514, + "tokens_seen": 1686252544 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024704112337011033, + "loss": 2.8578, + "theoretical_loss": 3.480372432068636, + "tokens_seen": 1686318080 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002470310932798395, + "loss": 2.5608, + "theoretical_loss": 3.480360772853724, + "tokens_seen": 1686383616 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002470210631895687, + "loss": 2.5256, + "theoretical_loss": 3.4803491142187637, + "tokens_seen": 1686449152 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002470110330992979, + "loss": 2.7507, + "theoretical_loss": 3.4803374561637037, + "tokens_seen": 1686514688 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002470010030090271, + "loss": 2.5409, + "theoretical_loss": 3.4803257986884937, + "tokens_seen": 1686580224 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002469909729187563, + "loss": 2.741, + "theoretical_loss": 3.480314141793081, + "tokens_seen": 1686645760 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002469809428284855, + "loss": 2.7925, + "theoretical_loss": 3.4803024854774147, + "tokens_seen": 1686711296 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024697091273821466, + "loss": 2.6954, + "theoretical_loss": 3.4802908297414437, + "tokens_seen": 1686776832 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024696088264794384, + "loss": 2.6802, + "theoretical_loss": 3.4802791745851165, + "tokens_seen": 1686842368 + }, + { + "epoch": 5.06, + "learning_rate": 0.000246950852557673, + "loss": 2.5237, + "theoretical_loss": 3.4802675200083817, + "tokens_seen": 1686907904 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002469408224674022, + "loss": 2.8827, + "theoretical_loss": 3.4802558660111877, + "tokens_seen": 1686973440 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002469307923771314, + "loss": 2.5404, + "theoretical_loss": 3.480244212593484, + "tokens_seen": 1687038976 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024692076228686056, + "loss": 2.9592, + "theoretical_loss": 3.4802325597552186, + "tokens_seen": 1687104512 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002469107321965898, + "loss": 2.5541, + "theoretical_loss": 3.4802209074963404, + "tokens_seen": 1687170048 + }, + { + "epoch": 5.06, + "learning_rate": 0.000246900702106319, + "loss": 2.6262, + "theoretical_loss": 3.4802092558167974, + "tokens_seen": 1687235584 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024689067201604816, + "loss": 2.5708, + "theoretical_loss": 3.4801976047165395, + "tokens_seen": 1687301120 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024688064192577734, + "loss": 2.5384, + "theoretical_loss": 3.4801859541955147, + "tokens_seen": 1687366656 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002468706118355065, + "loss": 2.4918, + "theoretical_loss": 3.480174304253672, + "tokens_seen": 1687432192 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002468605817452357, + "loss": 2.5022, + "theoretical_loss": 3.480162654890959, + "tokens_seen": 1687497728 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1878236, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3599436283111572, + "objective/train/theoretical_loss": 3.4801539182489485, + "objective/train/tokens_used": 1708006880, + "theoretical_loss": 3.4801539182489485, + "tokens_seen": 1687546880 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002468505516549649, + "loss": 2.5223, + "theoretical_loss": 3.4801510061073264, + "tokens_seen": 1687563264 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024684052156469406, + "loss": 2.7041, + "theoretical_loss": 3.4801393579027216, + "tokens_seen": 1687628800 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024683049147442325, + "loss": 2.5553, + "theoretical_loss": 3.4801277102770936, + "tokens_seen": 1687694336 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002468204613841525, + "loss": 2.4113, + "theoretical_loss": 3.480116063230391, + "tokens_seen": 1687759872 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024681043129388166, + "loss": 2.5911, + "theoretical_loss": 3.4801044167625625, + "tokens_seen": 1687825408 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024680040120361084, + "loss": 2.464, + "theoretical_loss": 3.4800927708735574, + "tokens_seen": 1687890944 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024679037111334, + "loss": 2.6288, + "theoretical_loss": 3.4800811255633244, + "tokens_seen": 1687956480 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002467803410230692, + "loss": 2.6166, + "theoretical_loss": 3.480069480831811, + "tokens_seen": 1688022016 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002467703109327984, + "loss": 2.7455, + "theoretical_loss": 3.480057836678968, + "tokens_seen": 1688087552 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024676028084252757, + "loss": 2.5236, + "theoretical_loss": 3.4800461931047426, + "tokens_seen": 1688153088 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024675025075225675, + "loss": 2.518, + "theoretical_loss": 3.4800345501090844, + "tokens_seen": 1688218624 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024674022066198593, + "loss": 2.7516, + "theoretical_loss": 3.480022907691941, + "tokens_seen": 1688284160 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024673019057171516, + "loss": 2.6459, + "theoretical_loss": 3.480011265853263, + "tokens_seen": 1688349696 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024672016048144435, + "loss": 2.496, + "theoretical_loss": 3.479999624592998, + "tokens_seen": 1688415232 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002467101303911735, + "loss": 2.631, + "theoretical_loss": 3.4799879839110957, + "tokens_seen": 1688480768 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002467001003009027, + "loss": 2.666, + "theoretical_loss": 3.479976343807504, + "tokens_seen": 1688546304 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002466900702106319, + "loss": 2.7148, + "theoretical_loss": 3.479964704282172, + "tokens_seen": 1688611840 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024668004012036107, + "loss": 2.472, + "theoretical_loss": 3.4799530653350486, + "tokens_seen": 1688677376 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024667001003009025, + "loss": 2.6537, + "theoretical_loss": 3.4799414269660827, + "tokens_seen": 1688742912 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024665997993981943, + "loss": 2.7607, + "theoretical_loss": 3.4799297891752228, + "tokens_seen": 1688808448 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002466499498495486, + "loss": 2.4073, + "theoretical_loss": 3.4799181519624183, + "tokens_seen": 1688873984 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024663991975927785, + "loss": 2.6109, + "theoretical_loss": 3.4799065153276176, + "tokens_seen": 1688939520 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024662988966900703, + "loss": 2.5761, + "theoretical_loss": 3.4798948792707702, + "tokens_seen": 1689005056 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002466198595787362, + "loss": 2.7118, + "theoretical_loss": 3.479883243791824, + "tokens_seen": 1689070592 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002466098294884654, + "loss": 2.5484, + "theoretical_loss": 3.479871608890729, + "tokens_seen": 1689136128 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1878889, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8874025344848633, + "objective/train/theoretical_loss": 3.4798628830940914, + "objective/train/tokens_used": 1709645280, + "theoretical_loss": 3.4798628830940914, + "tokens_seen": 1689185280 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024659979939819463, + "loss": 2.6763, + "theoretical_loss": 3.4798599745674332, + "tokens_seen": 1689201664 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002465897693079238, + "loss": 2.5434, + "theoretical_loss": 3.479848340821886, + "tokens_seen": 1689267200 + }, + { + "epoch": 5.06, + "learning_rate": 0.000246579739217653, + "loss": 2.5924, + "theoretical_loss": 3.4798367076540355, + "tokens_seen": 1689332736 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024656970912738217, + "loss": 2.7583, + "theoretical_loss": 3.479825075063832, + "tokens_seen": 1689398272 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024655967903711135, + "loss": 2.6424, + "theoretical_loss": 3.4798134430512233, + "tokens_seen": 1689463808 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024654964894684053, + "loss": 2.5617, + "theoretical_loss": 3.4798018116161584, + "tokens_seen": 1689529344 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002465396188565697, + "loss": 2.7188, + "theoretical_loss": 3.4797901807585863, + "tokens_seen": 1689594880 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002465295887662989, + "loss": 2.6075, + "theoretical_loss": 3.479778550478456, + "tokens_seen": 1689660416 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002465195586760281, + "loss": 2.8712, + "theoretical_loss": 3.479766920775717, + "tokens_seen": 1689725952 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002465095285857573, + "loss": 2.6307, + "theoretical_loss": 3.479755291650318, + "tokens_seen": 1689791488 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002464994984954865, + "loss": 2.641, + "theoretical_loss": 3.479743663102207, + "tokens_seen": 1689857024 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002464894684052157, + "loss": 2.8352, + "theoretical_loss": 3.479732035131334, + "tokens_seen": 1689922560 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024647943831494486, + "loss": 2.2635, + "theoretical_loss": 3.4797204077376476, + "tokens_seen": 1689988096 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024646940822467404, + "loss": 2.5352, + "theoretical_loss": 3.4797087809210967, + "tokens_seen": 1690053632 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002464593781344032, + "loss": 2.6277, + "theoretical_loss": 3.4796971546816304, + "tokens_seen": 1690119168 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002464493480441324, + "loss": 2.602, + "theoretical_loss": 3.4796855290191977, + "tokens_seen": 1690184704 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002464393179538616, + "loss": 2.8406, + "theoretical_loss": 3.4796739039337474, + "tokens_seen": 1690250240 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024642928786359076, + "loss": 2.464, + "theoretical_loss": 3.479662279425229, + "tokens_seen": 1690315776 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024641925777332, + "loss": 2.6208, + "theoretical_loss": 3.4796506554935904, + "tokens_seen": 1690381312 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002464092276830492, + "loss": 2.6901, + "theoretical_loss": 3.479639032138782, + "tokens_seen": 1690446848 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024639919759277836, + "loss": 2.7325, + "theoretical_loss": 3.4796274093607518, + "tokens_seen": 1690512384 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024638916750250754, + "loss": 2.5599, + "theoretical_loss": 3.4796157871594495, + "tokens_seen": 1690577920 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002463791374122367, + "loss": 2.5645, + "theoretical_loss": 3.4796041655348233, + "tokens_seen": 1690643456 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002463691073219659, + "loss": 2.5906, + "theoretical_loss": 3.479592544486823, + "tokens_seen": 1690708992 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002463590772316951, + "loss": 2.7846, + "theoretical_loss": 3.4795809240153974, + "tokens_seen": 1690774528 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1880208, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2902045249938965, + "objective/train/theoretical_loss": 3.4795722090401746, + "objective/train/tokens_used": 1711283680, + "theoretical_loss": 3.4795722090401746, + "tokens_seen": 1690823680 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024634904714142426, + "loss": 2.4409, + "theoretical_loss": 3.4795693041204956, + "tokens_seen": 1690840064 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024633901705115345, + "loss": 2.7185, + "theoretical_loss": 3.4795576848020664, + "tokens_seen": 1690905600 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002463289869608827, + "loss": 2.661, + "theoretical_loss": 3.479546066060059, + "tokens_seen": 1690971136 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024631895687061186, + "loss": 2.5703, + "theoretical_loss": 3.479534447894423, + "tokens_seen": 1691036672 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024630892678034104, + "loss": 2.8567, + "theoretical_loss": 3.4795228303051062, + "tokens_seen": 1691102208 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002462988966900702, + "loss": 2.6391, + "theoretical_loss": 3.479511213292059, + "tokens_seen": 1691167744 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002462888665997994, + "loss": 2.789, + "theoretical_loss": 3.47949959685523, + "tokens_seen": 1691233280 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002462788365095286, + "loss": 2.8185, + "theoretical_loss": 3.4794879809945676, + "tokens_seen": 1691298816 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024626880641925777, + "loss": 2.5201, + "theoretical_loss": 3.479476365710022, + "tokens_seen": 1691364352 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024625877632898695, + "loss": 2.5299, + "theoretical_loss": 3.4794647510015415, + "tokens_seen": 1691429888 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024624874623871613, + "loss": 2.6722, + "theoretical_loss": 3.4794531368690755, + "tokens_seen": 1691495424 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024623871614844536, + "loss": 2.5748, + "theoretical_loss": 3.4794415233125733, + "tokens_seen": 1691560960 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024622868605817455, + "loss": 2.5648, + "theoretical_loss": 3.4794299103319846, + "tokens_seen": 1691626496 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002462186559679037, + "loss": 2.6098, + "theoretical_loss": 3.479418297927257, + "tokens_seen": 1691692032 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002462086258776329, + "loss": 2.6492, + "theoretical_loss": 3.479406686098341, + "tokens_seen": 1691757568 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002461985957873621, + "loss": 2.733, + "theoretical_loss": 3.4793950748451845, + "tokens_seen": 1691823104 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024618856569709127, + "loss": 2.767, + "theoretical_loss": 3.4793834641677375, + "tokens_seen": 1691888640 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024617853560682045, + "loss": 2.7365, + "theoretical_loss": 3.479371854065949, + "tokens_seen": 1691954176 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024616850551654963, + "loss": 2.6511, + "theoretical_loss": 3.4793602445397687, + "tokens_seen": 1692019712 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002461584754262788, + "loss": 2.6388, + "theoretical_loss": 3.4793486355891448, + "tokens_seen": 1692085248 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024614844533600805, + "loss": 2.5312, + "theoretical_loss": 3.4793370272140267, + "tokens_seen": 1692150784 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024613841524573723, + "loss": 2.4346, + "theoretical_loss": 3.479325419414364, + "tokens_seen": 1692216320 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002461283851554664, + "loss": 2.655, + "theoretical_loss": 3.4793138121901057, + "tokens_seen": 1692281856 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002461183550651956, + "loss": 2.7167, + "theoretical_loss": 3.479302205541201, + "tokens_seen": 1692347392 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002461083249749248, + "loss": 2.4752, + "theoretical_loss": 3.4792905994675993, + "tokens_seen": 1692412928 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1880970, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6470119953155518, + "objective/train/theoretical_loss": 3.4792818952899096, + "objective/train/tokens_used": 1712922080, + "theoretical_loss": 3.4792818952899096, + "tokens_seen": 1692462080 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024609829488465395, + "loss": 2.7106, + "theoretical_loss": 3.479278993969249, + "tokens_seen": 1692478464 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024608826479438314, + "loss": 2.5693, + "theoretical_loss": 3.4792673890461003, + "tokens_seen": 1692544000 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002460782347041123, + "loss": 2.3347, + "theoretical_loss": 3.4792557846981023, + "tokens_seen": 1692609536 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002460682046138415, + "loss": 2.829, + "theoretical_loss": 3.4792441809252033, + "tokens_seen": 1692675072 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024605817452357073, + "loss": 2.5794, + "theoretical_loss": 3.4792325777273536, + "tokens_seen": 1692740608 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002460481444332999, + "loss": 2.8959, + "theoretical_loss": 3.479220975104502, + "tokens_seen": 1692806144 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002460381143430291, + "loss": 2.6899, + "theoretical_loss": 3.4792093730565976, + "tokens_seen": 1692871680 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002460280842527583, + "loss": 2.6765, + "theoretical_loss": 3.47919777158359, + "tokens_seen": 1692937216 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024601805416248746, + "loss": 2.4803, + "theoretical_loss": 3.479186170685428, + "tokens_seen": 1693002752 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024600802407221664, + "loss": 2.891, + "theoretical_loss": 3.479174570362061, + "tokens_seen": 1693068288 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002459979939819458, + "loss": 2.7209, + "theoretical_loss": 3.479162970613439, + "tokens_seen": 1693133824 + }, + { + "epoch": 5.06, + "learning_rate": 0.000245987963891675, + "loss": 2.5663, + "theoretical_loss": 3.4791513714395106, + "tokens_seen": 1693199360 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002459779338014042, + "loss": 2.6018, + "theoretical_loss": 3.4791397728402247, + "tokens_seen": 1693264896 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002459679037111334, + "loss": 2.5586, + "theoretical_loss": 3.4791281748155316, + "tokens_seen": 1693330432 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002459578736208626, + "loss": 2.5292, + "theoretical_loss": 3.47911657736538, + "tokens_seen": 1693395968 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002459478435305918, + "loss": 2.7144, + "theoretical_loss": 3.479104980489719, + "tokens_seen": 1693461504 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024593781344032096, + "loss": 2.5481, + "theoretical_loss": 3.4790933841884986, + "tokens_seen": 1693527040 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024592778335005014, + "loss": 2.5721, + "theoretical_loss": 3.4790817884616674, + "tokens_seen": 1693592576 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002459177532597793, + "loss": 2.6367, + "theoretical_loss": 3.479070193309175, + "tokens_seen": 1693658112 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002459077231695085, + "loss": 2.5987, + "theoretical_loss": 3.479058598730971, + "tokens_seen": 1693723648 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002458976930792377, + "loss": 2.4246, + "theoretical_loss": 3.479047004727004, + "tokens_seen": 1693789184 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024588766298896687, + "loss": 2.5929, + "theoretical_loss": 3.4790354112972244, + "tokens_seen": 1693854720 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002458776328986961, + "loss": 2.6773, + "theoretical_loss": 3.479023818441581, + "tokens_seen": 1693920256 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002458676028084253, + "loss": 2.5382, + "theoretical_loss": 3.4790122261600223, + "tokens_seen": 1693985792 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024585757271815446, + "loss": 2.5502, + "theoretical_loss": 3.4790006344524995, + "tokens_seen": 1694051328 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1881622, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.60414719581604, + "objective/train/theoretical_loss": 3.4789919410485366, + "objective/train/tokens_used": 1714560480, + "theoretical_loss": 3.4789919410485366, + "tokens_seen": 1694100480 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024584754262788365, + "loss": 2.8176, + "theoretical_loss": 3.4789890433189603, + "tokens_seen": 1694116864 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002458375125376129, + "loss": 2.7954, + "theoretical_loss": 3.478977452759355, + "tokens_seen": 1694182400 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024582748244734206, + "loss": 2.623, + "theoretical_loss": 3.4789658627736326, + "tokens_seen": 1694247936 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024581745235707124, + "loss": 2.4837, + "theoretical_loss": 3.478954273361743, + "tokens_seen": 1694313472 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002458074222668004, + "loss": 2.6813, + "theoretical_loss": 3.478942684523635, + "tokens_seen": 1694379008 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002457973921765296, + "loss": 2.6443, + "theoretical_loss": 3.4789310962592586, + "tokens_seen": 1694444544 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002457873620862588, + "loss": 2.5793, + "theoretical_loss": 3.4789195085685622, + "tokens_seen": 1694510080 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024577733199598797, + "loss": 2.8744, + "theoretical_loss": 3.478907921451496, + "tokens_seen": 1694575616 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024576730190571715, + "loss": 2.5411, + "theoretical_loss": 3.4788963349080095, + "tokens_seen": 1694641152 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024575727181544633, + "loss": 2.4227, + "theoretical_loss": 3.478884748938052, + "tokens_seen": 1694706688 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024574724172517556, + "loss": 2.4837, + "theoretical_loss": 3.4788731635415724, + "tokens_seen": 1694772224 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024573721163490475, + "loss": 2.7427, + "theoretical_loss": 3.4788615787185204, + "tokens_seen": 1694837760 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024572718154463393, + "loss": 2.3685, + "theoretical_loss": 3.4788499944688462, + "tokens_seen": 1694903296 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002457171514543631, + "loss": 2.5526, + "theoretical_loss": 3.4788384107924983, + "tokens_seen": 1694968832 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002457071213640923, + "loss": 2.5533, + "theoretical_loss": 3.478826827689427, + "tokens_seen": 1695034368 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024569709127382147, + "loss": 2.4204, + "theoretical_loss": 3.4788152451595806, + "tokens_seen": 1695099904 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024568706118355065, + "loss": 2.4667, + "theoretical_loss": 3.4788036632029096, + "tokens_seen": 1695165440 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024567703109327983, + "loss": 2.6711, + "theoretical_loss": 3.478792081819363, + "tokens_seen": 1695230976 + }, + { + "epoch": 5.06, + "learning_rate": 0.000245667001003009, + "loss": 2.4347, + "theoretical_loss": 3.478780501008891, + "tokens_seen": 1695296512 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024565697091273825, + "loss": 2.4649, + "theoretical_loss": 3.478768920771442, + "tokens_seen": 1695362048 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024564694082246743, + "loss": 2.77, + "theoretical_loss": 3.4787573411069657, + "tokens_seen": 1695427584 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002456369107321966, + "loss": 2.5298, + "theoretical_loss": 3.4787457620154125, + "tokens_seen": 1695493120 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002456268806419258, + "loss": 2.545, + "theoretical_loss": 3.478734183496731, + "tokens_seen": 1695558656 + }, + { + "epoch": 5.06, + "learning_rate": 0.000245616850551655, + "loss": 2.5921, + "theoretical_loss": 3.478722605550871, + "tokens_seen": 1695624192 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024560682046138415, + "loss": 2.542, + "theoretical_loss": 3.4787110281777824, + "tokens_seen": 1695689728 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1883002, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5183184146881104, + "objective/train/theoretical_loss": 3.478702345523817, + "objective/train/tokens_used": 1716198880, + "theoretical_loss": 3.478702345523817, + "tokens_seen": 1695738880 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024559679037111334, + "loss": 2.4763, + "theoretical_loss": 3.4786994513774143, + "tokens_seen": 1695755264 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002455867602808425, + "loss": 2.7363, + "theoretical_loss": 3.478687875149716, + "tokens_seen": 1695820800 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002455767301905717, + "loss": 2.6995, + "theoretical_loss": 3.478676299494638, + "tokens_seen": 1695886336 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024556670010030093, + "loss": 2.5592, + "theoretical_loss": 3.4786647244121287, + "tokens_seen": 1695951872 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002455566700100301, + "loss": 2.5753, + "theoretical_loss": 3.4786531499021383, + "tokens_seen": 1696017408 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002455466399197593, + "loss": 2.9172, + "theoretical_loss": 3.478641575964616, + "tokens_seen": 1696082944 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002455366098294885, + "loss": 2.5561, + "theoretical_loss": 3.478630002599512, + "tokens_seen": 1696148480 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024552657973921766, + "loss": 2.7266, + "theoretical_loss": 3.478618429806775, + "tokens_seen": 1696214016 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024551654964894684, + "loss": 2.5351, + "theoretical_loss": 3.478606857586356, + "tokens_seen": 1696279552 + }, + { + "epoch": 5.06, + "learning_rate": 0.000245506519558676, + "loss": 2.6695, + "theoretical_loss": 3.4785952859382023, + "tokens_seen": 1696345088 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002454964894684052, + "loss": 2.5377, + "theoretical_loss": 3.478583714862266, + "tokens_seen": 1696410624 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002454864593781344, + "loss": 2.4337, + "theoretical_loss": 3.4785721443584947, + "tokens_seen": 1696476160 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002454764292878636, + "loss": 2.5636, + "theoretical_loss": 3.4785605744268393, + "tokens_seen": 1696541696 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002454663991975928, + "loss": 2.5889, + "theoretical_loss": 3.478549005067249, + "tokens_seen": 1696607232 + }, + { + "epoch": 5.06, + "learning_rate": 0.000245456369107322, + "loss": 2.6652, + "theoretical_loss": 3.478537436279673, + "tokens_seen": 1696672768 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024544633901705116, + "loss": 2.7154, + "theoretical_loss": 3.478525868064062, + "tokens_seen": 1696738304 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024543630892678034, + "loss": 2.6535, + "theoretical_loss": 3.478514300420364, + "tokens_seen": 1696803840 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002454262788365095, + "loss": 2.726, + "theoretical_loss": 3.47850273334853, + "tokens_seen": 1696869376 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002454162487462387, + "loss": 2.6327, + "theoretical_loss": 3.4784911668485092, + "tokens_seen": 1696934912 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002454062186559679, + "loss": 2.5269, + "theoretical_loss": 3.4784796009202514, + "tokens_seen": 1697000448 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024539618856569707, + "loss": 2.3751, + "theoretical_loss": 3.478468035563706, + "tokens_seen": 1697065984 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002453861584754263, + "loss": 2.6501, + "theoretical_loss": 3.478456470778823, + "tokens_seen": 1697131520 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002453761283851555, + "loss": 2.6098, + "theoretical_loss": 3.4784449065655516, + "tokens_seen": 1697197056 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024536609829488466, + "loss": 2.2943, + "theoretical_loss": 3.478433342923842, + "tokens_seen": 1697262592 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024535606820461385, + "loss": 2.7368, + "theoretical_loss": 3.478421779853643, + "tokens_seen": 1697328128 + }, + { + "epoch": 5.06, + "objective/train/docs_used": 1883629, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4837749004364014, + "objective/train/theoretical_loss": 3.478413107926018, + "objective/train/tokens_used": 1717837280, + "theoretical_loss": 3.478413107926018, + "tokens_seen": 1697377280 + }, + { + "epoch": 5.06, + "learning_rate": 0.000245346038114343, + "loss": 2.5267, + "theoretical_loss": 3.478410217354906, + "tokens_seen": 1697393664 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002453360080240722, + "loss": 2.4651, + "theoretical_loss": 3.4783986554275783, + "tokens_seen": 1697459200 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002453259779338014, + "loss": 2.6235, + "theoretical_loss": 3.4783870940716115, + "tokens_seen": 1697524736 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024531594784353057, + "loss": 2.6325, + "theoretical_loss": 3.478375533286955, + "tokens_seen": 1697590272 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024530591775325975, + "loss": 2.8083, + "theoretical_loss": 3.4783639730735576, + "tokens_seen": 1697655808 + }, + { + "epoch": 5.06, + "learning_rate": 0.000245295887662989, + "loss": 2.862, + "theoretical_loss": 3.47835241343137, + "tokens_seen": 1697721344 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024528585757271817, + "loss": 2.669, + "theoretical_loss": 3.478340854360342, + "tokens_seen": 1697786880 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024527582748244735, + "loss": 2.7136, + "theoretical_loss": 3.478329295860422, + "tokens_seen": 1697852416 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024526579739217653, + "loss": 2.4787, + "theoretical_loss": 3.4783177379315613, + "tokens_seen": 1697917952 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002452557673019057, + "loss": 2.7976, + "theoretical_loss": 3.4783061805737088, + "tokens_seen": 1697983488 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002452457372116349, + "loss": 2.6688, + "theoretical_loss": 3.478294623786814, + "tokens_seen": 1698049024 + }, + { + "epoch": 5.06, + "learning_rate": 0.0002452357071213641, + "loss": 2.7439, + "theoretical_loss": 3.478283067570828, + "tokens_seen": 1698114560 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024522567703109325, + "loss": 2.6271, + "theoretical_loss": 3.4782715119256995, + "tokens_seen": 1698180096 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024521564694082244, + "loss": 2.5078, + "theoretical_loss": 3.478259956851378, + "tokens_seen": 1698245632 + }, + { + "epoch": 5.06, + "learning_rate": 0.00024520561685055167, + "loss": 2.5927, + "theoretical_loss": 3.4782484023478135, + "tokens_seen": 1698311168 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024519558676028085, + "loss": 2.7185, + "theoretical_loss": 3.4782368484149564, + "tokens_seen": 1698376704 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024518555667001003, + "loss": 2.4333, + "theoretical_loss": 3.478225295052756, + "tokens_seen": 1698442240 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002451755265797392, + "loss": 2.5554, + "theoretical_loss": 3.478213742261162, + "tokens_seen": 1698507776 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002451654964894684, + "loss": 2.7211, + "theoretical_loss": 3.4782021900401254, + "tokens_seen": 1698573312 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002451554663991976, + "loss": 2.3918, + "theoretical_loss": 3.478190638389594, + "tokens_seen": 1698638848 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024514543630892676, + "loss": 2.8187, + "theoretical_loss": 3.4781790873095186, + "tokens_seen": 1698704384 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024513540621865594, + "loss": 2.7536, + "theoretical_loss": 3.4781675367998495, + "tokens_seen": 1698769920 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002451253761283852, + "loss": 2.5379, + "theoretical_loss": 3.4781559868605356, + "tokens_seen": 1698835456 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024511534603811435, + "loss": 2.7946, + "theoretical_loss": 3.478144437491528, + "tokens_seen": 1698900992 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024510531594784354, + "loss": 2.5094, + "theoretical_loss": 3.478132888692775, + "tokens_seen": 1698966528 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1884825, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1551620960235596, + "objective/train/theoretical_loss": 3.47812422746791, + "objective/train/tokens_used": 1719475680, + "theoretical_loss": 3.47812422746791, + "tokens_seen": 1699015680 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002450952858575727, + "loss": 2.7158, + "theoretical_loss": 3.4781213404642273, + "tokens_seen": 1699032064 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002450852557673019, + "loss": 2.671, + "theoretical_loss": 3.478109792805835, + "tokens_seen": 1699097600 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024507522567703113, + "loss": 2.4975, + "theoretical_loss": 3.478098245717547, + "tokens_seen": 1699163136 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002450651955867603, + "loss": 2.4876, + "theoretical_loss": 3.4780866991993142, + "tokens_seen": 1699228672 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002450551654964895, + "loss": 2.4424, + "theoretical_loss": 3.478075153251086, + "tokens_seen": 1699294208 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002450451354062187, + "loss": 2.6092, + "theoretical_loss": 3.478063607872812, + "tokens_seen": 1699359744 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024503510531594786, + "loss": 2.7041, + "theoretical_loss": 3.4780520630644434, + "tokens_seen": 1699425280 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024502507522567704, + "loss": 2.7049, + "theoretical_loss": 3.478040518825928, + "tokens_seen": 1699490816 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002450150451354062, + "loss": 2.6834, + "theoretical_loss": 3.4780289751572173, + "tokens_seen": 1699556352 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002450050150451354, + "loss": 2.4515, + "theoretical_loss": 3.4780174320582606, + "tokens_seen": 1699621888 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002449949849548646, + "loss": 2.8748, + "theoretical_loss": 3.4780058895290082, + "tokens_seen": 1699687424 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002449849548645938, + "loss": 2.5109, + "theoretical_loss": 3.4779943475694095, + "tokens_seen": 1699752960 + }, + { + "epoch": 5.07, + "learning_rate": 0.000244974924774323, + "loss": 2.8826, + "theoretical_loss": 3.4779828061794147, + "tokens_seen": 1699818496 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002449648946840522, + "loss": 2.7469, + "theoretical_loss": 3.4779712653589736, + "tokens_seen": 1699884032 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024495486459378136, + "loss": 2.8803, + "theoretical_loss": 3.477959725108036, + "tokens_seen": 1699949568 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024494483450351054, + "loss": 2.7144, + "theoretical_loss": 3.477948185426553, + "tokens_seen": 1700015104 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002449348044132397, + "loss": 2.533, + "theoretical_loss": 3.4779366463144727, + "tokens_seen": 1700080640 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002449247743229689, + "loss": 2.8896, + "theoretical_loss": 3.477925107771746, + "tokens_seen": 1700146176 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002449147442326981, + "loss": 2.7806, + "theoretical_loss": 3.4779135697983232, + "tokens_seen": 1700211712 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024490471414242727, + "loss": 2.6462, + "theoretical_loss": 3.4779020323941543, + "tokens_seen": 1700277248 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002448946840521565, + "loss": 2.6449, + "theoretical_loss": 3.4778904955591883, + "tokens_seen": 1700342784 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002448846539618857, + "loss": 2.5859, + "theoretical_loss": 3.477878959293376, + "tokens_seen": 1700408320 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024487462387161486, + "loss": 2.7549, + "theoretical_loss": 3.477867423596667, + "tokens_seen": 1700473856 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024486459378134405, + "loss": 2.4488, + "theoretical_loss": 3.477855888469011, + "tokens_seen": 1700539392 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002448545636910732, + "loss": 2.763, + "theoretical_loss": 3.4778443539103594, + "tokens_seen": 1700604928 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1885409, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.284358501434326, + "objective/train/theoretical_loss": 3.477835703364749, + "objective/train/tokens_used": 1721114080, + "theoretical_loss": 3.477835703364749, + "tokens_seen": 1700654080 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002448445336008024, + "loss": 2.5178, + "theoretical_loss": 3.4778328199206605, + "tokens_seen": 1700670464 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002448345035105316, + "loss": 2.6816, + "theoretical_loss": 3.4778212864998657, + "tokens_seen": 1700736000 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024482447342026077, + "loss": 2.5925, + "theoretical_loss": 3.477809753647924, + "tokens_seen": 1700801536 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024481444332998995, + "loss": 2.4707, + "theoretical_loss": 3.477798221364786, + "tokens_seen": 1700867072 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002448044132397192, + "loss": 2.4054, + "theoretical_loss": 3.4777866896504013, + "tokens_seen": 1700932608 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024479438314944837, + "loss": 2.5581, + "theoretical_loss": 3.4777751585047203, + "tokens_seen": 1700998144 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024478435305917755, + "loss": 2.6453, + "theoretical_loss": 3.477763627927693, + "tokens_seen": 1701063680 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024477432296890673, + "loss": 2.6341, + "theoretical_loss": 3.4777520979192693, + "tokens_seen": 1701129216 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002447642928786359, + "loss": 2.6894, + "theoretical_loss": 3.4777405684793994, + "tokens_seen": 1701194752 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002447542627883651, + "loss": 2.6921, + "theoretical_loss": 3.4777290396080334, + "tokens_seen": 1701260288 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002447442326980943, + "loss": 2.5279, + "theoretical_loss": 3.4777175113051206, + "tokens_seen": 1701325824 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024473420260782345, + "loss": 2.7208, + "theoretical_loss": 3.4777059835706123, + "tokens_seen": 1701391360 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024472417251755264, + "loss": 2.4482, + "theoretical_loss": 3.477694456404458, + "tokens_seen": 1701456896 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024471414242728187, + "loss": 2.6528, + "theoretical_loss": 3.4776829298066074, + "tokens_seen": 1701522432 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024470411233701105, + "loss": 2.7029, + "theoretical_loss": 3.477671403777012, + "tokens_seen": 1701587968 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024469408224674023, + "loss": 2.6105, + "theoretical_loss": 3.4776598783156203, + "tokens_seen": 1701653504 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002446840521564694, + "loss": 2.6593, + "theoretical_loss": 3.4776483534223823, + "tokens_seen": 1701719040 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002446740220661986, + "loss": 2.41, + "theoretical_loss": 3.47763682909725, + "tokens_seen": 1701784576 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002446639919759278, + "loss": 2.6067, + "theoretical_loss": 3.4776253053401716, + "tokens_seen": 1701850112 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024465396188565696, + "loss": 2.5346, + "theoretical_loss": 3.4776137821510984, + "tokens_seen": 1701915648 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024464393179538614, + "loss": 2.544, + "theoretical_loss": 3.47760225952998, + "tokens_seen": 1701981184 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002446339017051154, + "loss": 2.5942, + "theoretical_loss": 3.477590737476766, + "tokens_seen": 1702046720 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024462387161484456, + "loss": 2.7062, + "theoretical_loss": 3.4775792159914083, + "tokens_seen": 1702112256 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024461384152457374, + "loss": 2.8202, + "theoretical_loss": 3.477567695073855, + "tokens_seen": 1702177792 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002446038114343029, + "loss": 2.6792, + "theoretical_loss": 3.4775561747240578, + "tokens_seen": 1702243328 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1886438, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9899559020996094, + "objective/train/theoretical_loss": 3.4775475348342693, + "objective/train/tokens_used": 1722752480, + "theoretical_loss": 3.4775475348342693, + "tokens_seen": 1702292480 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002445937813440321, + "loss": 2.5989, + "theoretical_loss": 3.477544654941966, + "tokens_seen": 1702308864 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002445837512537613, + "loss": 2.4137, + "theoretical_loss": 3.47753313572753, + "tokens_seen": 1702374400 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024457372116349046, + "loss": 2.7298, + "theoretical_loss": 3.4775216170806997, + "tokens_seen": 1702439936 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024456369107321964, + "loss": 2.494, + "theoretical_loss": 3.4775100990014263, + "tokens_seen": 1702505472 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002445536609829488, + "loss": 2.6183, + "theoretical_loss": 3.477498581489659, + "tokens_seen": 1702571008 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024454363089267806, + "loss": 2.489, + "theoretical_loss": 3.477487064545348, + "tokens_seen": 1702636544 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024453360080240724, + "loss": 2.704, + "theoretical_loss": 3.477475548168444, + "tokens_seen": 1702702080 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002445235707121364, + "loss": 2.7633, + "theoretical_loss": 3.4774640323588972, + "tokens_seen": 1702767616 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002445135406218656, + "loss": 2.4936, + "theoretical_loss": 3.4774525171166575, + "tokens_seen": 1702833152 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002445035105315948, + "loss": 2.6406, + "theoretical_loss": 3.477441002441675, + "tokens_seen": 1702898688 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024449348044132396, + "loss": 2.6297, + "theoretical_loss": 3.4774294883339003, + "tokens_seen": 1702964224 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024448345035105315, + "loss": 2.7666, + "theoretical_loss": 3.4774179747932834, + "tokens_seen": 1703029760 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002444734202607823, + "loss": 2.5699, + "theoretical_loss": 3.4774064618197746, + "tokens_seen": 1703095296 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002444633901705115, + "loss": 2.6552, + "theoretical_loss": 3.4773949494133243, + "tokens_seen": 1703160832 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024445336008024074, + "loss": 2.4051, + "theoretical_loss": 3.4773834375738826, + "tokens_seen": 1703226368 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002444433299899699, + "loss": 2.79, + "theoretical_loss": 3.4773719263014, + "tokens_seen": 1703291904 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002444332998996991, + "loss": 2.3931, + "theoretical_loss": 3.4773604155958258, + "tokens_seen": 1703357440 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002444232698094283, + "loss": 2.61, + "theoretical_loss": 3.477348905457111, + "tokens_seen": 1703422976 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024441323971915747, + "loss": 2.5557, + "theoretical_loss": 3.4773373958852067, + "tokens_seen": 1703488512 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024440320962888665, + "loss": 2.7362, + "theoretical_loss": 3.4773258868800623, + "tokens_seen": 1703554048 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024439317953861583, + "loss": 2.411, + "theoretical_loss": 3.4773143784416276, + "tokens_seen": 1703619584 + }, + { + "epoch": 5.07, + "learning_rate": 0.000244383149448345, + "loss": 2.5664, + "theoretical_loss": 3.477302870569854, + "tokens_seen": 1703685120 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002443731193580742, + "loss": 2.563, + "theoretical_loss": 3.4772913632646905, + "tokens_seen": 1703750656 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002443630892678034, + "loss": 2.4995, + "theoretical_loss": 3.477279856526089, + "tokens_seen": 1703816192 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002443530591775326, + "loss": 2.8371, + "theoretical_loss": 3.4772683503539987, + "tokens_seen": 1703881728 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1887154, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4530248641967773, + "objective/train/theoretical_loss": 3.477259721096674, + "objective/train/tokens_used": 1724390880, + "theoretical_loss": 3.477259721096674, + "tokens_seen": 1703930880 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002443430290872618, + "loss": 2.5139, + "theoretical_loss": 3.47725684474837, + "tokens_seen": 1703947264 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024433299899699097, + "loss": 2.6165, + "theoretical_loss": 3.477245339709153, + "tokens_seen": 1704012800 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024432296890672015, + "loss": 2.5177, + "theoretical_loss": 3.4772338352362997, + "tokens_seen": 1704078336 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002443129388164494, + "loss": 2.5056, + "theoretical_loss": 3.4772223313297586, + "tokens_seen": 1704143872 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024430290872617857, + "loss": 2.6636, + "theoretical_loss": 3.4772108279894804, + "tokens_seen": 1704209408 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024429287863590775, + "loss": 2.6352, + "theoretical_loss": 3.477199325215416, + "tokens_seen": 1704274944 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024428284854563693, + "loss": 2.6679, + "theoretical_loss": 3.4771878230075153, + "tokens_seen": 1704340480 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002442728184553661, + "loss": 2.5305, + "theoretical_loss": 3.4771763213657287, + "tokens_seen": 1704406016 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002442627883650953, + "loss": 2.5662, + "theoretical_loss": 3.4771648202900067, + "tokens_seen": 1704471552 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002442527582748245, + "loss": 2.7343, + "theoretical_loss": 3.4771533197803, + "tokens_seen": 1704537088 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024424272818455365, + "loss": 2.6617, + "theoretical_loss": 3.4771418198365582, + "tokens_seen": 1704602624 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024423269809428284, + "loss": 2.7319, + "theoretical_loss": 3.4771303204587327, + "tokens_seen": 1704668160 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024422266800401207, + "loss": 2.2481, + "theoretical_loss": 3.4771188216467728, + "tokens_seen": 1704733696 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024421263791374125, + "loss": 2.6707, + "theoretical_loss": 3.47710732340063, + "tokens_seen": 1704799232 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024420260782347043, + "loss": 2.3707, + "theoretical_loss": 3.4770958257202538, + "tokens_seen": 1704864768 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002441925777331996, + "loss": 2.5867, + "theoretical_loss": 3.477084328605595, + "tokens_seen": 1704930304 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002441825476429288, + "loss": 2.8723, + "theoretical_loss": 3.477072832056604, + "tokens_seen": 1704995840 + }, + { + "epoch": 5.07, + "learning_rate": 0.000244172517552658, + "loss": 2.4757, + "theoretical_loss": 3.4770613360732314, + "tokens_seen": 1705061376 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024416248746238716, + "loss": 2.6273, + "theoretical_loss": 3.477049840655427, + "tokens_seen": 1705126912 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024415245737211634, + "loss": 2.5952, + "theoretical_loss": 3.477038345803142, + "tokens_seen": 1705192448 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002441424272818456, + "loss": 2.9498, + "theoretical_loss": 3.4770268515163263, + "tokens_seen": 1705257984 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024413239719157473, + "loss": 2.6461, + "theoretical_loss": 3.477015357794931, + "tokens_seen": 1705323520 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002441223671013039, + "loss": 2.652, + "theoretical_loss": 3.477003864638906, + "tokens_seen": 1705389056 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024411233701103312, + "loss": 2.7553, + "theoretical_loss": 3.476992372048202, + "tokens_seen": 1705454592 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002441023069207623, + "loss": 2.5469, + "theoretical_loss": 3.476980880022769, + "tokens_seen": 1705520128 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1888460, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.141219139099121, + "objective/train/theoretical_loss": 3.476972261374624, + "objective/train/tokens_used": 1726029280, + "theoretical_loss": 3.476972261374624, + "tokens_seen": 1705569280 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024409227683049148, + "loss": 2.5916, + "theoretical_loss": 3.476969388562558, + "tokens_seen": 1705585664 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024408224674022066, + "loss": 2.6427, + "theoretical_loss": 3.4769578976675195, + "tokens_seen": 1705651200 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024407221664994987, + "loss": 2.7488, + "theoretical_loss": 3.476946407337604, + "tokens_seen": 1705716736 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024406218655967905, + "loss": 2.5029, + "theoretical_loss": 3.4769349175727617, + "tokens_seen": 1705782272 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024405215646940823, + "loss": 2.8743, + "theoretical_loss": 3.4769234283729427, + "tokens_seen": 1705847808 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002440421263791374, + "loss": 2.5887, + "theoretical_loss": 3.4769119397380988, + "tokens_seen": 1705913344 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002440320962888666, + "loss": 2.544, + "theoretical_loss": 3.4769004516681794, + "tokens_seen": 1705978880 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002440220661985958, + "loss": 2.5441, + "theoretical_loss": 3.4768889641631353, + "tokens_seen": 1706044416 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024401203610832498, + "loss": 2.5945, + "theoretical_loss": 3.476877477222917, + "tokens_seen": 1706109952 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024400200601805416, + "loss": 2.5346, + "theoretical_loss": 3.4768659908474753, + "tokens_seen": 1706175488 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024399197592778335, + "loss": 2.6742, + "theoretical_loss": 3.4768545050367607, + "tokens_seen": 1706241024 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024398194583751255, + "loss": 2.8437, + "theoretical_loss": 3.4768430197907234, + "tokens_seen": 1706306560 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024397191574724173, + "loss": 2.7174, + "theoretical_loss": 3.476831535109315, + "tokens_seen": 1706372096 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024396188565697092, + "loss": 2.6447, + "theoretical_loss": 3.476820050992484, + "tokens_seen": 1706437632 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002439518555667001, + "loss": 2.7166, + "theoretical_loss": 3.476808567440183, + "tokens_seen": 1706503168 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024394182547642928, + "loss": 2.7657, + "theoretical_loss": 3.4767970844523615, + "tokens_seen": 1706568704 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024393179538615849, + "loss": 2.7163, + "theoretical_loss": 3.4767856020289702, + "tokens_seen": 1706634240 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024392176529588767, + "loss": 2.6052, + "theoretical_loss": 3.47677412016996, + "tokens_seen": 1706699776 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024391173520561685, + "loss": 2.6495, + "theoretical_loss": 3.4767626388752815, + "tokens_seen": 1706765312 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024390170511534603, + "loss": 2.4641, + "theoretical_loss": 3.476751158144885, + "tokens_seen": 1706830848 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024389167502507524, + "loss": 2.6208, + "theoretical_loss": 3.4767396779787205, + "tokens_seen": 1706896384 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024388164493480442, + "loss": 2.7115, + "theoretical_loss": 3.4767281983767404, + "tokens_seen": 1706961920 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002438716148445336, + "loss": 2.5616, + "theoretical_loss": 3.476716719338894, + "tokens_seen": 1707027456 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024386158475426278, + "loss": 2.7925, + "theoretical_loss": 3.4767052408651313, + "tokens_seen": 1707092992 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024385155466399196, + "loss": 2.3825, + "theoretical_loss": 3.4766937629554047, + "tokens_seen": 1707158528 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1889866, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4892916679382324, + "objective/train/theoretical_loss": 3.476685154893228, + "objective/train/tokens_used": 1727667680, + "theoretical_loss": 3.476685154893228, + "tokens_seen": 1707207680 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024384152457372117, + "loss": 2.7005, + "theoretical_loss": 3.4766822856096633, + "tokens_seen": 1707224064 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024383149448345035, + "loss": 2.5581, + "theoretical_loss": 3.4766708088278593, + "tokens_seen": 1707289600 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024382146439317953, + "loss": 2.5326, + "theoretical_loss": 3.4766593326099415, + "tokens_seen": 1707355136 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024381143430290871, + "loss": 2.5577, + "theoretical_loss": 3.4766478569558616, + "tokens_seen": 1707420672 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024380140421263792, + "loss": 2.6486, + "theoretical_loss": 3.4766363818655703, + "tokens_seen": 1707486208 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002437913741223671, + "loss": 2.5868, + "theoretical_loss": 3.4766249073390183, + "tokens_seen": 1707551744 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024378134403209628, + "loss": 2.4833, + "theoretical_loss": 3.476613433376156, + "tokens_seen": 1707617280 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024377131394182547, + "loss": 2.6606, + "theoretical_loss": 3.4766019599769336, + "tokens_seen": 1707682816 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024376128385155467, + "loss": 2.6002, + "theoretical_loss": 3.4765904871413023, + "tokens_seen": 1707748352 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024375125376128385, + "loss": 2.4796, + "theoretical_loss": 3.4765790148692135, + "tokens_seen": 1707813888 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024374122367101304, + "loss": 2.5643, + "theoretical_loss": 3.4765675431606167, + "tokens_seen": 1707879424 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024373119358074222, + "loss": 2.6508, + "theoretical_loss": 3.4765560720154634, + "tokens_seen": 1707944960 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002437211634904714, + "loss": 2.2442, + "theoretical_loss": 3.4765446014337034, + "tokens_seen": 1708010496 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002437111334002006, + "loss": 2.6216, + "theoretical_loss": 3.4765331314152883, + "tokens_seen": 1708076032 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002437011033099298, + "loss": 2.5393, + "theoretical_loss": 3.476521661960169, + "tokens_seen": 1708141568 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024369107321965897, + "loss": 2.0886, + "theoretical_loss": 3.476510193068295, + "tokens_seen": 1708207104 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024368104312938815, + "loss": 2.6487, + "theoretical_loss": 3.4764987247396184, + "tokens_seen": 1708272640 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024367101303911736, + "loss": 2.5434, + "theoretical_loss": 3.4764872569740892, + "tokens_seen": 1708338176 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024366098294884654, + "loss": 2.654, + "theoretical_loss": 3.4764757897716585, + "tokens_seen": 1708403712 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024365095285857572, + "loss": 2.976, + "theoretical_loss": 3.476464323132276, + "tokens_seen": 1708469248 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002436409227683049, + "loss": 2.4861, + "theoretical_loss": 3.476452857055894, + "tokens_seen": 1708534784 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024363089267803408, + "loss": 2.4107, + "theoretical_loss": 3.4764413915424623, + "tokens_seen": 1708600320 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002436208625877633, + "loss": 2.6158, + "theoretical_loss": 3.4764299265919316, + "tokens_seen": 1708665856 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002436108324974925, + "loss": 2.5283, + "theoretical_loss": 3.4764184622042533, + "tokens_seen": 1708731392 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024360080240722168, + "loss": 2.4351, + "theoretical_loss": 3.476406998379378, + "tokens_seen": 1708796928 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1890525, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.780611991882324, + "objective/train/theoretical_loss": 3.4763984008800315, + "objective/train/tokens_used": 1729306080, + "theoretical_loss": 3.4763984008800315, + "tokens_seen": 1708846080 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024359077231695086, + "loss": 2.4146, + "theoretical_loss": 3.4763955351172564, + "tokens_seen": 1708862464 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024358074222668007, + "loss": 2.4619, + "theoretical_loss": 3.476384072417839, + "tokens_seen": 1708928000 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024357071213640925, + "loss": 2.5084, + "theoretical_loss": 3.4763726102810772, + "tokens_seen": 1708993536 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024356068204613843, + "loss": 2.4609, + "theoretical_loss": 3.4763611487069213, + "tokens_seen": 1709059072 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002435506519558676, + "loss": 2.6531, + "theoretical_loss": 3.4763496876953224, + "tokens_seen": 1709124608 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002435406218655968, + "loss": 2.5835, + "theoretical_loss": 3.476338227246231, + "tokens_seen": 1709190144 + }, + { + "epoch": 5.07, + "learning_rate": 0.000243530591775326, + "loss": 2.5611, + "theoretical_loss": 3.4763267673595983, + "tokens_seen": 1709255680 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024352056168505518, + "loss": 2.4655, + "theoretical_loss": 3.4763153080353746, + "tokens_seen": 1709321216 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024351053159478436, + "loss": 2.3021, + "theoretical_loss": 3.476303849273511, + "tokens_seen": 1709386752 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024350050150451355, + "loss": 2.5719, + "theoretical_loss": 3.476292391073959, + "tokens_seen": 1709452288 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024349047141424275, + "loss": 2.4528, + "theoretical_loss": 3.4762809334366684, + "tokens_seen": 1709517824 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024348044132397193, + "loss": 2.6043, + "theoretical_loss": 3.476269476361591, + "tokens_seen": 1709583360 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024347041123370112, + "loss": 2.5228, + "theoretical_loss": 3.476258019848677, + "tokens_seen": 1709648896 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002434603811434303, + "loss": 2.628, + "theoretical_loss": 3.4762465638978775, + "tokens_seen": 1709714432 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024345035105315948, + "loss": 2.5261, + "theoretical_loss": 3.476235108509143, + "tokens_seen": 1709779968 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024344032096288869, + "loss": 2.6425, + "theoretical_loss": 3.4762236536824247, + "tokens_seen": 1709845504 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024343029087261787, + "loss": 2.4771, + "theoretical_loss": 3.476212199417674, + "tokens_seen": 1709911040 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024342026078234705, + "loss": 2.6719, + "theoretical_loss": 3.4762007457148414, + "tokens_seen": 1709976576 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024341023069207623, + "loss": 2.5271, + "theoretical_loss": 3.4761892925738773, + "tokens_seen": 1710042112 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024340020060180544, + "loss": 2.514, + "theoretical_loss": 3.476177839994733, + "tokens_seen": 1710107648 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024339017051153462, + "loss": 2.6661, + "theoretical_loss": 3.4761663879773597, + "tokens_seen": 1710173184 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002433801404212638, + "loss": 2.6962, + "theoretical_loss": 3.4761549365217075, + "tokens_seen": 1710238720 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024337011033099298, + "loss": 2.4143, + "theoretical_loss": 3.4761434856277287, + "tokens_seen": 1710304256 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024336008024072216, + "loss": 2.3391, + "theoretical_loss": 3.4761320352953726, + "tokens_seen": 1710369792 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024335005015045137, + "loss": 2.7978, + "theoretical_loss": 3.4761205855245914, + "tokens_seen": 1710435328 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1891173, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3964908123016357, + "objective/train/theoretical_loss": 3.476111998565009, + "objective/train/tokens_used": 1730944480, + "theoretical_loss": 3.476111998565009, + "tokens_seen": 1710484480 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024334002006018055, + "loss": 2.4441, + "theoretical_loss": 3.476109136315335, + "tokens_seen": 1710500864 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024332998996990973, + "loss": 2.2794, + "theoretical_loss": 3.476097687667556, + "tokens_seen": 1710566400 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024331995987963891, + "loss": 2.5423, + "theoretical_loss": 3.4760862395812033, + "tokens_seen": 1710631936 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024330992978936812, + "loss": 2.8489, + "theoretical_loss": 3.476074792056229, + "tokens_seen": 1710697472 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002432998996990973, + "loss": 2.5114, + "theoretical_loss": 3.476063345092584, + "tokens_seen": 1710763008 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024328986960882648, + "loss": 2.627, + "theoretical_loss": 3.476051898690219, + "tokens_seen": 1710828544 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024327983951855567, + "loss": 2.2973, + "theoretical_loss": 3.476040452849086, + "tokens_seen": 1710894080 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024326980942828487, + "loss": 2.7399, + "theoretical_loss": 3.4760290075691342, + "tokens_seen": 1710959616 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024325977933801405, + "loss": 2.4112, + "theoretical_loss": 3.4760175628503163, + "tokens_seen": 1711025152 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024324974924774324, + "loss": 2.4689, + "theoretical_loss": 3.476006118692582, + "tokens_seen": 1711090688 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024323971915747242, + "loss": 2.7799, + "theoretical_loss": 3.475994675095883, + "tokens_seen": 1711156224 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002432296890672016, + "loss": 2.4645, + "theoretical_loss": 3.47598323206017, + "tokens_seen": 1711221760 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002432196589769308, + "loss": 2.6299, + "theoretical_loss": 3.4759717895853943, + "tokens_seen": 1711287296 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024320962888666, + "loss": 2.6011, + "theoretical_loss": 3.475960347671507, + "tokens_seen": 1711352832 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024319959879638917, + "loss": 2.6792, + "theoretical_loss": 3.4759489063184583, + "tokens_seen": 1711418368 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024318956870611835, + "loss": 2.4681, + "theoretical_loss": 3.4759374655262008, + "tokens_seen": 1711483904 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024317953861584756, + "loss": 2.8033, + "theoretical_loss": 3.475926025294684, + "tokens_seen": 1711549440 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024316950852557674, + "loss": 2.4606, + "theoretical_loss": 3.475914585623859, + "tokens_seen": 1711614976 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024315947843530592, + "loss": 2.6388, + "theoretical_loss": 3.4759031465136783, + "tokens_seen": 1711680512 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002431494483450351, + "loss": 2.5338, + "theoretical_loss": 3.4758917079640916, + "tokens_seen": 1711746048 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024313941825476428, + "loss": 2.6495, + "theoretical_loss": 3.4758802699750504, + "tokens_seen": 1711811584 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002431293881644935, + "loss": 2.7267, + "theoretical_loss": 3.4758688325465057, + "tokens_seen": 1711877120 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024311935807422267, + "loss": 2.6212, + "theoretical_loss": 3.475857395678409, + "tokens_seen": 1711942656 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024310932798395185, + "loss": 2.5733, + "theoretical_loss": 3.4758459593707114, + "tokens_seen": 1712008192 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024309929789368103, + "loss": 2.6789, + "theoretical_loss": 3.4758345236233628, + "tokens_seen": 1712073728 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1892196, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7911088466644287, + "objective/train/theoretical_loss": 3.475825947180552, + "objective/train/tokens_used": 1732582880, + "theoretical_loss": 3.475825947180552, + "tokens_seen": 1712122880 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024308926780341024, + "loss": 2.7315, + "theoretical_loss": 3.4758230884363153, + "tokens_seen": 1712139264 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024307923771313942, + "loss": 2.6124, + "theoretical_loss": 3.4758116538095205, + "tokens_seen": 1712204800 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002430692076228686, + "loss": 2.7329, + "theoretical_loss": 3.4758002197429283, + "tokens_seen": 1712270336 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024305917753259779, + "loss": 2.8635, + "theoretical_loss": 3.4757887862364907, + "tokens_seen": 1712335872 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024304914744232697, + "loss": 2.7501, + "theoretical_loss": 3.475777353290158, + "tokens_seen": 1712401408 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024303911735205618, + "loss": 2.7512, + "theoretical_loss": 3.475765920903882, + "tokens_seen": 1712466944 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024302908726178536, + "loss": 2.8082, + "theoretical_loss": 3.4757544890776138, + "tokens_seen": 1712532480 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024301905717151454, + "loss": 2.6833, + "theoretical_loss": 3.475743057811304, + "tokens_seen": 1712598016 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024300902708124372, + "loss": 2.7082, + "theoretical_loss": 3.4757316271049046, + "tokens_seen": 1712663552 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024299899699097293, + "loss": 2.63, + "theoretical_loss": 3.475720196958366, + "tokens_seen": 1712729088 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002429889669007021, + "loss": 2.4383, + "theoretical_loss": 3.47570876737164, + "tokens_seen": 1712794624 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002429789368104313, + "loss": 2.7892, + "theoretical_loss": 3.475697338344677, + "tokens_seen": 1712860160 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024296890672016047, + "loss": 2.3883, + "theoretical_loss": 3.475685909877429, + "tokens_seen": 1712925696 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024295887662988965, + "loss": 2.7753, + "theoretical_loss": 3.475674481969846, + "tokens_seen": 1712991232 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024294884653961886, + "loss": 2.5914, + "theoretical_loss": 3.4756630546218807, + "tokens_seen": 1713056768 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024293881644934804, + "loss": 2.3637, + "theoretical_loss": 3.475651627833483, + "tokens_seen": 1713122304 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024292878635907722, + "loss": 2.6399, + "theoretical_loss": 3.4756402016046053, + "tokens_seen": 1713187840 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002429187562688064, + "loss": 2.2961, + "theoretical_loss": 3.4756287759351974, + "tokens_seen": 1713253376 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002429087261785356, + "loss": 2.1939, + "theoretical_loss": 3.4756173508252113, + "tokens_seen": 1713318912 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002428986960882648, + "loss": 2.579, + "theoretical_loss": 3.475605926274598, + "tokens_seen": 1713384448 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024288866599799397, + "loss": 2.55, + "theoretical_loss": 3.4755945022833097, + "tokens_seen": 1713449984 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024287863590772315, + "loss": 2.3549, + "theoretical_loss": 3.4755830788512965, + "tokens_seen": 1713515520 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024286860581745234, + "loss": 2.6728, + "theoretical_loss": 3.4755716559785093, + "tokens_seen": 1713581056 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024285857572718157, + "loss": 2.8096, + "theoretical_loss": 3.4755602336649005, + "tokens_seen": 1713646592 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024284854563691075, + "loss": 2.5765, + "theoretical_loss": 3.4755488119104205, + "tokens_seen": 1713712128 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1892860, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.680774450302124, + "objective/train/theoretical_loss": 3.4755402459614597, + "objective/train/tokens_used": 1734221280, + "theoretical_loss": 3.4755402459614597, + "tokens_seen": 1713761280 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024283851554663993, + "loss": 2.6888, + "theoretical_loss": 3.475537390715021, + "tokens_seen": 1713777664 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024282848545636911, + "loss": 2.773, + "theoretical_loss": 3.475525970078653, + "tokens_seen": 1713843200 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024281845536609832, + "loss": 2.6634, + "theoretical_loss": 3.475514550001268, + "tokens_seen": 1713908736 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002428084252758275, + "loss": 2.6947, + "theoretical_loss": 3.4755031304828172, + "tokens_seen": 1713974272 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024279839518555668, + "loss": 2.8475, + "theoretical_loss": 3.4754917115232513, + "tokens_seen": 1714039808 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024278836509528587, + "loss": 2.6211, + "theoretical_loss": 3.4754802931225224, + "tokens_seen": 1714105344 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024277833500501507, + "loss": 2.4339, + "theoretical_loss": 3.4754688752805816, + "tokens_seen": 1714170880 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024276830491474426, + "loss": 2.7516, + "theoretical_loss": 3.47545745799738, + "tokens_seen": 1714236416 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024275827482447344, + "loss": 2.7508, + "theoretical_loss": 3.4754460412728685, + "tokens_seen": 1714301952 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024274824473420262, + "loss": 2.8873, + "theoretical_loss": 3.4754346251069994, + "tokens_seen": 1714367488 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002427382146439318, + "loss": 2.6248, + "theoretical_loss": 3.4754232094997235, + "tokens_seen": 1714433024 + }, + { + "epoch": 5.07, + "learning_rate": 0.000242728184553661, + "loss": 2.6899, + "theoretical_loss": 3.4754117944509915, + "tokens_seen": 1714498560 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002427181544633902, + "loss": 2.7384, + "theoretical_loss": 3.475400379960756, + "tokens_seen": 1714564096 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024270812437311937, + "loss": 2.5872, + "theoretical_loss": 3.4753889660289676, + "tokens_seen": 1714629632 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024269809428284855, + "loss": 2.6461, + "theoretical_loss": 3.4753775526555772, + "tokens_seen": 1714695168 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024268806419257776, + "loss": 2.6588, + "theoretical_loss": 3.4753661398405367, + "tokens_seen": 1714760704 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024267803410230694, + "loss": 2.6616, + "theoretical_loss": 3.475354727583798, + "tokens_seen": 1714826240 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024266800401203612, + "loss": 2.7933, + "theoretical_loss": 3.475343315885311, + "tokens_seen": 1714891776 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002426579739217653, + "loss": 2.7128, + "theoretical_loss": 3.475331904745028, + "tokens_seen": 1714957312 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024264794383149448, + "loss": 2.6461, + "theoretical_loss": 3.475320494162901, + "tokens_seen": 1715022848 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002426379137412237, + "loss": 2.7418, + "theoretical_loss": 3.4753090841388796, + "tokens_seen": 1715088384 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024262788365095287, + "loss": 2.8494, + "theoretical_loss": 3.475297674672917, + "tokens_seen": 1715153920 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024261785356068205, + "loss": 2.6216, + "theoretical_loss": 3.4752862657649635, + "tokens_seen": 1715219456 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024260782347041123, + "loss": 2.588, + "theoretical_loss": 3.4752748574149703, + "tokens_seen": 1715284992 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024259779338014044, + "loss": 2.8301, + "theoretical_loss": 3.47526344962289, + "tokens_seen": 1715350528 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1893941, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9383318424224854, + "objective/train/theoretical_loss": 3.47525489414493, + "objective/train/tokens_used": 1735859680, + "theoretical_loss": 3.47525489414493, + "tokens_seen": 1715399680 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024258776328986962, + "loss": 2.707, + "theoretical_loss": 3.475252042388673, + "tokens_seen": 1715416064 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002425777331995988, + "loss": 2.4891, + "theoretical_loss": 3.4752406357122707, + "tokens_seen": 1715481600 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024256770310932799, + "loss": 2.537, + "theoretical_loss": 3.4752292295936353, + "tokens_seen": 1715547136 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024255767301905717, + "loss": 2.6718, + "theoretical_loss": 3.475217824032717, + "tokens_seen": 1715612672 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024254764292878638, + "loss": 2.3799, + "theoretical_loss": 3.4752064190294685, + "tokens_seen": 1715678208 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024253761283851556, + "loss": 2.5746, + "theoretical_loss": 3.47519501458384, + "tokens_seen": 1715743744 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024252758274824474, + "loss": 2.6712, + "theoretical_loss": 3.4751836106957845, + "tokens_seen": 1715809280 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024251755265797392, + "loss": 2.69, + "theoretical_loss": 3.4751722073652522, + "tokens_seen": 1715874816 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024250752256770313, + "loss": 2.5628, + "theoretical_loss": 3.475160804592195, + "tokens_seen": 1715940352 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002424974924774323, + "loss": 2.7379, + "theoretical_loss": 3.475149402376564, + "tokens_seen": 1716005888 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002424874623871615, + "loss": 2.7586, + "theoretical_loss": 3.475138000718311, + "tokens_seen": 1716071424 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024247743229689067, + "loss": 2.7421, + "theoretical_loss": 3.4751265996173872, + "tokens_seen": 1716136960 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024246740220661985, + "loss": 2.5806, + "theoretical_loss": 3.4751151990737448, + "tokens_seen": 1716202496 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024245737211634906, + "loss": 2.7074, + "theoretical_loss": 3.475103799087335, + "tokens_seen": 1716268032 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024244734202607824, + "loss": 2.6918, + "theoretical_loss": 3.475092399658108, + "tokens_seen": 1716333568 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024243731193580742, + "loss": 2.8639, + "theoretical_loss": 3.475081000786017, + "tokens_seen": 1716399104 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002424272818455366, + "loss": 2.6651, + "theoretical_loss": 3.475069602471012, + "tokens_seen": 1716464640 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002424172517552658, + "loss": 2.5035, + "theoretical_loss": 3.4750582047130463, + "tokens_seen": 1716530176 + }, + { + "epoch": 5.07, + "learning_rate": 0.000242407221664995, + "loss": 2.6768, + "theoretical_loss": 3.4750468075120704, + "tokens_seen": 1716595712 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024239719157472417, + "loss": 2.6516, + "theoretical_loss": 3.4750354108680357, + "tokens_seen": 1716661248 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024238716148445335, + "loss": 2.501, + "theoretical_loss": 3.4750240147808933, + "tokens_seen": 1716726784 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024237713139418254, + "loss": 2.4435, + "theoretical_loss": 3.475012619250596, + "tokens_seen": 1716792320 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024236710130391174, + "loss": 2.7047, + "theoretical_loss": 3.4750012242770945, + "tokens_seen": 1716857856 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024235707121364092, + "loss": 2.5849, + "theoretical_loss": 3.4749898298603403, + "tokens_seen": 1716923392 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002423470411233701, + "loss": 2.6612, + "theoretical_loss": 3.474978436000285, + "tokens_seen": 1716988928 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1894521, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6646251678466797, + "objective/train/theoretical_loss": 3.4749698909705486, + "objective/train/tokens_used": 1737498080, + "theoretical_loss": 3.4749698909705486, + "tokens_seen": 1717038080 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002423370110330993, + "loss": 2.6463, + "theoretical_loss": 3.474967042696881, + "tokens_seen": 1717054464 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002423269809428285, + "loss": 2.7477, + "theoretical_loss": 3.4749556499500787, + "tokens_seen": 1717120000 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024231695085255768, + "loss": 2.6424, + "theoretical_loss": 3.47494425775983, + "tokens_seen": 1717185536 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024230692076228686, + "loss": 2.6223, + "theoretical_loss": 3.4749328661260863, + "tokens_seen": 1717251072 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024229689067201604, + "loss": 2.7468, + "theoretical_loss": 3.4749214750488004, + "tokens_seen": 1717316608 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024228686058174522, + "loss": 2.7102, + "theoretical_loss": 3.4749100845279224, + "tokens_seen": 1717382144 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024227683049147443, + "loss": 2.6645, + "theoretical_loss": 3.4748986945634046, + "tokens_seen": 1717447680 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002422668004012036, + "loss": 2.613, + "theoretical_loss": 3.4748873051551983, + "tokens_seen": 1717513216 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002422567703109328, + "loss": 2.4985, + "theoretical_loss": 3.4748759163032554, + "tokens_seen": 1717578752 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024224674022066197, + "loss": 2.8519, + "theoretical_loss": 3.474864528007527, + "tokens_seen": 1717644288 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024223671013039118, + "loss": 2.5299, + "theoretical_loss": 3.4748531402679657, + "tokens_seen": 1717709824 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024222668004012036, + "loss": 2.5465, + "theoretical_loss": 3.4748417530845215, + "tokens_seen": 1717775360 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024221664994984954, + "loss": 2.4251, + "theoretical_loss": 3.474830366457148, + "tokens_seen": 1717840896 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024220661985957872, + "loss": 2.801, + "theoretical_loss": 3.4748189803857956, + "tokens_seen": 1717906432 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024219658976930793, + "loss": 2.6287, + "theoretical_loss": 3.4748075948704162, + "tokens_seen": 1717971968 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002421865596790371, + "loss": 2.7712, + "theoretical_loss": 3.474796209910961, + "tokens_seen": 1718037504 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002421765295887663, + "loss": 2.3438, + "theoretical_loss": 3.474784825507382, + "tokens_seen": 1718103040 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024216649949849547, + "loss": 2.7666, + "theoretical_loss": 3.4747734416596314, + "tokens_seen": 1718168576 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024215646940822466, + "loss": 2.4905, + "theoretical_loss": 3.4747620583676606, + "tokens_seen": 1718234112 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024214643931795386, + "loss": 2.6166, + "theoretical_loss": 3.474750675631421, + "tokens_seen": 1718299648 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024213640922768305, + "loss": 2.6589, + "theoretical_loss": 3.474739293450864, + "tokens_seen": 1718365184 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024212637913741223, + "loss": 2.6378, + "theoretical_loss": 3.4747279118259415, + "tokens_seen": 1718430720 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024211634904714143, + "loss": 2.6756, + "theoretical_loss": 3.4747165307566057, + "tokens_seen": 1718496256 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024210631895687064, + "loss": 2.9024, + "theoretical_loss": 3.4747051502428077, + "tokens_seen": 1718561792 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024209628886659982, + "loss": 2.6011, + "theoretical_loss": 3.4746937702844995, + "tokens_seen": 1718627328 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1895987, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1199429035186768, + "objective/train/theoretical_loss": 3.4746852356802798, + "objective/train/tokens_used": 1739136480, + "theoretical_loss": 3.4746852356802798, + "tokens_seen": 1718676480 + }, + { + "epoch": 5.07, + "learning_rate": 0.000242086258776329, + "loss": 2.8003, + "theoretical_loss": 3.474682390881633, + "tokens_seen": 1718692864 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024207622868605819, + "loss": 2.5173, + "theoretical_loss": 3.4746710120341593, + "tokens_seen": 1718758400 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024206619859578737, + "loss": 2.6379, + "theoretical_loss": 3.4746596337420304, + "tokens_seen": 1718823936 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024205616850551658, + "loss": 2.4741, + "theoretical_loss": 3.4746482560051986, + "tokens_seen": 1718889472 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024204613841524576, + "loss": 2.5645, + "theoretical_loss": 3.474636878823614, + "tokens_seen": 1718955008 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024203610832497494, + "loss": 2.5035, + "theoretical_loss": 3.4746255021972305, + "tokens_seen": 1719020544 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024202607823470412, + "loss": 2.6776, + "theoretical_loss": 3.4746141261259984, + "tokens_seen": 1719086080 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024201604814443333, + "loss": 2.5603, + "theoretical_loss": 3.47460275060987, + "tokens_seen": 1719151616 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002420060180541625, + "loss": 2.376, + "theoretical_loss": 3.4745913756487967, + "tokens_seen": 1719217152 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002419959879638917, + "loss": 2.7016, + "theoretical_loss": 3.4745800012427304, + "tokens_seen": 1719282688 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024198595787362087, + "loss": 2.6146, + "theoretical_loss": 3.4745686273916228, + "tokens_seen": 1719348224 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024197592778335005, + "loss": 2.6791, + "theoretical_loss": 3.474557254095426, + "tokens_seen": 1719413760 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024196589769307926, + "loss": 2.5862, + "theoretical_loss": 3.474545881354091, + "tokens_seen": 1719479296 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024195586760280844, + "loss": 2.6106, + "theoretical_loss": 3.4745345091675706, + "tokens_seen": 1719544832 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024194583751253762, + "loss": 2.8151, + "theoretical_loss": 3.4745231375358165, + "tokens_seen": 1719610368 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002419358074222668, + "loss": 2.6825, + "theoretical_loss": 3.4745117664587797, + "tokens_seen": 1719675904 + }, + { + "epoch": 5.07, + "learning_rate": 0.000241925777331996, + "loss": 2.6264, + "theoretical_loss": 3.4745003959364125, + "tokens_seen": 1719741440 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002419157472417252, + "loss": 2.5917, + "theoretical_loss": 3.4744890259686665, + "tokens_seen": 1719806976 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024190571715145437, + "loss": 2.5737, + "theoretical_loss": 3.474477656555494, + "tokens_seen": 1719872512 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024189568706118355, + "loss": 2.5603, + "theoretical_loss": 3.474466287696846, + "tokens_seen": 1719938048 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024188565697091274, + "loss": 2.7288, + "theoretical_loss": 3.4744549193926746, + "tokens_seen": 1720003584 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024187562688064194, + "loss": 2.6622, + "theoretical_loss": 3.4744435516429326, + "tokens_seen": 1720069120 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024186559679037113, + "loss": 2.748, + "theoretical_loss": 3.4744321844475703, + "tokens_seen": 1720134656 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002418555667001003, + "loss": 2.6754, + "theoretical_loss": 3.474420817806541, + "tokens_seen": 1720200192 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002418455366098295, + "loss": 2.6436, + "theoretical_loss": 3.4744094517197954, + "tokens_seen": 1720265728 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1897344, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5272271633148193, + "objective/train/theoretical_loss": 3.4744009275184564, + "objective/train/tokens_used": 1740774880, + "theoretical_loss": 3.4744009275184564, + "tokens_seen": 1720314880 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002418355065195587, + "loss": 2.8838, + "theoretical_loss": 3.4743980861872856, + "tokens_seen": 1720331264 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024182547642928788, + "loss": 2.8883, + "theoretical_loss": 3.4743867212089645, + "tokens_seen": 1720396800 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024181544633901706, + "loss": 2.5831, + "theoretical_loss": 3.4743753567847824, + "tokens_seen": 1720462336 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024180541624874624, + "loss": 2.852, + "theoretical_loss": 3.474363992914692, + "tokens_seen": 1720527872 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024179538615847542, + "loss": 2.683, + "theoretical_loss": 3.4743526295986458, + "tokens_seen": 1720593408 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024178535606820463, + "loss": 2.8938, + "theoretical_loss": 3.4743412668365945, + "tokens_seen": 1720658944 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002417753259779338, + "loss": 2.641, + "theoretical_loss": 3.4743299046284903, + "tokens_seen": 1720724480 + }, + { + "epoch": 5.07, + "learning_rate": 0.000241765295887663, + "loss": 2.5025, + "theoretical_loss": 3.4743185429742853, + "tokens_seen": 1720790016 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024175526579739217, + "loss": 2.6568, + "theoretical_loss": 3.4743071818739315, + "tokens_seen": 1720855552 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024174523570712138, + "loss": 2.681, + "theoretical_loss": 3.4742958213273805, + "tokens_seen": 1720921088 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024173520561685056, + "loss": 2.536, + "theoretical_loss": 3.4742844613345847, + "tokens_seen": 1720986624 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024172517552657974, + "loss": 2.6318, + "theoretical_loss": 3.4742731018954958, + "tokens_seen": 1721052160 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024171514543630892, + "loss": 2.5514, + "theoretical_loss": 3.4742617430100653, + "tokens_seen": 1721117696 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024170511534603813, + "loss": 2.5539, + "theoretical_loss": 3.4742503846782453, + "tokens_seen": 1721183232 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002416950852557673, + "loss": 2.8246, + "theoretical_loss": 3.4742390268999888, + "tokens_seen": 1721248768 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002416850551654965, + "loss": 2.6858, + "theoretical_loss": 3.474227669675246, + "tokens_seen": 1721314304 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024167502507522567, + "loss": 2.5706, + "theoretical_loss": 3.47421631300397, + "tokens_seen": 1721379840 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024166499498495486, + "loss": 2.4692, + "theoretical_loss": 3.474204956886113, + "tokens_seen": 1721445376 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024165496489468406, + "loss": 2.4604, + "theoretical_loss": 3.474193601321626, + "tokens_seen": 1721510912 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024164493480441325, + "loss": 2.7979, + "theoretical_loss": 3.474182246310461, + "tokens_seen": 1721576448 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024163490471414243, + "loss": 2.6462, + "theoretical_loss": 3.474170891852571, + "tokens_seen": 1721641984 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002416248746238716, + "loss": 2.6418, + "theoretical_loss": 3.474159537947907, + "tokens_seen": 1721707520 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024161484453360082, + "loss": 2.6061, + "theoretical_loss": 3.4741481845964217, + "tokens_seen": 1721773056 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024160481444333, + "loss": 2.5545, + "theoretical_loss": 3.4741368317980665, + "tokens_seen": 1721838592 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024159478435305918, + "loss": 2.9588, + "theoretical_loss": 3.474125479552794, + "tokens_seen": 1721904128 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1897936, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.201916456222534, + "objective/train/theoretical_loss": 3.4741169657317705, + "objective/train/tokens_used": 1742413280, + "theoretical_loss": 3.4741169657317705, + "tokens_seen": 1721953280 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024158475426278836, + "loss": 2.4735, + "theoretical_loss": 3.4741141278605556, + "tokens_seen": 1721969664 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024157472417251754, + "loss": 2.712, + "theoretical_loss": 3.4741027767213035, + "tokens_seen": 1722035200 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024156469408224675, + "loss": 2.578, + "theoretical_loss": 3.47409142613499, + "tokens_seen": 1722100736 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024155466399197593, + "loss": 2.4941, + "theoretical_loss": 3.4740800761015667, + "tokens_seen": 1722166272 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002415446339017051, + "loss": 2.3772, + "theoretical_loss": 3.474068726620986, + "tokens_seen": 1722231808 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002415346038114343, + "loss": 2.7132, + "theoretical_loss": 3.4740573776932, + "tokens_seen": 1722297344 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002415245737211635, + "loss": 2.6152, + "theoretical_loss": 3.47404602931816, + "tokens_seen": 1722362880 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024151454363089268, + "loss": 2.6641, + "theoretical_loss": 3.474034681495819, + "tokens_seen": 1722428416 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024150451354062186, + "loss": 2.4134, + "theoretical_loss": 3.4740233342261284, + "tokens_seen": 1722493952 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024149448345035104, + "loss": 2.5871, + "theoretical_loss": 3.4740119875090407, + "tokens_seen": 1722559488 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024148445336008022, + "loss": 2.5366, + "theoretical_loss": 3.4740006413445075, + "tokens_seen": 1722625024 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024147442326980943, + "loss": 2.7207, + "theoretical_loss": 3.4739892957324816, + "tokens_seen": 1722690560 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024146439317953861, + "loss": 2.5331, + "theoretical_loss": 3.4739779506729143, + "tokens_seen": 1722756096 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002414543630892678, + "loss": 2.6099, + "theoretical_loss": 3.4739666061657584, + "tokens_seen": 1722821632 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024144433299899698, + "loss": 2.5683, + "theoretical_loss": 3.473955262210965, + "tokens_seen": 1722887168 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024143430290872618, + "loss": 2.4318, + "theoretical_loss": 3.4739439188084864, + "tokens_seen": 1722952704 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024142427281845537, + "loss": 2.8148, + "theoretical_loss": 3.473932575958276, + "tokens_seen": 1723018240 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024141424272818455, + "loss": 2.6639, + "theoretical_loss": 3.4739212336602847, + "tokens_seen": 1723083776 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024140421263791373, + "loss": 2.5708, + "theoretical_loss": 3.473909891914465, + "tokens_seen": 1723149312 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002413941825476429, + "loss": 2.7305, + "theoretical_loss": 3.4738985507207687, + "tokens_seen": 1723214848 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024138415245737212, + "loss": 2.7451, + "theoretical_loss": 3.4738872100791482, + "tokens_seen": 1723280384 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002413741223671013, + "loss": 2.7798, + "theoretical_loss": 3.4738758699895556, + "tokens_seen": 1723345920 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002413640922768305, + "loss": 2.5937, + "theoretical_loss": 3.4738645304519427, + "tokens_seen": 1723411456 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002413540621865597, + "loss": 2.4362, + "theoretical_loss": 3.4738531914662625, + "tokens_seen": 1723476992 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002413440320962889, + "loss": 2.6428, + "theoretical_loss": 3.473841853032466, + "tokens_seen": 1723542528 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1899546, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3879358768463135, + "objective/train/theoretical_loss": 3.4738333495692646, + "objective/train/tokens_used": 1744051680, + "theoretical_loss": 3.4738333495692646, + "tokens_seen": 1723591680 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024133400200601808, + "loss": 2.5874, + "theoretical_loss": 3.4738305151505067, + "tokens_seen": 1723608064 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024132397191574726, + "loss": 2.6023, + "theoretical_loss": 3.473819177820336, + "tokens_seen": 1723673600 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024131394182547644, + "loss": 2.7231, + "theoretical_loss": 3.4738078410419053, + "tokens_seen": 1723739136 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024130391173520562, + "loss": 2.6347, + "theoretical_loss": 3.473796504815168, + "tokens_seen": 1723804672 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024129388164493483, + "loss": 2.433, + "theoretical_loss": 3.473785169140076, + "tokens_seen": 1723870208 + }, + { + "epoch": 5.07, + "learning_rate": 0.000241283851554664, + "loss": 2.6937, + "theoretical_loss": 3.473773834016581, + "tokens_seen": 1723935744 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002412738214643932, + "loss": 2.7075, + "theoretical_loss": 3.4737624994446357, + "tokens_seen": 1724001280 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024126379137412237, + "loss": 2.7224, + "theoretical_loss": 3.4737511654241926, + "tokens_seen": 1724066816 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024125376128385158, + "loss": 2.8195, + "theoretical_loss": 3.4737398319552026, + "tokens_seen": 1724132352 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024124373119358076, + "loss": 2.7756, + "theoretical_loss": 3.4737284990376196, + "tokens_seen": 1724197888 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024123370110330994, + "loss": 2.7265, + "theoretical_loss": 3.473717166671394, + "tokens_seen": 1724263424 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024122367101303912, + "loss": 2.7402, + "theoretical_loss": 3.4737058348564798, + "tokens_seen": 1724328960 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024121364092276833, + "loss": 2.7301, + "theoretical_loss": 3.473694503592828, + "tokens_seen": 1724394496 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002412036108324975, + "loss": 2.6324, + "theoretical_loss": 3.473683172880391, + "tokens_seen": 1724460032 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002411935807422267, + "loss": 2.6595, + "theoretical_loss": 3.473671842719122, + "tokens_seen": 1724525568 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024118355065195587, + "loss": 2.7607, + "theoretical_loss": 3.473660513108972, + "tokens_seen": 1724591104 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024117352056168506, + "loss": 2.5308, + "theoretical_loss": 3.4736491840498935, + "tokens_seen": 1724656640 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024116349047141426, + "loss": 2.6938, + "theoretical_loss": 3.4736378555418397, + "tokens_seen": 1724722176 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024115346038114345, + "loss": 2.6563, + "theoretical_loss": 3.473626527584762, + "tokens_seen": 1724787712 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024114343029087263, + "loss": 2.5753, + "theoretical_loss": 3.473615200178613, + "tokens_seen": 1724853248 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002411334002006018, + "loss": 2.6655, + "theoretical_loss": 3.473603873323344, + "tokens_seen": 1724918784 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024112337011033102, + "loss": 2.5401, + "theoretical_loss": 3.473592547018909, + "tokens_seen": 1724984320 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002411133400200602, + "loss": 2.6232, + "theoretical_loss": 3.473581221265259, + "tokens_seen": 1725049856 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024110330992978938, + "loss": 2.4218, + "theoretical_loss": 3.4735698960623465, + "tokens_seen": 1725115392 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024109327983951856, + "loss": 2.6231, + "theoretical_loss": 3.4735585714101243, + "tokens_seen": 1725180928 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1900340, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.313204288482666, + "objective/train/theoretical_loss": 3.4735500782823197, + "objective/train/tokens_used": 1745690080, + "theoretical_loss": 3.4735500782823197, + "tokens_seen": 1725230080 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024108324974924774, + "loss": 2.5151, + "theoretical_loss": 3.4735472473085443, + "tokens_seen": 1725246464 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024107321965897695, + "loss": 2.4362, + "theoretical_loss": 3.4735359237575594, + "tokens_seen": 1725312000 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024106318956870613, + "loss": 2.3677, + "theoretical_loss": 3.4735246007571208, + "tokens_seen": 1725377536 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002410531594784353, + "loss": 2.6924, + "theoretical_loss": 3.4735132783071814, + "tokens_seen": 1725443072 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002410431293881645, + "loss": 2.7478, + "theoretical_loss": 3.473501956407694, + "tokens_seen": 1725508608 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002410330992978937, + "loss": 2.6386, + "theoretical_loss": 3.47349063505861, + "tokens_seen": 1725574144 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024102306920762288, + "loss": 2.3273, + "theoretical_loss": 3.4734793142598828, + "tokens_seen": 1725639680 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024101303911735206, + "loss": 2.5599, + "theoretical_loss": 3.473467994011464, + "tokens_seen": 1725705216 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024100300902708124, + "loss": 2.8027, + "theoretical_loss": 3.473456674313306, + "tokens_seen": 1725770752 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024099297893681042, + "loss": 2.6098, + "theoretical_loss": 3.4734453551653615, + "tokens_seen": 1725836288 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024098294884653963, + "loss": 2.5631, + "theoretical_loss": 3.4734340365675824, + "tokens_seen": 1725901824 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024097291875626881, + "loss": 2.4398, + "theoretical_loss": 3.4734227185199216, + "tokens_seen": 1725967360 + }, + { + "epoch": 5.07, + "learning_rate": 0.000240962888665998, + "loss": 2.5744, + "theoretical_loss": 3.4734114010223314, + "tokens_seen": 1726032896 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024095285857572718, + "loss": 2.5865, + "theoretical_loss": 3.4734000840747634, + "tokens_seen": 1726098432 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024094282848545638, + "loss": 2.437, + "theoretical_loss": 3.4733887676771706, + "tokens_seen": 1726163968 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024093279839518557, + "loss": 2.7487, + "theoretical_loss": 3.473377451829506, + "tokens_seen": 1726229504 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024092276830491475, + "loss": 2.5801, + "theoretical_loss": 3.473366136531721, + "tokens_seen": 1726295040 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024091273821464393, + "loss": 2.6363, + "theoretical_loss": 3.4733548217837686, + "tokens_seen": 1726360576 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002409027081243731, + "loss": 2.4728, + "theoretical_loss": 3.473343507585601, + "tokens_seen": 1726426112 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024089267803410232, + "loss": 2.6296, + "theoretical_loss": 3.4733321939371704, + "tokens_seen": 1726491648 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002408826479438315, + "loss": 2.4372, + "theoretical_loss": 3.473320880838429, + "tokens_seen": 1726557184 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024087261785356068, + "loss": 2.6381, + "theoretical_loss": 3.4733095682893302, + "tokens_seen": 1726622720 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024086258776328986, + "loss": 2.5048, + "theoretical_loss": 3.473298256289826, + "tokens_seen": 1726688256 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024085255767301907, + "loss": 2.5741, + "theoretical_loss": 3.4732869448398684, + "tokens_seen": 1726753792 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024084252758274825, + "loss": 2.5833, + "theoretical_loss": 3.4732756339394104, + "tokens_seen": 1726819328 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1901013, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.839489221572876, + "objective/train/theoretical_loss": 3.4732671511246473, + "objective/train/tokens_used": 1747328480, + "theoretical_loss": 3.4732671511246473, + "tokens_seen": 1726868480 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024083249749247743, + "loss": 2.3898, + "theoretical_loss": 3.4732643235884044, + "tokens_seen": 1726884864 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002408224674022066, + "loss": 2.6219, + "theoretical_loss": 3.4732530137868025, + "tokens_seen": 1726950400 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002408124373119358, + "loss": 2.7078, + "theoretical_loss": 3.4732417045345576, + "tokens_seen": 1727015936 + }, + { + "epoch": 5.07, + "learning_rate": 0.000240802407221665, + "loss": 2.4721, + "theoretical_loss": 3.4732303958316217, + "tokens_seen": 1727081472 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024079237713139418, + "loss": 2.6135, + "theoretical_loss": 3.473219087677947, + "tokens_seen": 1727147008 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024078234704112336, + "loss": 2.5196, + "theoretical_loss": 3.473207780073487, + "tokens_seen": 1727212544 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024077231695085254, + "loss": 2.6051, + "theoretical_loss": 3.473196473018194, + "tokens_seen": 1727278080 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024076228686058175, + "loss": 2.6449, + "theoretical_loss": 3.4731851665120197, + "tokens_seen": 1727343616 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024075225677031093, + "loss": 2.5321, + "theoretical_loss": 3.473173860554917, + "tokens_seen": 1727409152 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024074222668004012, + "loss": 2.6258, + "theoretical_loss": 3.473162555146839, + "tokens_seen": 1727474688 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002407321965897693, + "loss": 2.7298, + "theoretical_loss": 3.4731512502877373, + "tokens_seen": 1727540224 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024072216649949848, + "loss": 2.8261, + "theoretical_loss": 3.473139945977565, + "tokens_seen": 1727605760 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024071213640922769, + "loss": 2.6033, + "theoretical_loss": 3.473128642216274, + "tokens_seen": 1727671296 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024070210631895687, + "loss": 2.5735, + "theoretical_loss": 3.473117339003818, + "tokens_seen": 1727736832 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024069207622868605, + "loss": 2.6386, + "theoretical_loss": 3.4731060363401483, + "tokens_seen": 1727802368 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024068204613841523, + "loss": 2.8478, + "theoretical_loss": 3.473094734225218, + "tokens_seen": 1727867904 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024067201604814444, + "loss": 2.8281, + "theoretical_loss": 3.4730834326589797, + "tokens_seen": 1727933440 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024066198595787362, + "loss": 2.4382, + "theoretical_loss": 3.473072131641386, + "tokens_seen": 1727998976 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002406519558676028, + "loss": 2.4344, + "theoretical_loss": 3.473060831172389, + "tokens_seen": 1728064512 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024064192577733198, + "loss": 2.7835, + "theoretical_loss": 3.4730495312519416, + "tokens_seen": 1728130048 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024063189568706116, + "loss": 2.7645, + "theoretical_loss": 3.473038231879996, + "tokens_seen": 1728195584 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024062186559679037, + "loss": 2.5402, + "theoretical_loss": 3.473026933056506, + "tokens_seen": 1728261120 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024061183550651958, + "loss": 2.6002, + "theoretical_loss": 3.4730156347814223, + "tokens_seen": 1728326656 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024060180541624876, + "loss": 2.5791, + "theoretical_loss": 3.4730043370546992, + "tokens_seen": 1728392192 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024059177532597794, + "loss": 2.6085, + "theoretical_loss": 3.4729930398762883, + "tokens_seen": 1728457728 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1902220, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.477761745452881, + "objective/train/theoretical_loss": 3.472984567352282, + "objective/train/tokens_used": 1748966880, + "theoretical_loss": 3.472984567352282, + "tokens_seen": 1728506880 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024058174523570715, + "loss": 2.4411, + "theoretical_loss": 3.4729817432461427, + "tokens_seen": 1728523264 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024057171514543633, + "loss": 2.6595, + "theoretical_loss": 3.472970447164214, + "tokens_seen": 1728588800 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002405616850551655, + "loss": 2.7046, + "theoretical_loss": 3.472959151630457, + "tokens_seen": 1728654336 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002405516549648947, + "loss": 2.6982, + "theoretical_loss": 3.4729478566448218, + "tokens_seen": 1728719872 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002405416248746239, + "loss": 2.6994, + "theoretical_loss": 3.4729365622072628, + "tokens_seen": 1728785408 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024053159478435308, + "loss": 2.6441, + "theoretical_loss": 3.4729252683177316, + "tokens_seen": 1728850944 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024052156469408226, + "loss": 2.4088, + "theoretical_loss": 3.472913974976181, + "tokens_seen": 1728916480 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024051153460381144, + "loss": 2.6808, + "theoretical_loss": 3.472902682182564, + "tokens_seen": 1728982016 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024050150451354062, + "loss": 2.6202, + "theoretical_loss": 3.4728913899368337, + "tokens_seen": 1729047552 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024049147442326983, + "loss": 2.6492, + "theoretical_loss": 3.4728800982389414, + "tokens_seen": 1729113088 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024048144433299901, + "loss": 2.5454, + "theoretical_loss": 3.4728688070888407, + "tokens_seen": 1729178624 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002404714142427282, + "loss": 2.5783, + "theoretical_loss": 3.472857516486484, + "tokens_seen": 1729244160 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024046138415245738, + "loss": 2.7631, + "theoretical_loss": 3.472846226431824, + "tokens_seen": 1729309696 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024045135406218658, + "loss": 2.3464, + "theoretical_loss": 3.472834936924814, + "tokens_seen": 1729375232 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024044132397191577, + "loss": 2.505, + "theoretical_loss": 3.4728236479654058, + "tokens_seen": 1729440768 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024043129388164495, + "loss": 2.6703, + "theoretical_loss": 3.4728123595535525, + "tokens_seen": 1729506304 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024042126379137413, + "loss": 2.3717, + "theoretical_loss": 3.472801071689206, + "tokens_seen": 1729571840 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002404112337011033, + "loss": 2.6483, + "theoretical_loss": 3.4727897843723206, + "tokens_seen": 1729637376 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024040120361083252, + "loss": 2.6225, + "theoretical_loss": 3.4727784976028477, + "tokens_seen": 1729702912 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002403911735205617, + "loss": 2.6753, + "theoretical_loss": 3.47276721138074, + "tokens_seen": 1729768448 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024038114343029088, + "loss": 2.4896, + "theoretical_loss": 3.4727559257059513, + "tokens_seen": 1729833984 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024037111334002006, + "loss": 2.5744, + "theoretical_loss": 3.4727446405784335, + "tokens_seen": 1729899520 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024036108324974927, + "loss": 2.5379, + "theoretical_loss": 3.4727333559981393, + "tokens_seen": 1729965056 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024035105315947845, + "loss": 2.6269, + "theoretical_loss": 3.4727220719650216, + "tokens_seen": 1730030592 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024034102306920763, + "loss": 2.5651, + "theoretical_loss": 3.4727107884790334, + "tokens_seen": 1730096128 + }, + { + "epoch": 5.07, + "objective/train/docs_used": 1902785, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.749056816101074, + "objective/train/theoretical_loss": 3.4727023262235672, + "objective/train/tokens_used": 1750605280, + "theoretical_loss": 3.4727023262235672, + "tokens_seen": 1730145280 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002403309929789368, + "loss": 2.6097, + "theoretical_loss": 3.4726995055401266, + "tokens_seen": 1730161664 + }, + { + "epoch": 5.07, + "learning_rate": 0.000240320962888666, + "loss": 2.6942, + "theoretical_loss": 3.472688223148255, + "tokens_seen": 1730227200 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002403109327983952, + "loss": 2.793, + "theoretical_loss": 3.472676941303371, + "tokens_seen": 1730292736 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024030090270812438, + "loss": 2.5069, + "theoretical_loss": 3.4726656600054273, + "tokens_seen": 1730358272 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024029087261785356, + "loss": 2.6964, + "theoretical_loss": 3.4726543792543767, + "tokens_seen": 1730423808 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024028084252758274, + "loss": 2.5978, + "theoretical_loss": 3.472643099050172, + "tokens_seen": 1730489344 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024027081243731195, + "loss": 2.8703, + "theoretical_loss": 3.472631819392766, + "tokens_seen": 1730554880 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024026078234704113, + "loss": 2.498, + "theoretical_loss": 3.4726205402821106, + "tokens_seen": 1730620416 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024025075225677032, + "loss": 2.5523, + "theoretical_loss": 3.4726092617181603, + "tokens_seen": 1730685952 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002402407221664995, + "loss": 2.6235, + "theoretical_loss": 3.4725979837008665, + "tokens_seen": 1730751488 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024023069207622868, + "loss": 2.5798, + "theoretical_loss": 3.4725867062301825, + "tokens_seen": 1730817024 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024022066198595789, + "loss": 2.5761, + "theoretical_loss": 3.4725754293060613, + "tokens_seen": 1730882560 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024021063189568707, + "loss": 2.5696, + "theoretical_loss": 3.4725641529284554, + "tokens_seen": 1730948096 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024020060180541625, + "loss": 2.6959, + "theoretical_loss": 3.4725528770973177, + "tokens_seen": 1731013632 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024019057171514543, + "loss": 2.3862, + "theoretical_loss": 3.472541601812601, + "tokens_seen": 1731079168 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024018054162487464, + "loss": 2.7569, + "theoretical_loss": 3.472530327074259, + "tokens_seen": 1731144704 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024017051153460382, + "loss": 2.4936, + "theoretical_loss": 3.472519052882243, + "tokens_seen": 1731210240 + }, + { + "epoch": 5.07, + "learning_rate": 0.000240160481444333, + "loss": 2.7608, + "theoretical_loss": 3.4725077792365067, + "tokens_seen": 1731275776 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024015045135406218, + "loss": 2.613, + "theoretical_loss": 3.472496506137003, + "tokens_seen": 1731341312 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024014042126379136, + "loss": 2.5029, + "theoretical_loss": 3.4724852335836847, + "tokens_seen": 1731406848 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024013039117352057, + "loss": 2.5302, + "theoretical_loss": 3.4724739615765046, + "tokens_seen": 1731472384 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024012036108324975, + "loss": 2.7247, + "theoretical_loss": 3.4724626901154156, + "tokens_seen": 1731537920 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024011033099297893, + "loss": 2.4738, + "theoretical_loss": 3.472451419200371, + "tokens_seen": 1731603456 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024010030090270811, + "loss": 2.4587, + "theoretical_loss": 3.472440148831322, + "tokens_seen": 1731668992 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024009027081243732, + "loss": 2.4913, + "theoretical_loss": 3.4724288790082234, + "tokens_seen": 1731734528 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1903894, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5215156078338623, + "objective/train/theoretical_loss": 3.4724204269991503, + "objective/train/tokens_used": 1752243680, + "theoretical_loss": 3.4724204269991503, + "tokens_seen": 1731783680 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002400802407221665, + "loss": 2.5855, + "theoretical_loss": 3.4724176097310275, + "tokens_seen": 1731800064 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024007021063189568, + "loss": 2.5977, + "theoretical_loss": 3.4724063409996866, + "tokens_seen": 1731865600 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024006018054162487, + "loss": 2.6142, + "theoretical_loss": 3.4723950728141544, + "tokens_seen": 1731931136 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024005015045135407, + "loss": 2.7326, + "theoretical_loss": 3.472383805174384, + "tokens_seen": 1731996672 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024004012036108325, + "loss": 2.8013, + "theoretical_loss": 3.4723725380803274, + "tokens_seen": 1732062208 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024003009027081244, + "loss": 2.6424, + "theoretical_loss": 3.4723612715319376, + "tokens_seen": 1732127744 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024002006018054162, + "loss": 2.5411, + "theoretical_loss": 3.4723500055291687, + "tokens_seen": 1732193280 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002400100300902708, + "loss": 2.5456, + "theoretical_loss": 3.4723387400719723, + "tokens_seen": 1732258816 + }, + { + "epoch": 5.08, + "learning_rate": 0.00024, + "loss": 2.492, + "theoretical_loss": 3.472327475160302, + "tokens_seen": 1732324352 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002399899699097292, + "loss": 2.6451, + "theoretical_loss": 3.4723162107941103, + "tokens_seen": 1732389888 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023997993981945837, + "loss": 2.4133, + "theoretical_loss": 3.4723049469733507, + "tokens_seen": 1732455424 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023996990972918755, + "loss": 2.5842, + "theoretical_loss": 3.472293683697976, + "tokens_seen": 1732520960 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023995987963891676, + "loss": 2.6355, + "theoretical_loss": 3.4722824209679395, + "tokens_seen": 1732586496 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023994984954864594, + "loss": 2.6265, + "theoretical_loss": 3.472271158783193, + "tokens_seen": 1732652032 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023993981945837512, + "loss": 2.6597, + "theoretical_loss": 3.4722598971436907, + "tokens_seen": 1732717568 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002399297893681043, + "loss": 2.667, + "theoretical_loss": 3.472248636049385, + "tokens_seen": 1732783104 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023991975927783348, + "loss": 2.6496, + "theoretical_loss": 3.472237375500229, + "tokens_seen": 1732848640 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002399097291875627, + "loss": 2.8399, + "theoretical_loss": 3.4722261154961753, + "tokens_seen": 1732914176 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023989969909729187, + "loss": 2.3269, + "theoretical_loss": 3.472214856037178, + "tokens_seen": 1732979712 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023988966900702105, + "loss": 2.8175, + "theoretical_loss": 3.4722035971231886, + "tokens_seen": 1733045248 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023987963891675023, + "loss": 2.8205, + "theoretical_loss": 3.4721923387541613, + "tokens_seen": 1733110784 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023986960882647944, + "loss": 2.7193, + "theoretical_loss": 3.4721810809300493, + "tokens_seen": 1733176320 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023985957873620865, + "loss": 2.5636, + "theoretical_loss": 3.4721698236508045, + "tokens_seen": 1733241856 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023984954864593783, + "loss": 2.6271, + "theoretical_loss": 3.4721585669163804, + "tokens_seen": 1733307392 + }, + { + "epoch": 5.08, + "learning_rate": 0.000239839518555667, + "loss": 2.5747, + "theoretical_loss": 3.47214731072673, + "tokens_seen": 1733372928 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1904516, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.672600507736206, + "objective/train/theoretical_loss": 3.472138868941972, + "objective/train/tokens_used": 1753882080, + "theoretical_loss": 3.472138868941972, + "tokens_seen": 1733422080 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002398294884653962, + "loss": 2.8116, + "theoretical_loss": 3.4721360550818066, + "tokens_seen": 1733438464 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002398194583751254, + "loss": 2.6618, + "theoretical_loss": 3.4721247999815636, + "tokens_seen": 1733504000 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023980942828485458, + "loss": 2.7925, + "theoretical_loss": 3.4721135454259526, + "tokens_seen": 1733569536 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023979939819458376, + "loss": 2.6558, + "theoretical_loss": 3.472102291414928, + "tokens_seen": 1733635072 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023978936810431295, + "loss": 2.7391, + "theoretical_loss": 3.472091037948443, + "tokens_seen": 1733700608 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023977933801404215, + "loss": 2.594, + "theoretical_loss": 3.47207978502645, + "tokens_seen": 1733766144 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023976930792377133, + "loss": 2.6426, + "theoretical_loss": 3.472068532648902, + "tokens_seen": 1733831680 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023975927783350052, + "loss": 2.7319, + "theoretical_loss": 3.472057280815752, + "tokens_seen": 1733897216 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002397492477432297, + "loss": 2.6239, + "theoretical_loss": 3.4720460295269535, + "tokens_seen": 1733962752 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023973921765295888, + "loss": 2.7048, + "theoretical_loss": 3.4720347787824597, + "tokens_seen": 1734028288 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023972918756268809, + "loss": 2.6214, + "theoretical_loss": 3.4720235285822234, + "tokens_seen": 1734093824 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023971915747241727, + "loss": 2.5345, + "theoretical_loss": 3.472012278926198, + "tokens_seen": 1734159360 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023970912738214645, + "loss": 2.7479, + "theoretical_loss": 3.4720010298143356, + "tokens_seen": 1734224896 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023969909729187563, + "loss": 2.6333, + "theoretical_loss": 3.4719897812465907, + "tokens_seen": 1734290432 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023968906720160484, + "loss": 2.3515, + "theoretical_loss": 3.4719785332229156, + "tokens_seen": 1734355968 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023967903711133402, + "loss": 2.5759, + "theoretical_loss": 3.471967285743264, + "tokens_seen": 1734421504 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002396690070210632, + "loss": 2.743, + "theoretical_loss": 3.4719560388075887, + "tokens_seen": 1734487040 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023965897693079238, + "loss": 2.6429, + "theoretical_loss": 3.4719447924158424, + "tokens_seen": 1734552576 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023964894684052156, + "loss": 2.661, + "theoretical_loss": 3.471933546567979, + "tokens_seen": 1734618112 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023963891675025077, + "loss": 2.6938, + "theoretical_loss": 3.471922301263951, + "tokens_seen": 1734683648 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023962888665997995, + "loss": 2.5533, + "theoretical_loss": 3.4719110565037123, + "tokens_seen": 1734749184 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023961885656970913, + "loss": 2.5957, + "theoretical_loss": 3.4718998122872153, + "tokens_seen": 1734814720 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023960882647943831, + "loss": 2.4648, + "theoretical_loss": 3.471888568614413, + "tokens_seen": 1734880256 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023959879638916752, + "loss": 2.5432, + "theoretical_loss": 3.47187732548526, + "tokens_seen": 1734945792 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002395887662988967, + "loss": 2.3256, + "theoretical_loss": 3.4718660828997088, + "tokens_seen": 1735011328 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1905870, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5277462005615234, + "objective/train/theoretical_loss": 3.471857651317255, + "objective/train/tokens_used": 1755520480, + "theoretical_loss": 3.471857651317255, + "tokens_seen": 1735060480 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023957873620862588, + "loss": 2.6839, + "theoretical_loss": 3.4718548408577115, + "tokens_seen": 1735076864 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023956870611835507, + "loss": 2.5377, + "theoretical_loss": 3.4718435993592225, + "tokens_seen": 1735142400 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023955867602808427, + "loss": 2.6352, + "theoretical_loss": 3.4718323584041944, + "tokens_seen": 1735207936 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023954864593781345, + "loss": 2.8516, + "theoretical_loss": 3.4718211179925804, + "tokens_seen": 1735273472 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023953861584754264, + "loss": 2.7982, + "theoretical_loss": 3.4718098781243345, + "tokens_seen": 1735339008 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023952858575727182, + "loss": 2.4681, + "theoretical_loss": 3.4717986387994095, + "tokens_seen": 1735404544 + }, + { + "epoch": 5.08, + "learning_rate": 0.000239518555667001, + "loss": 2.5523, + "theoretical_loss": 3.471787400017758, + "tokens_seen": 1735470080 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002395085255767302, + "loss": 2.5375, + "theoretical_loss": 3.4717761617793337, + "tokens_seen": 1735535616 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002394984954864594, + "loss": 2.583, + "theoretical_loss": 3.47176492408409, + "tokens_seen": 1735601152 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023948846539618857, + "loss": 2.5242, + "theoretical_loss": 3.4717536869319803, + "tokens_seen": 1735666688 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023947843530591775, + "loss": 2.5072, + "theoretical_loss": 3.4717424503229566, + "tokens_seen": 1735732224 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023946840521564696, + "loss": 2.6157, + "theoretical_loss": 3.471731214256974, + "tokens_seen": 1735797760 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023945837512537614, + "loss": 2.6391, + "theoretical_loss": 3.4717199787339843, + "tokens_seen": 1735863296 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023944834503510532, + "loss": 2.3814, + "theoretical_loss": 3.4717087437539416, + "tokens_seen": 1735928832 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002394383149448345, + "loss": 2.6469, + "theoretical_loss": 3.4716975093167983, + "tokens_seen": 1735994368 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023942828485456368, + "loss": 2.8137, + "theoretical_loss": 3.471686275422509, + "tokens_seen": 1736059904 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002394182547642929, + "loss": 2.7251, + "theoretical_loss": 3.471675042071025, + "tokens_seen": 1736125440 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023940822467402207, + "loss": 2.559, + "theoretical_loss": 3.471663809262302, + "tokens_seen": 1736190976 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023939819458375125, + "loss": 2.5346, + "theoretical_loss": 3.4716525769962914, + "tokens_seen": 1736256512 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023938816449348043, + "loss": 2.8473, + "theoretical_loss": 3.471641345272947, + "tokens_seen": 1736322048 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023937813440320964, + "loss": 2.6622, + "theoretical_loss": 3.471630114092223, + "tokens_seen": 1736387584 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023936810431293882, + "loss": 2.6066, + "theoretical_loss": 3.471618883454072, + "tokens_seen": 1736453120 + }, + { + "epoch": 5.08, + "learning_rate": 0.000239358074222668, + "loss": 2.6224, + "theoretical_loss": 3.4716076533584466, + "tokens_seen": 1736518656 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023934804413239719, + "loss": 2.4147, + "theoretical_loss": 3.471596423805301, + "tokens_seen": 1736584192 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023933801404212637, + "loss": 2.7065, + "theoretical_loss": 3.4715851947945886, + "tokens_seen": 1736649728 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1906597, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4826974868774414, + "objective/train/theoretical_loss": 3.4715767733924974, + "objective/train/tokens_used": 1757158880, + "theoretical_loss": 3.4715767733924974, + "tokens_seen": 1736698880 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023932798395185557, + "loss": 2.7574, + "theoretical_loss": 3.471573966326262, + "tokens_seen": 1736715264 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023931795386158476, + "loss": 2.6633, + "theoretical_loss": 3.4715627384002756, + "tokens_seen": 1736780800 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023930792377131394, + "loss": 2.4531, + "theoretical_loss": 3.4715515110165818, + "tokens_seen": 1736846336 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023929789368104312, + "loss": 2.7052, + "theoretical_loss": 3.471540284175134, + "tokens_seen": 1736911872 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023928786359077233, + "loss": 2.4909, + "theoretical_loss": 3.4715290578758857, + "tokens_seen": 1736977408 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002392778335005015, + "loss": 2.5898, + "theoretical_loss": 3.471517832118791, + "tokens_seen": 1737042944 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002392678034102307, + "loss": 2.6471, + "theoretical_loss": 3.4715066069038025, + "tokens_seen": 1737108480 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023925777331995987, + "loss": 2.8092, + "theoretical_loss": 3.4714953822308736, + "tokens_seen": 1737174016 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023924774322968905, + "loss": 2.6062, + "theoretical_loss": 3.4714841580999583, + "tokens_seen": 1737239552 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023923771313941826, + "loss": 2.5126, + "theoretical_loss": 3.4714729345110085, + "tokens_seen": 1737305088 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023922768304914744, + "loss": 2.5342, + "theoretical_loss": 3.4714617114639794, + "tokens_seen": 1737370624 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023921765295887662, + "loss": 2.4998, + "theoretical_loss": 3.471450488958823, + "tokens_seen": 1737436160 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002392076228686058, + "loss": 2.6087, + "theoretical_loss": 3.4714392669954934, + "tokens_seen": 1737501696 + }, + { + "epoch": 5.08, + "learning_rate": 0.000239197592778335, + "loss": 2.703, + "theoretical_loss": 3.471428045573944, + "tokens_seen": 1737567232 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002391875626880642, + "loss": 2.5164, + "theoretical_loss": 3.4714168246941286, + "tokens_seen": 1737632768 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023917753259779337, + "loss": 2.4231, + "theoretical_loss": 3.4714056043559993, + "tokens_seen": 1737698304 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023916750250752255, + "loss": 2.4953, + "theoretical_loss": 3.4713943845595105, + "tokens_seen": 1737763840 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023915747241725174, + "loss": 2.6741, + "theoretical_loss": 3.4713831653046157, + "tokens_seen": 1737829376 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023914744232698094, + "loss": 2.6472, + "theoretical_loss": 3.471371946591268, + "tokens_seen": 1737894912 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023913741223671012, + "loss": 2.3422, + "theoretical_loss": 3.4713607284194206, + "tokens_seen": 1737960448 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002391273821464393, + "loss": 2.5982, + "theoretical_loss": 3.471349510789028, + "tokens_seen": 1738025984 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002391173520561685, + "loss": 2.6063, + "theoretical_loss": 3.4713382937000423, + "tokens_seen": 1738091520 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023910732196589772, + "loss": 2.6709, + "theoretical_loss": 3.4713270771524174, + "tokens_seen": 1738157056 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002390972918756269, + "loss": 2.6769, + "theoretical_loss": 3.471315861146108, + "tokens_seen": 1738222592 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023908726178535608, + "loss": 2.5454, + "theoretical_loss": 3.471304645681065, + "tokens_seen": 1738288128 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1908064, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.734844446182251, + "objective/train/theoretical_loss": 3.4712962344374625, + "objective/train/tokens_used": 1758797280, + "theoretical_loss": 3.4712962344374625, + "tokens_seen": 1738337280 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023907723169508527, + "loss": 2.6917, + "theoretical_loss": 3.4712934307572443, + "tokens_seen": 1738353664 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023906720160481447, + "loss": 2.5215, + "theoretical_loss": 3.4712822163745987, + "tokens_seen": 1738419200 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023905717151454365, + "loss": 2.7552, + "theoretical_loss": 3.471271002533081, + "tokens_seen": 1738484736 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023904714142427284, + "loss": 2.671, + "theoretical_loss": 3.471259789232645, + "tokens_seen": 1738550272 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023903711133400202, + "loss": 2.7007, + "theoretical_loss": 3.4712485764732444, + "tokens_seen": 1738615808 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002390270812437312, + "loss": 2.5182, + "theoretical_loss": 3.471237364254833, + "tokens_seen": 1738681344 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002390170511534604, + "loss": 2.7234, + "theoretical_loss": 3.4712261525773638, + "tokens_seen": 1738746880 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002390070210631896, + "loss": 2.6131, + "theoretical_loss": 3.47121494144079, + "tokens_seen": 1738812416 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023899699097291877, + "loss": 2.6319, + "theoretical_loss": 3.471203730845066, + "tokens_seen": 1738877952 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023898696088264795, + "loss": 2.3191, + "theoretical_loss": 3.4711925207901446, + "tokens_seen": 1738943488 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023897693079237716, + "loss": 2.726, + "theoretical_loss": 3.47118131127598, + "tokens_seen": 1739009024 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023896690070210634, + "loss": 2.6659, + "theoretical_loss": 3.4711701023025254, + "tokens_seen": 1739074560 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023895687061183552, + "loss": 2.5602, + "theoretical_loss": 3.4711588938697338, + "tokens_seen": 1739140096 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002389468405215647, + "loss": 2.6598, + "theoretical_loss": 3.47114768597756, + "tokens_seen": 1739205632 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023893681043129388, + "loss": 2.3531, + "theoretical_loss": 3.471136478625956, + "tokens_seen": 1739271168 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002389267803410231, + "loss": 2.3663, + "theoretical_loss": 3.471125271814877, + "tokens_seen": 1739336704 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023891675025075227, + "loss": 2.7585, + "theoretical_loss": 3.4711140655442754, + "tokens_seen": 1739402240 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023890672016048145, + "loss": 2.7792, + "theoretical_loss": 3.471102859814105, + "tokens_seen": 1739467776 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023889669007021063, + "loss": 2.6674, + "theoretical_loss": 3.4710916546243196, + "tokens_seen": 1739533312 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023888665997993984, + "loss": 2.627, + "theoretical_loss": 3.4710804499748726, + "tokens_seen": 1739598848 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023887662988966902, + "loss": 2.688, + "theoretical_loss": 3.471069245865718, + "tokens_seen": 1739664384 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002388665997993982, + "loss": 2.6192, + "theoretical_loss": 3.471058042296809, + "tokens_seen": 1739729920 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023885656970912739, + "loss": 2.5775, + "theoretical_loss": 3.471046839268099, + "tokens_seen": 1739795456 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023884653961885657, + "loss": 2.8078, + "theoretical_loss": 3.4710356367795416, + "tokens_seen": 1739860992 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023883650952858577, + "loss": 2.6775, + "theoretical_loss": 3.471024434831091, + "tokens_seen": 1739926528 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1908879, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5237278938293457, + "objective/train/theoretical_loss": 3.47101603372417, + "objective/train/tokens_used": 1760435680, + "theoretical_loss": 3.47101603372417, + "tokens_seen": 1739975680 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023882647943831496, + "loss": 2.5443, + "theoretical_loss": 3.4710132334227004, + "tokens_seen": 1739992064 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023881644934804414, + "loss": 2.5402, + "theoretical_loss": 3.4710020325543236, + "tokens_seen": 1740057600 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023880641925777332, + "loss": 2.487, + "theoretical_loss": 3.4709908322259144, + "tokens_seen": 1740123136 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023879638916750253, + "loss": 2.5886, + "theoretical_loss": 3.470979632437426, + "tokens_seen": 1740188672 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002387863590772317, + "loss": 2.4845, + "theoretical_loss": 3.470968433188812, + "tokens_seen": 1740254208 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002387763289869609, + "loss": 2.5889, + "theoretical_loss": 3.4709572344800264, + "tokens_seen": 1740319744 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023876629889669007, + "loss": 2.4999, + "theoretical_loss": 3.4709460363110227, + "tokens_seen": 1740385280 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023875626880641925, + "loss": 2.6309, + "theoretical_loss": 3.470934838681755, + "tokens_seen": 1740450816 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023874623871614846, + "loss": 2.8465, + "theoretical_loss": 3.470923641592176, + "tokens_seen": 1740516352 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023873620862587764, + "loss": 2.7537, + "theoretical_loss": 3.4709124450422397, + "tokens_seen": 1740581888 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023872617853560682, + "loss": 2.4395, + "theoretical_loss": 3.4709012490319004, + "tokens_seen": 1740647424 + }, + { + "epoch": 5.08, + "learning_rate": 0.000238716148445336, + "loss": 2.6418, + "theoretical_loss": 3.470890053561112, + "tokens_seen": 1740712960 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002387061183550652, + "loss": 2.3037, + "theoretical_loss": 3.4708788586298267, + "tokens_seen": 1740778496 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002386960882647944, + "loss": 2.3457, + "theoretical_loss": 3.4708676642379994, + "tokens_seen": 1740844032 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023868605817452357, + "loss": 2.735, + "theoretical_loss": 3.4708564703855833, + "tokens_seen": 1740909568 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023867602808425275, + "loss": 2.5952, + "theoretical_loss": 3.4708452770725327, + "tokens_seen": 1740975104 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023866599799398194, + "loss": 2.4345, + "theoretical_loss": 3.4708340842988, + "tokens_seen": 1741040640 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023865596790371114, + "loss": 2.3175, + "theoretical_loss": 3.4708228920643402, + "tokens_seen": 1741106176 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023864593781344032, + "loss": 2.614, + "theoretical_loss": 3.470811700369107, + "tokens_seen": 1741171712 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002386359077231695, + "loss": 2.6332, + "theoretical_loss": 3.470800509213053, + "tokens_seen": 1741237248 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002386258776328987, + "loss": 2.7269, + "theoretical_loss": 3.470789318596133, + "tokens_seen": 1741302784 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002386158475426279, + "loss": 2.4992, + "theoretical_loss": 3.4707781285183, + "tokens_seen": 1741368320 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023860581745235708, + "loss": 2.7416, + "theoretical_loss": 3.4707669389795086, + "tokens_seen": 1741433856 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023859578736208626, + "loss": 2.6502, + "theoretical_loss": 3.4707557499797126, + "tokens_seen": 1741499392 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023858575727181544, + "loss": 2.59, + "theoretical_loss": 3.470744561518864, + "tokens_seen": 1741564928 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1910122, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2382304668426514, + "objective/train/theoretical_loss": 3.4707361705268855, + "objective/train/tokens_used": 1762074080, + "theoretical_loss": 3.4707361705268855, + "tokens_seen": 1741614080 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023857572718154462, + "loss": 2.7162, + "theoretical_loss": 3.470733373596919, + "tokens_seen": 1741630464 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023856569709127383, + "loss": 2.6985, + "theoretical_loss": 3.4707221862138296, + "tokens_seen": 1741696000 + }, + { + "epoch": 5.08, + "learning_rate": 0.000238555667001003, + "loss": 2.6758, + "theoretical_loss": 3.47071099936955, + "tokens_seen": 1741761536 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002385456369107322, + "loss": 2.5572, + "theoretical_loss": 3.4706998130640345, + "tokens_seen": 1741827072 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023853560682046137, + "loss": 2.6633, + "theoretical_loss": 3.470688627297236, + "tokens_seen": 1741892608 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023852557673019058, + "loss": 2.7351, + "theoretical_loss": 3.470677442069109, + "tokens_seen": 1741958144 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023851554663991976, + "loss": 2.3723, + "theoretical_loss": 3.470666257379607, + "tokens_seen": 1742023680 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023850551654964894, + "loss": 2.7057, + "theoretical_loss": 3.4706550732286843, + "tokens_seen": 1742089216 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023849548645937812, + "loss": 2.5672, + "theoretical_loss": 3.470643889616294, + "tokens_seen": 1742154752 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023848545636910733, + "loss": 2.5477, + "theoretical_loss": 3.47063270654239, + "tokens_seen": 1742220288 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002384754262788365, + "loss": 2.7657, + "theoretical_loss": 3.4706215240069267, + "tokens_seen": 1742285824 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002384653961885657, + "loss": 2.3322, + "theoretical_loss": 3.4706103420098575, + "tokens_seen": 1742351360 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023845536609829487, + "loss": 2.3779, + "theoretical_loss": 3.470599160551136, + "tokens_seen": 1742416896 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023844533600802406, + "loss": 2.4812, + "theoretical_loss": 3.470587979630716, + "tokens_seen": 1742482432 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023843530591775326, + "loss": 2.3666, + "theoretical_loss": 3.4705767992485526, + "tokens_seen": 1742547968 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023842527582748244, + "loss": 2.5868, + "theoretical_loss": 3.4705656194045984, + "tokens_seen": 1742613504 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023841524573721163, + "loss": 2.5222, + "theoretical_loss": 3.470554440098807, + "tokens_seen": 1742679040 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002384052156469408, + "loss": 2.7548, + "theoretical_loss": 3.4705432613311333, + "tokens_seen": 1742744576 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023839518555667002, + "loss": 2.5344, + "theoretical_loss": 3.4705320831015305, + "tokens_seen": 1742810112 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002383851554663992, + "loss": 2.6955, + "theoretical_loss": 3.470520905409953, + "tokens_seen": 1742875648 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023837512537612838, + "loss": 2.5826, + "theoretical_loss": 3.4705097282563537, + "tokens_seen": 1742941184 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023836509528585756, + "loss": 2.5917, + "theoretical_loss": 3.470498551640687, + "tokens_seen": 1743006720 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023835506519558677, + "loss": 2.5342, + "theoretical_loss": 3.4704873755629073, + "tokens_seen": 1743072256 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023834503510531598, + "loss": 2.7632, + "theoretical_loss": 3.470476200022968, + "tokens_seen": 1743137792 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023833500501504516, + "loss": 2.7415, + "theoretical_loss": 3.470465025020823, + "tokens_seen": 1743203328 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1910721, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3395133018493652, + "objective/train/theoretical_loss": 3.4704566441221143, + "objective/train/tokens_used": 1763712480, + "theoretical_loss": 3.4704566441221143, + "tokens_seen": 1743252480 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023832497492477434, + "loss": 2.5318, + "theoretical_loss": 3.470453850556426, + "tokens_seen": 1743268864 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023831494483450352, + "loss": 2.6885, + "theoretical_loss": 3.470442676629732, + "tokens_seen": 1743334400 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023830491474423273, + "loss": 2.6771, + "theoretical_loss": 3.4704315032406936, + "tokens_seen": 1743399936 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002382948846539619, + "loss": 2.6166, + "theoretical_loss": 3.470420330389265, + "tokens_seen": 1743465472 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002382848545636911, + "loss": 2.6022, + "theoretical_loss": 3.470409158075401, + "tokens_seen": 1743531008 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023827482447342027, + "loss": 2.7071, + "theoretical_loss": 3.470397986299054, + "tokens_seen": 1743596544 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023826479438314945, + "loss": 2.6702, + "theoretical_loss": 3.470386815060179, + "tokens_seen": 1743662080 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023825476429287866, + "loss": 2.7055, + "theoretical_loss": 3.47037564435873, + "tokens_seen": 1743727616 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023824473420260784, + "loss": 2.4686, + "theoretical_loss": 3.470364474194661, + "tokens_seen": 1743793152 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023823470411233702, + "loss": 2.7126, + "theoretical_loss": 3.470353304567925, + "tokens_seen": 1743858688 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002382246740220662, + "loss": 2.5967, + "theoretical_loss": 3.470342135478477, + "tokens_seen": 1743924224 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002382146439317954, + "loss": 2.5003, + "theoretical_loss": 3.4703309669262703, + "tokens_seen": 1743989760 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002382046138415246, + "loss": 2.6342, + "theoretical_loss": 3.4703197989112597, + "tokens_seen": 1744055296 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023819458375125377, + "loss": 2.6063, + "theoretical_loss": 3.470308631433398, + "tokens_seen": 1744120832 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023818455366098295, + "loss": 2.341, + "theoretical_loss": 3.4702974644926403, + "tokens_seen": 1744186368 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023817452357071214, + "loss": 2.8871, + "theoretical_loss": 3.4702862980889395, + "tokens_seen": 1744251904 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023816449348044134, + "loss": 2.4633, + "theoretical_loss": 3.470275132222251, + "tokens_seen": 1744317440 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023815446339017052, + "loss": 2.6539, + "theoretical_loss": 3.470263966892527, + "tokens_seen": 1744382976 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002381444332998997, + "loss": 2.5904, + "theoretical_loss": 3.470252802099723, + "tokens_seen": 1744448512 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002381344032096289, + "loss": 2.7351, + "theoretical_loss": 3.4702416378437926, + "tokens_seen": 1744514048 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002381243731193581, + "loss": 2.5575, + "theoretical_loss": 3.4702304741246897, + "tokens_seen": 1744579584 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023811434302908728, + "loss": 2.4828, + "theoretical_loss": 3.470219310942368, + "tokens_seen": 1744645120 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023810431293881646, + "loss": 2.6416, + "theoretical_loss": 3.4702081482967824, + "tokens_seen": 1744710656 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023809428284854564, + "loss": 2.4422, + "theoretical_loss": 3.4701969861878865, + "tokens_seen": 1744776192 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023808425275827482, + "loss": 2.6188, + "theoretical_loss": 3.4701858246156334, + "tokens_seen": 1744841728 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1911241, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1583056449890137, + "objective/train/theoretical_loss": 3.470177453788589, + "objective/train/tokens_used": 1765350880, + "theoretical_loss": 3.470177453788589, + "tokens_seen": 1744890880 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023807422266800403, + "loss": 2.5005, + "theoretical_loss": 3.4701746635799786, + "tokens_seen": 1744907264 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002380641925777332, + "loss": 2.5805, + "theoretical_loss": 3.4701635030808755, + "tokens_seen": 1744972800 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002380541624874624, + "loss": 2.6591, + "theoretical_loss": 3.4701523431182784, + "tokens_seen": 1745038336 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023804413239719157, + "loss": 2.68, + "theoretical_loss": 3.4701411836921405, + "tokens_seen": 1745103872 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023803410230692078, + "loss": 2.8191, + "theoretical_loss": 3.470130024802417, + "tokens_seen": 1745169408 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023802407221664996, + "loss": 2.6565, + "theoretical_loss": 3.470118866449061, + "tokens_seen": 1745234944 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023801404212637914, + "loss": 2.5089, + "theoretical_loss": 3.470107708632028, + "tokens_seen": 1745300480 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023800401203610832, + "loss": 2.7494, + "theoretical_loss": 3.4700965513512703, + "tokens_seen": 1745366016 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023799398194583753, + "loss": 2.5532, + "theoretical_loss": 3.470085394606744, + "tokens_seen": 1745431552 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002379839518555667, + "loss": 2.4392, + "theoretical_loss": 3.470074238398401, + "tokens_seen": 1745497088 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002379739217652959, + "loss": 2.6781, + "theoretical_loss": 3.4700630827261962, + "tokens_seen": 1745562624 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023796389167502507, + "loss": 2.069, + "theoretical_loss": 3.4700519275900845, + "tokens_seen": 1745628160 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023795386158475426, + "loss": 2.551, + "theoretical_loss": 3.4700407729900196, + "tokens_seen": 1745693696 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023794383149448346, + "loss": 2.6675, + "theoretical_loss": 3.470029618925955, + "tokens_seen": 1745759232 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023793380140421265, + "loss": 2.5803, + "theoretical_loss": 3.470018465397846, + "tokens_seen": 1745824768 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023792377131394183, + "loss": 2.3581, + "theoretical_loss": 3.470007312405645, + "tokens_seen": 1745890304 + }, + { + "epoch": 5.08, + "learning_rate": 0.000237913741223671, + "loss": 2.5288, + "theoretical_loss": 3.469996159949308, + "tokens_seen": 1745955840 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023790371113340022, + "loss": 2.9079, + "theoretical_loss": 3.469985008028788, + "tokens_seen": 1746021376 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002378936810431294, + "loss": 2.4018, + "theoretical_loss": 3.4699738566440397, + "tokens_seen": 1746086912 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023788365095285858, + "loss": 2.4097, + "theoretical_loss": 3.469962705795017, + "tokens_seen": 1746152448 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023787362086258776, + "loss": 2.582, + "theoretical_loss": 3.469951555481674, + "tokens_seen": 1746217984 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023786359077231694, + "loss": 2.6095, + "theoretical_loss": 3.469940405703965, + "tokens_seen": 1746283520 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023785356068204615, + "loss": 2.4794, + "theoretical_loss": 3.4699292564618442, + "tokens_seen": 1746349056 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023784353059177533, + "loss": 2.5863, + "theoretical_loss": 3.469918107755265, + "tokens_seen": 1746414592 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002378335005015045, + "loss": 2.5886, + "theoretical_loss": 3.4699069595841827, + "tokens_seen": 1746480128 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1912303, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5562360286712646, + "objective/train/theoretical_loss": 3.469898598807263, + "objective/train/tokens_used": 1766989280, + "theoretical_loss": 3.469898598807263, + "tokens_seen": 1746529280 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002378234704112337, + "loss": 2.5574, + "theoretical_loss": 3.4698958119485512, + "tokens_seen": 1746545664 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002378134403209629, + "loss": 2.5225, + "theoretical_loss": 3.4698846648483244, + "tokens_seen": 1746611200 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023780341023069208, + "loss": 2.7256, + "theoretical_loss": 3.469873518283457, + "tokens_seen": 1746676736 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023779338014042126, + "loss": 2.5, + "theoretical_loss": 3.4698623722539024, + "tokens_seen": 1746742272 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023778335005015044, + "loss": 2.419, + "theoretical_loss": 3.4698512267596153, + "tokens_seen": 1746807808 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023777331995987962, + "loss": 2.6592, + "theoretical_loss": 3.46984008180055, + "tokens_seen": 1746873344 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023776328986960883, + "loss": 2.4534, + "theoretical_loss": 3.4698289373766604, + "tokens_seen": 1746938880 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023775325977933801, + "loss": 2.5242, + "theoretical_loss": 3.469817793487901, + "tokens_seen": 1747004416 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002377432296890672, + "loss": 2.6479, + "theoretical_loss": 3.4698066501342257, + "tokens_seen": 1747069952 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023773319959879638, + "loss": 2.4235, + "theoretical_loss": 3.4697955073155895, + "tokens_seen": 1747135488 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023772316950852558, + "loss": 2.6265, + "theoretical_loss": 3.4697843650319458, + "tokens_seen": 1747201024 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023771313941825477, + "loss": 2.3728, + "theoretical_loss": 3.469773223283249, + "tokens_seen": 1747266560 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023770310932798395, + "loss": 2.5163, + "theoretical_loss": 3.4697620820694537, + "tokens_seen": 1747332096 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023769307923771313, + "loss": 2.7606, + "theoretical_loss": 3.4697509413905143, + "tokens_seen": 1747397632 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002376830491474423, + "loss": 2.7657, + "theoretical_loss": 3.4697398012463845, + "tokens_seen": 1747463168 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023767301905717152, + "loss": 2.6913, + "theoretical_loss": 3.4697286616370184, + "tokens_seen": 1747528704 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002376629889669007, + "loss": 2.4656, + "theoretical_loss": 3.4697175225623713, + "tokens_seen": 1747594240 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023765295887662988, + "loss": 2.3816, + "theoretical_loss": 3.4697063840223965, + "tokens_seen": 1747659776 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023764292878635906, + "loss": 2.6114, + "theoretical_loss": 3.4696952460170487, + "tokens_seen": 1747725312 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023763289869608827, + "loss": 2.2499, + "theoretical_loss": 3.469684108546282, + "tokens_seen": 1747790848 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023762286860581745, + "loss": 2.4845, + "theoretical_loss": 3.469672971610051, + "tokens_seen": 1747856384 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023761283851554663, + "loss": 2.4963, + "theoretical_loss": 3.4696618352083104, + "tokens_seen": 1747921920 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023760280842527584, + "loss": 2.5093, + "theoretical_loss": 3.4696506993410132, + "tokens_seen": 1747987456 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023759277833500502, + "loss": 2.526, + "theoretical_loss": 3.469639564008115, + "tokens_seen": 1748052992 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023758274824473423, + "loss": 2.5725, + "theoretical_loss": 3.4696284292095694, + "tokens_seen": 1748118528 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1912905, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1926348209381104, + "objective/train/theoretical_loss": 3.4696200784613014, + "objective/train/tokens_used": 1768627680, + "theoretical_loss": 3.4696200784613014, + "tokens_seen": 1748167680 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002375727181544634, + "loss": 2.5438, + "theoretical_loss": 3.4696172949453308, + "tokens_seen": 1748184064 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002375626880641926, + "loss": 2.6008, + "theoretical_loss": 3.4696061612153537, + "tokens_seen": 1748249600 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023755265797392177, + "loss": 2.611, + "theoretical_loss": 3.469595028019593, + "tokens_seen": 1748315136 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023754262788365098, + "loss": 2.5359, + "theoretical_loss": 3.469583895358001, + "tokens_seen": 1748380672 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023753259779338016, + "loss": 2.5723, + "theoretical_loss": 3.469572763230535, + "tokens_seen": 1748446208 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023752256770310934, + "loss": 2.6016, + "theoretical_loss": 3.469561631637147, + "tokens_seen": 1748511744 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023751253761283852, + "loss": 2.7812, + "theoretical_loss": 3.4695505005777925, + "tokens_seen": 1748577280 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023750250752256773, + "loss": 2.7481, + "theoretical_loss": 3.4695393700524253, + "tokens_seen": 1748642816 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002374924774322969, + "loss": 2.7774, + "theoretical_loss": 3.469528240061001, + "tokens_seen": 1748708352 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002374824473420261, + "loss": 2.5839, + "theoretical_loss": 3.469517110603472, + "tokens_seen": 1748773888 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023747241725175527, + "loss": 2.6484, + "theoretical_loss": 3.469505981679794, + "tokens_seen": 1748839424 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023746238716148446, + "loss": 2.6807, + "theoretical_loss": 3.469494853289921, + "tokens_seen": 1748904960 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023745235707121366, + "loss": 2.4702, + "theoretical_loss": 3.4694837254338076, + "tokens_seen": 1748970496 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023744232698094285, + "loss": 2.4502, + "theoretical_loss": 3.469472598111408, + "tokens_seen": 1749036032 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023743229689067203, + "loss": 2.5881, + "theoretical_loss": 3.469461471322677, + "tokens_seen": 1749101568 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002374222668004012, + "loss": 2.3572, + "theoretical_loss": 3.4694503450675684, + "tokens_seen": 1749167104 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023741223671013042, + "loss": 2.6486, + "theoretical_loss": 3.469439219346037, + "tokens_seen": 1749232640 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002374022066198596, + "loss": 2.5639, + "theoretical_loss": 3.4694280941580367, + "tokens_seen": 1749298176 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023739217652958878, + "loss": 2.5482, + "theoretical_loss": 3.469416969503523, + "tokens_seen": 1749363712 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023738214643931796, + "loss": 2.4975, + "theoretical_loss": 3.4694058453824494, + "tokens_seen": 1749429248 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023737211634904714, + "loss": 2.4062, + "theoretical_loss": 3.469394721794771, + "tokens_seen": 1749494784 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023736208625877635, + "loss": 2.3567, + "theoretical_loss": 3.4693835987404418, + "tokens_seen": 1749560320 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023735205616850553, + "loss": 2.5393, + "theoretical_loss": 3.469372476219416, + "tokens_seen": 1749625856 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002373420260782347, + "loss": 2.4255, + "theoretical_loss": 3.4693613542316486, + "tokens_seen": 1749691392 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002373319959879639, + "loss": 2.4388, + "theoretical_loss": 3.469350232777094, + "tokens_seen": 1749756928 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1914076, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.612685203552246, + "objective/train/theoretical_loss": 3.469341892036071, + "objective/train/tokens_used": 1770266080, + "theoretical_loss": 3.469341892036071, + "tokens_seen": 1749806080 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002373219658976931, + "loss": 2.5625, + "theoretical_loss": 3.469339111855706, + "tokens_seen": 1749822464 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023731193580742228, + "loss": 2.5351, + "theoretical_loss": 3.46932799146744, + "tokens_seen": 1749888000 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023730190571715146, + "loss": 2.6054, + "theoretical_loss": 3.46931687161225, + "tokens_seen": 1749953536 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023729187562688064, + "loss": 2.687, + "theoretical_loss": 3.4693057522900905, + "tokens_seen": 1750019072 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023728184553660982, + "loss": 2.8091, + "theoretical_loss": 3.4692946335009163, + "tokens_seen": 1750084608 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023727181544633903, + "loss": 2.6319, + "theoretical_loss": 3.4692835152446815, + "tokens_seen": 1750150144 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023726178535606821, + "loss": 2.6145, + "theoretical_loss": 3.4692723975213404, + "tokens_seen": 1750215680 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002372517552657974, + "loss": 2.3468, + "theoretical_loss": 3.4692612803308482, + "tokens_seen": 1750281216 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023724172517552658, + "loss": 2.5325, + "theoretical_loss": 3.469250163673159, + "tokens_seen": 1750346752 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023723169508525578, + "loss": 2.6537, + "theoretical_loss": 3.4692390475482275, + "tokens_seen": 1750412288 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023722166499498497, + "loss": 2.6446, + "theoretical_loss": 3.4692279319560075, + "tokens_seen": 1750477824 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023721163490471415, + "loss": 2.6416, + "theoretical_loss": 3.469216816896455, + "tokens_seen": 1750543360 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023720160481444333, + "loss": 2.6147, + "theoretical_loss": 3.469205702369523, + "tokens_seen": 1750608896 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002371915747241725, + "loss": 2.7509, + "theoretical_loss": 3.469194588375167, + "tokens_seen": 1750674432 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023718154463390172, + "loss": 2.5367, + "theoretical_loss": 3.4691834749133412, + "tokens_seen": 1750739968 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002371715145436309, + "loss": 2.4031, + "theoretical_loss": 3.4691723619840005, + "tokens_seen": 1750805504 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023716148445336008, + "loss": 2.455, + "theoretical_loss": 3.4691612495870987, + "tokens_seen": 1750871040 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023715145436308926, + "loss": 2.655, + "theoretical_loss": 3.4691501377225906, + "tokens_seen": 1750936576 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023714142427281847, + "loss": 2.525, + "theoretical_loss": 3.4691390263904314, + "tokens_seen": 1751002112 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023713139418254765, + "loss": 2.4318, + "theoretical_loss": 3.469127915590575, + "tokens_seen": 1751067648 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023712136409227683, + "loss": 2.5182, + "theoretical_loss": 3.469116805322977, + "tokens_seen": 1751133184 + }, + { + "epoch": 5.08, + "learning_rate": 0.000237111334002006, + "loss": 2.5002, + "theoretical_loss": 3.46910569558759, + "tokens_seen": 1751198720 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002371013039117352, + "loss": 2.5603, + "theoretical_loss": 3.4690945863843705, + "tokens_seen": 1751264256 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002370912738214644, + "loss": 2.4171, + "theoretical_loss": 3.469083477713273, + "tokens_seen": 1751329792 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023708124373119358, + "loss": 2.5239, + "theoretical_loss": 3.4690723695742505, + "tokens_seen": 1751395328 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1914762, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5021634101867676, + "objective/train/theoretical_loss": 3.4690640388191314, + "objective/train/tokens_used": 1771904480, + "theoretical_loss": 3.4690640388191314, + "tokens_seen": 1751444480 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023707121364092276, + "loss": 2.629, + "theoretical_loss": 3.469061261967259, + "tokens_seen": 1751460864 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023706118355065194, + "loss": 2.5097, + "theoretical_loss": 3.469050154892253, + "tokens_seen": 1751526400 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023705115346038115, + "loss": 2.6499, + "theoretical_loss": 3.4690390483491864, + "tokens_seen": 1751591936 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023704112337011033, + "loss": 2.5563, + "theoretical_loss": 3.4690279423380144, + "tokens_seen": 1751657472 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023703109327983952, + "loss": 2.5532, + "theoretical_loss": 3.469016836858691, + "tokens_seen": 1751723008 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002370210631895687, + "loss": 2.6649, + "theoretical_loss": 3.469005731911172, + "tokens_seen": 1751788544 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023701103309929788, + "loss": 2.7437, + "theoretical_loss": 3.4689946274954115, + "tokens_seen": 1751854080 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023700100300902709, + "loss": 2.538, + "theoretical_loss": 3.4689835236113638, + "tokens_seen": 1751919616 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023699097291875627, + "loss": 2.6429, + "theoretical_loss": 3.468972420258983, + "tokens_seen": 1751985152 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023698094282848545, + "loss": 2.4713, + "theoretical_loss": 3.4689613174382252, + "tokens_seen": 1752050688 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023697091273821463, + "loss": 2.5272, + "theoretical_loss": 3.468950215149045, + "tokens_seen": 1752116224 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023696088264794384, + "loss": 2.6636, + "theoretical_loss": 3.4689391133913956, + "tokens_seen": 1752181760 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023695085255767302, + "loss": 2.5302, + "theoretical_loss": 3.4689280121652324, + "tokens_seen": 1752247296 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002369408224674022, + "loss": 2.5896, + "theoretical_loss": 3.4689169114705107, + "tokens_seen": 1752312832 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023693079237713138, + "loss": 2.5447, + "theoretical_loss": 3.4689058113071845, + "tokens_seen": 1752378368 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023692076228686056, + "loss": 2.4564, + "theoretical_loss": 3.4688947116752082, + "tokens_seen": 1752443904 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023691073219658977, + "loss": 2.8002, + "theoretical_loss": 3.4688836125745377, + "tokens_seen": 1752509440 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023690070210631895, + "loss": 2.4241, + "theoretical_loss": 3.4688725140051266, + "tokens_seen": 1752574976 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023689067201604813, + "loss": 2.7225, + "theoretical_loss": 3.46886141596693, + "tokens_seen": 1752640512 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002368806419257773, + "loss": 2.4833, + "theoretical_loss": 3.4688503184599027, + "tokens_seen": 1752706048 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023687061183550652, + "loss": 2.4478, + "theoretical_loss": 3.468839221483999, + "tokens_seen": 1752771584 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023686058174523573, + "loss": 2.6527, + "theoretical_loss": 3.468828125039175, + "tokens_seen": 1752837120 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002368505516549649, + "loss": 2.723, + "theoretical_loss": 3.4688170291253835, + "tokens_seen": 1752902656 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002368405215646941, + "loss": 2.7352, + "theoretical_loss": 3.46880593374258, + "tokens_seen": 1752968192 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002368304914744233, + "loss": 2.6156, + "theoretical_loss": 3.4687948388907195, + "tokens_seen": 1753033728 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1916031, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1218574047088623, + "objective/train/theoretical_loss": 3.468786518100228, + "objective/train/tokens_used": 1773542880, + "theoretical_loss": 3.468786518100228, + "tokens_seen": 1753082880 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023682046138415248, + "loss": 2.393, + "theoretical_loss": 3.4687837445697562, + "tokens_seen": 1753099264 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023681043129388166, + "loss": 2.4588, + "theoretical_loss": 3.4687726507796457, + "tokens_seen": 1753164800 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023680040120361084, + "loss": 2.6775, + "theoretical_loss": 3.4687615575203425, + "tokens_seen": 1753230336 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023679037111334002, + "loss": 2.5315, + "theoretical_loss": 3.4687504647918006, + "tokens_seen": 1753295872 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023678034102306923, + "loss": 2.4803, + "theoretical_loss": 3.4687393725939755, + "tokens_seen": 1753361408 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023677031093279841, + "loss": 2.7575, + "theoretical_loss": 3.468728280926822, + "tokens_seen": 1753426944 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002367602808425276, + "loss": 2.5684, + "theoretical_loss": 3.4687171897902944, + "tokens_seen": 1753492480 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023675025075225678, + "loss": 2.506, + "theoretical_loss": 3.468706099184348, + "tokens_seen": 1753558016 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023674022066198598, + "loss": 2.5699, + "theoretical_loss": 3.4686950091089375, + "tokens_seen": 1753623552 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023673019057171517, + "loss": 2.5449, + "theoretical_loss": 3.4686839195640173, + "tokens_seen": 1753689088 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023672016048144435, + "loss": 2.7494, + "theoretical_loss": 3.4686728305495422, + "tokens_seen": 1753754624 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023671013039117353, + "loss": 2.9374, + "theoretical_loss": 3.4686617420654673, + "tokens_seen": 1753820160 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002367001003009027, + "loss": 2.7874, + "theoretical_loss": 3.4686506541117477, + "tokens_seen": 1753885696 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023669007021063192, + "loss": 2.526, + "theoretical_loss": 3.4686395666883376, + "tokens_seen": 1753951232 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002366800401203611, + "loss": 2.8194, + "theoretical_loss": 3.4686284797951923, + "tokens_seen": 1754016768 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023667001003009028, + "loss": 2.5216, + "theoretical_loss": 3.4686173934322664, + "tokens_seen": 1754082304 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023665997993981946, + "loss": 2.8125, + "theoretical_loss": 3.4686063075995146, + "tokens_seen": 1754147840 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023664994984954867, + "loss": 2.5519, + "theoretical_loss": 3.4685952222968917, + "tokens_seen": 1754213376 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023663991975927785, + "loss": 2.6458, + "theoretical_loss": 3.468584137524353, + "tokens_seen": 1754278912 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023662988966900703, + "loss": 2.6073, + "theoretical_loss": 3.4685730532818533, + "tokens_seen": 1754344448 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002366198595787362, + "loss": 2.6115, + "theoretical_loss": 3.468561969569347, + "tokens_seen": 1754409984 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002366098294884654, + "loss": 2.5143, + "theoretical_loss": 3.4685508863867893, + "tokens_seen": 1754475520 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002365997993981946, + "loss": 2.6175, + "theoretical_loss": 3.468539803734135, + "tokens_seen": 1754541056 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023658976930792378, + "loss": 2.6636, + "theoretical_loss": 3.4685287216113387, + "tokens_seen": 1754606592 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023657973921765296, + "loss": 2.6426, + "theoretical_loss": 3.468517640018356, + "tokens_seen": 1754672128 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1916618, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8745269775390625, + "objective/train/theoretical_loss": 3.4685093291712814, + "objective/train/tokens_used": 1775181280, + "theoretical_loss": 3.4685093291712814, + "tokens_seen": 1754721280 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023656970912738214, + "loss": 2.744, + "theoretical_loss": 3.468506558955141, + "tokens_seen": 1754737664 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023655967903711135, + "loss": 2.4807, + "theoretical_loss": 3.468495478421649, + "tokens_seen": 1754803200 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023654964894684053, + "loss": 2.5068, + "theoretical_loss": 3.468484398417835, + "tokens_seen": 1754868736 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023653961885656972, + "loss": 2.4075, + "theoretical_loss": 3.468473318943653, + "tokens_seen": 1754934272 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002365295887662989, + "loss": 2.4409, + "theoretical_loss": 3.4684622399990594, + "tokens_seen": 1754999808 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023651955867602808, + "loss": 2.249, + "theoretical_loss": 3.4684511615840083, + "tokens_seen": 1755065344 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023650952858575729, + "loss": 2.4607, + "theoretical_loss": 3.468440083698454, + "tokens_seen": 1755130880 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023649949849548647, + "loss": 2.5932, + "theoretical_loss": 3.468429006342353, + "tokens_seen": 1755196416 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023648946840521565, + "loss": 2.4739, + "theoretical_loss": 3.4684179295156583, + "tokens_seen": 1755261952 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023647943831494483, + "loss": 2.7564, + "theoretical_loss": 3.468406853218326, + "tokens_seen": 1755327488 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023646940822467404, + "loss": 2.5157, + "theoretical_loss": 3.4683957774503114, + "tokens_seen": 1755393024 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023645937813440322, + "loss": 2.3968, + "theoretical_loss": 3.4683847022115684, + "tokens_seen": 1755458560 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002364493480441324, + "loss": 2.5065, + "theoretical_loss": 3.4683736275020527, + "tokens_seen": 1755524096 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023643931795386158, + "loss": 2.5286, + "theoretical_loss": 3.468362553321719, + "tokens_seen": 1755589632 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023642928786359076, + "loss": 2.5083, + "theoretical_loss": 3.468351479670522, + "tokens_seen": 1755655168 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023641925777331997, + "loss": 2.6168, + "theoretical_loss": 3.4683404065484176, + "tokens_seen": 1755720704 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023640922768304915, + "loss": 2.7117, + "theoretical_loss": 3.468329333955359, + "tokens_seen": 1755786240 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023639919759277833, + "loss": 2.5585, + "theoretical_loss": 3.4683182618913033, + "tokens_seen": 1755851776 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002363891675025075, + "loss": 2.525, + "theoretical_loss": 3.468307190356204, + "tokens_seen": 1755917312 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023637913741223672, + "loss": 2.3099, + "theoretical_loss": 3.468296119350017, + "tokens_seen": 1755982848 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002363691073219659, + "loss": 2.4407, + "theoretical_loss": 3.468285048872697, + "tokens_seen": 1756048384 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023635907723169508, + "loss": 2.442, + "theoretical_loss": 3.4682739789241976, + "tokens_seen": 1756113920 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023634904714142426, + "loss": 2.4612, + "theoretical_loss": 3.4682629095044764, + "tokens_seen": 1756179456 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023633901705115347, + "loss": 2.5969, + "theoretical_loss": 3.468251840613487, + "tokens_seen": 1756244992 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023632898696088265, + "loss": 2.6169, + "theoretical_loss": 3.4682407722511837, + "tokens_seen": 1756310528 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1917852, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.753662109375, + "objective/train/theoretical_loss": 3.46823247132638, + "objective/train/tokens_used": 1776819680, + "theoretical_loss": 3.46823247132638, + "tokens_seen": 1756359680 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023631895687061184, + "loss": 2.5124, + "theoretical_loss": 3.4682297044175225, + "tokens_seen": 1756376064 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023630892678034102, + "loss": 2.4222, + "theoretical_loss": 3.468218637112458, + "tokens_seen": 1756441600 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002362988966900702, + "loss": 2.4413, + "theoretical_loss": 3.468207570335946, + "tokens_seen": 1756507136 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002362888665997994, + "loss": 2.333, + "theoretical_loss": 3.468196504087941, + "tokens_seen": 1756572672 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002362788365095286, + "loss": 2.6586, + "theoretical_loss": 3.4681854383683977, + "tokens_seen": 1756638208 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023626880641925777, + "loss": 2.7103, + "theoretical_loss": 3.4681743731772716, + "tokens_seen": 1756703744 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023625877632898695, + "loss": 2.5046, + "theoretical_loss": 3.468163308514518, + "tokens_seen": 1756769280 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023624874623871616, + "loss": 2.55, + "theoretical_loss": 3.4681522443800916, + "tokens_seen": 1756834816 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023623871614844534, + "loss": 2.7821, + "theoretical_loss": 3.468141180773947, + "tokens_seen": 1756900352 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023622868605817452, + "loss": 2.1776, + "theoretical_loss": 3.46813011769604, + "tokens_seen": 1756965888 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002362186559679037, + "loss": 2.3755, + "theoretical_loss": 3.468119055146326, + "tokens_seen": 1757031424 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023620862587763288, + "loss": 2.6778, + "theoretical_loss": 3.468107993124759, + "tokens_seen": 1757096960 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002361985957873621, + "loss": 2.444, + "theoretical_loss": 3.4680969316312944, + "tokens_seen": 1757162496 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023618856569709127, + "loss": 2.647, + "theoretical_loss": 3.4680858706658877, + "tokens_seen": 1757228032 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023617853560682045, + "loss": 2.4625, + "theoretical_loss": 3.4680748102284937, + "tokens_seen": 1757293568 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023616850551654963, + "loss": 2.6135, + "theoretical_loss": 3.4680637503190677, + "tokens_seen": 1757359104 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023615847542627884, + "loss": 2.2921, + "theoretical_loss": 3.4680526909375646, + "tokens_seen": 1757424640 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023614844533600802, + "loss": 2.4032, + "theoretical_loss": 3.46804163208394, + "tokens_seen": 1757490176 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002361384152457372, + "loss": 2.5568, + "theoretical_loss": 3.4680305737581483, + "tokens_seen": 1757555712 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023612838515546639, + "loss": 2.616, + "theoretical_loss": 3.468019515960145, + "tokens_seen": 1757621248 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023611835506519557, + "loss": 2.2857, + "theoretical_loss": 3.4680084586898854, + "tokens_seen": 1757686784 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002361083249749248, + "loss": 2.589, + "theoretical_loss": 3.467997401947324, + "tokens_seen": 1757752320 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023609829488465398, + "loss": 2.3022, + "theoretical_loss": 3.4679863457324167, + "tokens_seen": 1757817856 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023608826479438316, + "loss": 2.5159, + "theoretical_loss": 3.467975290045118, + "tokens_seen": 1757883392 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023607823470411234, + "loss": 2.5914, + "theoretical_loss": 3.467964234885384, + "tokens_seen": 1757948928 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1918641, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9182496070861816, + "objective/train/theoretical_loss": 3.46795594386177, + "objective/train/tokens_used": 1778458080, + "theoretical_loss": 3.46795594386177, + "tokens_seen": 1757998080 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023606820461384155, + "loss": 2.496, + "theoretical_loss": 3.4679531802531685, + "tokens_seen": 1758014464 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023605817452357073, + "loss": 2.4925, + "theoretical_loss": 3.467942126148428, + "tokens_seen": 1758080000 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023604814443329992, + "loss": 2.6393, + "theoretical_loss": 3.467931072571117, + "tokens_seen": 1758145536 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002360381143430291, + "loss": 2.4737, + "theoretical_loss": 3.4679200195211908, + "tokens_seen": 1758211072 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023602808425275828, + "loss": 2.5902, + "theoretical_loss": 3.4679089669986043, + "tokens_seen": 1758276608 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023601805416248749, + "loss": 2.4938, + "theoretical_loss": 3.467897915003313, + "tokens_seen": 1758342144 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023600802407221667, + "loss": 2.5029, + "theoretical_loss": 3.4678868635352718, + "tokens_seen": 1758407680 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023599799398194585, + "loss": 2.4677, + "theoretical_loss": 3.4678758125944364, + "tokens_seen": 1758473216 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023598796389167503, + "loss": 2.4647, + "theoretical_loss": 3.4678647621807617, + "tokens_seen": 1758538752 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023597793380140424, + "loss": 2.5808, + "theoretical_loss": 3.467853712294203, + "tokens_seen": 1758604288 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023596790371113342, + "loss": 2.7248, + "theoretical_loss": 3.467842662934715, + "tokens_seen": 1758669824 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002359578736208626, + "loss": 2.5173, + "theoretical_loss": 3.467831614102254, + "tokens_seen": 1758735360 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023594784353059178, + "loss": 2.4618, + "theoretical_loss": 3.4678205657967744, + "tokens_seen": 1758800896 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023593781344032096, + "loss": 2.6533, + "theoretical_loss": 3.467809518018231, + "tokens_seen": 1758866432 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023592778335005017, + "loss": 2.5789, + "theoretical_loss": 3.46779847076658, + "tokens_seen": 1758931968 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023591775325977935, + "loss": 2.5674, + "theoretical_loss": 3.4677874240417763, + "tokens_seen": 1758997504 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023590772316950853, + "loss": 2.419, + "theoretical_loss": 3.4677763778437756, + "tokens_seen": 1759063040 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002358976930792377, + "loss": 2.4435, + "theoretical_loss": 3.467765332172532, + "tokens_seen": 1759128576 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023588766298896692, + "loss": 2.4901, + "theoretical_loss": 3.4677542870280016, + "tokens_seen": 1759194112 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002358776328986961, + "loss": 2.629, + "theoretical_loss": 3.4677432424101404, + "tokens_seen": 1759259648 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023586760280842528, + "loss": 2.5999, + "theoretical_loss": 3.4677321983189016, + "tokens_seen": 1759325184 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023585757271815447, + "loss": 2.522, + "theoretical_loss": 3.4677211547542424, + "tokens_seen": 1759390720 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023584754262788367, + "loss": 2.6044, + "theoretical_loss": 3.4677101117161166, + "tokens_seen": 1759456256 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023583751253761285, + "loss": 2.2136, + "theoretical_loss": 3.467699069204481, + "tokens_seen": 1759521792 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023582748244734204, + "loss": 2.2825, + "theoretical_loss": 3.46768802721929, + "tokens_seen": 1759587328 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.684250831604004, + "objective/train/theoretical_loss": 3.4676797460758486, + "objective/train/tokens_used": 1780096480, + "theoretical_loss": 3.4676797460758486, + "tokens_seen": 1759636480 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023581745235707122, + "loss": 2.7752, + "theoretical_loss": 3.467676985760498, + "tokens_seen": 1759652864 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002358074222668004, + "loss": 2.6233, + "theoretical_loss": 3.4676659448280622, + "tokens_seen": 1759718400 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002357973921765296, + "loss": 2.4457, + "theoretical_loss": 3.467654904421937, + "tokens_seen": 1759783936 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002357873620862588, + "loss": 2.684, + "theoretical_loss": 3.4676438645420777, + "tokens_seen": 1759849472 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023577733199598797, + "loss": 2.6564, + "theoretical_loss": 3.4676328251884394, + "tokens_seen": 1759915008 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023576730190571715, + "loss": 2.599, + "theoretical_loss": 3.4676217863609775, + "tokens_seen": 1759980544 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023575727181544636, + "loss": 2.4638, + "theoretical_loss": 3.467610748059648, + "tokens_seen": 1760046080 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023574724172517554, + "loss": 2.4456, + "theoretical_loss": 3.4675997102844054, + "tokens_seen": 1760111616 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023573721163490472, + "loss": 2.3307, + "theoretical_loss": 3.4675886730352055, + "tokens_seen": 1760177152 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002357271815446339, + "loss": 2.3561, + "theoretical_loss": 3.4675776363120034, + "tokens_seen": 1760242688 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023571715145436308, + "loss": 2.507, + "theoretical_loss": 3.467566600114755, + "tokens_seen": 1760308224 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002357071213640923, + "loss": 2.6555, + "theoretical_loss": 3.4675555644434146, + "tokens_seen": 1760373760 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023569709127382147, + "loss": 2.486, + "theoretical_loss": 3.467544529297939, + "tokens_seen": 1760439296 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023568706118355065, + "loss": 2.5958, + "theoretical_loss": 3.467533494678282, + "tokens_seen": 1760504832 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023567703109327983, + "loss": 2.5002, + "theoretical_loss": 3.4675224605843997, + "tokens_seen": 1760570368 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023566700100300904, + "loss": 2.6807, + "theoretical_loss": 3.467511427016248, + "tokens_seen": 1760635904 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023565697091273822, + "loss": 2.3531, + "theoretical_loss": 3.4675003939737814, + "tokens_seen": 1760701440 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002356469408224674, + "loss": 2.3301, + "theoretical_loss": 3.4674893614569555, + "tokens_seen": 1760766976 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023563691073219659, + "loss": 2.5936, + "theoretical_loss": 3.4674783294657265, + "tokens_seen": 1760832512 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023562688064192577, + "loss": 2.3267, + "theoretical_loss": 3.4674672980000487, + "tokens_seen": 1760898048 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023561685055165497, + "loss": 2.5555, + "theoretical_loss": 3.467456267059878, + "tokens_seen": 1760963584 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023560682046138416, + "loss": 2.5269, + "theoretical_loss": 3.46744523664517, + "tokens_seen": 1761029120 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023559679037111334, + "loss": 2.3557, + "theoretical_loss": 3.46743420675588, + "tokens_seen": 1761094656 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023558676028084252, + "loss": 2.9255, + "theoretical_loss": 3.4674231773919626, + "tokens_seen": 1761160192 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023557673019057173, + "loss": 2.5995, + "theoretical_loss": 3.4674121485533744, + "tokens_seen": 1761225728 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.361060380935669, + "objective/train/theoretical_loss": 3.4674038772691533, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.4674038772691533, + "tokens_seen": 1761274880 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002355667001003009, + "loss": 2.2956, + "theoretical_loss": 3.4674011202400705, + "tokens_seen": 1761291264 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002355566700100301, + "loss": 2.58, + "theoretical_loss": 3.467390092452006, + "tokens_seen": 1761356800 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023554663991975927, + "loss": 2.6235, + "theoretical_loss": 3.467379065189137, + "tokens_seen": 1761422336 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023553660982948845, + "loss": 2.3079, + "theoretical_loss": 3.4673680384514176, + "tokens_seen": 1761487872 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023552657973921766, + "loss": 2.4721, + "theoretical_loss": 3.467357012238805, + "tokens_seen": 1761553408 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023551654964894684, + "loss": 2.6342, + "theoretical_loss": 3.4673459865512535, + "tokens_seen": 1761618944 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023550651955867602, + "loss": 2.5316, + "theoretical_loss": 3.467334961388719, + "tokens_seen": 1761684480 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002354964894684052, + "loss": 2.5439, + "theoretical_loss": 3.4673239367511566, + "tokens_seen": 1761750016 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002354864593781344, + "loss": 2.5401, + "theoretical_loss": 3.4673129126385223, + "tokens_seen": 1761815552 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002354764292878636, + "loss": 2.5952, + "theoretical_loss": 3.467301889050771, + "tokens_seen": 1761881088 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023546639919759277, + "loss": 2.4916, + "theoretical_loss": 3.467290865987859, + "tokens_seen": 1761946624 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023545636910732195, + "loss": 2.5921, + "theoretical_loss": 3.4672798434497407, + "tokens_seen": 1762012160 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023544633901705113, + "loss": 2.4428, + "theoretical_loss": 3.467268821436373, + "tokens_seen": 1762077696 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023543630892678034, + "loss": 2.5588, + "theoretical_loss": 3.46725779994771, + "tokens_seen": 1762143232 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023542627883650952, + "loss": 2.5045, + "theoretical_loss": 3.4672467789837076, + "tokens_seen": 1762208768 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002354162487462387, + "loss": 2.474, + "theoretical_loss": 3.467235758544322, + "tokens_seen": 1762274304 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002354062186559679, + "loss": 2.6545, + "theoretical_loss": 3.4672247386295076, + "tokens_seen": 1762339840 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002353961885656971, + "loss": 2.5465, + "theoretical_loss": 3.467213719239221, + "tokens_seen": 1762405376 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023538615847542628, + "loss": 2.4112, + "theoretical_loss": 3.4672027003734174, + "tokens_seen": 1762470912 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023537612838515546, + "loss": 2.6024, + "theoretical_loss": 3.467191682032052, + "tokens_seen": 1762536448 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023536609829488464, + "loss": 2.6876, + "theoretical_loss": 3.4671806642150806, + "tokens_seen": 1762601984 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023535606820461387, + "loss": 2.3949, + "theoretical_loss": 3.467169646922459, + "tokens_seen": 1762667520 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023534603811434305, + "loss": 2.5294, + "theoretical_loss": 3.4671586301541417, + "tokens_seen": 1762733056 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023533600802407224, + "loss": 2.3524, + "theoretical_loss": 3.467147613910086, + "tokens_seen": 1762798592 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023532597793380142, + "loss": 2.7103, + "theoretical_loss": 3.4671365981902458, + "tokens_seen": 1762864128 + }, + { + "epoch": 5.08, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.408428430557251, + "objective/train/theoretical_loss": 3.4671283367443557, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.4671283367443557, + "tokens_seen": 1762913280 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002353159478435306, + "loss": 2.2904, + "theoretical_loss": 3.4671255829945777, + "tokens_seen": 1762929664 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002353059177532598, + "loss": 2.409, + "theoretical_loss": 3.4671145683230367, + "tokens_seen": 1762995200 + }, + { + "epoch": 5.08, + "learning_rate": 0.000235295887662989, + "loss": 2.6589, + "theoretical_loss": 3.467103554175579, + "tokens_seen": 1763060736 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023528585757271817, + "loss": 2.4615, + "theoretical_loss": 3.467092540552159, + "tokens_seen": 1763126272 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023527582748244735, + "loss": 2.5964, + "theoretical_loss": 3.4670815274527333, + "tokens_seen": 1763191808 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023526579739217656, + "loss": 2.8246, + "theoretical_loss": 3.467070514877258, + "tokens_seen": 1763257344 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023525576730190574, + "loss": 2.6005, + "theoretical_loss": 3.4670595028256876, + "tokens_seen": 1763322880 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023524573721163492, + "loss": 2.4462, + "theoretical_loss": 3.4670484912979775, + "tokens_seen": 1763388416 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002352357071213641, + "loss": 2.7299, + "theoretical_loss": 3.4670374802940844, + "tokens_seen": 1763453952 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023522567703109328, + "loss": 2.4613, + "theoretical_loss": 3.4670264698139635, + "tokens_seen": 1763519488 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002352156469408225, + "loss": 2.4862, + "theoretical_loss": 3.4670154598575698, + "tokens_seen": 1763585024 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023520561685055167, + "loss": 2.6885, + "theoretical_loss": 3.4670044504248603, + "tokens_seen": 1763650560 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023519558676028085, + "loss": 2.5629, + "theoretical_loss": 3.466993441515789, + "tokens_seen": 1763716096 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023518555667001003, + "loss": 2.3579, + "theoretical_loss": 3.4669824331303127, + "tokens_seen": 1763781632 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023517552657973924, + "loss": 2.5994, + "theoretical_loss": 3.4669714252683868, + "tokens_seen": 1763847168 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023516549648946842, + "loss": 2.3519, + "theoretical_loss": 3.4669604179299665, + "tokens_seen": 1763912704 + }, + { + "epoch": 5.08, + "learning_rate": 0.0002351554663991976, + "loss": 2.4897, + "theoretical_loss": 3.4669494111150074, + "tokens_seen": 1763978240 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023514543630892679, + "loss": 2.6009, + "theoretical_loss": 3.466938404823466, + "tokens_seen": 1764043776 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023513540621865597, + "loss": 2.5045, + "theoretical_loss": 3.466927399055298, + "tokens_seen": 1764109312 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023512537612838517, + "loss": 2.5427, + "theoretical_loss": 3.466916393810458, + "tokens_seen": 1764174848 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023511534603811436, + "loss": 2.5333, + "theoretical_loss": 3.466905389088902, + "tokens_seen": 1764240384 + }, + { + "epoch": 5.08, + "learning_rate": 0.00023510531594784354, + "loss": 2.3119, + "theoretical_loss": 3.4668943848905864, + "tokens_seen": 1764305920 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023509528585757272, + "loss": 2.6842, + "theoretical_loss": 3.4668833812154665, + "tokens_seen": 1764371456 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023508525576730193, + "loss": 2.4093, + "theoretical_loss": 3.4668723780634974, + "tokens_seen": 1764436992 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002350752256770311, + "loss": 2.4364, + "theoretical_loss": 3.466861375434635, + "tokens_seen": 1764502528 + }, + { + "epoch": 5.09, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6699795722961426, + "objective/train/theoretical_loss": 3.4668531238062514, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.4668531238062514, + "tokens_seen": 1764551680 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002350651955867603, + "loss": 2.5005, + "theoretical_loss": 3.4668503733288363, + "tokens_seen": 1764568064 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023505516549648947, + "loss": 2.5249, + "theoretical_loss": 3.4668393717460555, + "tokens_seen": 1764633600 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023504513540621865, + "loss": 2.4366, + "theoretical_loss": 3.4668283706862493, + "tokens_seen": 1764699136 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023503510531594786, + "loss": 2.3593, + "theoretical_loss": 3.4668173701493723, + "tokens_seen": 1764764672 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023502507522567704, + "loss": 2.6649, + "theoretical_loss": 3.466806370135381, + "tokens_seen": 1764830208 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023501504513540622, + "loss": 2.3591, + "theoretical_loss": 3.466795370644231, + "tokens_seen": 1764895744 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002350050150451354, + "loss": 2.3061, + "theoretical_loss": 3.466784371675878, + "tokens_seen": 1764961280 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002349949849548646, + "loss": 2.6703, + "theoretical_loss": 3.4667733732302786, + "tokens_seen": 1765026816 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002349849548645938, + "loss": 2.5614, + "theoretical_loss": 3.4667623753073866, + "tokens_seen": 1765092352 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023497492477432297, + "loss": 2.7806, + "theoretical_loss": 3.4667513779071593, + "tokens_seen": 1765157888 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023496489468405215, + "loss": 2.4194, + "theoretical_loss": 3.4667403810295525, + "tokens_seen": 1765223424 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023495486459378134, + "loss": 2.6921, + "theoretical_loss": 3.4667293846745206, + "tokens_seen": 1765288960 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023494483450351054, + "loss": 2.514, + "theoretical_loss": 3.466718388842021, + "tokens_seen": 1765354496 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023493480441323972, + "loss": 2.5694, + "theoretical_loss": 3.4667073935320083, + "tokens_seen": 1765420032 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002349247743229689, + "loss": 2.5601, + "theoretical_loss": 3.466696398744439, + "tokens_seen": 1765485568 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002349147442326981, + "loss": 2.4646, + "theoretical_loss": 3.4666854044792688, + "tokens_seen": 1765551104 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002349047141424273, + "loss": 2.4744, + "theoretical_loss": 3.4666744107364527, + "tokens_seen": 1765616640 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023489468405215648, + "loss": 2.4071, + "theoretical_loss": 3.4666634175159476, + "tokens_seen": 1765682176 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023488465396188566, + "loss": 2.4783, + "theoretical_loss": 3.4666524248177084, + "tokens_seen": 1765747712 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023487462387161484, + "loss": 2.5431, + "theoretical_loss": 3.4666414326416914, + "tokens_seen": 1765813248 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023486459378134402, + "loss": 2.325, + "theoretical_loss": 3.4666304409878523, + "tokens_seen": 1765878784 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023485456369107323, + "loss": 2.8022, + "theoretical_loss": 3.4666194498561467, + "tokens_seen": 1765944320 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002348445336008024, + "loss": 2.6817, + "theoretical_loss": 3.4666084592465314, + "tokens_seen": 1766009856 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002348345035105316, + "loss": 2.6012, + "theoretical_loss": 3.4665974691589607, + "tokens_seen": 1766075392 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023482447342026077, + "loss": 2.3683, + "theoretical_loss": 3.466586479593391, + "tokens_seen": 1766140928 + }, + { + "epoch": 5.09, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1888675689697266, + "objective/train/theoretical_loss": 3.466578237761751, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.466578237761751, + "tokens_seen": 1766190080 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023481444332998998, + "loss": 2.5651, + "theoretical_loss": 3.466575490549779, + "tokens_seen": 1766206464 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023480441323971916, + "loss": 2.5376, + "theoretical_loss": 3.46656450202808, + "tokens_seen": 1766272000 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023479438314944834, + "loss": 2.306, + "theoretical_loss": 3.4665535140282495, + "tokens_seen": 1766337536 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023478435305917752, + "loss": 2.6038, + "theoretical_loss": 3.4665425265502434, + "tokens_seen": 1766403072 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023477432296890673, + "loss": 2.6747, + "theoretical_loss": 3.4665315395940177, + "tokens_seen": 1766468608 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002347642928786359, + "loss": 2.5324, + "theoretical_loss": 3.4665205531595285, + "tokens_seen": 1766534144 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002347542627883651, + "loss": 2.5262, + "theoretical_loss": 3.4665095672467316, + "tokens_seen": 1766599680 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023474423269809427, + "loss": 2.5028, + "theoretical_loss": 3.466498581855582, + "tokens_seen": 1766665216 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023473420260782346, + "loss": 2.3845, + "theoretical_loss": 3.4664875969860374, + "tokens_seen": 1766730752 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023472417251755266, + "loss": 2.4874, + "theoretical_loss": 3.466476612638052, + "tokens_seen": 1766796288 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023471414242728184, + "loss": 2.6271, + "theoretical_loss": 3.4664656288115823, + "tokens_seen": 1766861824 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023470411233701103, + "loss": 2.5846, + "theoretical_loss": 3.4664546455065843, + "tokens_seen": 1766927360 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002346940822467402, + "loss": 2.5837, + "theoretical_loss": 3.466443662723014, + "tokens_seen": 1766992896 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023468405215646942, + "loss": 2.6022, + "theoretical_loss": 3.4664326804608265, + "tokens_seen": 1767058432 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002346740220661986, + "loss": 2.4213, + "theoretical_loss": 3.466421698719979, + "tokens_seen": 1767123968 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023466399197592778, + "loss": 2.2348, + "theoretical_loss": 3.466410717500427, + "tokens_seen": 1767189504 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023465396188565696, + "loss": 2.5319, + "theoretical_loss": 3.466399736802125, + "tokens_seen": 1767255040 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023464393179538614, + "loss": 2.5631, + "theoretical_loss": 3.4663887566250313, + "tokens_seen": 1767320576 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023463390170511535, + "loss": 2.6082, + "theoretical_loss": 3.4663777769691, + "tokens_seen": 1767386112 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023463390170511535, + "loss": 2.3313, + "theoretical_loss": 3.466366797834288, + "tokens_seen": 1767451648 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023462387161484453, + "loss": 2.6091, + "theoretical_loss": 3.4663558192205506, + "tokens_seen": 1767517184 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002346138415245737, + "loss": 2.5843, + "theoretical_loss": 3.466344841127844, + "tokens_seen": 1767582720 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023460381143430292, + "loss": 2.5043, + "theoretical_loss": 3.4663338635561245, + "tokens_seen": 1767648256 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023459378134403213, + "loss": 2.4155, + "theoretical_loss": 3.4663228865053477, + "tokens_seen": 1767713792 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002345837512537613, + "loss": 2.4292, + "theoretical_loss": 3.46631190997547, + "tokens_seen": 1767779328 + }, + { + "epoch": 5.09, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.40676212310791, + "objective/train/theoretical_loss": 3.466303677919875, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.466303677919875, + "tokens_seen": 1767828480 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002345737211634905, + "loss": 2.5097, + "theoretical_loss": 3.4663009339664463, + "tokens_seen": 1767844864 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023456369107321967, + "loss": 2.4759, + "theoretical_loss": 3.466289958478234, + "tokens_seen": 1767910400 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023455366098294885, + "loss": 2.8101, + "theoretical_loss": 3.466278983510788, + "tokens_seen": 1767975936 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023454363089267806, + "loss": 2.5102, + "theoretical_loss": 3.4662680090640645, + "tokens_seen": 1768041472 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023453360080240724, + "loss": 2.4973, + "theoretical_loss": 3.46625703513802, + "tokens_seen": 1768107008 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023452357071213642, + "loss": 2.7445, + "theoretical_loss": 3.46624606173261, + "tokens_seen": 1768172544 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002345135406218656, + "loss": 2.5312, + "theoretical_loss": 3.4662350888477906, + "tokens_seen": 1768238080 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002345035105315948, + "loss": 2.7126, + "theoretical_loss": 3.4662241164835184, + "tokens_seen": 1768303616 + }, + { + "epoch": 5.09, + "learning_rate": 0.000234493480441324, + "loss": 2.4269, + "theoretical_loss": 3.4662131446397484, + "tokens_seen": 1768369152 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023448345035105317, + "loss": 2.227, + "theoretical_loss": 3.4662021733164377, + "tokens_seen": 1768434688 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023447342026078235, + "loss": 2.3333, + "theoretical_loss": 3.466191202513541, + "tokens_seen": 1768500224 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023446339017051154, + "loss": 2.4462, + "theoretical_loss": 3.4661802322310153, + "tokens_seen": 1768565760 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023445336008024074, + "loss": 2.6257, + "theoretical_loss": 3.4661692624688163, + "tokens_seen": 1768631296 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023444332998996992, + "loss": 2.5116, + "theoretical_loss": 3.4661582932269006, + "tokens_seen": 1768696832 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002344332998996991, + "loss": 2.5406, + "theoretical_loss": 3.466147324505223, + "tokens_seen": 1768762368 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002344232698094283, + "loss": 2.4436, + "theoretical_loss": 3.4661363563037413, + "tokens_seen": 1768827904 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002344132397191575, + "loss": 2.5034, + "theoretical_loss": 3.46612538862241, + "tokens_seen": 1768893440 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023440320962888668, + "loss": 2.7441, + "theoretical_loss": 3.466114421461186, + "tokens_seen": 1768958976 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023439317953861586, + "loss": 2.3034, + "theoretical_loss": 3.4661034548200247, + "tokens_seen": 1769024512 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023438314944834504, + "loss": 2.6486, + "theoretical_loss": 3.4660924886988833, + "tokens_seen": 1769090048 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023437311935807422, + "loss": 2.496, + "theoretical_loss": 3.466081523097717, + "tokens_seen": 1769155584 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023436308926780343, + "loss": 2.4793, + "theoretical_loss": 3.4660705580164817, + "tokens_seen": 1769221120 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002343530591775326, + "loss": 2.616, + "theoretical_loss": 3.4660595934551344, + "tokens_seen": 1769286656 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002343430290872618, + "loss": 2.3019, + "theoretical_loss": 3.4660486294136303, + "tokens_seen": 1769352192 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023433299899699097, + "loss": 2.5352, + "theoretical_loss": 3.4660376658919256, + "tokens_seen": 1769417728 + }, + { + "epoch": 5.09, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.412342071533203, + "objective/train/theoretical_loss": 3.4660294435917396, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.4660294435917396, + "tokens_seen": 1769466880 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023432296890672018, + "loss": 2.3163, + "theoretical_loss": 3.466026702889977, + "tokens_seen": 1769483264 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023431293881644936, + "loss": 2.6751, + "theoretical_loss": 3.46601574040774, + "tokens_seen": 1769548800 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023430290872617854, + "loss": 2.6627, + "theoretical_loss": 3.4660047784451713, + "tokens_seen": 1769614336 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023429287863590772, + "loss": 2.5085, + "theoretical_loss": 3.465993817002227, + "tokens_seen": 1769679872 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002342828485456369, + "loss": 2.5014, + "theoretical_loss": 3.4659828560788624, + "tokens_seen": 1769745408 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002342728184553661, + "loss": 2.4813, + "theoretical_loss": 3.4659718956750343, + "tokens_seen": 1769810944 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002342627883650953, + "loss": 2.4091, + "theoretical_loss": 3.4659609357906986, + "tokens_seen": 1769876480 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023425275827482447, + "loss": 2.543, + "theoretical_loss": 3.4659499764258115, + "tokens_seen": 1769942016 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023424272818455366, + "loss": 2.7011, + "theoretical_loss": 3.4659390175803297, + "tokens_seen": 1770007552 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023423269809428286, + "loss": 2.5546, + "theoretical_loss": 3.4659280592542085, + "tokens_seen": 1770073088 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023422266800401204, + "loss": 2.363, + "theoretical_loss": 3.4659171014474044, + "tokens_seen": 1770138624 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023421263791374123, + "loss": 2.3633, + "theoretical_loss": 3.4659061441598737, + "tokens_seen": 1770204160 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002342026078234704, + "loss": 2.6171, + "theoretical_loss": 3.465895187391572, + "tokens_seen": 1770269696 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023419257773319962, + "loss": 2.6448, + "theoretical_loss": 3.4658842311424567, + "tokens_seen": 1770335232 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002341825476429288, + "loss": 2.3643, + "theoretical_loss": 3.465873275412483, + "tokens_seen": 1770400768 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023417251755265798, + "loss": 2.6347, + "theoretical_loss": 3.465862320201607, + "tokens_seen": 1770466304 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023416248746238716, + "loss": 2.492, + "theoretical_loss": 3.465851365509785, + "tokens_seen": 1770531840 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023415245737211634, + "loss": 2.4461, + "theoretical_loss": 3.4658404113369734, + "tokens_seen": 1770597376 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023414242728184555, + "loss": 2.5314, + "theoretical_loss": 3.465829457683129, + "tokens_seen": 1770662912 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023413239719157473, + "loss": 2.3787, + "theoretical_loss": 3.465818504548207, + "tokens_seen": 1770728448 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002341223671013039, + "loss": 2.7296, + "theoretical_loss": 3.465807551932164, + "tokens_seen": 1770793984 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002341123370110331, + "loss": 2.4833, + "theoretical_loss": 3.465796599834956, + "tokens_seen": 1770859520 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002341023069207623, + "loss": 2.4753, + "theoretical_loss": 3.4657856482565395, + "tokens_seen": 1770925056 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023409227683049148, + "loss": 2.5932, + "theoretical_loss": 3.4657746971968706, + "tokens_seen": 1770990592 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023408224674022066, + "loss": 2.6503, + "theoretical_loss": 3.465763746655906, + "tokens_seen": 1771056128 + }, + { + "epoch": 5.09, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.008837938308716, + "objective/train/theoretical_loss": 3.4657555340905555, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.4657555340905555, + "tokens_seen": 1771105280 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023407221664994984, + "loss": 2.2046, + "theoretical_loss": 3.465752796633601, + "tokens_seen": 1771121664 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023406218655967902, + "loss": 2.646, + "theoretical_loss": 3.465741847129913, + "tokens_seen": 1771187200 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023405215646940823, + "loss": 2.6061, + "theoretical_loss": 3.4657308981447974, + "tokens_seen": 1771252736 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002340421263791374, + "loss": 2.6694, + "theoretical_loss": 3.4657199496782103, + "tokens_seen": 1771318272 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002340320962888666, + "loss": 2.293, + "theoretical_loss": 3.4657090017301084, + "tokens_seen": 1771383808 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023402206619859578, + "loss": 2.4046, + "theoretical_loss": 3.4656980543004483, + "tokens_seen": 1771449344 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023401203610832498, + "loss": 2.1652, + "theoretical_loss": 3.4656871073891855, + "tokens_seen": 1771514880 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023400200601805416, + "loss": 2.5043, + "theoretical_loss": 3.465676160996277, + "tokens_seen": 1771580416 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023399197592778335, + "loss": 2.6895, + "theoretical_loss": 3.4656652151216782, + "tokens_seen": 1771645952 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023398194583751253, + "loss": 2.538, + "theoretical_loss": 3.4656542697653467, + "tokens_seen": 1771711488 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002339719157472417, + "loss": 2.6488, + "theoretical_loss": 3.465643324927237, + "tokens_seen": 1771777024 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023396188565697092, + "loss": 2.4464, + "theoretical_loss": 3.4656323806073073, + "tokens_seen": 1771842560 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002339518555667001, + "loss": 2.4428, + "theoretical_loss": 3.4656214368055123, + "tokens_seen": 1771908096 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023394182547642928, + "loss": 2.5355, + "theoretical_loss": 3.46561049352181, + "tokens_seen": 1771973632 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023393179538615846, + "loss": 2.4985, + "theoretical_loss": 3.4655995507561546, + "tokens_seen": 1772039168 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023392176529588767, + "loss": 2.4878, + "theoretical_loss": 3.4655886085085044, + "tokens_seen": 1772104704 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023391173520561685, + "loss": 2.6146, + "theoretical_loss": 3.465577666778814, + "tokens_seen": 1772170240 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023390170511534603, + "loss": 2.4653, + "theoretical_loss": 3.4655667255670415, + "tokens_seen": 1772235776 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002338916750250752, + "loss": 2.6172, + "theoretical_loss": 3.4655557848731418, + "tokens_seen": 1772301312 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002338816449348044, + "loss": 2.7327, + "theoretical_loss": 3.465544844697072, + "tokens_seen": 1772366848 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002338716148445336, + "loss": 2.5343, + "theoretical_loss": 3.465533905038788, + "tokens_seen": 1772432384 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023386158475426278, + "loss": 2.4095, + "theoretical_loss": 3.4655229658982467, + "tokens_seen": 1772497920 + }, + { + "epoch": 5.09, + "learning_rate": 0.000233851554663992, + "loss": 2.5593, + "theoretical_loss": 3.4655120272754036, + "tokens_seen": 1772563456 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023384152457372117, + "loss": 2.53, + "theoretical_loss": 3.4655010891702163, + "tokens_seen": 1772628992 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023383149448345038, + "loss": 2.4605, + "theoretical_loss": 3.4654901515826397, + "tokens_seen": 1772694528 + }, + { + "epoch": 5.09, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.718626022338867, + "objective/train/theoretical_loss": 3.465481948731614, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.465481948731614, + "tokens_seen": 1772743680 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023382146439317956, + "loss": 2.4501, + "theoretical_loss": 3.465479214512632, + "tokens_seen": 1772760064 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023381143430290874, + "loss": 2.4683, + "theoretical_loss": 3.4654682779601473, + "tokens_seen": 1772825600 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023380140421263792, + "loss": 2.3056, + "theoretical_loss": 3.4654573419251435, + "tokens_seen": 1772891136 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002337913741223671, + "loss": 2.481, + "theoretical_loss": 3.465446406407577, + "tokens_seen": 1772956672 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002337813440320963, + "loss": 2.5162, + "theoretical_loss": 3.4654354714074036, + "tokens_seen": 1773022208 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002337713139418255, + "loss": 2.1997, + "theoretical_loss": 3.46542453692458, + "tokens_seen": 1773087744 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023376128385155467, + "loss": 2.286, + "theoretical_loss": 3.4654136029590625, + "tokens_seen": 1773153280 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023375125376128386, + "loss": 2.685, + "theoretical_loss": 3.4654026695108078, + "tokens_seen": 1773218816 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023374122367101306, + "loss": 2.4305, + "theoretical_loss": 3.4653917365797717, + "tokens_seen": 1773284352 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023373119358074224, + "loss": 2.3639, + "theoretical_loss": 3.4653808041659113, + "tokens_seen": 1773349888 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023372116349047143, + "loss": 2.3568, + "theoretical_loss": 3.4653698722691826, + "tokens_seen": 1773415424 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002337111334002006, + "loss": 2.6072, + "theoretical_loss": 3.465358940889542, + "tokens_seen": 1773480960 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023370110330992982, + "loss": 2.65, + "theoretical_loss": 3.465348010026946, + "tokens_seen": 1773546496 + }, + { + "epoch": 5.09, + "learning_rate": 0.000233691073219659, + "loss": 2.6185, + "theoretical_loss": 3.4653370796813516, + "tokens_seen": 1773612032 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023368104312938818, + "loss": 2.5071, + "theoretical_loss": 3.465326149852715, + "tokens_seen": 1773677568 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023367101303911736, + "loss": 2.6493, + "theoretical_loss": 3.4653152205409916, + "tokens_seen": 1773743104 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023366098294884654, + "loss": 2.3701, + "theoretical_loss": 3.465304291746139, + "tokens_seen": 1773808640 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023365095285857575, + "loss": 2.45, + "theoretical_loss": 3.4652933634681133, + "tokens_seen": 1773874176 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023364092276830493, + "loss": 2.3504, + "theoretical_loss": 3.4652824357068712, + "tokens_seen": 1773939712 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002336308926780341, + "loss": 2.4882, + "theoretical_loss": 3.4652715084623686, + "tokens_seen": 1774005248 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002336208625877633, + "loss": 2.6701, + "theoretical_loss": 3.4652605817345625, + "tokens_seen": 1774070784 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002336108324974925, + "loss": 2.5634, + "theoretical_loss": 3.46524965552341, + "tokens_seen": 1774136320 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023360080240722168, + "loss": 2.6451, + "theoretical_loss": 3.4652387298288656, + "tokens_seen": 1774201856 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023359077231695086, + "loss": 2.4303, + "theoretical_loss": 3.4652278046508878, + "tokens_seen": 1774267392 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023358074222668004, + "loss": 2.4778, + "theoretical_loss": 3.4652168799894323, + "tokens_seen": 1774332928 + }, + { + "epoch": 5.09, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9747250080108643, + "objective/train/theoretical_loss": 3.465208686832282, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.465208686832282, + "tokens_seen": 1774382080 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023357071213640922, + "loss": 2.7781, + "theoretical_loss": 3.4652059558444552, + "tokens_seen": 1774398464 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023356068204613843, + "loss": 2.4818, + "theoretical_loss": 3.465195032215913, + "tokens_seen": 1774464000 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002335506519558676, + "loss": 2.637, + "theoretical_loss": 3.4651841091037636, + "tokens_seen": 1774529536 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002335406218655968, + "loss": 2.6496, + "theoretical_loss": 3.465173186507962, + "tokens_seen": 1774595072 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023353059177532598, + "loss": 2.5184, + "theoretical_loss": 3.4651622644284656, + "tokens_seen": 1774660608 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023352056168505518, + "loss": 2.6309, + "theoretical_loss": 3.4651513428652305, + "tokens_seen": 1774726144 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023351053159478437, + "loss": 2.6598, + "theoretical_loss": 3.465140421818213, + "tokens_seen": 1774791680 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023350050150451355, + "loss": 2.3495, + "theoretical_loss": 3.465129501287371, + "tokens_seen": 1774857216 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023349047141424273, + "loss": 2.5256, + "theoretical_loss": 3.465118581272659, + "tokens_seen": 1774922752 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002334804413239719, + "loss": 2.5715, + "theoretical_loss": 3.4651076617740344, + "tokens_seen": 1774988288 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023347041123370112, + "loss": 2.4894, + "theoretical_loss": 3.465096742791455, + "tokens_seen": 1775053824 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002334603811434303, + "loss": 2.4852, + "theoretical_loss": 3.465085824324875, + "tokens_seen": 1775119360 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023345035105315948, + "loss": 2.4874, + "theoretical_loss": 3.4650749063742534, + "tokens_seen": 1775184896 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023344032096288866, + "loss": 2.4173, + "theoretical_loss": 3.465063988939545, + "tokens_seen": 1775250432 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023343029087261787, + "loss": 2.6071, + "theoretical_loss": 3.4650530720207073, + "tokens_seen": 1775315968 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023342026078234705, + "loss": 2.2076, + "theoretical_loss": 3.465042155617697, + "tokens_seen": 1775381504 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023341023069207623, + "loss": 2.513, + "theoretical_loss": 3.4650312397304694, + "tokens_seen": 1775447040 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002334002006018054, + "loss": 2.5501, + "theoretical_loss": 3.4650203243589823, + "tokens_seen": 1775512576 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002333901705115346, + "loss": 2.4242, + "theoretical_loss": 3.465009409503192, + "tokens_seen": 1775578112 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002333801404212638, + "loss": 2.5968, + "theoretical_loss": 3.464998495163055, + "tokens_seen": 1775643648 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023337011033099298, + "loss": 2.4782, + "theoretical_loss": 3.4649875813385282, + "tokens_seen": 1775709184 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023336008024072216, + "loss": 2.6304, + "theoretical_loss": 3.4649766680295677, + "tokens_seen": 1775774720 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023335005015045134, + "loss": 2.7412, + "theoretical_loss": 3.464965755236131, + "tokens_seen": 1775840256 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023334002006018055, + "loss": 2.4042, + "theoretical_loss": 3.464954842958173, + "tokens_seen": 1775905792 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023332998996990973, + "loss": 2.616, + "theoretical_loss": 3.464943931195653, + "tokens_seen": 1775971328 + }, + { + "epoch": 5.09, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.874159574508667, + "objective/train/theoretical_loss": 3.464935747711991, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.464935747711991, + "tokens_seen": 1776020480 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023331995987963891, + "loss": 2.5332, + "theoretical_loss": 3.464933019948525, + "tokens_seen": 1776036864 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002333099297893681, + "loss": 2.4847, + "theoretical_loss": 3.464922109216747, + "tokens_seen": 1776102400 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023329989969909728, + "loss": 2.5869, + "theoretical_loss": 3.464911199000275, + "tokens_seen": 1776167936 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023328986960882649, + "loss": 2.5488, + "theoretical_loss": 3.4649002892990666, + "tokens_seen": 1776233472 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023327983951855567, + "loss": 2.4805, + "theoretical_loss": 3.4648893801130773, + "tokens_seen": 1776299008 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023326980942828485, + "loss": 2.6329, + "theoretical_loss": 3.4648784714422654, + "tokens_seen": 1776364544 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023325977933801403, + "loss": 2.5593, + "theoretical_loss": 3.4648675632865857, + "tokens_seen": 1776430080 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023324974924774324, + "loss": 2.4065, + "theoretical_loss": 3.464856655645996, + "tokens_seen": 1776495616 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023323971915747242, + "loss": 2.3472, + "theoretical_loss": 3.4648457485204522, + "tokens_seen": 1776561152 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002332296890672016, + "loss": 2.5367, + "theoretical_loss": 3.464834841909912, + "tokens_seen": 1776626688 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023321965897693078, + "loss": 2.6691, + "theoretical_loss": 3.464823935814331, + "tokens_seen": 1776692224 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023320962888665996, + "loss": 2.3813, + "theoretical_loss": 3.464813030233667, + "tokens_seen": 1776757760 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023319959879638917, + "loss": 2.6345, + "theoretical_loss": 3.464802125167876, + "tokens_seen": 1776823296 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023318956870611835, + "loss": 2.5392, + "theoretical_loss": 3.4647912206169145, + "tokens_seen": 1776888832 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023317953861584753, + "loss": 2.3573, + "theoretical_loss": 3.4647803165807396, + "tokens_seen": 1776954368 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002331695085255767, + "loss": 2.3336, + "theoretical_loss": 3.464769413059308, + "tokens_seen": 1777019904 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023315947843530592, + "loss": 2.3903, + "theoretical_loss": 3.464758510052577, + "tokens_seen": 1777085440 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002331494483450351, + "loss": 2.6098, + "theoretical_loss": 3.4647476075605015, + "tokens_seen": 1777150976 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023313941825476428, + "loss": 2.5452, + "theoretical_loss": 3.4647367055830403, + "tokens_seen": 1777216512 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023312938816449346, + "loss": 2.5567, + "theoretical_loss": 3.4647258041201487, + "tokens_seen": 1777282048 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023311935807422267, + "loss": 2.5928, + "theoretical_loss": 3.4647149031717843, + "tokens_seen": 1777347584 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023310932798395185, + "loss": 2.4107, + "theoretical_loss": 3.4647040027379035, + "tokens_seen": 1777413120 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023309929789368106, + "loss": 2.5606, + "theoretical_loss": 3.4646931028184635, + "tokens_seen": 1777478656 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023308926780341024, + "loss": 2.3068, + "theoretical_loss": 3.46468220341342, + "tokens_seen": 1777544192 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023307923771313942, + "loss": 2.4885, + "theoretical_loss": 3.4646713045227306, + "tokens_seen": 1777609728 + }, + { + "epoch": 5.09, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2620835304260254, + "objective/train/theoretical_loss": 3.4646631306922324, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.4646631306922324, + "tokens_seen": 1777658880 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023306920762286863, + "loss": 2.3808, + "theoretical_loss": 3.464660406146352, + "tokens_seen": 1777675264 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023305917753259781, + "loss": 2.2059, + "theoretical_loss": 3.464649508284241, + "tokens_seen": 1777740800 + }, + { + "epoch": 5.09, + "learning_rate": 0.000233049147442327, + "loss": 2.7141, + "theoretical_loss": 3.464638610936354, + "tokens_seen": 1777806336 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023303911735205618, + "loss": 2.7305, + "theoretical_loss": 3.464627714102648, + "tokens_seen": 1777871872 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023302908726178538, + "loss": 2.7274, + "theoretical_loss": 3.4646168177830794, + "tokens_seen": 1777937408 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023301905717151457, + "loss": 2.7583, + "theoretical_loss": 3.464605921977606, + "tokens_seen": 1778002944 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023300902708124375, + "loss": 2.4897, + "theoretical_loss": 3.4645950266861836, + "tokens_seen": 1778068480 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023299899699097293, + "loss": 2.5003, + "theoretical_loss": 3.4645841319087696, + "tokens_seen": 1778134016 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002329889669007021, + "loss": 2.5002, + "theoretical_loss": 3.46457323764532, + "tokens_seen": 1778199552 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023297893681043132, + "loss": 2.4599, + "theoretical_loss": 3.4645623438957927, + "tokens_seen": 1778265088 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002329689067201605, + "loss": 2.3209, + "theoretical_loss": 3.464551450660144, + "tokens_seen": 1778330624 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023295887662988968, + "loss": 2.548, + "theoretical_loss": 3.4645405579383306, + "tokens_seen": 1778396160 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023294884653961886, + "loss": 2.468, + "theoretical_loss": 3.4645296657303097, + "tokens_seen": 1778461696 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023293881644934807, + "loss": 2.6407, + "theoretical_loss": 3.4645187740360375, + "tokens_seen": 1778527232 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023292878635907725, + "loss": 2.4272, + "theoretical_loss": 3.4645078828554716, + "tokens_seen": 1778592768 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023291875626880643, + "loss": 2.4103, + "theoretical_loss": 3.464496992188568, + "tokens_seen": 1778658304 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002329087261785356, + "loss": 2.3247, + "theoretical_loss": 3.4644861020352846, + "tokens_seen": 1778723840 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002328986960882648, + "loss": 2.4989, + "theoretical_loss": 3.4644752123955778, + "tokens_seen": 1778789376 + }, + { + "epoch": 5.09, + "learning_rate": 0.000232888665997994, + "loss": 2.6325, + "theoretical_loss": 3.4644643232694037, + "tokens_seen": 1778854912 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023287863590772318, + "loss": 2.8956, + "theoretical_loss": 3.46445343465672, + "tokens_seen": 1778920448 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023286860581745236, + "loss": 2.5246, + "theoretical_loss": 3.4644425465574837, + "tokens_seen": 1778985984 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023285857572718154, + "loss": 2.4851, + "theoretical_loss": 3.464431658971651, + "tokens_seen": 1779051520 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023284854563691075, + "loss": 2.6301, + "theoretical_loss": 3.4644207718991793, + "tokens_seen": 1779117056 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023283851554663993, + "loss": 2.4833, + "theoretical_loss": 3.464409885340025, + "tokens_seen": 1779182592 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023282848545636911, + "loss": 2.6387, + "theoretical_loss": 3.4643989992941457, + "tokens_seen": 1779248128 + }, + { + "epoch": 5.09, + "objective/train/docs_used": 1923636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.7796472311019897, + "objective/train/theoretical_loss": 3.4643908350965464, + "objective/train/tokens_used": 1780554208, + "theoretical_loss": 3.4643908350965464, + "tokens_seen": 1779297280 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002328184553660983, + "loss": 2.1337, + "theoretical_loss": 3.4643881137614976, + "tokens_seen": 1779313664 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023280842527582748, + "loss": 2.3812, + "theoretical_loss": 3.4643772287420385, + "tokens_seen": 1779379200 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023279839518555669, + "loss": 2.4999, + "theoretical_loss": 3.4643663442357235, + "tokens_seen": 1779444736 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023278836509528587, + "loss": 2.4292, + "theoretical_loss": 3.464355460242512, + "tokens_seen": 1779510272 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023277833500501505, + "loss": 2.561, + "theoretical_loss": 3.4643445767623593, + "tokens_seen": 1779575808 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023276830491474423, + "loss": 2.4096, + "theoretical_loss": 3.464333693795222, + "tokens_seen": 1779641344 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023275827482447344, + "loss": 2.7237, + "theoretical_loss": 3.4643228113410585, + "tokens_seen": 1779706880 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023274824473420262, + "loss": 2.4071, + "theoretical_loss": 3.4643119293998246, + "tokens_seen": 1779772416 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002327382146439318, + "loss": 2.4214, + "theoretical_loss": 3.4643010479714773, + "tokens_seen": 1779837952 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023272818455366098, + "loss": 2.5758, + "theoretical_loss": 3.464290167055974, + "tokens_seen": 1779903488 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023271815446339016, + "loss": 2.3163, + "theoretical_loss": 3.4642792866532717, + "tokens_seen": 1779969024 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023270812437311937, + "loss": 2.4612, + "theoretical_loss": 3.4642684067633267, + "tokens_seen": 1780034560 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023269809428284855, + "loss": 2.6655, + "theoretical_loss": 3.4642575273860965, + "tokens_seen": 1780100096 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023268806419257773, + "loss": 2.6247, + "theoretical_loss": 3.4642466485215384, + "tokens_seen": 1780165632 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002326780341023069, + "loss": 2.329, + "theoretical_loss": 3.4642357701696085, + "tokens_seen": 1780231168 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023266800401203612, + "loss": 2.4286, + "theoretical_loss": 3.4642248923302636, + "tokens_seen": 1780296704 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002326579739217653, + "loss": 2.5486, + "theoretical_loss": 3.464214015003462, + "tokens_seen": 1780362240 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023264794383149448, + "loss": 2.3246, + "theoretical_loss": 3.46420313818916, + "tokens_seen": 1780427776 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023263791374122366, + "loss": 2.6407, + "theoretical_loss": 3.464192261887314, + "tokens_seen": 1780493312 + }, + { + "epoch": 5.09, + "learning_rate": 0.00023262788365095287, + "loss": 2.6565, + "theoretical_loss": 3.4641813860978816, + "tokens_seen": 1780558848 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023261785356068205, + "loss": 3.4603, + "theoretical_loss": 3.4641679620018704, + "tokens_seen": 1780639744 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023260782347041124, + "loss": 2.7234, + "theoretical_loss": 3.4641570873572065, + "tokens_seen": 1780705280 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023259779338014042, + "loss": 2.6585, + "theoretical_loss": 3.4641462132248173, + "tokens_seen": 1780770816 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002325877632898696, + "loss": 2.5922, + "theoretical_loss": 3.4641353396046597, + "tokens_seen": 1780836352 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002325777331995988, + "loss": 2.6082, + "theoretical_loss": 3.46412446649669, + "tokens_seen": 1780901888 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1988445, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4177474975585938, + "objective/train/theoretical_loss": 3.464119030134763, + "objective/train/tokens_used": 1801394656, + "theoretical_loss": 3.464119030134763, + "tokens_seen": 1780934656 + }, + { + "epoch": 6.0, + "learning_rate": 0.000232567703109328, + "loss": 2.5676, + "theoretical_loss": 3.4641135939008665, + "tokens_seen": 1780967424 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023255767301905717, + "loss": 2.5814, + "theoretical_loss": 3.464102721817145, + "tokens_seen": 1781032960 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023254764292878635, + "loss": 2.553, + "theoretical_loss": 3.4640918502454836, + "tokens_seen": 1781098496 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023253761283851556, + "loss": 2.5737, + "theoretical_loss": 3.464080979185839, + "tokens_seen": 1781164032 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023252758274824474, + "loss": 2.6425, + "theoretical_loss": 3.4640701086381673, + "tokens_seen": 1781229568 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023251755265797392, + "loss": 2.6861, + "theoretical_loss": 3.4640592386024274, + "tokens_seen": 1781295104 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002325075225677031, + "loss": 2.6236, + "theoretical_loss": 3.4640483690785744, + "tokens_seen": 1781360640 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023249749247743228, + "loss": 2.535, + "theoretical_loss": 3.464037500066566, + "tokens_seen": 1781426176 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002324874623871615, + "loss": 2.7341, + "theoretical_loss": 3.4640266315663606, + "tokens_seen": 1781491712 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023247743229689067, + "loss": 2.6326, + "theoretical_loss": 3.4640157635779136, + "tokens_seen": 1781557248 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023246740220661985, + "loss": 2.6268, + "theoretical_loss": 3.4640048961011827, + "tokens_seen": 1781622784 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023245737211634903, + "loss": 2.648, + "theoretical_loss": 3.463994029136125, + "tokens_seen": 1781688320 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023244734202607824, + "loss": 2.7074, + "theoretical_loss": 3.4639831626826973, + "tokens_seen": 1781753856 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023243731193580742, + "loss": 2.6623, + "theoretical_loss": 3.4639722967408573, + "tokens_seen": 1781819392 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002324272818455366, + "loss": 2.6243, + "theoretical_loss": 3.463961431310562, + "tokens_seen": 1781884928 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023241725175526578, + "loss": 2.6062, + "theoretical_loss": 3.4639505663917673, + "tokens_seen": 1781950464 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023240722166499497, + "loss": 2.5785, + "theoretical_loss": 3.463939701984432, + "tokens_seen": 1782016000 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023239719157472417, + "loss": 2.5885, + "theoretical_loss": 3.4639288380885125, + "tokens_seen": 1782081536 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023238716148445336, + "loss": 2.7717, + "theoretical_loss": 3.463917974703965, + "tokens_seen": 1782147072 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023237713139418254, + "loss": 2.6371, + "theoretical_loss": 3.4639071118307485, + "tokens_seen": 1782212608 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023236710130391172, + "loss": 2.6548, + "theoretical_loss": 3.463896249468819, + "tokens_seen": 1782278144 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023235707121364095, + "loss": 2.6334, + "theoretical_loss": 3.4638853876181335, + "tokens_seen": 1782343680 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023234704112337013, + "loss": 2.6794, + "theoretical_loss": 3.4638745262786497, + "tokens_seen": 1782409216 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023233701103309932, + "loss": 2.4647, + "theoretical_loss": 3.463863665450324, + "tokens_seen": 1782474752 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002323269809428285, + "loss": 2.683, + "theoretical_loss": 3.4638528051331146, + "tokens_seen": 1782540288 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1993451, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5961039066314697, + "objective/train/theoretical_loss": 3.463847375166165, + "objective/train/tokens_used": 1803033056, + "theoretical_loss": 3.463847375166165, + "tokens_seen": 1782573056 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023231695085255768, + "loss": 2.6831, + "theoretical_loss": 3.463841945326978, + "tokens_seen": 1782605824 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023230692076228689, + "loss": 2.6943, + "theoretical_loss": 3.4638310860318713, + "tokens_seen": 1782671360 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023229689067201607, + "loss": 2.5955, + "theoretical_loss": 3.4638202272477514, + "tokens_seen": 1782736896 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023228686058174525, + "loss": 2.6862, + "theoretical_loss": 3.4638093689745766, + "tokens_seen": 1782802432 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023227683049147443, + "loss": 2.5886, + "theoretical_loss": 3.4637985112123033, + "tokens_seen": 1782867968 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023226680040120364, + "loss": 2.538, + "theoretical_loss": 3.463787653960888, + "tokens_seen": 1782933504 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023225677031093282, + "loss": 2.6801, + "theoretical_loss": 3.463776797220289, + "tokens_seen": 1782999040 + }, + { + "epoch": 6.0, + "learning_rate": 0.000232246740220662, + "loss": 2.6758, + "theoretical_loss": 3.4637659409904638, + "tokens_seen": 1783064576 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023223671013039118, + "loss": 2.6155, + "theoretical_loss": 3.4637550852713686, + "tokens_seen": 1783130112 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023222668004012036, + "loss": 2.6869, + "theoretical_loss": 3.4637442300629604, + "tokens_seen": 1783195648 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023221664994984957, + "loss": 2.8396, + "theoretical_loss": 3.463733375365197, + "tokens_seen": 1783261184 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023220661985957875, + "loss": 2.7689, + "theoretical_loss": 3.4637225211780356, + "tokens_seen": 1783326720 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023219658976930793, + "loss": 2.6325, + "theoretical_loss": 3.4637116675014337, + "tokens_seen": 1783392256 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002321865596790371, + "loss": 2.6908, + "theoretical_loss": 3.463700814335348, + "tokens_seen": 1783457792 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023217652958876632, + "loss": 2.5329, + "theoretical_loss": 3.463689961679736, + "tokens_seen": 1783523328 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002321664994984955, + "loss": 2.6341, + "theoretical_loss": 3.4636791095345547, + "tokens_seen": 1783588864 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023215646940822468, + "loss": 2.7021, + "theoretical_loss": 3.4636682578997613, + "tokens_seen": 1783654400 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023214643931795386, + "loss": 2.5456, + "theoretical_loss": 3.463657406775314, + "tokens_seen": 1783719936 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023213640922768307, + "loss": 2.6259, + "theoretical_loss": 3.4636465561611685, + "tokens_seen": 1783785472 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023212637913741225, + "loss": 2.6354, + "theoretical_loss": 3.4636357060572833, + "tokens_seen": 1783851008 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023211634904714144, + "loss": 2.5532, + "theoretical_loss": 3.463624856463615, + "tokens_seen": 1783916544 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023210631895687062, + "loss": 2.6949, + "theoretical_loss": 3.463614007380121, + "tokens_seen": 1783982080 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002320962888665998, + "loss": 2.6938, + "theoretical_loss": 3.463603158806759, + "tokens_seen": 1784047616 + }, + { + "epoch": 6.0, + "learning_rate": 0.000232086258776329, + "loss": 2.6866, + "theoretical_loss": 3.4635923107434854, + "tokens_seen": 1784113152 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002320762286860582, + "loss": 2.5674, + "theoretical_loss": 3.463581463190258, + "tokens_seen": 1784178688 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1998470, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4440953731536865, + "objective/train/theoretical_loss": 3.4635760396048987, + "objective/train/tokens_used": 1804671456, + "theoretical_loss": 3.4635760396048987, + "tokens_seen": 1784211456 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023206619859578737, + "loss": 2.6431, + "theoretical_loss": 3.4635706161470345, + "tokens_seen": 1784244224 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023205616850551655, + "loss": 2.5204, + "theoretical_loss": 3.463559769613772, + "tokens_seen": 1784309760 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023204613841524576, + "loss": 2.6567, + "theoretical_loss": 3.4635489235904267, + "tokens_seen": 1784375296 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023203610832497494, + "loss": 2.557, + "theoretical_loss": 3.463538078076957, + "tokens_seen": 1784440832 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023202607823470412, + "loss": 2.8933, + "theoretical_loss": 3.46352723307332, + "tokens_seen": 1784506368 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002320160481444333, + "loss": 2.4005, + "theoretical_loss": 3.4635163885794733, + "tokens_seen": 1784571904 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023200601805416248, + "loss": 2.6778, + "theoretical_loss": 3.4635055445953737, + "tokens_seen": 1784637440 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002319959879638917, + "loss": 2.6741, + "theoretical_loss": 3.463494701120979, + "tokens_seen": 1784702976 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023198595787362087, + "loss": 2.6936, + "theoretical_loss": 3.4634838581562457, + "tokens_seen": 1784768512 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023197592778335005, + "loss": 2.6251, + "theoretical_loss": 3.463473015701132, + "tokens_seen": 1784834048 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023196589769307923, + "loss": 2.6029, + "theoretical_loss": 3.4634621737555946, + "tokens_seen": 1784899584 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023195586760280844, + "loss": 2.5414, + "theoretical_loss": 3.4634513323195915, + "tokens_seen": 1784965120 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023194583751253762, + "loss": 2.6506, + "theoretical_loss": 3.4634404913930794, + "tokens_seen": 1785030656 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002319358074222668, + "loss": 2.6338, + "theoretical_loss": 3.4634296509760163, + "tokens_seen": 1785096192 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023192577733199599, + "loss": 2.7362, + "theoretical_loss": 3.463418811068359, + "tokens_seen": 1785161728 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023191574724172517, + "loss": 2.5864, + "theoretical_loss": 3.4634079716700654, + "tokens_seen": 1785227264 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023190571715145437, + "loss": 2.7441, + "theoretical_loss": 3.4633971327810924, + "tokens_seen": 1785292800 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023189568706118356, + "loss": 2.6639, + "theoretical_loss": 3.4633862944013973, + "tokens_seen": 1785358336 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023188565697091274, + "loss": 2.7691, + "theoretical_loss": 3.4633754565309376, + "tokens_seen": 1785423872 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023187562688064192, + "loss": 2.5697, + "theoretical_loss": 3.4633646191696705, + "tokens_seen": 1785489408 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023186559679037113, + "loss": 2.4497, + "theoretical_loss": 3.4633537823175544, + "tokens_seen": 1785554944 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002318555667001003, + "loss": 2.7018, + "theoretical_loss": 3.4633429459745457, + "tokens_seen": 1785620480 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002318455366098295, + "loss": 2.5004, + "theoretical_loss": 3.463332110140602, + "tokens_seen": 1785686016 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023183550651955867, + "loss": 2.8537, + "theoretical_loss": 3.463321274815681, + "tokens_seen": 1785751552 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023182547642928785, + "loss": 2.6595, + "theoretical_loss": 3.4633104399997396, + "tokens_seen": 1785817088 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 2003404, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.415255308151245, + "objective/train/theoretical_loss": 3.463305022782623, + "objective/train/tokens_used": 1806309856, + "theoretical_loss": 3.463305022782623, + "tokens_seen": 1785849856 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023181544633901706, + "loss": 2.466, + "theoretical_loss": 3.463299605692735, + "tokens_seen": 1785882624 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023180541624874624, + "loss": 2.6709, + "theoretical_loss": 3.463288771894626, + "tokens_seen": 1785948160 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023179538615847542, + "loss": 2.7177, + "theoretical_loss": 3.4632779386053687, + "tokens_seen": 1786013696 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002317853560682046, + "loss": 2.5029, + "theoretical_loss": 3.463267105824921, + "tokens_seen": 1786079232 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002317753259779338, + "loss": 2.5948, + "theoretical_loss": 3.4632562735532404, + "tokens_seen": 1786144768 + }, + { + "epoch": 6.0, + "learning_rate": 0.000231765295887663, + "loss": 2.4019, + "theoretical_loss": 3.463245441790284, + "tokens_seen": 1786210304 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023175526579739217, + "loss": 2.6825, + "theoretical_loss": 3.4632346105360092, + "tokens_seen": 1786275840 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023174523570712135, + "loss": 2.62, + "theoretical_loss": 3.4632237797903747, + "tokens_seen": 1786341376 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023173520561685053, + "loss": 2.6821, + "theoretical_loss": 3.463212949553336, + "tokens_seen": 1786406912 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023172517552657974, + "loss": 2.6356, + "theoretical_loss": 3.463202119824852, + "tokens_seen": 1786472448 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023171514543630892, + "loss": 2.6615, + "theoretical_loss": 3.4631912906048794, + "tokens_seen": 1786537984 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002317051153460381, + "loss": 2.6314, + "theoretical_loss": 3.4631804618933764, + "tokens_seen": 1786603520 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023169508525576729, + "loss": 2.5312, + "theoretical_loss": 3.4631696336903, + "tokens_seen": 1786669056 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002316850551654965, + "loss": 2.8512, + "theoretical_loss": 3.463158805995607, + "tokens_seen": 1786734592 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023167502507522568, + "loss": 2.7573, + "theoretical_loss": 3.463147978809256, + "tokens_seen": 1786800128 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023166499498495486, + "loss": 2.7197, + "theoretical_loss": 3.4631371521312047, + "tokens_seen": 1786865664 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023165496489468404, + "loss": 2.6101, + "theoretical_loss": 3.46312632596141, + "tokens_seen": 1786931200 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023164493480441322, + "loss": 2.6172, + "theoretical_loss": 3.4631155002998284, + "tokens_seen": 1786996736 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023163490471414243, + "loss": 2.7336, + "theoretical_loss": 3.463104675146419, + "tokens_seen": 1787062272 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002316248746238716, + "loss": 2.6689, + "theoretical_loss": 3.4630938505011386, + "tokens_seen": 1787127808 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002316148445336008, + "loss": 2.5802, + "theoretical_loss": 3.463083026363945, + "tokens_seen": 1787193344 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023160481444333, + "loss": 2.5955, + "theoretical_loss": 3.4630722027347955, + "tokens_seen": 1787258880 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002315947843530592, + "loss": 2.5698, + "theoretical_loss": 3.463061379613648, + "tokens_seen": 1787324416 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002315847542627884, + "loss": 2.5366, + "theoretical_loss": 3.463050557000459, + "tokens_seen": 1787389952 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023157472417251757, + "loss": 2.8764, + "theoretical_loss": 3.463039734895187, + "tokens_seen": 1787455488 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 2008412, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6942412853240967, + "objective/train/theoretical_loss": 3.4630343240330066, + "objective/train/tokens_used": 1807948256, + "theoretical_loss": 3.4630343240330066, + "tokens_seen": 1787488256 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023156469408224675, + "loss": 2.6602, + "theoretical_loss": 3.4630289132977894, + "tokens_seen": 1787521024 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023155466399197596, + "loss": 2.5687, + "theoretical_loss": 3.4630180922082237, + "tokens_seen": 1787586560 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023154463390170514, + "loss": 2.5634, + "theoretical_loss": 3.4630072716264473, + "tokens_seen": 1787652096 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023153460381143432, + "loss": 2.7274, + "theoretical_loss": 3.462996451552418, + "tokens_seen": 1787717632 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002315245737211635, + "loss": 2.6341, + "theoretical_loss": 3.4629856319860925, + "tokens_seen": 1787783168 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023151454363089268, + "loss": 2.4239, + "theoretical_loss": 3.4629748129274303, + "tokens_seen": 1787848704 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002315045135406219, + "loss": 2.4723, + "theoretical_loss": 3.462963994376387, + "tokens_seen": 1787914240 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023149448345035107, + "loss": 2.5423, + "theoretical_loss": 3.462953176332921, + "tokens_seen": 1787979776 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023148445336008025, + "loss": 2.4375, + "theoretical_loss": 3.4629423587969894, + "tokens_seen": 1788045312 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023147442326980943, + "loss": 2.5739, + "theoretical_loss": 3.462931541768551, + "tokens_seen": 1788110848 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023146439317953864, + "loss": 2.789, + "theoretical_loss": 3.462920725247562, + "tokens_seen": 1788176384 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023145436308926782, + "loss": 2.6926, + "theoretical_loss": 3.4629099092339812, + "tokens_seen": 1788241920 + }, + { + "epoch": 6.0, + "learning_rate": 0.000231444332998997, + "loss": 2.6702, + "theoretical_loss": 3.462899093727765, + "tokens_seen": 1788307456 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023143430290872619, + "loss": 2.4993, + "theoretical_loss": 3.4628882787288715, + "tokens_seen": 1788372992 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023142427281845537, + "loss": 2.5323, + "theoretical_loss": 3.462877464237259, + "tokens_seen": 1788438528 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023141424272818457, + "loss": 2.6028, + "theoretical_loss": 3.4628666502528844, + "tokens_seen": 1788504064 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023140421263791376, + "loss": 2.813, + "theoretical_loss": 3.462855836775705, + "tokens_seen": 1788569600 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023139418254764294, + "loss": 2.6139, + "theoretical_loss": 3.462845023805679, + "tokens_seen": 1788635136 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023138415245737212, + "loss": 2.7396, + "theoretical_loss": 3.462834211342764, + "tokens_seen": 1788700672 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023137412236710133, + "loss": 2.6212, + "theoretical_loss": 3.4628233993869175, + "tokens_seen": 1788766208 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002313640922768305, + "loss": 2.5981, + "theoretical_loss": 3.4628125879380978, + "tokens_seen": 1788831744 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002313540621865597, + "loss": 2.6835, + "theoretical_loss": 3.4628017769962613, + "tokens_seen": 1788897280 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023134403209628887, + "loss": 2.5397, + "theoretical_loss": 3.4627909665613665, + "tokens_seen": 1788962816 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023133400200601805, + "loss": 2.5619, + "theoretical_loss": 3.4627801566333707, + "tokens_seen": 1789028352 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023132397191574726, + "loss": 2.5985, + "theoretical_loss": 3.462769347212232, + "tokens_seen": 1789093888 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 2011203, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.391145706176758, + "objective/train/theoretical_loss": 3.4627639426917205, + "objective/train/tokens_used": 1809586656, + "theoretical_loss": 3.4627639426917205, + "tokens_seen": 1789126656 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023131394182547644, + "loss": 2.6131, + "theoretical_loss": 3.4627585382979076, + "tokens_seen": 1789159424 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023130391173520562, + "loss": 2.7482, + "theoretical_loss": 3.4627477298903555, + "tokens_seen": 1789224960 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002312938816449348, + "loss": 2.5442, + "theoretical_loss": 3.462736921989533, + "tokens_seen": 1789290496 + }, + { + "epoch": 6.0, + "learning_rate": 0.000231283851554664, + "loss": 2.441, + "theoretical_loss": 3.462726114595398, + "tokens_seen": 1789356032 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002312738214643932, + "loss": 2.6041, + "theoretical_loss": 3.4627153077079083, + "tokens_seen": 1789421568 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023126379137412237, + "loss": 2.5643, + "theoretical_loss": 3.4627045013270217, + "tokens_seen": 1789487104 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023125376128385155, + "loss": 2.6464, + "theoretical_loss": 3.4626936954526957, + "tokens_seen": 1789552640 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023124373119358073, + "loss": 2.6441, + "theoretical_loss": 3.4626828900848876, + "tokens_seen": 1789618176 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023123370110330994, + "loss": 2.6312, + "theoretical_loss": 3.4626720852235566, + "tokens_seen": 1789683712 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023122367101303912, + "loss": 2.5889, + "theoretical_loss": 3.4626612808686583, + "tokens_seen": 1789749248 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002312136409227683, + "loss": 2.7286, + "theoretical_loss": 3.4626504770201514, + "tokens_seen": 1789814784 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002312036108324975, + "loss": 2.7318, + "theoretical_loss": 3.4626396736779945, + "tokens_seen": 1789880320 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002311935807422267, + "loss": 2.6745, + "theoretical_loss": 3.4626288708421438, + "tokens_seen": 1789945856 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023118355065195588, + "loss": 2.6087, + "theoretical_loss": 3.4626180685125583, + "tokens_seen": 1790011392 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023117352056168506, + "loss": 2.667, + "theoretical_loss": 3.4626072666891945, + "tokens_seen": 1790076928 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023116349047141424, + "loss": 2.4849, + "theoretical_loss": 3.4625964653720116, + "tokens_seen": 1790142464 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023115346038114342, + "loss": 2.3665, + "theoretical_loss": 3.462585664560966, + "tokens_seen": 1790208000 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023114343029087263, + "loss": 2.4967, + "theoretical_loss": 3.4625748642560166, + "tokens_seen": 1790273536 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002311334002006018, + "loss": 2.7586, + "theoretical_loss": 3.46256406445712, + "tokens_seen": 1790339072 + }, + { + "epoch": 6.0, + "learning_rate": 0.000231123370110331, + "loss": 2.6626, + "theoretical_loss": 3.4625532651642352, + "tokens_seen": 1790404608 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023111334002006017, + "loss": 2.3549, + "theoretical_loss": 3.462542466377319, + "tokens_seen": 1790470144 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023110330992978938, + "loss": 2.5911, + "theoretical_loss": 3.4625316680963296, + "tokens_seen": 1790535680 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023109327983951856, + "loss": 2.757, + "theoretical_loss": 3.4625208703212245, + "tokens_seen": 1790601216 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023108324974924774, + "loss": 2.5254, + "theoretical_loss": 3.462510073051962, + "tokens_seen": 1790666752 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023107321965897692, + "loss": 2.6383, + "theoretical_loss": 3.462499276288499, + "tokens_seen": 1790732288 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 2016374, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6111676692962646, + "objective/train/theoretical_loss": 3.4624938780964296, + "objective/train/tokens_used": 1811225056, + "theoretical_loss": 3.4624938780964296, + "tokens_seen": 1790765056 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002310631895687061, + "loss": 2.5934, + "theoretical_loss": 3.4624884800307942, + "tokens_seen": 1790797824 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002310531594784353, + "loss": 2.5837, + "theoretical_loss": 3.4624776842788054, + "tokens_seen": 1790863360 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002310431293881645, + "loss": 2.4408, + "theoretical_loss": 3.4624668890324894, + "tokens_seen": 1790928896 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023103309929789367, + "loss": 2.7033, + "theoretical_loss": 3.462456094291806, + "tokens_seen": 1790994432 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023102306920762286, + "loss": 2.6713, + "theoretical_loss": 3.4624453000567104, + "tokens_seen": 1791059968 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023101303911735206, + "loss": 2.4653, + "theoretical_loss": 3.462434506327162, + "tokens_seen": 1791125504 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023100300902708124, + "loss": 2.5294, + "theoretical_loss": 3.4624237131031186, + "tokens_seen": 1791191040 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023099297893681043, + "loss": 2.7119, + "theoretical_loss": 3.462412920384538, + "tokens_seen": 1791256576 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002309829488465396, + "loss": 2.5355, + "theoretical_loss": 3.4624021281713775, + "tokens_seen": 1791322112 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023097291875626881, + "loss": 2.6543, + "theoretical_loss": 3.462391336463595, + "tokens_seen": 1791387648 + }, + { + "epoch": 6.0, + "learning_rate": 0.000230962888665998, + "loss": 2.6801, + "theoretical_loss": 3.4623805452611487, + "tokens_seen": 1791453184 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023095285857572718, + "loss": 2.6676, + "theoretical_loss": 3.462369754563997, + "tokens_seen": 1791518720 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023094282848545636, + "loss": 2.7459, + "theoretical_loss": 3.462358964372096, + "tokens_seen": 1791584256 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023093279839518554, + "loss": 2.5861, + "theoretical_loss": 3.4623481746854057, + "tokens_seen": 1791649792 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023092276830491475, + "loss": 2.5331, + "theoretical_loss": 3.462337385503883, + "tokens_seen": 1791715328 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023091273821464393, + "loss": 2.6271, + "theoretical_loss": 3.462326596827485, + "tokens_seen": 1791780864 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002309027081243731, + "loss": 2.7419, + "theoretical_loss": 3.4623158086561707, + "tokens_seen": 1791846400 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002308926780341023, + "loss": 2.6493, + "theoretical_loss": 3.4623050209898976, + "tokens_seen": 1791911936 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002308826479438315, + "loss": 2.5061, + "theoretical_loss": 3.462294233828623, + "tokens_seen": 1791977472 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023087261785356068, + "loss": 2.5831, + "theoretical_loss": 3.4622834471723056, + "tokens_seen": 1792043008 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023086258776328986, + "loss": 2.7366, + "theoretical_loss": 3.4622726610209034, + "tokens_seen": 1792108544 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023085255767301907, + "loss": 2.6033, + "theoretical_loss": 3.462261875374374, + "tokens_seen": 1792174080 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023084252758274825, + "loss": 2.4686, + "theoretical_loss": 3.462251090232675, + "tokens_seen": 1792239616 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023083249749247746, + "loss": 2.6278, + "theoretical_loss": 3.462240305595765, + "tokens_seen": 1792305152 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023082246740220664, + "loss": 2.5505, + "theoretical_loss": 3.462229521463601, + "tokens_seen": 1792370688 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 2021422, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9169247150421143, + "objective/train/theoretical_loss": 3.4622241295867857, + "objective/train/tokens_used": 1812863456, + "theoretical_loss": 3.4622241295867857, + "tokens_seen": 1792403456 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023081243731193582, + "loss": 2.8172, + "theoretical_loss": 3.4622187378361415, + "tokens_seen": 1792436224 + }, + { + "epoch": 6.0, + "learning_rate": 0.000230802407221665, + "loss": 2.5892, + "theoretical_loss": 3.4622079547133446, + "tokens_seen": 1792501760 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002307923771313942, + "loss": 2.7061, + "theoretical_loss": 3.4621971720951676, + "tokens_seen": 1792567296 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002307823470411234, + "loss": 2.6604, + "theoretical_loss": 3.4621863899815692, + "tokens_seen": 1792632832 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023077231695085257, + "loss": 2.4921, + "theoretical_loss": 3.4621756083725064, + "tokens_seen": 1792698368 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023076228686058175, + "loss": 2.5904, + "theoretical_loss": 3.462164827267938, + "tokens_seen": 1792763904 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023075225677031094, + "loss": 2.5383, + "theoretical_loss": 3.462154046667822, + "tokens_seen": 1792829440 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023074222668004014, + "loss": 2.7259, + "theoretical_loss": 3.4621432665721157, + "tokens_seen": 1792894976 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023073219658976932, + "loss": 2.6006, + "theoretical_loss": 3.462132486980778, + "tokens_seen": 1792960512 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002307221664994985, + "loss": 2.6383, + "theoretical_loss": 3.4621217078937656, + "tokens_seen": 1793026048 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002307121364092277, + "loss": 2.6948, + "theoretical_loss": 3.462110929311037, + "tokens_seen": 1793091584 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002307021063189569, + "loss": 2.7399, + "theoretical_loss": 3.4621001512325504, + "tokens_seen": 1793157120 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023069207622868608, + "loss": 2.6013, + "theoretical_loss": 3.462089373658264, + "tokens_seen": 1793222656 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023068204613841526, + "loss": 2.4719, + "theoretical_loss": 3.4620785965881353, + "tokens_seen": 1793288192 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023067201604814444, + "loss": 2.5918, + "theoretical_loss": 3.4620678200221224, + "tokens_seen": 1793353728 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023066198595787362, + "loss": 2.4979, + "theoretical_loss": 3.462057043960183, + "tokens_seen": 1793419264 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023065195586760283, + "loss": 2.5663, + "theoretical_loss": 3.4620462684022764, + "tokens_seen": 1793484800 + }, + { + "epoch": 6.0, + "learning_rate": 0.000230641925777332, + "loss": 2.3784, + "theoretical_loss": 3.462035493348359, + "tokens_seen": 1793550336 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002306318956870612, + "loss": 2.5167, + "theoretical_loss": 3.4620247187983892, + "tokens_seen": 1793615872 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023062186559679037, + "loss": 2.7611, + "theoretical_loss": 3.462013944752326, + "tokens_seen": 1793681408 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023061183550651958, + "loss": 2.6072, + "theoretical_loss": 3.4620031712101262, + "tokens_seen": 1793746944 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023060180541624876, + "loss": 2.6941, + "theoretical_loss": 3.4619923981717484, + "tokens_seen": 1793812480 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023059177532597794, + "loss": 2.5358, + "theoretical_loss": 3.4619816256371507, + "tokens_seen": 1793878016 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023058174523570712, + "loss": 2.6103, + "theoretical_loss": 3.4619708536062914, + "tokens_seen": 1793943552 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002305717151454363, + "loss": 2.7136, + "theoretical_loss": 3.461960082079128, + "tokens_seen": 1794009088 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 2026152, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.368499517440796, + "objective/train/theoretical_loss": 3.461954696504419, + "objective/train/tokens_used": 1814501856, + "theoretical_loss": 3.461954696504419, + "tokens_seen": 1794041856 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002305616850551655, + "loss": 2.5295, + "theoretical_loss": 3.4619493110556183, + "tokens_seen": 1794074624 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002305516549648947, + "loss": 2.6119, + "theoretical_loss": 3.461938540535721, + "tokens_seen": 1794140160 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023054162487462387, + "loss": 2.6119, + "theoretical_loss": 3.4619277705193943, + "tokens_seen": 1794205696 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023053159478435306, + "loss": 2.5741, + "theoretical_loss": 3.4619170010065954, + "tokens_seen": 1794271232 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023052156469408226, + "loss": 2.6946, + "theoretical_loss": 3.461906231997283, + "tokens_seen": 1794336768 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023051153460381144, + "loss": 2.5917, + "theoretical_loss": 3.461895463491415, + "tokens_seen": 1794402304 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023050150451354063, + "loss": 2.7344, + "theoretical_loss": 3.46188469548895, + "tokens_seen": 1794467840 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002304914744232698, + "loss": 2.6991, + "theoretical_loss": 3.461873927989845, + "tokens_seen": 1794533376 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023048144433299902, + "loss": 2.5785, + "theoretical_loss": 3.461863160994059, + "tokens_seen": 1794598912 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002304714142427282, + "loss": 2.629, + "theoretical_loss": 3.4618523945015496, + "tokens_seen": 1794664448 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023046138415245738, + "loss": 2.7818, + "theoretical_loss": 3.461841628512275, + "tokens_seen": 1794729984 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023045135406218656, + "loss": 2.6838, + "theoretical_loss": 3.4618308630261936, + "tokens_seen": 1794795520 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023044132397191574, + "loss": 2.6502, + "theoretical_loss": 3.461820098043263, + "tokens_seen": 1794861056 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023043129388164495, + "loss": 2.7259, + "theoretical_loss": 3.4618093335634423, + "tokens_seen": 1794926592 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023042126379137413, + "loss": 2.5748, + "theoretical_loss": 3.461798569586688, + "tokens_seen": 1794992128 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002304112337011033, + "loss": 2.6937, + "theoretical_loss": 3.4617878061129597, + "tokens_seen": 1795057664 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002304012036108325, + "loss": 2.5537, + "theoretical_loss": 3.461777043142215, + "tokens_seen": 1795123200 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002303911735205617, + "loss": 2.7301, + "theoretical_loss": 3.4617662806744116, + "tokens_seen": 1795188736 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023038114343029088, + "loss": 2.526, + "theoretical_loss": 3.4617555187095084, + "tokens_seen": 1795254272 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023037111334002006, + "loss": 2.7283, + "theoretical_loss": 3.4617447572474633, + "tokens_seen": 1795319808 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023036108324974924, + "loss": 2.4514, + "theoretical_loss": 3.461733996288234, + "tokens_seen": 1795385344 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023035105315947842, + "loss": 2.6518, + "theoretical_loss": 3.461723235831779, + "tokens_seen": 1795450880 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023034102306920763, + "loss": 2.7155, + "theoretical_loss": 3.461712475878057, + "tokens_seen": 1795516416 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002303309929789368, + "loss": 2.6113, + "theoretical_loss": 3.461701716427025, + "tokens_seen": 1795581952 + }, + { + "epoch": 6.0, + "learning_rate": 0.000230320962888666, + "loss": 2.7733, + "theoretical_loss": 3.461690957478642, + "tokens_seen": 1795647488 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 2031269, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.562140941619873, + "objective/train/theoretical_loss": 3.461685578192931, + "objective/train/tokens_used": 1816140256, + "theoretical_loss": 3.461685578192931, + "tokens_seen": 1795680256 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023031093279839518, + "loss": 2.6737, + "theoretical_loss": 3.461680199032866, + "tokens_seen": 1795713024 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023030090270812438, + "loss": 2.5679, + "theoretical_loss": 3.461669441089655, + "tokens_seen": 1795778560 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023029087261785356, + "loss": 2.6601, + "theoretical_loss": 3.4616586836489676, + "tokens_seen": 1795844096 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023028084252758275, + "loss": 2.6584, + "theoretical_loss": 3.4616479267107616, + "tokens_seen": 1795909632 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023027081243731193, + "loss": 2.6565, + "theoretical_loss": 3.4616371702749946, + "tokens_seen": 1795975168 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002302607823470411, + "loss": 2.6871, + "theoretical_loss": 3.4616264143416267, + "tokens_seen": 1796040704 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023025075225677032, + "loss": 2.6442, + "theoretical_loss": 3.461615658910614, + "tokens_seen": 1796106240 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002302407221664995, + "loss": 2.6063, + "theoretical_loss": 3.4616049039819163, + "tokens_seen": 1796171776 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023023069207622868, + "loss": 2.73, + "theoretical_loss": 3.4615941495554905, + "tokens_seen": 1796237312 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023022066198595786, + "loss": 2.662, + "theoretical_loss": 3.461583395631296, + "tokens_seen": 1796302848 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023021063189568707, + "loss": 2.89, + "theoretical_loss": 3.46157264220929, + "tokens_seen": 1796368384 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023020060180541625, + "loss": 2.7899, + "theoretical_loss": 3.461561889289431, + "tokens_seen": 1796433920 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023019057171514543, + "loss": 2.7136, + "theoretical_loss": 3.4615511368716785, + "tokens_seen": 1796499456 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002301805416248746, + "loss": 2.6399, + "theoretical_loss": 3.4615403849559887, + "tokens_seen": 1796564992 + }, + { + "epoch": 6.0, + "learning_rate": 0.0002301705115346038, + "loss": 2.6746, + "theoretical_loss": 3.4615296335423213, + "tokens_seen": 1796630528 + }, + { + "epoch": 6.0, + "learning_rate": 0.000230160481444333, + "loss": 2.5864, + "theoretical_loss": 3.461518882630634, + "tokens_seen": 1796696064 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023015045135406218, + "loss": 2.5243, + "theoretical_loss": 3.4615081322208847, + "tokens_seen": 1796761600 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023014042126379136, + "loss": 2.6444, + "theoretical_loss": 3.4614973823130324, + "tokens_seen": 1796827136 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023013039117352054, + "loss": 2.4187, + "theoretical_loss": 3.461486632907035, + "tokens_seen": 1796892672 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023012036108324975, + "loss": 2.5483, + "theoretical_loss": 3.461475884002851, + "tokens_seen": 1796958208 + }, + { + "epoch": 6.0, + "learning_rate": 0.00023011033099297893, + "loss": 2.6439, + "theoretical_loss": 3.4614651356004384, + "tokens_seen": 1797023744 + }, + { + "epoch": 6.01, + "learning_rate": 0.00023010030090270814, + "loss": 2.6534, + "theoretical_loss": 3.4614543876997557, + "tokens_seen": 1797089280 + }, + { + "epoch": 6.01, + "learning_rate": 0.00023009027081243732, + "loss": 2.6655, + "theoretical_loss": 3.461443640300761, + "tokens_seen": 1797154816 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002300802407221665, + "loss": 2.7352, + "theoretical_loss": 3.461432893403412, + "tokens_seen": 1797220352 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002300702106318957, + "loss": 2.5736, + "theoretical_loss": 3.461422147007668, + "tokens_seen": 1797285888 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2036128, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.879075288772583, + "objective/train/theoretical_loss": 3.461416773997885, + "objective/train/tokens_used": 1817778656, + "theoretical_loss": 3.461416773997885, + "tokens_seen": 1797318656 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002300601805416249, + "loss": 2.7956, + "theoretical_loss": 3.4614114011134873, + "tokens_seen": 1797351424 + }, + { + "epoch": 6.01, + "learning_rate": 0.00023005015045135407, + "loss": 2.6816, + "theoretical_loss": 3.4614006557208277, + "tokens_seen": 1797416960 + }, + { + "epoch": 6.01, + "learning_rate": 0.00023004012036108326, + "loss": 2.5554, + "theoretical_loss": 3.4613899108296478, + "tokens_seen": 1797482496 + }, + { + "epoch": 6.01, + "learning_rate": 0.00023003009027081246, + "loss": 2.4739, + "theoretical_loss": 3.4613791664399054, + "tokens_seen": 1797548032 + }, + { + "epoch": 6.01, + "learning_rate": 0.00023002006018054164, + "loss": 2.5294, + "theoretical_loss": 3.4613684225515593, + "tokens_seen": 1797613568 + }, + { + "epoch": 6.01, + "learning_rate": 0.00023001003009027083, + "loss": 2.6836, + "theoretical_loss": 3.4613576791645677, + "tokens_seen": 1797679104 + }, + { + "epoch": 6.01, + "learning_rate": 0.00023, + "loss": 2.7219, + "theoretical_loss": 3.461346936278889, + "tokens_seen": 1797744640 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022998996990972922, + "loss": 2.8079, + "theoretical_loss": 3.461336193894481, + "tokens_seen": 1797810176 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002299799398194584, + "loss": 2.6874, + "theoretical_loss": 3.4613254520113026, + "tokens_seen": 1797875712 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022996990972918758, + "loss": 2.6417, + "theoretical_loss": 3.4613147106293125, + "tokens_seen": 1797941248 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022995987963891676, + "loss": 2.5866, + "theoretical_loss": 3.461303969748468, + "tokens_seen": 1798006784 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022994984954864594, + "loss": 2.5248, + "theoretical_loss": 3.461293229368729, + "tokens_seen": 1798072320 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022993981945837515, + "loss": 2.6408, + "theoretical_loss": 3.4612824894900522, + "tokens_seen": 1798137856 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022992978936810433, + "loss": 2.5725, + "theoretical_loss": 3.461271750112397, + "tokens_seen": 1798203392 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002299197592778335, + "loss": 2.6486, + "theoretical_loss": 3.461261011235721, + "tokens_seen": 1798268928 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002299097291875627, + "loss": 2.739, + "theoretical_loss": 3.461250272859983, + "tokens_seen": 1798334464 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002298996990972919, + "loss": 2.5459, + "theoretical_loss": 3.461239534985142, + "tokens_seen": 1798400000 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022988966900702108, + "loss": 2.6188, + "theoretical_loss": 3.461228797611155, + "tokens_seen": 1798465536 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022987963891675026, + "loss": 2.6078, + "theoretical_loss": 3.461218060737982, + "tokens_seen": 1798531072 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022986960882647944, + "loss": 2.749, + "theoretical_loss": 3.4612073243655797, + "tokens_seen": 1798596608 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022985957873620862, + "loss": 2.6324, + "theoretical_loss": 3.461196588493908, + "tokens_seen": 1798662144 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022984954864593783, + "loss": 2.6054, + "theoretical_loss": 3.4611858531229247, + "tokens_seen": 1798727680 + }, + { + "epoch": 6.01, + "learning_rate": 0.000229839518555667, + "loss": 2.526, + "theoretical_loss": 3.461175118252588, + "tokens_seen": 1798793216 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002298294884653962, + "loss": 2.6468, + "theoretical_loss": 3.4611643838828563, + "tokens_seen": 1798858752 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022981945837512538, + "loss": 2.5075, + "theoretical_loss": 3.4611536500136877, + "tokens_seen": 1798924288 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2041349, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5370969772338867, + "objective/train/theoretical_loss": 3.4611482832668026, + "objective/train/tokens_used": 1819417056, + "theoretical_loss": 3.4611482832668026, + "tokens_seen": 1798957056 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022980942828485458, + "loss": 2.524, + "theoretical_loss": 3.4611429166450423, + "tokens_seen": 1798989824 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022979939819458376, + "loss": 2.5088, + "theoretical_loss": 3.4611321837768765, + "tokens_seen": 1799055360 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022978936810431295, + "loss": 2.6567, + "theoretical_loss": 3.4611214514091495, + "tokens_seen": 1799120896 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022977933801404213, + "loss": 2.7544, + "theoretical_loss": 3.4611107195418205, + "tokens_seen": 1799186432 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002297693079237713, + "loss": 2.662, + "theoretical_loss": 3.4610999881748468, + "tokens_seen": 1799251968 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022975927783350052, + "loss": 2.5275, + "theoretical_loss": 3.4610892573081875, + "tokens_seen": 1799317504 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002297492477432297, + "loss": 2.5917, + "theoretical_loss": 3.4610785269418005, + "tokens_seen": 1799383040 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022973921765295888, + "loss": 2.7769, + "theoretical_loss": 3.461067797075645, + "tokens_seen": 1799448576 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022972918756268806, + "loss": 2.7373, + "theoretical_loss": 3.461057067709679, + "tokens_seen": 1799514112 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022971915747241727, + "loss": 2.7428, + "theoretical_loss": 3.461046338843861, + "tokens_seen": 1799579648 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022970912738214645, + "loss": 2.7443, + "theoretical_loss": 3.4610356104781497, + "tokens_seen": 1799645184 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022969909729187563, + "loss": 2.6671, + "theoretical_loss": 3.461024882612503, + "tokens_seen": 1799710720 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002296890672016048, + "loss": 2.569, + "theoretical_loss": 3.46101415524688, + "tokens_seen": 1799776256 + }, + { + "epoch": 6.01, + "learning_rate": 0.000229679037111334, + "loss": 2.4606, + "theoretical_loss": 3.4610034283812388, + "tokens_seen": 1799841792 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002296690070210632, + "loss": 2.6362, + "theoretical_loss": 3.460992702015538, + "tokens_seen": 1799907328 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022965897693079238, + "loss": 2.7109, + "theoretical_loss": 3.460981976149737, + "tokens_seen": 1799972864 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022964894684052156, + "loss": 2.5835, + "theoretical_loss": 3.4609712507837926, + "tokens_seen": 1800038400 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022963891675025074, + "loss": 2.7835, + "theoretical_loss": 3.460960525917664, + "tokens_seen": 1800103936 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022962888665997995, + "loss": 2.6124, + "theoretical_loss": 3.460949801551311, + "tokens_seen": 1800169472 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022961885656970913, + "loss": 2.7657, + "theoretical_loss": 3.4609390776846904, + "tokens_seen": 1800235008 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022960882647943831, + "loss": 2.5151, + "theoretical_loss": 3.460928354317761, + "tokens_seen": 1800300544 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002295987963891675, + "loss": 2.7375, + "theoretical_loss": 3.460917631450482, + "tokens_seen": 1800366080 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022958876629889668, + "loss": 2.6246, + "theoretical_loss": 3.4609069090828113, + "tokens_seen": 1800431616 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022957873620862589, + "loss": 2.7867, + "theoretical_loss": 3.4608961872147077, + "tokens_seen": 1800497152 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022956870611835507, + "loss": 2.5932, + "theoretical_loss": 3.4608854658461303, + "tokens_seen": 1800562688 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2046353, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7808806896209717, + "objective/train/theoretical_loss": 3.4608801053491507, + "objective/train/tokens_used": 1821055456, + "theoretical_loss": 3.4608801053491507, + "tokens_seen": 1800595456 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022955867602808425, + "loss": 2.6338, + "theoretical_loss": 3.460874744977037, + "tokens_seen": 1800628224 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022954864593781343, + "loss": 2.6182, + "theoretical_loss": 3.4608640246073863, + "tokens_seen": 1800693760 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022953861584754264, + "loss": 2.6406, + "theoretical_loss": 3.4608533047371366, + "tokens_seen": 1800759296 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022952858575727182, + "loss": 2.6303, + "theoretical_loss": 3.460842585366247, + "tokens_seen": 1800824832 + }, + { + "epoch": 6.01, + "learning_rate": 0.000229518555667001, + "loss": 2.4906, + "theoretical_loss": 3.460831866494676, + "tokens_seen": 1800890368 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022950852557673018, + "loss": 2.5155, + "theoretical_loss": 3.460821148122382, + "tokens_seen": 1800955904 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022949849548645936, + "loss": 2.5724, + "theoretical_loss": 3.4608104302493237, + "tokens_seen": 1801021440 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022948846539618857, + "loss": 2.453, + "theoretical_loss": 3.460799712875459, + "tokens_seen": 1801086976 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022947843530591775, + "loss": 2.5845, + "theoretical_loss": 3.4607889960007476, + "tokens_seen": 1801152512 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022946840521564693, + "loss": 2.7082, + "theoretical_loss": 3.4607782796251474, + "tokens_seen": 1801218048 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002294583751253761, + "loss": 2.6949, + "theoretical_loss": 3.4607675637486173, + "tokens_seen": 1801283584 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022944834503510532, + "loss": 2.5929, + "theoretical_loss": 3.4607568483711155, + "tokens_seen": 1801349120 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002294383149448345, + "loss": 2.6171, + "theoretical_loss": 3.460746133492601, + "tokens_seen": 1801414656 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022942828485456368, + "loss": 2.6259, + "theoretical_loss": 3.460735419113032, + "tokens_seen": 1801480192 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022941825476429286, + "loss": 2.8155, + "theoretical_loss": 3.460724705232368, + "tokens_seen": 1801545728 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022940822467402207, + "loss": 2.7171, + "theoretical_loss": 3.4607139918505663, + "tokens_seen": 1801611264 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022939819458375125, + "loss": 2.5209, + "theoretical_loss": 3.460703278967587, + "tokens_seen": 1801676800 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022938816449348043, + "loss": 2.6911, + "theoretical_loss": 3.460692566583387, + "tokens_seen": 1801742336 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022937813440320962, + "loss": 2.4671, + "theoretical_loss": 3.4606818546979268, + "tokens_seen": 1801807872 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002293681043129388, + "loss": 2.5872, + "theoretical_loss": 3.460671143311164, + "tokens_seen": 1801873408 + }, + { + "epoch": 6.01, + "learning_rate": 0.000229358074222668, + "loss": 2.5834, + "theoretical_loss": 3.460660432423057, + "tokens_seen": 1801938944 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002293480441323972, + "loss": 2.6126, + "theoretical_loss": 3.460649722033565, + "tokens_seen": 1802004480 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002293380140421264, + "loss": 2.5311, + "theoretical_loss": 3.4606390121426465, + "tokens_seen": 1802070016 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022932798395185558, + "loss": 2.6272, + "theoretical_loss": 3.4606283027502602, + "tokens_seen": 1802135552 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022931795386158478, + "loss": 2.6348, + "theoretical_loss": 3.460617593856365, + "tokens_seen": 1802201088 + }, + { + "debugging/Self-BLEU-5": 0.7055341612995765, + "debugging/distinct-1-grams": 0.7332740226788764, + "debugging/distinct-2-grams": 0.9406247709513366, + "debugging/entropy-1-grams": 6.45133006739548, + "debugging/entropy-2-grams": 7.94566962261625, + "debugging/length": 509.2888888888889, + "debugging/num_segments": 45, + "debugging/score": 4.018485031143259e-05, + "debugging/score_std": 0.00026655614147923646, + "epoch": 6.01, + "objective/train/docs_used": 2051590, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9565231800079346, + "objective/train/theoretical_loss": 3.4606122395963386, + "objective/train/tokens_used": 1822693856, + "theoretical_loss": 3.4606122395963386, + "tokens_seen": 1802233856 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022930792377131397, + "loss": 2.806, + "theoretical_loss": 3.460606885460919, + "tokens_seen": 1802266624 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022929789368104315, + "loss": 2.5209, + "theoretical_loss": 3.460596177563881, + "tokens_seen": 1802332160 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022928786359077233, + "loss": 2.503, + "theoretical_loss": 3.46058547016521, + "tokens_seen": 1802397696 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002292778335005015, + "loss": 2.4323, + "theoretical_loss": 3.460574763264865, + "tokens_seen": 1802463232 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022926780341023072, + "loss": 2.6089, + "theoretical_loss": 3.4605640568628036, + "tokens_seen": 1802528768 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002292577733199599, + "loss": 2.8504, + "theoretical_loss": 3.4605533509589854, + "tokens_seen": 1802594304 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022924774322968908, + "loss": 2.8154, + "theoretical_loss": 3.4605426455533688, + "tokens_seen": 1802659840 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022923771313941826, + "loss": 2.5586, + "theoretical_loss": 3.4605319406459127, + "tokens_seen": 1802725376 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022922768304914747, + "loss": 2.5762, + "theoretical_loss": 3.460521236236575, + "tokens_seen": 1802790912 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022921765295887665, + "loss": 2.5758, + "theoretical_loss": 3.4605105323253156, + "tokens_seen": 1802856448 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022920762286860583, + "loss": 2.4976, + "theoretical_loss": 3.4604998289120927, + "tokens_seen": 1802921984 + }, + { + "epoch": 6.01, + "learning_rate": 0.000229197592778335, + "loss": 2.4324, + "theoretical_loss": 3.460489125996865, + "tokens_seen": 1802987520 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002291875626880642, + "loss": 2.5075, + "theoretical_loss": 3.460478423579591, + "tokens_seen": 1803053056 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002291775325977934, + "loss": 2.7023, + "theoretical_loss": 3.4604677216602298, + "tokens_seen": 1803118592 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022916750250752258, + "loss": 2.4986, + "theoretical_loss": 3.4604570202387395, + "tokens_seen": 1803184128 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022915747241725176, + "loss": 2.6025, + "theoretical_loss": 3.4604463193150803, + "tokens_seen": 1803249664 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022914744232698094, + "loss": 2.669, + "theoretical_loss": 3.4604356188892096, + "tokens_seen": 1803315200 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022913741223671015, + "loss": 2.5785, + "theoretical_loss": 3.460424918961086, + "tokens_seen": 1803380736 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022912738214643933, + "loss": 2.6923, + "theoretical_loss": 3.4604142195306693, + "tokens_seen": 1803446272 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022911735205616851, + "loss": 2.5968, + "theoretical_loss": 3.460403520597918, + "tokens_seen": 1803511808 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002291073219658977, + "loss": 2.5882, + "theoretical_loss": 3.46039282216279, + "tokens_seen": 1803577344 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022909729187562688, + "loss": 2.6989, + "theoretical_loss": 3.4603821242252453, + "tokens_seen": 1803642880 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022908726178535609, + "loss": 2.6891, + "theoretical_loss": 3.460371426785242, + "tokens_seen": 1803708416 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022907723169508527, + "loss": 2.631, + "theoretical_loss": 3.460360729842739, + "tokens_seen": 1803773952 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022906720160481445, + "loss": 2.553, + "theoretical_loss": 3.4603500333976944, + "tokens_seen": 1803839488 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2054374, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.685779571533203, + "objective/train/theoretical_loss": 3.460344685361707, + "objective/train/tokens_used": 1824332256, + "theoretical_loss": 3.460344685361707, + "tokens_seen": 1803872256 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022905717151454363, + "loss": 2.6189, + "theoretical_loss": 3.460339337450068, + "tokens_seen": 1803905024 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022904714142427284, + "loss": 2.6172, + "theoretical_loss": 3.460328641999819, + "tokens_seen": 1803970560 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022903711133400202, + "loss": 2.5873, + "theoretical_loss": 3.4603179470469048, + "tokens_seen": 1804036096 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002290270812437312, + "loss": 2.4076, + "theoretical_loss": 3.460307252591285, + "tokens_seen": 1804101632 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022901705115346038, + "loss": 2.6755, + "theoretical_loss": 3.460296558632918, + "tokens_seen": 1804167168 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022900702106318956, + "loss": 2.7814, + "theoretical_loss": 3.4602858651717634, + "tokens_seen": 1804232704 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022899699097291877, + "loss": 2.7115, + "theoretical_loss": 3.4602751722077794, + "tokens_seen": 1804298240 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022898696088264795, + "loss": 2.3877, + "theoretical_loss": 3.4602644797409248, + "tokens_seen": 1804363776 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022897693079237713, + "loss": 2.8002, + "theoretical_loss": 3.4602537877711583, + "tokens_seen": 1804429312 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002289669007021063, + "loss": 2.5388, + "theoretical_loss": 3.4602430962984396, + "tokens_seen": 1804494848 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022895687061183552, + "loss": 2.6925, + "theoretical_loss": 3.460232405322727, + "tokens_seen": 1804560384 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002289468405215647, + "loss": 2.6482, + "theoretical_loss": 3.4602217148439793, + "tokens_seen": 1804625920 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022893681043129388, + "loss": 2.6756, + "theoretical_loss": 3.460211024862155, + "tokens_seen": 1804691456 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022892678034102306, + "loss": 2.7256, + "theoretical_loss": 3.4602003353772135, + "tokens_seen": 1804756992 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022891675025075227, + "loss": 2.6114, + "theoretical_loss": 3.4601896463891135, + "tokens_seen": 1804822528 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022890672016048145, + "loss": 2.7793, + "theoretical_loss": 3.4601789578978135, + "tokens_seen": 1804888064 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022889669007021063, + "loss": 2.7581, + "theoretical_loss": 3.460168269903273, + "tokens_seen": 1804953600 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022888665997993982, + "loss": 2.4662, + "theoretical_loss": 3.4601575824054507, + "tokens_seen": 1805019136 + }, + { + "epoch": 6.01, + "learning_rate": 0.000228876629889669, + "loss": 2.6677, + "theoretical_loss": 3.4601468954043053, + "tokens_seen": 1805084672 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002288665997993982, + "loss": 2.6699, + "theoretical_loss": 3.4601362088997956, + "tokens_seen": 1805150208 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002288565697091274, + "loss": 2.6152, + "theoretical_loss": 3.460125522891881, + "tokens_seen": 1805215744 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022884653961885657, + "loss": 2.5254, + "theoretical_loss": 3.46011483738052, + "tokens_seen": 1805281280 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022883650952858575, + "loss": 2.622, + "theoretical_loss": 3.4601041523656715, + "tokens_seen": 1805346816 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022882647943831496, + "loss": 2.716, + "theoretical_loss": 3.4600934678472943, + "tokens_seen": 1805412352 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022881644934804414, + "loss": 2.6779, + "theoretical_loss": 3.4600827838253476, + "tokens_seen": 1805477888 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2055158, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5684828758239746, + "objective/train/theoretical_loss": 3.4600774420005234, + "objective/train/tokens_used": 1825970656, + "theoretical_loss": 3.4600774420005234, + "tokens_seen": 1805510656 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022880641925777332, + "loss": 2.5777, + "theoretical_loss": 3.46007210029979, + "tokens_seen": 1805543424 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002287963891675025, + "loss": 2.4658, + "theoretical_loss": 3.4600614172705813, + "tokens_seen": 1805608960 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022878635907723168, + "loss": 2.5695, + "theoretical_loss": 3.4600507347376794, + "tokens_seen": 1805674496 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002287763289869609, + "loss": 2.7004, + "theoretical_loss": 3.4600400527010433, + "tokens_seen": 1805740032 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022876629889669007, + "loss": 2.5248, + "theoretical_loss": 3.4600293711606325, + "tokens_seen": 1805805568 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022875626880641925, + "loss": 2.6079, + "theoretical_loss": 3.4600186901164056, + "tokens_seen": 1805871104 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022874623871614843, + "loss": 2.6037, + "theoretical_loss": 3.4600080095683214, + "tokens_seen": 1805936640 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022873620862587764, + "loss": 2.3881, + "theoretical_loss": 3.459997329516339, + "tokens_seen": 1806002176 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022872617853560682, + "loss": 2.4628, + "theoretical_loss": 3.4599866499604177, + "tokens_seen": 1806067712 + }, + { + "epoch": 6.01, + "learning_rate": 0.000228716148445336, + "loss": 2.6824, + "theoretical_loss": 3.459975970900516, + "tokens_seen": 1806133248 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022870611835506518, + "loss": 2.4166, + "theoretical_loss": 3.459965292336593, + "tokens_seen": 1806198784 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022869608826479437, + "loss": 2.3918, + "theoretical_loss": 3.459954614268608, + "tokens_seen": 1806264320 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022868605817452357, + "loss": 2.6873, + "theoretical_loss": 3.4599439366965195, + "tokens_seen": 1806329856 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022867602808425276, + "loss": 2.6966, + "theoretical_loss": 3.4599332596202865, + "tokens_seen": 1806395392 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022866599799398194, + "loss": 2.6945, + "theoretical_loss": 3.459922583039868, + "tokens_seen": 1806460928 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022865596790371112, + "loss": 2.7233, + "theoretical_loss": 3.4599119069552238, + "tokens_seen": 1806526464 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022864593781344033, + "loss": 2.5733, + "theoretical_loss": 3.4599012313663113, + "tokens_seen": 1806592000 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002286359077231695, + "loss": 2.6825, + "theoretical_loss": 3.4598905562730913, + "tokens_seen": 1806657536 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002286258776328987, + "loss": 2.5703, + "theoretical_loss": 3.459879881675521, + "tokens_seen": 1806723072 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022861584754262787, + "loss": 2.6087, + "theoretical_loss": 3.4598692075735613, + "tokens_seen": 1806788608 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022860581745235705, + "loss": 2.5752, + "theoretical_loss": 3.4598585339671697, + "tokens_seen": 1806854144 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022859578736208629, + "loss": 2.5852, + "theoretical_loss": 3.459847860856306, + "tokens_seen": 1806919680 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022858575727181547, + "loss": 2.4889, + "theoretical_loss": 3.4598371882409285, + "tokens_seen": 1806985216 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022857572718154465, + "loss": 2.5777, + "theoretical_loss": 3.459826516120997, + "tokens_seen": 1807050752 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022856569709127383, + "loss": 2.5757, + "theoretical_loss": 3.459815844496471, + "tokens_seen": 1807116288 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2056282, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1619203090667725, + "objective/train/theoretical_loss": 3.459810508869971, + "objective/train/tokens_used": 1827609056, + "theoretical_loss": 3.459810508869971, + "tokens_seen": 1807149056 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022855566700100304, + "loss": 2.4102, + "theoretical_loss": 3.4598051733673074, + "tokens_seen": 1807181824 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022854563691073222, + "loss": 2.6096, + "theoretical_loss": 3.459794502733468, + "tokens_seen": 1807247360 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002285356068204614, + "loss": 2.5663, + "theoretical_loss": 3.4597838325949093, + "tokens_seen": 1807312896 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022852557673019058, + "loss": 2.5474, + "theoretical_loss": 3.459773162951592, + "tokens_seen": 1807378432 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022851554663991976, + "loss": 2.5512, + "theoretical_loss": 3.459762493803475, + "tokens_seen": 1807443968 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022850551654964897, + "loss": 2.7883, + "theoretical_loss": 3.4597518251505166, + "tokens_seen": 1807509504 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022849548645937815, + "loss": 2.7157, + "theoretical_loss": 3.459741156992677, + "tokens_seen": 1807575040 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022848545636910733, + "loss": 2.7186, + "theoretical_loss": 3.459730489329914, + "tokens_seen": 1807640576 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002284754262788365, + "loss": 2.407, + "theoretical_loss": 3.459719822162187, + "tokens_seen": 1807706112 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022846539618856572, + "loss": 2.7317, + "theoretical_loss": 3.459709155489456, + "tokens_seen": 1807771648 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002284553660982949, + "loss": 2.5222, + "theoretical_loss": 3.4596984893116787, + "tokens_seen": 1807837184 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022844533600802408, + "loss": 2.4068, + "theoretical_loss": 3.4596878236288156, + "tokens_seen": 1807902720 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022843530591775326, + "loss": 2.6281, + "theoretical_loss": 3.459677158440825, + "tokens_seen": 1807968256 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022842527582748247, + "loss": 2.6307, + "theoretical_loss": 3.459666493747666, + "tokens_seen": 1808033792 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022841524573721165, + "loss": 2.695, + "theoretical_loss": 3.4596558295492983, + "tokens_seen": 1808099328 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022840521564694084, + "loss": 2.5316, + "theoretical_loss": 3.45964516584568, + "tokens_seen": 1808164864 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022839518555667002, + "loss": 2.416, + "theoretical_loss": 3.4596345026367707, + "tokens_seen": 1808230400 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002283851554663992, + "loss": 2.5682, + "theoretical_loss": 3.45962383992253, + "tokens_seen": 1808295936 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002283751253761284, + "loss": 2.7359, + "theoretical_loss": 3.4596131777029164, + "tokens_seen": 1808361472 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002283650952858576, + "loss": 2.6419, + "theoretical_loss": 3.459602515977889, + "tokens_seen": 1808427008 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022835506519558677, + "loss": 2.5341, + "theoretical_loss": 3.4595918547474076, + "tokens_seen": 1808492544 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022834503510531595, + "loss": 2.3745, + "theoretical_loss": 3.459581194011431, + "tokens_seen": 1808558080 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022833500501504516, + "loss": 2.4398, + "theoretical_loss": 3.4595705337699174, + "tokens_seen": 1808623616 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022832497492477434, + "loss": 2.4594, + "theoretical_loss": 3.459559874022828, + "tokens_seen": 1808689152 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022831494483450352, + "loss": 2.5316, + "theoretical_loss": 3.45954921477012, + "tokens_seen": 1808754688 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2057444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6249680519104004, + "objective/train/theoretical_loss": 3.4595438853291465, + "objective/train/tokens_used": 1829247456, + "theoretical_loss": 3.4595438853291465, + "tokens_seen": 1808787456 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002283049147442327, + "loss": 2.6404, + "theoretical_loss": 3.4595385560117533, + "tokens_seen": 1808820224 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022829488465396188, + "loss": 2.6684, + "theoretical_loss": 3.4595278977476873, + "tokens_seen": 1808885760 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002282848545636911, + "loss": 2.6545, + "theoretical_loss": 3.459517239977881, + "tokens_seen": 1808951296 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022827482447342027, + "loss": 2.6381, + "theoretical_loss": 3.4595065827022937, + "tokens_seen": 1809016832 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022826479438314945, + "loss": 2.3953, + "theoretical_loss": 3.459495925920884, + "tokens_seen": 1809082368 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022825476429287863, + "loss": 2.7543, + "theoretical_loss": 3.459485269633612, + "tokens_seen": 1809147904 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022824473420260784, + "loss": 2.4314, + "theoretical_loss": 3.4594746138404355, + "tokens_seen": 1809213440 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022823470411233702, + "loss": 2.6262, + "theoretical_loss": 3.459463958541315, + "tokens_seen": 1809278976 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002282246740220662, + "loss": 2.6661, + "theoretical_loss": 3.45945330373621, + "tokens_seen": 1809344512 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022821464393179538, + "loss": 2.4809, + "theoretical_loss": 3.4594426494250783, + "tokens_seen": 1809410048 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022820461384152457, + "loss": 2.5237, + "theoretical_loss": 3.45943199560788, + "tokens_seen": 1809475584 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022819458375125377, + "loss": 2.6473, + "theoretical_loss": 3.459421342284574, + "tokens_seen": 1809541120 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022818455366098296, + "loss": 2.5015, + "theoretical_loss": 3.4594106894551198, + "tokens_seen": 1809606656 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022817452357071214, + "loss": 2.7455, + "theoretical_loss": 3.459400037119476, + "tokens_seen": 1809672192 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022816449348044132, + "loss": 2.5253, + "theoretical_loss": 3.4593893852776025, + "tokens_seen": 1809737728 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022815446339017053, + "loss": 2.7315, + "theoretical_loss": 3.4593787339294586, + "tokens_seen": 1809803264 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002281444332998997, + "loss": 2.658, + "theoretical_loss": 3.459368083075003, + "tokens_seen": 1809868800 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002281344032096289, + "loss": 2.5462, + "theoretical_loss": 3.459357432714195, + "tokens_seen": 1809934336 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022812437311935807, + "loss": 2.662, + "theoretical_loss": 3.4593467828469944, + "tokens_seen": 1809999872 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022811434302908725, + "loss": 2.5969, + "theoretical_loss": 3.45933613347336, + "tokens_seen": 1810065408 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022810431293881646, + "loss": 2.5189, + "theoretical_loss": 3.4593254845932506, + "tokens_seen": 1810130944 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022809428284854564, + "loss": 2.4542, + "theoretical_loss": 3.4593148362066266, + "tokens_seen": 1810196480 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022808425275827482, + "loss": 2.7497, + "theoretical_loss": 3.459304188313447, + "tokens_seen": 1810262016 + }, + { + "epoch": 6.01, + "learning_rate": 0.000228074222668004, + "loss": 2.4607, + "theoretical_loss": 3.45929354091367, + "tokens_seen": 1810327552 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002280641925777332, + "loss": 2.5121, + "theoretical_loss": 3.4592828940072557, + "tokens_seen": 1810393088 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2058058, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4326138496398926, + "objective/train/theoretical_loss": 3.4592775707390473, + "objective/train/tokens_used": 1830885856, + "theoretical_loss": 3.4592775707390473, + "tokens_seen": 1810425856 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002280541624874624, + "loss": 2.5378, + "theoretical_loss": 3.4592722475941633, + "tokens_seen": 1810458624 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022804413239719157, + "loss": 2.4306, + "theoretical_loss": 3.4592616016743527, + "tokens_seen": 1810524160 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022803410230692075, + "loss": 2.6668, + "theoretical_loss": 3.4592509562477822, + "tokens_seen": 1810589696 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022802407221664993, + "loss": 2.5613, + "theoretical_loss": 3.459240311314411, + "tokens_seen": 1810655232 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022801404212637914, + "loss": 2.4978, + "theoretical_loss": 3.4592296668741995, + "tokens_seen": 1810720768 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022800401203610832, + "loss": 2.5626, + "theoretical_loss": 3.459219022927106, + "tokens_seen": 1810786304 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002279939819458375, + "loss": 2.6585, + "theoretical_loss": 3.4592083794730906, + "tokens_seen": 1810851840 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022798395185556669, + "loss": 2.6267, + "theoretical_loss": 3.459197736512112, + "tokens_seen": 1810917376 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002279739217652959, + "loss": 2.6907, + "theoretical_loss": 3.4591870940441294, + "tokens_seen": 1810982912 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022796389167502508, + "loss": 2.6812, + "theoretical_loss": 3.4591764520691024, + "tokens_seen": 1811048448 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022795386158475426, + "loss": 2.4672, + "theoretical_loss": 3.459165810586991, + "tokens_seen": 1811113984 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022794383149448344, + "loss": 2.5462, + "theoretical_loss": 3.4591551695977536, + "tokens_seen": 1811179520 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022793380140421262, + "loss": 2.6319, + "theoretical_loss": 3.4591445291013496, + "tokens_seen": 1811245056 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022792377131394183, + "loss": 2.6462, + "theoretical_loss": 3.459133889097739, + "tokens_seen": 1811310592 + }, + { + "epoch": 6.01, + "learning_rate": 0.000227913741223671, + "loss": 2.5725, + "theoretical_loss": 3.4591232495868804, + "tokens_seen": 1811376128 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002279037111334002, + "loss": 2.5717, + "theoretical_loss": 3.4591126105687335, + "tokens_seen": 1811441664 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022789368104312937, + "loss": 2.6646, + "theoretical_loss": 3.459101972043258, + "tokens_seen": 1811507200 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022788365095285858, + "loss": 2.3977, + "theoretical_loss": 3.4590913340104126, + "tokens_seen": 1811572736 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022787362086258776, + "loss": 2.5634, + "theoretical_loss": 3.4590806964701573, + "tokens_seen": 1811638272 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022786359077231694, + "loss": 2.8146, + "theoretical_loss": 3.4590700594224506, + "tokens_seen": 1811703808 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022785356068204612, + "loss": 2.6991, + "theoretical_loss": 3.459059422867253, + "tokens_seen": 1811769344 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022784353059177536, + "loss": 2.4311, + "theoretical_loss": 3.459048786804523, + "tokens_seen": 1811834880 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022783350050150454, + "loss": 2.5624, + "theoretical_loss": 3.4590381512342203, + "tokens_seen": 1811900416 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022782347041123372, + "loss": 2.6245, + "theoretical_loss": 3.459027516156304, + "tokens_seen": 1811965952 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002278134403209629, + "loss": 2.4825, + "theoretical_loss": 3.4590168815707343, + "tokens_seen": 1812031488 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2058860, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.847701072692871, + "objective/train/theoretical_loss": 3.4590115644625667, + "objective/train/tokens_used": 1832524256, + "theoretical_loss": 3.4590115644625667, + "tokens_seen": 1812064256 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022780341023069208, + "loss": 2.5595, + "theoretical_loss": 3.45900624747747, + "tokens_seen": 1812097024 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002277933801404213, + "loss": 2.5605, + "theoretical_loss": 3.4589956138764704, + "tokens_seen": 1812162560 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022778335005015047, + "loss": 2.4626, + "theoretical_loss": 3.4589849807676956, + "tokens_seen": 1812228096 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022777331995987965, + "loss": 2.5258, + "theoretical_loss": 3.458974348151104, + "tokens_seen": 1812293632 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022776328986960883, + "loss": 2.5952, + "theoretical_loss": 3.4589637160266555, + "tokens_seen": 1812359168 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022775325977933804, + "loss": 2.6291, + "theoretical_loss": 3.45895308439431, + "tokens_seen": 1812424704 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022774322968906722, + "loss": 2.5364, + "theoretical_loss": 3.458942453254026, + "tokens_seen": 1812490240 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002277331995987964, + "loss": 2.2968, + "theoretical_loss": 3.4589318226057637, + "tokens_seen": 1812555776 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022772316950852558, + "loss": 2.6488, + "theoretical_loss": 3.4589211924494823, + "tokens_seen": 1812621312 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022771313941825477, + "loss": 2.6796, + "theoretical_loss": 3.4589105627851415, + "tokens_seen": 1812686848 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022770310932798397, + "loss": 2.5319, + "theoretical_loss": 3.4588999336126998, + "tokens_seen": 1812752384 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022769307923771316, + "loss": 2.4265, + "theoretical_loss": 3.458889304932118, + "tokens_seen": 1812817920 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022768304914744234, + "loss": 2.4862, + "theoretical_loss": 3.458878676743354, + "tokens_seen": 1812883456 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022767301905717152, + "loss": 2.5823, + "theoretical_loss": 3.4588680490463686, + "tokens_seen": 1812948992 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022766298896690073, + "loss": 2.5473, + "theoretical_loss": 3.458857421841121, + "tokens_seen": 1813014528 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002276529588766299, + "loss": 2.622, + "theoretical_loss": 3.4588467951275708, + "tokens_seen": 1813080064 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002276429287863591, + "loss": 2.6401, + "theoretical_loss": 3.458836168905677, + "tokens_seen": 1813145600 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022763289869608827, + "loss": 2.4144, + "theoretical_loss": 3.458825543175399, + "tokens_seen": 1813211136 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022762286860581745, + "loss": 2.5735, + "theoretical_loss": 3.4588149179366963, + "tokens_seen": 1813276672 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022761283851554666, + "loss": 2.4007, + "theoretical_loss": 3.458804293189529, + "tokens_seen": 1813342208 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022760280842527584, + "loss": 2.7273, + "theoretical_loss": 3.458793668933856, + "tokens_seen": 1813407744 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022759277833500502, + "loss": 2.6435, + "theoretical_loss": 3.4587830451696373, + "tokens_seen": 1813473280 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002275827482447342, + "loss": 2.727, + "theoretical_loss": 3.458772421896832, + "tokens_seen": 1813538816 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002275727181544634, + "loss": 2.5385, + "theoretical_loss": 3.4587617991153996, + "tokens_seen": 1813604352 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002275626880641926, + "loss": 2.6351, + "theoretical_loss": 3.4587511768252996, + "tokens_seen": 1813669888 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2060457, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6828434467315674, + "objective/train/theoretical_loss": 3.458745865864487, + "objective/train/tokens_used": 1834162656, + "theoretical_loss": 3.458745865864487, + "tokens_seen": 1813702656 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022755265797392177, + "loss": 2.7296, + "theoretical_loss": 3.458740555026492, + "tokens_seen": 1813735424 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022754262788365095, + "loss": 2.6811, + "theoretical_loss": 3.4587299337189363, + "tokens_seen": 1813800960 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022753259779338013, + "loss": 2.5938, + "theoretical_loss": 3.4587193129025913, + "tokens_seen": 1813866496 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022752256770310934, + "loss": 2.6154, + "theoretical_loss": 3.4587086925774173, + "tokens_seen": 1813932032 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022751253761283852, + "loss": 2.5901, + "theoretical_loss": 3.458698072743373, + "tokens_seen": 1813997568 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002275025075225677, + "loss": 2.3883, + "theoretical_loss": 3.4586874534004193, + "tokens_seen": 1814063104 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022749247743229689, + "loss": 2.6821, + "theoretical_loss": 3.4586768345485144, + "tokens_seen": 1814128640 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002274824473420261, + "loss": 2.5923, + "theoretical_loss": 3.458666216187619, + "tokens_seen": 1814194176 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022747241725175528, + "loss": 2.7704, + "theoretical_loss": 3.4586555983176908, + "tokens_seen": 1814259712 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022746238716148446, + "loss": 2.5076, + "theoretical_loss": 3.4586449809386917, + "tokens_seen": 1814325248 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022745235707121364, + "loss": 2.5431, + "theoretical_loss": 3.4586343640505794, + "tokens_seen": 1814390784 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022744232698094282, + "loss": 2.5703, + "theoretical_loss": 3.4586237476533146, + "tokens_seen": 1814456320 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022743229689067203, + "loss": 2.5492, + "theoretical_loss": 3.4586131317468567, + "tokens_seen": 1814521856 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002274222668004012, + "loss": 2.4609, + "theoretical_loss": 3.458602516331165, + "tokens_seen": 1814587392 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002274122367101304, + "loss": 2.5913, + "theoretical_loss": 3.458591901406199, + "tokens_seen": 1814652928 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022740220661985957, + "loss": 2.6185, + "theoretical_loss": 3.458581286971919, + "tokens_seen": 1814718464 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022739217652958878, + "loss": 2.5146, + "theoretical_loss": 3.4585706730282837, + "tokens_seen": 1814784000 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022738214643931796, + "loss": 2.5973, + "theoretical_loss": 3.4585600595752526, + "tokens_seen": 1814849536 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022737211634904714, + "loss": 2.4914, + "theoretical_loss": 3.4585494466127864, + "tokens_seen": 1814915072 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022736208625877632, + "loss": 2.5125, + "theoretical_loss": 3.4585388341408443, + "tokens_seen": 1814980608 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002273520561685055, + "loss": 2.5612, + "theoretical_loss": 3.4585282221593854, + "tokens_seen": 1815046144 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002273420260782347, + "loss": 2.6847, + "theoretical_loss": 3.4585176106683697, + "tokens_seen": 1815111680 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002273319959879639, + "loss": 2.4669, + "theoretical_loss": 3.458506999667757, + "tokens_seen": 1815177216 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022732196589769307, + "loss": 2.4931, + "theoretical_loss": 3.4584963891575065, + "tokens_seen": 1815242752 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022731193580742225, + "loss": 2.6044, + "theoretical_loss": 3.458485779137578, + "tokens_seen": 1815308288 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2062018, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.805525541305542, + "objective/train/theoretical_loss": 3.4584804743114717, + "objective/train/tokens_used": 1835801056, + "theoretical_loss": 3.4584804743114717, + "tokens_seen": 1815341056 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022730190571715146, + "loss": 2.6446, + "theoretical_loss": 3.458475169607931, + "tokens_seen": 1815373824 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022729187562688064, + "loss": 2.5174, + "theoretical_loss": 3.458464560568526, + "tokens_seen": 1815439360 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022728184553660983, + "loss": 2.6381, + "theoretical_loss": 3.4584539520193216, + "tokens_seen": 1815504896 + }, + { + "epoch": 6.01, + "learning_rate": 0.000227271815446339, + "loss": 2.6771, + "theoretical_loss": 3.4584433439602775, + "tokens_seen": 1815570432 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022726178535606821, + "loss": 2.5397, + "theoretical_loss": 3.4584327363913543, + "tokens_seen": 1815635968 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002272517552657974, + "loss": 2.6494, + "theoretical_loss": 3.4584221293125106, + "tokens_seen": 1815701504 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022724172517552658, + "loss": 2.7639, + "theoretical_loss": 3.458411522723707, + "tokens_seen": 1815767040 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022723169508525576, + "loss": 2.5262, + "theoretical_loss": 3.4584009166249023, + "tokens_seen": 1815832576 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022722166499498494, + "loss": 2.5479, + "theoretical_loss": 3.4583903110160565, + "tokens_seen": 1815898112 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022721163490471415, + "loss": 2.4713, + "theoretical_loss": 3.45837970589713, + "tokens_seen": 1815963648 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022720160481444333, + "loss": 2.478, + "theoretical_loss": 3.4583691012680813, + "tokens_seen": 1816029184 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002271915747241725, + "loss": 2.5155, + "theoretical_loss": 3.458358497128871, + "tokens_seen": 1816094720 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002271815446339017, + "loss": 2.471, + "theoretical_loss": 3.4583478934794583, + "tokens_seen": 1816160256 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002271715145436309, + "loss": 2.4519, + "theoretical_loss": 3.458337290319803, + "tokens_seen": 1816225792 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022716148445336008, + "loss": 2.4745, + "theoretical_loss": 3.4583266876498646, + "tokens_seen": 1816291328 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022715145436308926, + "loss": 2.5354, + "theoretical_loss": 3.458316085469604, + "tokens_seen": 1816356864 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022714142427281844, + "loss": 2.6141, + "theoretical_loss": 3.458305483778979, + "tokens_seen": 1816422400 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022713139418254762, + "loss": 2.4008, + "theoretical_loss": 3.4582948825779507, + "tokens_seen": 1816487936 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022712136409227683, + "loss": 2.5926, + "theoretical_loss": 3.4582842818664785, + "tokens_seen": 1816553472 + }, + { + "epoch": 6.01, + "learning_rate": 0.000227111334002006, + "loss": 2.6843, + "theoretical_loss": 3.4582736816445223, + "tokens_seen": 1816619008 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022710130391173522, + "loss": 2.5535, + "theoretical_loss": 3.4582630819120412, + "tokens_seen": 1816684544 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002270912738214644, + "loss": 2.5377, + "theoretical_loss": 3.4582524826689958, + "tokens_seen": 1816750080 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002270812437311936, + "loss": 2.5191, + "theoretical_loss": 3.4582418839153455, + "tokens_seen": 1816815616 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002270712136409228, + "loss": 2.5947, + "theoretical_loss": 3.458231285651049, + "tokens_seen": 1816881152 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022706118355065197, + "loss": 2.5739, + "theoretical_loss": 3.458220687876068, + "tokens_seen": 1816946688 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2062522, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2879951000213623, + "objective/train/theoretical_loss": 3.4582153891720577, + "objective/train/tokens_used": 1837439456, + "theoretical_loss": 3.4582153891720577, + "tokens_seen": 1816979456 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022705115346038115, + "loss": 2.5292, + "theoretical_loss": 3.458210090590361, + "tokens_seen": 1817012224 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022704112337011033, + "loss": 2.7478, + "theoretical_loss": 3.458199493793888, + "tokens_seen": 1817077760 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022703109327983954, + "loss": 2.6653, + "theoretical_loss": 3.458188897486609, + "tokens_seen": 1817143296 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022702106318956872, + "loss": 2.6394, + "theoretical_loss": 3.4581783016684833, + "tokens_seen": 1817208832 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002270110330992979, + "loss": 2.8401, + "theoretical_loss": 3.458167706339471, + "tokens_seen": 1817274368 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022700100300902709, + "loss": 2.3265, + "theoretical_loss": 3.4581571114995318, + "tokens_seen": 1817339904 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002269909729187563, + "loss": 2.5318, + "theoretical_loss": 3.4581465171486254, + "tokens_seen": 1817405440 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022698094282848548, + "loss": 2.6416, + "theoretical_loss": 3.4581359232867124, + "tokens_seen": 1817470976 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022697091273821466, + "loss": 2.697, + "theoretical_loss": 3.4581253299137513, + "tokens_seen": 1817536512 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022696088264794384, + "loss": 2.5806, + "theoretical_loss": 3.458114737029703, + "tokens_seen": 1817602048 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022695085255767302, + "loss": 2.3123, + "theoretical_loss": 3.4581041446345266, + "tokens_seen": 1817667584 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022694082246740223, + "loss": 2.5057, + "theoretical_loss": 3.458093552728182, + "tokens_seen": 1817733120 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002269307923771314, + "loss": 2.544, + "theoretical_loss": 3.4580829613106294, + "tokens_seen": 1817798656 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002269207622868606, + "loss": 2.5455, + "theoretical_loss": 3.4580723703818284, + "tokens_seen": 1817864192 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022691073219658977, + "loss": 2.658, + "theoretical_loss": 3.4580617799417386, + "tokens_seen": 1817929728 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022690070210631898, + "loss": 2.4833, + "theoretical_loss": 3.4580511899903206, + "tokens_seen": 1817995264 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022689067201604816, + "loss": 2.6292, + "theoretical_loss": 3.458040600527533, + "tokens_seen": 1818060800 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022688064192577734, + "loss": 2.5741, + "theoretical_loss": 3.458030011553337, + "tokens_seen": 1818126336 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022687061183550652, + "loss": 2.3273, + "theoretical_loss": 3.4580194230676913, + "tokens_seen": 1818191872 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002268605817452357, + "loss": 2.715, + "theoretical_loss": 3.4580088350705562, + "tokens_seen": 1818257408 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002268505516549649, + "loss": 2.4426, + "theoretical_loss": 3.457998247561892, + "tokens_seen": 1818322944 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002268405215646941, + "loss": 2.5989, + "theoretical_loss": 3.457987660541658, + "tokens_seen": 1818388480 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022683049147442327, + "loss": 2.4793, + "theoretical_loss": 3.457977074009814, + "tokens_seen": 1818454016 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022682046138415245, + "loss": 2.6795, + "theoretical_loss": 3.4579664879663206, + "tokens_seen": 1818519552 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022681043129388166, + "loss": 2.5732, + "theoretical_loss": 3.4579559024111366, + "tokens_seen": 1818585088 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2063975, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.478909969329834, + "objective/train/theoretical_loss": 3.457950609816648, + "objective/train/tokens_used": 1839077856, + "theoretical_loss": 3.457950609816648, + "tokens_seen": 1818617856 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022680040120361084, + "loss": 2.6524, + "theoretical_loss": 3.4579453173442225, + "tokens_seen": 1818650624 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022679037111334003, + "loss": 2.594, + "theoretical_loss": 3.4579347327655383, + "tokens_seen": 1818716160 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002267803410230692, + "loss": 2.6094, + "theoretical_loss": 3.4579241486750436, + "tokens_seen": 1818781696 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022677031093279841, + "loss": 2.7041, + "theoretical_loss": 3.4579135650726984, + "tokens_seen": 1818847232 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002267602808425276, + "loss": 2.798, + "theoretical_loss": 3.457902981958463, + "tokens_seen": 1818912768 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022675025075225678, + "loss": 2.6307, + "theoretical_loss": 3.4578923993322963, + "tokens_seen": 1818978304 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022674022066198596, + "loss": 2.5779, + "theoretical_loss": 3.457881817194159, + "tokens_seen": 1819043840 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022673019057171514, + "loss": 2.7733, + "theoretical_loss": 3.457871235544011, + "tokens_seen": 1819109376 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022672016048144435, + "loss": 2.7108, + "theoretical_loss": 3.457860654381812, + "tokens_seen": 1819174912 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022671013039117353, + "loss": 2.5775, + "theoretical_loss": 3.457850073707522, + "tokens_seen": 1819240448 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002267001003009027, + "loss": 2.6973, + "theoretical_loss": 3.4578394935211008, + "tokens_seen": 1819305984 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002266900702106319, + "loss": 2.6247, + "theoretical_loss": 3.4578289138225085, + "tokens_seen": 1819371520 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002266800401203611, + "loss": 2.6276, + "theoretical_loss": 3.457818334611705, + "tokens_seen": 1819437056 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022667001003009028, + "loss": 2.6765, + "theoretical_loss": 3.45780775588865, + "tokens_seen": 1819502592 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022665997993981946, + "loss": 2.7704, + "theoretical_loss": 3.457797177653304, + "tokens_seen": 1819568128 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022664994984954864, + "loss": 2.6385, + "theoretical_loss": 3.457786599905626, + "tokens_seen": 1819633664 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022663991975927782, + "loss": 2.7489, + "theoretical_loss": 3.457776022645577, + "tokens_seen": 1819699200 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022662988966900703, + "loss": 2.3886, + "theoretical_loss": 3.4577654458731164, + "tokens_seen": 1819764736 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002266198595787362, + "loss": 2.4211, + "theoretical_loss": 3.4577548695882045, + "tokens_seen": 1819830272 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002266098294884654, + "loss": 2.7398, + "theoretical_loss": 3.457744293790801, + "tokens_seen": 1819895808 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022659979939819458, + "loss": 2.4493, + "theoretical_loss": 3.457733718480866, + "tokens_seen": 1819961344 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022658976930792378, + "loss": 2.503, + "theoretical_loss": 3.4577231436583586, + "tokens_seen": 1820026880 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022657973921765296, + "loss": 2.6394, + "theoretical_loss": 3.4577125693232404, + "tokens_seen": 1820092416 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022656970912738215, + "loss": 2.6587, + "theoretical_loss": 3.45770199547547, + "tokens_seen": 1820157952 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022655967903711133, + "loss": 2.6076, + "theoretical_loss": 3.457691422115009, + "tokens_seen": 1820223488 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2064721, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6217403411865234, + "objective/train/theoretical_loss": 3.4576861356175064, + "objective/train/tokens_used": 1840716256, + "theoretical_loss": 3.4576861356175064, + "tokens_seen": 1820256256 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002265496489468405, + "loss": 2.8664, + "theoretical_loss": 3.4576808492418154, + "tokens_seen": 1820289024 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022653961885656972, + "loss": 2.4369, + "theoretical_loss": 3.4576702768558505, + "tokens_seen": 1820354560 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002265295887662989, + "loss": 2.6452, + "theoretical_loss": 3.457659704957074, + "tokens_seen": 1820420096 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022651955867602808, + "loss": 2.5459, + "theoretical_loss": 3.457649133545446, + "tokens_seen": 1820485632 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022650952858575726, + "loss": 2.7292, + "theoretical_loss": 3.457638562620926, + "tokens_seen": 1820551168 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022649949849548647, + "loss": 2.4302, + "theoretical_loss": 3.4576279921834745, + "tokens_seen": 1820616704 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022648946840521565, + "loss": 2.4857, + "theoretical_loss": 3.457617422233052, + "tokens_seen": 1820682240 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022647943831494483, + "loss": 2.6944, + "theoretical_loss": 3.4576068527696178, + "tokens_seen": 1820747776 + }, + { + "epoch": 6.01, + "learning_rate": 0.000226469408224674, + "loss": 2.478, + "theoretical_loss": 3.4575962837931318, + "tokens_seen": 1820813312 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002264593781344032, + "loss": 2.5612, + "theoretical_loss": 3.457585715303554, + "tokens_seen": 1820878848 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002264493480441324, + "loss": 2.465, + "theoretical_loss": 3.4575751473008456, + "tokens_seen": 1820944384 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022643931795386158, + "loss": 2.3597, + "theoretical_loss": 3.4575645797849655, + "tokens_seen": 1821009920 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022642928786359076, + "loss": 2.6567, + "theoretical_loss": 3.4575540127558746, + "tokens_seen": 1821075456 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022641925777331994, + "loss": 2.351, + "theoretical_loss": 3.4575434462135317, + "tokens_seen": 1821140992 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022640922768304915, + "loss": 2.5624, + "theoretical_loss": 3.457532880157898, + "tokens_seen": 1821206528 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022639919759277833, + "loss": 2.5821, + "theoretical_loss": 3.457522314588933, + "tokens_seen": 1821272064 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022638916750250751, + "loss": 2.6327, + "theoretical_loss": 3.4575117495065975, + "tokens_seen": 1821337600 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002263791374122367, + "loss": 2.7393, + "theoretical_loss": 3.4575011849108503, + "tokens_seen": 1821403136 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022636910732196588, + "loss": 2.5607, + "theoretical_loss": 3.4574906208016527, + "tokens_seen": 1821468672 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022635907723169508, + "loss": 2.5531, + "theoretical_loss": 3.457480057178964, + "tokens_seen": 1821534208 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002263490471414243, + "loss": 2.5457, + "theoretical_loss": 3.457469494042745, + "tokens_seen": 1821599744 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022633901705115347, + "loss": 2.6849, + "theoretical_loss": 3.457458931392955, + "tokens_seen": 1821665280 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022632898696088266, + "loss": 2.6284, + "theoretical_loss": 3.457448369229555, + "tokens_seen": 1821730816 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022631895687061186, + "loss": 2.5702, + "theoretical_loss": 3.4574378075525045, + "tokens_seen": 1821796352 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022630892678034104, + "loss": 2.5455, + "theoretical_loss": 3.457427246361763, + "tokens_seen": 1821861888 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2065460, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7452199459075928, + "objective/train/theoretical_loss": 3.457421965948747, + "objective/train/tokens_used": 1842354656, + "theoretical_loss": 3.457421965948747, + "tokens_seen": 1821894656 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022629889669007023, + "loss": 2.5699, + "theoretical_loss": 3.457416685657292, + "tokens_seen": 1821927424 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002262888665997994, + "loss": 2.5824, + "theoretical_loss": 3.457406125439051, + "tokens_seen": 1821992960 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022627883650952861, + "loss": 2.5649, + "theoretical_loss": 3.4573955657070004, + "tokens_seen": 1822058496 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002262688064192578, + "loss": 2.472, + "theoretical_loss": 3.4573850064610996, + "tokens_seen": 1822124032 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022625877632898698, + "loss": 2.5899, + "theoretical_loss": 3.4573744477013086, + "tokens_seen": 1822189568 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022624874623871616, + "loss": 2.6018, + "theoretical_loss": 3.457363889427589, + "tokens_seen": 1822255104 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022623871614844534, + "loss": 2.7416, + "theoretical_loss": 3.4573533316399, + "tokens_seen": 1822320640 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022622868605817455, + "loss": 2.4788, + "theoretical_loss": 3.4573427743382013, + "tokens_seen": 1822386176 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022621865596790373, + "loss": 2.5186, + "theoretical_loss": 3.4573322175224535, + "tokens_seen": 1822451712 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002262086258776329, + "loss": 2.6155, + "theoretical_loss": 3.457321661192617, + "tokens_seen": 1822517248 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002261985957873621, + "loss": 2.5353, + "theoretical_loss": 3.457311105348652, + "tokens_seen": 1822582784 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002261885656970913, + "loss": 2.6449, + "theoretical_loss": 3.457300549990518, + "tokens_seen": 1822648320 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022617853560682048, + "loss": 2.7461, + "theoretical_loss": 3.4572899951181757, + "tokens_seen": 1822713856 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022616850551654966, + "loss": 2.5788, + "theoretical_loss": 3.457279440731585, + "tokens_seen": 1822779392 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022615847542627884, + "loss": 2.5443, + "theoretical_loss": 3.457268886830707, + "tokens_seen": 1822844928 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022614844533600802, + "loss": 2.493, + "theoretical_loss": 3.457258333415501, + "tokens_seen": 1822910464 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022613841524573723, + "loss": 2.5051, + "theoretical_loss": 3.4572477804859267, + "tokens_seen": 1822976000 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002261283851554664, + "loss": 2.8141, + "theoretical_loss": 3.4572372280419454, + "tokens_seen": 1823041536 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002261183550651956, + "loss": 2.8462, + "theoretical_loss": 3.4572266760835166, + "tokens_seen": 1823107072 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022610832497492478, + "loss": 2.3164, + "theoretical_loss": 3.457216124610601, + "tokens_seen": 1823172608 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022609829488465398, + "loss": 2.4369, + "theoretical_loss": 3.457205573623158, + "tokens_seen": 1823238144 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022608826479438316, + "loss": 2.5843, + "theoretical_loss": 3.457195023121149, + "tokens_seen": 1823303680 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022607823470411235, + "loss": 2.6075, + "theoretical_loss": 3.457184473104533, + "tokens_seen": 1823369216 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022606820461384153, + "loss": 2.5717, + "theoretical_loss": 3.4571739235732712, + "tokens_seen": 1823434752 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002260581745235707, + "loss": 2.7146, + "theoretical_loss": 3.4571633745273234, + "tokens_seen": 1823500288 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2067081, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6700708866119385, + "objective/train/theoretical_loss": 3.4571581001863296, + "objective/train/tokens_used": 1843993056, + "theoretical_loss": 3.4571581001863296, + "tokens_seen": 1823533056 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022604814443329992, + "loss": 2.5309, + "theoretical_loss": 3.4571528259666495, + "tokens_seen": 1823565824 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002260381143430291, + "loss": 2.7122, + "theoretical_loss": 3.4571422778912106, + "tokens_seen": 1823631360 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022602808425275828, + "loss": 2.4577, + "theoretical_loss": 3.457131730300966, + "tokens_seen": 1823696896 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022601805416248746, + "loss": 2.6075, + "theoretical_loss": 3.4571211831958766, + "tokens_seen": 1823762432 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022600802407221667, + "loss": 2.6529, + "theoretical_loss": 3.4571106365759023, + "tokens_seen": 1823827968 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022599799398194585, + "loss": 2.4302, + "theoretical_loss": 3.4571000904410036, + "tokens_seen": 1823893504 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022598796389167503, + "loss": 2.4566, + "theoretical_loss": 3.457089544791141, + "tokens_seen": 1823959040 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002259779338014042, + "loss": 2.595, + "theoretical_loss": 3.4570789996262743, + "tokens_seen": 1824024576 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002259679037111334, + "loss": 2.3757, + "theoretical_loss": 3.4570684549463637, + "tokens_seen": 1824090112 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002259578736208626, + "loss": 2.6632, + "theoretical_loss": 3.4570579107513697, + "tokens_seen": 1824155648 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022594784353059178, + "loss": 2.7071, + "theoretical_loss": 3.4570473670412527, + "tokens_seen": 1824221184 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022593781344032096, + "loss": 2.6958, + "theoretical_loss": 3.4570368238159723, + "tokens_seen": 1824286720 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022592778335005014, + "loss": 2.4999, + "theoretical_loss": 3.45702628107549, + "tokens_seen": 1824352256 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022591775325977935, + "loss": 2.4945, + "theoretical_loss": 3.4570157388197646, + "tokens_seen": 1824417792 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022590772316950853, + "loss": 2.6032, + "theoretical_loss": 3.4570051970487583, + "tokens_seen": 1824483328 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022589769307923771, + "loss": 2.5439, + "theoretical_loss": 3.4569946557624296, + "tokens_seen": 1824548864 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002258876629889669, + "loss": 2.5034, + "theoretical_loss": 3.45698411496074, + "tokens_seen": 1824614400 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022587763289869608, + "loss": 2.3399, + "theoretical_loss": 3.4569735746436487, + "tokens_seen": 1824679936 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022586760280842528, + "loss": 2.4677, + "theoretical_loss": 3.4569630348111176, + "tokens_seen": 1824745472 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022585757271815447, + "loss": 2.5541, + "theoretical_loss": 3.456952495463105, + "tokens_seen": 1824811008 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022584754262788365, + "loss": 2.4636, + "theoretical_loss": 3.4569419565995725, + "tokens_seen": 1824876544 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022583751253761283, + "loss": 2.633, + "theoretical_loss": 3.456931418220481, + "tokens_seen": 1824942080 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022582748244734204, + "loss": 2.5584, + "theoretical_loss": 3.4569208803257894, + "tokens_seen": 1825007616 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022581745235707122, + "loss": 2.5986, + "theoretical_loss": 3.4569103429154593, + "tokens_seen": 1825073152 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002258074222668004, + "loss": 2.5239, + "theoretical_loss": 3.45689980598945, + "tokens_seen": 1825138688 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2067783, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9730470180511475, + "objective/train/theoretical_loss": 3.4568945377080533, + "objective/train/tokens_used": 1845631456, + "theoretical_loss": 3.4568945377080533, + "tokens_seen": 1825171456 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022579739217652958, + "loss": 2.5069, + "theoretical_loss": 3.4568892695477222, + "tokens_seen": 1825204224 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022578736208625876, + "loss": 2.4431, + "theoretical_loss": 3.4568787335902367, + "tokens_seen": 1825269760 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022577733199598797, + "loss": 2.4695, + "theoretical_loss": 3.456868198116953, + "tokens_seen": 1825335296 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022576730190571715, + "loss": 2.4398, + "theoretical_loss": 3.456857663127833, + "tokens_seen": 1825400832 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022575727181544633, + "loss": 2.6451, + "theoretical_loss": 3.4568471286228353, + "tokens_seen": 1825466368 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002257472417251755, + "loss": 2.5725, + "theoretical_loss": 3.456836594601921, + "tokens_seen": 1825531904 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022573721163490472, + "loss": 2.5659, + "theoretical_loss": 3.4568260610650507, + "tokens_seen": 1825597440 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002257271815446339, + "loss": 2.5223, + "theoretical_loss": 3.456815528012185, + "tokens_seen": 1825662976 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022571715145436308, + "loss": 2.5754, + "theoretical_loss": 3.4568049954432833, + "tokens_seen": 1825728512 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022570712136409226, + "loss": 2.7877, + "theoretical_loss": 3.4567944633583068, + "tokens_seen": 1825794048 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022569709127382147, + "loss": 2.5883, + "theoretical_loss": 3.456783931757215, + "tokens_seen": 1825859584 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022568706118355065, + "loss": 2.6629, + "theoretical_loss": 3.45677340063997, + "tokens_seen": 1825925120 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022567703109327983, + "loss": 2.6135, + "theoretical_loss": 3.4567628700065307, + "tokens_seen": 1825990656 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022566700100300902, + "loss": 2.4434, + "theoretical_loss": 3.456752339856858, + "tokens_seen": 1826056192 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002256569709127382, + "loss": 2.6111, + "theoretical_loss": 3.4567418101909126, + "tokens_seen": 1826121728 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002256469408224674, + "loss": 2.5232, + "theoretical_loss": 3.456731281008654, + "tokens_seen": 1826187264 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022563691073219659, + "loss": 2.5761, + "theoretical_loss": 3.456720752310044, + "tokens_seen": 1826252800 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022562688064192577, + "loss": 2.4579, + "theoretical_loss": 3.456710224095042, + "tokens_seen": 1826318336 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022561685055165495, + "loss": 2.3412, + "theoretical_loss": 3.456699696363608, + "tokens_seen": 1826383872 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022560682046138416, + "loss": 2.5409, + "theoretical_loss": 3.4566891691157045, + "tokens_seen": 1826449408 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022559679037111336, + "loss": 2.545, + "theoretical_loss": 3.4566786423512896, + "tokens_seen": 1826514944 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022558676028084255, + "loss": 2.5588, + "theoretical_loss": 3.4566681160703245, + "tokens_seen": 1826580480 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022557673019057173, + "loss": 2.6534, + "theoretical_loss": 3.456657590272771, + "tokens_seen": 1826646016 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002255667001003009, + "loss": 2.6991, + "theoretical_loss": 3.4566470649585876, + "tokens_seen": 1826711552 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022555667001003012, + "loss": 2.2537, + "theoretical_loss": 3.456636540127736, + "tokens_seen": 1826777088 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2069072, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2105824947357178, + "objective/train/theoretical_loss": 3.456631277893547, + "objective/train/tokens_used": 1847269856, + "theoretical_loss": 3.456631277893547, + "tokens_seen": 1826809856 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002255466399197593, + "loss": 2.4025, + "theoretical_loss": 3.4566260157801763, + "tokens_seen": 1826842624 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022553660982948848, + "loss": 2.5792, + "theoretical_loss": 3.4566154919158683, + "tokens_seen": 1826908160 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022552657973921766, + "loss": 2.4514, + "theoretical_loss": 3.4566049685347737, + "tokens_seen": 1826973696 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022551654964894687, + "loss": 2.5553, + "theoretical_loss": 3.4565944456368523, + "tokens_seen": 1827039232 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022550651955867605, + "loss": 2.6518, + "theoretical_loss": 3.4565839232220648, + "tokens_seen": 1827104768 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022549648946840523, + "loss": 2.6071, + "theoretical_loss": 3.4565734012903713, + "tokens_seen": 1827170304 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002254864593781344, + "loss": 2.4258, + "theoretical_loss": 3.4565628798417327, + "tokens_seen": 1827235840 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002254764292878636, + "loss": 2.5103, + "theoretical_loss": 3.4565523588761096, + "tokens_seen": 1827301376 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002254663991975928, + "loss": 2.4389, + "theoretical_loss": 3.456541838393462, + "tokens_seen": 1827366912 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022545636910732198, + "loss": 2.5124, + "theoretical_loss": 3.4565313183937514, + "tokens_seen": 1827432448 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022544633901705116, + "loss": 2.6675, + "theoretical_loss": 3.4565207988769364, + "tokens_seen": 1827497984 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022543630892678034, + "loss": 2.5968, + "theoretical_loss": 3.4565102798429796, + "tokens_seen": 1827563520 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022542627883650955, + "loss": 2.587, + "theoretical_loss": 3.4564997612918402, + "tokens_seen": 1827629056 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022541624874623873, + "loss": 2.4719, + "theoretical_loss": 3.4564892432234795, + "tokens_seen": 1827694592 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022540621865596791, + "loss": 2.5058, + "theoretical_loss": 3.4564787256378575, + "tokens_seen": 1827760128 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002253961885656971, + "loss": 2.5604, + "theoretical_loss": 3.456468208534935, + "tokens_seen": 1827825664 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022538615847542628, + "loss": 2.4634, + "theoretical_loss": 3.4564576919146726, + "tokens_seen": 1827891200 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022537612838515548, + "loss": 2.6279, + "theoretical_loss": 3.45644717577703, + "tokens_seen": 1827956736 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022536609829488467, + "loss": 2.3884, + "theoretical_loss": 3.4564366601219696, + "tokens_seen": 1828022272 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022535606820461385, + "loss": 2.4674, + "theoretical_loss": 3.45642614494945, + "tokens_seen": 1828087808 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022534603811434303, + "loss": 2.4105, + "theoretical_loss": 3.456415630259433, + "tokens_seen": 1828153344 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022533600802407224, + "loss": 2.561, + "theoretical_loss": 3.4564051160518785, + "tokens_seen": 1828218880 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022532597793380142, + "loss": 2.4794, + "theoretical_loss": 3.4563946023267476, + "tokens_seen": 1828284416 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002253159478435306, + "loss": 2.6567, + "theoretical_loss": 3.4563840890840005, + "tokens_seen": 1828349952 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022530591775325978, + "loss": 2.5065, + "theoretical_loss": 3.456373576323598, + "tokens_seen": 1828415488 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2069775, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.352907180786133, + "objective/train/theoretical_loss": 3.4563683201242634, + "objective/train/tokens_used": 1848908256, + "theoretical_loss": 3.4563683201242634, + "tokens_seen": 1828448256 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022529588766298896, + "loss": 2.4469, + "theoretical_loss": 3.4563630640455, + "tokens_seen": 1828481024 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022528585757271817, + "loss": 2.7178, + "theoretical_loss": 3.456352552249668, + "tokens_seen": 1828546560 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022527582748244735, + "loss": 2.4016, + "theoretical_loss": 3.4563420409360623, + "tokens_seen": 1828612096 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022526579739217653, + "loss": 2.4325, + "theoretical_loss": 3.4563315301046433, + "tokens_seen": 1828677632 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002252557673019057, + "loss": 2.4816, + "theoretical_loss": 3.4563210197553715, + "tokens_seen": 1828743168 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022524573721163492, + "loss": 2.4638, + "theoretical_loss": 3.4563105098882083, + "tokens_seen": 1828808704 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002252357071213641, + "loss": 2.5182, + "theoretical_loss": 3.4563000005031137, + "tokens_seen": 1828874240 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022522567703109328, + "loss": 2.493, + "theoretical_loss": 3.4562894916000477, + "tokens_seen": 1828939776 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022521564694082246, + "loss": 2.6097, + "theoretical_loss": 3.456278983178972, + "tokens_seen": 1829005312 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022520561685055167, + "loss": 2.433, + "theoretical_loss": 3.456268475239847, + "tokens_seen": 1829070848 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022519558676028085, + "loss": 2.5501, + "theoretical_loss": 3.456257967782633, + "tokens_seen": 1829136384 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022518555667001003, + "loss": 2.5231, + "theoretical_loss": 3.456247460807291, + "tokens_seen": 1829201920 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022517552657973922, + "loss": 2.6117, + "theoretical_loss": 3.456236954313781, + "tokens_seen": 1829267456 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002251654964894684, + "loss": 2.5102, + "theoretical_loss": 3.456226448302064, + "tokens_seen": 1829332992 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002251554663991976, + "loss": 2.736, + "theoretical_loss": 3.456215942772101, + "tokens_seen": 1829398528 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022514543630892679, + "loss": 2.5271, + "theoretical_loss": 3.4562054377238525, + "tokens_seen": 1829464064 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022513540621865597, + "loss": 2.5489, + "theoretical_loss": 3.4561949331572785, + "tokens_seen": 1829529600 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022512537612838515, + "loss": 2.5832, + "theoretical_loss": 3.4561844290723402, + "tokens_seen": 1829595136 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022511534603811436, + "loss": 2.4615, + "theoretical_loss": 3.456173925468999, + "tokens_seen": 1829660672 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022510531594784354, + "loss": 2.5902, + "theoretical_loss": 3.456163422347214, + "tokens_seen": 1829726208 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022509528585757272, + "loss": 2.731, + "theoretical_loss": 3.456152919706947, + "tokens_seen": 1829791744 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002250852557673019, + "loss": 2.5597, + "theoretical_loss": 3.456142417548158, + "tokens_seen": 1829857280 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022507522567703108, + "loss": 2.6601, + "theoretical_loss": 3.4561319158708086, + "tokens_seen": 1829922816 + }, + { + "epoch": 6.01, + "learning_rate": 0.0002250651955867603, + "loss": 2.5792, + "theoretical_loss": 3.4561214146748584, + "tokens_seen": 1829988352 + }, + { + "epoch": 6.01, + "learning_rate": 0.00022505516549648947, + "loss": 2.5706, + "theoretical_loss": 3.456110913960269, + "tokens_seen": 1830053888 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 2071315, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.518676519393921, + "objective/train/theoretical_loss": 3.4561056637834717, + "objective/train/tokens_used": 1850546656, + "theoretical_loss": 3.4561056637834717, + "tokens_seen": 1830086656 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022504513540621865, + "loss": 2.448, + "theoretical_loss": 3.4561004137270004, + "tokens_seen": 1830119424 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022503510531594783, + "loss": 2.6077, + "theoretical_loss": 3.4560899139750143, + "tokens_seen": 1830184960 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022502507522567704, + "loss": 2.6881, + "theoretical_loss": 3.45607941470427, + "tokens_seen": 1830250496 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022501504513540622, + "loss": 2.3479, + "theoretical_loss": 3.456068915914729, + "tokens_seen": 1830316032 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002250050150451354, + "loss": 2.4686, + "theoretical_loss": 3.4560584176063527, + "tokens_seen": 1830381568 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022499498495486458, + "loss": 2.7257, + "theoretical_loss": 3.4560479197791, + "tokens_seen": 1830447104 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022498495486459377, + "loss": 2.5563, + "theoretical_loss": 3.4560374224329333, + "tokens_seen": 1830512640 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022497492477432297, + "loss": 2.6485, + "theoretical_loss": 3.456026925567813, + "tokens_seen": 1830578176 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022496489468405215, + "loss": 2.6106, + "theoretical_loss": 3.4560164291836992, + "tokens_seen": 1830643712 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022495486459378134, + "loss": 2.5698, + "theoretical_loss": 3.4560059332805535, + "tokens_seen": 1830709248 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022494483450351052, + "loss": 2.3401, + "theoretical_loss": 3.4559954378583355, + "tokens_seen": 1830774784 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022493480441323973, + "loss": 2.5036, + "theoretical_loss": 3.4559849429170066, + "tokens_seen": 1830840320 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002249247743229689, + "loss": 2.4688, + "theoretical_loss": 3.4559744484565282, + "tokens_seen": 1830905856 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002249147442326981, + "loss": 2.6061, + "theoretical_loss": 3.45596395447686, + "tokens_seen": 1830971392 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022490471414242727, + "loss": 2.5242, + "theoretical_loss": 3.4559534609779634, + "tokens_seen": 1831036928 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022489468405215645, + "loss": 2.575, + "theoretical_loss": 3.455942967959799, + "tokens_seen": 1831102464 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022488465396188566, + "loss": 2.5531, + "theoretical_loss": 3.455932475422327, + "tokens_seen": 1831168000 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022487462387161484, + "loss": 2.58, + "theoretical_loss": 3.455921983365509, + "tokens_seen": 1831233536 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022486459378134402, + "loss": 2.581, + "theoretical_loss": 3.4559114917893057, + "tokens_seen": 1831299072 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002248545636910732, + "loss": 2.2602, + "theoretical_loss": 3.4559010006936774, + "tokens_seen": 1831364608 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022484453360080244, + "loss": 2.3993, + "theoretical_loss": 3.4558905100785853, + "tokens_seen": 1831430144 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022483450351053162, + "loss": 2.6381, + "theoretical_loss": 3.45588001994399, + "tokens_seen": 1831495680 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002248244734202608, + "loss": 2.6724, + "theoretical_loss": 3.4558695302898528, + "tokens_seen": 1831561216 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022481444332998998, + "loss": 2.5066, + "theoretical_loss": 3.455859041116133, + "tokens_seen": 1831626752 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022480441323971916, + "loss": 2.5416, + "theoretical_loss": 3.4558485524227933, + "tokens_seen": 1831692288 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2072087, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4619102478027344, + "objective/train/theoretical_loss": 3.455843308256253, + "objective/train/tokens_used": 1852185056, + "theoretical_loss": 3.455843308256253, + "tokens_seen": 1831725056 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022479438314944837, + "loss": 2.5151, + "theoretical_loss": 3.455838064209794, + "tokens_seen": 1831757824 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022478435305917755, + "loss": 2.5289, + "theoretical_loss": 3.4558275764770947, + "tokens_seen": 1831823360 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022477432296890673, + "loss": 2.6391, + "theoretical_loss": 3.4558170892246576, + "tokens_seen": 1831888896 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002247642928786359, + "loss": 2.2882, + "theoretical_loss": 3.4558066024524425, + "tokens_seen": 1831954432 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022475426278836512, + "loss": 2.3652, + "theoretical_loss": 3.4557961161604114, + "tokens_seen": 1832019968 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002247442326980943, + "loss": 2.4964, + "theoretical_loss": 3.4557856303485246, + "tokens_seen": 1832085504 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022473420260782348, + "loss": 2.669, + "theoretical_loss": 3.4557751450167427, + "tokens_seen": 1832151040 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022472417251755266, + "loss": 2.5818, + "theoretical_loss": 3.455764660165026, + "tokens_seen": 1832216576 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022471414242728187, + "loss": 2.7286, + "theoretical_loss": 3.455754175793337, + "tokens_seen": 1832282112 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022470411233701105, + "loss": 2.598, + "theoretical_loss": 3.4557436919016356, + "tokens_seen": 1832347648 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022469408224674023, + "loss": 2.3654, + "theoretical_loss": 3.4557332084898817, + "tokens_seen": 1832413184 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022468405215646942, + "loss": 2.5229, + "theoretical_loss": 3.4557227255580383, + "tokens_seen": 1832478720 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002246740220661986, + "loss": 2.6365, + "theoretical_loss": 3.4557122431060643, + "tokens_seen": 1832544256 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002246639919759278, + "loss": 2.3238, + "theoretical_loss": 3.4557017611339216, + "tokens_seen": 1832609792 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022465396188565699, + "loss": 2.6732, + "theoretical_loss": 3.455691279641571, + "tokens_seen": 1832675328 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022464393179538617, + "loss": 2.6732, + "theoretical_loss": 3.4556807986289733, + "tokens_seen": 1832740864 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022463390170511535, + "loss": 2.5104, + "theoretical_loss": 3.455670318096089, + "tokens_seen": 1832806400 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022462387161484456, + "loss": 2.7456, + "theoretical_loss": 3.4556598380428794, + "tokens_seen": 1832871936 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022461384152457374, + "loss": 2.6726, + "theoretical_loss": 3.4556493584693055, + "tokens_seen": 1832937472 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022460381143430292, + "loss": 2.7563, + "theoretical_loss": 3.4556388793753277, + "tokens_seen": 1833003008 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002245937813440321, + "loss": 2.6845, + "theoretical_loss": 3.4556284007609075, + "tokens_seen": 1833068544 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022458375125376128, + "loss": 2.5446, + "theoretical_loss": 3.4556179226260055, + "tokens_seen": 1833134080 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002245737211634905, + "loss": 2.4348, + "theoretical_loss": 3.4556074449705827, + "tokens_seen": 1833199616 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022456369107321967, + "loss": 2.3944, + "theoretical_loss": 3.4555969677946, + "tokens_seen": 1833265152 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022455366098294885, + "loss": 2.5757, + "theoretical_loss": 3.4555864910980176, + "tokens_seen": 1833330688 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2073507, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.365025281906128, + "objective/train/theoretical_loss": 3.4555812529294903, + "objective/train/tokens_used": 1853823456, + "theoretical_loss": 3.4555812529294903, + "tokens_seen": 1833363456 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022454363089267803, + "loss": 2.547, + "theoretical_loss": 3.4555760148807977, + "tokens_seen": 1833396224 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022453360080240724, + "loss": 2.4825, + "theoretical_loss": 3.4555655391429005, + "tokens_seen": 1833461760 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022452357071213642, + "loss": 2.7884, + "theoretical_loss": 3.4555550638842876, + "tokens_seen": 1833527296 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002245135406218656, + "loss": 2.5369, + "theoretical_loss": 3.455544589104919, + "tokens_seen": 1833592832 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022450351053159478, + "loss": 2.4218, + "theoretical_loss": 3.4555341148047556, + "tokens_seen": 1833658368 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022449348044132397, + "loss": 2.8117, + "theoretical_loss": 3.4555236409837597, + "tokens_seen": 1833723904 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022448345035105317, + "loss": 2.4793, + "theoretical_loss": 3.4555131676418904, + "tokens_seen": 1833789440 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022447342026078236, + "loss": 2.5328, + "theoretical_loss": 3.4555026947791103, + "tokens_seen": 1833854976 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022446339017051154, + "loss": 2.499, + "theoretical_loss": 3.45549222239538, + "tokens_seen": 1833920512 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022445336008024072, + "loss": 2.4333, + "theoretical_loss": 3.4554817504906596, + "tokens_seen": 1833986048 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022444332998996993, + "loss": 2.3804, + "theoretical_loss": 3.455471279064911, + "tokens_seen": 1834051584 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002244332998996991, + "loss": 2.7963, + "theoretical_loss": 3.455460808118094, + "tokens_seen": 1834117120 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002244232698094283, + "loss": 2.5306, + "theoretical_loss": 3.4554503376501713, + "tokens_seen": 1834182656 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022441323971915747, + "loss": 2.522, + "theoretical_loss": 3.4554398676611027, + "tokens_seen": 1834248192 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022440320962888665, + "loss": 2.742, + "theoretical_loss": 3.4554293981508497, + "tokens_seen": 1834313728 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022439317953861586, + "loss": 2.4171, + "theoretical_loss": 3.4554189291193724, + "tokens_seen": 1834379264 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022438314944834504, + "loss": 2.477, + "theoretical_loss": 3.455408460566633, + "tokens_seen": 1834444800 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022437311935807422, + "loss": 2.6749, + "theoretical_loss": 3.455397992492592, + "tokens_seen": 1834510336 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002243630892678034, + "loss": 2.4938, + "theoretical_loss": 3.4553875248972106, + "tokens_seen": 1834575872 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002243530591775326, + "loss": 2.4613, + "theoretical_loss": 3.4553770577804492, + "tokens_seen": 1834641408 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002243430290872618, + "loss": 2.5337, + "theoretical_loss": 3.45536659114227, + "tokens_seen": 1834706944 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022433299899699097, + "loss": 2.7188, + "theoretical_loss": 3.4553561249826323, + "tokens_seen": 1834772480 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022432296890672015, + "loss": 2.6147, + "theoretical_loss": 3.455345659301498, + "tokens_seen": 1834838016 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022431293881644933, + "loss": 2.4862, + "theoretical_loss": 3.4553351940988293, + "tokens_seen": 1834903552 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022430290872617854, + "loss": 2.6323, + "theoretical_loss": 3.4553247293745857, + "tokens_seen": 1834969088 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2074077, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5253472328186035, + "objective/train/theoretical_loss": 3.4553194971918613, + "objective/train/tokens_used": 1855461856, + "theoretical_loss": 3.4553194971918613, + "tokens_seen": 1835001856 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022429287863590772, + "loss": 2.7679, + "theoretical_loss": 3.4553142651287283, + "tokens_seen": 1835034624 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002242828485456369, + "loss": 2.5485, + "theoretical_loss": 3.455303801361219, + "tokens_seen": 1835100160 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022427281845536609, + "loss": 2.324, + "theoretical_loss": 3.455293338072018, + "tokens_seen": 1835165696 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002242627883650953, + "loss": 2.5216, + "theoretical_loss": 3.455282875261087, + "tokens_seen": 1835231232 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022425275827482448, + "loss": 2.3748, + "theoretical_loss": 3.4552724129283865, + "tokens_seen": 1835296768 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022424272818455366, + "loss": 2.5196, + "theoretical_loss": 3.4552619510738785, + "tokens_seen": 1835362304 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022423269809428284, + "loss": 2.4926, + "theoretical_loss": 3.455251489697523, + "tokens_seen": 1835427840 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022422266800401202, + "loss": 2.5784, + "theoretical_loss": 3.455241028799282, + "tokens_seen": 1835493376 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022421263791374123, + "loss": 2.6005, + "theoretical_loss": 3.4552305683791156, + "tokens_seen": 1835558912 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002242026078234704, + "loss": 2.519, + "theoretical_loss": 3.4552201084369853, + "tokens_seen": 1835624448 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002241925777331996, + "loss": 2.5879, + "theoretical_loss": 3.4552096489728528, + "tokens_seen": 1835689984 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022418254764292877, + "loss": 2.7103, + "theoretical_loss": 3.4551991899866783, + "tokens_seen": 1835755520 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022417251755265798, + "loss": 2.6813, + "theoretical_loss": 3.4551887314784233, + "tokens_seen": 1835821056 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022416248746238716, + "loss": 2.5205, + "theoretical_loss": 3.455178273448049, + "tokens_seen": 1835886592 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022415245737211634, + "loss": 2.5077, + "theoretical_loss": 3.4551678158955164, + "tokens_seen": 1835952128 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022414242728184552, + "loss": 2.8135, + "theoretical_loss": 3.4551573588207867, + "tokens_seen": 1836017664 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002241323971915747, + "loss": 2.7113, + "theoretical_loss": 3.4551469022238206, + "tokens_seen": 1836083200 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002241223671013039, + "loss": 2.6992, + "theoretical_loss": 3.4551364461045795, + "tokens_seen": 1836148736 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002241123370110331, + "loss": 2.6283, + "theoretical_loss": 3.455125990463025, + "tokens_seen": 1836214272 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022410230692076227, + "loss": 2.6683, + "theoretical_loss": 3.455115535299117, + "tokens_seen": 1836279808 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022409227683049148, + "loss": 2.6663, + "theoretical_loss": 3.4551050806128183, + "tokens_seen": 1836345344 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002240822467402207, + "loss": 2.4683, + "theoretical_loss": 3.4550946264040885, + "tokens_seen": 1836410880 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022407221664994987, + "loss": 2.7423, + "theoretical_loss": 3.4550841726728896, + "tokens_seen": 1836476416 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022406218655967905, + "loss": 2.5374, + "theoretical_loss": 3.455073719419183, + "tokens_seen": 1836541952 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022405215646940823, + "loss": 2.7347, + "theoretical_loss": 3.4550632666429286, + "tokens_seen": 1836607488 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2075456, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.573488473892212, + "objective/train/theoretical_loss": 3.4550580404338342, + "objective/train/tokens_used": 1857100256, + "theoretical_loss": 3.4550580404338342, + "tokens_seen": 1836640256 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022404212637913744, + "loss": 2.3844, + "theoretical_loss": 3.455052814344089, + "tokens_seen": 1836673024 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022403209628886662, + "loss": 2.4484, + "theoretical_loss": 3.455042362522624, + "tokens_seen": 1836738560 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002240220661985958, + "loss": 2.787, + "theoretical_loss": 3.455031911178496, + "tokens_seen": 1836804096 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022401203610832498, + "loss": 2.235, + "theoretical_loss": 3.4550214603116656, + "tokens_seen": 1836869632 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022400200601805417, + "loss": 2.3513, + "theoretical_loss": 3.455011009922094, + "tokens_seen": 1836935168 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022399197592778337, + "loss": 2.6124, + "theoretical_loss": 3.4550005600097426, + "tokens_seen": 1837000704 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022398194583751256, + "loss": 2.5654, + "theoretical_loss": 3.454990110574572, + "tokens_seen": 1837066240 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022397191574724174, + "loss": 2.4479, + "theoretical_loss": 3.454979661616544, + "tokens_seen": 1837131776 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022396188565697092, + "loss": 2.486, + "theoretical_loss": 3.4549692131356196, + "tokens_seen": 1837197312 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022395185556670013, + "loss": 2.5024, + "theoretical_loss": 3.4549587651317597, + "tokens_seen": 1837262848 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002239418254764293, + "loss": 2.6225, + "theoretical_loss": 3.4549483176049263, + "tokens_seen": 1837328384 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002239317953861585, + "loss": 2.5797, + "theoretical_loss": 3.4549378705550797, + "tokens_seen": 1837393920 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022392176529588767, + "loss": 2.6513, + "theoretical_loss": 3.4549274239821814, + "tokens_seen": 1837459456 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022391173520561685, + "loss": 2.489, + "theoretical_loss": 3.4549169778861923, + "tokens_seen": 1837524992 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022390170511534606, + "loss": 2.4905, + "theoretical_loss": 3.4549065322670747, + "tokens_seen": 1837590528 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022389167502507524, + "loss": 2.5121, + "theoretical_loss": 3.454896087124789, + "tokens_seen": 1837656064 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022388164493480442, + "loss": 2.579, + "theoretical_loss": 3.4548856424592964, + "tokens_seen": 1837721600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002238716148445336, + "loss": 2.4979, + "theoretical_loss": 3.4548751982705586, + "tokens_seen": 1837787136 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002238615847542628, + "loss": 2.7216, + "theoretical_loss": 3.4548647545585363, + "tokens_seen": 1837852672 + }, + { + "epoch": 6.02, + "learning_rate": 0.000223851554663992, + "loss": 2.6007, + "theoretical_loss": 3.454854311323191, + "tokens_seen": 1837918208 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022384152457372117, + "loss": 2.5447, + "theoretical_loss": 3.4548438685644838, + "tokens_seen": 1837983744 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022383149448345035, + "loss": 2.4399, + "theoretical_loss": 3.4548334262823763, + "tokens_seen": 1838049280 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022382146439317953, + "loss": 2.4916, + "theoretical_loss": 3.4548229844768295, + "tokens_seen": 1838114816 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022381143430290874, + "loss": 2.398, + "theoretical_loss": 3.4548125431478045, + "tokens_seen": 1838180352 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022380140421263792, + "loss": 2.8012, + "theoretical_loss": 3.454802102295263, + "tokens_seen": 1838245888 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2076251, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9435410499572754, + "objective/train/theoretical_loss": 3.4547968820476616, + "objective/train/tokens_used": 1858738656, + "theoretical_loss": 3.4547968820476616, + "tokens_seen": 1838278656 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002237913741223671, + "loss": 2.6157, + "theoretical_loss": 3.454791661919166, + "tokens_seen": 1838311424 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022378134403209629, + "loss": 2.6855, + "theoretical_loss": 3.454781222019475, + "tokens_seen": 1838376960 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002237713139418255, + "loss": 2.6949, + "theoretical_loss": 3.4547707825961504, + "tokens_seen": 1838442496 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022376128385155468, + "loss": 2.456, + "theoretical_loss": 3.4547603436491547, + "tokens_seen": 1838508032 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022375125376128386, + "loss": 2.483, + "theoretical_loss": 3.4547499051784487, + "tokens_seen": 1838573568 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022374122367101304, + "loss": 2.5349, + "theoretical_loss": 3.4547394671839937, + "tokens_seen": 1838639104 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022373119358074222, + "loss": 2.782, + "theoretical_loss": 3.454729029665751, + "tokens_seen": 1838704640 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022372116349047143, + "loss": 2.7682, + "theoretical_loss": 3.4547185926236814, + "tokens_seen": 1838770176 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002237111334002006, + "loss": 2.7516, + "theoretical_loss": 3.4547081560577473, + "tokens_seen": 1838835712 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002237011033099298, + "loss": 2.8939, + "theoretical_loss": 3.4546977199679088, + "tokens_seen": 1838901248 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022369107321965897, + "loss": 2.7231, + "theoretical_loss": 3.4546872843541276, + "tokens_seen": 1838966784 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022368104312938818, + "loss": 2.5851, + "theoretical_loss": 3.4546768492163658, + "tokens_seen": 1839032320 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022367101303911736, + "loss": 2.4585, + "theoretical_loss": 3.454666414554584, + "tokens_seen": 1839097856 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022366098294884654, + "loss": 2.5218, + "theoretical_loss": 3.4546559803687433, + "tokens_seen": 1839163392 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022365095285857572, + "loss": 2.5021, + "theoretical_loss": 3.4546455466588055, + "tokens_seen": 1839228928 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002236409227683049, + "loss": 2.8009, + "theoretical_loss": 3.4546351134247324, + "tokens_seen": 1839294464 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002236308926780341, + "loss": 2.5986, + "theoretical_loss": 3.454624680666484, + "tokens_seen": 1839360000 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002236208625877633, + "loss": 2.647, + "theoretical_loss": 3.454614248384023, + "tokens_seen": 1839425536 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022361083249749247, + "loss": 2.4978, + "theoretical_loss": 3.45460381657731, + "tokens_seen": 1839491072 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022360080240722165, + "loss": 2.465, + "theoretical_loss": 3.4545933852463064, + "tokens_seen": 1839556608 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022359077231695086, + "loss": 2.5894, + "theoretical_loss": 3.4545829543909736, + "tokens_seen": 1839622144 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022358074222668004, + "loss": 2.6297, + "theoretical_loss": 3.4545725240112732, + "tokens_seen": 1839687680 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022357071213640923, + "loss": 2.6637, + "theoretical_loss": 3.4545620941071666, + "tokens_seen": 1839753216 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002235606820461384, + "loss": 2.5583, + "theoretical_loss": 3.4545516646786147, + "tokens_seen": 1839818752 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022355065195586761, + "loss": 2.5579, + "theoretical_loss": 3.4545412357255794, + "tokens_seen": 1839884288 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2077677, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.380065441131592, + "objective/train/theoretical_loss": 3.454536021427368, + "objective/train/tokens_used": 1860377056, + "theoretical_loss": 3.454536021427368, + "tokens_seen": 1839917056 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002235406218655968, + "loss": 2.2848, + "theoretical_loss": 3.4545308072480214, + "tokens_seen": 1839949824 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022353059177532598, + "loss": 2.7449, + "theoretical_loss": 3.4545203792459027, + "tokens_seen": 1840015360 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022352056168505516, + "loss": 2.7087, + "theoretical_loss": 3.454509951719185, + "tokens_seen": 1840080896 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022351053159478434, + "loss": 2.7005, + "theoretical_loss": 3.454499524667829, + "tokens_seen": 1840146432 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022350050150451355, + "loss": 2.5851, + "theoretical_loss": 3.454489098091796, + "tokens_seen": 1840211968 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022349047141424273, + "loss": 2.5111, + "theoretical_loss": 3.454478671991048, + "tokens_seen": 1840277504 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002234804413239719, + "loss": 2.4791, + "theoretical_loss": 3.454468246365546, + "tokens_seen": 1840343040 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002234704112337011, + "loss": 2.791, + "theoretical_loss": 3.454457821215252, + "tokens_seen": 1840408576 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002234603811434303, + "loss": 2.7239, + "theoretical_loss": 3.4544473965401266, + "tokens_seen": 1840474112 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022345035105315948, + "loss": 2.5269, + "theoretical_loss": 3.4544369723401314, + "tokens_seen": 1840539648 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022344032096288866, + "loss": 2.4144, + "theoretical_loss": 3.4544265486152286, + "tokens_seen": 1840605184 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022343029087261784, + "loss": 2.8098, + "theoretical_loss": 3.4544161253653787, + "tokens_seen": 1840670720 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022342026078234702, + "loss": 2.289, + "theoretical_loss": 3.4544057025905435, + "tokens_seen": 1840736256 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022341023069207623, + "loss": 2.6093, + "theoretical_loss": 3.4543952802906843, + "tokens_seen": 1840801792 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002234002006018054, + "loss": 2.5548, + "theoretical_loss": 3.4543848584657626, + "tokens_seen": 1840867328 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002233901705115346, + "loss": 2.5924, + "theoretical_loss": 3.4543744371157405, + "tokens_seen": 1840932864 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022338014042126377, + "loss": 2.4213, + "theoretical_loss": 3.4543640162405786, + "tokens_seen": 1840998400 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022337011033099298, + "loss": 2.4559, + "theoretical_loss": 3.4543535958402387, + "tokens_seen": 1841063936 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022336008024072216, + "loss": 2.408, + "theoretical_loss": 3.454343175914682, + "tokens_seen": 1841129472 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022335005015045135, + "loss": 2.7152, + "theoretical_loss": 3.45433275646387, + "tokens_seen": 1841195008 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022334002006018055, + "loss": 2.5379, + "theoretical_loss": 3.454322337487765, + "tokens_seen": 1841260544 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022332998996990973, + "loss": 2.5828, + "theoretical_loss": 3.4543119189863276, + "tokens_seen": 1841326080 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022331995987963894, + "loss": 2.5442, + "theoretical_loss": 3.4543015009595193, + "tokens_seen": 1841391616 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022330992978936812, + "loss": 2.654, + "theoretical_loss": 3.454291083407302, + "tokens_seen": 1841457152 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002232998996990973, + "loss": 2.5814, + "theoretical_loss": 3.4542806663296375, + "tokens_seen": 1841522688 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2078238, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7181644439697266, + "objective/train/theoretical_loss": 3.4542754579687496, + "objective/train/tokens_used": 1862015456, + "theoretical_loss": 3.4542754579687496, + "tokens_seen": 1841555456 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022328986960882649, + "loss": 2.7707, + "theoretical_loss": 3.454270249726486, + "tokens_seen": 1841588224 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002232798395185557, + "loss": 2.7041, + "theoretical_loss": 3.45425983359781, + "tokens_seen": 1841653760 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022326980942828488, + "loss": 2.676, + "theoretical_loss": 3.4542494179435708, + "tokens_seen": 1841719296 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022325977933801406, + "loss": 2.4745, + "theoretical_loss": 3.45423900276373, + "tokens_seen": 1841784832 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022324974924774324, + "loss": 2.6769, + "theoretical_loss": 3.454228588058249, + "tokens_seen": 1841850368 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022323971915747242, + "loss": 2.6586, + "theoretical_loss": 3.454218173827089, + "tokens_seen": 1841915904 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022322968906720163, + "loss": 2.7146, + "theoretical_loss": 3.4542077600702124, + "tokens_seen": 1841981440 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002232196589769308, + "loss": 2.5143, + "theoretical_loss": 3.4541973467875797, + "tokens_seen": 1842046976 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022320962888666, + "loss": 2.5973, + "theoretical_loss": 3.454186933979153, + "tokens_seen": 1842112512 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022319959879638917, + "loss": 2.6156, + "theoretical_loss": 3.454176521644894, + "tokens_seen": 1842178048 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022318956870611838, + "loss": 2.4774, + "theoretical_loss": 3.454166109784764, + "tokens_seen": 1842243584 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022317953861584756, + "loss": 2.4441, + "theoretical_loss": 3.4541556983987243, + "tokens_seen": 1842309120 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022316950852557674, + "loss": 2.4323, + "theoretical_loss": 3.454145287486737, + "tokens_seen": 1842374656 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022315947843530592, + "loss": 2.5131, + "theoretical_loss": 3.454134877048763, + "tokens_seen": 1842440192 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002231494483450351, + "loss": 2.5092, + "theoretical_loss": 3.454124467084764, + "tokens_seen": 1842505728 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002231394182547643, + "loss": 2.4891, + "theoretical_loss": 3.454114057594702, + "tokens_seen": 1842571264 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002231293881644935, + "loss": 2.5454, + "theoretical_loss": 3.4541036485785384, + "tokens_seen": 1842636800 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022311935807422267, + "loss": 2.6699, + "theoretical_loss": 3.4540932400362347, + "tokens_seen": 1842702336 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022310932798395185, + "loss": 2.3148, + "theoretical_loss": 3.454082831967753, + "tokens_seen": 1842767872 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022309929789368106, + "loss": 2.5744, + "theoretical_loss": 3.4540724243730536, + "tokens_seen": 1842833408 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022308926780341024, + "loss": 2.5361, + "theoretical_loss": 3.454062017252099, + "tokens_seen": 1842898944 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022307923771313943, + "loss": 2.7021, + "theoretical_loss": 3.454051610604851, + "tokens_seen": 1842964480 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002230692076228686, + "loss": 2.6471, + "theoretical_loss": 3.4540412044312703, + "tokens_seen": 1843030016 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022305917753259781, + "loss": 2.5367, + "theoretical_loss": 3.4540307987313192, + "tokens_seen": 1843095552 + }, + { + "epoch": 6.02, + "learning_rate": 0.000223049147442327, + "loss": 2.5531, + "theoretical_loss": 3.454020393504959, + "tokens_seen": 1843161088 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2079545, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4279556274414062, + "objective/train/theoretical_loss": 3.454015191069364, + "objective/train/tokens_used": 1863653856, + "theoretical_loss": 3.454015191069364, + "tokens_seen": 1843193856 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022303911735205618, + "loss": 2.6061, + "theoretical_loss": 3.454009988752152, + "tokens_seen": 1843226624 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022302908726178536, + "loss": 2.7316, + "theoretical_loss": 3.4539995844728586, + "tokens_seen": 1843292160 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022301905717151454, + "loss": 2.7037, + "theoretical_loss": 3.4539891806670413, + "tokens_seen": 1843357696 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022300902708124375, + "loss": 2.6553, + "theoretical_loss": 3.4539787773346617, + "tokens_seen": 1843423232 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022299899699097293, + "loss": 2.7941, + "theoretical_loss": 3.4539683744756813, + "tokens_seen": 1843488768 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002229889669007021, + "loss": 2.6514, + "theoretical_loss": 3.453957972090061, + "tokens_seen": 1843554304 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002229789368104313, + "loss": 2.6098, + "theoretical_loss": 3.4539475701777635, + "tokens_seen": 1843619840 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002229689067201605, + "loss": 2.7194, + "theoretical_loss": 3.45393716873875, + "tokens_seen": 1843685376 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022295887662988968, + "loss": 2.4059, + "theoretical_loss": 3.453926767772982, + "tokens_seen": 1843750912 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022294884653961886, + "loss": 2.6023, + "theoretical_loss": 3.4539163672804216, + "tokens_seen": 1843816448 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022293881644934804, + "loss": 2.5277, + "theoretical_loss": 3.4539059672610297, + "tokens_seen": 1843881984 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022292878635907722, + "loss": 2.663, + "theoretical_loss": 3.4538955677147687, + "tokens_seen": 1843947520 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022291875626880643, + "loss": 2.7797, + "theoretical_loss": 3.4538851686416, + "tokens_seen": 1844013056 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002229087261785356, + "loss": 2.4724, + "theoretical_loss": 3.4538747700414856, + "tokens_seen": 1844078592 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002228986960882648, + "loss": 2.472, + "theoretical_loss": 3.453864371914386, + "tokens_seen": 1844144128 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022288866599799397, + "loss": 2.5808, + "theoretical_loss": 3.453853974260264, + "tokens_seen": 1844209664 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022287863590772318, + "loss": 2.5834, + "theoretical_loss": 3.4538435770790814, + "tokens_seen": 1844275200 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022286860581745236, + "loss": 2.5555, + "theoretical_loss": 3.4538331803707987, + "tokens_seen": 1844340736 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022285857572718155, + "loss": 2.4803, + "theoretical_loss": 3.4538227841353786, + "tokens_seen": 1844406272 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022284854563691073, + "loss": 2.6436, + "theoretical_loss": 3.453812388372783, + "tokens_seen": 1844471808 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002228385155466399, + "loss": 2.6822, + "theoretical_loss": 3.4538019930829726, + "tokens_seen": 1844537344 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022282848545636912, + "loss": 2.6799, + "theoretical_loss": 3.45379159826591, + "tokens_seen": 1844602880 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002228184553660983, + "loss": 2.4949, + "theoretical_loss": 3.4537812039215563, + "tokens_seen": 1844668416 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022280842527582748, + "loss": 2.3281, + "theoretical_loss": 3.453770810049874, + "tokens_seen": 1844733952 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022279839518555666, + "loss": 2.6597, + "theoretical_loss": 3.453760416650823, + "tokens_seen": 1844799488 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2080283, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6743063926696777, + "objective/train/theoretical_loss": 3.4537552201285235, + "objective/train/tokens_used": 1865292256, + "theoretical_loss": 3.4537552201285235, + "tokens_seen": 1844832256 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022278836509528587, + "loss": 2.6886, + "theoretical_loss": 3.4537500237243677, + "tokens_seen": 1844865024 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022277833500501505, + "loss": 2.5536, + "theoretical_loss": 3.4537396312704676, + "tokens_seen": 1844930560 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022276830491474423, + "loss": 2.3708, + "theoretical_loss": 3.453729239289085, + "tokens_seen": 1844996096 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002227582748244734, + "loss": 2.6147, + "theoretical_loss": 3.4537188477801823, + "tokens_seen": 1845061632 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002227482447342026, + "loss": 2.6038, + "theoretical_loss": 3.453708456743721, + "tokens_seen": 1845127168 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002227382146439318, + "loss": 2.6683, + "theoretical_loss": 3.453698066179662, + "tokens_seen": 1845192704 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022272818455366098, + "loss": 2.6549, + "theoretical_loss": 3.4536876760879682, + "tokens_seen": 1845258240 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022271815446339016, + "loss": 2.7385, + "theoretical_loss": 3.4536772864686007, + "tokens_seen": 1845323776 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022270812437311934, + "loss": 2.5943, + "theoretical_loss": 3.453666897321521, + "tokens_seen": 1845389312 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022269809428284855, + "loss": 2.7114, + "theoretical_loss": 3.453656508646692, + "tokens_seen": 1845454848 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022268806419257773, + "loss": 2.5695, + "theoretical_loss": 3.4536461204440743, + "tokens_seen": 1845520384 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022267803410230691, + "loss": 2.3246, + "theoretical_loss": 3.4536357327136304, + "tokens_seen": 1845585920 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002226680040120361, + "loss": 2.6908, + "theoretical_loss": 3.4536253454553214, + "tokens_seen": 1845651456 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022265797392176528, + "loss": 2.5371, + "theoretical_loss": 3.4536149586691094, + "tokens_seen": 1845716992 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022264794383149448, + "loss": 2.6719, + "theoretical_loss": 3.4536045723549567, + "tokens_seen": 1845782528 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022263791374122367, + "loss": 2.6886, + "theoretical_loss": 3.4535941865128237, + "tokens_seen": 1845848064 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022262788365095285, + "loss": 2.3768, + "theoretical_loss": 3.453583801142674, + "tokens_seen": 1845913600 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022261785356068203, + "loss": 2.4015, + "theoretical_loss": 3.453573416244468, + "tokens_seen": 1845979136 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022260782347041124, + "loss": 2.4441, + "theoretical_loss": 3.4535630318181685, + "tokens_seen": 1846044672 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022259779338014042, + "loss": 2.583, + "theoretical_loss": 3.453552647863736, + "tokens_seen": 1846110208 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022258776328986963, + "loss": 2.4628, + "theoretical_loss": 3.4535422643811335, + "tokens_seen": 1846175744 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002225777331995988, + "loss": 2.6101, + "theoretical_loss": 3.4535318813703224, + "tokens_seen": 1846241280 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022256770310932801, + "loss": 2.3975, + "theoretical_loss": 3.4535214988312646, + "tokens_seen": 1846306816 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002225576730190572, + "loss": 2.6896, + "theoretical_loss": 3.453511116763922, + "tokens_seen": 1846372352 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022254764292878638, + "loss": 2.691, + "theoretical_loss": 3.453500735168256, + "tokens_seen": 1846437888 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2081878, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8530571460723877, + "objective/train/theoretical_loss": 3.45349554454729, + "objective/train/tokens_used": 1866930656, + "theoretical_loss": 3.45349554454729, + "tokens_seen": 1846470656 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022253761283851556, + "loss": 2.7306, + "theoretical_loss": 3.453490354044229, + "tokens_seen": 1846503424 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022252758274824474, + "loss": 2.6745, + "theoretical_loss": 3.4534799733918025, + "tokens_seen": 1846568960 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022251755265797395, + "loss": 2.4293, + "theoretical_loss": 3.4534695932109383, + "tokens_seen": 1846634496 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022250752256770313, + "loss": 2.5705, + "theoretical_loss": 3.4534592135015982, + "tokens_seen": 1846700032 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002224974924774323, + "loss": 2.5666, + "theoretical_loss": 3.453448834263744, + "tokens_seen": 1846765568 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002224874623871615, + "loss": 2.6135, + "theoretical_loss": 3.453438455497338, + "tokens_seen": 1846831104 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002224774322968907, + "loss": 2.6434, + "theoretical_loss": 3.453428077202342, + "tokens_seen": 1846896640 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022246740220661988, + "loss": 2.5814, + "theoretical_loss": 3.453417699378717, + "tokens_seen": 1846962176 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022245737211634906, + "loss": 2.4005, + "theoretical_loss": 3.4534073220264263, + "tokens_seen": 1847027712 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022244734202607824, + "loss": 2.521, + "theoretical_loss": 3.4533969451454305, + "tokens_seen": 1847093248 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022243731193580742, + "loss": 2.5671, + "theoretical_loss": 3.453386568735692, + "tokens_seen": 1847158784 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022242728184553663, + "loss": 2.4524, + "theoretical_loss": 3.4533761927971725, + "tokens_seen": 1847224320 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002224172517552658, + "loss": 2.6926, + "theoretical_loss": 3.4533658173298347, + "tokens_seen": 1847289856 + }, + { + "epoch": 6.02, + "learning_rate": 0.000222407221664995, + "loss": 2.5651, + "theoretical_loss": 3.4533554423336392, + "tokens_seen": 1847355392 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022239719157472418, + "loss": 2.7077, + "theoretical_loss": 3.4533450678085487, + "tokens_seen": 1847420928 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022238716148445338, + "loss": 2.5735, + "theoretical_loss": 3.453334693754525, + "tokens_seen": 1847486464 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022237713139418256, + "loss": 2.5319, + "theoretical_loss": 3.4533243201715296, + "tokens_seen": 1847552000 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022236710130391175, + "loss": 2.3341, + "theoretical_loss": 3.4533139470595247, + "tokens_seen": 1847617536 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022235707121364093, + "loss": 2.6912, + "theoretical_loss": 3.4533035744184724, + "tokens_seen": 1847683072 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002223470411233701, + "loss": 2.3704, + "theoretical_loss": 3.453293202248334, + "tokens_seen": 1847748608 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022233701103309932, + "loss": 2.5918, + "theoretical_loss": 3.453282830549073, + "tokens_seen": 1847814144 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002223269809428285, + "loss": 2.3889, + "theoretical_loss": 3.453272459320649, + "tokens_seen": 1847879680 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022231695085255768, + "loss": 2.6129, + "theoretical_loss": 3.453262088563026, + "tokens_seen": 1847945216 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022230692076228686, + "loss": 2.7376, + "theoretical_loss": 3.453251718276164, + "tokens_seen": 1848010752 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022229689067201607, + "loss": 2.4948, + "theoretical_loss": 3.453241348460027, + "tokens_seen": 1848076288 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2082481, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.588986873626709, + "objective/train/theoretical_loss": 3.4532361637284676, + "objective/train/tokens_used": 1868569056, + "theoretical_loss": 3.4532361637284676, + "tokens_seen": 1848109056 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022228686058174525, + "loss": 2.6985, + "theoretical_loss": 3.453230979114575, + "tokens_seen": 1848141824 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022227683049147443, + "loss": 2.6038, + "theoretical_loss": 3.4532206102397716, + "tokens_seen": 1848207360 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002222668004012036, + "loss": 2.5768, + "theoretical_loss": 3.4532102418355777, + "tokens_seen": 1848272896 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002222567703109328, + "loss": 2.5738, + "theoretical_loss": 3.453199873901956, + "tokens_seen": 1848338432 + }, + { + "epoch": 6.02, + "learning_rate": 0.000222246740220662, + "loss": 2.523, + "theoretical_loss": 3.453189506438868, + "tokens_seen": 1848403968 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022223671013039118, + "loss": 2.7306, + "theoretical_loss": 3.4531791394462745, + "tokens_seen": 1848469504 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022222668004012036, + "loss": 2.598, + "theoretical_loss": 3.4531687729241396, + "tokens_seen": 1848535040 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022221664994984954, + "loss": 2.7304, + "theoretical_loss": 3.453158406872425, + "tokens_seen": 1848600576 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022220661985957875, + "loss": 2.7213, + "theoretical_loss": 3.453148041291091, + "tokens_seen": 1848666112 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022219658976930793, + "loss": 2.6957, + "theoretical_loss": 3.4531376761801003, + "tokens_seen": 1848731648 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022218655967903711, + "loss": 2.774, + "theoretical_loss": 3.453127311539416, + "tokens_seen": 1848797184 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002221765295887663, + "loss": 2.4651, + "theoretical_loss": 3.453116947368999, + "tokens_seen": 1848862720 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022216649949849548, + "loss": 2.5329, + "theoretical_loss": 3.4531065836688115, + "tokens_seen": 1848928256 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022215646940822468, + "loss": 2.5363, + "theoretical_loss": 3.4530962204388156, + "tokens_seen": 1848993792 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022214643931795387, + "loss": 2.5849, + "theoretical_loss": 3.4530858576789734, + "tokens_seen": 1849059328 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022213640922768305, + "loss": 2.6085, + "theoretical_loss": 3.4530754953892466, + "tokens_seen": 1849124864 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022212637913741223, + "loss": 2.6634, + "theoretical_loss": 3.4530651335695977, + "tokens_seen": 1849190400 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022211634904714144, + "loss": 2.5977, + "theoretical_loss": 3.4530547722199882, + "tokens_seen": 1849255936 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022210631895687062, + "loss": 2.512, + "theoretical_loss": 3.4530444113403798, + "tokens_seen": 1849321472 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002220962888665998, + "loss": 2.4833, + "theoretical_loss": 3.453034050930736, + "tokens_seen": 1849387008 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022208625877632898, + "loss": 2.5122, + "theoretical_loss": 3.4530236909910172, + "tokens_seen": 1849452544 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022207622868605816, + "loss": 2.5651, + "theoretical_loss": 3.453013331521186, + "tokens_seen": 1849518080 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022206619859578737, + "loss": 2.5483, + "theoretical_loss": 3.453002972521205, + "tokens_seen": 1849583616 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022205616850551655, + "loss": 2.6226, + "theoretical_loss": 3.452992613991036, + "tokens_seen": 1849649152 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022204613841524573, + "loss": 2.277, + "theoretical_loss": 3.4529822559306402, + "tokens_seen": 1849714688 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2083806, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.757309675216675, + "objective/train/theoretical_loss": 3.4529770770765955, + "objective/train/tokens_used": 1870207456, + "theoretical_loss": 3.4529770770765955, + "tokens_seen": 1849747456 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002220361083249749, + "loss": 2.4238, + "theoretical_loss": 3.452971898339981, + "tokens_seen": 1849780224 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022202607823470412, + "loss": 2.85, + "theoretical_loss": 3.452961541219019, + "tokens_seen": 1849845760 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002220160481444333, + "loss": 2.6035, + "theoretical_loss": 3.452951184567717, + "tokens_seen": 1849911296 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022200601805416248, + "loss": 2.8368, + "theoretical_loss": 3.4529408283860374, + "tokens_seen": 1849976832 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022199598796389166, + "loss": 2.5703, + "theoretical_loss": 3.452930472673942, + "tokens_seen": 1850042368 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022198595787362084, + "loss": 2.682, + "theoretical_loss": 3.452920117431393, + "tokens_seen": 1850107904 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022197592778335005, + "loss": 2.6043, + "theoretical_loss": 3.452909762658352, + "tokens_seen": 1850173440 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022196589769307923, + "loss": 2.4143, + "theoretical_loss": 3.4528994083547815, + "tokens_seen": 1850238976 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022195586760280842, + "loss": 2.5922, + "theoretical_loss": 3.4528890545206434, + "tokens_seen": 1850304512 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002219458375125376, + "loss": 2.5717, + "theoretical_loss": 3.4528787011558997, + "tokens_seen": 1850370048 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002219358074222668, + "loss": 2.6433, + "theoretical_loss": 3.452868348260513, + "tokens_seen": 1850435584 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022192577733199599, + "loss": 2.5356, + "theoretical_loss": 3.452857995834445, + "tokens_seen": 1850501120 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022191574724172517, + "loss": 2.4139, + "theoretical_loss": 3.4528476438776576, + "tokens_seen": 1850566656 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022190571715145435, + "loss": 2.6783, + "theoretical_loss": 3.4528372923901136, + "tokens_seen": 1850632192 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022189568706118356, + "loss": 2.5835, + "theoretical_loss": 3.4528269413717743, + "tokens_seen": 1850697728 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022188565697091274, + "loss": 2.7219, + "theoretical_loss": 3.4528165908226023, + "tokens_seen": 1850763264 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022187562688064192, + "loss": 2.6487, + "theoretical_loss": 3.4528062407425595, + "tokens_seen": 1850828800 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002218655967903711, + "loss": 2.6479, + "theoretical_loss": 3.4527958911316086, + "tokens_seen": 1850894336 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022185556670010028, + "loss": 2.4065, + "theoretical_loss": 3.4527855419897104, + "tokens_seen": 1850959872 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022184553660982952, + "loss": 2.6798, + "theoretical_loss": 3.4527751933168287, + "tokens_seen": 1851025408 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002218355065195587, + "loss": 2.6343, + "theoretical_loss": 3.4527648451129247, + "tokens_seen": 1851090944 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022182547642928788, + "loss": 2.527, + "theoretical_loss": 3.4527544973779607, + "tokens_seen": 1851156480 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022181544633901706, + "loss": 2.4499, + "theoretical_loss": 3.452744150111899, + "tokens_seen": 1851222016 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022180541624874627, + "loss": 2.4081, + "theoretical_loss": 3.4527338033147013, + "tokens_seen": 1851287552 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022179538615847545, + "loss": 2.6348, + "theoretical_loss": 3.4527234569863303, + "tokens_seen": 1851353088 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2084481, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.209381580352783, + "objective/train/theoretical_loss": 3.4527182839979425, + "objective/train/tokens_used": 1871845856, + "theoretical_loss": 3.4527182839979425, + "tokens_seen": 1851385856 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022178535606820463, + "loss": 2.5266, + "theoretical_loss": 3.4527131111267475, + "tokens_seen": 1851418624 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002217753259779338, + "loss": 2.4583, + "theoretical_loss": 3.452702765735916, + "tokens_seen": 1851484160 + }, + { + "epoch": 6.02, + "learning_rate": 0.000221765295887663, + "loss": 2.8933, + "theoretical_loss": 3.452692420813797, + "tokens_seen": 1851549696 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002217552657973922, + "loss": 2.754, + "theoretical_loss": 3.4526820763603534, + "tokens_seen": 1851615232 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022174523570712138, + "loss": 2.6322, + "theoretical_loss": 3.4526717323755474, + "tokens_seen": 1851680768 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022173520561685056, + "loss": 2.4298, + "theoretical_loss": 3.4526613888593403, + "tokens_seen": 1851746304 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022172517552657974, + "loss": 2.3246, + "theoretical_loss": 3.4526510458116952, + "tokens_seen": 1851811840 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022171514543630895, + "loss": 2.4987, + "theoretical_loss": 3.452640703232574, + "tokens_seen": 1851877376 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022170511534603813, + "loss": 2.7747, + "theoretical_loss": 3.4526303611219387, + "tokens_seen": 1851942912 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022169508525576731, + "loss": 2.5183, + "theoretical_loss": 3.4526200194797516, + "tokens_seen": 1852008448 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002216850551654965, + "loss": 2.5137, + "theoretical_loss": 3.4526096783059756, + "tokens_seen": 1852073984 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022167502507522568, + "loss": 2.6037, + "theoretical_loss": 3.4525993376005717, + "tokens_seen": 1852139520 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022166499498495488, + "loss": 2.3649, + "theoretical_loss": 3.452588997363503, + "tokens_seen": 1852205056 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022165496489468407, + "loss": 2.5317, + "theoretical_loss": 3.4525786575947315, + "tokens_seen": 1852270592 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022164493480441325, + "loss": 2.5204, + "theoretical_loss": 3.452568318294219, + "tokens_seen": 1852336128 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022163490471414243, + "loss": 2.5212, + "theoretical_loss": 3.452557979461928, + "tokens_seen": 1852401664 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022162487462387164, + "loss": 2.4552, + "theoretical_loss": 3.452547641097821, + "tokens_seen": 1852467200 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022161484453360082, + "loss": 2.5062, + "theoretical_loss": 3.4525373032018605, + "tokens_seen": 1852532736 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022160481444333, + "loss": 2.6679, + "theoretical_loss": 3.452526965774008, + "tokens_seen": 1852598272 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022159478435305918, + "loss": 2.5648, + "theoretical_loss": 3.4525166288142257, + "tokens_seen": 1852663808 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022158475426278836, + "loss": 2.7683, + "theoretical_loss": 3.452506292322477, + "tokens_seen": 1852729344 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022157472417251757, + "loss": 2.7211, + "theoretical_loss": 3.4524959562987227, + "tokens_seen": 1852794880 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022156469408224675, + "loss": 2.3376, + "theoretical_loss": 3.4524856207429258, + "tokens_seen": 1852860416 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022155466399197593, + "loss": 2.6331, + "theoretical_loss": 3.4524752856550482, + "tokens_seen": 1852925952 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002215446339017051, + "loss": 2.6565, + "theoretical_loss": 3.4524649510350525, + "tokens_seen": 1852991488 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2085716, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5095412731170654, + "objective/train/theoretical_loss": 3.452459783900499, + "objective/train/tokens_used": 1873484256, + "theoretical_loss": 3.452459783900499, + "tokens_seen": 1853024256 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022153460381143432, + "loss": 2.599, + "theoretical_loss": 3.4524546168829016, + "tokens_seen": 1853057024 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002215245737211635, + "loss": 2.6512, + "theoretical_loss": 3.4524442831985565, + "tokens_seen": 1853122560 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022151454363089268, + "loss": 2.7322, + "theoretical_loss": 3.4524339499819803, + "tokens_seen": 1853188096 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022150451354062186, + "loss": 2.4098, + "theoretical_loss": 3.4524236172331344, + "tokens_seen": 1853253632 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022149448345035105, + "loss": 2.8494, + "theoretical_loss": 3.4524132849519824, + "tokens_seen": 1853319168 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022148445336008025, + "loss": 2.3552, + "theoretical_loss": 3.4524029531384857, + "tokens_seen": 1853384704 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022147442326980943, + "loss": 2.8899, + "theoretical_loss": 3.452392621792607, + "tokens_seen": 1853450240 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022146439317953862, + "loss": 2.4715, + "theoretical_loss": 3.452382290914308, + "tokens_seen": 1853515776 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002214543630892678, + "loss": 2.6193, + "theoretical_loss": 3.4523719605035517, + "tokens_seen": 1853581312 + }, + { + "epoch": 6.02, + "learning_rate": 0.000221444332998997, + "loss": 2.573, + "theoretical_loss": 3.4523616305603, + "tokens_seen": 1853646848 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022143430290872619, + "loss": 2.7348, + "theoretical_loss": 3.4523513010845157, + "tokens_seen": 1853712384 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022142427281845537, + "loss": 2.5274, + "theoretical_loss": 3.4523409720761604, + "tokens_seen": 1853777920 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022141424272818455, + "loss": 2.5751, + "theoretical_loss": 3.4523306435351975, + "tokens_seen": 1853843456 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022140421263791376, + "loss": 2.57, + "theoretical_loss": 3.452320315461588, + "tokens_seen": 1853908992 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022139418254764294, + "loss": 2.5487, + "theoretical_loss": 3.452309987855295, + "tokens_seen": 1853974528 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022138415245737212, + "loss": 2.6824, + "theoretical_loss": 3.452299660716281, + "tokens_seen": 1854040064 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002213741223671013, + "loss": 2.5695, + "theoretical_loss": 3.452289334044507, + "tokens_seen": 1854105600 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022136409227683048, + "loss": 2.6243, + "theoretical_loss": 3.4522790078399375, + "tokens_seen": 1854171136 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002213540621865597, + "loss": 2.6443, + "theoretical_loss": 3.4522686821025332, + "tokens_seen": 1854236672 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022134403209628887, + "loss": 2.3889, + "theoretical_loss": 3.452258356832257, + "tokens_seen": 1854302208 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022133400200601805, + "loss": 2.704, + "theoretical_loss": 3.452248032029072, + "tokens_seen": 1854367744 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022132397191574723, + "loss": 2.8643, + "theoretical_loss": 3.4522377076929387, + "tokens_seen": 1854433280 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022131394182547644, + "loss": 2.4748, + "theoretical_loss": 3.4522273838238213, + "tokens_seen": 1854498816 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022130391173520562, + "loss": 2.6276, + "theoretical_loss": 3.452217060421681, + "tokens_seen": 1854564352 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002212938816449348, + "loss": 2.6536, + "theoretical_loss": 3.452206737486481, + "tokens_seen": 1854629888 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2086127, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9091062545776367, + "objective/train/theoretical_loss": 3.4522015761939717, + "objective/train/tokens_used": 1875122656, + "theoretical_loss": 3.4522015761939717, + "tokens_seen": 1854662656 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022128385155466398, + "loss": 2.819, + "theoretical_loss": 3.452196415018183, + "tokens_seen": 1854695424 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022127382146439317, + "loss": 2.5658, + "theoretical_loss": 3.45218609301675, + "tokens_seen": 1854760960 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022126379137412237, + "loss": 2.5873, + "theoretical_loss": 3.4521757714821444, + "tokens_seen": 1854826496 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022125376128385155, + "loss": 2.6903, + "theoretical_loss": 3.4521654504143275, + "tokens_seen": 1854892032 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022124373119358074, + "loss": 2.6124, + "theoretical_loss": 3.4521551298132627, + "tokens_seen": 1854957568 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022123370110330992, + "loss": 2.6068, + "theoretical_loss": 3.452144809678913, + "tokens_seen": 1855023104 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022122367101303913, + "loss": 2.5341, + "theoretical_loss": 3.4521344900112387, + "tokens_seen": 1855088640 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002212136409227683, + "loss": 2.4927, + "theoretical_loss": 3.4521241708102046, + "tokens_seen": 1855154176 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002212036108324975, + "loss": 2.5533, + "theoretical_loss": 3.4521138520757715, + "tokens_seen": 1855219712 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022119358074222667, + "loss": 2.6228, + "theoretical_loss": 3.452103533807902, + "tokens_seen": 1855285248 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022118355065195585, + "loss": 2.5705, + "theoretical_loss": 3.4520932160065594, + "tokens_seen": 1855350784 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022117352056168506, + "loss": 2.6192, + "theoretical_loss": 3.4520828986717054, + "tokens_seen": 1855416320 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022116349047141424, + "loss": 2.6248, + "theoretical_loss": 3.4520725818033027, + "tokens_seen": 1855481856 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022115346038114342, + "loss": 2.6298, + "theoretical_loss": 3.4520622654013136, + "tokens_seen": 1855547392 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002211434302908726, + "loss": 2.5985, + "theoretical_loss": 3.4520519494657007, + "tokens_seen": 1855612928 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002211334002006018, + "loss": 2.7232, + "theoretical_loss": 3.4520416339964264, + "tokens_seen": 1855678464 + }, + { + "epoch": 6.02, + "learning_rate": 0.000221123370110331, + "loss": 2.7393, + "theoretical_loss": 3.452031318993453, + "tokens_seen": 1855744000 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022111334002006017, + "loss": 2.6616, + "theoretical_loss": 3.4520210044567428, + "tokens_seen": 1855809536 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022110330992978935, + "loss": 2.6347, + "theoretical_loss": 3.452010690386259, + "tokens_seen": 1855875072 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022109327983951856, + "loss": 2.6287, + "theoretical_loss": 3.452000376781963, + "tokens_seen": 1855940608 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022108324974924777, + "loss": 2.4452, + "theoretical_loss": 3.451990063643818, + "tokens_seen": 1856006144 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022107321965897695, + "loss": 2.5557, + "theoretical_loss": 3.4519797509717867, + "tokens_seen": 1856071680 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022106318956870613, + "loss": 2.3446, + "theoretical_loss": 3.451969438765831, + "tokens_seen": 1856137216 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002210531594784353, + "loss": 2.3379, + "theoretical_loss": 3.4519591270259133, + "tokens_seen": 1856202752 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022104312938816452, + "loss": 2.5711, + "theoretical_loss": 3.4519488157519964, + "tokens_seen": 1856268288 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2087394, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4551119804382324, + "objective/train/theoretical_loss": 3.451943660289777, + "objective/train/tokens_used": 1876761056, + "theoretical_loss": 3.451943660289777, + "tokens_seen": 1856301056 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002210330992978937, + "loss": 2.7922, + "theoretical_loss": 3.451938504944043, + "tokens_seen": 1856333824 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022102306920762288, + "loss": 2.6312, + "theoretical_loss": 3.451928194602016, + "tokens_seen": 1856399360 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022101303911735206, + "loss": 2.4909, + "theoretical_loss": 3.4519178847258765, + "tokens_seen": 1856464896 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022100300902708125, + "loss": 2.8755, + "theoretical_loss": 3.4519075753155875, + "tokens_seen": 1856530432 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022099297893681045, + "loss": 2.3572, + "theoretical_loss": 3.451897266371112, + "tokens_seen": 1856595968 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022098294884653963, + "loss": 2.2535, + "theoretical_loss": 3.4518869578924125, + "tokens_seen": 1856661504 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022097291875626882, + "loss": 2.6896, + "theoretical_loss": 3.451876649879451, + "tokens_seen": 1856727040 + }, + { + "epoch": 6.02, + "learning_rate": 0.000220962888665998, + "loss": 2.5081, + "theoretical_loss": 3.45186634233219, + "tokens_seen": 1856792576 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002209528585757272, + "loss": 2.5772, + "theoretical_loss": 3.451856035250593, + "tokens_seen": 1856858112 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022094282848545639, + "loss": 2.6811, + "theoretical_loss": 3.4518457286346216, + "tokens_seen": 1856923648 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022093279839518557, + "loss": 2.4726, + "theoretical_loss": 3.4518354224842387, + "tokens_seen": 1856989184 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022092276830491475, + "loss": 2.4987, + "theoretical_loss": 3.451825116799406, + "tokens_seen": 1857054720 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022091273821464396, + "loss": 2.7118, + "theoretical_loss": 3.4518148115800873, + "tokens_seen": 1857120256 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022090270812437314, + "loss": 2.529, + "theoretical_loss": 3.451804506826245, + "tokens_seen": 1857185792 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022089267803410232, + "loss": 2.4449, + "theoretical_loss": 3.451794202537841, + "tokens_seen": 1857251328 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002208826479438315, + "loss": 2.7636, + "theoretical_loss": 3.451783898714838, + "tokens_seen": 1857316864 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022087261785356068, + "loss": 2.7027, + "theoretical_loss": 3.4517735953571984, + "tokens_seen": 1857382400 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002208625877632899, + "loss": 2.7712, + "theoretical_loss": 3.451763292464886, + "tokens_seen": 1857447936 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022085255767301907, + "loss": 2.5095, + "theoretical_loss": 3.4517529900378614, + "tokens_seen": 1857513472 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022084252758274825, + "loss": 2.5748, + "theoretical_loss": 3.4517426880760884, + "tokens_seen": 1857579008 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022083249749247743, + "loss": 2.6407, + "theoretical_loss": 3.4517323865795295, + "tokens_seen": 1857644544 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022082246740220664, + "loss": 2.5264, + "theoretical_loss": 3.4517220855481474, + "tokens_seen": 1857710080 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022081243731193582, + "loss": 2.4447, + "theoretical_loss": 3.451711784981904, + "tokens_seen": 1857775616 + }, + { + "epoch": 6.02, + "learning_rate": 0.000220802407221665, + "loss": 2.7567, + "theoretical_loss": 3.451701484880762, + "tokens_seen": 1857841152 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022079237713139418, + "loss": 2.6103, + "theoretical_loss": 3.4516911852446848, + "tokens_seen": 1857906688 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2088078, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.612907886505127, + "objective/train/theoretical_loss": 3.4516860356010337, + "objective/train/tokens_used": 1878399456, + "theoretical_loss": 3.4516860356010337, + "tokens_seen": 1857939456 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022078234704112337, + "loss": 2.5353, + "theoretical_loss": 3.4516808860736345, + "tokens_seen": 1857972224 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022077231695085257, + "loss": 2.6338, + "theoretical_loss": 3.451670587367574, + "tokens_seen": 1858037760 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022076228686058175, + "loss": 2.5677, + "theoretical_loss": 3.4516602891264654, + "tokens_seen": 1858103296 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022075225677031094, + "loss": 2.6914, + "theoretical_loss": 3.451649991350271, + "tokens_seen": 1858168832 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022074222668004012, + "loss": 2.7403, + "theoretical_loss": 3.4516396940389544, + "tokens_seen": 1858234368 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022073219658976933, + "loss": 2.7167, + "theoretical_loss": 3.4516293971924776, + "tokens_seen": 1858299904 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002207221664994985, + "loss": 2.728, + "theoretical_loss": 3.4516191008108033, + "tokens_seen": 1858365440 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002207121364092277, + "loss": 2.6938, + "theoretical_loss": 3.451608804893895, + "tokens_seen": 1858430976 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022070210631895687, + "loss": 2.7009, + "theoretical_loss": 3.4515985094417134, + "tokens_seen": 1858496512 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022069207622868605, + "loss": 2.437, + "theoretical_loss": 3.451588214454223, + "tokens_seen": 1858562048 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022068204613841526, + "loss": 2.6889, + "theoretical_loss": 3.4515779199313856, + "tokens_seen": 1858627584 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022067201604814444, + "loss": 2.4771, + "theoretical_loss": 3.451567625873164, + "tokens_seen": 1858693120 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022066198595787362, + "loss": 2.5668, + "theoretical_loss": 3.4515573322795206, + "tokens_seen": 1858758656 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002206519558676028, + "loss": 2.6494, + "theoretical_loss": 3.451547039150418, + "tokens_seen": 1858824192 + }, + { + "epoch": 6.02, + "learning_rate": 0.000220641925777332, + "loss": 2.6373, + "theoretical_loss": 3.4515367464858198, + "tokens_seen": 1858889728 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002206318956870612, + "loss": 2.6196, + "theoretical_loss": 3.4515264542856876, + "tokens_seen": 1858955264 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022062186559679037, + "loss": 2.5118, + "theoretical_loss": 3.451516162549985, + "tokens_seen": 1859020800 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022061183550651955, + "loss": 2.8588, + "theoretical_loss": 3.4515058712786733, + "tokens_seen": 1859086336 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022060180541624873, + "loss": 2.4043, + "theoretical_loss": 3.4514955804717165, + "tokens_seen": 1859151872 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022059177532597794, + "loss": 2.5331, + "theoretical_loss": 3.4514852901290762, + "tokens_seen": 1859217408 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022058174523570712, + "loss": 2.6115, + "theoretical_loss": 3.451475000250716, + "tokens_seen": 1859282944 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002205717151454363, + "loss": 2.7062, + "theoretical_loss": 3.451464710836599, + "tokens_seen": 1859348480 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022056168505516549, + "loss": 2.4966, + "theoretical_loss": 3.4514544218866865, + "tokens_seen": 1859414016 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002205516549648947, + "loss": 2.5289, + "theoretical_loss": 3.451444133400942, + "tokens_seen": 1859479552 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022054162487462387, + "loss": 2.7116, + "theoretical_loss": 3.4514338453793276, + "tokens_seen": 1859545088 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2088845, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6052680015563965, + "objective/train/theoretical_loss": 3.451428701542558, + "objective/train/tokens_used": 1880037856, + "theoretical_loss": 3.451428701542558, + "tokens_seen": 1859577856 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022053159478435306, + "loss": 2.4975, + "theoretical_loss": 3.451423557821807, + "tokens_seen": 1859610624 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022052156469408224, + "loss": 2.6621, + "theoretical_loss": 3.4514132707283425, + "tokens_seen": 1859676160 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022051153460381142, + "loss": 2.6386, + "theoretical_loss": 3.451402984098896, + "tokens_seen": 1859741696 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022050150451354063, + "loss": 2.7372, + "theoretical_loss": 3.4513926979334313, + "tokens_seen": 1859807232 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002204914744232698, + "loss": 2.7306, + "theoretical_loss": 3.4513824122319106, + "tokens_seen": 1859872768 + }, + { + "epoch": 6.02, + "learning_rate": 0.000220481444332999, + "loss": 2.5577, + "theoretical_loss": 3.451372126994297, + "tokens_seen": 1859938304 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022047141424272817, + "loss": 2.7291, + "theoretical_loss": 3.4513618422205528, + "tokens_seen": 1860003840 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022046138415245738, + "loss": 2.7768, + "theoretical_loss": 3.451351557910641, + "tokens_seen": 1860069376 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022045135406218656, + "loss": 2.5649, + "theoretical_loss": 3.451341274064524, + "tokens_seen": 1860134912 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022044132397191574, + "loss": 2.554, + "theoretical_loss": 3.451330990682165, + "tokens_seen": 1860200448 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022043129388164492, + "loss": 2.6878, + "theoretical_loss": 3.451320707763527, + "tokens_seen": 1860265984 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002204212637913741, + "loss": 2.7237, + "theoretical_loss": 3.451310425308572, + "tokens_seen": 1860331520 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002204112337011033, + "loss": 2.7016, + "theoretical_loss": 3.4513001433172628, + "tokens_seen": 1860397056 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002204012036108325, + "loss": 2.6613, + "theoretical_loss": 3.4512898617895624, + "tokens_seen": 1860462592 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022039117352056167, + "loss": 2.5469, + "theoretical_loss": 3.4512795807254335, + "tokens_seen": 1860528128 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022038114343029085, + "loss": 2.5209, + "theoretical_loss": 3.4512693001248396, + "tokens_seen": 1860593664 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022037111334002006, + "loss": 2.4424, + "theoretical_loss": 3.4512590199877424, + "tokens_seen": 1860659200 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022036108324974924, + "loss": 2.33, + "theoretical_loss": 3.4512487403141052, + "tokens_seen": 1860724736 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022035105315947842, + "loss": 2.6275, + "theoretical_loss": 3.4512384611038907, + "tokens_seen": 1860790272 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022034102306920763, + "loss": 2.6501, + "theoretical_loss": 3.4512281823570614, + "tokens_seen": 1860855808 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022033099297893684, + "loss": 2.6442, + "theoretical_loss": 3.451217904073581, + "tokens_seen": 1860921344 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022032096288866602, + "loss": 2.7135, + "theoretical_loss": 3.4512076262534115, + "tokens_seen": 1860986880 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002203109327983952, + "loss": 2.7164, + "theoretical_loss": 3.4511973488965157, + "tokens_seen": 1861052416 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022030090270812438, + "loss": 2.6334, + "theoretical_loss": 3.4511870720028566, + "tokens_seen": 1861117952 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022029087261785357, + "loss": 2.5449, + "theoretical_loss": 3.451176795572397, + "tokens_seen": 1861183488 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2090374, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.67806077003479, + "objective/train/theoretical_loss": 3.451171657530855, + "objective/train/tokens_used": 1881676256, + "theoretical_loss": 3.451171657530855, + "tokens_seen": 1861216256 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022028084252758277, + "loss": 2.6976, + "theoretical_loss": 3.4511665196051, + "tokens_seen": 1861249024 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022027081243731195, + "loss": 2.5652, + "theoretical_loss": 3.4511562441009276, + "tokens_seen": 1861314560 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022026078234704114, + "loss": 2.5981, + "theoretical_loss": 3.4511459690598434, + "tokens_seen": 1861380096 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022025075225677032, + "loss": 2.5621, + "theoretical_loss": 3.4511356944818097, + "tokens_seen": 1861445632 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022024072216649953, + "loss": 2.6513, + "theoretical_loss": 3.4511254203667905, + "tokens_seen": 1861511168 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002202306920762287, + "loss": 2.5952, + "theoretical_loss": 3.4511151467147467, + "tokens_seen": 1861576704 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002202206619859579, + "loss": 2.5643, + "theoretical_loss": 3.451104873525643, + "tokens_seen": 1861642240 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022021063189568707, + "loss": 2.6939, + "theoretical_loss": 3.4510946007994407, + "tokens_seen": 1861707776 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022020060180541625, + "loss": 2.6769, + "theoretical_loss": 3.451084328536104, + "tokens_seen": 1861773312 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022019057171514546, + "loss": 2.5852, + "theoretical_loss": 3.4510740567355946, + "tokens_seen": 1861838848 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022018054162487464, + "loss": 2.6775, + "theoretical_loss": 3.451063785397876, + "tokens_seen": 1861904384 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022017051153460382, + "loss": 2.7334, + "theoretical_loss": 3.4510535145229113, + "tokens_seen": 1861969920 + }, + { + "epoch": 6.02, + "learning_rate": 0.000220160481444333, + "loss": 2.6328, + "theoretical_loss": 3.4510432441106627, + "tokens_seen": 1862035456 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002201504513540622, + "loss": 2.8487, + "theoretical_loss": 3.4510329741610937, + "tokens_seen": 1862100992 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002201404212637914, + "loss": 2.5657, + "theoretical_loss": 3.451022704674166, + "tokens_seen": 1862166528 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022013039117352057, + "loss": 2.6259, + "theoretical_loss": 3.451012435649844, + "tokens_seen": 1862232064 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022012036108324975, + "loss": 2.853, + "theoretical_loss": 3.45100216708809, + "tokens_seen": 1862297600 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022011033099297893, + "loss": 2.6996, + "theoretical_loss": 3.4509918989888666, + "tokens_seen": 1862363136 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022010030090270814, + "loss": 2.6053, + "theoretical_loss": 3.4509816313521373, + "tokens_seen": 1862428672 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022009027081243732, + "loss": 2.6542, + "theoretical_loss": 3.450971364177864, + "tokens_seen": 1862494208 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002200802407221665, + "loss": 2.6396, + "theoretical_loss": 3.4509610974660103, + "tokens_seen": 1862559744 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022007021063189569, + "loss": 2.5867, + "theoretical_loss": 3.450950831216539, + "tokens_seen": 1862625280 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002200601805416249, + "loss": 2.4579, + "theoretical_loss": 3.450940565429413, + "tokens_seen": 1862690816 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022005015045135408, + "loss": 2.5035, + "theoretical_loss": 3.4509303001045954, + "tokens_seen": 1862756352 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022004012036108326, + "loss": 2.4473, + "theoretical_loss": 3.450920035242049, + "tokens_seen": 1862821888 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 2090879, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.635817766189575, + "objective/train/theoretical_loss": 3.4509149029841155, + "objective/train/tokens_used": 1883314656, + "theoretical_loss": 3.4509149029841155, + "tokens_seen": 1862854656 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022003009027081244, + "loss": 2.6503, + "theoretical_loss": 3.4509097708417364, + "tokens_seen": 1862887424 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022002006018054162, + "loss": 2.6419, + "theoretical_loss": 3.4508995069036206, + "tokens_seen": 1862952960 + }, + { + "epoch": 6.02, + "learning_rate": 0.00022001003009027083, + "loss": 2.6429, + "theoretical_loss": 3.450889243427665, + "tokens_seen": 1863018496 + }, + { + "epoch": 6.03, + "learning_rate": 0.00022, + "loss": 2.5062, + "theoretical_loss": 3.4508789804138322, + "tokens_seen": 1863084032 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002199899699097292, + "loss": 2.3979, + "theoretical_loss": 3.450868717862085, + "tokens_seen": 1863149568 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021997993981945837, + "loss": 2.5716, + "theoretical_loss": 3.4508584557723863, + "tokens_seen": 1863215104 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021996990972918758, + "loss": 2.6394, + "theoretical_loss": 3.4508481941447, + "tokens_seen": 1863280640 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021995987963891676, + "loss": 2.5726, + "theoretical_loss": 3.4508379329789878, + "tokens_seen": 1863346176 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021994984954864594, + "loss": 2.4213, + "theoretical_loss": 3.4508276722752136, + "tokens_seen": 1863411712 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021993981945837512, + "loss": 2.4184, + "theoretical_loss": 3.4508174120333392, + "tokens_seen": 1863477248 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002199297893681043, + "loss": 2.5901, + "theoretical_loss": 3.450807152253329, + "tokens_seen": 1863542784 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002199197592778335, + "loss": 2.7877, + "theoretical_loss": 3.4507968929351445, + "tokens_seen": 1863608320 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002199097291875627, + "loss": 2.5469, + "theoretical_loss": 3.45078663407875, + "tokens_seen": 1863673856 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021989969909729187, + "loss": 2.3507, + "theoretical_loss": 3.4507763756841077, + "tokens_seen": 1863739392 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021988966900702105, + "loss": 2.5926, + "theoretical_loss": 3.450766117751181, + "tokens_seen": 1863804928 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021987963891675026, + "loss": 2.5239, + "theoretical_loss": 3.450755860279932, + "tokens_seen": 1863870464 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021986960882647944, + "loss": 2.6716, + "theoretical_loss": 3.4507456032703248, + "tokens_seen": 1863936000 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021985957873620862, + "loss": 2.6167, + "theoretical_loss": 3.4507353467223223, + "tokens_seen": 1864001536 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002198495486459378, + "loss": 2.8266, + "theoretical_loss": 3.4507250906358866, + "tokens_seen": 1864067072 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021983951855566701, + "loss": 2.5917, + "theoretical_loss": 3.4507148350109818, + "tokens_seen": 1864132608 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002198294884653962, + "loss": 2.7331, + "theoretical_loss": 3.45070457984757, + "tokens_seen": 1864198144 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021981945837512538, + "loss": 2.7618, + "theoretical_loss": 3.450694325145615, + "tokens_seen": 1864263680 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021980942828485456, + "loss": 2.4677, + "theoretical_loss": 3.4506840709050786, + "tokens_seen": 1864329216 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021979939819458374, + "loss": 2.6461, + "theoretical_loss": 3.4506738171259252, + "tokens_seen": 1864394752 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021978936810431295, + "loss": 2.5439, + "theoretical_loss": 3.450663563808117, + "tokens_seen": 1864460288 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2091481, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8836681842803955, + "objective/train/theoretical_loss": 3.450658437322206, + "objective/train/tokens_used": 1884953056, + "theoretical_loss": 3.450658437322206, + "tokens_seen": 1864493056 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021977933801404213, + "loss": 2.7997, + "theoretical_loss": 3.4506533109516173, + "tokens_seen": 1864525824 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002197693079237713, + "loss": 2.8015, + "theoretical_loss": 3.4506430585563885, + "tokens_seen": 1864591360 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002197592778335005, + "loss": 2.5417, + "theoretical_loss": 3.4506328066223952, + "tokens_seen": 1864656896 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002197492477432297, + "loss": 2.2631, + "theoretical_loss": 3.4506225551495993, + "tokens_seen": 1864722432 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021973921765295888, + "loss": 2.6009, + "theoretical_loss": 3.4506123041379633, + "tokens_seen": 1864787968 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021972918756268806, + "loss": 2.7667, + "theoretical_loss": 3.4506020535874518, + "tokens_seen": 1864853504 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021971915747241724, + "loss": 2.5684, + "theoretical_loss": 3.4505918034980265, + "tokens_seen": 1864919040 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021970912738214642, + "loss": 2.664, + "theoretical_loss": 3.450581553869651, + "tokens_seen": 1864984576 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021969909729187563, + "loss": 2.3476, + "theoretical_loss": 3.4505713047022883, + "tokens_seen": 1865050112 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002196890672016048, + "loss": 2.498, + "theoretical_loss": 3.4505610559959017, + "tokens_seen": 1865115648 + }, + { + "epoch": 6.03, + "learning_rate": 0.000219679037111334, + "loss": 2.638, + "theoretical_loss": 3.450550807750454, + "tokens_seen": 1865181184 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021966900702106317, + "loss": 2.6269, + "theoretical_loss": 3.4505405599659085, + "tokens_seen": 1865246720 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021965897693079238, + "loss": 2.7502, + "theoretical_loss": 3.4505303126422273, + "tokens_seen": 1865312256 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021964894684052156, + "loss": 2.4454, + "theoretical_loss": 3.450520065779375, + "tokens_seen": 1865377792 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021963891675025075, + "loss": 2.6718, + "theoretical_loss": 3.450509819377314, + "tokens_seen": 1865443328 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021962888665997993, + "loss": 2.5245, + "theoretical_loss": 3.4504995734360073, + "tokens_seen": 1865508864 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002196188565697091, + "loss": 2.4947, + "theoretical_loss": 3.450489327955418, + "tokens_seen": 1865574400 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021960882647943832, + "loss": 2.6987, + "theoretical_loss": 3.4504790829355096, + "tokens_seen": 1865639936 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002195987963891675, + "loss": 2.539, + "theoretical_loss": 3.4504688383762443, + "tokens_seen": 1865705472 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002195887662988967, + "loss": 2.5409, + "theoretical_loss": 3.4504585942775865, + "tokens_seen": 1865771008 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021957873620862589, + "loss": 2.5604, + "theoretical_loss": 3.4504483506394976, + "tokens_seen": 1865836544 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002195687061183551, + "loss": 2.7907, + "theoretical_loss": 3.4504381074619426, + "tokens_seen": 1865902080 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021955867602808428, + "loss": 2.6194, + "theoretical_loss": 3.450427864744883, + "tokens_seen": 1865967616 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021954864593781346, + "loss": 2.5913, + "theoretical_loss": 3.4504176224882834, + "tokens_seen": 1866033152 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021953861584754264, + "loss": 2.4558, + "theoretical_loss": 3.4504073806921056, + "tokens_seen": 1866098688 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2092867, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.195143699645996, + "objective/train/theoretical_loss": 3.4504022599666637, + "objective/train/tokens_used": 1886591456, + "theoretical_loss": 3.4504022599666637, + "tokens_seen": 1866131456 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021952858575727182, + "loss": 2.7633, + "theoretical_loss": 3.4503971393563138, + "tokens_seen": 1866164224 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021951855566700103, + "loss": 2.4622, + "theoretical_loss": 3.4503868984808705, + "tokens_seen": 1866229760 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002195085255767302, + "loss": 2.7049, + "theoretical_loss": 3.450376658065739, + "tokens_seen": 1866295296 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002194984954864594, + "loss": 2.5285, + "theoretical_loss": 3.4503664181108826, + "tokens_seen": 1866360832 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021948846539618857, + "loss": 2.6655, + "theoretical_loss": 3.450356178616264, + "tokens_seen": 1866426368 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021947843530591778, + "loss": 2.7, + "theoretical_loss": 3.4503459395818465, + "tokens_seen": 1866491904 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021946840521564696, + "loss": 2.7238, + "theoretical_loss": 3.450335701007594, + "tokens_seen": 1866557440 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021945837512537614, + "loss": 2.681, + "theoretical_loss": 3.4503254628934688, + "tokens_seen": 1866622976 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021944834503510532, + "loss": 2.7385, + "theoretical_loss": 3.450315225239434, + "tokens_seen": 1866688512 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002194383149448345, + "loss": 2.4709, + "theoretical_loss": 3.4503049880454535, + "tokens_seen": 1866754048 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002194282848545637, + "loss": 2.6496, + "theoretical_loss": 3.45029475131149, + "tokens_seen": 1866819584 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002194182547642929, + "loss": 2.6883, + "theoretical_loss": 3.4502845150375068, + "tokens_seen": 1866885120 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021940822467402207, + "loss": 2.6864, + "theoretical_loss": 3.4502742792234673, + "tokens_seen": 1866950656 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021939819458375125, + "loss": 2.6737, + "theoretical_loss": 3.450264043869334, + "tokens_seen": 1867016192 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021938816449348046, + "loss": 2.5557, + "theoretical_loss": 3.450253808975071, + "tokens_seen": 1867081728 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021937813440320964, + "loss": 2.7028, + "theoretical_loss": 3.450243574540641, + "tokens_seen": 1867147264 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021936810431293882, + "loss": 2.5823, + "theoretical_loss": 3.450233340566007, + "tokens_seen": 1867212800 + }, + { + "epoch": 6.03, + "learning_rate": 0.000219358074222668, + "loss": 2.4431, + "theoretical_loss": 3.4502231070511327, + "tokens_seen": 1867278336 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021934804413239721, + "loss": 2.6324, + "theoretical_loss": 3.4502128739959805, + "tokens_seen": 1867343872 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002193380140421264, + "loss": 2.6347, + "theoretical_loss": 3.450202641400515, + "tokens_seen": 1867409408 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021932798395185558, + "loss": 2.7464, + "theoretical_loss": 3.450192409264698, + "tokens_seen": 1867474944 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021931795386158476, + "loss": 2.5087, + "theoretical_loss": 3.450182177588493, + "tokens_seen": 1867540480 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021930792377131394, + "loss": 2.4609, + "theoretical_loss": 3.4501719463718645, + "tokens_seen": 1867606016 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021929789368104315, + "loss": 2.5878, + "theoretical_loss": 3.450161715614774, + "tokens_seen": 1867671552 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021928786359077233, + "loss": 2.5072, + "theoretical_loss": 3.450151485317186, + "tokens_seen": 1867737088 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2093933, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.46816086769104, + "objective/train/theoretical_loss": 3.4501463703406934, + "objective/train/tokens_used": 1888229856, + "theoretical_loss": 3.4501463703406934, + "tokens_seen": 1867769856 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002192778335005015, + "loss": 2.6087, + "theoretical_loss": 3.450141255479063, + "tokens_seen": 1867802624 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002192678034102307, + "loss": 2.5017, + "theoretical_loss": 3.450131026100369, + "tokens_seen": 1867868160 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002192577733199599, + "loss": 2.6548, + "theoretical_loss": 3.450120797181066, + "tokens_seen": 1867933696 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021924774322968908, + "loss": 2.3443, + "theoretical_loss": 3.450110568721118, + "tokens_seen": 1867999232 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021923771313941826, + "loss": 2.7162, + "theoretical_loss": 3.4501003407204887, + "tokens_seen": 1868064768 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021922768304914744, + "loss": 2.5374, + "theoretical_loss": 3.4500901131791406, + "tokens_seen": 1868130304 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021921765295887662, + "loss": 2.5814, + "theoretical_loss": 3.450079886097037, + "tokens_seen": 1868195840 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021920762286860583, + "loss": 2.6167, + "theoretical_loss": 3.4500696594741425, + "tokens_seen": 1868261376 + }, + { + "epoch": 6.03, + "learning_rate": 0.000219197592778335, + "loss": 2.7094, + "theoretical_loss": 3.4500594333104186, + "tokens_seen": 1868326912 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002191875626880642, + "loss": 2.4439, + "theoretical_loss": 3.4500492076058293, + "tokens_seen": 1868392448 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021917753259779337, + "loss": 2.7402, + "theoretical_loss": 3.450038982360338, + "tokens_seen": 1868457984 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021916750250752258, + "loss": 2.6974, + "theoretical_loss": 3.450028757573908, + "tokens_seen": 1868523520 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021915747241725176, + "loss": 2.5215, + "theoretical_loss": 3.450018533246502, + "tokens_seen": 1868589056 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021914744232698095, + "loss": 2.7397, + "theoretical_loss": 3.4500083093780844, + "tokens_seen": 1868654592 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021913741223671013, + "loss": 2.6327, + "theoretical_loss": 3.4499980859686175, + "tokens_seen": 1868720128 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002191273821464393, + "loss": 2.6278, + "theoretical_loss": 3.449987863018065, + "tokens_seen": 1868785664 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021911735205616852, + "loss": 2.7101, + "theoretical_loss": 3.4499776405263898, + "tokens_seen": 1868851200 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002191073219658977, + "loss": 2.7024, + "theoretical_loss": 3.4499674184935563, + "tokens_seen": 1868916736 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021909729187562688, + "loss": 2.6868, + "theoretical_loss": 3.4499571969195264, + "tokens_seen": 1868982272 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021908726178535606, + "loss": 2.5188, + "theoretical_loss": 3.4499469758042647, + "tokens_seen": 1869047808 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021907723169508527, + "loss": 2.469, + "theoretical_loss": 3.449936755147734, + "tokens_seen": 1869113344 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021906720160481445, + "loss": 2.935, + "theoretical_loss": 3.449926534949897, + "tokens_seen": 1869178880 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021905717151454363, + "loss": 2.7407, + "theoretical_loss": 3.449916315210718, + "tokens_seen": 1869244416 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002190471414242728, + "loss": 2.8614, + "theoretical_loss": 3.4499060959301593, + "tokens_seen": 1869309952 + }, + { + "epoch": 6.03, + "learning_rate": 0.000219037111334002, + "loss": 2.6117, + "theoretical_loss": 3.449895877108186, + "tokens_seen": 1869375488 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2094422, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3558056354522705, + "objective/train/theoretical_loss": 3.449890767869156, + "objective/train/tokens_used": 1889868256, + "theoretical_loss": 3.449890767869156, + "tokens_seen": 1869408256 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002190270812437312, + "loss": 2.4567, + "theoretical_loss": 3.4498856587447593, + "tokens_seen": 1869441024 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021901705115346038, + "loss": 2.6374, + "theoretical_loss": 3.4498754408398438, + "tokens_seen": 1869506560 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021900702106318956, + "loss": 2.4596, + "theoretical_loss": 3.449865223393403, + "tokens_seen": 1869572096 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021899699097291874, + "loss": 2.6354, + "theoretical_loss": 3.449855006405399, + "tokens_seen": 1869637632 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021898696088264795, + "loss": 2.8103, + "theoretical_loss": 3.4498447898757973, + "tokens_seen": 1869703168 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021897693079237713, + "loss": 2.8947, + "theoretical_loss": 3.4498345738045595, + "tokens_seen": 1869768704 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021896690070210631, + "loss": 2.6436, + "theoretical_loss": 3.4498243581916492, + "tokens_seen": 1869834240 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002189568706118355, + "loss": 2.7694, + "theoretical_loss": 3.44981414303703, + "tokens_seen": 1869899776 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021894684052156468, + "loss": 2.6779, + "theoretical_loss": 3.449803928340666, + "tokens_seen": 1869965312 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021893681043129388, + "loss": 2.7725, + "theoretical_loss": 3.449793714102519, + "tokens_seen": 1870030848 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021892678034102307, + "loss": 3.0278, + "theoretical_loss": 3.449783500322554, + "tokens_seen": 1870096384 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021891675025075225, + "loss": 2.6411, + "theoretical_loss": 3.449773287000734, + "tokens_seen": 1870161920 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021890672016048143, + "loss": 2.5877, + "theoretical_loss": 3.449763074137021, + "tokens_seen": 1870227456 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021889669007021064, + "loss": 2.6925, + "theoretical_loss": 3.4497528617313806, + "tokens_seen": 1870292992 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021888665997993982, + "loss": 2.5556, + "theoretical_loss": 3.449742649783774, + "tokens_seen": 1870358528 + }, + { + "epoch": 6.03, + "learning_rate": 0.000218876629889669, + "loss": 2.6536, + "theoretical_loss": 3.4497324382941668, + "tokens_seen": 1870424064 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021886659979939818, + "loss": 2.7269, + "theoretical_loss": 3.4497222272625208, + "tokens_seen": 1870489600 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021885656970912736, + "loss": 2.712, + "theoretical_loss": 3.4497120166887996, + "tokens_seen": 1870555136 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021884653961885657, + "loss": 2.766, + "theoretical_loss": 3.4497018065729677, + "tokens_seen": 1870620672 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021883650952858578, + "loss": 2.8054, + "theoretical_loss": 3.4496915969149873, + "tokens_seen": 1870686208 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021882647943831496, + "loss": 2.493, + "theoretical_loss": 3.4496813877148225, + "tokens_seen": 1870751744 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021881644934804414, + "loss": 2.6676, + "theoretical_loss": 3.4496711789724364, + "tokens_seen": 1870817280 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021880641925777335, + "loss": 2.5931, + "theoretical_loss": 3.449660970687793, + "tokens_seen": 1870882816 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021879638916750253, + "loss": 2.3668, + "theoretical_loss": 3.4496507628608546, + "tokens_seen": 1870948352 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002187863590772317, + "loss": 2.5647, + "theoretical_loss": 3.449640555491586, + "tokens_seen": 1871013888 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2095376, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.416127920150757, + "objective/train/theoretical_loss": 3.449635451978566, + "objective/train/tokens_used": 1891506656, + "theoretical_loss": 3.449635451978566, + "tokens_seen": 1871046656 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002187763289869609, + "loss": 2.4298, + "theoretical_loss": 3.4496303485799498, + "tokens_seen": 1871079424 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002187662988966901, + "loss": 2.7766, + "theoretical_loss": 3.4496201421259096, + "tokens_seen": 1871144960 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021875626880641928, + "loss": 2.6045, + "theoretical_loss": 3.449609936129429, + "tokens_seen": 1871210496 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021874623871614846, + "loss": 2.7574, + "theoretical_loss": 3.4495997305904718, + "tokens_seen": 1871276032 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021873620862587764, + "loss": 2.5679, + "theoretical_loss": 3.4495895255090003, + "tokens_seen": 1871341568 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021872617853560682, + "loss": 2.6389, + "theoretical_loss": 3.449579320884979, + "tokens_seen": 1871407104 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021871614844533603, + "loss": 2.5938, + "theoretical_loss": 3.4495691167183713, + "tokens_seen": 1871472640 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002187061183550652, + "loss": 2.703, + "theoretical_loss": 3.449558913009141, + "tokens_seen": 1871538176 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002186960882647944, + "loss": 2.5896, + "theoretical_loss": 3.4495487097572504, + "tokens_seen": 1871603712 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021868605817452357, + "loss": 2.914, + "theoretical_loss": 3.4495385069626634, + "tokens_seen": 1871669248 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021867602808425278, + "loss": 2.8884, + "theoretical_loss": 3.4495283046253444, + "tokens_seen": 1871734784 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021866599799398196, + "loss": 2.6977, + "theoretical_loss": 3.449518102745256, + "tokens_seen": 1871800320 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021865596790371115, + "loss": 2.6366, + "theoretical_loss": 3.449507901322362, + "tokens_seen": 1871865856 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021864593781344033, + "loss": 2.6112, + "theoretical_loss": 3.449497700356626, + "tokens_seen": 1871931392 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002186359077231695, + "loss": 2.763, + "theoretical_loss": 3.4494874998480114, + "tokens_seen": 1871996928 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021862587763289872, + "loss": 2.5918, + "theoretical_loss": 3.4494772997964818, + "tokens_seen": 1872062464 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002186158475426279, + "loss": 2.8566, + "theoretical_loss": 3.449467100202, + "tokens_seen": 1872128000 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021860581745235708, + "loss": 2.7066, + "theoretical_loss": 3.4494569010645306, + "tokens_seen": 1872193536 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021859578736208626, + "loss": 2.7591, + "theoretical_loss": 3.4494467023840363, + "tokens_seen": 1872259072 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021858575727181547, + "loss": 2.5281, + "theoretical_loss": 3.4494365041604813, + "tokens_seen": 1872324608 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021857572718154465, + "loss": 2.7388, + "theoretical_loss": 3.449426306393829, + "tokens_seen": 1872390144 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021856569709127383, + "loss": 2.5501, + "theoretical_loss": 3.4494161090840425, + "tokens_seen": 1872455680 + }, + { + "epoch": 6.03, + "learning_rate": 0.000218555667001003, + "loss": 2.6282, + "theoretical_loss": 3.449405912231086, + "tokens_seen": 1872521216 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002185456369107322, + "loss": 2.7274, + "theoretical_loss": 3.449395715834922, + "tokens_seen": 1872586752 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002185356068204614, + "loss": 2.7982, + "theoretical_loss": 3.4493855198955146, + "tokens_seen": 1872652288 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2095376, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8873865604400635, + "objective/train/theoretical_loss": 3.4493804220970836, + "objective/train/tokens_used": 1893145056, + "theoretical_loss": 3.4493804220970836, + "tokens_seen": 1872685056 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021852557673019058, + "loss": 2.6897, + "theoretical_loss": 3.449375324412828, + "tokens_seen": 1872717824 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021851554663991976, + "loss": 2.6498, + "theoretical_loss": 3.449365129386825, + "tokens_seen": 1872783360 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021850551654964894, + "loss": 2.5187, + "theoretical_loss": 3.4493549348174692, + "tokens_seen": 1872848896 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021849548645937815, + "loss": 2.5388, + "theoretical_loss": 3.4493447407047246, + "tokens_seen": 1872914432 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021848545636910733, + "loss": 2.6437, + "theoretical_loss": 3.4493345470485544, + "tokens_seen": 1872979968 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021847542627883651, + "loss": 2.6538, + "theoretical_loss": 3.4493243538489224, + "tokens_seen": 1873045504 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002184653961885657, + "loss": 2.7006, + "theoretical_loss": 3.449314161105792, + "tokens_seen": 1873111040 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021845536609829488, + "loss": 2.6761, + "theoretical_loss": 3.449303968819127, + "tokens_seen": 1873176576 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021844533600802408, + "loss": 2.6639, + "theoretical_loss": 3.449293776988891, + "tokens_seen": 1873242112 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021843530591775327, + "loss": 2.6409, + "theoretical_loss": 3.449283585615047, + "tokens_seen": 1873307648 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021842527582748245, + "loss": 2.6676, + "theoretical_loss": 3.4492733946975593, + "tokens_seen": 1873373184 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021841524573721163, + "loss": 2.7923, + "theoretical_loss": 3.4492632042363915, + "tokens_seen": 1873438720 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021840521564694084, + "loss": 2.838, + "theoretical_loss": 3.4492530142315063, + "tokens_seen": 1873504256 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021839518555667002, + "loss": 2.5305, + "theoretical_loss": 3.4492428246828686, + "tokens_seen": 1873569792 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002183851554663992, + "loss": 2.8065, + "theoretical_loss": 3.449232635590441, + "tokens_seen": 1873635328 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021837512537612838, + "loss": 2.6473, + "theoretical_loss": 3.4492224469541872, + "tokens_seen": 1873700864 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021836509528585756, + "loss": 2.4994, + "theoretical_loss": 3.4492122587740717, + "tokens_seen": 1873766400 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021835506519558677, + "loss": 2.679, + "theoretical_loss": 3.449202071050057, + "tokens_seen": 1873831936 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021834503510531595, + "loss": 2.8434, + "theoretical_loss": 3.449191883782108, + "tokens_seen": 1873897472 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021833500501504513, + "loss": 2.7586, + "theoretical_loss": 3.449181696970187, + "tokens_seen": 1873963008 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002183249749247743, + "loss": 2.8276, + "theoretical_loss": 3.449171510614258, + "tokens_seen": 1874028544 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021831494483450352, + "loss": 2.717, + "theoretical_loss": 3.449161324714286, + "tokens_seen": 1874094080 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002183049147442327, + "loss": 2.8928, + "theoretical_loss": 3.4491511392702323, + "tokens_seen": 1874159616 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021829488465396188, + "loss": 2.8466, + "theoretical_loss": 3.449140954282063, + "tokens_seen": 1874225152 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021828485456369106, + "loss": 2.6807, + "theoretical_loss": 3.4491307697497398, + "tokens_seen": 1874290688 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2096132, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6751327514648438, + "objective/train/theoretical_loss": 3.449125677654509, + "objective/train/tokens_used": 1894783456, + "theoretical_loss": 3.449125677654509, + "tokens_seen": 1874323456 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021827482447342024, + "loss": 2.7941, + "theoretical_loss": 3.449120585673227, + "tokens_seen": 1874356224 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021826479438314945, + "loss": 2.651, + "theoretical_loss": 3.4491104020524883, + "tokens_seen": 1874421760 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021825476429287863, + "loss": 2.7943, + "theoretical_loss": 3.4491002188874877, + "tokens_seen": 1874487296 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021824473420260782, + "loss": 2.8192, + "theoretical_loss": 3.4490900361781884, + "tokens_seen": 1874552832 + }, + { + "epoch": 6.03, + "learning_rate": 0.000218234704112337, + "loss": 2.9578, + "theoretical_loss": 3.4490798539245544, + "tokens_seen": 1874618368 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002182246740220662, + "loss": 2.7774, + "theoretical_loss": 3.4490696721265497, + "tokens_seen": 1874683904 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021821464393179539, + "loss": 3.0276, + "theoretical_loss": 3.449059490784137, + "tokens_seen": 1874749440 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021820461384152457, + "loss": 2.692, + "theoretical_loss": 3.4490493098972808, + "tokens_seen": 1874814976 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021819458375125375, + "loss": 2.7241, + "theoretical_loss": 3.4490391294659446, + "tokens_seen": 1874880512 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021818455366098296, + "loss": 2.8494, + "theoretical_loss": 3.4490289494900916, + "tokens_seen": 1874946048 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021817452357071214, + "loss": 2.6784, + "theoretical_loss": 3.449018769969686, + "tokens_seen": 1875011584 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021816449348044132, + "loss": 2.5005, + "theoretical_loss": 3.4490085909046915, + "tokens_seen": 1875077120 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002181544633901705, + "loss": 2.7137, + "theoretical_loss": 3.4489984122950723, + "tokens_seen": 1875142656 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021814443329989968, + "loss": 2.8095, + "theoretical_loss": 3.448988234140791, + "tokens_seen": 1875208192 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002181344032096289, + "loss": 2.7387, + "theoretical_loss": 3.448978056441812, + "tokens_seen": 1875273728 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021812437311935807, + "loss": 2.6629, + "theoretical_loss": 3.448967879198099, + "tokens_seen": 1875339264 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021811434302908725, + "loss": 2.5509, + "theoretical_loss": 3.448957702409615, + "tokens_seen": 1875404800 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021810431293881643, + "loss": 2.9283, + "theoretical_loss": 3.448947526076325, + "tokens_seen": 1875470336 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021809428284854564, + "loss": 2.7025, + "theoretical_loss": 3.448937350198192, + "tokens_seen": 1875535872 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021808425275827485, + "loss": 3.0127, + "theoretical_loss": 3.4489271747751795, + "tokens_seen": 1875601408 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021807422266800403, + "loss": 2.8624, + "theoretical_loss": 3.448916999807252, + "tokens_seen": 1875666944 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002180641925777332, + "loss": 2.752, + "theoretical_loss": 3.4489068252943724, + "tokens_seen": 1875732480 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002180541624874624, + "loss": 2.837, + "theoretical_loss": 3.448896651236505, + "tokens_seen": 1875798016 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002180441323971916, + "loss": 2.7392, + "theoretical_loss": 3.4488864776336143, + "tokens_seen": 1875863552 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021803410230692078, + "loss": 2.8056, + "theoretical_loss": 3.448876304485662, + "tokens_seen": 1875929088 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2096946, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6697466373443604, + "objective/train/theoretical_loss": 3.448871218082277, + "objective/train/tokens_used": 1896421856, + "theoretical_loss": 3.448871218082277, + "tokens_seen": 1875961856 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021802407221664996, + "loss": 2.6196, + "theoretical_loss": 3.448866131792614, + "tokens_seen": 1875994624 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021801404212637914, + "loss": 2.8982, + "theoretical_loss": 3.4488559595544324, + "tokens_seen": 1876060160 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021800401203610835, + "loss": 2.8282, + "theoretical_loss": 3.4488457877710816, + "tokens_seen": 1876125696 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021799398194583753, + "loss": 2.6268, + "theoretical_loss": 3.448835616442526, + "tokens_seen": 1876191232 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021798395185556671, + "loss": 2.9423, + "theoretical_loss": 3.4488254455687284, + "tokens_seen": 1876256768 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002179739217652959, + "loss": 2.551, + "theoretical_loss": 3.448815275149653, + "tokens_seen": 1876322304 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021796389167502508, + "loss": 2.8651, + "theoretical_loss": 3.448805105185264, + "tokens_seen": 1876387840 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021795386158475428, + "loss": 2.6894, + "theoretical_loss": 3.448794935675525, + "tokens_seen": 1876453376 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021794383149448347, + "loss": 2.6958, + "theoretical_loss": 3.448784766620399, + "tokens_seen": 1876518912 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021793380140421265, + "loss": 2.7127, + "theoretical_loss": 3.4487745980198508, + "tokens_seen": 1876584448 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021792377131394183, + "loss": 2.6632, + "theoretical_loss": 3.448764429873844, + "tokens_seen": 1876649984 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021791374122367104, + "loss": 2.7466, + "theoretical_loss": 3.4487542621823417, + "tokens_seen": 1876715520 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021790371113340022, + "loss": 2.6712, + "theoretical_loss": 3.448744094945308, + "tokens_seen": 1876781056 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002178936810431294, + "loss": 2.9449, + "theoretical_loss": 3.4487339281627074, + "tokens_seen": 1876846592 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021788365095285858, + "loss": 2.5916, + "theoretical_loss": 3.4487237618345037, + "tokens_seen": 1876912128 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021787362086258776, + "loss": 2.6672, + "theoretical_loss": 3.44871359596066, + "tokens_seen": 1876977664 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021786359077231697, + "loss": 2.8205, + "theoretical_loss": 3.44870343054114, + "tokens_seen": 1877043200 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021785356068204615, + "loss": 2.7248, + "theoretical_loss": 3.4486932655759084, + "tokens_seen": 1877108736 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021784353059177533, + "loss": 2.9048, + "theoretical_loss": 3.448683101064929, + "tokens_seen": 1877174272 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002178335005015045, + "loss": 2.5472, + "theoretical_loss": 3.448672937008164, + "tokens_seen": 1877239808 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021782347041123372, + "loss": 2.7651, + "theoretical_loss": 3.4486627734055793, + "tokens_seen": 1877305344 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002178134403209629, + "loss": 2.7667, + "theoretical_loss": 3.448652610257138, + "tokens_seen": 1877370880 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021780341023069208, + "loss": 2.7346, + "theoretical_loss": 3.4486424475628037, + "tokens_seen": 1877436416 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021779338014042126, + "loss": 2.7683, + "theoretical_loss": 3.4486322853225406, + "tokens_seen": 1877501952 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021778335005015044, + "loss": 2.8853, + "theoretical_loss": 3.448622123536312, + "tokens_seen": 1877567488 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2098205, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5736496448516846, + "objective/train/theoretical_loss": 3.44861704281345, + "objective/train/tokens_used": 1898060256, + "theoretical_loss": 3.44861704281345, + "tokens_seen": 1877600256 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021777331995987965, + "loss": 3.0126, + "theoretical_loss": 3.448611962204083, + "tokens_seen": 1877633024 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021776328986960883, + "loss": 2.9417, + "theoretical_loss": 3.4486018013258164, + "tokens_seen": 1877698560 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021775325977933802, + "loss": 2.7457, + "theoretical_loss": 3.4485916409014763, + "tokens_seen": 1877764096 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002177432296890672, + "loss": 2.8446, + "theoretical_loss": 3.448581480931026, + "tokens_seen": 1877829632 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002177331995987964, + "loss": 2.7971, + "theoretical_loss": 3.4485713214144305, + "tokens_seen": 1877895168 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021772316950852559, + "loss": 2.7983, + "theoretical_loss": 3.448561162351653, + "tokens_seen": 1877960704 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021771313941825477, + "loss": 2.8111, + "theoretical_loss": 3.448551003742658, + "tokens_seen": 1878026240 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021770310932798395, + "loss": 2.8381, + "theoretical_loss": 3.4485408455874085, + "tokens_seen": 1878091776 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021769307923771316, + "loss": 2.7345, + "theoretical_loss": 3.4485306878858695, + "tokens_seen": 1878157312 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021768304914744234, + "loss": 2.6874, + "theoretical_loss": 3.448520530638004, + "tokens_seen": 1878222848 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021767301905717152, + "loss": 2.7712, + "theoretical_loss": 3.448510373843776, + "tokens_seen": 1878288384 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002176629889669007, + "loss": 2.6724, + "theoretical_loss": 3.44850021750315, + "tokens_seen": 1878353920 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021765295887662988, + "loss": 2.7886, + "theoretical_loss": 3.4484900616160887, + "tokens_seen": 1878419456 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002176429287863591, + "loss": 2.7713, + "theoretical_loss": 3.4484799061825573, + "tokens_seen": 1878484992 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021763289869608827, + "loss": 2.7665, + "theoretical_loss": 3.4484697512025195, + "tokens_seen": 1878550528 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021762286860581745, + "loss": 2.673, + "theoretical_loss": 3.4484595966759386, + "tokens_seen": 1878616064 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021761283851554663, + "loss": 2.6171, + "theoretical_loss": 3.4484494426027794, + "tokens_seen": 1878681600 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021760280842527584, + "loss": 2.9394, + "theoretical_loss": 3.448439288983005, + "tokens_seen": 1878747136 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021759277833500502, + "loss": 2.8006, + "theoretical_loss": 3.44842913581658, + "tokens_seen": 1878812672 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002175827482447342, + "loss": 2.6499, + "theoretical_loss": 3.448418983103468, + "tokens_seen": 1878878208 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021757271815446338, + "loss": 2.8846, + "theoretical_loss": 3.4484088308436327, + "tokens_seen": 1878943744 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021756268806419257, + "loss": 2.3471, + "theoretical_loss": 3.4483986790370387, + "tokens_seen": 1879009280 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021755265797392177, + "loss": 2.7033, + "theoretical_loss": 3.4483885276836492, + "tokens_seen": 1879074816 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021754262788365095, + "loss": 2.8769, + "theoretical_loss": 3.4483783767834293, + "tokens_seen": 1879140352 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021753259779338014, + "loss": 2.856, + "theoretical_loss": 3.448368226336342, + "tokens_seen": 1879205888 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2098773, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5630502700805664, + "objective/train/theoretical_loss": 3.4483631512827113, + "objective/train/tokens_used": 1899698656, + "theoretical_loss": 3.4483631512827113, + "tokens_seen": 1879238656 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021752256770310932, + "loss": 2.7307, + "theoretical_loss": 3.448358076342351, + "tokens_seen": 1879271424 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021751253761283852, + "loss": 2.7454, + "theoretical_loss": 3.4483479268014214, + "tokens_seen": 1879336960 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002175025075225677, + "loss": 2.7834, + "theoretical_loss": 3.4483377777135162, + "tokens_seen": 1879402496 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002174924774322969, + "loss": 2.875, + "theoretical_loss": 3.4483276290786, + "tokens_seen": 1879468032 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021748244734202607, + "loss": 2.5481, + "theoretical_loss": 3.448317480896636, + "tokens_seen": 1879533568 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021747241725175525, + "loss": 2.8219, + "theoretical_loss": 3.4483073331675893, + "tokens_seen": 1879599104 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021746238716148446, + "loss": 2.6741, + "theoretical_loss": 3.4482971858914233, + "tokens_seen": 1879664640 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021745235707121364, + "loss": 2.8378, + "theoretical_loss": 3.448287039068102, + "tokens_seen": 1879730176 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021744232698094282, + "loss": 2.6456, + "theoretical_loss": 3.4482768926975895, + "tokens_seen": 1879795712 + }, + { + "epoch": 6.03, + "learning_rate": 0.000217432296890672, + "loss": 2.8128, + "theoretical_loss": 3.4482667467798493, + "tokens_seen": 1879861248 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002174222668004012, + "loss": 2.6783, + "theoretical_loss": 3.4482566013148466, + "tokens_seen": 1879926784 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002174122367101304, + "loss": 2.8526, + "theoretical_loss": 3.4482464563025435, + "tokens_seen": 1879992320 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021740220661985957, + "loss": 2.7022, + "theoretical_loss": 3.4482363117429062, + "tokens_seen": 1880057856 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021739217652958875, + "loss": 2.4535, + "theoretical_loss": 3.448226167635897, + "tokens_seen": 1880123392 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021738214643931793, + "loss": 2.5998, + "theoretical_loss": 3.448216023981481, + "tokens_seen": 1880188928 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021737211634904714, + "loss": 2.8056, + "theoretical_loss": 3.4482058807796223, + "tokens_seen": 1880254464 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021736208625877632, + "loss": 2.7433, + "theoretical_loss": 3.448195738030284, + "tokens_seen": 1880320000 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002173520561685055, + "loss": 2.8187, + "theoretical_loss": 3.448185595733431, + "tokens_seen": 1880385536 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002173420260782347, + "loss": 2.8001, + "theoretical_loss": 3.448175453889027, + "tokens_seen": 1880451072 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021733199598796392, + "loss": 2.68, + "theoretical_loss": 3.4481653124970357, + "tokens_seen": 1880516608 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002173219658976931, + "loss": 2.894, + "theoretical_loss": 3.4481551715574215, + "tokens_seen": 1880582144 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021731193580742228, + "loss": 2.8076, + "theoretical_loss": 3.4481450310701485, + "tokens_seen": 1880647680 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021730190571715146, + "loss": 2.7539, + "theoretical_loss": 3.4481348910351812, + "tokens_seen": 1880713216 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021729187562688065, + "loss": 2.8409, + "theoretical_loss": 3.4481247514524824, + "tokens_seen": 1880778752 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021728184553660985, + "loss": 2.786, + "theoretical_loss": 3.4481146123220174, + "tokens_seen": 1880844288 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2100224, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.632192850112915, + "objective/train/theoretical_loss": 3.4481095429263613, + "objective/train/tokens_used": 1901337056, + "theoretical_loss": 3.4481095429263613, + "tokens_seen": 1880877056 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021727181544633903, + "loss": 2.8171, + "theoretical_loss": 3.44810447364375, + "tokens_seen": 1880909824 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021726178535606822, + "loss": 2.657, + "theoretical_loss": 3.448094335417644, + "tokens_seen": 1880975360 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002172517552657974, + "loss": 2.7805, + "theoretical_loss": 3.4480841976436634, + "tokens_seen": 1881040896 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002172417251755266, + "loss": 2.8164, + "theoretical_loss": 3.4480740603217725, + "tokens_seen": 1881106432 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021723169508525579, + "loss": 2.6811, + "theoretical_loss": 3.4480639234519352, + "tokens_seen": 1881171968 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021722166499498497, + "loss": 2.8833, + "theoretical_loss": 3.448053787034116, + "tokens_seen": 1881237504 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021721163490471415, + "loss": 2.8292, + "theoretical_loss": 3.4480436510682786, + "tokens_seen": 1881303040 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021720160481444336, + "loss": 2.8158, + "theoretical_loss": 3.4480335155543873, + "tokens_seen": 1881368576 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021719157472417254, + "loss": 2.7431, + "theoretical_loss": 3.4480233804924065, + "tokens_seen": 1881434112 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021718154463390172, + "loss": 2.5308, + "theoretical_loss": 3.4480132458822994, + "tokens_seen": 1881499648 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002171715145436309, + "loss": 2.5433, + "theoretical_loss": 3.448003111724031, + "tokens_seen": 1881565184 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021716148445336008, + "loss": 2.7628, + "theoretical_loss": 3.447992978017565, + "tokens_seen": 1881630720 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002171514543630893, + "loss": 2.5813, + "theoretical_loss": 3.447982844762866, + "tokens_seen": 1881696256 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021714142427281847, + "loss": 2.8402, + "theoretical_loss": 3.4479727119598973, + "tokens_seen": 1881761792 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021713139418254765, + "loss": 2.6202, + "theoretical_loss": 3.4479625796086237, + "tokens_seen": 1881827328 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021712136409227683, + "loss": 2.7323, + "theoretical_loss": 3.447952447709009, + "tokens_seen": 1881892864 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021711133400200604, + "loss": 2.6208, + "theoretical_loss": 3.4479423162610177, + "tokens_seen": 1881958400 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021710130391173522, + "loss": 2.7757, + "theoretical_loss": 3.447932185264613, + "tokens_seen": 1882023936 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002170912738214644, + "loss": 2.5247, + "theoretical_loss": 3.4479220547197604, + "tokens_seen": 1882089472 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021708124373119358, + "loss": 2.6837, + "theoretical_loss": 3.4479119246264234, + "tokens_seen": 1882155008 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021707121364092277, + "loss": 2.7516, + "theoretical_loss": 3.447901794984566, + "tokens_seen": 1882220544 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021706118355065197, + "loss": 3.0223, + "theoretical_loss": 3.4478916657941525, + "tokens_seen": 1882286080 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021705115346038115, + "loss": 2.9941, + "theoretical_loss": 3.4478815370551468, + "tokens_seen": 1882351616 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021704112337011034, + "loss": 2.8184, + "theoretical_loss": 3.447871408767514, + "tokens_seen": 1882417152 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021703109327983952, + "loss": 2.5759, + "theoretical_loss": 3.4478612809312175, + "tokens_seen": 1882482688 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2103085, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.81003999710083, + "objective/train/theoretical_loss": 3.447856217182309, + "objective/train/tokens_used": 1902975456, + "theoretical_loss": 3.447856217182309, + "tokens_seen": 1882515456 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021702106318956873, + "loss": 2.7417, + "theoretical_loss": 3.447851153546221, + "tokens_seen": 1882548224 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002170110330992979, + "loss": 2.7782, + "theoretical_loss": 3.4478410266124895, + "tokens_seen": 1882613760 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002170010030090271, + "loss": 2.8457, + "theoretical_loss": 3.447830900129987, + "tokens_seen": 1882679296 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021699097291875627, + "loss": 2.5898, + "theoretical_loss": 3.447820774098678, + "tokens_seen": 1882744832 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021698094282848545, + "loss": 2.6467, + "theoretical_loss": 3.4478106485185265, + "tokens_seen": 1882810368 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021697091273821466, + "loss": 2.7596, + "theoretical_loss": 3.4478005233894957, + "tokens_seen": 1882875904 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021696088264794384, + "loss": 2.7676, + "theoretical_loss": 3.4477903987115512, + "tokens_seen": 1882941440 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021695085255767302, + "loss": 2.6964, + "theoretical_loss": 3.447780274484656, + "tokens_seen": 1883006976 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002169408224674022, + "loss": 2.6813, + "theoretical_loss": 3.447770150708776, + "tokens_seen": 1883072512 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002169307923771314, + "loss": 2.7303, + "theoretical_loss": 3.4477600273838735, + "tokens_seen": 1883138048 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002169207622868606, + "loss": 2.7637, + "theoretical_loss": 3.447749904509914, + "tokens_seen": 1883203584 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021691073219658977, + "loss": 2.8844, + "theoretical_loss": 3.4477397820868614, + "tokens_seen": 1883269120 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021690070210631895, + "loss": 2.8675, + "theoretical_loss": 3.447729660114679, + "tokens_seen": 1883334656 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021689067201604813, + "loss": 2.7334, + "theoretical_loss": 3.447719538593333, + "tokens_seen": 1883400192 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021688064192577734, + "loss": 2.5937, + "theoretical_loss": 3.447709417522786, + "tokens_seen": 1883465728 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021687061183550652, + "loss": 2.8333, + "theoretical_loss": 3.4476992969030027, + "tokens_seen": 1883531264 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002168605817452357, + "loss": 2.6442, + "theoretical_loss": 3.447689176733947, + "tokens_seen": 1883596800 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021685055165496489, + "loss": 2.801, + "theoretical_loss": 3.447679057015584, + "tokens_seen": 1883662336 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002168405215646941, + "loss": 2.7815, + "theoretical_loss": 3.4476689377478777, + "tokens_seen": 1883727872 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021683049147442327, + "loss": 2.8931, + "theoretical_loss": 3.4476588189307913, + "tokens_seen": 1883793408 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021682046138415246, + "loss": 2.6293, + "theoretical_loss": 3.44764870056429, + "tokens_seen": 1883858944 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021681043129388164, + "loss": 2.6397, + "theoretical_loss": 3.447638582648339, + "tokens_seen": 1883924480 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021680040120361082, + "loss": 2.7855, + "theoretical_loss": 3.4476284651829006, + "tokens_seen": 1883990016 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021679037111334003, + "loss": 2.6463, + "theoretical_loss": 3.44761834816794, + "tokens_seen": 1884055552 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002167803410230692, + "loss": 2.8731, + "theoretical_loss": 3.4476082316034216, + "tokens_seen": 1884121088 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2108362, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.143646240234375, + "objective/train/theoretical_loss": 3.4476031734900667, + "objective/train/tokens_used": 1904613856, + "theoretical_loss": 3.4476031734900667, + "tokens_seen": 1884153856 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002167703109327984, + "loss": 2.8024, + "theoretical_loss": 3.4475981154893094, + "tokens_seen": 1884186624 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021676028084252757, + "loss": 2.4447, + "theoretical_loss": 3.447587999825568, + "tokens_seen": 1884252160 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021675025075225678, + "loss": 2.6639, + "theoretical_loss": 3.447577884612161, + "tokens_seen": 1884317696 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021674022066198596, + "loss": 2.6084, + "theoretical_loss": 3.447567769849054, + "tokens_seen": 1884383232 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021673019057171514, + "loss": 2.7425, + "theoretical_loss": 3.44755765553621, + "tokens_seen": 1884448768 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021672016048144432, + "loss": 2.5915, + "theoretical_loss": 3.4475475416735937, + "tokens_seen": 1884514304 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002167101303911735, + "loss": 2.695, + "theoretical_loss": 3.4475374282611697, + "tokens_seen": 1884579840 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002167001003009027, + "loss": 2.5885, + "theoretical_loss": 3.4475273152989017, + "tokens_seen": 1884645376 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002166900702106319, + "loss": 2.6412, + "theoretical_loss": 3.447517202786755, + "tokens_seen": 1884710912 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021668004012036107, + "loss": 2.5947, + "theoretical_loss": 3.4475070907246925, + "tokens_seen": 1884776448 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021667001003009025, + "loss": 2.8587, + "theoretical_loss": 3.44749697911268, + "tokens_seen": 1884841984 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021665997993981946, + "loss": 2.6738, + "theoretical_loss": 3.4474868679506807, + "tokens_seen": 1884907520 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021664994984954864, + "loss": 2.6777, + "theoretical_loss": 3.44747675723866, + "tokens_seen": 1884973056 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021663991975927782, + "loss": 2.7763, + "theoretical_loss": 3.447466646976581, + "tokens_seen": 1885038592 + }, + { + "epoch": 6.03, + "learning_rate": 0.000216629889669007, + "loss": 2.6822, + "theoretical_loss": 3.447456537164409, + "tokens_seen": 1885104128 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021661985957873621, + "loss": 2.7631, + "theoretical_loss": 3.4474464278021077, + "tokens_seen": 1885169664 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002166098294884654, + "loss": 2.753, + "theoretical_loss": 3.447436318889642, + "tokens_seen": 1885235200 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021659979939819458, + "loss": 2.7053, + "theoretical_loss": 3.447426210426976, + "tokens_seen": 1885300736 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021658976930792378, + "loss": 2.7175, + "theoretical_loss": 3.4474161024140737, + "tokens_seen": 1885366272 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021657973921765297, + "loss": 2.7483, + "theoretical_loss": 3.4474059948509, + "tokens_seen": 1885431808 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021656970912738217, + "loss": 2.8033, + "theoretical_loss": 3.447395887737419, + "tokens_seen": 1885497344 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021655967903711135, + "loss": 2.4957, + "theoretical_loss": 3.447385781073595, + "tokens_seen": 1885562880 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021654964894684054, + "loss": 2.6868, + "theoretical_loss": 3.4473756748593924, + "tokens_seen": 1885628416 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021653961885656972, + "loss": 2.7528, + "theoretical_loss": 3.447365569094776, + "tokens_seen": 1885693952 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021652958876629893, + "loss": 2.5347, + "theoretical_loss": 3.4473554637797097, + "tokens_seen": 1885759488 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2113232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4644320011138916, + "objective/train/theoretical_loss": 3.4473504112907465, + "objective/train/tokens_used": 1906252256, + "theoretical_loss": 3.4473504112907465, + "tokens_seen": 1885792256 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002165195586760281, + "loss": 2.6663, + "theoretical_loss": 3.447345358914158, + "tokens_seen": 1885825024 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002165095285857573, + "loss": 2.5539, + "theoretical_loss": 3.447335254498085, + "tokens_seen": 1885890560 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021649949849548647, + "loss": 2.7845, + "theoretical_loss": 3.4473251505314555, + "tokens_seen": 1885956096 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021648946840521565, + "loss": 2.4715, + "theoretical_loss": 3.447315047014234, + "tokens_seen": 1886021632 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021647943831494486, + "loss": 2.9718, + "theoretical_loss": 3.447304943946384, + "tokens_seen": 1886087168 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021646940822467404, + "loss": 2.7761, + "theoretical_loss": 3.4472948413278717, + "tokens_seen": 1886152704 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021645937813440322, + "loss": 2.8395, + "theoretical_loss": 3.4472847391586594, + "tokens_seen": 1886218240 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002164493480441324, + "loss": 2.6965, + "theoretical_loss": 3.4472746374387127, + "tokens_seen": 1886283776 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002164393179538616, + "loss": 2.7207, + "theoretical_loss": 3.447264536167996, + "tokens_seen": 1886349312 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002164292878635908, + "loss": 2.6945, + "theoretical_loss": 3.447254435346473, + "tokens_seen": 1886414848 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021641925777331997, + "loss": 2.6727, + "theoretical_loss": 3.4472443349741093, + "tokens_seen": 1886480384 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021640922768304915, + "loss": 2.6678, + "theoretical_loss": 3.447234235050868, + "tokens_seen": 1886545920 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021639919759277833, + "loss": 2.7687, + "theoretical_loss": 3.4472241355767146, + "tokens_seen": 1886611456 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021638916750250754, + "loss": 2.5919, + "theoretical_loss": 3.447214036551613, + "tokens_seen": 1886676992 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021637913741223672, + "loss": 2.6433, + "theoretical_loss": 3.4472039379755275, + "tokens_seen": 1886742528 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002163691073219659, + "loss": 2.792, + "theoretical_loss": 3.447193839848423, + "tokens_seen": 1886808064 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021635907723169509, + "loss": 2.6355, + "theoretical_loss": 3.447183742170264, + "tokens_seen": 1886873600 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002163490471414243, + "loss": 2.5857, + "theoretical_loss": 3.4471736449410137, + "tokens_seen": 1886939136 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021633901705115347, + "loss": 2.7053, + "theoretical_loss": 3.4471635481606384, + "tokens_seen": 1887004672 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021632898696088266, + "loss": 2.428, + "theoretical_loss": 3.4471534518291014, + "tokens_seen": 1887070208 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021631895687061184, + "loss": 2.7224, + "theoretical_loss": 3.447143355946368, + "tokens_seen": 1887135744 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021630892678034102, + "loss": 3.0374, + "theoretical_loss": 3.4471332605124014, + "tokens_seen": 1887201280 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021629889669007023, + "loss": 2.7866, + "theoretical_loss": 3.447123165527167, + "tokens_seen": 1887266816 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002162888665997994, + "loss": 2.381, + "theoretical_loss": 3.4471130709906292, + "tokens_seen": 1887332352 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002162788365095286, + "loss": 2.3743, + "theoretical_loss": 3.447102976902752, + "tokens_seen": 1887397888 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2118410, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4384121894836426, + "objective/train/theoretical_loss": 3.4470979300270503, + "objective/train/tokens_used": 1907890656, + "theoretical_loss": 3.4470979300270503, + "tokens_seen": 1887430656 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021626880641925777, + "loss": 2.5823, + "theoretical_loss": 3.4470928832635, + "tokens_seen": 1887463424 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021625877632898698, + "loss": 2.4886, + "theoretical_loss": 3.4470827900728387, + "tokens_seen": 1887528960 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021624874623871616, + "loss": 2.7983, + "theoretical_loss": 3.4470726973307313, + "tokens_seen": 1887594496 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021623871614844534, + "loss": 2.6957, + "theoretical_loss": 3.447062605037143, + "tokens_seen": 1887660032 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021622868605817452, + "loss": 2.7268, + "theoretical_loss": 3.4470525131920375, + "tokens_seen": 1887725568 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002162186559679037, + "loss": 2.7495, + "theoretical_loss": 3.4470424217953806, + "tokens_seen": 1887791104 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002162086258776329, + "loss": 2.8951, + "theoretical_loss": 3.4470323308471356, + "tokens_seen": 1887856640 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002161985957873621, + "loss": 2.5712, + "theoretical_loss": 3.447022240347268, + "tokens_seen": 1887922176 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021618856569709127, + "loss": 2.8351, + "theoretical_loss": 3.447012150295741, + "tokens_seen": 1887987712 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021617853560682045, + "loss": 2.4957, + "theoretical_loss": 3.4470020606925207, + "tokens_seen": 1888053248 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021616850551654966, + "loss": 2.6802, + "theoretical_loss": 3.44699197153757, + "tokens_seen": 1888118784 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021615847542627884, + "loss": 2.4975, + "theoretical_loss": 3.446981882830855, + "tokens_seen": 1888184320 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021614844533600802, + "loss": 2.6763, + "theoretical_loss": 3.4469717945723395, + "tokens_seen": 1888249856 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002161384152457372, + "loss": 2.8242, + "theoretical_loss": 3.4469617067619875, + "tokens_seen": 1888315392 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021612838515546641, + "loss": 2.6706, + "theoretical_loss": 3.4469516193997647, + "tokens_seen": 1888380928 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002161183550651956, + "loss": 2.6943, + "theoretical_loss": 3.4469415324856345, + "tokens_seen": 1888446464 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021610832497492478, + "loss": 2.6932, + "theoretical_loss": 3.446931446019562, + "tokens_seen": 1888512000 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021609829488465396, + "loss": 2.5737, + "theoretical_loss": 3.446921360001512, + "tokens_seen": 1888577536 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021608826479438314, + "loss": 2.5819, + "theoretical_loss": 3.4469112744314487, + "tokens_seen": 1888643072 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021607823470411235, + "loss": 2.7649, + "theoretical_loss": 3.4469011893093366, + "tokens_seen": 1888708608 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021606820461384153, + "loss": 2.7053, + "theoretical_loss": 3.4468911046351405, + "tokens_seen": 1888774144 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002160581745235707, + "loss": 2.6991, + "theoretical_loss": 3.4468810204088243, + "tokens_seen": 1888839680 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002160481444332999, + "loss": 2.829, + "theoretical_loss": 3.446870936630354, + "tokens_seen": 1888905216 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002160381143430291, + "loss": 2.7423, + "theoretical_loss": 3.4468608532996923, + "tokens_seen": 1888970752 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021602808425275828, + "loss": 2.6642, + "theoretical_loss": 3.4468507704168054, + "tokens_seen": 1889036288 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2121479, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.052765130996704, + "objective/train/theoretical_loss": 3.4468457291432664, + "objective/train/tokens_used": 1909529056, + "theoretical_loss": 3.4468457291432664, + "tokens_seen": 1889069056 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021601805416248746, + "loss": 2.6683, + "theoretical_loss": 3.4468406879816573, + "tokens_seen": 1889101824 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021600802407221664, + "loss": 2.6905, + "theoretical_loss": 3.4468306059942124, + "tokens_seen": 1889167360 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021599799398194582, + "loss": 2.5295, + "theoretical_loss": 3.446820524454435, + "tokens_seen": 1889232896 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021598796389167503, + "loss": 2.9347, + "theoretical_loss": 3.446810443362291, + "tokens_seen": 1889298432 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002159779338014042, + "loss": 2.6107, + "theoretical_loss": 3.4468003627177435, + "tokens_seen": 1889363968 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002159679037111334, + "loss": 2.6581, + "theoretical_loss": 3.446790282520758, + "tokens_seen": 1889429504 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021595787362086257, + "loss": 2.7454, + "theoretical_loss": 3.446780202771299, + "tokens_seen": 1889495040 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021594784353059178, + "loss": 2.7553, + "theoretical_loss": 3.44677012346933, + "tokens_seen": 1889560576 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021593781344032096, + "loss": 2.491, + "theoretical_loss": 3.446760044614817, + "tokens_seen": 1889626112 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021592778335005014, + "loss": 2.7731, + "theoretical_loss": 3.4467499662077246, + "tokens_seen": 1889691648 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021591775325977933, + "loss": 2.7019, + "theoretical_loss": 3.4467398882480165, + "tokens_seen": 1889757184 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002159077231695085, + "loss": 2.6623, + "theoretical_loss": 3.4467298107356585, + "tokens_seen": 1889822720 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021589769307923772, + "loss": 2.6487, + "theoretical_loss": 3.446719733670614, + "tokens_seen": 1889888256 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002158876629889669, + "loss": 2.7851, + "theoretical_loss": 3.446709657052848, + "tokens_seen": 1889953792 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021587763289869608, + "loss": 2.615, + "theoretical_loss": 3.446699580882326, + "tokens_seen": 1890019328 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021586760280842526, + "loss": 2.6606, + "theoretical_loss": 3.446689505159011, + "tokens_seen": 1890084864 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021585757271815447, + "loss": 2.5319, + "theoretical_loss": 3.4466794298828693, + "tokens_seen": 1890150400 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021584754262788365, + "loss": 2.7633, + "theoretical_loss": 3.4466693550538645, + "tokens_seen": 1890215936 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021583751253761286, + "loss": 2.5778, + "theoretical_loss": 3.446659280671962, + "tokens_seen": 1890281472 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021582748244734204, + "loss": 2.7657, + "theoretical_loss": 3.446649206737126, + "tokens_seen": 1890347008 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021581745235707122, + "loss": 2.7982, + "theoretical_loss": 3.446639133249321, + "tokens_seen": 1890412544 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021580742226680043, + "loss": 2.7522, + "theoretical_loss": 3.4466290602085117, + "tokens_seen": 1890478080 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002157973921765296, + "loss": 2.7291, + "theoretical_loss": 3.446618987614664, + "tokens_seen": 1890543616 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002157873620862588, + "loss": 2.7213, + "theoretical_loss": 3.44660891546774, + "tokens_seen": 1890609152 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021577733199598797, + "loss": 2.5097, + "theoretical_loss": 3.446598843767707, + "tokens_seen": 1890674688 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2121955, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.885009765625, + "objective/train/theoretical_loss": 3.446593808085263, + "objective/train/tokens_used": 1911167456, + "theoretical_loss": 3.446593808085263, + "tokens_seen": 1890707456 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021576730190571718, + "loss": 2.788, + "theoretical_loss": 3.4465887725145286, + "tokens_seen": 1890740224 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021575727181544636, + "loss": 2.7273, + "theoretical_loss": 3.446578701708169, + "tokens_seen": 1890805760 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021574724172517554, + "loss": 2.7104, + "theoretical_loss": 3.446568631348594, + "tokens_seen": 1890871296 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021573721163490472, + "loss": 2.8044, + "theoretical_loss": 3.4465585614357668, + "tokens_seen": 1890936832 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002157271815446339, + "loss": 2.5478, + "theoretical_loss": 3.4465484919696534, + "tokens_seen": 1891002368 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002157171514543631, + "loss": 2.5879, + "theoretical_loss": 3.4465384229502183, + "tokens_seen": 1891067904 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002157071213640923, + "loss": 2.8724, + "theoretical_loss": 3.446528354377426, + "tokens_seen": 1891133440 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021569709127382147, + "loss": 2.6502, + "theoretical_loss": 3.446518286251241, + "tokens_seen": 1891198976 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021568706118355065, + "loss": 2.6264, + "theoretical_loss": 3.446508218571628, + "tokens_seen": 1891264512 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021567703109327986, + "loss": 2.7363, + "theoretical_loss": 3.4464981513385524, + "tokens_seen": 1891330048 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021566700100300904, + "loss": 2.6592, + "theoretical_loss": 3.4464880845519783, + "tokens_seen": 1891395584 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021565697091273822, + "loss": 2.9434, + "theoretical_loss": 3.4464780182118706, + "tokens_seen": 1891461120 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002156469408224674, + "loss": 2.6006, + "theoretical_loss": 3.4464679523181942, + "tokens_seen": 1891526656 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021563691073219661, + "loss": 2.6443, + "theoretical_loss": 3.4464578868709133, + "tokens_seen": 1891592192 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002156268806419258, + "loss": 2.6278, + "theoretical_loss": 3.4464478218699925, + "tokens_seen": 1891657728 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021561685055165498, + "loss": 2.7204, + "theoretical_loss": 3.446437757315398, + "tokens_seen": 1891723264 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021560682046138416, + "loss": 2.5424, + "theoretical_loss": 3.4464276932070934, + "tokens_seen": 1891788800 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021559679037111334, + "loss": 2.572, + "theoretical_loss": 3.446417629545043, + "tokens_seen": 1891854336 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021558676028084255, + "loss": 2.66, + "theoretical_loss": 3.4464075663292126, + "tokens_seen": 1891919872 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021557673019057173, + "loss": 2.6745, + "theoretical_loss": 3.4463975035595666, + "tokens_seen": 1891985408 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002155667001003009, + "loss": 2.5125, + "theoretical_loss": 3.4463874412360695, + "tokens_seen": 1892050944 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002155566700100301, + "loss": 2.6863, + "theoretical_loss": 3.4463773793586867, + "tokens_seen": 1892116480 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002155466399197593, + "loss": 2.7093, + "theoretical_loss": 3.4463673179273817, + "tokens_seen": 1892182016 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021553660982948848, + "loss": 2.6336, + "theoretical_loss": 3.446357256942121, + "tokens_seen": 1892247552 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021552657973921766, + "loss": 2.6298, + "theoretical_loss": 3.4463471964028676, + "tokens_seen": 1892313088 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2122654, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6720056533813477, + "objective/train/theoretical_loss": 3.446342166300483, + "objective/train/tokens_used": 1912805856, + "theoretical_loss": 3.446342166300483, + "tokens_seen": 1892345856 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021551654964894684, + "loss": 2.8387, + "theoretical_loss": 3.446337136309588, + "tokens_seen": 1892378624 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021550651955867602, + "loss": 2.5977, + "theoretical_loss": 3.4463270766622456, + "tokens_seen": 1892444160 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021549648946840523, + "loss": 2.5819, + "theoretical_loss": 3.4463170174608058, + "tokens_seen": 1892509696 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002154864593781344, + "loss": 2.4955, + "theoretical_loss": 3.4463069587052337, + "tokens_seen": 1892575232 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002154764292878636, + "loss": 2.6394, + "theoretical_loss": 3.4462969003954935, + "tokens_seen": 1892640768 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021546639919759277, + "loss": 2.884, + "theoretical_loss": 3.446286842531551, + "tokens_seen": 1892706304 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021545636910732198, + "loss": 2.5928, + "theoretical_loss": 3.446276785113369, + "tokens_seen": 1892771840 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021544633901705116, + "loss": 2.5185, + "theoretical_loss": 3.4462667281409143, + "tokens_seen": 1892837376 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021543630892678034, + "loss": 2.7182, + "theoretical_loss": 3.446256671614151, + "tokens_seen": 1892902912 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021542627883650953, + "loss": 2.5635, + "theoretical_loss": 3.4462466155330436, + "tokens_seen": 1892968448 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002154162487462387, + "loss": 2.6828, + "theoretical_loss": 3.4462365598975575, + "tokens_seen": 1893033984 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021540621865596792, + "loss": 2.7147, + "theoretical_loss": 3.446226504707657, + "tokens_seen": 1893099520 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002153961885656971, + "loss": 2.6396, + "theoretical_loss": 3.4462164499633072, + "tokens_seen": 1893165056 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021538615847542628, + "loss": 2.8081, + "theoretical_loss": 3.446206395664473, + "tokens_seen": 1893230592 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021537612838515546, + "loss": 2.7794, + "theoretical_loss": 3.446196341811119, + "tokens_seen": 1893296128 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021536609829488467, + "loss": 2.6655, + "theoretical_loss": 3.4461862884032106, + "tokens_seen": 1893361664 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021535606820461385, + "loss": 2.6821, + "theoretical_loss": 3.4461762354407117, + "tokens_seen": 1893427200 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021534603811434303, + "loss": 2.7018, + "theoretical_loss": 3.446166182923588, + "tokens_seen": 1893492736 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002153360080240722, + "loss": 2.863, + "theoretical_loss": 3.4461561308518043, + "tokens_seen": 1893558272 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002153259779338014, + "loss": 2.4666, + "theoretical_loss": 3.4461460792253247, + "tokens_seen": 1893623808 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002153159478435306, + "loss": 2.7424, + "theoretical_loss": 3.4461360280441147, + "tokens_seen": 1893689344 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021530591775325978, + "loss": 2.7534, + "theoretical_loss": 3.446125977308139, + "tokens_seen": 1893754880 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021529588766298896, + "loss": 2.6603, + "theoretical_loss": 3.4461159270173627, + "tokens_seen": 1893820416 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021528585757271814, + "loss": 2.7713, + "theoretical_loss": 3.4461058771717505, + "tokens_seen": 1893885952 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021527582748244735, + "loss": 2.6858, + "theoretical_loss": 3.446095827771267, + "tokens_seen": 1893951488 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2123910, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6577539443969727, + "objective/train/theoretical_loss": 3.4460908032379383, + "objective/train/tokens_used": 1914444256, + "theoretical_loss": 3.4460908032379383, + "tokens_seen": 1893984256 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021526579739217653, + "loss": 2.7978, + "theoretical_loss": 3.4460857788158776, + "tokens_seen": 1894017024 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002152557673019057, + "loss": 2.6574, + "theoretical_loss": 3.446075730305547, + "tokens_seen": 1894082560 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002152457372116349, + "loss": 2.6205, + "theoretical_loss": 3.44606568224024, + "tokens_seen": 1894148096 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021523570712136408, + "loss": 2.6238, + "theoretical_loss": 3.4460556346199214, + "tokens_seen": 1894213632 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021522567703109328, + "loss": 2.8505, + "theoretical_loss": 3.446045587444556, + "tokens_seen": 1894279168 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021521564694082247, + "loss": 2.8125, + "theoretical_loss": 3.4460355407141092, + "tokens_seen": 1894344704 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021520561685055165, + "loss": 2.8623, + "theoretical_loss": 3.4460254944285453, + "tokens_seen": 1894410240 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021519558676028083, + "loss": 2.6338, + "theoretical_loss": 3.44601544858783, + "tokens_seen": 1894475776 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021518555667001004, + "loss": 2.6798, + "theoretical_loss": 3.4460054031919274, + "tokens_seen": 1894541312 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021517552657973922, + "loss": 2.7683, + "theoretical_loss": 3.445995358240803, + "tokens_seen": 1894606848 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002151654964894684, + "loss": 2.5775, + "theoretical_loss": 3.445985313734421, + "tokens_seen": 1894672384 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021515546639919758, + "loss": 2.529, + "theoretical_loss": 3.445975269672747, + "tokens_seen": 1894737920 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021514543630892676, + "loss": 2.6889, + "theoretical_loss": 3.4459652260557463, + "tokens_seen": 1894803456 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021513540621865597, + "loss": 2.7244, + "theoretical_loss": 3.4459551828833828, + "tokens_seen": 1894868992 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021512537612838515, + "loss": 2.7205, + "theoretical_loss": 3.4459451401556223, + "tokens_seen": 1894934528 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021511534603811433, + "loss": 2.6896, + "theoretical_loss": 3.445935097872429, + "tokens_seen": 1895000064 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002151053159478435, + "loss": 2.8807, + "theoretical_loss": 3.445925056033768, + "tokens_seen": 1895065600 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021509528585757272, + "loss": 2.7136, + "theoretical_loss": 3.445915014639605, + "tokens_seen": 1895131136 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021508525576730193, + "loss": 2.6082, + "theoretical_loss": 3.445904973689904, + "tokens_seen": 1895196672 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002150752256770311, + "loss": 2.8266, + "theoretical_loss": 3.4458949331846305, + "tokens_seen": 1895262208 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002150651955867603, + "loss": 2.6897, + "theoretical_loss": 3.4458848931237496, + "tokens_seen": 1895327744 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002150551654964895, + "loss": 2.622, + "theoretical_loss": 3.4458748535072257, + "tokens_seen": 1895393280 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021504513540621868, + "loss": 2.4857, + "theoretical_loss": 3.4458648143350237, + "tokens_seen": 1895458816 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021503510531594786, + "loss": 2.5431, + "theoretical_loss": 3.44585477560711, + "tokens_seen": 1895524352 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021502507522567704, + "loss": 2.8415, + "theoretical_loss": 3.4458447373234473, + "tokens_seen": 1895589888 + }, + { + "epoch": 6.03, + "objective/train/docs_used": 2125191, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0263853073120117, + "objective/train/theoretical_loss": 3.4458397183482, + "objective/train/tokens_used": 1916082656, + "theoretical_loss": 3.4458397183482, + "tokens_seen": 1895622656 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021501504513540622, + "loss": 2.692, + "theoretical_loss": 3.4458346994840023, + "tokens_seen": 1895655424 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021500501504513543, + "loss": 2.741, + "theoretical_loss": 3.44582466208874, + "tokens_seen": 1895720960 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002149949849548646, + "loss": 2.579, + "theoretical_loss": 3.445814625137624, + "tokens_seen": 1895786496 + }, + { + "epoch": 6.03, + "learning_rate": 0.0002149849548645938, + "loss": 2.7167, + "theoretical_loss": 3.4458045886306206, + "tokens_seen": 1895852032 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021497492477432297, + "loss": 2.6229, + "theoretical_loss": 3.4457945525676945, + "tokens_seen": 1895917568 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021496489468405218, + "loss": 2.7762, + "theoretical_loss": 3.4457845169488106, + "tokens_seen": 1895983104 + }, + { + "epoch": 6.03, + "learning_rate": 0.00021495486459378136, + "loss": 2.5646, + "theoretical_loss": 3.4457744817739338, + "tokens_seen": 1896048640 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021494483450351055, + "loss": 2.5843, + "theoretical_loss": 3.445764447043029, + "tokens_seen": 1896114176 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021493480441323973, + "loss": 2.4578, + "theoretical_loss": 3.4457544127560613, + "tokens_seen": 1896179712 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002149247743229689, + "loss": 2.6948, + "theoretical_loss": 3.445744378912996, + "tokens_seen": 1896245248 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021491474423269812, + "loss": 2.6329, + "theoretical_loss": 3.445734345513798, + "tokens_seen": 1896310784 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002149047141424273, + "loss": 2.8691, + "theoretical_loss": 3.4457243125584323, + "tokens_seen": 1896376320 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021489468405215648, + "loss": 2.5688, + "theoretical_loss": 3.4457142800468636, + "tokens_seen": 1896441856 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021488465396188566, + "loss": 2.8323, + "theoretical_loss": 3.445704247979058, + "tokens_seen": 1896507392 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021487462387161487, + "loss": 2.756, + "theoretical_loss": 3.445694216354979, + "tokens_seen": 1896572928 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021486459378134405, + "loss": 2.6235, + "theoretical_loss": 3.445684185174592, + "tokens_seen": 1896638464 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021485456369107323, + "loss": 2.8404, + "theoretical_loss": 3.4456741544378637, + "tokens_seen": 1896704000 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002148445336008024, + "loss": 2.9319, + "theoretical_loss": 3.4456641241447574, + "tokens_seen": 1896769536 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002148345035105316, + "loss": 2.6759, + "theoretical_loss": 3.4456540942952385, + "tokens_seen": 1896835072 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002148244734202608, + "loss": 2.6664, + "theoretical_loss": 3.4456440648892723, + "tokens_seen": 1896900608 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021481444332998998, + "loss": 2.8405, + "theoretical_loss": 3.4456340359268234, + "tokens_seen": 1896966144 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021480441323971916, + "loss": 2.8264, + "theoretical_loss": 3.445624007407858, + "tokens_seen": 1897031680 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021479438314944834, + "loss": 2.5947, + "theoretical_loss": 3.4456139793323395, + "tokens_seen": 1897097216 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021478435305917755, + "loss": 2.7026, + "theoretical_loss": 3.4456039517002344, + "tokens_seen": 1897162752 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021477432296890673, + "loss": 2.6278, + "theoretical_loss": 3.445593924511507, + "tokens_seen": 1897228288 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2125913, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.829488754272461, + "objective/train/theoretical_loss": 3.4455889110833997, + "objective/train/tokens_used": 1917721056, + "theoretical_loss": 3.4455889110833997, + "tokens_seen": 1897261056 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021476429287863591, + "loss": 2.6911, + "theoretical_loss": 3.445583897766123, + "tokens_seen": 1897293824 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002147542627883651, + "loss": 2.7607, + "theoretical_loss": 3.445573871464047, + "tokens_seen": 1897359360 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021474423269809428, + "loss": 2.4912, + "theoretical_loss": 3.4455638456052444, + "tokens_seen": 1897424896 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021473420260782348, + "loss": 2.5156, + "theoretical_loss": 3.44555382018968, + "tokens_seen": 1897490432 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021472417251755267, + "loss": 2.6441, + "theoretical_loss": 3.445543795217319, + "tokens_seen": 1897555968 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021471414242728185, + "loss": 2.8356, + "theoretical_loss": 3.4455337706881264, + "tokens_seen": 1897621504 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021470411233701103, + "loss": 2.826, + "theoretical_loss": 3.4455237466020674, + "tokens_seen": 1897687040 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021469408224674024, + "loss": 2.6261, + "theoretical_loss": 3.445513722959107, + "tokens_seen": 1897752576 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021468405215646942, + "loss": 2.6405, + "theoretical_loss": 3.445503699759211, + "tokens_seen": 1897818112 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002146740220661986, + "loss": 2.8062, + "theoretical_loss": 3.445493677002343, + "tokens_seen": 1897883648 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021466399197592778, + "loss": 2.6911, + "theoretical_loss": 3.44548365468847, + "tokens_seen": 1897949184 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021465396188565696, + "loss": 2.7269, + "theoretical_loss": 3.4454736328175555, + "tokens_seen": 1898014720 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021464393179538617, + "loss": 2.5296, + "theoretical_loss": 3.445463611389566, + "tokens_seen": 1898080256 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021463390170511535, + "loss": 2.9339, + "theoretical_loss": 3.4454535904044654, + "tokens_seen": 1898145792 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021462387161484453, + "loss": 2.4637, + "theoretical_loss": 3.4454435698622197, + "tokens_seen": 1898211328 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002146138415245737, + "loss": 2.6091, + "theoretical_loss": 3.445433549762794, + "tokens_seen": 1898276864 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021460381143430292, + "loss": 2.6785, + "theoretical_loss": 3.4454235301061527, + "tokens_seen": 1898342400 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002145937813440321, + "loss": 2.6862, + "theoretical_loss": 3.445413510892261, + "tokens_seen": 1898407936 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021458375125376128, + "loss": 2.7362, + "theoretical_loss": 3.445403492121085, + "tokens_seen": 1898473472 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021457372116349046, + "loss": 2.5499, + "theoretical_loss": 3.4453934737925893, + "tokens_seen": 1898539008 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021456369107321964, + "loss": 2.7496, + "theoretical_loss": 3.4453834559067396, + "tokens_seen": 1898604544 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021455366098294885, + "loss": 2.8923, + "theoretical_loss": 3.4453734384635, + "tokens_seen": 1898670080 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021454363089267803, + "loss": 2.3181, + "theoretical_loss": 3.4453634214628366, + "tokens_seen": 1898735616 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021453360080240721, + "loss": 2.7103, + "theoretical_loss": 3.4453534049047136, + "tokens_seen": 1898801152 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002145235707121364, + "loss": 2.6074, + "theoretical_loss": 3.445343388789097, + "tokens_seen": 1898866688 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2127092, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5427329540252686, + "objective/train/theoretical_loss": 3.445338380897218, + "objective/train/tokens_used": 1919359456, + "theoretical_loss": 3.445338380897218, + "tokens_seen": 1898899456 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002145135406218656, + "loss": 2.4987, + "theoretical_loss": 3.445333373115952, + "tokens_seen": 1898932224 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021450351053159479, + "loss": 2.588, + "theoretical_loss": 3.4453233578852434, + "tokens_seen": 1898997760 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021449348044132397, + "loss": 2.7551, + "theoretical_loss": 3.4453133430969367, + "tokens_seen": 1899063296 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021448345035105315, + "loss": 2.6362, + "theoretical_loss": 3.4453033287509967, + "tokens_seen": 1899128832 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021447342026078236, + "loss": 2.493, + "theoretical_loss": 3.4452933148473885, + "tokens_seen": 1899194368 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021446339017051154, + "loss": 2.3555, + "theoretical_loss": 3.445283301386078, + "tokens_seen": 1899259904 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021445336008024072, + "loss": 2.563, + "theoretical_loss": 3.44527328836703, + "tokens_seen": 1899325440 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002144433299899699, + "loss": 2.7809, + "theoretical_loss": 3.44526327579021, + "tokens_seen": 1899390976 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021443329989969908, + "loss": 2.7315, + "theoretical_loss": 3.445253263655583, + "tokens_seen": 1899456512 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002144232698094283, + "loss": 2.7232, + "theoretical_loss": 3.4452432519631135, + "tokens_seen": 1899522048 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021441323971915747, + "loss": 2.6024, + "theoretical_loss": 3.445233240712768, + "tokens_seen": 1899587584 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021440320962888665, + "loss": 2.6036, + "theoretical_loss": 3.4452232299045105, + "tokens_seen": 1899653120 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021439317953861583, + "loss": 2.7399, + "theoretical_loss": 3.445213219538307, + "tokens_seen": 1899718656 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021438314944834504, + "loss": 2.5983, + "theoretical_loss": 3.4452032096141227, + "tokens_seen": 1899784192 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021437311935807422, + "loss": 2.5569, + "theoretical_loss": 3.4451932001319223, + "tokens_seen": 1899849728 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002143630892678034, + "loss": 2.4499, + "theoretical_loss": 3.4451831910916724, + "tokens_seen": 1899915264 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021435305917753258, + "loss": 2.5147, + "theoretical_loss": 3.4451731824933365, + "tokens_seen": 1899980800 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021434302908726176, + "loss": 2.6012, + "theoretical_loss": 3.4451631743368805, + "tokens_seen": 1900046336 + }, + { + "epoch": 6.04, + "learning_rate": 0.000214332998996991, + "loss": 2.5935, + "theoretical_loss": 3.44515316662227, + "tokens_seen": 1900111872 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021432296890672018, + "loss": 2.6655, + "theoretical_loss": 3.4451431593494704, + "tokens_seen": 1900177408 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021431293881644936, + "loss": 2.7439, + "theoretical_loss": 3.445133152518446, + "tokens_seen": 1900242944 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021430290872617854, + "loss": 2.7478, + "theoretical_loss": 3.4451231461291627, + "tokens_seen": 1900308480 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021429287863590775, + "loss": 2.5513, + "theoretical_loss": 3.445113140181586, + "tokens_seen": 1900374016 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021428284854563693, + "loss": 2.7055, + "theoretical_loss": 3.445103134675681, + "tokens_seen": 1900439552 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021427281845536611, + "loss": 2.6912, + "theoretical_loss": 3.4450931296114122, + "tokens_seen": 1900505088 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2127756, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.581923723220825, + "objective/train/theoretical_loss": 3.445088127244881, + "objective/train/tokens_used": 1920997856, + "theoretical_loss": 3.445088127244881, + "tokens_seen": 1900537856 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002142627883650953, + "loss": 2.5245, + "theoretical_loss": 3.445083124988746, + "tokens_seen": 1900570624 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021425275827482448, + "loss": 2.6942, + "theoretical_loss": 3.4450731208076473, + "tokens_seen": 1900636160 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021424272818455368, + "loss": 2.563, + "theoretical_loss": 3.4450631170680808, + "tokens_seen": 1900701696 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021423269809428287, + "loss": 2.6534, + "theoretical_loss": 3.4450531137700127, + "tokens_seen": 1900767232 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021422266800401205, + "loss": 2.6286, + "theoretical_loss": 3.4450431109134083, + "tokens_seen": 1900832768 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021421263791374123, + "loss": 2.7336, + "theoretical_loss": 3.4450331084982317, + "tokens_seen": 1900898304 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021420260782347044, + "loss": 2.6473, + "theoretical_loss": 3.4450231065244497, + "tokens_seen": 1900963840 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021419257773319962, + "loss": 2.6341, + "theoretical_loss": 3.445013104992026, + "tokens_seen": 1901029376 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002141825476429288, + "loss": 2.7522, + "theoretical_loss": 3.4450031039009277, + "tokens_seen": 1901094912 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021417251755265798, + "loss": 2.5293, + "theoretical_loss": 3.4449931032511185, + "tokens_seen": 1901160448 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021416248746238716, + "loss": 2.5875, + "theoretical_loss": 3.444983103042565, + "tokens_seen": 1901225984 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021415245737211637, + "loss": 2.6319, + "theoretical_loss": 3.4449731032752315, + "tokens_seen": 1901291520 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021414242728184555, + "loss": 2.7452, + "theoretical_loss": 3.444963103949084, + "tokens_seen": 1901357056 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021413239719157473, + "loss": 2.8362, + "theoretical_loss": 3.444953105064087, + "tokens_seen": 1901422592 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002141223671013039, + "loss": 2.5935, + "theoretical_loss": 3.444943106620207, + "tokens_seen": 1901488128 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021411233701103312, + "loss": 2.5574, + "theoretical_loss": 3.444933108617409, + "tokens_seen": 1901553664 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002141023069207623, + "loss": 2.6941, + "theoretical_loss": 3.444923111055658, + "tokens_seen": 1901619200 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021409227683049148, + "loss": 2.4692, + "theoretical_loss": 3.44491311393492, + "tokens_seen": 1901684736 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021408224674022066, + "loss": 2.6243, + "theoretical_loss": 3.4449031172551585, + "tokens_seen": 1901750272 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021407221664994984, + "loss": 2.806, + "theoretical_loss": 3.4448931210163414, + "tokens_seen": 1901815808 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021406218655967905, + "loss": 2.6439, + "theoretical_loss": 3.444883125218432, + "tokens_seen": 1901881344 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021405215646940823, + "loss": 2.7877, + "theoretical_loss": 3.4448731298613966, + "tokens_seen": 1901946880 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021404212637913742, + "loss": 2.6433, + "theoretical_loss": 3.4448631349452006, + "tokens_seen": 1902012416 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002140320962888666, + "loss": 2.5467, + "theoretical_loss": 3.4448531404698093, + "tokens_seen": 1902077952 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002140220661985958, + "loss": 2.5668, + "theoretical_loss": 3.444843146435188, + "tokens_seen": 1902143488 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2128538, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.843466281890869, + "objective/train/theoretical_loss": 3.444838149583155, + "objective/train/tokens_used": 1922636256, + "theoretical_loss": 3.444838149583155, + "tokens_seen": 1902176256 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021401203610832499, + "loss": 2.7933, + "theoretical_loss": 3.444833152841302, + "tokens_seen": 1902209024 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021400200601805417, + "loss": 2.5647, + "theoretical_loss": 3.4448231596881165, + "tokens_seen": 1902274560 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021399197592778335, + "loss": 2.6921, + "theoretical_loss": 3.4448131669755973, + "tokens_seen": 1902340096 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021398194583751256, + "loss": 2.503, + "theoretical_loss": 3.4448031747037096, + "tokens_seen": 1902405632 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021397191574724174, + "loss": 2.6579, + "theoretical_loss": 3.444793182872419, + "tokens_seen": 1902471168 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021396188565697092, + "loss": 2.5753, + "theoretical_loss": 3.4447831914816907, + "tokens_seen": 1902536704 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002139518555667001, + "loss": 2.5602, + "theoretical_loss": 3.4447732005314897, + "tokens_seen": 1902602240 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021394182547642928, + "loss": 2.7181, + "theoretical_loss": 3.444763210021782, + "tokens_seen": 1902667776 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002139317953861585, + "loss": 2.6646, + "theoretical_loss": 3.444753219952533, + "tokens_seen": 1902733312 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021392176529588767, + "loss": 2.5438, + "theoretical_loss": 3.444743230323708, + "tokens_seen": 1902798848 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021391173520561685, + "loss": 2.6728, + "theoretical_loss": 3.444733241135272, + "tokens_seen": 1902864384 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021390170511534603, + "loss": 2.8284, + "theoretical_loss": 3.444723252387191, + "tokens_seen": 1902929920 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021389167502507524, + "loss": 2.5369, + "theoretical_loss": 3.44471326407943, + "tokens_seen": 1902995456 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021388164493480442, + "loss": 2.6926, + "theoretical_loss": 3.4447032762119547, + "tokens_seen": 1903060992 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002138716148445336, + "loss": 2.701, + "theoretical_loss": 3.4446932887847304, + "tokens_seen": 1903126528 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021386158475426278, + "loss": 2.772, + "theoretical_loss": 3.4446833017977228, + "tokens_seen": 1903192064 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021385155466399196, + "loss": 2.4796, + "theoretical_loss": 3.444673315250897, + "tokens_seen": 1903257600 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021384152457372117, + "loss": 2.7009, + "theoretical_loss": 3.4446633291442184, + "tokens_seen": 1903323136 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021383149448345035, + "loss": 2.6173, + "theoretical_loss": 3.4446533434776527, + "tokens_seen": 1903388672 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021382146439317954, + "loss": 2.7703, + "theoretical_loss": 3.4446433582511653, + "tokens_seen": 1903454208 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021381143430290872, + "loss": 2.773, + "theoretical_loss": 3.444633373464722, + "tokens_seen": 1903519744 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021380140421263792, + "loss": 2.7622, + "theoretical_loss": 3.444623389118287, + "tokens_seen": 1903585280 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002137913741223671, + "loss": 2.5455, + "theoretical_loss": 3.444613405211827, + "tokens_seen": 1903650816 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002137813440320963, + "loss": 2.5038, + "theoretical_loss": 3.444603421745307, + "tokens_seen": 1903716352 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021377131394182547, + "loss": 2.7678, + "theoretical_loss": 3.4445934387186927, + "tokens_seen": 1903781888 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2129968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6191258430480957, + "objective/train/theoretical_loss": 3.4445884473703394, + "objective/train/tokens_used": 1924274656, + "theoretical_loss": 3.4445884473703394, + "tokens_seen": 1903814656 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021376128385155465, + "loss": 2.6666, + "theoretical_loss": 3.4445834561319497, + "tokens_seen": 1903847424 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021375125376128386, + "loss": 2.4513, + "theoretical_loss": 3.4445734739850433, + "tokens_seen": 1903912960 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021374122367101304, + "loss": 2.7188, + "theoretical_loss": 3.4445634922779385, + "tokens_seen": 1903978496 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021373119358074222, + "loss": 2.69, + "theoretical_loss": 3.444553511010601, + "tokens_seen": 1904044032 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002137211634904714, + "loss": 2.68, + "theoretical_loss": 3.4445435301829965, + "tokens_seen": 1904109568 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002137111334002006, + "loss": 2.7927, + "theoretical_loss": 3.444533549795091, + "tokens_seen": 1904175104 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002137011033099298, + "loss": 2.4848, + "theoretical_loss": 3.444523569846849, + "tokens_seen": 1904240640 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021369107321965897, + "loss": 2.7093, + "theoretical_loss": 3.4445135903382367, + "tokens_seen": 1904306176 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021368104312938815, + "loss": 2.6859, + "theoretical_loss": 3.4445036112692193, + "tokens_seen": 1904371712 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021367101303911733, + "loss": 2.6159, + "theoretical_loss": 3.4444936326397624, + "tokens_seen": 1904437248 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021366098294884654, + "loss": 2.5515, + "theoretical_loss": 3.444483654449832, + "tokens_seen": 1904502784 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021365095285857572, + "loss": 2.8388, + "theoretical_loss": 3.4444736766993924, + "tokens_seen": 1904568320 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002136409227683049, + "loss": 2.5167, + "theoretical_loss": 3.44446369938841, + "tokens_seen": 1904633856 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021363089267803409, + "loss": 2.5111, + "theoretical_loss": 3.44445372251685, + "tokens_seen": 1904699392 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002136208625877633, + "loss": 2.4916, + "theoretical_loss": 3.444443746084678, + "tokens_seen": 1904764928 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021361083249749247, + "loss": 2.528, + "theoretical_loss": 3.4444337700918606, + "tokens_seen": 1904830464 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021360080240722166, + "loss": 2.537, + "theoretical_loss": 3.4444237945383613, + "tokens_seen": 1904896000 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021359077231695084, + "loss": 2.4763, + "theoretical_loss": 3.444413819424147, + "tokens_seen": 1904961536 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021358074222668004, + "loss": 2.7116, + "theoretical_loss": 3.444403844749183, + "tokens_seen": 1905027072 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021357071213640925, + "loss": 2.5386, + "theoretical_loss": 3.444393870513435, + "tokens_seen": 1905092608 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021356068204613843, + "loss": 2.5452, + "theoretical_loss": 3.444383896716868, + "tokens_seen": 1905158144 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021355065195586762, + "loss": 2.4334, + "theoretical_loss": 3.444373923359448, + "tokens_seen": 1905223680 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002135406218655968, + "loss": 2.6202, + "theoretical_loss": 3.4443639504411405, + "tokens_seen": 1905289216 + }, + { + "epoch": 6.04, + "learning_rate": 0.000213530591775326, + "loss": 2.6052, + "theoretical_loss": 3.444353977961911, + "tokens_seen": 1905354752 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021352056168505519, + "loss": 2.8171, + "theoretical_loss": 3.444344005921725, + "tokens_seen": 1905420288 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2130805, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7090566158294678, + "objective/train/theoretical_loss": 3.4443390200662627, + "objective/train/tokens_used": 1925913056, + "theoretical_loss": 3.4443390200662627, + "tokens_seen": 1905453056 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021351053159478437, + "loss": 2.679, + "theoretical_loss": 3.444334034320548, + "tokens_seen": 1905485824 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021350050150451355, + "loss": 2.8194, + "theoretical_loss": 3.444324063158346, + "tokens_seen": 1905551360 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021349047141424276, + "loss": 2.739, + "theoretical_loss": 3.4443140924350844, + "tokens_seen": 1905616896 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021348044132397194, + "loss": 2.5881, + "theoretical_loss": 3.4443041221507285, + "tokens_seen": 1905682432 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021347041123370112, + "loss": 2.4959, + "theoretical_loss": 3.444294152305244, + "tokens_seen": 1905747968 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002134603811434303, + "loss": 2.7044, + "theoretical_loss": 3.4442841828985964, + "tokens_seen": 1905813504 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021345035105315948, + "loss": 2.6559, + "theoretical_loss": 3.4442742139307523, + "tokens_seen": 1905879040 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002134403209628887, + "loss": 2.8766, + "theoretical_loss": 3.4442642454016754, + "tokens_seen": 1905944576 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021343029087261787, + "loss": 2.6306, + "theoretical_loss": 3.444254277311333, + "tokens_seen": 1906010112 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021342026078234705, + "loss": 2.6154, + "theoretical_loss": 3.44424430965969, + "tokens_seen": 1906075648 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021341023069207623, + "loss": 2.5336, + "theoretical_loss": 3.4442343424467117, + "tokens_seen": 1906141184 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021340020060180544, + "loss": 2.643, + "theoretical_loss": 3.4442243756723645, + "tokens_seen": 1906206720 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021339017051153462, + "loss": 2.5956, + "theoretical_loss": 3.444214409336614, + "tokens_seen": 1906272256 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002133801404212638, + "loss": 2.6017, + "theoretical_loss": 3.444204443439425, + "tokens_seen": 1906337792 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021337011033099298, + "loss": 2.615, + "theoretical_loss": 3.444194477980763, + "tokens_seen": 1906403328 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021336008024072216, + "loss": 2.7344, + "theoretical_loss": 3.444184512960595, + "tokens_seen": 1906468864 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021335005015045137, + "loss": 2.5248, + "theoretical_loss": 3.4441745483788857, + "tokens_seen": 1906534400 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021334002006018055, + "loss": 2.7246, + "theoretical_loss": 3.4441645842356, + "tokens_seen": 1906599936 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021332998996990974, + "loss": 2.5189, + "theoretical_loss": 3.444154620530705, + "tokens_seen": 1906665472 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021331995987963892, + "loss": 2.7377, + "theoretical_loss": 3.4441446572641663, + "tokens_seen": 1906731008 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021330992978936812, + "loss": 2.6655, + "theoretical_loss": 3.4441346944359483, + "tokens_seen": 1906796544 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002132998996990973, + "loss": 2.3656, + "theoretical_loss": 3.4441247320460175, + "tokens_seen": 1906862080 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002132898696088265, + "loss": 2.5246, + "theoretical_loss": 3.4441147700943393, + "tokens_seen": 1906927616 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021327983951855567, + "loss": 2.3971, + "theoretical_loss": 3.44410480858088, + "tokens_seen": 1906993152 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021326980942828485, + "loss": 2.6383, + "theoretical_loss": 3.444094847505604, + "tokens_seen": 1907058688 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2132196, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3333470821380615, + "objective/train/theoretical_loss": 3.4440898671322744, + "objective/train/tokens_used": 1927551456, + "theoretical_loss": 3.4440898671322744, + "tokens_seen": 1907091456 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021325977933801406, + "loss": 2.4471, + "theoretical_loss": 3.4440848868684784, + "tokens_seen": 1907124224 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021324974924774324, + "loss": 2.6841, + "theoretical_loss": 3.4440749266694675, + "tokens_seen": 1907189760 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021323971915747242, + "loss": 2.6017, + "theoretical_loss": 3.444064966908538, + "tokens_seen": 1907255296 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002132296890672016, + "loss": 2.6402, + "theoretical_loss": 3.4440550075856553, + "tokens_seen": 1907320832 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002132196589769308, + "loss": 2.5609, + "theoretical_loss": 3.4440450487007848, + "tokens_seen": 1907386368 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021320962888666, + "loss": 2.6656, + "theoretical_loss": 3.4440350902538923, + "tokens_seen": 1907451904 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021319959879638917, + "loss": 2.4741, + "theoretical_loss": 3.4440251322449438, + "tokens_seen": 1907517440 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021318956870611835, + "loss": 2.4903, + "theoretical_loss": 3.444015174673905, + "tokens_seen": 1907582976 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021317953861584753, + "loss": 2.5601, + "theoretical_loss": 3.4440052175407407, + "tokens_seen": 1907648512 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021316950852557674, + "loss": 2.2804, + "theoretical_loss": 3.4439952608454174, + "tokens_seen": 1907714048 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021315947843530592, + "loss": 2.5603, + "theoretical_loss": 3.443985304587901, + "tokens_seen": 1907779584 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002131494483450351, + "loss": 2.669, + "theoretical_loss": 3.443975348768157, + "tokens_seen": 1907845120 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021313941825476429, + "loss": 2.532, + "theoretical_loss": 3.4439653933861507, + "tokens_seen": 1907910656 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002131293881644935, + "loss": 2.4984, + "theoretical_loss": 3.4439554384418485, + "tokens_seen": 1907976192 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021311935807422267, + "loss": 2.4609, + "theoretical_loss": 3.4439454839352157, + "tokens_seen": 1908041728 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021310932798395186, + "loss": 2.6944, + "theoretical_loss": 3.4439355298662178, + "tokens_seen": 1908107264 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021309929789368104, + "loss": 2.5869, + "theoretical_loss": 3.443925576234821, + "tokens_seen": 1908172800 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021308926780341022, + "loss": 2.6298, + "theoretical_loss": 3.443915623040991, + "tokens_seen": 1908238336 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021307923771313943, + "loss": 2.5934, + "theoretical_loss": 3.4439056702846926, + "tokens_seen": 1908303872 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002130692076228686, + "loss": 2.5866, + "theoretical_loss": 3.4438957179658933, + "tokens_seen": 1908369408 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002130591775325978, + "loss": 2.483, + "theoretical_loss": 3.4438857660845574, + "tokens_seen": 1908434944 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021304914744232697, + "loss": 2.3335, + "theoretical_loss": 3.443875814640651, + "tokens_seen": 1908500480 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021303911735205618, + "loss": 2.8092, + "theoretical_loss": 3.4438658636341404, + "tokens_seen": 1908566016 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021302908726178536, + "loss": 2.5337, + "theoretical_loss": 3.443855913064991, + "tokens_seen": 1908631552 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021301905717151454, + "loss": 2.736, + "theoretical_loss": 3.4438459629331675, + "tokens_seen": 1908697088 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2133082, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3511650562286377, + "objective/train/theoretical_loss": 3.4438409880312433, + "objective/train/tokens_used": 1929189856, + "theoretical_loss": 3.4438409880312433, + "tokens_seen": 1908729856 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021300902708124372, + "loss": 2.682, + "theoretical_loss": 3.4438360132386374, + "tokens_seen": 1908762624 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002129989969909729, + "loss": 2.5821, + "theoretical_loss": 3.4438260639813656, + "tokens_seen": 1908828160 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002129889669007021, + "loss": 2.4172, + "theoretical_loss": 3.4438161151613182, + "tokens_seen": 1908893696 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002129789368104313, + "loss": 2.6023, + "theoretical_loss": 3.4438061667784607, + "tokens_seen": 1908959232 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021296890672016047, + "loss": 2.4938, + "theoretical_loss": 3.443796218832759, + "tokens_seen": 1909024768 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021295887662988965, + "loss": 2.4901, + "theoretical_loss": 3.4437862713241785, + "tokens_seen": 1909090304 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021294884653961886, + "loss": 2.4696, + "theoretical_loss": 3.4437763242526858, + "tokens_seen": 1909155840 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021293881644934804, + "loss": 2.6983, + "theoretical_loss": 3.4437663776182457, + "tokens_seen": 1909221376 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021292878635907722, + "loss": 2.5434, + "theoretical_loss": 3.443756431420825, + "tokens_seen": 1909286912 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002129187562688064, + "loss": 2.6655, + "theoretical_loss": 3.4437464856603883, + "tokens_seen": 1909352448 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002129087261785356, + "loss": 2.6007, + "theoretical_loss": 3.4437365403369027, + "tokens_seen": 1909417984 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002128986960882648, + "loss": 2.6824, + "theoretical_loss": 3.4437265954503333, + "tokens_seen": 1909483520 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021288866599799398, + "loss": 2.6762, + "theoretical_loss": 3.4437166510006456, + "tokens_seen": 1909549056 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021287863590772316, + "loss": 2.467, + "theoretical_loss": 3.4437067069878062, + "tokens_seen": 1909614592 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021286860581745234, + "loss": 2.3096, + "theoretical_loss": 3.443696763411781, + "tokens_seen": 1909680128 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021285857572718155, + "loss": 2.8252, + "theoretical_loss": 3.443686820272535, + "tokens_seen": 1909745664 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021284854563691073, + "loss": 2.5638, + "theoretical_loss": 3.4436768775700344, + "tokens_seen": 1909811200 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002128385155466399, + "loss": 2.5602, + "theoretical_loss": 3.443666935304245, + "tokens_seen": 1909876736 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021282848545636912, + "loss": 2.9096, + "theoretical_loss": 3.443656993475132, + "tokens_seen": 1909942272 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021281845536609832, + "loss": 2.7029, + "theoretical_loss": 3.443647052082663, + "tokens_seen": 1910007808 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002128084252758275, + "loss": 2.5326, + "theoretical_loss": 3.4436371111268027, + "tokens_seen": 1910073344 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002127983951855567, + "loss": 2.4243, + "theoretical_loss": 3.4436271706075168, + "tokens_seen": 1910138880 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021278836509528587, + "loss": 2.5883, + "theoretical_loss": 3.443617230524771, + "tokens_seen": 1910204416 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021277833500501505, + "loss": 2.4849, + "theoretical_loss": 3.443607290878532, + "tokens_seen": 1910269952 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021276830491474426, + "loss": 2.5535, + "theoretical_loss": 3.443597351668765, + "tokens_seen": 1910335488 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2134667, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5696043968200684, + "objective/train/theoretical_loss": 3.443592382227547, + "objective/train/tokens_used": 1930828256, + "theoretical_loss": 3.443592382227547, + "tokens_seen": 1910368256 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021275827482447344, + "loss": 2.606, + "theoretical_loss": 3.443587412895435, + "tokens_seen": 1910401024 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021274824473420262, + "loss": 2.5859, + "theoretical_loss": 3.44357747455851, + "tokens_seen": 1910466560 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002127382146439318, + "loss": 2.6872, + "theoretical_loss": 3.443567536657955, + "tokens_seen": 1910532096 + }, + { + "epoch": 6.04, + "learning_rate": 0.000212728184553661, + "loss": 2.6765, + "theoretical_loss": 3.443557599193735, + "tokens_seen": 1910597632 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002127181544633902, + "loss": 2.6266, + "theoretical_loss": 3.443547662165817, + "tokens_seen": 1910663168 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021270812437311937, + "loss": 2.5597, + "theoretical_loss": 3.4435377255741657, + "tokens_seen": 1910728704 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021269809428284855, + "loss": 2.5354, + "theoretical_loss": 3.443527789418748, + "tokens_seen": 1910794240 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021268806419257773, + "loss": 2.6939, + "theoretical_loss": 3.4435178536995297, + "tokens_seen": 1910859776 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021267803410230694, + "loss": 2.7579, + "theoretical_loss": 3.443507918416476, + "tokens_seen": 1910925312 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021266800401203612, + "loss": 2.661, + "theoretical_loss": 3.4434979835695536, + "tokens_seen": 1910990848 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002126579739217653, + "loss": 2.496, + "theoretical_loss": 3.443488049158728, + "tokens_seen": 1911056384 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021264794383149449, + "loss": 2.6193, + "theoretical_loss": 3.4434781151839653, + "tokens_seen": 1911121920 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002126379137412237, + "loss": 2.7849, + "theoretical_loss": 3.4434681816452306, + "tokens_seen": 1911187456 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021262788365095287, + "loss": 2.5715, + "theoretical_loss": 3.443458248542491, + "tokens_seen": 1911252992 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021261785356068206, + "loss": 2.5927, + "theoretical_loss": 3.443448315875712, + "tokens_seen": 1911318528 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021260782347041124, + "loss": 2.67, + "theoretical_loss": 3.443438383644859, + "tokens_seen": 1911384064 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021259779338014042, + "loss": 2.3489, + "theoretical_loss": 3.4434284518498983, + "tokens_seen": 1911449600 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021258776328986963, + "loss": 2.3388, + "theoretical_loss": 3.443418520490796, + "tokens_seen": 1911515136 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002125777331995988, + "loss": 2.6743, + "theoretical_loss": 3.443408589567518, + "tokens_seen": 1911580672 + }, + { + "epoch": 6.04, + "learning_rate": 0.000212567703109328, + "loss": 2.4647, + "theoretical_loss": 3.44339865908003, + "tokens_seen": 1911646208 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021255767301905717, + "loss": 2.5062, + "theoretical_loss": 3.443388729028298, + "tokens_seen": 1911711744 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021254764292878638, + "loss": 2.2315, + "theoretical_loss": 3.443378799412288, + "tokens_seen": 1911777280 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021253761283851556, + "loss": 2.6059, + "theoretical_loss": 3.443368870231966, + "tokens_seen": 1911842816 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021252758274824474, + "loss": 2.7828, + "theoretical_loss": 3.443358941487298, + "tokens_seen": 1911908352 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021251755265797392, + "loss": 2.6044, + "theoretical_loss": 3.4433490131782496, + "tokens_seen": 1911973888 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2135283, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6463258266448975, + "objective/train/theoretical_loss": 3.443344049187072, + "objective/train/tokens_used": 1932466656, + "theoretical_loss": 3.443344049187072, + "tokens_seen": 1912006656 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002125075225677031, + "loss": 2.6323, + "theoretical_loss": 3.443339085304787, + "tokens_seen": 1912039424 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002124974924774323, + "loss": 2.5522, + "theoretical_loss": 3.4433291578668763, + "tokens_seen": 1912104960 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002124874623871615, + "loss": 2.6556, + "theoretical_loss": 3.443319230864483, + "tokens_seen": 1912170496 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021247743229689067, + "loss": 2.4075, + "theoretical_loss": 3.443309304297574, + "tokens_seen": 1912236032 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021246740220661985, + "loss": 2.3965, + "theoretical_loss": 3.443299378166114, + "tokens_seen": 1912301568 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021245737211634906, + "loss": 2.8406, + "theoretical_loss": 3.44328945247007, + "tokens_seen": 1912367104 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021244734202607824, + "loss": 2.6477, + "theoretical_loss": 3.443279527209407, + "tokens_seen": 1912432640 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021243731193580742, + "loss": 2.7571, + "theoretical_loss": 3.4432696023840927, + "tokens_seen": 1912498176 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002124272818455366, + "loss": 2.5266, + "theoretical_loss": 3.4432596779940914, + "tokens_seen": 1912563712 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002124172517552658, + "loss": 2.5344, + "theoretical_loss": 3.4432497540393694, + "tokens_seen": 1912629248 + }, + { + "epoch": 6.04, + "learning_rate": 0.000212407221664995, + "loss": 2.6078, + "theoretical_loss": 3.443239830519893, + "tokens_seen": 1912694784 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021239719157472418, + "loss": 2.7832, + "theoretical_loss": 3.4432299074356285, + "tokens_seen": 1912760320 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021238716148445336, + "loss": 2.4363, + "theoretical_loss": 3.4432199847865417, + "tokens_seen": 1912825856 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021237713139418254, + "loss": 2.7445, + "theoretical_loss": 3.4432100625725983, + "tokens_seen": 1912891392 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021236710130391175, + "loss": 2.5122, + "theoretical_loss": 3.4432001407937642, + "tokens_seen": 1912956928 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021235707121364093, + "loss": 2.5649, + "theoretical_loss": 3.443190219450006, + "tokens_seen": 1913022464 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002123470411233701, + "loss": 2.5215, + "theoretical_loss": 3.4431802985412894, + "tokens_seen": 1913088000 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002123370110330993, + "loss": 2.6939, + "theoretical_loss": 3.44317037806758, + "tokens_seen": 1913153536 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002123269809428285, + "loss": 2.7673, + "theoretical_loss": 3.443160458028845, + "tokens_seen": 1913219072 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021231695085255768, + "loss": 2.6578, + "theoretical_loss": 3.4431505384250496, + "tokens_seen": 1913284608 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021230692076228686, + "loss": 2.6449, + "theoretical_loss": 3.4431406192561593, + "tokens_seen": 1913350144 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021229689067201604, + "loss": 2.7855, + "theoretical_loss": 3.443130700522141, + "tokens_seen": 1913415680 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021228686058174522, + "loss": 2.2751, + "theoretical_loss": 3.443120782222961, + "tokens_seen": 1913481216 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021227683049147443, + "loss": 2.7815, + "theoretical_loss": 3.4431108643585846, + "tokens_seen": 1913546752 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002122668004012036, + "loss": 2.5509, + "theoretical_loss": 3.443100946928978, + "tokens_seen": 1913612288 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2136657, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6412930488586426, + "objective/train/theoretical_loss": 3.4430959883772028, + "objective/train/tokens_used": 1934105056, + "theoretical_loss": 3.4430959883772028, + "tokens_seen": 1913645056 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002122567703109328, + "loss": 2.5236, + "theoretical_loss": 3.4430910299341075, + "tokens_seen": 1913677824 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021224674022066197, + "loss": 2.625, + "theoretical_loss": 3.4430811133739385, + "tokens_seen": 1913743360 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021223671013039118, + "loss": 2.6394, + "theoretical_loss": 3.443071197248438, + "tokens_seen": 1913808896 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021222668004012036, + "loss": 2.5659, + "theoretical_loss": 3.443061281557571, + "tokens_seen": 1913874432 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021221664994984954, + "loss": 2.4365, + "theoretical_loss": 3.4430513663013054, + "tokens_seen": 1913939968 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021220661985957873, + "loss": 2.6697, + "theoretical_loss": 3.4430414514796053, + "tokens_seen": 1914005504 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002121965897693079, + "loss": 2.5515, + "theoretical_loss": 3.4430315370924376, + "tokens_seen": 1914071040 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021218655967903712, + "loss": 2.6889, + "theoretical_loss": 3.4430216231397686, + "tokens_seen": 1914136576 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002121765295887663, + "loss": 2.4574, + "theoretical_loss": 3.4430117096215636, + "tokens_seen": 1914202112 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021216649949849548, + "loss": 2.5316, + "theoretical_loss": 3.4430017965377897, + "tokens_seen": 1914267648 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021215646940822466, + "loss": 2.6619, + "theoretical_loss": 3.4429918838884124, + "tokens_seen": 1914333184 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021214643931795387, + "loss": 2.3557, + "theoretical_loss": 3.4429819716733974, + "tokens_seen": 1914398720 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021213640922768305, + "loss": 2.4413, + "theoretical_loss": 3.442972059892712, + "tokens_seen": 1914464256 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021212637913741223, + "loss": 2.6237, + "theoretical_loss": 3.442962148546321, + "tokens_seen": 1914529792 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002121163490471414, + "loss": 2.7625, + "theoretical_loss": 3.4429522376341914, + "tokens_seen": 1914595328 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002121063189568706, + "loss": 2.6155, + "theoretical_loss": 3.442942327156289, + "tokens_seen": 1914660864 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002120962888665998, + "loss": 2.4318, + "theoretical_loss": 3.44293241711258, + "tokens_seen": 1914726400 + }, + { + "epoch": 6.04, + "learning_rate": 0.000212086258776329, + "loss": 2.4692, + "theoretical_loss": 3.44292250750303, + "tokens_seen": 1914791936 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002120762286860582, + "loss": 2.6081, + "theoretical_loss": 3.442912598327606, + "tokens_seen": 1914857472 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021206619859578737, + "loss": 2.647, + "theoretical_loss": 3.442902689586273, + "tokens_seen": 1914923008 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021205616850551658, + "loss": 2.6262, + "theoretical_loss": 3.4428927812789984, + "tokens_seen": 1914988544 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021204613841524576, + "loss": 2.5385, + "theoretical_loss": 3.4428828734057477, + "tokens_seen": 1915054080 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021203610832497494, + "loss": 2.3548, + "theoretical_loss": 3.4428729659664867, + "tokens_seen": 1915119616 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021202607823470412, + "loss": 2.7113, + "theoretical_loss": 3.442863058961182, + "tokens_seen": 1915185152 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002120160481444333, + "loss": 2.8845, + "theoretical_loss": 3.4428531523898, + "tokens_seen": 1915250688 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2137318, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6103193759918213, + "objective/train/theoretical_loss": 3.442848199266819, + "objective/train/tokens_used": 1935743456, + "theoretical_loss": 3.442848199266819, + "tokens_seen": 1915283456 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002120060180541625, + "loss": 2.4608, + "theoretical_loss": 3.4428432462523064, + "tokens_seen": 1915316224 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002119959879638917, + "loss": 2.3933, + "theoretical_loss": 3.4428333405486673, + "tokens_seen": 1915381760 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021198595787362087, + "loss": 2.5476, + "theoretical_loss": 3.442823435278849, + "tokens_seen": 1915447296 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021197592778335005, + "loss": 2.6893, + "theoretical_loss": 3.442813530442818, + "tokens_seen": 1915512832 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021196589769307926, + "loss": 2.7952, + "theoretical_loss": 3.4428036260405395, + "tokens_seen": 1915578368 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021195586760280844, + "loss": 2.5242, + "theoretical_loss": 3.442793722071981, + "tokens_seen": 1915643904 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021194583751253762, + "loss": 2.3479, + "theoretical_loss": 3.442783818537107, + "tokens_seen": 1915709440 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002119358074222668, + "loss": 2.4773, + "theoretical_loss": 3.442773915435885, + "tokens_seen": 1915774976 + }, + { + "epoch": 6.04, + "learning_rate": 0.000211925777331996, + "loss": 2.3247, + "theoretical_loss": 3.442764012768281, + "tokens_seen": 1915840512 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002119157472417252, + "loss": 2.6237, + "theoretical_loss": 3.4427541105342616, + "tokens_seen": 1915906048 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021190571715145438, + "loss": 2.7649, + "theoretical_loss": 3.4427442087337914, + "tokens_seen": 1915971584 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021189568706118356, + "loss": 2.7409, + "theoretical_loss": 3.442734307366838, + "tokens_seen": 1916037120 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021188565697091274, + "loss": 2.5778, + "theoretical_loss": 3.442724406433367, + "tokens_seen": 1916102656 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021187562688064195, + "loss": 2.4944, + "theoretical_loss": 3.442714505933345, + "tokens_seen": 1916168192 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021186559679037113, + "loss": 2.4815, + "theoretical_loss": 3.442704605866738, + "tokens_seen": 1916233728 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002118555667001003, + "loss": 2.5061, + "theoretical_loss": 3.442694706233512, + "tokens_seen": 1916299264 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002118455366098295, + "loss": 2.6916, + "theoretical_loss": 3.4426848070336336, + "tokens_seen": 1916364800 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002118355065195587, + "loss": 2.6997, + "theoretical_loss": 3.442674908267068, + "tokens_seen": 1916430336 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021182547642928788, + "loss": 2.4708, + "theoretical_loss": 3.442665009933783, + "tokens_seen": 1916495872 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021181544633901706, + "loss": 2.4316, + "theoretical_loss": 3.4426551120337434, + "tokens_seen": 1916561408 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021180541624874624, + "loss": 2.3108, + "theoretical_loss": 3.442645214566917, + "tokens_seen": 1916626944 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021179538615847542, + "loss": 2.7395, + "theoretical_loss": 3.4426353175332682, + "tokens_seen": 1916692480 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021178535606820463, + "loss": 2.278, + "theoretical_loss": 3.4426254209327642, + "tokens_seen": 1916758016 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002117753259779338, + "loss": 2.6222, + "theoretical_loss": 3.4426155247653716, + "tokens_seen": 1916823552 + }, + { + "epoch": 6.04, + "learning_rate": 0.000211765295887663, + "loss": 2.6624, + "theoretical_loss": 3.442605629031056, + "tokens_seen": 1916889088 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2138605, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5397114753723145, + "objective/train/theoretical_loss": 3.442600681326291, + "objective/train/tokens_used": 1937381856, + "theoretical_loss": 3.442600681326291, + "tokens_seen": 1916921856 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021175526579739217, + "loss": 2.5679, + "theoretical_loss": 3.4425957337297834, + "tokens_seen": 1916954624 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021174523570712138, + "loss": 2.5898, + "theoretical_loss": 3.442585838861521, + "tokens_seen": 1917020160 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021173520561685056, + "loss": 2.8369, + "theoretical_loss": 3.442575944426234, + "tokens_seen": 1917085696 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021172517552657974, + "loss": 2.5778, + "theoretical_loss": 3.4425660504238893, + "tokens_seen": 1917151232 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021171514543630893, + "loss": 2.5444, + "theoretical_loss": 3.4425561568544536, + "tokens_seen": 1917216768 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002117051153460381, + "loss": 2.7787, + "theoretical_loss": 3.442546263717892, + "tokens_seen": 1917282304 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021169508525576732, + "loss": 2.6662, + "theoretical_loss": 3.4425363710141714, + "tokens_seen": 1917347840 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002116850551654965, + "loss": 2.397, + "theoretical_loss": 3.4425264787432583, + "tokens_seen": 1917413376 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021167502507522568, + "loss": 2.6797, + "theoretical_loss": 3.4425165869051186, + "tokens_seen": 1917478912 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021166499498495486, + "loss": 2.6077, + "theoretical_loss": 3.442506695499718, + "tokens_seen": 1917544448 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021165496489468407, + "loss": 2.5679, + "theoretical_loss": 3.4424968045270243, + "tokens_seen": 1917609984 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021164493480441325, + "loss": 2.4198, + "theoretical_loss": 3.4424869139870027, + "tokens_seen": 1917675520 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021163490471414243, + "loss": 2.4957, + "theoretical_loss": 3.44247702387962, + "tokens_seen": 1917741056 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002116248746238716, + "loss": 2.4397, + "theoretical_loss": 3.4424671342048416, + "tokens_seen": 1917806592 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002116148445336008, + "loss": 2.6285, + "theoretical_loss": 3.4424572449626343, + "tokens_seen": 1917872128 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021160481444333, + "loss": 2.5311, + "theoretical_loss": 3.4424473561529654, + "tokens_seen": 1917937664 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021159478435305918, + "loss": 2.4247, + "theoretical_loss": 3.4424374677757994, + "tokens_seen": 1918003200 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021158475426278836, + "loss": 2.4115, + "theoretical_loss": 3.4424275798311044, + "tokens_seen": 1918068736 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021157472417251754, + "loss": 2.5981, + "theoretical_loss": 3.4424176923188448, + "tokens_seen": 1918134272 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021156469408224675, + "loss": 2.6293, + "theoretical_loss": 3.4424078052389886, + "tokens_seen": 1918199808 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021155466399197593, + "loss": 2.6188, + "theoretical_loss": 3.4423979185915012, + "tokens_seen": 1918265344 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002115446339017051, + "loss": 2.5823, + "theoretical_loss": 3.4423880323763494, + "tokens_seen": 1918330880 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002115346038114343, + "loss": 2.614, + "theoretical_loss": 3.4423781465934993, + "tokens_seen": 1918396416 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021152457372116348, + "loss": 2.5076, + "theoretical_loss": 3.442368261242917, + "tokens_seen": 1918461952 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021151454363089268, + "loss": 2.8328, + "theoretical_loss": 3.4423583763245693, + "tokens_seen": 1918527488 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2139381, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5007266998291016, + "objective/train/theoretical_loss": 3.4423534340274724, + "objective/train/tokens_used": 1939020256, + "theoretical_loss": 3.4423534340274724, + "tokens_seen": 1918560256 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021150451354062186, + "loss": 2.4914, + "theoretical_loss": 3.442348491838422, + "tokens_seen": 1918593024 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021149448345035105, + "loss": 2.5321, + "theoretical_loss": 3.442338607784442, + "tokens_seen": 1918658560 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021148445336008023, + "loss": 2.5197, + "theoretical_loss": 3.4423287241625955, + "tokens_seen": 1918724096 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021147442326980944, + "loss": 2.7856, + "theoretical_loss": 3.442318840972848, + "tokens_seen": 1918789632 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021146439317953862, + "loss": 2.6713, + "theoretical_loss": 3.4423089582151674, + "tokens_seen": 1918855168 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002114543630892678, + "loss": 2.6522, + "theoretical_loss": 3.442299075889519, + "tokens_seen": 1918920704 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021144433299899698, + "loss": 2.5145, + "theoretical_loss": 3.4422891939958693, + "tokens_seen": 1918986240 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021143430290872616, + "loss": 2.6871, + "theoretical_loss": 3.442279312534185, + "tokens_seen": 1919051776 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021142427281845537, + "loss": 2.7518, + "theoretical_loss": 3.4422694315044318, + "tokens_seen": 1919117312 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021141424272818455, + "loss": 2.5325, + "theoretical_loss": 3.442259550906577, + "tokens_seen": 1919182848 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021140421263791373, + "loss": 2.6424, + "theoretical_loss": 3.442249670740586, + "tokens_seen": 1919248384 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002113941825476429, + "loss": 2.7161, + "theoretical_loss": 3.442239791006426, + "tokens_seen": 1919313920 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021138415245737212, + "loss": 2.5393, + "theoretical_loss": 3.4422299117040627, + "tokens_seen": 1919379456 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002113741223671013, + "loss": 2.4392, + "theoretical_loss": 3.442220032833463, + "tokens_seen": 1919444992 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021136409227683048, + "loss": 2.4757, + "theoretical_loss": 3.4422101543945933, + "tokens_seen": 1919510528 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021135406218655966, + "loss": 2.5876, + "theoretical_loss": 3.4422002763874198, + "tokens_seen": 1919576064 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021134403209628884, + "loss": 2.5301, + "theoretical_loss": 3.442190398811909, + "tokens_seen": 1919641600 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021133400200601808, + "loss": 2.5432, + "theoretical_loss": 3.4421805216680266, + "tokens_seen": 1919707136 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021132397191574726, + "loss": 2.5012, + "theoretical_loss": 3.4421706449557403, + "tokens_seen": 1919772672 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021131394182547644, + "loss": 2.4766, + "theoretical_loss": 3.4421607686750155, + "tokens_seen": 1919838208 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021130391173520562, + "loss": 2.524, + "theoretical_loss": 3.4421508928258184, + "tokens_seen": 1919903744 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021129388164493483, + "loss": 2.6931, + "theoretical_loss": 3.4421410174081166, + "tokens_seen": 1919969280 + }, + { + "epoch": 6.04, + "learning_rate": 0.000211283851554664, + "loss": 2.4682, + "theoretical_loss": 3.442131142421876, + "tokens_seen": 1920034816 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002112738214643932, + "loss": 2.709, + "theoretical_loss": 3.442121267867062, + "tokens_seen": 1920100352 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021126379137412237, + "loss": 2.7133, + "theoretical_loss": 3.4421113937436427, + "tokens_seen": 1920165888 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2140906, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5126380920410156, + "objective/train/theoretical_loss": 3.442106456843695, + "objective/train/tokens_used": 1940658656, + "theoretical_loss": 3.442106456843695, + "tokens_seen": 1920198656 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021125376128385158, + "loss": 2.4795, + "theoretical_loss": 3.4421015200515837, + "tokens_seen": 1920231424 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021124373119358076, + "loss": 2.7511, + "theoretical_loss": 3.442091646790851, + "tokens_seen": 1920296960 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021123370110330994, + "loss": 2.8517, + "theoretical_loss": 3.4420817739614122, + "tokens_seen": 1920362496 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021122367101303913, + "loss": 2.6595, + "theoretical_loss": 3.4420719015632324, + "tokens_seen": 1920428032 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002112136409227683, + "loss": 2.6039, + "theoretical_loss": 3.442062029596279, + "tokens_seen": 1920493568 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021120361083249752, + "loss": 2.5357, + "theoretical_loss": 3.442052158060518, + "tokens_seen": 1920559104 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002111935807422267, + "loss": 2.6753, + "theoretical_loss": 3.4420422869559166, + "tokens_seen": 1920624640 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021118355065195588, + "loss": 2.5999, + "theoretical_loss": 3.44203241628244, + "tokens_seen": 1920690176 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021117352056168506, + "loss": 2.3997, + "theoretical_loss": 3.4420225460400555, + "tokens_seen": 1920755712 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021116349047141427, + "loss": 2.5572, + "theoretical_loss": 3.4420126762287295, + "tokens_seen": 1920821248 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021115346038114345, + "loss": 2.5949, + "theoretical_loss": 3.4420028068484285, + "tokens_seen": 1920886784 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021114343029087263, + "loss": 2.4626, + "theoretical_loss": 3.4419929378991183, + "tokens_seen": 1920952320 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002111334002006018, + "loss": 2.6551, + "theoretical_loss": 3.4419830693807665, + "tokens_seen": 1921017856 + }, + { + "epoch": 6.04, + "learning_rate": 0.000211123370110331, + "loss": 2.6158, + "theoretical_loss": 3.4419732012933384, + "tokens_seen": 1921083392 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002111133400200602, + "loss": 2.6586, + "theoretical_loss": 3.441963333636801, + "tokens_seen": 1921148928 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021110330992978938, + "loss": 2.8771, + "theoretical_loss": 3.441953466411122, + "tokens_seen": 1921214464 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021109327983951856, + "loss": 2.722, + "theoretical_loss": 3.4419435996162653, + "tokens_seen": 1921280000 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021108324974924774, + "loss": 2.5652, + "theoretical_loss": 3.4419337332522, + "tokens_seen": 1921345536 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021107321965897695, + "loss": 2.4028, + "theoretical_loss": 3.4419238673188906, + "tokens_seen": 1921411072 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021106318956870613, + "loss": 2.5772, + "theoretical_loss": 3.441914001816305, + "tokens_seen": 1921476608 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002110531594784353, + "loss": 2.7018, + "theoretical_loss": 3.4419041367444088, + "tokens_seen": 1921542144 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002110431293881645, + "loss": 2.6243, + "theoretical_loss": 3.441894272103169, + "tokens_seen": 1921607680 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021103309929789368, + "loss": 2.6382, + "theoretical_loss": 3.441884407892552, + "tokens_seen": 1921673216 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021102306920762288, + "loss": 2.4433, + "theoretical_loss": 3.4418745441125242, + "tokens_seen": 1921738752 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021101303911735207, + "loss": 2.6511, + "theoretical_loss": 3.4418646807630524, + "tokens_seen": 1921804288 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2141572, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2893829345703125, + "objective/train/theoretical_loss": 3.4418597492497645, + "objective/train/tokens_used": 1942297056, + "theoretical_loss": 3.4418597492497645, + "tokens_seen": 1921837056 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021100300902708125, + "loss": 2.6274, + "theoretical_loss": 3.441854817844103, + "tokens_seen": 1921869824 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021099297893681043, + "loss": 2.4522, + "theoretical_loss": 3.4418449553556423, + "tokens_seen": 1921935360 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021098294884653964, + "loss": 2.544, + "theoretical_loss": 3.4418350932976365, + "tokens_seen": 1922000896 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021097291875626882, + "loss": 2.4978, + "theoretical_loss": 3.4418252316700535, + "tokens_seen": 1922066432 + }, + { + "epoch": 6.04, + "learning_rate": 0.000210962888665998, + "loss": 2.6331, + "theoretical_loss": 3.4418153704728587, + "tokens_seen": 1922131968 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021095285857572718, + "loss": 2.5549, + "theoretical_loss": 3.4418055097060183, + "tokens_seen": 1922197504 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021094282848545636, + "loss": 2.4441, + "theoretical_loss": 3.4417956493695, + "tokens_seen": 1922263040 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021093279839518557, + "loss": 2.7071, + "theoretical_loss": 3.4417857894632697, + "tokens_seen": 1922328576 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021092276830491475, + "loss": 2.7811, + "theoretical_loss": 3.441775929987294, + "tokens_seen": 1922394112 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021091273821464393, + "loss": 2.597, + "theoretical_loss": 3.44176607094154, + "tokens_seen": 1922459648 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002109027081243731, + "loss": 2.5943, + "theoretical_loss": 3.4417562123259735, + "tokens_seen": 1922525184 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021089267803410232, + "loss": 2.7314, + "theoretical_loss": 3.4417463541405606, + "tokens_seen": 1922590720 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002108826479438315, + "loss": 2.6386, + "theoretical_loss": 3.4417364963852695, + "tokens_seen": 1922656256 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021087261785356068, + "loss": 2.5157, + "theoretical_loss": 3.441726639060066, + "tokens_seen": 1922721792 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021086258776328986, + "loss": 2.4737, + "theoretical_loss": 3.4417167821649164, + "tokens_seen": 1922787328 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021085255767301904, + "loss": 2.5918, + "theoretical_loss": 3.441706925699787, + "tokens_seen": 1922852864 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021084252758274825, + "loss": 2.5187, + "theoretical_loss": 3.441697069664645, + "tokens_seen": 1922918400 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021083249749247743, + "loss": 2.7332, + "theoretical_loss": 3.441687214059457, + "tokens_seen": 1922983936 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021082246740220661, + "loss": 2.7551, + "theoretical_loss": 3.4416773588841894, + "tokens_seen": 1923049472 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002108124373119358, + "loss": 2.5733, + "theoretical_loss": 3.4416675041388087, + "tokens_seen": 1923115008 + }, + { + "epoch": 6.04, + "learning_rate": 0.000210802407221665, + "loss": 2.5457, + "theoretical_loss": 3.4416576498232816, + "tokens_seen": 1923180544 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021079237713139419, + "loss": 2.6434, + "theoretical_loss": 3.441647795937575, + "tokens_seen": 1923246080 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021078234704112337, + "loss": 2.4311, + "theoretical_loss": 3.441637942481655, + "tokens_seen": 1923311616 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021077231695085255, + "loss": 2.6057, + "theoretical_loss": 3.4416280894554885, + "tokens_seen": 1923377152 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021076228686058176, + "loss": 2.7047, + "theoretical_loss": 3.441618236859042, + "tokens_seen": 1923442688 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2142911, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0328121185302734, + "objective/train/theoretical_loss": 3.4416133107219533, + "objective/train/tokens_used": 1943935456, + "theoretical_loss": 3.4416133107219533, + "tokens_seen": 1923475456 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021075225677031094, + "loss": 2.8594, + "theoretical_loss": 3.441608384692282, + "tokens_seen": 1923508224 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021074222668004012, + "loss": 2.4247, + "theoretical_loss": 3.4415985329551755, + "tokens_seen": 1923573760 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002107321965897693, + "loss": 2.6348, + "theoretical_loss": 3.441588681647689, + "tokens_seen": 1923639296 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021072216649949848, + "loss": 2.6823, + "theoretical_loss": 3.4415788307697888, + "tokens_seen": 1923704832 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002107121364092277, + "loss": 2.7193, + "theoretical_loss": 3.441568980321442, + "tokens_seen": 1923770368 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021070210631895687, + "loss": 2.8579, + "theoretical_loss": 3.4415591303026147, + "tokens_seen": 1923835904 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021069207622868605, + "loss": 2.5427, + "theoretical_loss": 3.441549280713274, + "tokens_seen": 1923901440 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021068204613841523, + "loss": 2.4648, + "theoretical_loss": 3.4415394315533865, + "tokens_seen": 1923966976 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021067201604814444, + "loss": 2.7024, + "theoretical_loss": 3.441529582822919, + "tokens_seen": 1924032512 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021066198595787362, + "loss": 2.5461, + "theoretical_loss": 3.441519734521837, + "tokens_seen": 1924098048 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002106519558676028, + "loss": 2.6536, + "theoretical_loss": 3.441509886650109, + "tokens_seen": 1924163584 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021064192577733198, + "loss": 2.7992, + "theoretical_loss": 3.4415000392076998, + "tokens_seen": 1924229120 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021063189568706116, + "loss": 2.4612, + "theoretical_loss": 3.4414901921945775, + "tokens_seen": 1924294656 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021062186559679037, + "loss": 2.7393, + "theoretical_loss": 3.4414803456107084, + "tokens_seen": 1924360192 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021061183550651955, + "loss": 2.3769, + "theoretical_loss": 3.441470499456059, + "tokens_seen": 1924425728 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021060180541624873, + "loss": 2.3313, + "theoretical_loss": 3.4414606537305956, + "tokens_seen": 1924491264 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021059177532597792, + "loss": 2.6389, + "theoretical_loss": 3.4414508084342854, + "tokens_seen": 1924556800 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021058174523570715, + "loss": 2.5137, + "theoretical_loss": 3.4414409635670955, + "tokens_seen": 1924622336 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021057171514543633, + "loss": 2.4464, + "theoretical_loss": 3.4414311191289912, + "tokens_seen": 1924687872 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021056168505516551, + "loss": 2.4134, + "theoretical_loss": 3.4414212751199402, + "tokens_seen": 1924753408 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002105516549648947, + "loss": 2.431, + "theoretical_loss": 3.441411431539909, + "tokens_seen": 1924818944 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021054162487462388, + "loss": 2.5168, + "theoretical_loss": 3.4414015883888647, + "tokens_seen": 1924884480 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021053159478435308, + "loss": 2.7616, + "theoretical_loss": 3.4413917456667735, + "tokens_seen": 1924950016 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021052156469408227, + "loss": 2.3986, + "theoretical_loss": 3.4413819033736024, + "tokens_seen": 1925015552 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021051153460381145, + "loss": 2.6501, + "theoretical_loss": 3.441372061509317, + "tokens_seen": 1925081088 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2143554, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7691996097564697, + "objective/train/theoretical_loss": 3.441367140737997, + "objective/train/tokens_used": 1945573856, + "theoretical_loss": 3.441367140737997, + "tokens_seen": 1925113856 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021050150451354063, + "loss": 2.7213, + "theoretical_loss": 3.441362220073886, + "tokens_seen": 1925146624 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021049147442326984, + "loss": 2.6249, + "theoretical_loss": 3.4413523790672746, + "tokens_seen": 1925212160 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021048144433299902, + "loss": 2.6448, + "theoretical_loss": 3.44134253848945, + "tokens_seen": 1925277696 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002104714142427282, + "loss": 2.7611, + "theoretical_loss": 3.4413326983403785, + "tokens_seen": 1925343232 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021046138415245738, + "loss": 2.643, + "theoretical_loss": 3.4413228586200275, + "tokens_seen": 1925408768 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021045135406218656, + "loss": 2.2684, + "theoretical_loss": 3.441313019328364, + "tokens_seen": 1925474304 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021044132397191577, + "loss": 2.5149, + "theoretical_loss": 3.4413031804653533, + "tokens_seen": 1925539840 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021043129388164495, + "loss": 2.6089, + "theoretical_loss": 3.4412933420309635, + "tokens_seen": 1925605376 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021042126379137413, + "loss": 2.5394, + "theoretical_loss": 3.4412835040251606, + "tokens_seen": 1925670912 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002104112337011033, + "loss": 2.6724, + "theoretical_loss": 3.4412736664479118, + "tokens_seen": 1925736448 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021040120361083252, + "loss": 2.5523, + "theoretical_loss": 3.4412638292991833, + "tokens_seen": 1925801984 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002103911735205617, + "loss": 2.7842, + "theoretical_loss": 3.4412539925789423, + "tokens_seen": 1925867520 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021038114343029088, + "loss": 2.5372, + "theoretical_loss": 3.4412441562871554, + "tokens_seen": 1925933056 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021037111334002006, + "loss": 2.5467, + "theoretical_loss": 3.4412343204237894, + "tokens_seen": 1925998592 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021036108324974924, + "loss": 2.6356, + "theoretical_loss": 3.441224484988812, + "tokens_seen": 1926064128 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021035105315947845, + "loss": 2.6457, + "theoretical_loss": 3.4412146499821876, + "tokens_seen": 1926129664 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021034102306920763, + "loss": 2.5854, + "theoretical_loss": 3.4412048154038852, + "tokens_seen": 1926195200 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021033099297893681, + "loss": 2.7145, + "theoretical_loss": 3.4411949812538705, + "tokens_seen": 1926260736 + }, + { + "epoch": 6.04, + "learning_rate": 0.000210320962888666, + "loss": 2.6121, + "theoretical_loss": 3.441185147532111, + "tokens_seen": 1926326272 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002103109327983952, + "loss": 2.7478, + "theoretical_loss": 3.441175314238573, + "tokens_seen": 1926391808 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021030090270812439, + "loss": 2.726, + "theoretical_loss": 3.441165481373223, + "tokens_seen": 1926457344 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021029087261785357, + "loss": 2.3945, + "theoretical_loss": 3.4411556489360278, + "tokens_seen": 1926522880 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021028084252758275, + "loss": 2.5259, + "theoretical_loss": 3.441145816926955, + "tokens_seen": 1926588416 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021027081243731196, + "loss": 2.6443, + "theoretical_loss": 3.441135985345971, + "tokens_seen": 1926653952 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021026078234704114, + "loss": 2.6598, + "theoretical_loss": 3.4411261541930425, + "tokens_seen": 1926719488 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2145185, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5906100273132324, + "objective/train/theoretical_loss": 3.4411212387770886, + "objective/train/tokens_used": 1947212256, + "theoretical_loss": 3.4411212387770886, + "tokens_seen": 1926752256 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021025075225677032, + "loss": 2.6586, + "theoretical_loss": 3.441116323468136, + "tokens_seen": 1926785024 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002102407221664995, + "loss": 2.8331, + "theoretical_loss": 3.441106493171219, + "tokens_seen": 1926850560 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021023069207622868, + "loss": 2.4305, + "theoretical_loss": 3.4410966633022575, + "tokens_seen": 1926916096 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002102206619859579, + "loss": 2.5863, + "theoretical_loss": 3.441086833861219, + "tokens_seen": 1926981632 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021021063189568707, + "loss": 2.5131, + "theoretical_loss": 3.4410770048480703, + "tokens_seen": 1927047168 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021020060180541625, + "loss": 2.6755, + "theoretical_loss": 3.441067176262777, + "tokens_seen": 1927112704 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021019057171514543, + "loss": 2.7491, + "theoretical_loss": 3.441057348105308, + "tokens_seen": 1927178240 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021018054162487464, + "loss": 2.6029, + "theoretical_loss": 3.441047520375629, + "tokens_seen": 1927243776 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021017051153460382, + "loss": 2.651, + "theoretical_loss": 3.4410376930737065, + "tokens_seen": 1927309312 + }, + { + "epoch": 6.04, + "learning_rate": 0.000210160481444333, + "loss": 2.6438, + "theoretical_loss": 3.4410278661995077, + "tokens_seen": 1927374848 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021015045135406218, + "loss": 2.5198, + "theoretical_loss": 3.4410180397529997, + "tokens_seen": 1927440384 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021014042126379136, + "loss": 2.6515, + "theoretical_loss": 3.4410082137341487, + "tokens_seen": 1927505920 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021013039117352057, + "loss": 2.7025, + "theoretical_loss": 3.440998388142922, + "tokens_seen": 1927571456 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021012036108324975, + "loss": 2.3011, + "theoretical_loss": 3.4409885629792867, + "tokens_seen": 1927636992 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021011033099297894, + "loss": 2.5601, + "theoretical_loss": 3.440978738243209, + "tokens_seen": 1927702528 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021010030090270812, + "loss": 2.6469, + "theoretical_loss": 3.4409689139346566, + "tokens_seen": 1927768064 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021009027081243732, + "loss": 2.4048, + "theoretical_loss": 3.440959090053595, + "tokens_seen": 1927833600 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002100802407221665, + "loss": 2.5455, + "theoretical_loss": 3.4409492665999926, + "tokens_seen": 1927899136 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002100702106318957, + "loss": 2.5466, + "theoretical_loss": 3.440939443573816, + "tokens_seen": 1927964672 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021006018054162487, + "loss": 2.4768, + "theoretical_loss": 3.440929620975031, + "tokens_seen": 1928030208 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021005015045135405, + "loss": 2.6794, + "theoretical_loss": 3.4409197988036055, + "tokens_seen": 1928095744 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021004012036108326, + "loss": 2.8031, + "theoretical_loss": 3.4409099770595057, + "tokens_seen": 1928161280 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021003009027081244, + "loss": 2.3028, + "theoretical_loss": 3.4409001557426993, + "tokens_seen": 1928226816 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021002006018054162, + "loss": 2.5863, + "theoretical_loss": 3.4408903348531528, + "tokens_seen": 1928292352 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002100100300902708, + "loss": 2.7364, + "theoretical_loss": 3.4408805143908325, + "tokens_seen": 1928357888 + }, + { + "epoch": 6.04, + "objective/train/docs_used": 2145969, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7385342121124268, + "objective/train/theoretical_loss": 3.440875604319872, + "objective/train/tokens_used": 1948850656, + "theoretical_loss": 3.440875604319872, + "tokens_seen": 1928390656 + }, + { + "epoch": 6.04, + "learning_rate": 0.00021, + "loss": 2.7216, + "theoretical_loss": 3.440870694355706, + "tokens_seen": 1928423424 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002099899699097292, + "loss": 2.652, + "theoretical_loss": 3.44086087474774, + "tokens_seen": 1928488960 + }, + { + "epoch": 6.04, + "learning_rate": 0.00020997993981945837, + "loss": 2.5123, + "theoretical_loss": 3.4408510555669016, + "tokens_seen": 1928554496 + }, + { + "epoch": 6.04, + "learning_rate": 0.00020996990972918755, + "loss": 2.6226, + "theoretical_loss": 3.4408412368131573, + "tokens_seen": 1928620032 + }, + { + "epoch": 6.04, + "learning_rate": 0.00020995987963891673, + "loss": 2.2272, + "theoretical_loss": 3.440831418486474, + "tokens_seen": 1928685568 + }, + { + "epoch": 6.04, + "learning_rate": 0.00020994984954864594, + "loss": 2.5028, + "theoretical_loss": 3.4408216005868195, + "tokens_seen": 1928751104 + }, + { + "epoch": 6.04, + "learning_rate": 0.00020993981945837512, + "loss": 2.6323, + "theoretical_loss": 3.44081178311416, + "tokens_seen": 1928816640 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002099297893681043, + "loss": 2.4337, + "theoretical_loss": 3.440801966068462, + "tokens_seen": 1928882176 + }, + { + "epoch": 6.04, + "learning_rate": 0.00020991975927783348, + "loss": 2.5209, + "theoretical_loss": 3.440792149449693, + "tokens_seen": 1928947712 + }, + { + "epoch": 6.04, + "learning_rate": 0.0002099097291875627, + "loss": 2.6697, + "theoretical_loss": 3.44078233325782, + "tokens_seen": 1929013248 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020989969909729187, + "loss": 2.5938, + "theoretical_loss": 3.44077251749281, + "tokens_seen": 1929078784 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020988966900702106, + "loss": 2.6197, + "theoretical_loss": 3.4407627021546294, + "tokens_seen": 1929144320 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020987963891675024, + "loss": 2.7746, + "theoretical_loss": 3.440752887243246, + "tokens_seen": 1929209856 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020986960882647942, + "loss": 2.3979, + "theoretical_loss": 3.4407430727586252, + "tokens_seen": 1929275392 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020985957873620863, + "loss": 2.4612, + "theoretical_loss": 3.4407332587007353, + "tokens_seen": 1929340928 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002098495486459378, + "loss": 2.6218, + "theoretical_loss": 3.440723445069543, + "tokens_seen": 1929406464 + }, + { + "epoch": 6.05, + "learning_rate": 0.000209839518555667, + "loss": 2.5661, + "theoretical_loss": 3.4407136318650156, + "tokens_seen": 1929472000 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002098294884653962, + "loss": 2.5982, + "theoretical_loss": 3.440703819087119, + "tokens_seen": 1929537536 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002098194583751254, + "loss": 2.8159, + "theoretical_loss": 3.4406940067358214, + "tokens_seen": 1929603072 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020980942828485459, + "loss": 2.6278, + "theoretical_loss": 3.4406841948110882, + "tokens_seen": 1929668608 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020979939819458377, + "loss": 2.6047, + "theoretical_loss": 3.440674383312888, + "tokens_seen": 1929734144 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020978936810431295, + "loss": 2.4029, + "theoretical_loss": 3.440664572241187, + "tokens_seen": 1929799680 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020977933801404216, + "loss": 2.4926, + "theoretical_loss": 3.4406547615959524, + "tokens_seen": 1929865216 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020976930792377134, + "loss": 2.4986, + "theoretical_loss": 3.440644951377151, + "tokens_seen": 1929930752 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020975927783350052, + "loss": 2.569, + "theoretical_loss": 3.4406351415847496, + "tokens_seen": 1929996288 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2147340, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.811751365661621, + "objective/train/theoretical_loss": 3.4406302368484387, + "objective/train/tokens_used": 1950489056, + "theoretical_loss": 3.4406302368484387, + "tokens_seen": 1930029056 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002097492477432297, + "loss": 2.805, + "theoretical_loss": 3.4406253322187155, + "tokens_seen": 1930061824 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020973921765295888, + "loss": 2.5049, + "theoretical_loss": 3.4406155232790154, + "tokens_seen": 1930127360 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002097291875626881, + "loss": 2.5907, + "theoretical_loss": 3.440605714765617, + "tokens_seen": 1930192896 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020971915747241727, + "loss": 2.5735, + "theoretical_loss": 3.440595906678486, + "tokens_seen": 1930258432 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020970912738214645, + "loss": 2.626, + "theoretical_loss": 3.440586099017591, + "tokens_seen": 1930323968 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020969909729187563, + "loss": 2.5422, + "theoretical_loss": 3.4405762917828984, + "tokens_seen": 1930389504 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020968906720160484, + "loss": 2.4342, + "theoretical_loss": 3.4405664849743745, + "tokens_seen": 1930455040 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020967903711133402, + "loss": 2.8049, + "theoretical_loss": 3.440556678591987, + "tokens_seen": 1930520576 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002096690070210632, + "loss": 2.6348, + "theoretical_loss": 3.4405468726357027, + "tokens_seen": 1930586112 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020965897693079238, + "loss": 2.4153, + "theoretical_loss": 3.4405370671054882, + "tokens_seen": 1930651648 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020964894684052156, + "loss": 2.5765, + "theoretical_loss": 3.4405272620013116, + "tokens_seen": 1930717184 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020963891675025077, + "loss": 2.5259, + "theoretical_loss": 3.440517457323139, + "tokens_seen": 1930782720 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020962888665997995, + "loss": 2.6444, + "theoretical_loss": 3.440507653070938, + "tokens_seen": 1930848256 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020961885656970914, + "loss": 2.2375, + "theoretical_loss": 3.4404978492446756, + "tokens_seen": 1930913792 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020960882647943832, + "loss": 2.5185, + "theoretical_loss": 3.4404880458443183, + "tokens_seen": 1930979328 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020959879638916752, + "loss": 2.6158, + "theoretical_loss": 3.4404782428698337, + "tokens_seen": 1931044864 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002095887662988967, + "loss": 2.369, + "theoretical_loss": 3.4404684403211885, + "tokens_seen": 1931110400 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002095787362086259, + "loss": 2.6039, + "theoretical_loss": 3.4404586381983497, + "tokens_seen": 1931175936 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020956870611835507, + "loss": 2.5056, + "theoretical_loss": 3.440448836501285, + "tokens_seen": 1931241472 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020955867602808425, + "loss": 2.415, + "theoretical_loss": 3.4404390352299608, + "tokens_seen": 1931307008 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020954864593781346, + "loss": 2.6963, + "theoretical_loss": 3.440429234384344, + "tokens_seen": 1931372544 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020953861584754264, + "loss": 2.5228, + "theoretical_loss": 3.4404194339644025, + "tokens_seen": 1931438080 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020952858575727182, + "loss": 2.7523, + "theoretical_loss": 3.440409633970103, + "tokens_seen": 1931503616 + }, + { + "epoch": 6.05, + "learning_rate": 0.000209518555667001, + "loss": 2.5516, + "theoretical_loss": 3.440399834401412, + "tokens_seen": 1931569152 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002095085255767302, + "loss": 2.6377, + "theoretical_loss": 3.440390035258297, + "tokens_seen": 1931634688 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2147892, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.423159599304199, + "objective/train/theoretical_loss": 3.4403851358463204, + "objective/train/tokens_used": 1952127456, + "theoretical_loss": 3.4403851358463204, + "tokens_seen": 1931667456 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002094984954864594, + "loss": 2.557, + "theoretical_loss": 3.4403802365407254, + "tokens_seen": 1931700224 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020948846539618857, + "loss": 2.4443, + "theoretical_loss": 3.4403704382486637, + "tokens_seen": 1931765760 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020947843530591775, + "loss": 2.5366, + "theoretical_loss": 3.44036064038208, + "tokens_seen": 1931831296 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020946840521564693, + "loss": 2.4368, + "theoretical_loss": 3.4403508429409397, + "tokens_seen": 1931896832 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020945837512537614, + "loss": 2.6385, + "theoretical_loss": 3.4403410459252117, + "tokens_seen": 1931962368 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020944834503510532, + "loss": 2.537, + "theoretical_loss": 3.4403312493348617, + "tokens_seen": 1932027904 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002094383149448345, + "loss": 2.6312, + "theoretical_loss": 3.440321453169858, + "tokens_seen": 1932093440 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020942828485456368, + "loss": 2.4538, + "theoretical_loss": 3.4403116574301666, + "tokens_seen": 1932158976 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002094182547642929, + "loss": 2.495, + "theoretical_loss": 3.4403018621157555, + "tokens_seen": 1932224512 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020940822467402207, + "loss": 2.6151, + "theoretical_loss": 3.440292067226591, + "tokens_seen": 1932290048 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020939819458375126, + "loss": 2.6355, + "theoretical_loss": 3.4402822727626408, + "tokens_seen": 1932355584 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020938816449348044, + "loss": 2.6948, + "theoretical_loss": 3.4402724787238714, + "tokens_seen": 1932421120 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020937813440320962, + "loss": 2.4739, + "theoretical_loss": 3.440262685110251, + "tokens_seen": 1932486656 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020936810431293883, + "loss": 2.5553, + "theoretical_loss": 3.4402528919217463, + "tokens_seen": 1932552192 + }, + { + "epoch": 6.05, + "learning_rate": 0.000209358074222668, + "loss": 2.769, + "theoretical_loss": 3.4402430991583235, + "tokens_seen": 1932617728 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002093480441323972, + "loss": 2.7608, + "theoretical_loss": 3.440233306819951, + "tokens_seen": 1932683264 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020933801404212637, + "loss": 2.6196, + "theoretical_loss": 3.440223514906595, + "tokens_seen": 1932748800 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020932798395185558, + "loss": 2.3103, + "theoretical_loss": 3.440213723418223, + "tokens_seen": 1932814336 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020931795386158476, + "loss": 2.742, + "theoretical_loss": 3.4402039323548026, + "tokens_seen": 1932879872 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020930792377131394, + "loss": 2.4558, + "theoretical_loss": 3.4401941417163, + "tokens_seen": 1932945408 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020929789368104312, + "loss": 2.5755, + "theoretical_loss": 3.4401843515026833, + "tokens_seen": 1933010944 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002092878635907723, + "loss": 2.6017, + "theoretical_loss": 3.4401745617139197, + "tokens_seen": 1933076480 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002092778335005015, + "loss": 2.589, + "theoretical_loss": 3.4401647723499753, + "tokens_seen": 1933142016 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002092678034102307, + "loss": 2.6021, + "theoretical_loss": 3.440154983410818, + "tokens_seen": 1933207552 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020925777331995987, + "loss": 2.6329, + "theoretical_loss": 3.440145194896415, + "tokens_seen": 1933273088 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2147894, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4973556995391846, + "objective/train/theoretical_loss": 3.4401403007984857, + "objective/train/tokens_used": 1953765856, + "theoretical_loss": 3.4401403007984857, + "tokens_seen": 1933305856 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020924774322968905, + "loss": 2.4387, + "theoretical_loss": 3.440135406806733, + "tokens_seen": 1933338624 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020923771313941826, + "loss": 2.6543, + "theoretical_loss": 3.4401256191417393, + "tokens_seen": 1933404160 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020922768304914744, + "loss": 2.7327, + "theoretical_loss": 3.4401158319014016, + "tokens_seen": 1933469696 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020921765295887662, + "loss": 2.3645, + "theoretical_loss": 3.440106045085687, + "tokens_seen": 1933535232 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002092076228686058, + "loss": 2.7117, + "theoretical_loss": 3.4400962586945623, + "tokens_seen": 1933600768 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020919759277833499, + "loss": 2.6313, + "theoretical_loss": 3.4400864727279945, + "tokens_seen": 1933666304 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002091875626880642, + "loss": 2.5653, + "theoretical_loss": 3.440076687185951, + "tokens_seen": 1933731840 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020917753259779338, + "loss": 2.4912, + "theoretical_loss": 3.4400669020683994, + "tokens_seen": 1933797376 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020916750250752256, + "loss": 2.496, + "theoretical_loss": 3.440057117375307, + "tokens_seen": 1933862912 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020915747241725174, + "loss": 2.5372, + "theoretical_loss": 3.44004733310664, + "tokens_seen": 1933928448 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020914744232698095, + "loss": 2.5113, + "theoretical_loss": 3.4400375492623665, + "tokens_seen": 1933993984 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020913741223671013, + "loss": 2.4574, + "theoretical_loss": 3.440027765842454, + "tokens_seen": 1934059520 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002091273821464393, + "loss": 2.4382, + "theoretical_loss": 3.4400179828468684, + "tokens_seen": 1934125056 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002091173520561685, + "loss": 2.4487, + "theoretical_loss": 3.440008200275578, + "tokens_seen": 1934190592 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002091073219658977, + "loss": 2.6795, + "theoretical_loss": 3.4399984181285497, + "tokens_seen": 1934256128 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020909729187562688, + "loss": 2.7902, + "theoretical_loss": 3.4399886364057504, + "tokens_seen": 1934321664 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020908726178535606, + "loss": 2.4728, + "theoretical_loss": 3.4399788551071477, + "tokens_seen": 1934387200 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020907723169508527, + "loss": 2.4983, + "theoretical_loss": 3.4399690742327094, + "tokens_seen": 1934452736 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020906720160481445, + "loss": 2.714, + "theoretical_loss": 3.4399592937824015, + "tokens_seen": 1934518272 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020905717151454366, + "loss": 2.5056, + "theoretical_loss": 3.439949513756192, + "tokens_seen": 1934583808 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020904714142427284, + "loss": 2.6112, + "theoretical_loss": 3.4399397341540485, + "tokens_seen": 1934649344 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020903711133400202, + "loss": 2.7685, + "theoretical_loss": 3.439929954975937, + "tokens_seen": 1934714880 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002090270812437312, + "loss": 2.5141, + "theoretical_loss": 3.439920176221826, + "tokens_seen": 1934780416 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002090170511534604, + "loss": 2.5669, + "theoretical_loss": 3.439910397891682, + "tokens_seen": 1934845952 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002090070210631896, + "loss": 2.5517, + "theoretical_loss": 3.4399006199854725, + "tokens_seen": 1934911488 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2147894, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3182151317596436, + "objective/train/theoretical_loss": 3.439895731191333, + "objective/train/tokens_used": 1955404256, + "theoretical_loss": 3.439895731191333, + "tokens_seen": 1934944256 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020899699097291877, + "loss": 2.6756, + "theoretical_loss": 3.439890842503165, + "tokens_seen": 1934977024 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020898696088264795, + "loss": 2.4257, + "theoretical_loss": 3.4398810654447267, + "tokens_seen": 1935042560 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020897693079237713, + "loss": 2.4943, + "theoretical_loss": 3.4398712888101244, + "tokens_seen": 1935108096 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020896690070210634, + "loss": 2.6024, + "theoretical_loss": 3.4398615125993257, + "tokens_seen": 1935173632 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020895687061183552, + "loss": 2.5038, + "theoretical_loss": 3.4398517368122983, + "tokens_seen": 1935239168 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002089468405215647, + "loss": 2.7035, + "theoretical_loss": 3.439841961449009, + "tokens_seen": 1935304704 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020893681043129389, + "loss": 2.6108, + "theoretical_loss": 3.439832186509425, + "tokens_seen": 1935370240 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002089267803410231, + "loss": 2.5496, + "theoretical_loss": 3.4398224119935135, + "tokens_seen": 1935435776 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020891675025075227, + "loss": 2.5423, + "theoretical_loss": 3.439812637901242, + "tokens_seen": 1935501312 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020890672016048146, + "loss": 2.5488, + "theoretical_loss": 3.4398028642325778, + "tokens_seen": 1935566848 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020889669007021064, + "loss": 2.5781, + "theoretical_loss": 3.4397930909874885, + "tokens_seen": 1935632384 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020888665997993982, + "loss": 2.7041, + "theoretical_loss": 3.439783318165941, + "tokens_seen": 1935697920 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020887662988966903, + "loss": 2.7458, + "theoretical_loss": 3.4397735457679026, + "tokens_seen": 1935763456 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002088665997993982, + "loss": 2.6126, + "theoretical_loss": 3.439763773793341, + "tokens_seen": 1935828992 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002088565697091274, + "loss": 2.6851, + "theoretical_loss": 3.439754002242223, + "tokens_seen": 1935894528 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020884653961885657, + "loss": 2.443, + "theoretical_loss": 3.439744231114516, + "tokens_seen": 1935960064 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020883650952858578, + "loss": 2.5207, + "theoretical_loss": 3.439734460410188, + "tokens_seen": 1936025600 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020882647943831496, + "loss": 2.5829, + "theoretical_loss": 3.4397246901292053, + "tokens_seen": 1936091136 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020881644934804414, + "loss": 2.4601, + "theoretical_loss": 3.4397149202715367, + "tokens_seen": 1936156672 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020880641925777332, + "loss": 2.7629, + "theoretical_loss": 3.4397051508371477, + "tokens_seen": 1936222208 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002087963891675025, + "loss": 2.6372, + "theoretical_loss": 3.4396953818260068, + "tokens_seen": 1936287744 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002087863590772317, + "loss": 2.5538, + "theoretical_loss": 3.4396856132380806, + "tokens_seen": 1936353280 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002087763289869609, + "loss": 2.7516, + "theoretical_loss": 3.4396758450733373, + "tokens_seen": 1936418816 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020876629889669007, + "loss": 2.5077, + "theoretical_loss": 3.4396660773317436, + "tokens_seen": 1936484352 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020875626880641925, + "loss": 2.5726, + "theoretical_loss": 3.4396563100132678, + "tokens_seen": 1936549888 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2149399, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6136698722839355, + "objective/train/theoretical_loss": 3.439651426512688, + "objective/train/tokens_used": 1957042656, + "theoretical_loss": 3.439651426512688, + "tokens_seen": 1936582656 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020874623871614846, + "loss": 2.7095, + "theoretical_loss": 3.4396465431178758, + "tokens_seen": 1936615424 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020873620862587764, + "loss": 2.8489, + "theoretical_loss": 3.4396367766455356, + "tokens_seen": 1936680960 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020872617853560682, + "loss": 2.7001, + "theoretical_loss": 3.4396270105962152, + "tokens_seen": 1936746496 + }, + { + "epoch": 6.05, + "learning_rate": 0.000208716148445336, + "loss": 2.9969, + "theoretical_loss": 3.439617244969881, + "tokens_seen": 1936812032 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020870611835506519, + "loss": 2.6902, + "theoretical_loss": 3.439607479766501, + "tokens_seen": 1936877568 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002086960882647944, + "loss": 2.6303, + "theoretical_loss": 3.4395977149860424, + "tokens_seen": 1936943104 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020868605817452358, + "loss": 2.5615, + "theoretical_loss": 3.4395879506284723, + "tokens_seen": 1937008640 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020867602808425276, + "loss": 2.5556, + "theoretical_loss": 3.4395781866937583, + "tokens_seen": 1937074176 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020866599799398194, + "loss": 2.7088, + "theoretical_loss": 3.439568423181868, + "tokens_seen": 1937139712 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020865596790371115, + "loss": 2.644, + "theoretical_loss": 3.439558660092769, + "tokens_seen": 1937205248 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020864593781344033, + "loss": 2.4284, + "theoretical_loss": 3.4395488974264277, + "tokens_seen": 1937270784 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002086359077231695, + "loss": 2.6305, + "theoretical_loss": 3.4395391351828124, + "tokens_seen": 1937336320 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002086258776328987, + "loss": 2.6649, + "theoretical_loss": 3.4395293733618897, + "tokens_seen": 1937401856 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002086158475426279, + "loss": 2.5912, + "theoretical_loss": 3.4395196119636275, + "tokens_seen": 1937467392 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020860581745235708, + "loss": 2.703, + "theoretical_loss": 3.4395098509879936, + "tokens_seen": 1937532928 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020859578736208626, + "loss": 2.6892, + "theoretical_loss": 3.4395000904349544, + "tokens_seen": 1937598464 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020858575727181544, + "loss": 2.5693, + "theoretical_loss": 3.439490330304478, + "tokens_seen": 1937664000 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020857572718154462, + "loss": 2.8091, + "theoretical_loss": 3.4394805705965323, + "tokens_seen": 1937729536 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020856569709127383, + "loss": 2.4543, + "theoretical_loss": 3.439470811311084, + "tokens_seen": 1937795072 + }, + { + "epoch": 6.05, + "learning_rate": 0.000208555667001003, + "loss": 2.6292, + "theoretical_loss": 3.4394610524481, + "tokens_seen": 1937860608 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002085456369107322, + "loss": 2.5442, + "theoretical_loss": 3.439451294007549, + "tokens_seen": 1937926144 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020853560682046137, + "loss": 2.6625, + "theoretical_loss": 3.4394415359893973, + "tokens_seen": 1937991680 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020852557673019058, + "loss": 2.5226, + "theoretical_loss": 3.4394317783936135, + "tokens_seen": 1938057216 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020851554663991976, + "loss": 2.5617, + "theoretical_loss": 3.439422021220164, + "tokens_seen": 1938122752 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020850551654964894, + "loss": 2.6748, + "theoretical_loss": 3.4394122644690164, + "tokens_seen": 1938188288 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2149953, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.608050584793091, + "objective/train/theoretical_loss": 3.4394073862517955, + "objective/train/tokens_used": 1958681056, + "theoretical_loss": 3.4394073862517955, + "tokens_seen": 1938221056 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020849548645937813, + "loss": 2.7671, + "theoretical_loss": 3.4394025081401383, + "tokens_seen": 1938253824 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002084854563691073, + "loss": 2.6228, + "theoretical_loss": 3.4393927522334975, + "tokens_seen": 1938319360 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020847542627883651, + "loss": 2.5335, + "theoretical_loss": 3.439382996749061, + "tokens_seen": 1938384896 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002084653961885657, + "loss": 2.5464, + "theoretical_loss": 3.4393732416867966, + "tokens_seen": 1938450432 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020845536609829488, + "loss": 2.7314, + "theoretical_loss": 3.439363487046671, + "tokens_seen": 1938515968 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020844533600802406, + "loss": 2.6554, + "theoretical_loss": 3.4393537328286525, + "tokens_seen": 1938581504 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020843530591775327, + "loss": 2.6656, + "theoretical_loss": 3.4393439790327087, + "tokens_seen": 1938647040 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020842527582748245, + "loss": 2.6052, + "theoretical_loss": 3.4393342256588064, + "tokens_seen": 1938712576 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020841524573721163, + "loss": 2.6923, + "theoretical_loss": 3.4393244727069137, + "tokens_seen": 1938778112 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002084052156469408, + "loss": 2.7543, + "theoretical_loss": 3.4393147201769967, + "tokens_seen": 1938843648 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020839518555667, + "loss": 2.9191, + "theoretical_loss": 3.439304968069025, + "tokens_seen": 1938909184 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002083851554663992, + "loss": 2.4942, + "theoretical_loss": 3.4392952163829644, + "tokens_seen": 1938974720 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020837512537612838, + "loss": 2.5992, + "theoretical_loss": 3.4392854651187825, + "tokens_seen": 1939040256 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020836509528585756, + "loss": 2.5379, + "theoretical_loss": 3.439275714276448, + "tokens_seen": 1939105792 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020835506519558674, + "loss": 2.6997, + "theoretical_loss": 3.4392659638559273, + "tokens_seen": 1939171328 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020834503510531595, + "loss": 2.6178, + "theoretical_loss": 3.4392562138571887, + "tokens_seen": 1939236864 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020833500501504513, + "loss": 2.4314, + "theoretical_loss": 3.439246464280199, + "tokens_seen": 1939302400 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020832497492477434, + "loss": 2.3728, + "theoretical_loss": 3.4392367151249257, + "tokens_seen": 1939367936 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020831494483450352, + "loss": 2.641, + "theoretical_loss": 3.4392269663913364, + "tokens_seen": 1939433472 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002083049147442327, + "loss": 2.4733, + "theoretical_loss": 3.439217218079399, + "tokens_seen": 1939499008 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002082948846539619, + "loss": 2.4342, + "theoretical_loss": 3.439207470189081, + "tokens_seen": 1939564544 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002082848545636911, + "loss": 2.5395, + "theoretical_loss": 3.43919772272035, + "tokens_seen": 1939630080 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020827482447342027, + "loss": 2.5956, + "theoretical_loss": 3.4391879756731725, + "tokens_seen": 1939695616 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020826479438314945, + "loss": 2.4245, + "theoretical_loss": 3.439178229047517, + "tokens_seen": 1939761152 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020825476429287866, + "loss": 2.7431, + "theoretical_loss": 3.4391684828433506, + "tokens_seen": 1939826688 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2151225, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8013925552368164, + "objective/train/theoretical_loss": 3.439163609899316, + "objective/train/tokens_used": 1960319456, + "theoretical_loss": 3.439163609899316, + "tokens_seen": 1939859456 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020824473420260784, + "loss": 2.6786, + "theoretical_loss": 3.439158737060642, + "tokens_seen": 1939892224 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020823470411233702, + "loss": 2.6853, + "theoretical_loss": 3.4391489916993567, + "tokens_seen": 1939957760 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002082246740220662, + "loss": 2.5087, + "theoretical_loss": 3.4391392467594635, + "tokens_seen": 1940023296 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002082146439317954, + "loss": 2.5461, + "theoretical_loss": 3.43912950224093, + "tokens_seen": 1940088832 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002082046138415246, + "loss": 2.6701, + "theoretical_loss": 3.439119758143723, + "tokens_seen": 1940154368 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020819458375125378, + "loss": 2.6988, + "theoretical_loss": 3.4391100144678113, + "tokens_seen": 1940219904 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020818455366098296, + "loss": 2.671, + "theoretical_loss": 3.439100271213161, + "tokens_seen": 1940285440 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020817452357071214, + "loss": 2.6777, + "theoretical_loss": 3.4390905283797406, + "tokens_seen": 1940350976 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020816449348044135, + "loss": 2.6386, + "theoretical_loss": 3.4390807859675174, + "tokens_seen": 1940416512 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020815446339017053, + "loss": 2.6126, + "theoretical_loss": 3.439071043976459, + "tokens_seen": 1940482048 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002081444332998997, + "loss": 2.5682, + "theoretical_loss": 3.4390613024065333, + "tokens_seen": 1940547584 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002081344032096289, + "loss": 2.7314, + "theoretical_loss": 3.439051561257707, + "tokens_seen": 1940613120 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002081243731193581, + "loss": 2.6016, + "theoretical_loss": 3.4390418205299484, + "tokens_seen": 1940678656 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020811434302908728, + "loss": 2.5412, + "theoretical_loss": 3.439032080223225, + "tokens_seen": 1940744192 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020810431293881646, + "loss": 2.4608, + "theoretical_loss": 3.4390223403375044, + "tokens_seen": 1940809728 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020809428284854564, + "loss": 2.6929, + "theoretical_loss": 3.4390126008727537, + "tokens_seen": 1940875264 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020808425275827482, + "loss": 2.6196, + "theoretical_loss": 3.439002861828941, + "tokens_seen": 1940940800 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020807422266800403, + "loss": 2.5233, + "theoretical_loss": 3.438993123206034, + "tokens_seen": 1941006336 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002080641925777332, + "loss": 2.6891, + "theoretical_loss": 3.4389833850039997, + "tokens_seen": 1941071872 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002080541624874624, + "loss": 2.512, + "theoretical_loss": 3.438973647222806, + "tokens_seen": 1941137408 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020804413239719157, + "loss": 2.4727, + "theoretical_loss": 3.4389639098624207, + "tokens_seen": 1941202944 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020803410230692078, + "loss": 2.8285, + "theoretical_loss": 3.438954172922811, + "tokens_seen": 1941268480 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020802407221664996, + "loss": 2.583, + "theoretical_loss": 3.438944436403945, + "tokens_seen": 1941334016 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020801404212637914, + "loss": 2.6735, + "theoretical_loss": 3.438934700305791, + "tokens_seen": 1941399552 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020800401203610833, + "loss": 2.6351, + "theoretical_loss": 3.438924964628314, + "tokens_seen": 1941465088 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2152441, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2327756881713867, + "objective/train/theoretical_loss": 3.438920096947321, + "objective/train/tokens_used": 1961957856, + "theoretical_loss": 3.438920096947321, + "tokens_seen": 1941497856 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002079939819458375, + "loss": 2.5486, + "theoretical_loss": 3.4389152293714846, + "tokens_seen": 1941530624 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020798395185556671, + "loss": 2.6661, + "theoretical_loss": 3.438905494535269, + "tokens_seen": 1941596160 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002079739217652959, + "loss": 2.5693, + "theoretical_loss": 3.4388957601196344, + "tokens_seen": 1941661696 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020796389167502508, + "loss": 2.7042, + "theoretical_loss": 3.4388860261245497, + "tokens_seen": 1941727232 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020795386158475426, + "loss": 2.6314, + "theoretical_loss": 3.438876292549981, + "tokens_seen": 1941792768 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020794383149448347, + "loss": 2.464, + "theoretical_loss": 3.4388665593958976, + "tokens_seen": 1941858304 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020793380140421265, + "loss": 2.6487, + "theoretical_loss": 3.438856826662266, + "tokens_seen": 1941923840 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020792377131394183, + "loss": 2.6111, + "theoretical_loss": 3.438847094349055, + "tokens_seen": 1941989376 + }, + { + "epoch": 6.05, + "learning_rate": 0.000207913741223671, + "loss": 2.6376, + "theoretical_loss": 3.4388373624562303, + "tokens_seen": 1942054912 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002079037111334002, + "loss": 2.7143, + "theoretical_loss": 3.438827630983761, + "tokens_seen": 1942120448 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002078936810431294, + "loss": 2.5209, + "theoretical_loss": 3.438817899931615, + "tokens_seen": 1942185984 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020788365095285858, + "loss": 2.4732, + "theoretical_loss": 3.438808169299759, + "tokens_seen": 1942251520 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020787362086258776, + "loss": 2.626, + "theoretical_loss": 3.4387984390881607, + "tokens_seen": 1942317056 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020786359077231694, + "loss": 2.8415, + "theoretical_loss": 3.438788709296789, + "tokens_seen": 1942382592 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020785356068204615, + "loss": 2.5915, + "theoretical_loss": 3.43877897992561, + "tokens_seen": 1942448128 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020784353059177533, + "loss": 2.5537, + "theoretical_loss": 3.438769250974593, + "tokens_seen": 1942513664 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002078335005015045, + "loss": 2.559, + "theoretical_loss": 3.4387595224437044, + "tokens_seen": 1942579200 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002078234704112337, + "loss": 2.5999, + "theoretical_loss": 3.438749794332912, + "tokens_seen": 1942644736 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020781344032096288, + "loss": 2.6117, + "theoretical_loss": 3.4387400666421843, + "tokens_seen": 1942710272 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020780341023069208, + "loss": 2.6295, + "theoretical_loss": 3.438730339371488, + "tokens_seen": 1942775808 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020779338014042126, + "loss": 2.7758, + "theoretical_loss": 3.4387206125207914, + "tokens_seen": 1942841344 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020778335005015045, + "loss": 2.4338, + "theoretical_loss": 3.438710886090062, + "tokens_seen": 1942906880 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020777331995987963, + "loss": 2.8252, + "theoretical_loss": 3.4387011600792676, + "tokens_seen": 1942972416 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020776328986960884, + "loss": 2.8088, + "theoretical_loss": 3.438691434488376, + "tokens_seen": 1943037952 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020775325977933802, + "loss": 2.4847, + "theoretical_loss": 3.438681709317355, + "tokens_seen": 1943103488 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2153295, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6094300746917725, + "objective/train/theoretical_loss": 3.4386768468892854, + "objective/train/tokens_used": 1963596256, + "theoretical_loss": 3.4386768468892854, + "tokens_seen": 1943136256 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002077432296890672, + "loss": 2.6439, + "theoretical_loss": 3.4386719845661715, + "tokens_seen": 1943169024 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020773319959879638, + "loss": 2.6473, + "theoretical_loss": 3.438662260234794, + "tokens_seen": 1943234560 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020772316950852556, + "loss": 2.6196, + "theoretical_loss": 3.4386525363231906, + "tokens_seen": 1943300096 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020771313941825477, + "loss": 2.6043, + "theoretical_loss": 3.4386428128313273, + "tokens_seen": 1943365632 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020770310932798395, + "loss": 2.4774, + "theoretical_loss": 3.4386330897591737, + "tokens_seen": 1943431168 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020769307923771313, + "loss": 2.6036, + "theoretical_loss": 3.438623367106697, + "tokens_seen": 1943496704 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002076830491474423, + "loss": 2.6388, + "theoretical_loss": 3.4386136448738647, + "tokens_seen": 1943562240 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020767301905717152, + "loss": 2.2837, + "theoretical_loss": 3.4386039230606444, + "tokens_seen": 1943627776 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002076629889669007, + "loss": 2.4957, + "theoretical_loss": 3.438594201667004, + "tokens_seen": 1943693312 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020765295887662988, + "loss": 2.6059, + "theoretical_loss": 3.4385844806929113, + "tokens_seen": 1943758848 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020764292878635906, + "loss": 2.6932, + "theoretical_loss": 3.438574760138334, + "tokens_seen": 1943824384 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020763289869608824, + "loss": 2.5915, + "theoretical_loss": 3.43856504000324, + "tokens_seen": 1943889920 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020762286860581745, + "loss": 2.3164, + "theoretical_loss": 3.438555320287597, + "tokens_seen": 1943955456 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020761283851554663, + "loss": 2.7734, + "theoretical_loss": 3.4385456009913726, + "tokens_seen": 1944020992 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020760280842527581, + "loss": 2.6588, + "theoretical_loss": 3.438535882114534, + "tokens_seen": 1944086528 + }, + { + "epoch": 6.05, + "learning_rate": 0.000207592778335005, + "loss": 2.5787, + "theoretical_loss": 3.4385261636570505, + "tokens_seen": 1944152064 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002075827482447342, + "loss": 2.4693, + "theoretical_loss": 3.438516445618889, + "tokens_seen": 1944217600 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002075727181544634, + "loss": 2.6652, + "theoretical_loss": 3.438506728000017, + "tokens_seen": 1944283136 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002075626880641926, + "loss": 2.5986, + "theoretical_loss": 3.4384970108004023, + "tokens_seen": 1944348672 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020755265797392177, + "loss": 2.6379, + "theoretical_loss": 3.438487294020013, + "tokens_seen": 1944414208 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020754262788365098, + "loss": 2.589, + "theoretical_loss": 3.438477577658817, + "tokens_seen": 1944479744 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020753259779338016, + "loss": 2.5867, + "theoretical_loss": 3.438467861716782, + "tokens_seen": 1944545280 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020752256770310934, + "loss": 2.6517, + "theoretical_loss": 3.4384581461938755, + "tokens_seen": 1944610816 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020751253761283853, + "loss": 2.5086, + "theoretical_loss": 3.438448431090065, + "tokens_seen": 1944676352 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002075025075225677, + "loss": 2.7802, + "theoretical_loss": 3.4384387164053196, + "tokens_seen": 1944741888 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2154635, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0703694820404053, + "objective/train/theoretical_loss": 3.438433859220086, + "objective/train/tokens_used": 1965234656, + "theoretical_loss": 3.438433859220086, + "tokens_seen": 1944774656 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020749247743229692, + "loss": 2.6535, + "theoretical_loss": 3.438429002139606, + "tokens_seen": 1944807424 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002074824473420261, + "loss": 2.4539, + "theoretical_loss": 3.4384192882928923, + "tokens_seen": 1944872960 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020747241725175528, + "loss": 2.6391, + "theoretical_loss": 3.4384095748651466, + "tokens_seen": 1944938496 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020746238716148446, + "loss": 2.5297, + "theoretical_loss": 3.438399861856336, + "tokens_seen": 1945004032 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020745235707121367, + "loss": 2.5103, + "theoretical_loss": 3.4383901492664286, + "tokens_seen": 1945069568 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020744232698094285, + "loss": 2.4228, + "theoretical_loss": 3.4383804370953928, + "tokens_seen": 1945135104 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020743229689067203, + "loss": 2.5587, + "theoretical_loss": 3.4383707253431957, + "tokens_seen": 1945200640 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002074222668004012, + "loss": 2.3653, + "theoretical_loss": 3.4383610140098053, + "tokens_seen": 1945266176 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002074122367101304, + "loss": 2.5317, + "theoretical_loss": 3.4383513030951898, + "tokens_seen": 1945331712 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002074022066198596, + "loss": 2.6211, + "theoretical_loss": 3.4383415925993166, + "tokens_seen": 1945397248 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020739217652958878, + "loss": 2.5849, + "theoretical_loss": 3.438331882522154, + "tokens_seen": 1945462784 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020738214643931796, + "loss": 2.6113, + "theoretical_loss": 3.438322172863669, + "tokens_seen": 1945528320 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020737211634904714, + "loss": 2.597, + "theoretical_loss": 3.4383124636238303, + "tokens_seen": 1945593856 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020736208625877635, + "loss": 2.6761, + "theoretical_loss": 3.4383027548026055, + "tokens_seen": 1945659392 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020735205616850553, + "loss": 2.6959, + "theoretical_loss": 3.438293046399962, + "tokens_seen": 1945724928 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002073420260782347, + "loss": 2.5735, + "theoretical_loss": 3.438283338415869, + "tokens_seen": 1945790464 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002073319959879639, + "loss": 2.3986, + "theoretical_loss": 3.4382736308502926, + "tokens_seen": 1945856000 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020732196589769308, + "loss": 2.7373, + "theoretical_loss": 3.438263923703201, + "tokens_seen": 1945921536 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020731193580742228, + "loss": 2.4799, + "theoretical_loss": 3.4382542169745633, + "tokens_seen": 1945987072 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020730190571715146, + "loss": 2.544, + "theoretical_loss": 3.4382445106643464, + "tokens_seen": 1946052608 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020729187562688065, + "loss": 2.6718, + "theoretical_loss": 3.4382348047725184, + "tokens_seen": 1946118144 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020728184553660983, + "loss": 2.6103, + "theoretical_loss": 3.4382250992990473, + "tokens_seen": 1946183680 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020727181544633904, + "loss": 2.6485, + "theoretical_loss": 3.438215394243901, + "tokens_seen": 1946249216 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020726178535606822, + "loss": 2.4347, + "theoretical_loss": 3.438205689607047, + "tokens_seen": 1946314752 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002072517552657974, + "loss": 2.5966, + "theoretical_loss": 3.438195985388453, + "tokens_seen": 1946380288 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2155285, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.744236707687378, + "objective/train/theoretical_loss": 3.438191133435994, + "objective/train/tokens_used": 1966873056, + "theoretical_loss": 3.438191133435994, + "tokens_seen": 1946413056 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020724172517552658, + "loss": 2.6821, + "theoretical_loss": 3.438186281588088, + "tokens_seen": 1946445824 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020723169508525576, + "loss": 2.2908, + "theoretical_loss": 3.4381765782059186, + "tokens_seen": 1946511360 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020722166499498497, + "loss": 2.6259, + "theoretical_loss": 3.4381668752419134, + "tokens_seen": 1946576896 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020721163490471415, + "loss": 2.352, + "theoretical_loss": 3.438157172696041, + "tokens_seen": 1946642432 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020720160481444333, + "loss": 2.7006, + "theoretical_loss": 3.4381474705682677, + "tokens_seen": 1946707968 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002071915747241725, + "loss": 2.7825, + "theoretical_loss": 3.438137768858562, + "tokens_seen": 1946773504 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020718154463390172, + "loss": 2.7262, + "theoretical_loss": 3.4381280675668924, + "tokens_seen": 1946839040 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002071715145436309, + "loss": 2.5308, + "theoretical_loss": 3.4381183666932262, + "tokens_seen": 1946904576 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020716148445336008, + "loss": 2.5069, + "theoretical_loss": 3.438108666237532, + "tokens_seen": 1946970112 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020715145436308926, + "loss": 2.6605, + "theoretical_loss": 3.438098966199777, + "tokens_seen": 1947035648 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020714142427281844, + "loss": 2.6005, + "theoretical_loss": 3.4380892665799294, + "tokens_seen": 1947101184 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020713139418254765, + "loss": 2.5234, + "theoretical_loss": 3.438079567377957, + "tokens_seen": 1947166720 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020712136409227683, + "loss": 2.8279, + "theoretical_loss": 3.4380698685938285, + "tokens_seen": 1947232256 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020711133400200601, + "loss": 2.6197, + "theoretical_loss": 3.4380601702275104, + "tokens_seen": 1947297792 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002071013039117352, + "loss": 2.6847, + "theoretical_loss": 3.438050472278972, + "tokens_seen": 1947363328 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002070912738214644, + "loss": 2.4211, + "theoretical_loss": 3.4380407747481803, + "tokens_seen": 1947428864 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020708124373119358, + "loss": 2.3362, + "theoretical_loss": 3.438031077635104, + "tokens_seen": 1947494400 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020707121364092277, + "loss": 2.4906, + "theoretical_loss": 3.4380213809397104, + "tokens_seen": 1947559936 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020706118355065195, + "loss": 2.7486, + "theoretical_loss": 3.4380116846619675, + "tokens_seen": 1947625472 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020705115346038116, + "loss": 2.5735, + "theoretical_loss": 3.4380019888018443, + "tokens_seen": 1947691008 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020704112337011034, + "loss": 2.6185, + "theoretical_loss": 3.4379922933593074, + "tokens_seen": 1947756544 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020703109327983952, + "loss": 2.7939, + "theoretical_loss": 3.4379825983343255, + "tokens_seen": 1947822080 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002070210631895687, + "loss": 2.6848, + "theoretical_loss": 3.437972903726866, + "tokens_seen": 1947887616 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020701103309929788, + "loss": 2.5538, + "theoretical_loss": 3.437963209536898, + "tokens_seen": 1947953152 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002070010030090271, + "loss": 2.6766, + "theoretical_loss": 3.4379535157643883, + "tokens_seen": 1948018688 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2155875, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.419450044631958, + "objective/train/theoretical_loss": 3.437948669034671, + "objective/train/tokens_used": 1968511456, + "theoretical_loss": 3.437948669034671, + "tokens_seen": 1948051456 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020699097291875627, + "loss": 2.6523, + "theoretical_loss": 3.4379438224093053, + "tokens_seen": 1948084224 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020698094282848545, + "loss": 2.4775, + "theoretical_loss": 3.4379341294716173, + "tokens_seen": 1948149760 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020697091273821463, + "loss": 2.6418, + "theoretical_loss": 3.437924436951292, + "tokens_seen": 1948215296 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020696088264794384, + "loss": 2.6362, + "theoretical_loss": 3.437914744848297, + "tokens_seen": 1948280832 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020695085255767302, + "loss": 2.4317, + "theoretical_loss": 3.4379050531626008, + "tokens_seen": 1948346368 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002069408224674022, + "loss": 2.6929, + "theoretical_loss": 3.4378953618941717, + "tokens_seen": 1948411904 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020693079237713138, + "loss": 2.6608, + "theoretical_loss": 3.4378856710429764, + "tokens_seen": 1948477440 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020692076228686056, + "loss": 2.6941, + "theoretical_loss": 3.4378759806089847, + "tokens_seen": 1948542976 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020691073219658977, + "loss": 2.5577, + "theoretical_loss": 3.4378662905921633, + "tokens_seen": 1948608512 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020690070210631895, + "loss": 2.7221, + "theoretical_loss": 3.4378566009924807, + "tokens_seen": 1948674048 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020689067201604813, + "loss": 2.7256, + "theoretical_loss": 3.4378469118099044, + "tokens_seen": 1948739584 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020688064192577732, + "loss": 2.5467, + "theoretical_loss": 3.4378372230444034, + "tokens_seen": 1948805120 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020687061183550652, + "loss": 2.3252, + "theoretical_loss": 3.437827534695945, + "tokens_seen": 1948870656 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002068605817452357, + "loss": 2.7514, + "theoretical_loss": 3.4378178467644975, + "tokens_seen": 1948936192 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020685055165496489, + "loss": 2.5882, + "theoretical_loss": 3.4378081592500287, + "tokens_seen": 1949001728 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020684052156469407, + "loss": 2.58, + "theoretical_loss": 3.4377984721525063, + "tokens_seen": 1949067264 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020683049147442328, + "loss": 2.3209, + "theoretical_loss": 3.4377887854718994, + "tokens_seen": 1949132800 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020682046138415248, + "loss": 2.6551, + "theoretical_loss": 3.437779099208175, + "tokens_seen": 1949198336 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020681043129388166, + "loss": 2.6961, + "theoretical_loss": 3.437769413361302, + "tokens_seen": 1949263872 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020680040120361085, + "loss": 2.7064, + "theoretical_loss": 3.4377597279312475, + "tokens_seen": 1949329408 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020679037111334003, + "loss": 2.7202, + "theoretical_loss": 3.4377500429179806, + "tokens_seen": 1949394944 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020678034102306924, + "loss": 2.522, + "theoretical_loss": 3.4377403583214683, + "tokens_seen": 1949460480 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020677031093279842, + "loss": 2.6884, + "theoretical_loss": 3.4377306741416795, + "tokens_seen": 1949526016 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002067602808425276, + "loss": 2.6305, + "theoretical_loss": 3.437720990378582, + "tokens_seen": 1949591552 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020675025075225678, + "loss": 2.8586, + "theoretical_loss": 3.4377113070321434, + "tokens_seen": 1949657088 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2157271, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.666491985321045, + "objective/train/theoretical_loss": 3.437706465515162, + "objective/train/tokens_used": 1970149856, + "theoretical_loss": 3.437706465515162, + "tokens_seen": 1949689856 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020674022066198596, + "loss": 2.7461, + "theoretical_loss": 3.4377016241023326, + "tokens_seen": 1949722624 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020673019057171517, + "loss": 2.7101, + "theoretical_loss": 3.437691941589117, + "tokens_seen": 1949788160 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020672016048144435, + "loss": 2.689, + "theoretical_loss": 3.437682259492465, + "tokens_seen": 1949853696 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020671013039117353, + "loss": 2.5007, + "theoretical_loss": 3.4376725778123447, + "tokens_seen": 1949919232 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002067001003009027, + "loss": 2.4359, + "theoretical_loss": 3.4376628965487237, + "tokens_seen": 1949984768 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020669007021063192, + "loss": 2.6345, + "theoretical_loss": 3.437653215701571, + "tokens_seen": 1950050304 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002066800401203611, + "loss": 2.5347, + "theoretical_loss": 3.437643535270854, + "tokens_seen": 1950115840 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020667001003009028, + "loss": 2.7469, + "theoretical_loss": 3.4376338552565406, + "tokens_seen": 1950181376 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020665997993981946, + "loss": 2.8149, + "theoretical_loss": 3.4376241756585992, + "tokens_seen": 1950246912 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020664994984954864, + "loss": 2.6251, + "theoretical_loss": 3.437614496476998, + "tokens_seen": 1950312448 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020663991975927785, + "loss": 2.519, + "theoretical_loss": 3.437604817711706, + "tokens_seen": 1950377984 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020662988966900703, + "loss": 2.7415, + "theoretical_loss": 3.4375951393626893, + "tokens_seen": 1950443520 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020661985957873621, + "loss": 2.4179, + "theoretical_loss": 3.4375854614299173, + "tokens_seen": 1950509056 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002066098294884654, + "loss": 2.7065, + "theoretical_loss": 3.437575783913358, + "tokens_seen": 1950574592 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002065997993981946, + "loss": 2.6448, + "theoretical_loss": 3.4375661068129792, + "tokens_seen": 1950640128 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020658976930792379, + "loss": 2.669, + "theoretical_loss": 3.4375564301287493, + "tokens_seen": 1950705664 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020657973921765297, + "loss": 2.6233, + "theoretical_loss": 3.4375467538606364, + "tokens_seen": 1950771200 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020656970912738215, + "loss": 2.53, + "theoretical_loss": 3.4375370780086083, + "tokens_seen": 1950836736 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020655967903711136, + "loss": 2.5888, + "theoretical_loss": 3.4375274025726332, + "tokens_seen": 1950902272 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020654964894684054, + "loss": 2.9186, + "theoretical_loss": 3.43751772755268, + "tokens_seen": 1950967808 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020653961885656972, + "loss": 2.455, + "theoretical_loss": 3.437508052948716, + "tokens_seen": 1951033344 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002065295887662989, + "loss": 2.632, + "theoretical_loss": 3.4374983787607096, + "tokens_seen": 1951098880 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020651955867602808, + "loss": 2.4564, + "theoretical_loss": 3.4374887049886294, + "tokens_seen": 1951164416 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002065095285857573, + "loss": 2.4838, + "theoretical_loss": 3.437479031632442, + "tokens_seen": 1951229952 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020649949849548647, + "loss": 2.669, + "theoretical_loss": 3.437469358692118, + "tokens_seen": 1951295488 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2158044, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.613546133041382, + "objective/train/theoretical_loss": 3.4374645223778937, + "objective/train/tokens_used": 1971788256, + "theoretical_loss": 3.4374645223778937, + "tokens_seen": 1951328256 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020648946840521565, + "loss": 2.7452, + "theoretical_loss": 3.437459686167623, + "tokens_seen": 1951361024 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020647943831494483, + "loss": 2.6177, + "theoretical_loss": 3.4374500140589275, + "tokens_seen": 1951426560 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020646940822467404, + "loss": 2.8259, + "theoretical_loss": 3.4374403423659974, + "tokens_seen": 1951492096 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020645937813440322, + "loss": 2.5426, + "theoretical_loss": 3.4374306710888027, + "tokens_seen": 1951557632 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002064493480441324, + "loss": 2.7361, + "theoretical_loss": 3.437421000227311, + "tokens_seen": 1951623168 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020643931795386158, + "loss": 2.5768, + "theoretical_loss": 3.43741132978149, + "tokens_seen": 1951688704 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020642928786359076, + "loss": 2.6777, + "theoretical_loss": 3.437401659751308, + "tokens_seen": 1951754240 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020641925777331997, + "loss": 2.4368, + "theoretical_loss": 3.437391990136734, + "tokens_seen": 1951819776 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020640922768304915, + "loss": 2.6671, + "theoretical_loss": 3.437382320937735, + "tokens_seen": 1951885312 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020639919759277833, + "loss": 2.5605, + "theoretical_loss": 3.43737265215428, + "tokens_seen": 1951950848 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020638916750250752, + "loss": 2.5767, + "theoretical_loss": 3.437362983786337, + "tokens_seen": 1952016384 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020637913741223672, + "loss": 2.6529, + "theoretical_loss": 3.437353315833874, + "tokens_seen": 1952081920 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002063691073219659, + "loss": 2.6068, + "theoretical_loss": 3.4373436482968596, + "tokens_seen": 1952147456 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020635907723169509, + "loss": 2.4964, + "theoretical_loss": 3.4373339811752617, + "tokens_seen": 1952212992 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020634904714142427, + "loss": 2.5973, + "theoretical_loss": 3.4373243144690484, + "tokens_seen": 1952278528 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020633901705115345, + "loss": 2.562, + "theoretical_loss": 3.437314648178188, + "tokens_seen": 1952344064 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020632898696088266, + "loss": 2.4183, + "theoretical_loss": 3.437304982302649, + "tokens_seen": 1952409600 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020631895687061184, + "loss": 2.4472, + "theoretical_loss": 3.437295316842399, + "tokens_seen": 1952475136 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020630892678034102, + "loss": 2.7607, + "theoretical_loss": 3.4372856517974073, + "tokens_seen": 1952540672 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002062988966900702, + "loss": 2.6647, + "theoretical_loss": 3.4372759871676406, + "tokens_seen": 1952606208 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002062888665997994, + "loss": 2.6988, + "theoretical_loss": 3.4372663229530684, + "tokens_seen": 1952671744 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002062788365095286, + "loss": 2.5508, + "theoretical_loss": 3.4372566591536584, + "tokens_seen": 1952737280 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020626880641925777, + "loss": 2.5075, + "theoretical_loss": 3.437246995769379, + "tokens_seen": 1952802816 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020625877632898695, + "loss": 2.7681, + "theoretical_loss": 3.4372373328001986, + "tokens_seen": 1952868352 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020624874623871613, + "loss": 2.3589, + "theoretical_loss": 3.4372276702460844, + "tokens_seen": 1952933888 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2159283, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9489166736602783, + "objective/train/theoretical_loss": 3.437222839124668, + "objective/train/tokens_used": 1973426656, + "theoretical_loss": 3.437222839124668, + "tokens_seen": 1952966656 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020623871614844534, + "loss": 2.6495, + "theoretical_loss": 3.4372180081070063, + "tokens_seen": 1952999424 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020622868605817452, + "loss": 2.7113, + "theoretical_loss": 3.4372083463829313, + "tokens_seen": 1953064960 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002062186559679037, + "loss": 2.7974, + "theoretical_loss": 3.4371986850738283, + "tokens_seen": 1953130496 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020620862587763288, + "loss": 2.6404, + "theoretical_loss": 3.4371890241796645, + "tokens_seen": 1953196032 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002061985957873621, + "loss": 2.4457, + "theoretical_loss": 3.4371793637004098, + "tokens_seen": 1953261568 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020618856569709127, + "loss": 2.829, + "theoretical_loss": 3.4371697036360316, + "tokens_seen": 1953327104 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020617853560682046, + "loss": 2.5762, + "theoretical_loss": 3.4371600439864975, + "tokens_seen": 1953392640 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020616850551654964, + "loss": 2.4515, + "theoretical_loss": 3.437150384751777, + "tokens_seen": 1953458176 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020615847542627882, + "loss": 2.458, + "theoretical_loss": 3.4371407259318376, + "tokens_seen": 1953523712 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020614844533600803, + "loss": 2.6094, + "theoretical_loss": 3.437131067526648, + "tokens_seen": 1953589248 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002061384152457372, + "loss": 2.5491, + "theoretical_loss": 3.437121409536176, + "tokens_seen": 1953654784 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002061283851554664, + "loss": 2.6774, + "theoretical_loss": 3.43711175196039, + "tokens_seen": 1953720320 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020611835506519557, + "loss": 2.5069, + "theoretical_loss": 3.437102094799259, + "tokens_seen": 1953785856 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020610832497492478, + "loss": 2.5381, + "theoretical_loss": 3.4370924380527503, + "tokens_seen": 1953851392 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020609829488465396, + "loss": 2.7842, + "theoretical_loss": 3.4370827817208327, + "tokens_seen": 1953916928 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020608826479438314, + "loss": 2.5689, + "theoretical_loss": 3.437073125803474, + "tokens_seen": 1953982464 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020607823470411235, + "loss": 2.4248, + "theoretical_loss": 3.4370634703006435, + "tokens_seen": 1954048000 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020606820461384156, + "loss": 2.637, + "theoretical_loss": 3.437053815212309, + "tokens_seen": 1954113536 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020605817452357074, + "loss": 2.6125, + "theoretical_loss": 3.437044160538438, + "tokens_seen": 1954179072 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020604814443329992, + "loss": 2.5594, + "theoretical_loss": 3.4370345062789998, + "tokens_seen": 1954244608 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002060381143430291, + "loss": 2.7086, + "theoretical_loss": 3.4370248524339626, + "tokens_seen": 1954310144 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020602808425275828, + "loss": 2.8609, + "theoretical_loss": 3.437015199003295, + "tokens_seen": 1954375680 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002060180541624875, + "loss": 2.5703, + "theoretical_loss": 3.437005545986964, + "tokens_seen": 1954441216 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020600802407221667, + "loss": 2.4817, + "theoretical_loss": 3.4369958933849394, + "tokens_seen": 1954506752 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020599799398194585, + "loss": 2.4929, + "theoretical_loss": 3.4369862411971885, + "tokens_seen": 1954572288 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2159776, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.758532762527466, + "objective/train/theoretical_loss": 3.436981415258656, + "objective/train/tokens_used": 1975065056, + "theoretical_loss": 3.436981415258656, + "tokens_seen": 1954605056 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020598796389167503, + "loss": 2.7008, + "theoretical_loss": 3.43697658942368, + "tokens_seen": 1954637824 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020597793380140424, + "loss": 2.7482, + "theoretical_loss": 3.436966938064383, + "tokens_seen": 1954703360 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020596790371113342, + "loss": 2.6106, + "theoretical_loss": 3.436957287119265, + "tokens_seen": 1954768896 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002059578736208626, + "loss": 2.7279, + "theoretical_loss": 3.436947636588294, + "tokens_seen": 1954834432 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020594784353059178, + "loss": 2.6636, + "theoretical_loss": 3.436937986471439, + "tokens_seen": 1954899968 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020593781344032096, + "loss": 2.7473, + "theoretical_loss": 3.436928336768668, + "tokens_seen": 1954965504 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020592778335005017, + "loss": 2.6069, + "theoretical_loss": 3.4369186874799498, + "tokens_seen": 1955031040 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020591775325977935, + "loss": 2.5877, + "theoretical_loss": 3.436909038605252, + "tokens_seen": 1955096576 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020590772316950853, + "loss": 2.8137, + "theoretical_loss": 3.4368993901445437, + "tokens_seen": 1955162112 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020589769307923772, + "loss": 2.5479, + "theoretical_loss": 3.436889742097793, + "tokens_seen": 1955227648 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020588766298896692, + "loss": 2.5797, + "theoretical_loss": 3.4368800944649682, + "tokens_seen": 1955293184 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002058776328986961, + "loss": 2.803, + "theoretical_loss": 3.4368704472460374, + "tokens_seen": 1955358720 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002058676028084253, + "loss": 2.6155, + "theoretical_loss": 3.4368608004409698, + "tokens_seen": 1955424256 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020585757271815447, + "loss": 2.4217, + "theoretical_loss": 3.4368511540497333, + "tokens_seen": 1955489792 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020584754262788365, + "loss": 2.6109, + "theoretical_loss": 3.436841508072296, + "tokens_seen": 1955555328 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020583751253761286, + "loss": 2.4857, + "theoretical_loss": 3.4368318625086265, + "tokens_seen": 1955620864 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020582748244734204, + "loss": 2.5308, + "theoretical_loss": 3.4368222173586935, + "tokens_seen": 1955686400 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020581745235707122, + "loss": 2.7297, + "theoretical_loss": 3.4368125726224648, + "tokens_seen": 1955751936 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002058074222668004, + "loss": 2.4862, + "theoretical_loss": 3.436802928299909, + "tokens_seen": 1955817472 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002057973921765296, + "loss": 2.5315, + "theoretical_loss": 3.436793284390995, + "tokens_seen": 1955883008 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002057873620862588, + "loss": 2.6282, + "theoretical_loss": 3.4367836408956904, + "tokens_seen": 1955948544 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020577733199598797, + "loss": 2.6559, + "theoretical_loss": 3.4367739978139644, + "tokens_seen": 1956014080 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020576730190571715, + "loss": 2.7726, + "theoretical_loss": 3.436764355145785, + "tokens_seen": 1956079616 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020575727181544633, + "loss": 2.5539, + "theoretical_loss": 3.43675471289112, + "tokens_seen": 1956145152 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020574724172517554, + "loss": 2.789, + "theoretical_loss": 3.436745071049939, + "tokens_seen": 1956210688 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2161124, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.708544969558716, + "objective/train/theoretical_loss": 3.436740250284395, + "objective/train/tokens_used": 1976703456, + "theoretical_loss": 3.436740250284395, + "tokens_seen": 1956243456 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020573721163490472, + "loss": 2.5768, + "theoretical_loss": 3.4367354296222095, + "tokens_seen": 1956276224 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002057271815446339, + "loss": 2.4206, + "theoretical_loss": 3.436725788607901, + "tokens_seen": 1956341760 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020571715145436308, + "loss": 2.5931, + "theoretical_loss": 3.43671614800698, + "tokens_seen": 1956407296 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002057071213640923, + "loss": 2.6696, + "theoretical_loss": 3.436706507819417, + "tokens_seen": 1956472832 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020569709127382147, + "loss": 2.5615, + "theoretical_loss": 3.4366968680451793, + "tokens_seen": 1956538368 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020568706118355066, + "loss": 2.8322, + "theoretical_loss": 3.4366872286842356, + "tokens_seen": 1956603904 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020567703109327984, + "loss": 2.5873, + "theoretical_loss": 3.4366775897365542, + "tokens_seen": 1956669440 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020566700100300902, + "loss": 2.5293, + "theoretical_loss": 3.4366679512021037, + "tokens_seen": 1956734976 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020565697091273823, + "loss": 2.7703, + "theoretical_loss": 3.436658313080853, + "tokens_seen": 1956800512 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002056469408224674, + "loss": 2.7097, + "theoretical_loss": 3.436648675372769, + "tokens_seen": 1956866048 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002056369107321966, + "loss": 2.5322, + "theoretical_loss": 3.4366390380778222, + "tokens_seen": 1956931584 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020562688064192577, + "loss": 2.6692, + "theoretical_loss": 3.4366294011959795, + "tokens_seen": 1956997120 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020561685055165498, + "loss": 2.8556, + "theoretical_loss": 3.4366197647272103, + "tokens_seen": 1957062656 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020560682046138416, + "loss": 2.4856, + "theoretical_loss": 3.4366101286714823, + "tokens_seen": 1957128192 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020559679037111334, + "loss": 2.5765, + "theoretical_loss": 3.436600493028765, + "tokens_seen": 1957193728 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020558676028084252, + "loss": 2.6878, + "theoretical_loss": 3.4365908577990254, + "tokens_seen": 1957259264 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002055767301905717, + "loss": 2.6062, + "theoretical_loss": 3.4365812229822335, + "tokens_seen": 1957324800 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002055667001003009, + "loss": 2.5677, + "theoretical_loss": 3.4365715885783565, + "tokens_seen": 1957390336 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002055566700100301, + "loss": 2.6259, + "theoretical_loss": 3.436561954587364, + "tokens_seen": 1957455872 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020554663991975927, + "loss": 2.7595, + "theoretical_loss": 3.4365523210092235, + "tokens_seen": 1957521408 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020553660982948845, + "loss": 2.7521, + "theoretical_loss": 3.4365426878439043, + "tokens_seen": 1957586944 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020552657973921766, + "loss": 2.6141, + "theoretical_loss": 3.436533055091374, + "tokens_seen": 1957652480 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020551654964894684, + "loss": 2.6476, + "theoretical_loss": 3.436523422751602, + "tokens_seen": 1957718016 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020550651955867602, + "loss": 2.5832, + "theoretical_loss": 3.436513790824556, + "tokens_seen": 1957783552 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002054964894684052, + "loss": 2.6924, + "theoretical_loss": 3.4365041593102053, + "tokens_seen": 1957849088 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2162517, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4593896865844727, + "objective/train/theoretical_loss": 3.436499343707781, + "objective/train/tokens_used": 1978341856, + "theoretical_loss": 3.436499343707781, + "tokens_seen": 1957881856 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020548645937813439, + "loss": 2.6174, + "theoretical_loss": 3.4364945282085184, + "tokens_seen": 1957914624 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002054764292878636, + "loss": 2.4569, + "theoretical_loss": 3.436484897519463, + "tokens_seen": 1957980160 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020546639919759278, + "loss": 2.462, + "theoretical_loss": 3.436475267243008, + "tokens_seen": 1958045696 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020545636910732196, + "loss": 2.6929, + "theoretical_loss": 3.4364656373791216, + "tokens_seen": 1958111232 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020544633901705114, + "loss": 2.7273, + "theoretical_loss": 3.436456007927773, + "tokens_seen": 1958176768 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020543630892678035, + "loss": 2.6582, + "theoretical_loss": 3.43644637888893, + "tokens_seen": 1958242304 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020542627883650953, + "loss": 2.527, + "theoretical_loss": 3.436436750262562, + "tokens_seen": 1958307840 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002054162487462387, + "loss": 2.6975, + "theoretical_loss": 3.436427122048637, + "tokens_seen": 1958373376 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002054062186559679, + "loss": 2.7237, + "theoretical_loss": 3.4364174942471233, + "tokens_seen": 1958438912 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002053961885656971, + "loss": 2.6742, + "theoretical_loss": 3.43640786685799, + "tokens_seen": 1958504448 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020538615847542628, + "loss": 2.5567, + "theoretical_loss": 3.436398239881205, + "tokens_seen": 1958569984 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020537612838515546, + "loss": 2.7129, + "theoretical_loss": 3.436388613316737, + "tokens_seen": 1958635520 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020536609829488464, + "loss": 2.6903, + "theoretical_loss": 3.4363789871645554, + "tokens_seen": 1958701056 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020535606820461382, + "loss": 2.6773, + "theoretical_loss": 3.436369361424628, + "tokens_seen": 1958766592 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020534603811434303, + "loss": 2.7323, + "theoretical_loss": 3.436359736096923, + "tokens_seen": 1958832128 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002053360080240722, + "loss": 2.5952, + "theoretical_loss": 3.4363501111814094, + "tokens_seen": 1958897664 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020532597793380142, + "loss": 2.6323, + "theoretical_loss": 3.436340486678056, + "tokens_seen": 1958963200 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002053159478435306, + "loss": 2.673, + "theoretical_loss": 3.4363308625868307, + "tokens_seen": 1959028736 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002053059177532598, + "loss": 2.5249, + "theoretical_loss": 3.436321238907703, + "tokens_seen": 1959094272 + }, + { + "epoch": 6.05, + "learning_rate": 0.000205295887662989, + "loss": 2.7786, + "theoretical_loss": 3.4363116156406406, + "tokens_seen": 1959159808 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020528585757271817, + "loss": 2.4718, + "theoretical_loss": 3.4363019927856127, + "tokens_seen": 1959225344 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020527582748244735, + "loss": 2.7009, + "theoretical_loss": 3.436292370342587, + "tokens_seen": 1959290880 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020526579739217653, + "loss": 2.5174, + "theoretical_loss": 3.436282748311533, + "tokens_seen": 1959356416 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020525576730190574, + "loss": 2.6219, + "theoretical_loss": 3.436273126692419, + "tokens_seen": 1959421952 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020524573721163492, + "loss": 2.8089, + "theoretical_loss": 3.436263505485214, + "tokens_seen": 1959487488 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2163089, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.750710964202881, + "objective/train/theoretical_loss": 3.4362586950360674, + "objective/train/tokens_used": 1979980256, + "theoretical_loss": 3.4362586950360674, + "tokens_seen": 1959520256 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002052357071213641, + "loss": 2.7487, + "theoretical_loss": 3.4362538846898856, + "tokens_seen": 1959553024 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020522567703109328, + "loss": 2.7272, + "theoretical_loss": 3.436244264306403, + "tokens_seen": 1959618560 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002052156469408225, + "loss": 2.4753, + "theoretical_loss": 3.4362346443347347, + "tokens_seen": 1959684096 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020520561685055167, + "loss": 2.4505, + "theoretical_loss": 3.436225024774849, + "tokens_seen": 1959749632 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020519558676028086, + "loss": 2.5698, + "theoretical_loss": 3.4362154056267156, + "tokens_seen": 1959815168 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020518555667001004, + "loss": 2.6309, + "theoretical_loss": 3.436205786890302, + "tokens_seen": 1959880704 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020517552657973922, + "loss": 2.593, + "theoretical_loss": 3.436196168565577, + "tokens_seen": 1959946240 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020516549648946843, + "loss": 2.67, + "theoretical_loss": 3.4361865506525096, + "tokens_seen": 1960011776 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002051554663991976, + "loss": 2.5126, + "theoretical_loss": 3.436176933151068, + "tokens_seen": 1960077312 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002051454363089268, + "loss": 2.4799, + "theoretical_loss": 3.436167316061221, + "tokens_seen": 1960142848 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020513540621865597, + "loss": 2.7867, + "theoretical_loss": 3.436157699382937, + "tokens_seen": 1960208384 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020512537612838518, + "loss": 2.5209, + "theoretical_loss": 3.4361480831161852, + "tokens_seen": 1960273920 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020511534603811436, + "loss": 2.6981, + "theoretical_loss": 3.4361384672609336, + "tokens_seen": 1960339456 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020510531594784354, + "loss": 2.4029, + "theoretical_loss": 3.436128851817151, + "tokens_seen": 1960404992 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020509528585757272, + "loss": 2.6001, + "theoretical_loss": 3.4361192367848066, + "tokens_seen": 1960470528 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002050852557673019, + "loss": 2.6947, + "theoretical_loss": 3.4361096221638685, + "tokens_seen": 1960536064 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002050752256770311, + "loss": 2.5609, + "theoretical_loss": 3.4361000079543054, + "tokens_seen": 1960601600 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002050651955867603, + "loss": 2.5888, + "theoretical_loss": 3.4360903941560856, + "tokens_seen": 1960667136 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020505516549648947, + "loss": 2.6389, + "theoretical_loss": 3.4360807807691787, + "tokens_seen": 1960732672 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020504513540621865, + "loss": 2.6308, + "theoretical_loss": 3.4360711677935525, + "tokens_seen": 1960798208 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020503510531594786, + "loss": 2.5531, + "theoretical_loss": 3.436061555229176, + "tokens_seen": 1960863744 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020502507522567704, + "loss": 2.7437, + "theoretical_loss": 3.436051943076018, + "tokens_seen": 1960929280 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020501504513540622, + "loss": 2.4892, + "theoretical_loss": 3.436042331334047, + "tokens_seen": 1960994816 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002050050150451354, + "loss": 2.6701, + "theoretical_loss": 3.4360327200032312, + "tokens_seen": 1961060352 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020499498495486459, + "loss": 2.5885, + "theoretical_loss": 3.43602310908354, + "tokens_seen": 1961125888 + }, + { + "epoch": 6.05, + "objective/train/docs_used": 2163737, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.642073154449463, + "objective/train/theoretical_loss": 3.4360183037778564, + "objective/train/tokens_used": 1981618656, + "theoretical_loss": 3.4360183037778564, + "tokens_seen": 1961158656 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002049849548645938, + "loss": 2.5992, + "theoretical_loss": 3.436013498574942, + "tokens_seen": 1961191424 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020497492477432298, + "loss": 2.788, + "theoretical_loss": 3.436003888477406, + "tokens_seen": 1961256960 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020496489468405216, + "loss": 2.5145, + "theoretical_loss": 3.4359942787908997, + "tokens_seen": 1961322496 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020495486459378134, + "loss": 2.7275, + "theoretical_loss": 3.4359846695153924, + "tokens_seen": 1961388032 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020494483450351055, + "loss": 2.6115, + "theoretical_loss": 3.4359750606508532, + "tokens_seen": 1961453568 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020493480441323973, + "loss": 2.6799, + "theoretical_loss": 3.4359654521972507, + "tokens_seen": 1961519104 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002049247743229689, + "loss": 2.5764, + "theoretical_loss": 3.4359558441545524, + "tokens_seen": 1961584640 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002049147442326981, + "loss": 2.623, + "theoretical_loss": 3.435946236522729, + "tokens_seen": 1961650176 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002049047141424273, + "loss": 2.8619, + "theoretical_loss": 3.4359366293017475, + "tokens_seen": 1961715712 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020489468405215648, + "loss": 2.736, + "theoretical_loss": 3.435927022491577, + "tokens_seen": 1961781248 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020488465396188566, + "loss": 2.7585, + "theoretical_loss": 3.4359174160921873, + "tokens_seen": 1961846784 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020487462387161484, + "loss": 2.5299, + "theoretical_loss": 3.4359078101035454, + "tokens_seen": 1961912320 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020486459378134402, + "loss": 2.2915, + "theoretical_loss": 3.4358982045256217, + "tokens_seen": 1961977856 + }, + { + "epoch": 6.05, + "learning_rate": 0.00020485456369107323, + "loss": 2.4366, + "theoretical_loss": 3.4358885993583836, + "tokens_seen": 1962043392 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002048445336008024, + "loss": 2.6753, + "theoretical_loss": 3.4358789946018007, + "tokens_seen": 1962108928 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002048345035105316, + "loss": 2.6225, + "theoretical_loss": 3.435869390255841, + "tokens_seen": 1962174464 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020482447342026077, + "loss": 2.7756, + "theoretical_loss": 3.4358597863204734, + "tokens_seen": 1962240000 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020481444332998998, + "loss": 2.8369, + "theoretical_loss": 3.4358501827956673, + "tokens_seen": 1962305536 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020480441323971916, + "loss": 2.5244, + "theoretical_loss": 3.4358405796813907, + "tokens_seen": 1962371072 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020479438314944834, + "loss": 2.7138, + "theoretical_loss": 3.4358309769776127, + "tokens_seen": 1962436608 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020478435305917753, + "loss": 2.5461, + "theoretical_loss": 3.435821374684302, + "tokens_seen": 1962502144 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002047743229689067, + "loss": 2.5908, + "theoretical_loss": 3.4358117728014275, + "tokens_seen": 1962567680 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020476429287863591, + "loss": 2.6344, + "theoretical_loss": 3.4358021713289575, + "tokens_seen": 1962633216 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002047542627883651, + "loss": 2.7724, + "theoretical_loss": 3.4357925702668606, + "tokens_seen": 1962698752 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020474423269809428, + "loss": 2.5862, + "theoretical_loss": 3.435782969615107, + "tokens_seen": 1962764288 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2165048, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.711935520172119, + "objective/train/theoretical_loss": 3.435778169443098, + "objective/train/tokens_used": 1983257056, + "theoretical_loss": 3.435778169443098, + "tokens_seen": 1962797056 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020473420260782346, + "loss": 2.5635, + "theoretical_loss": 3.4357733693736634, + "tokens_seen": 1962829824 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020472417251755267, + "loss": 2.4694, + "theoretical_loss": 3.4357637695425, + "tokens_seen": 1962895360 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020471414242728185, + "loss": 2.8584, + "theoretical_loss": 3.4357541701215855, + "tokens_seen": 1962960896 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020470411233701103, + "loss": 2.7121, + "theoretical_loss": 3.435744571110888, + "tokens_seen": 1963026432 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002046940822467402, + "loss": 2.6923, + "theoretical_loss": 3.4357349725103763, + "tokens_seen": 1963091968 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002046840521564694, + "loss": 2.4823, + "theoretical_loss": 3.43572537432002, + "tokens_seen": 1963157504 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002046740220661986, + "loss": 2.7639, + "theoretical_loss": 3.435715776539787, + "tokens_seen": 1963223040 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020466399197592778, + "loss": 2.486, + "theoretical_loss": 3.435706179169647, + "tokens_seen": 1963288576 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020465396188565696, + "loss": 2.6692, + "theoretical_loss": 3.4356965822095678, + "tokens_seen": 1963354112 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020464393179538614, + "loss": 2.5104, + "theoretical_loss": 3.4356869856595185, + "tokens_seen": 1963419648 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020463390170511535, + "loss": 2.412, + "theoretical_loss": 3.4356773895194683, + "tokens_seen": 1963485184 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020462387161484453, + "loss": 2.6127, + "theoretical_loss": 3.435667793789386, + "tokens_seen": 1963550720 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002046138415245737, + "loss": 2.7927, + "theoretical_loss": 3.4356581984692394, + "tokens_seen": 1963616256 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002046038114343029, + "loss": 2.3663, + "theoretical_loss": 3.435648603558999, + "tokens_seen": 1963681792 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020459378134403207, + "loss": 2.4142, + "theoretical_loss": 3.4356390090586317, + "tokens_seen": 1963747328 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020458375125376128, + "loss": 2.6527, + "theoretical_loss": 3.4356294149681075, + "tokens_seen": 1963812864 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002045737211634905, + "loss": 2.5708, + "theoretical_loss": 3.4356198212873954, + "tokens_seen": 1963878400 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020456369107321967, + "loss": 2.6155, + "theoretical_loss": 3.4356102280164635, + "tokens_seen": 1963943936 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020455366098294885, + "loss": 2.7131, + "theoretical_loss": 3.4356006351552812, + "tokens_seen": 1964009472 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020454363089267806, + "loss": 2.407, + "theoretical_loss": 3.435591042703817, + "tokens_seen": 1964075008 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020453360080240724, + "loss": 2.612, + "theoretical_loss": 3.4355814506620392, + "tokens_seen": 1964140544 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020452357071213642, + "loss": 2.8146, + "theoretical_loss": 3.435571859029918, + "tokens_seen": 1964206080 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002045135406218656, + "loss": 2.833, + "theoretical_loss": 3.4355622678074207, + "tokens_seen": 1964271616 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020450351053159479, + "loss": 2.7336, + "theoretical_loss": 3.4355526769945177, + "tokens_seen": 1964337152 + }, + { + "epoch": 6.06, + "learning_rate": 0.000204493480441324, + "loss": 2.7286, + "theoretical_loss": 3.4355430865911765, + "tokens_seen": 1964402688 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2165537, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6175053119659424, + "objective/train/theoretical_loss": 3.4355382915430823, + "objective/train/tokens_used": 1984895456, + "theoretical_loss": 3.4355382915430823, + "tokens_seen": 1964435456 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020448345035105318, + "loss": 2.7319, + "theoretical_loss": 3.4355334965973663, + "tokens_seen": 1964468224 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020447342026078236, + "loss": 2.5858, + "theoretical_loss": 3.4355239070130565, + "tokens_seen": 1964533760 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020446339017051154, + "loss": 2.5372, + "theoretical_loss": 3.4355143178382157, + "tokens_seen": 1964599296 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020445336008024075, + "loss": 2.6329, + "theoretical_loss": 3.4355047290728127, + "tokens_seen": 1964664832 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020444332998996993, + "loss": 2.7145, + "theoretical_loss": 3.4354951407168164, + "tokens_seen": 1964730368 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002044332998996991, + "loss": 2.6694, + "theoretical_loss": 3.435485552770195, + "tokens_seen": 1964795904 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002044232698094283, + "loss": 2.6818, + "theoretical_loss": 3.4354759652329183, + "tokens_seen": 1964861440 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002044132397191575, + "loss": 2.6614, + "theoretical_loss": 3.4354663781049544, + "tokens_seen": 1964926976 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020440320962888668, + "loss": 2.5152, + "theoretical_loss": 3.4354567913862732, + "tokens_seen": 1964992512 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020439317953861586, + "loss": 2.3586, + "theoretical_loss": 3.4354472050768425, + "tokens_seen": 1965058048 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020438314944834504, + "loss": 2.5085, + "theoretical_loss": 3.435437619176632, + "tokens_seen": 1965123584 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020437311935807422, + "loss": 2.6387, + "theoretical_loss": 3.4354280336856102, + "tokens_seen": 1965189120 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020436308926780343, + "loss": 2.711, + "theoretical_loss": 3.435418448603746, + "tokens_seen": 1965254656 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002043530591775326, + "loss": 2.5759, + "theoretical_loss": 3.4354088639310083, + "tokens_seen": 1965320192 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002043430290872618, + "loss": 2.6858, + "theoretical_loss": 3.435399279667366, + "tokens_seen": 1965385728 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020433299899699097, + "loss": 2.4715, + "theoretical_loss": 3.4353896958127876, + "tokens_seen": 1965451264 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020432296890672018, + "loss": 2.44, + "theoretical_loss": 3.435380112367243, + "tokens_seen": 1965516800 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020431293881644936, + "loss": 2.788, + "theoretical_loss": 3.4353705293307, + "tokens_seen": 1965582336 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020430290872617854, + "loss": 2.5231, + "theoretical_loss": 3.4353609467031285, + "tokens_seen": 1965647872 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020429287863590773, + "loss": 2.7212, + "theoretical_loss": 3.4353513644844966, + "tokens_seen": 1965713408 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002042828485456369, + "loss": 2.5523, + "theoretical_loss": 3.435341782674774, + "tokens_seen": 1965778944 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020427281845536611, + "loss": 2.7329, + "theoretical_loss": 3.435332201273929, + "tokens_seen": 1965844480 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002042627883650953, + "loss": 2.5107, + "theoretical_loss": 3.43532262028193, + "tokens_seen": 1965910016 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020425275827482448, + "loss": 2.5388, + "theoretical_loss": 3.4353130396987472, + "tokens_seen": 1965975552 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020424272818455366, + "loss": 2.6969, + "theoretical_loss": 3.435303459524349, + "tokens_seen": 1966041088 + }, + { + "debugging/Self-BLEU-5": 0.41924020765966247, + "debugging/distinct-1-grams": 0.7146877453850685, + "debugging/distinct-2-grams": 0.8701133085906988, + "debugging/entropy-1-grams": 5.921560380979235, + "debugging/entropy-2-grams": 6.750196970144997, + "debugging/length": 477.2, + "debugging/num_segments": 15, + "debugging/score": 0.004520536881345261, + "debugging/score_std": 0.0039004566714937747, + "epoch": 6.06, + "objective/train/docs_used": 2167173, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6649415493011475, + "objective/train/theoretical_loss": 3.435298669590434, + "objective/train/tokens_used": 1986533856, + "theoretical_loss": 3.435298669590434, + "tokens_seen": 1966073856 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020423269809428287, + "loss": 2.7566, + "theoretical_loss": 3.4352938797587043, + "tokens_seen": 1966106624 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020422266800401205, + "loss": 2.7225, + "theoretical_loss": 3.435284300401782, + "tokens_seen": 1966172160 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020421263791374123, + "loss": 2.5941, + "theoretical_loss": 3.435274721453551, + "tokens_seen": 1966237696 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002042026078234704, + "loss": 2.6689, + "theoretical_loss": 3.43526514291398, + "tokens_seen": 1966303232 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002041925777331996, + "loss": 2.6032, + "theoretical_loss": 3.4352555647830387, + "tokens_seen": 1966368768 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002041825476429288, + "loss": 2.5297, + "theoretical_loss": 3.435245987060695, + "tokens_seen": 1966434304 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020417251755265798, + "loss": 2.4376, + "theoretical_loss": 3.435236409746919, + "tokens_seen": 1966499840 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020416248746238716, + "loss": 2.6851, + "theoretical_loss": 3.435226832841679, + "tokens_seen": 1966565376 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020415245737211634, + "loss": 2.649, + "theoretical_loss": 3.4352172563449437, + "tokens_seen": 1966630912 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020414242728184555, + "loss": 2.4675, + "theoretical_loss": 3.435207680256683, + "tokens_seen": 1966696448 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020413239719157473, + "loss": 2.5518, + "theoretical_loss": 3.4351981045768647, + "tokens_seen": 1966761984 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002041223671013039, + "loss": 2.6369, + "theoretical_loss": 3.4351885293054587, + "tokens_seen": 1966827520 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002041123370110331, + "loss": 2.7837, + "theoretical_loss": 3.435178954442433, + "tokens_seen": 1966893056 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020410230692076228, + "loss": 2.6163, + "theoretical_loss": 3.4351693799877583, + "tokens_seen": 1966958592 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020409227683049148, + "loss": 2.7112, + "theoretical_loss": 3.435159805941402, + "tokens_seen": 1967024128 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020408224674022066, + "loss": 2.6542, + "theoretical_loss": 3.4351502323033336, + "tokens_seen": 1967089664 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020407221664994985, + "loss": 2.6289, + "theoretical_loss": 3.4351406590735216, + "tokens_seen": 1967155200 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020406218655967903, + "loss": 2.7545, + "theoretical_loss": 3.4351310862519364, + "tokens_seen": 1967220736 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020405215646940823, + "loss": 2.5073, + "theoretical_loss": 3.4351215138385456, + "tokens_seen": 1967286272 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020404212637913742, + "loss": 2.4868, + "theoretical_loss": 3.435111941833318, + "tokens_seen": 1967351808 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002040320962888666, + "loss": 2.7935, + "theoretical_loss": 3.4351023702362236, + "tokens_seen": 1967417344 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020402206619859578, + "loss": 2.6522, + "theoretical_loss": 3.4350927990472315, + "tokens_seen": 1967482880 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020401203610832496, + "loss": 2.6585, + "theoretical_loss": 3.43508322826631, + "tokens_seen": 1967548416 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020400200601805417, + "loss": 2.5886, + "theoretical_loss": 3.4350736578934282, + "tokens_seen": 1967613952 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020399197592778335, + "loss": 2.7386, + "theoretical_loss": 3.4350640879285557, + "tokens_seen": 1967679488 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2170339, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.212815046310425, + "objective/train/theoretical_loss": 3.435059303099113, + "objective/train/tokens_used": 1988172256, + "theoretical_loss": 3.435059303099113, + "tokens_seen": 1967712256 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020398194583751253, + "loss": 2.6892, + "theoretical_loss": 3.4350545183716603, + "tokens_seen": 1967745024 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002039719157472417, + "loss": 2.4825, + "theoretical_loss": 3.4350449492227124, + "tokens_seen": 1967810560 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020396188565697092, + "loss": 2.5109, + "theoretical_loss": 3.4350353804816804, + "tokens_seen": 1967876096 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002039518555667001, + "loss": 2.6833, + "theoretical_loss": 3.4350258121485333, + "tokens_seen": 1967941632 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020394182547642928, + "loss": 2.5155, + "theoretical_loss": 3.43501624422324, + "tokens_seen": 1968007168 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020393179538615846, + "loss": 2.8004, + "theoretical_loss": 3.43500667670577, + "tokens_seen": 1968072704 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020392176529588764, + "loss": 2.6403, + "theoretical_loss": 3.4349971095960923, + "tokens_seen": 1968138240 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020391173520561685, + "loss": 2.5064, + "theoretical_loss": 3.4349875428941754, + "tokens_seen": 1968203776 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020390170511534603, + "loss": 2.7226, + "theoretical_loss": 3.4349779765999884, + "tokens_seen": 1968269312 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020389167502507521, + "loss": 2.6393, + "theoretical_loss": 3.434968410713501, + "tokens_seen": 1968334848 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002038816449348044, + "loss": 2.637, + "theoretical_loss": 3.4349588452346813, + "tokens_seen": 1968400384 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002038716148445336, + "loss": 2.7543, + "theoretical_loss": 3.4349492801634995, + "tokens_seen": 1968465920 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020386158475426278, + "loss": 2.6984, + "theoretical_loss": 3.434939715499924, + "tokens_seen": 1968531456 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020385155466399197, + "loss": 2.6572, + "theoretical_loss": 3.4349301512439236, + "tokens_seen": 1968596992 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020384152457372115, + "loss": 2.4497, + "theoretical_loss": 3.434920587395468, + "tokens_seen": 1968662528 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020383149448345036, + "loss": 2.655, + "theoretical_loss": 3.4349110239545255, + "tokens_seen": 1968728064 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020382146439317956, + "loss": 2.6966, + "theoretical_loss": 3.4349014609210657, + "tokens_seen": 1968793600 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020381143430290874, + "loss": 2.7158, + "theoretical_loss": 3.4348918982950583, + "tokens_seen": 1968859136 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020380140421263793, + "loss": 2.5909, + "theoretical_loss": 3.434882336076471, + "tokens_seen": 1968924672 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002037913741223671, + "loss": 2.8302, + "theoretical_loss": 3.4348727742652736, + "tokens_seen": 1968990208 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020378134403209631, + "loss": 2.4481, + "theoretical_loss": 3.4348632128614356, + "tokens_seen": 1969055744 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002037713139418255, + "loss": 2.6457, + "theoretical_loss": 3.434853651864925, + "tokens_seen": 1969121280 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020376128385155468, + "loss": 2.5857, + "theoretical_loss": 3.4348440912757123, + "tokens_seen": 1969186816 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020375125376128386, + "loss": 2.6291, + "theoretical_loss": 3.434834531093765, + "tokens_seen": 1969252352 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020374122367101307, + "loss": 2.7276, + "theoretical_loss": 3.4348249713190535, + "tokens_seen": 1969317888 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2175295, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7439587116241455, + "objective/train/theoretical_loss": 3.4348201915844014, + "objective/train/tokens_used": 1989810656, + "theoretical_loss": 3.4348201915844014, + "tokens_seen": 1969350656 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020373119358074225, + "loss": 2.6218, + "theoretical_loss": 3.434815411951546, + "tokens_seen": 1969383424 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020372116349047143, + "loss": 2.6007, + "theoretical_loss": 3.4348058529912127, + "tokens_seen": 1969448960 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002037111334002006, + "loss": 2.8102, + "theoretical_loss": 3.434796294438022, + "tokens_seen": 1969514496 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002037011033099298, + "loss": 2.6189, + "theoretical_loss": 3.4347867362919424, + "tokens_seen": 1969580032 + }, + { + "epoch": 6.06, + "learning_rate": 0.000203691073219659, + "loss": 2.7158, + "theoretical_loss": 3.434777178552944, + "tokens_seen": 1969645568 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020368104312938818, + "loss": 2.6282, + "theoretical_loss": 3.4347676212209954, + "tokens_seen": 1969711104 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020367101303911736, + "loss": 2.6339, + "theoretical_loss": 3.4347580642960662, + "tokens_seen": 1969776640 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020366098294884654, + "loss": 2.7512, + "theoretical_loss": 3.4347485077781252, + "tokens_seen": 1969842176 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020365095285857575, + "loss": 2.582, + "theoretical_loss": 3.4347389516671414, + "tokens_seen": 1969907712 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020364092276830493, + "loss": 2.4691, + "theoretical_loss": 3.434729395963084, + "tokens_seen": 1969973248 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002036308926780341, + "loss": 2.5695, + "theoretical_loss": 3.4347198406659225, + "tokens_seen": 1970038784 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002036208625877633, + "loss": 2.6608, + "theoretical_loss": 3.4347102857756253, + "tokens_seen": 1970104320 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020361083249749248, + "loss": 2.6168, + "theoretical_loss": 3.4347007312921627, + "tokens_seen": 1970169856 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020360080240722168, + "loss": 2.7063, + "theoretical_loss": 3.4346911772155027, + "tokens_seen": 1970235392 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020359077231695086, + "loss": 2.7537, + "theoretical_loss": 3.434681623545615, + "tokens_seen": 1970300928 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020358074222668005, + "loss": 2.5204, + "theoretical_loss": 3.4346720702824687, + "tokens_seen": 1970366464 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020357071213640923, + "loss": 2.7403, + "theoretical_loss": 3.4346625174260326, + "tokens_seen": 1970432000 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020356068204613844, + "loss": 2.5202, + "theoretical_loss": 3.4346529649762765, + "tokens_seen": 1970497536 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020355065195586762, + "loss": 2.6354, + "theoretical_loss": 3.434643412933169, + "tokens_seen": 1970563072 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002035406218655968, + "loss": 2.5586, + "theoretical_loss": 3.4346338612966796, + "tokens_seen": 1970628608 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020353059177532598, + "loss": 2.6863, + "theoretical_loss": 3.434624310066777, + "tokens_seen": 1970694144 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020352056168505516, + "loss": 2.7993, + "theoretical_loss": 3.4346147592434315, + "tokens_seen": 1970759680 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020351053159478437, + "loss": 2.6088, + "theoretical_loss": 3.434605208826611, + "tokens_seen": 1970825216 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020350050150451355, + "loss": 2.5598, + "theoretical_loss": 3.4345956588162854, + "tokens_seen": 1970890752 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020349047141424273, + "loss": 2.4854, + "theoretical_loss": 3.4345861092124235, + "tokens_seen": 1970956288 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2180187, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.874157428741455, + "objective/train/theoretical_loss": 3.434581334562907, + "objective/train/tokens_used": 1991449056, + "theoretical_loss": 3.434581334562907, + "tokens_seen": 1970989056 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002034804413239719, + "loss": 2.8959, + "theoretical_loss": 3.4345765600149947, + "tokens_seen": 1971021824 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020347041123370112, + "loss": 2.8735, + "theoretical_loss": 3.434567011223968, + "tokens_seen": 1971087360 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002034603811434303, + "loss": 2.5168, + "theoretical_loss": 3.434557462839313, + "tokens_seen": 1971152896 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020345035105315948, + "loss": 2.7217, + "theoretical_loss": 3.4345479148609988, + "tokens_seen": 1971218432 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020344032096288866, + "loss": 2.7575, + "theoretical_loss": 3.4345383672889938, + "tokens_seen": 1971283968 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020343029087261784, + "loss": 2.6149, + "theoretical_loss": 3.434528820123268, + "tokens_seen": 1971349504 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020342026078234705, + "loss": 2.5829, + "theoretical_loss": 3.434519273363791, + "tokens_seen": 1971415040 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020341023069207623, + "loss": 2.6549, + "theoretical_loss": 3.434509727010531, + "tokens_seen": 1971480576 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020340020060180541, + "loss": 2.548, + "theoretical_loss": 3.4345001810634574, + "tokens_seen": 1971546112 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002033901705115346, + "loss": 2.3432, + "theoretical_loss": 3.43449063552254, + "tokens_seen": 1971611648 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002033801404212638, + "loss": 2.4991, + "theoretical_loss": 3.434481090387748, + "tokens_seen": 1971677184 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020337011033099298, + "loss": 2.5866, + "theoretical_loss": 3.4344715456590498, + "tokens_seen": 1971742720 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020336008024072217, + "loss": 2.7094, + "theoretical_loss": 3.4344620013364153, + "tokens_seen": 1971808256 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020335005015045135, + "loss": 2.7051, + "theoretical_loss": 3.434452457419814, + "tokens_seen": 1971873792 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020334002006018056, + "loss": 2.6199, + "theoretical_loss": 3.4344429139092143, + "tokens_seen": 1971939328 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020332998996990974, + "loss": 2.6744, + "theoretical_loss": 3.434433370804586, + "tokens_seen": 1972004864 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020331995987963892, + "loss": 2.6062, + "theoretical_loss": 3.4344238281058983, + "tokens_seen": 1972070400 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002033099297893681, + "loss": 2.6306, + "theoretical_loss": 3.43441428581312, + "tokens_seen": 1972135936 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020329989969909728, + "loss": 2.5141, + "theoretical_loss": 3.434404743926221, + "tokens_seen": 1972201472 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002032898696088265, + "loss": 2.7025, + "theoretical_loss": 3.43439520244517, + "tokens_seen": 1972267008 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020327983951855567, + "loss": 2.6609, + "theoretical_loss": 3.4343856613699364, + "tokens_seen": 1972332544 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020326980942828485, + "loss": 2.7909, + "theoretical_loss": 3.4343761207004895, + "tokens_seen": 1972398080 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020325977933801403, + "loss": 2.8164, + "theoretical_loss": 3.434366580436799, + "tokens_seen": 1972463616 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020324974924774324, + "loss": 2.6283, + "theoretical_loss": 3.434357040578833, + "tokens_seen": 1972529152 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020323971915747242, + "loss": 2.5346, + "theoretical_loss": 3.4343475011265623, + "tokens_seen": 1972594688 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2185242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.761200428009033, + "objective/train/theoretical_loss": 3.4343427315525528, + "objective/train/tokens_used": 1993087456, + "theoretical_loss": 3.4343427315525528, + "tokens_seen": 1972627456 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002032296890672016, + "loss": 2.7657, + "theoretical_loss": 3.434337962079955, + "tokens_seen": 1972660224 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020321965897693078, + "loss": 2.4677, + "theoretical_loss": 3.4343284234389806, + "tokens_seen": 1972725760 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020320962888665996, + "loss": 2.6205, + "theoretical_loss": 3.4343188852036093, + "tokens_seen": 1972791296 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020319959879638917, + "loss": 2.5752, + "theoretical_loss": 3.4343093473738087, + "tokens_seen": 1972856832 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020318956870611835, + "loss": 2.7338, + "theoretical_loss": 3.4342998099495494, + "tokens_seen": 1972922368 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020317953861584753, + "loss": 2.6993, + "theoretical_loss": 3.4342902729308005, + "tokens_seen": 1972987904 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020316950852557672, + "loss": 2.7084, + "theoretical_loss": 3.4342807363175307, + "tokens_seen": 1973053440 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020315947843530592, + "loss": 2.8145, + "theoretical_loss": 3.43427120010971, + "tokens_seen": 1973118976 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002031494483450351, + "loss": 2.7315, + "theoretical_loss": 3.434261664307307, + "tokens_seen": 1973184512 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020313941825476429, + "loss": 2.5918, + "theoretical_loss": 3.434252128910292, + "tokens_seen": 1973250048 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020312938816449347, + "loss": 2.6074, + "theoretical_loss": 3.434242593918633, + "tokens_seen": 1973315584 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020311935807422265, + "loss": 2.6719, + "theoretical_loss": 3.4342330593323007, + "tokens_seen": 1973381120 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020310932798395186, + "loss": 2.651, + "theoretical_loss": 3.434223525151263, + "tokens_seen": 1973446656 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020309929789368104, + "loss": 2.6945, + "theoretical_loss": 3.4342139913754903, + "tokens_seen": 1973512192 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020308926780341022, + "loss": 2.6871, + "theoretical_loss": 3.4342044580049516, + "tokens_seen": 1973577728 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002030792377131394, + "loss": 2.4657, + "theoretical_loss": 3.4341949250396158, + "tokens_seen": 1973643264 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020306920762286864, + "loss": 2.6778, + "theoretical_loss": 3.4341853924794528, + "tokens_seen": 1973708800 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020305917753259782, + "loss": 2.8264, + "theoretical_loss": 3.4341758603244315, + "tokens_seen": 1973774336 + }, + { + "epoch": 6.06, + "learning_rate": 0.000203049147442327, + "loss": 2.2585, + "theoretical_loss": 3.4341663285745216, + "tokens_seen": 1973839872 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020303911735205618, + "loss": 2.6983, + "theoretical_loss": 3.434156797229692, + "tokens_seen": 1973905408 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020302908726178536, + "loss": 2.7216, + "theoretical_loss": 3.434147266289912, + "tokens_seen": 1973970944 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020301905717151457, + "loss": 2.7063, + "theoretical_loss": 3.434137735755152, + "tokens_seen": 1974036480 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020300902708124375, + "loss": 2.7059, + "theoretical_loss": 3.43412820562538, + "tokens_seen": 1974102016 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020299899699097293, + "loss": 2.5909, + "theoretical_loss": 3.4341186759005664, + "tokens_seen": 1974167552 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002029889669007021, + "loss": 2.6379, + "theoretical_loss": 3.43410914658068, + "tokens_seen": 1974233088 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2190325, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.277067184448242, + "objective/train/theoretical_loss": 3.434104382072575, + "objective/train/tokens_used": 1994725856, + "theoretical_loss": 3.434104382072575, + "tokens_seen": 1974265856 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020297893681043132, + "loss": 2.4573, + "theoretical_loss": 3.43409961766569, + "tokens_seen": 1974298624 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002029689067201605, + "loss": 2.5783, + "theoretical_loss": 3.434090089155566, + "tokens_seen": 1974364160 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020295887662988968, + "loss": 2.7023, + "theoretical_loss": 3.434080561050277, + "tokens_seen": 1974429696 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020294884653961886, + "loss": 2.7856, + "theoretical_loss": 3.4340710333497935, + "tokens_seen": 1974495232 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020293881644934804, + "loss": 2.679, + "theoretical_loss": 3.434061506054084, + "tokens_seen": 1974560768 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020292878635907725, + "loss": 2.6404, + "theoretical_loss": 3.434051979163117, + "tokens_seen": 1974626304 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020291875626880643, + "loss": 2.6817, + "theoretical_loss": 3.434042452676864, + "tokens_seen": 1974691840 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020290872617853561, + "loss": 2.7226, + "theoretical_loss": 3.4340329265952922, + "tokens_seen": 1974757376 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002028986960882648, + "loss": 2.7224, + "theoretical_loss": 3.434023400918372, + "tokens_seen": 1974822912 + }, + { + "epoch": 6.06, + "learning_rate": 0.000202888665997994, + "loss": 2.6129, + "theoretical_loss": 3.434013875646073, + "tokens_seen": 1974888448 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020287863590772318, + "loss": 2.6646, + "theoretical_loss": 3.434004350778365, + "tokens_seen": 1974953984 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020286860581745237, + "loss": 2.5388, + "theoretical_loss": 3.4339948263152156, + "tokens_seen": 1975019520 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020285857572718155, + "loss": 2.667, + "theoretical_loss": 3.4339853022565965, + "tokens_seen": 1975085056 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020284854563691073, + "loss": 2.5201, + "theoretical_loss": 3.433975778602475, + "tokens_seen": 1975150592 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020283851554663994, + "loss": 2.7968, + "theoretical_loss": 3.433966255352822, + "tokens_seen": 1975216128 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020282848545636912, + "loss": 2.6491, + "theoretical_loss": 3.433956732507606, + "tokens_seen": 1975281664 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002028184553660983, + "loss": 2.7476, + "theoretical_loss": 3.4339472100667967, + "tokens_seen": 1975347200 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020280842527582748, + "loss": 2.6377, + "theoretical_loss": 3.4339376880303636, + "tokens_seen": 1975412736 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002027983951855567, + "loss": 2.7394, + "theoretical_loss": 3.433928166398276, + "tokens_seen": 1975478272 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020278836509528587, + "loss": 2.3912, + "theoretical_loss": 3.433918645170503, + "tokens_seen": 1975543808 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020277833500501505, + "loss": 2.5005, + "theoretical_loss": 3.4339091243470152, + "tokens_seen": 1975609344 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020276830491474423, + "loss": 2.6294, + "theoretical_loss": 3.433899603927781, + "tokens_seen": 1975674880 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020275827482447344, + "loss": 2.6261, + "theoretical_loss": 3.4338900839127695, + "tokens_seen": 1975740416 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020274824473420262, + "loss": 2.6498, + "theoretical_loss": 3.433880564301951, + "tokens_seen": 1975805952 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002027382146439318, + "loss": 2.4563, + "theoretical_loss": 3.4338710450952945, + "tokens_seen": 1975871488 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2193266, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3605144023895264, + "objective/train/theoretical_loss": 3.433866285643518, + "objective/train/tokens_used": 1996364256, + "theoretical_loss": 3.433866285643518, + "tokens_seen": 1975904256 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020272818455366098, + "loss": 2.5356, + "theoretical_loss": 3.4338615262927696, + "tokens_seen": 1975937024 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020271815446339016, + "loss": 2.562, + "theoretical_loss": 3.433852007894346, + "tokens_seen": 1976002560 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020270812437311937, + "loss": 2.4971, + "theoretical_loss": 3.433842489899993, + "tokens_seen": 1976068096 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020269809428284855, + "loss": 2.4357, + "theoretical_loss": 3.4338329723096788, + "tokens_seen": 1976133632 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020268806419257773, + "loss": 2.4446, + "theoretical_loss": 3.433823455123375, + "tokens_seen": 1976199168 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020267803410230692, + "loss": 2.7444, + "theoretical_loss": 3.433813938341049, + "tokens_seen": 1976264704 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020266800401203612, + "loss": 2.7978, + "theoretical_loss": 3.433804421962672, + "tokens_seen": 1976330240 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002026579739217653, + "loss": 2.6747, + "theoretical_loss": 3.4337949059882122, + "tokens_seen": 1976395776 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020264794383149449, + "loss": 2.7317, + "theoretical_loss": 3.43378539041764, + "tokens_seen": 1976461312 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020263791374122367, + "loss": 2.4466, + "theoretical_loss": 3.433775875250924, + "tokens_seen": 1976526848 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020262788365095285, + "loss": 2.4491, + "theoretical_loss": 3.4337663604880344, + "tokens_seen": 1976592384 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020261785356068206, + "loss": 2.7172, + "theoretical_loss": 3.43375684612894, + "tokens_seen": 1976657920 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020260782347041124, + "loss": 2.7044, + "theoretical_loss": 3.433747332173611, + "tokens_seen": 1976723456 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020259779338014042, + "loss": 2.5631, + "theoretical_loss": 3.433737818622016, + "tokens_seen": 1976788992 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002025877632898696, + "loss": 2.6689, + "theoretical_loss": 3.433728305474126, + "tokens_seen": 1976854528 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002025777331995988, + "loss": 2.5576, + "theoretical_loss": 3.4337187927299087, + "tokens_seen": 1976920064 + }, + { + "epoch": 6.06, + "learning_rate": 0.000202567703109328, + "loss": 2.3933, + "theoretical_loss": 3.4337092803893343, + "tokens_seen": 1976985600 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020255767301905717, + "loss": 2.6629, + "theoretical_loss": 3.4336997684523727, + "tokens_seen": 1977051136 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020254764292878635, + "loss": 2.595, + "theoretical_loss": 3.433690256918993, + "tokens_seen": 1977116672 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020253761283851553, + "loss": 2.5158, + "theoretical_loss": 3.4336807457891645, + "tokens_seen": 1977182208 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020252758274824474, + "loss": 2.6296, + "theoretical_loss": 3.4336712350628575, + "tokens_seen": 1977247744 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020251755265797392, + "loss": 2.4896, + "theoretical_loss": 3.4336617247400407, + "tokens_seen": 1977313280 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002025075225677031, + "loss": 2.6092, + "theoretical_loss": 3.433652214820684, + "tokens_seen": 1977378816 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020249749247743228, + "loss": 2.6046, + "theoretical_loss": 3.4336427053047567, + "tokens_seen": 1977444352 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002024874623871615, + "loss": 2.4709, + "theoretical_loss": 3.433633196192228, + "tokens_seen": 1977509888 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2194600, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.273306369781494, + "objective/train/theoretical_loss": 3.4336284417872287, + "objective/train/tokens_used": 1998002656, + "theoretical_loss": 3.4336284417872287, + "tokens_seen": 1977542656 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020247743229689067, + "loss": 2.3925, + "theoretical_loss": 3.4336236874830686, + "tokens_seen": 1977575424 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020246740220661985, + "loss": 2.8195, + "theoretical_loss": 3.4336141791772468, + "tokens_seen": 1977640960 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020245737211634904, + "loss": 2.6951, + "theoretical_loss": 3.433604671274732, + "tokens_seen": 1977706496 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020244734202607822, + "loss": 2.6677, + "theoretical_loss": 3.433595163775495, + "tokens_seen": 1977772032 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020243731193580743, + "loss": 2.653, + "theoretical_loss": 3.4335856566795044, + "tokens_seen": 1977837568 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002024272818455366, + "loss": 2.7325, + "theoretical_loss": 3.43357614998673, + "tokens_seen": 1977903104 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002024172517552658, + "loss": 2.7204, + "theoretical_loss": 3.4335666436971417, + "tokens_seen": 1977968640 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020240722166499497, + "loss": 2.6512, + "theoretical_loss": 3.4335571378107086, + "tokens_seen": 1978034176 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020239719157472418, + "loss": 2.6816, + "theoretical_loss": 3.4335476323274, + "tokens_seen": 1978099712 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020238716148445336, + "loss": 2.6562, + "theoretical_loss": 3.4335381272471857, + "tokens_seen": 1978165248 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020237713139418254, + "loss": 2.7029, + "theoretical_loss": 3.4335286225700354, + "tokens_seen": 1978230784 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020236710130391172, + "loss": 2.7346, + "theoretical_loss": 3.433519118295919, + "tokens_seen": 1978296320 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002023570712136409, + "loss": 2.7575, + "theoretical_loss": 3.433509614424805, + "tokens_seen": 1978361856 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002023470411233701, + "loss": 2.632, + "theoretical_loss": 3.433500110956664, + "tokens_seen": 1978427392 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002023370110330993, + "loss": 2.5318, + "theoretical_loss": 3.433490607891465, + "tokens_seen": 1978492928 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002023269809428285, + "loss": 2.3191, + "theoretical_loss": 3.433481105229178, + "tokens_seen": 1978558464 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020231695085255768, + "loss": 2.609, + "theoretical_loss": 3.433471602969772, + "tokens_seen": 1978624000 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002023069207622869, + "loss": 2.694, + "theoretical_loss": 3.4334621011132165, + "tokens_seen": 1978689536 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020229689067201607, + "loss": 2.443, + "theoretical_loss": 3.433452599659482, + "tokens_seen": 1978755072 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020228686058174525, + "loss": 2.66, + "theoretical_loss": 3.4334430986085374, + "tokens_seen": 1978820608 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020227683049147443, + "loss": 2.36, + "theoretical_loss": 3.4334335979603523, + "tokens_seen": 1978886144 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020226680040120364, + "loss": 2.637, + "theoretical_loss": 3.433424097714896, + "tokens_seen": 1978951680 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020225677031093282, + "loss": 2.6026, + "theoretical_loss": 3.4334145978721393, + "tokens_seen": 1979017216 + }, + { + "epoch": 6.06, + "learning_rate": 0.000202246740220662, + "loss": 2.553, + "theoretical_loss": 3.4334050984320506, + "tokens_seen": 1979082752 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020223671013039118, + "loss": 2.6663, + "theoretical_loss": 3.4333955993945997, + "tokens_seen": 1979148288 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2195396, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7587709426879883, + "objective/train/theoretical_loss": 3.4333908500268544, + "objective/train/tokens_used": 1999641056, + "theoretical_loss": 3.4333908500268544, + "tokens_seen": 1979181056 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020222668004012036, + "loss": 2.7856, + "theoretical_loss": 3.433386100759757, + "tokens_seen": 1979213824 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020221664994984957, + "loss": 2.6895, + "theoretical_loss": 3.433376602527491, + "tokens_seen": 1979279360 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020220661985957875, + "loss": 2.5302, + "theoretical_loss": 3.433367104697772, + "tokens_seen": 1979344896 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020219658976930793, + "loss": 2.5453, + "theoretical_loss": 3.4333576072705694, + "tokens_seen": 1979410432 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020218655967903712, + "loss": 2.5459, + "theoretical_loss": 3.433348110245853, + "tokens_seen": 1979475968 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020217652958876632, + "loss": 2.6611, + "theoretical_loss": 3.433338613623592, + "tokens_seen": 1979541504 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002021664994984955, + "loss": 2.563, + "theoretical_loss": 3.433329117403756, + "tokens_seen": 1979607040 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020215646940822469, + "loss": 2.4677, + "theoretical_loss": 3.4333196215863158, + "tokens_seen": 1979672576 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020214643931795387, + "loss": 2.5035, + "theoretical_loss": 3.4333101261712393, + "tokens_seen": 1979738112 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020213640922768305, + "loss": 2.6477, + "theoretical_loss": 3.433300631158497, + "tokens_seen": 1979803648 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020212637913741226, + "loss": 2.5623, + "theoretical_loss": 3.4332911365480587, + "tokens_seen": 1979869184 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020211634904714144, + "loss": 2.552, + "theoretical_loss": 3.433281642339894, + "tokens_seen": 1979934720 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020210631895687062, + "loss": 2.6248, + "theoretical_loss": 3.4332721485339723, + "tokens_seen": 1980000256 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002020962888665998, + "loss": 2.6258, + "theoretical_loss": 3.433262655130263, + "tokens_seen": 1980065792 + }, + { + "epoch": 6.06, + "learning_rate": 0.000202086258776329, + "loss": 2.4437, + "theoretical_loss": 3.4332531621287363, + "tokens_seen": 1980131328 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002020762286860582, + "loss": 2.7402, + "theoretical_loss": 3.4332436695293618, + "tokens_seen": 1980196864 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020206619859578737, + "loss": 2.4546, + "theoretical_loss": 3.4332341773321087, + "tokens_seen": 1980262400 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020205616850551655, + "loss": 2.5297, + "theoretical_loss": 3.4332246855369473, + "tokens_seen": 1980327936 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020204613841524573, + "loss": 2.4304, + "theoretical_loss": 3.4332151941438465, + "tokens_seen": 1980393472 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020203610832497494, + "loss": 2.5674, + "theoretical_loss": 3.4332057031527765, + "tokens_seen": 1980459008 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020202607823470412, + "loss": 2.4674, + "theoretical_loss": 3.4331962125637068, + "tokens_seen": 1980524544 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002020160481444333, + "loss": 2.6207, + "theoretical_loss": 3.4331867223766075, + "tokens_seen": 1980590080 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020200601805416248, + "loss": 2.6412, + "theoretical_loss": 3.433177232591447, + "tokens_seen": 1980655616 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002019959879638917, + "loss": 2.8113, + "theoretical_loss": 3.433167743208197, + "tokens_seen": 1980721152 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020198595787362087, + "loss": 2.5529, + "theoretical_loss": 3.433158254226825, + "tokens_seen": 1980786688 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2196190, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.828763961791992, + "objective/train/theoretical_loss": 3.4331535098868344, + "objective/train/tokens_used": 2001279456, + "theoretical_loss": 3.4331535098868344, + "tokens_seen": 1980819456 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020197592778335005, + "loss": 2.8576, + "theoretical_loss": 3.433148765647302, + "tokens_seen": 1980852224 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020196589769307924, + "loss": 2.7116, + "theoretical_loss": 3.433139277469598, + "tokens_seen": 1980917760 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020195586760280842, + "loss": 2.6188, + "theoretical_loss": 3.433129789693681, + "tokens_seen": 1980983296 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020194583751253763, + "loss": 2.5884, + "theoretical_loss": 3.433120302319523, + "tokens_seen": 1981048832 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002019358074222668, + "loss": 2.6758, + "theoretical_loss": 3.4331108153470917, + "tokens_seen": 1981114368 + }, + { + "epoch": 6.06, + "learning_rate": 0.000201925777331996, + "loss": 2.5441, + "theoretical_loss": 3.433101328776358, + "tokens_seen": 1981179904 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020191574724172517, + "loss": 2.598, + "theoretical_loss": 3.4330918426072907, + "tokens_seen": 1981245440 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020190571715145438, + "loss": 2.7494, + "theoretical_loss": 3.4330823568398605, + "tokens_seen": 1981310976 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020189568706118356, + "loss": 2.7003, + "theoretical_loss": 3.4330728714740366, + "tokens_seen": 1981376512 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020188565697091274, + "loss": 2.3992, + "theoretical_loss": 3.4330633865097884, + "tokens_seen": 1981442048 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020187562688064192, + "loss": 2.5396, + "theoretical_loss": 3.433053901947086, + "tokens_seen": 1981507584 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002018655967903711, + "loss": 2.5284, + "theoretical_loss": 3.4330444177858994, + "tokens_seen": 1981573120 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002018555667001003, + "loss": 2.5797, + "theoretical_loss": 3.433034934026198, + "tokens_seen": 1981638656 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002018455366098295, + "loss": 2.6793, + "theoretical_loss": 3.433025450667951, + "tokens_seen": 1981704192 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020183550651955867, + "loss": 2.6167, + "theoretical_loss": 3.433015967711129, + "tokens_seen": 1981769728 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020182547642928785, + "loss": 2.5586, + "theoretical_loss": 3.433006485155701, + "tokens_seen": 1981835264 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020181544633901706, + "loss": 2.6913, + "theoretical_loss": 3.432997003001638, + "tokens_seen": 1981900800 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020180541624874624, + "loss": 2.7768, + "theoretical_loss": 3.4329875212489083, + "tokens_seen": 1981966336 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020179538615847542, + "loss": 2.7237, + "theoretical_loss": 3.432978039897482, + "tokens_seen": 1982031872 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002017853560682046, + "loss": 2.6004, + "theoretical_loss": 3.432968558947329, + "tokens_seen": 1982097408 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020177532597793379, + "loss": 2.509, + "theoretical_loss": 3.4329590783984196, + "tokens_seen": 1982162944 + }, + { + "epoch": 6.06, + "learning_rate": 0.000201765295887663, + "loss": 2.5573, + "theoretical_loss": 3.432949598250723, + "tokens_seen": 1982228480 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020175526579739218, + "loss": 2.6996, + "theoretical_loss": 3.4329401185042085, + "tokens_seen": 1982294016 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020174523570712136, + "loss": 2.5012, + "theoretical_loss": 3.432930639158847, + "tokens_seen": 1982359552 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020173520561685054, + "loss": 2.5395, + "theoretical_loss": 3.4329211602146072, + "tokens_seen": 1982425088 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2197682, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.529543876647949, + "objective/train/theoretical_loss": 3.432916420892899, + "objective/train/tokens_used": 2002917856, + "theoretical_loss": 3.432916420892899, + "tokens_seen": 1982457856 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020172517552657975, + "loss": 2.625, + "theoretical_loss": 3.4329116816714595, + "tokens_seen": 1982490624 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020171514543630893, + "loss": 2.6081, + "theoretical_loss": 3.4329022035293733, + "tokens_seen": 1982556160 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002017051153460381, + "loss": 2.6826, + "theoretical_loss": 3.432892725788319, + "tokens_seen": 1982621696 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002016950852557673, + "loss": 2.6334, + "theoretical_loss": 3.432883248448266, + "tokens_seen": 1982687232 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002016850551654965, + "loss": 2.2707, + "theoretical_loss": 3.432873771509183, + "tokens_seen": 1982752768 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020167502507522568, + "loss": 2.548, + "theoretical_loss": 3.4328642949710417, + "tokens_seen": 1982818304 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020166499498495486, + "loss": 2.5682, + "theoretical_loss": 3.4328548188338104, + "tokens_seen": 1982883840 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020165496489468404, + "loss": 2.5454, + "theoretical_loss": 3.4328453430974597, + "tokens_seen": 1982949376 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020164493480441322, + "loss": 2.5929, + "theoretical_loss": 3.432835867761959, + "tokens_seen": 1983014912 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020163490471414243, + "loss": 2.4907, + "theoretical_loss": 3.4328263928272786, + "tokens_seen": 1983080448 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002016248746238716, + "loss": 2.6839, + "theoretical_loss": 3.432816918293388, + "tokens_seen": 1983145984 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002016148445336008, + "loss": 2.6783, + "theoretical_loss": 3.4328074441602565, + "tokens_seen": 1983211520 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020160481444332997, + "loss": 2.4567, + "theoretical_loss": 3.432797970427855, + "tokens_seen": 1983277056 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020159478435305918, + "loss": 2.4877, + "theoretical_loss": 3.432788497096152, + "tokens_seen": 1983342592 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020158475426278836, + "loss": 2.6798, + "theoretical_loss": 3.432779024165118, + "tokens_seen": 1983408128 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020157472417251757, + "loss": 2.8303, + "theoretical_loss": 3.432769551634723, + "tokens_seen": 1983473664 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020156469408224675, + "loss": 2.7191, + "theoretical_loss": 3.432760079504937, + "tokens_seen": 1983539200 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020155466399197593, + "loss": 2.5284, + "theoretical_loss": 3.432750607775729, + "tokens_seen": 1983604736 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020154463390170514, + "loss": 2.5452, + "theoretical_loss": 3.4327411364470697, + "tokens_seen": 1983670272 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020153460381143432, + "loss": 2.7376, + "theoretical_loss": 3.432731665518928, + "tokens_seen": 1983735808 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002015245737211635, + "loss": 2.5705, + "theoretical_loss": 3.432722194991274, + "tokens_seen": 1983801344 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020151454363089268, + "loss": 2.6302, + "theoretical_loss": 3.4327127248640785, + "tokens_seen": 1983866880 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002015045135406219, + "loss": 2.6422, + "theoretical_loss": 3.43270325513731, + "tokens_seen": 1983932416 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020149448345035107, + "loss": 2.5253, + "theoretical_loss": 3.4326937858109394, + "tokens_seen": 1983997952 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020148445336008026, + "loss": 2.721, + "theoretical_loss": 3.4326843168849357, + "tokens_seen": 1984063488 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2198320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.431255340576172, + "objective/train/theoretical_loss": 3.4326795825720624, + "objective/train/tokens_used": 2004556256, + "theoretical_loss": 3.4326795825720624, + "tokens_seen": 1984096256 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020147442326980944, + "loss": 2.5096, + "theoretical_loss": 3.4326748483592695, + "tokens_seen": 1984129024 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020146439317953862, + "loss": 2.7774, + "theoretical_loss": 3.43266538023391, + "tokens_seen": 1984194560 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020145436308926783, + "loss": 2.7149, + "theoretical_loss": 3.432655912508827, + "tokens_seen": 1984260096 + }, + { + "epoch": 6.06, + "learning_rate": 0.000201444332998997, + "loss": 2.553, + "theoretical_loss": 3.4326464451839915, + "tokens_seen": 1984325632 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002014343029087262, + "loss": 2.6791, + "theoretical_loss": 3.432636978259372, + "tokens_seen": 1984391168 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020142427281845537, + "loss": 2.5808, + "theoretical_loss": 3.4326275117349394, + "tokens_seen": 1984456704 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020141424272818458, + "loss": 2.5243, + "theoretical_loss": 3.4326180456106625, + "tokens_seen": 1984522240 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020140421263791376, + "loss": 2.7443, + "theoretical_loss": 3.4326085798865122, + "tokens_seen": 1984587776 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020139418254764294, + "loss": 2.8663, + "theoretical_loss": 3.432599114562458, + "tokens_seen": 1984653312 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020138415245737212, + "loss": 2.5852, + "theoretical_loss": 3.4325896496384694, + "tokens_seen": 1984718848 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002013741223671013, + "loss": 2.5786, + "theoretical_loss": 3.432580185114517, + "tokens_seen": 1984784384 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002013640922768305, + "loss": 2.5919, + "theoretical_loss": 3.4325707209905696, + "tokens_seen": 1984849920 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002013540621865597, + "loss": 2.6014, + "theoretical_loss": 3.4325612572665984, + "tokens_seen": 1984915456 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020134403209628887, + "loss": 2.613, + "theoretical_loss": 3.4325517939425723, + "tokens_seen": 1984980992 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020133400200601805, + "loss": 2.855, + "theoretical_loss": 3.432542331018462, + "tokens_seen": 1985046528 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020132397191574726, + "loss": 2.4476, + "theoretical_loss": 3.4325328684942367, + "tokens_seen": 1985112064 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020131394182547644, + "loss": 2.7378, + "theoretical_loss": 3.4325234063698664, + "tokens_seen": 1985177600 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020130391173520562, + "loss": 2.6698, + "theoretical_loss": 3.4325139446453212, + "tokens_seen": 1985243136 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002012938816449348, + "loss": 2.5875, + "theoretical_loss": 3.432504483320571, + "tokens_seen": 1985308672 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020128385155466399, + "loss": 2.5254, + "theoretical_loss": 3.432495022395586, + "tokens_seen": 1985374208 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002012738214643932, + "loss": 2.7194, + "theoretical_loss": 3.432485561870336, + "tokens_seen": 1985439744 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020126379137412238, + "loss": 2.4256, + "theoretical_loss": 3.43247610174479, + "tokens_seen": 1985505280 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020125376128385156, + "loss": 2.5481, + "theoretical_loss": 3.4324666420189187, + "tokens_seen": 1985570816 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020124373119358074, + "loss": 2.7763, + "theoretical_loss": 3.432457182692692, + "tokens_seen": 1985636352 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020123370110330995, + "loss": 2.7992, + "theoretical_loss": 3.4324477237660798, + "tokens_seen": 1985701888 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2199495, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.019129753112793, + "objective/train/theoretical_loss": 3.43244299445262, + "objective/train/tokens_used": 2006194656, + "theoretical_loss": 3.43244299445262, + "tokens_seen": 1985734656 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020122367101303913, + "loss": 2.6449, + "theoretical_loss": 3.4324382652390524, + "tokens_seen": 1985767424 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002012136409227683, + "loss": 2.7488, + "theoretical_loss": 3.432428807111579, + "tokens_seen": 1985832960 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002012036108324975, + "loss": 2.592, + "theoretical_loss": 3.43241934938363, + "tokens_seen": 1985898496 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002011935807422267, + "loss": 2.7684, + "theoretical_loss": 3.432409892055175, + "tokens_seen": 1985964032 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020118355065195588, + "loss": 2.6902, + "theoretical_loss": 3.4324004351261843, + "tokens_seen": 1986029568 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020117352056168506, + "loss": 2.4956, + "theoretical_loss": 3.4323909785966276, + "tokens_seen": 1986095104 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020116349047141424, + "loss": 2.685, + "theoretical_loss": 3.432381522466475, + "tokens_seen": 1986160640 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020115346038114342, + "loss": 2.6677, + "theoretical_loss": 3.4323720667356965, + "tokens_seen": 1986226176 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020114343029087263, + "loss": 2.7043, + "theoretical_loss": 3.432362611404262, + "tokens_seen": 1986291712 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002011334002006018, + "loss": 2.5696, + "theoretical_loss": 3.432353156472141, + "tokens_seen": 1986357248 + }, + { + "epoch": 6.06, + "learning_rate": 0.000201123370110331, + "loss": 2.3882, + "theoretical_loss": 3.4323437019393044, + "tokens_seen": 1986422784 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020111334002006017, + "loss": 2.3055, + "theoretical_loss": 3.4323342478057217, + "tokens_seen": 1986488320 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020110330992978938, + "loss": 2.5396, + "theoretical_loss": 3.432324794071363, + "tokens_seen": 1986553856 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020109327983951856, + "loss": 2.782, + "theoretical_loss": 3.4323153407361975, + "tokens_seen": 1986619392 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020108324974924774, + "loss": 2.73, + "theoretical_loss": 3.432305887800196, + "tokens_seen": 1986684928 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020107321965897692, + "loss": 2.6517, + "theoretical_loss": 3.432296435263328, + "tokens_seen": 1986750464 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002010631895687061, + "loss": 2.7806, + "theoretical_loss": 3.432286983125564, + "tokens_seen": 1986816000 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020105315947843531, + "loss": 2.6069, + "theoretical_loss": 3.432277531386874, + "tokens_seen": 1986881536 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002010431293881645, + "loss": 2.5568, + "theoretical_loss": 3.432268080047227, + "tokens_seen": 1986947072 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020103309929789368, + "loss": 2.7148, + "theoretical_loss": 3.4322586291065944, + "tokens_seen": 1987012608 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020102306920762286, + "loss": 2.5991, + "theoretical_loss": 3.432249178564945, + "tokens_seen": 1987078144 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020101303911735207, + "loss": 2.6706, + "theoretical_loss": 3.4322397284222497, + "tokens_seen": 1987143680 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020100300902708125, + "loss": 2.408, + "theoretical_loss": 3.4322302786784777, + "tokens_seen": 1987209216 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020099297893681043, + "loss": 2.7329, + "theoretical_loss": 3.4322208293335996, + "tokens_seen": 1987274752 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002009829488465396, + "loss": 2.5682, + "theoretical_loss": 3.432211380387585, + "tokens_seen": 1987340288 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2200102, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0450828075408936, + "objective/train/theoretical_loss": 3.4322066560641424, + "objective/train/tokens_used": 2007833056, + "theoretical_loss": 3.4322066560641424, + "tokens_seen": 1987373056 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002009729187562688, + "loss": 2.7907, + "theoretical_loss": 3.432201931840405, + "tokens_seen": 1987405824 + }, + { + "epoch": 6.06, + "learning_rate": 0.000200962888665998, + "loss": 2.6881, + "theoretical_loss": 3.4321924836920275, + "tokens_seen": 1987471360 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020095285857572718, + "loss": 2.776, + "theoretical_loss": 3.4321830359424244, + "tokens_seen": 1987536896 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020094282848545636, + "loss": 2.7524, + "theoretical_loss": 3.432173588591565, + "tokens_seen": 1987602432 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020093279839518554, + "loss": 2.7444, + "theoretical_loss": 3.4321641416394195, + "tokens_seen": 1987667968 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020092276830491475, + "loss": 2.4852, + "theoretical_loss": 3.4321546950859574, + "tokens_seen": 1987733504 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020091273821464393, + "loss": 2.52, + "theoretical_loss": 3.4321452489311497, + "tokens_seen": 1987799040 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002009027081243731, + "loss": 2.8314, + "theoretical_loss": 3.4321358031749654, + "tokens_seen": 1987864576 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002008926780341023, + "loss": 2.6044, + "theoretical_loss": 3.432126357817375, + "tokens_seen": 1987930112 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020088264794383147, + "loss": 2.6389, + "theoretical_loss": 3.432116912858349, + "tokens_seen": 1987995648 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020087261785356068, + "loss": 2.5202, + "theoretical_loss": 3.4321074682978567, + "tokens_seen": 1988061184 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020086258776328986, + "loss": 2.6395, + "theoretical_loss": 3.4320980241358683, + "tokens_seen": 1988126720 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020085255767301905, + "loss": 2.746, + "theoretical_loss": 3.4320885803723544, + "tokens_seen": 1988192256 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020084252758274823, + "loss": 2.6124, + "theoretical_loss": 3.4320791370072845, + "tokens_seen": 1988257792 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020083249749247743, + "loss": 2.4789, + "theoretical_loss": 3.432069694040629, + "tokens_seen": 1988323328 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020082246740220664, + "loss": 2.4769, + "theoretical_loss": 3.4320602514723575, + "tokens_seen": 1988388864 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020081243731193582, + "loss": 2.652, + "theoretical_loss": 3.4320508093024404, + "tokens_seen": 1988454400 + }, + { + "epoch": 6.06, + "learning_rate": 0.000200802407221665, + "loss": 2.55, + "theoretical_loss": 3.432041367530848, + "tokens_seen": 1988519936 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020079237713139419, + "loss": 2.5576, + "theoretical_loss": 3.4320319261575496, + "tokens_seen": 1988585472 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002007823470411234, + "loss": 2.4727, + "theoretical_loss": 3.4320224851825163, + "tokens_seen": 1988651008 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020077231695085258, + "loss": 2.4684, + "theoretical_loss": 3.432013044605717, + "tokens_seen": 1988716544 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020076228686058176, + "loss": 2.598, + "theoretical_loss": 3.4320036044271225, + "tokens_seen": 1988782080 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020075225677031094, + "loss": 2.7804, + "theoretical_loss": 3.431994164646703, + "tokens_seen": 1988847616 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020074222668004015, + "loss": 2.4845, + "theoretical_loss": 3.431984725264429, + "tokens_seen": 1988913152 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020073219658976933, + "loss": 2.5506, + "theoretical_loss": 3.431975286280269, + "tokens_seen": 1988978688 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2201576, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.202435255050659, + "objective/train/theoretical_loss": 3.4319705669374727, + "objective/train/tokens_used": 2009471456, + "theoretical_loss": 3.4319705669374727, + "tokens_seen": 1989011456 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002007221664994985, + "loss": 2.5634, + "theoretical_loss": 3.431965847694194, + "tokens_seen": 1989044224 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002007121364092277, + "loss": 2.4485, + "theoretical_loss": 3.4319564095061748, + "tokens_seen": 1989109760 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002007021063189569, + "loss": 2.6339, + "theoretical_loss": 3.4319469717161804, + "tokens_seen": 1989175296 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020069207622868608, + "loss": 2.5725, + "theoretical_loss": 3.431937534324182, + "tokens_seen": 1989240832 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020068204613841526, + "loss": 2.4322, + "theoretical_loss": 3.431928097330148, + "tokens_seen": 1989306368 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020067201604814444, + "loss": 2.4718, + "theoretical_loss": 3.43191866073405, + "tokens_seen": 1989371904 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020066198595787362, + "loss": 2.4864, + "theoretical_loss": 3.431909224535858, + "tokens_seen": 1989437440 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020065195586760283, + "loss": 2.5507, + "theoretical_loss": 3.4318997887355414, + "tokens_seen": 1989502976 + }, + { + "epoch": 6.06, + "learning_rate": 0.000200641925777332, + "loss": 2.6007, + "theoretical_loss": 3.4318903533330714, + "tokens_seen": 1989568512 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002006318956870612, + "loss": 2.6837, + "theoretical_loss": 3.4318809183284165, + "tokens_seen": 1989634048 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020062186559679037, + "loss": 2.3901, + "theoretical_loss": 3.4318714837215483, + "tokens_seen": 1989699584 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020061183550651958, + "loss": 2.5772, + "theoretical_loss": 3.431862049512436, + "tokens_seen": 1989765120 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020060180541624876, + "loss": 2.6731, + "theoretical_loss": 3.4318526157010503, + "tokens_seen": 1989830656 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020059177532597794, + "loss": 2.4923, + "theoretical_loss": 3.4318431822873614, + "tokens_seen": 1989896192 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020058174523570713, + "loss": 2.7912, + "theoretical_loss": 3.431833749271339, + "tokens_seen": 1989961728 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002005717151454363, + "loss": 2.5351, + "theoretical_loss": 3.4318243166529534, + "tokens_seen": 1990027264 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020056168505516551, + "loss": 2.5027, + "theoretical_loss": 3.4318148844321748, + "tokens_seen": 1990092800 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002005516549648947, + "loss": 2.5363, + "theoretical_loss": 3.431805452608973, + "tokens_seen": 1990158336 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020054162487462388, + "loss": 2.4346, + "theoretical_loss": 3.431796021183319, + "tokens_seen": 1990223872 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020053159478435306, + "loss": 2.6775, + "theoretical_loss": 3.4317865901551823, + "tokens_seen": 1990289408 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020052156469408227, + "loss": 2.5236, + "theoretical_loss": 3.4317771595245325, + "tokens_seen": 1990354944 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020051153460381145, + "loss": 2.2974, + "theoretical_loss": 3.4317677292913413, + "tokens_seen": 1990420480 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020050150451354063, + "loss": 2.5584, + "theoretical_loss": 3.431758299455578, + "tokens_seen": 1990486016 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002004914744232698, + "loss": 2.5095, + "theoretical_loss": 3.431748870017212, + "tokens_seen": 1990551552 + }, + { + "epoch": 6.06, + "learning_rate": 0.000200481444332999, + "loss": 2.5363, + "theoretical_loss": 3.431739440976215, + "tokens_seen": 1990617088 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2202228, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1467325687408447, + "objective/train/theoretical_loss": 3.43173472660472, + "objective/train/tokens_used": 2011109856, + "theoretical_loss": 3.43173472660472, + "tokens_seen": 1990649856 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002004714142427282, + "loss": 2.7626, + "theoretical_loss": 3.431730012332556, + "tokens_seen": 1990682624 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020046138415245738, + "loss": 2.6107, + "theoretical_loss": 3.4317205840862055, + "tokens_seen": 1990748160 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020045135406218656, + "loss": 2.3842, + "theoretical_loss": 3.431711156237134, + "tokens_seen": 1990813696 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020044132397191574, + "loss": 2.5643, + "theoretical_loss": 3.431701728785311, + "tokens_seen": 1990879232 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020043129388164495, + "loss": 2.5166, + "theoretical_loss": 3.431692301730708, + "tokens_seen": 1990944768 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020042126379137413, + "loss": 2.6078, + "theoretical_loss": 3.4316828750732933, + "tokens_seen": 1991010304 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002004112337011033, + "loss": 2.5004, + "theoretical_loss": 3.4316734488130387, + "tokens_seen": 1991075840 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002004012036108325, + "loss": 2.4937, + "theoretical_loss": 3.4316640229499136, + "tokens_seen": 1991141376 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020039117352056167, + "loss": 2.6354, + "theoretical_loss": 3.431654597483889, + "tokens_seen": 1991206912 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020038114343029088, + "loss": 2.4553, + "theoretical_loss": 3.4316451724149335, + "tokens_seen": 1991272448 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020037111334002006, + "loss": 2.4429, + "theoretical_loss": 3.431635747743019, + "tokens_seen": 1991337984 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020036108324974925, + "loss": 2.4852, + "theoretical_loss": 3.431626323468115, + "tokens_seen": 1991403520 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020035105315947843, + "loss": 2.535, + "theoretical_loss": 3.4316168995901917, + "tokens_seen": 1991469056 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020034102306920763, + "loss": 2.5947, + "theoretical_loss": 3.4316074761092192, + "tokens_seen": 1991534592 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020033099297893682, + "loss": 2.3614, + "theoretical_loss": 3.431598053025168, + "tokens_seen": 1991600128 + }, + { + "epoch": 6.06, + "learning_rate": 0.000200320962888666, + "loss": 2.6769, + "theoretical_loss": 3.431588630338008, + "tokens_seen": 1991665664 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020031093279839518, + "loss": 2.613, + "theoretical_loss": 3.43157920804771, + "tokens_seen": 1991731200 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020030090270812436, + "loss": 2.6764, + "theoretical_loss": 3.4315697861542436, + "tokens_seen": 1991796736 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020029087261785357, + "loss": 2.2781, + "theoretical_loss": 3.431560364657579, + "tokens_seen": 1991862272 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020028084252758275, + "loss": 2.6612, + "theoretical_loss": 3.4315509435576868, + "tokens_seen": 1991927808 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020027081243731193, + "loss": 2.4932, + "theoretical_loss": 3.4315415228545376, + "tokens_seen": 1991993344 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002002607823470411, + "loss": 2.4356, + "theoretical_loss": 3.4315321025481005, + "tokens_seen": 1992058880 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020025075225677032, + "loss": 2.4558, + "theoretical_loss": 3.431522682638347, + "tokens_seen": 1992124416 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002002407221664995, + "loss": 2.2041, + "theoretical_loss": 3.4315132631252463, + "tokens_seen": 1992189952 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020023069207622868, + "loss": 2.515, + "theoretical_loss": 3.4315038440087697, + "tokens_seen": 1992255488 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2203608, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.597972869873047, + "objective/train/theoretical_loss": 3.4314991345992554, + "objective/train/tokens_used": 2012748256, + "theoretical_loss": 3.4314991345992554, + "tokens_seen": 1992288256 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020022066198595786, + "loss": 2.4346, + "theoretical_loss": 3.431494425288886, + "tokens_seen": 1992321024 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020021063189568704, + "loss": 2.6398, + "theoretical_loss": 3.431485006965567, + "tokens_seen": 1992386560 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020020060180541625, + "loss": 2.6462, + "theoretical_loss": 3.431475589038782, + "tokens_seen": 1992452096 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020019057171514543, + "loss": 2.7151, + "theoretical_loss": 3.431466171508502, + "tokens_seen": 1992517632 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020018054162487461, + "loss": 2.4137, + "theoretical_loss": 3.4314567543746968, + "tokens_seen": 1992583168 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002001705115346038, + "loss": 2.7127, + "theoretical_loss": 3.4314473376373362, + "tokens_seen": 1992648704 + }, + { + "epoch": 6.06, + "learning_rate": 0.000200160481444333, + "loss": 2.5882, + "theoretical_loss": 3.431437921296391, + "tokens_seen": 1992714240 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020015045135406218, + "loss": 2.8735, + "theoretical_loss": 3.431428505351832, + "tokens_seen": 1992779776 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020014042126379137, + "loss": 2.6724, + "theoretical_loss": 3.4314190898036285, + "tokens_seen": 1992845312 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020013039117352055, + "loss": 2.7525, + "theoretical_loss": 3.431409674651751, + "tokens_seen": 1992910848 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020012036108324973, + "loss": 2.4759, + "theoretical_loss": 3.4314002598961704, + "tokens_seen": 1992976384 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020011033099297894, + "loss": 2.6471, + "theoretical_loss": 3.431390845536856, + "tokens_seen": 1993041920 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020010030090270812, + "loss": 2.5927, + "theoretical_loss": 3.4313814315737794, + "tokens_seen": 1993107456 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002000902708124373, + "loss": 2.4989, + "theoretical_loss": 3.4313720180069094, + "tokens_seen": 1993172992 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020008024072216648, + "loss": 2.4316, + "theoretical_loss": 3.4313626048362176, + "tokens_seen": 1993238528 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020007021063189571, + "loss": 2.4187, + "theoretical_loss": 3.4313531920616738, + "tokens_seen": 1993304064 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002000601805416249, + "loss": 2.269, + "theoretical_loss": 3.431343779683248, + "tokens_seen": 1993369600 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020005015045135408, + "loss": 2.4155, + "theoretical_loss": 3.431334367700911, + "tokens_seen": 1993435136 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020004012036108326, + "loss": 2.4846, + "theoretical_loss": 3.431324956114633, + "tokens_seen": 1993500672 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020003009027081247, + "loss": 2.4642, + "theoretical_loss": 3.4313155449243835, + "tokens_seen": 1993566208 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020002006018054165, + "loss": 2.5592, + "theoretical_loss": 3.431306134130134, + "tokens_seen": 1993631744 + }, + { + "epoch": 6.06, + "learning_rate": 0.00020001003009027083, + "loss": 2.5254, + "theoretical_loss": 3.4312967237318546, + "tokens_seen": 1993697280 + }, + { + "epoch": 6.06, + "learning_rate": 0.0002, + "loss": 2.5772, + "theoretical_loss": 3.4312873137295146, + "tokens_seen": 1993762816 + }, + { + "epoch": 6.06, + "learning_rate": 0.0001999899699097292, + "loss": 2.6421, + "theoretical_loss": 3.431277904123086, + "tokens_seen": 1993828352 + }, + { + "epoch": 6.06, + "learning_rate": 0.0001999799398194584, + "loss": 2.4464, + "theoretical_loss": 3.431268494912538, + "tokens_seen": 1993893888 + }, + { + "epoch": 6.06, + "objective/train/docs_used": 2204235, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7754595279693604, + "objective/train/theoretical_loss": 3.4312637904557093, + "objective/train/tokens_used": 2014386656, + "theoretical_loss": 3.4312637904557093, + "tokens_seen": 1993926656 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019996990972918758, + "loss": 2.6895, + "theoretical_loss": 3.4312590860978407, + "tokens_seen": 1993959424 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019995987963891676, + "loss": 2.5029, + "theoretical_loss": 3.4312496776789647, + "tokens_seen": 1994024960 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019994984954864594, + "loss": 2.5889, + "theoretical_loss": 3.431240269655881, + "tokens_seen": 1994090496 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019993981945837515, + "loss": 2.679, + "theoretical_loss": 3.4312308620285594, + "tokens_seen": 1994156032 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019992978936810433, + "loss": 2.5838, + "theoretical_loss": 3.4312214547969706, + "tokens_seen": 1994221568 + }, + { + "epoch": 6.06, + "learning_rate": 0.0001999197592778335, + "loss": 2.695, + "theoretical_loss": 3.4312120479610844, + "tokens_seen": 1994287104 + }, + { + "epoch": 6.06, + "learning_rate": 0.0001999097291875627, + "loss": 2.3792, + "theoretical_loss": 3.431202641520871, + "tokens_seen": 1994352640 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019989969909729187, + "loss": 2.4239, + "theoretical_loss": 3.431193235476302, + "tokens_seen": 1994418176 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019988966900702108, + "loss": 2.6039, + "theoretical_loss": 3.4311838298273463, + "tokens_seen": 1994483712 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019987963891675026, + "loss": 2.4489, + "theoretical_loss": 3.4311744245739746, + "tokens_seen": 1994549248 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019986960882647945, + "loss": 2.5003, + "theoretical_loss": 3.4311650197161585, + "tokens_seen": 1994614784 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019985957873620863, + "loss": 2.5359, + "theoretical_loss": 3.431155615253867, + "tokens_seen": 1994680320 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019984954864593783, + "loss": 2.6738, + "theoretical_loss": 3.4311462111870705, + "tokens_seen": 1994745856 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019983951855566702, + "loss": 2.6297, + "theoretical_loss": 3.43113680751574, + "tokens_seen": 1994811392 + }, + { + "epoch": 6.06, + "learning_rate": 0.0001998294884653962, + "loss": 2.4454, + "theoretical_loss": 3.431127404239846, + "tokens_seen": 1994876928 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019981945837512538, + "loss": 2.9032, + "theoretical_loss": 3.4311180013593585, + "tokens_seen": 1994942464 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019980942828485456, + "loss": 2.5726, + "theoretical_loss": 3.4311085988742476, + "tokens_seen": 1995008000 + }, + { + "epoch": 6.06, + "learning_rate": 0.00019979939819458377, + "loss": 2.4094, + "theoretical_loss": 3.4310991967844844, + "tokens_seen": 1995073536 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019978936810431295, + "loss": 2.3985, + "theoretical_loss": 3.4310897950900383, + "tokens_seen": 1995139072 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019977933801404213, + "loss": 2.6157, + "theoretical_loss": 3.4310803937908805, + "tokens_seen": 1995204608 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001997693079237713, + "loss": 2.4725, + "theoretical_loss": 3.4310709928869816, + "tokens_seen": 1995270144 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019975927783350052, + "loss": 2.5848, + "theoretical_loss": 3.431061592378311, + "tokens_seen": 1995335680 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001997492477432297, + "loss": 2.5015, + "theoretical_loss": 3.43105219226484, + "tokens_seen": 1995401216 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019973921765295888, + "loss": 2.5255, + "theoretical_loss": 3.4310427925465388, + "tokens_seen": 1995466752 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019972918756268806, + "loss": 2.6115, + "theoretical_loss": 3.4310333932233776, + "tokens_seen": 1995532288 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2205431, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6125056743621826, + "objective/train/theoretical_loss": 3.4310286937099654, + "objective/train/tokens_used": 2016025056, + "theoretical_loss": 3.4310286937099654, + "tokens_seen": 1995565056 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019971915747241724, + "loss": 2.5565, + "theoretical_loss": 3.4310239942953267, + "tokens_seen": 1995597824 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019970912738214645, + "loss": 2.5641, + "theoretical_loss": 3.431014595762357, + "tokens_seen": 1995663360 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019969909729187563, + "loss": 2.2986, + "theoretical_loss": 3.4310051976244385, + "tokens_seen": 1995728896 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019968906720160481, + "loss": 2.6132, + "theoretical_loss": 3.430995799881542, + "tokens_seen": 1995794432 + }, + { + "epoch": 6.07, + "learning_rate": 0.000199679037111334, + "loss": 2.7227, + "theoretical_loss": 3.4309864025336374, + "tokens_seen": 1995859968 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001996690070210632, + "loss": 2.4772, + "theoretical_loss": 3.430977005580696, + "tokens_seen": 1995925504 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019965897693079238, + "loss": 2.5386, + "theoretical_loss": 3.4309676090226873, + "tokens_seen": 1995991040 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019964894684052157, + "loss": 2.5913, + "theoretical_loss": 3.430958212859582, + "tokens_seen": 1996056576 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019963891675025075, + "loss": 2.6208, + "theoretical_loss": 3.430948817091351, + "tokens_seen": 1996122112 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019962888665997993, + "loss": 2.385, + "theoretical_loss": 3.430939421717964, + "tokens_seen": 1996187648 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019961885656970914, + "loss": 2.5734, + "theoretical_loss": 3.430930026739392, + "tokens_seen": 1996253184 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019960882647943832, + "loss": 2.7634, + "theoretical_loss": 3.4309206321556047, + "tokens_seen": 1996318720 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001995987963891675, + "loss": 2.3119, + "theoretical_loss": 3.430911237966574, + "tokens_seen": 1996384256 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019958876629889668, + "loss": 2.4712, + "theoretical_loss": 3.4309018441722694, + "tokens_seen": 1996449792 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001995787362086259, + "loss": 2.6212, + "theoretical_loss": 3.430892450772661, + "tokens_seen": 1996515328 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019956870611835507, + "loss": 2.6511, + "theoretical_loss": 3.4308830577677196, + "tokens_seen": 1996580864 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019955867602808425, + "loss": 2.6251, + "theoretical_loss": 3.4308736651574163, + "tokens_seen": 1996646400 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019954864593781343, + "loss": 2.4676, + "theoretical_loss": 3.4308642729417205, + "tokens_seen": 1996711936 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019953861584754264, + "loss": 2.7061, + "theoretical_loss": 3.4308548811206037, + "tokens_seen": 1996777472 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019952858575727182, + "loss": 2.5288, + "theoretical_loss": 3.4308454896940357, + "tokens_seen": 1996843008 + }, + { + "epoch": 6.07, + "learning_rate": 0.000199518555667001, + "loss": 2.6179, + "theoretical_loss": 3.4308360986619872, + "tokens_seen": 1996908544 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019950852557673018, + "loss": 2.4945, + "theoretical_loss": 3.430826708024428, + "tokens_seen": 1996974080 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019949849548645936, + "loss": 2.5947, + "theoretical_loss": 3.4308173177813304, + "tokens_seen": 1997039616 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019948846539618857, + "loss": 2.5664, + "theoretical_loss": 3.430807927932663, + "tokens_seen": 1997105152 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019947843530591775, + "loss": 2.5716, + "theoretical_loss": 3.430798538478397, + "tokens_seen": 1997170688 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2206015, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.822054386138916, + "objective/train/theoretical_loss": 3.4307938438991554, + "objective/train/tokens_used": 2017663456, + "theoretical_loss": 3.4307938438991554, + "tokens_seen": 1997203456 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019946840521564693, + "loss": 2.7327, + "theoretical_loss": 3.4307891494185028, + "tokens_seen": 1997236224 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019945837512537612, + "loss": 2.5422, + "theoretical_loss": 3.430779760752951, + "tokens_seen": 1997301760 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019944834503510532, + "loss": 2.6196, + "theoretical_loss": 3.430770372481712, + "tokens_seen": 1997367296 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001994383149448345, + "loss": 2.7897, + "theoretical_loss": 3.430760984604757, + "tokens_seen": 1997432832 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019942828485456369, + "loss": 2.5762, + "theoretical_loss": 3.4307515971220557, + "tokens_seen": 1997498368 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019941825476429287, + "loss": 2.4488, + "theoretical_loss": 3.4307422100335785, + "tokens_seen": 1997563904 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019940822467402205, + "loss": 2.5184, + "theoretical_loss": 3.430732823339296, + "tokens_seen": 1997629440 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019939819458375126, + "loss": 2.6973, + "theoretical_loss": 3.4307234370391795, + "tokens_seen": 1997694976 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019938816449348044, + "loss": 2.7098, + "theoretical_loss": 3.4307140511331986, + "tokens_seen": 1997760512 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019937813440320962, + "loss": 2.4587, + "theoretical_loss": 3.430704665621324, + "tokens_seen": 1997826048 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001993681043129388, + "loss": 2.541, + "theoretical_loss": 3.4306952805035267, + "tokens_seen": 1997891584 + }, + { + "epoch": 6.07, + "learning_rate": 0.000199358074222668, + "loss": 2.6152, + "theoretical_loss": 3.430685895779777, + "tokens_seen": 1997957120 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001993480441323972, + "loss": 2.4729, + "theoretical_loss": 3.430676511450045, + "tokens_seen": 1998022656 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019933801404212637, + "loss": 2.7543, + "theoretical_loss": 3.430667127514302, + "tokens_seen": 1998088192 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019932798395185555, + "loss": 2.764, + "theoretical_loss": 3.4306577439725174, + "tokens_seen": 1998153728 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019931795386158476, + "loss": 2.6767, + "theoretical_loss": 3.430648360824663, + "tokens_seen": 1998219264 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019930792377131397, + "loss": 2.5959, + "theoretical_loss": 3.430638978070709, + "tokens_seen": 1998284800 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019929789368104315, + "loss": 2.3643, + "theoretical_loss": 3.4306295957106254, + "tokens_seen": 1998350336 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019928786359077233, + "loss": 2.673, + "theoretical_loss": 3.4306202137443833, + "tokens_seen": 1998415872 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001992778335005015, + "loss": 2.5312, + "theoretical_loss": 3.4306108321719524, + "tokens_seen": 1998481408 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019926780341023072, + "loss": 2.6044, + "theoretical_loss": 3.4306014509933047, + "tokens_seen": 1998546944 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001992577733199599, + "loss": 2.2918, + "theoretical_loss": 3.4305920702084096, + "tokens_seen": 1998612480 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019924774322968908, + "loss": 2.5966, + "theoretical_loss": 3.4305826898172382, + "tokens_seen": 1998678016 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019923771313941826, + "loss": 2.6335, + "theoretical_loss": 3.430573309819761, + "tokens_seen": 1998743552 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019922768304914744, + "loss": 2.5783, + "theoretical_loss": 3.430563930215948, + "tokens_seen": 1998809088 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2207044, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.386420488357544, + "objective/train/theoretical_loss": 3.4305592405616565, + "objective/train/tokens_used": 2019301856, + "theoretical_loss": 3.4305592405616565, + "tokens_seen": 1998841856 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019921765295887665, + "loss": 2.4945, + "theoretical_loss": 3.4305545510057707, + "tokens_seen": 1998874624 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019920762286860583, + "loss": 2.6647, + "theoretical_loss": 3.4305451721891993, + "tokens_seen": 1998940160 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019919759277833501, + "loss": 2.6662, + "theoretical_loss": 3.4305357937662038, + "tokens_seen": 1999005696 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001991875626880642, + "loss": 2.5939, + "theoretical_loss": 3.4305264157367557, + "tokens_seen": 1999071232 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001991775325977934, + "loss": 2.6739, + "theoretical_loss": 3.4305170381008248, + "tokens_seen": 1999136768 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019916750250752258, + "loss": 2.4546, + "theoretical_loss": 3.4305076608583818, + "tokens_seen": 1999202304 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019915747241725177, + "loss": 2.3588, + "theoretical_loss": 3.4304982840093983, + "tokens_seen": 1999267840 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019914744232698095, + "loss": 2.5978, + "theoretical_loss": 3.4304889075538436, + "tokens_seen": 1999333376 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019913741223671013, + "loss": 2.2747, + "theoretical_loss": 3.430479531491689, + "tokens_seen": 1999398912 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019912738214643934, + "loss": 2.3891, + "theoretical_loss": 3.430470155822905, + "tokens_seen": 1999464448 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019911735205616852, + "loss": 2.4095, + "theoretical_loss": 3.430460780547462, + "tokens_seen": 1999529984 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001991073219658977, + "loss": 2.4165, + "theoretical_loss": 3.4304514056653304, + "tokens_seen": 1999595520 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019909729187562688, + "loss": 2.5769, + "theoretical_loss": 3.430442031176482, + "tokens_seen": 1999661056 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001990872617853561, + "loss": 2.4471, + "theoretical_loss": 3.430432657080886, + "tokens_seen": 1999726592 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019907723169508527, + "loss": 2.4597, + "theoretical_loss": 3.4304232833785133, + "tokens_seen": 1999792128 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019906720160481445, + "loss": 2.4847, + "theoretical_loss": 3.430413910069335, + "tokens_seen": 1999857664 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019905717151454363, + "loss": 2.6729, + "theoretical_loss": 3.430404537153322, + "tokens_seen": 1999923200 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019904714142427284, + "loss": 2.5952, + "theoretical_loss": 3.4303951646304442, + "tokens_seen": 1999988736 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019903711133400202, + "loss": 2.4516, + "theoretical_loss": 3.4303857925006724, + "tokens_seen": 2000054272 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001990270812437312, + "loss": 2.5072, + "theoretical_loss": 3.4303764207639773, + "tokens_seen": 2000119808 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019901705115346038, + "loss": 2.5411, + "theoretical_loss": 3.4303670494203296, + "tokens_seen": 2000185344 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019900702106318956, + "loss": 2.3764, + "theoretical_loss": 3.4303576784696994, + "tokens_seen": 2000250880 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019899699097291877, + "loss": 2.6049, + "theoretical_loss": 3.430348307912058, + "tokens_seen": 2000316416 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019898696088264795, + "loss": 2.5397, + "theoretical_loss": 3.4303389377473765, + "tokens_seen": 2000381952 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019897693079237713, + "loss": 2.6403, + "theoretical_loss": 3.4303295679756243, + "tokens_seen": 2000447488 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2207760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2208430767059326, + "objective/train/theoretical_loss": 3.4303248832370876, + "objective/train/tokens_used": 2020940256, + "theoretical_loss": 3.4303248832370876, + "tokens_seen": 2000480256 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019896690070210632, + "loss": 2.3751, + "theoretical_loss": 3.430320198596773, + "tokens_seen": 2000513024 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019895687061183552, + "loss": 2.6969, + "theoretical_loss": 3.430310829610793, + "tokens_seen": 2000578560 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001989468405215647, + "loss": 2.3314, + "theoretical_loss": 3.4303014610176543, + "tokens_seen": 2000644096 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019893681043129389, + "loss": 2.5299, + "theoretical_loss": 3.4302920928173286, + "tokens_seen": 2000709632 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019892678034102307, + "loss": 2.4588, + "theoretical_loss": 3.4302827250097856, + "tokens_seen": 2000775168 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019891675025075225, + "loss": 2.5424, + "theoretical_loss": 3.430273357594997, + "tokens_seen": 2000840704 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019890672016048146, + "loss": 2.3579, + "theoretical_loss": 3.430263990572933, + "tokens_seen": 2000906240 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019889669007021064, + "loss": 2.3711, + "theoretical_loss": 3.430254623943564, + "tokens_seen": 2000971776 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019888665997993982, + "loss": 2.5051, + "theoretical_loss": 3.4302452577068605, + "tokens_seen": 2001037312 + }, + { + "epoch": 6.07, + "learning_rate": 0.000198876629889669, + "loss": 2.8324, + "theoretical_loss": 3.430235891862794, + "tokens_seen": 2001102848 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001988665997993982, + "loss": 2.4569, + "theoretical_loss": 3.4302265264113347, + "tokens_seen": 2001168384 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001988565697091274, + "loss": 2.3283, + "theoretical_loss": 3.430217161352453, + "tokens_seen": 2001233920 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019884653961885657, + "loss": 2.5208, + "theoretical_loss": 3.430207796686121, + "tokens_seen": 2001299456 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019883650952858575, + "loss": 2.6795, + "theoretical_loss": 3.430198432412307, + "tokens_seen": 2001364992 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019882647943831493, + "loss": 2.4581, + "theoretical_loss": 3.4301890685309835, + "tokens_seen": 2001430528 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019881644934804414, + "loss": 2.7483, + "theoretical_loss": 3.430179705042121, + "tokens_seen": 2001496064 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019880641925777332, + "loss": 2.2557, + "theoretical_loss": 3.4301703419456895, + "tokens_seen": 2001561600 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001987963891675025, + "loss": 2.6698, + "theoretical_loss": 3.43016097924166, + "tokens_seen": 2001627136 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019878635907723168, + "loss": 2.5683, + "theoretical_loss": 3.430151616930004, + "tokens_seen": 2001692672 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001987763289869609, + "loss": 2.1698, + "theoretical_loss": 3.430142255010691, + "tokens_seen": 2001758208 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019876629889669007, + "loss": 2.6215, + "theoretical_loss": 3.4301328934836923, + "tokens_seen": 2001823744 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019875626880641925, + "loss": 2.4379, + "theoretical_loss": 3.4301235323489787, + "tokens_seen": 2001889280 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019874623871614844, + "loss": 2.581, + "theoretical_loss": 3.4301141716065207, + "tokens_seen": 2001954816 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019873620862587762, + "loss": 2.5488, + "theoretical_loss": 3.430104811256289, + "tokens_seen": 2002020352 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019872617853560683, + "loss": 2.5981, + "theoretical_loss": 3.4300954512982544, + "tokens_seen": 2002085888 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2209066, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5827605724334717, + "objective/train/theoretical_loss": 3.4300907714663023, + "objective/train/tokens_used": 2022578656, + "theoretical_loss": 3.4300907714663023, + "tokens_seen": 2002118656 + }, + { + "epoch": 6.07, + "learning_rate": 0.000198716148445336, + "loss": 2.5699, + "theoretical_loss": 3.430086091732388, + "tokens_seen": 2002151424 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001987061183550652, + "loss": 2.7089, + "theoretical_loss": 3.4300767325586596, + "tokens_seen": 2002216960 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019869608826479437, + "loss": 2.5329, + "theoretical_loss": 3.4300673737770415, + "tokens_seen": 2002282496 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019868605817452358, + "loss": 2.6014, + "theoretical_loss": 3.4300580153875027, + "tokens_seen": 2002348032 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019867602808425276, + "loss": 2.5899, + "theoretical_loss": 3.4300486573900146, + "tokens_seen": 2002413568 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019866599799398194, + "loss": 2.459, + "theoretical_loss": 3.4300392997845486, + "tokens_seen": 2002479104 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019865596790371112, + "loss": 2.3484, + "theoretical_loss": 3.4300299425710747, + "tokens_seen": 2002544640 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001986459378134403, + "loss": 2.7613, + "theoretical_loss": 3.430020585749564, + "tokens_seen": 2002610176 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001986359077231695, + "loss": 2.4346, + "theoretical_loss": 3.4300112293199865, + "tokens_seen": 2002675712 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001986258776328987, + "loss": 2.5238, + "theoretical_loss": 3.430001873282314, + "tokens_seen": 2002741248 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019861584754262787, + "loss": 2.6874, + "theoretical_loss": 3.429992517636517, + "tokens_seen": 2002806784 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019860581745235705, + "loss": 2.4247, + "theoretical_loss": 3.429983162382566, + "tokens_seen": 2002872320 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019859578736208626, + "loss": 2.4545, + "theoretical_loss": 3.429973807520432, + "tokens_seen": 2002937856 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019858575727181544, + "loss": 2.6762, + "theoretical_loss": 3.4299644530500855, + "tokens_seen": 2003003392 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019857572718154462, + "loss": 2.447, + "theoretical_loss": 3.4299550989714973, + "tokens_seen": 2003068928 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019856569709127383, + "loss": 2.3633, + "theoretical_loss": 3.429945745284638, + "tokens_seen": 2003134464 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019855566700100304, + "loss": 2.2775, + "theoretical_loss": 3.4299363919894788, + "tokens_seen": 2003200000 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019854563691073222, + "loss": 2.3463, + "theoretical_loss": 3.4299270390859906, + "tokens_seen": 2003265536 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001985356068204614, + "loss": 2.7624, + "theoretical_loss": 3.429917686574144, + "tokens_seen": 2003331072 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019852557673019058, + "loss": 2.6882, + "theoretical_loss": 3.4299083344539096, + "tokens_seen": 2003396608 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019851554663991976, + "loss": 2.5106, + "theoretical_loss": 3.429898982725258, + "tokens_seen": 2003462144 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019850551654964897, + "loss": 2.2386, + "theoretical_loss": 3.4298896313881606, + "tokens_seen": 2003527680 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019849548645937815, + "loss": 2.7426, + "theoretical_loss": 3.429880280442588, + "tokens_seen": 2003593216 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019848545636910733, + "loss": 2.622, + "theoretical_loss": 3.429870929888511, + "tokens_seen": 2003658752 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019847542627883652, + "loss": 2.6302, + "theoretical_loss": 3.4298615797259, + "tokens_seen": 2003724288 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2209762, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3572299480438232, + "objective/train/theoretical_loss": 3.4298569047913854, + "objective/train/tokens_used": 2024217056, + "theoretical_loss": 3.4298569047913854, + "tokens_seen": 2003757056 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019846539618856572, + "loss": 2.4542, + "theoretical_loss": 3.4298522299547263, + "tokens_seen": 2003789824 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001984553660982949, + "loss": 2.5864, + "theoretical_loss": 3.4298428805749603, + "tokens_seen": 2003855360 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019844533600802409, + "loss": 2.5739, + "theoretical_loss": 3.4298335315865733, + "tokens_seen": 2003920896 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019843530591775327, + "loss": 2.4161, + "theoretical_loss": 3.4298241829895355, + "tokens_seen": 2003986432 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019842527582748245, + "loss": 2.68, + "theoretical_loss": 3.4298148347838184, + "tokens_seen": 2004051968 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019841524573721166, + "loss": 2.4999, + "theoretical_loss": 3.429805486969393, + "tokens_seen": 2004117504 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019840521564694084, + "loss": 2.4913, + "theoretical_loss": 3.429796139546229, + "tokens_seen": 2004183040 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019839518555667002, + "loss": 2.4692, + "theoretical_loss": 3.4297867925142977, + "tokens_seen": 2004248576 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001983851554663992, + "loss": 2.4689, + "theoretical_loss": 3.4297774458735706, + "tokens_seen": 2004314112 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001983751253761284, + "loss": 2.6211, + "theoretical_loss": 3.4297680996240176, + "tokens_seen": 2004379648 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001983650952858576, + "loss": 2.7147, + "theoretical_loss": 3.4297587537656105, + "tokens_seen": 2004445184 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019835506519558677, + "loss": 2.5348, + "theoretical_loss": 3.4297494082983193, + "tokens_seen": 2004510720 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019834503510531595, + "loss": 2.4825, + "theoretical_loss": 3.4297400632221153, + "tokens_seen": 2004576256 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019833500501504513, + "loss": 2.508, + "theoretical_loss": 3.4297307185369688, + "tokens_seen": 2004641792 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019832497492477434, + "loss": 2.639, + "theoretical_loss": 3.4297213742428516, + "tokens_seen": 2004707328 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019831494483450352, + "loss": 2.2593, + "theoretical_loss": 3.4297120303397337, + "tokens_seen": 2004772864 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001983049147442327, + "loss": 2.6885, + "theoretical_loss": 3.4297026868275866, + "tokens_seen": 2004838400 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019829488465396188, + "loss": 2.4581, + "theoretical_loss": 3.4296933437063806, + "tokens_seen": 2004903936 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001982848545636911, + "loss": 2.6262, + "theoretical_loss": 3.4296840009760867, + "tokens_seen": 2004969472 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019827482447342027, + "loss": 2.7047, + "theoretical_loss": 3.429674658636676, + "tokens_seen": 2005035008 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019826479438314945, + "loss": 2.6792, + "theoretical_loss": 3.4296653166881192, + "tokens_seen": 2005100544 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019825476429287864, + "loss": 2.3426, + "theoretical_loss": 3.4296559751303874, + "tokens_seen": 2005166080 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019824473420260782, + "loss": 2.3985, + "theoretical_loss": 3.429646633963451, + "tokens_seen": 2005231616 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019823470411233703, + "loss": 2.5955, + "theoretical_loss": 3.429637293187281, + "tokens_seen": 2005297152 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001982246740220662, + "loss": 2.4232, + "theoretical_loss": 3.429627952801849, + "tokens_seen": 2005362688 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2211131, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5044546127319336, + "objective/train/theoretical_loss": 3.4296232827556503, + "objective/train/tokens_used": 2025855456, + "theoretical_loss": 3.4296232827556503, + "tokens_seen": 2005395456 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001982146439317954, + "loss": 2.443, + "theoretical_loss": 3.4296186128071255, + "tokens_seen": 2005428224 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019820461384152457, + "loss": 2.5316, + "theoretical_loss": 3.4296092732030807, + "tokens_seen": 2005493760 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019819458375125378, + "loss": 2.5128, + "theoretical_loss": 3.429599933989686, + "tokens_seen": 2005559296 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019818455366098296, + "loss": 2.4833, + "theoretical_loss": 3.4295905951669123, + "tokens_seen": 2005624832 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019817452357071214, + "loss": 2.6094, + "theoretical_loss": 3.429581256734731, + "tokens_seen": 2005690368 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019816449348044132, + "loss": 2.4955, + "theoretical_loss": 3.429571918693112, + "tokens_seen": 2005755904 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001981544633901705, + "loss": 2.298, + "theoretical_loss": 3.429562581042027, + "tokens_seen": 2005821440 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001981444332998997, + "loss": 2.4574, + "theoretical_loss": 3.429553243781447, + "tokens_seen": 2005886976 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001981344032096289, + "loss": 2.5812, + "theoretical_loss": 3.429543906911342, + "tokens_seen": 2005952512 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019812437311935807, + "loss": 2.5403, + "theoretical_loss": 3.4295345704316835, + "tokens_seen": 2006018048 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019811434302908725, + "loss": 2.7513, + "theoretical_loss": 3.4295252343424423, + "tokens_seen": 2006083584 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019810431293881646, + "loss": 2.4075, + "theoretical_loss": 3.4295158986435896, + "tokens_seen": 2006149120 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019809428284854564, + "loss": 2.4988, + "theoretical_loss": 3.4295065633350963, + "tokens_seen": 2006214656 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019808425275827482, + "loss": 2.6384, + "theoretical_loss": 3.429497228416933, + "tokens_seen": 2006280192 + }, + { + "epoch": 6.07, + "learning_rate": 0.000198074222668004, + "loss": 2.6064, + "theoretical_loss": 3.4294878938890703, + "tokens_seen": 2006345728 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019806419257773319, + "loss": 2.4774, + "theoretical_loss": 3.4294785597514803, + "tokens_seen": 2006411264 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001980541624874624, + "loss": 2.3288, + "theoretical_loss": 3.429469226004133, + "tokens_seen": 2006476800 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019804413239719157, + "loss": 2.5688, + "theoretical_loss": 3.4294598926469995, + "tokens_seen": 2006542336 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019803410230692076, + "loss": 2.603, + "theoretical_loss": 3.429450559680051, + "tokens_seen": 2006607872 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019802407221664994, + "loss": 2.5761, + "theoretical_loss": 3.429441227103258, + "tokens_seen": 2006673408 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019801404212637915, + "loss": 2.4211, + "theoretical_loss": 3.429431894916592, + "tokens_seen": 2006738944 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019800401203610833, + "loss": 2.4704, + "theoretical_loss": 3.4294225631200232, + "tokens_seen": 2006804480 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001979939819458375, + "loss": 2.4464, + "theoretical_loss": 3.4294132317135237, + "tokens_seen": 2006870016 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001979839518555667, + "loss": 2.2976, + "theoretical_loss": 3.4294039006970634, + "tokens_seen": 2006935552 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001979739217652959, + "loss": 2.577, + "theoretical_loss": 3.429394570070614, + "tokens_seen": 2007001088 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2211779, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.585545301437378, + "objective/train/theoretical_loss": 3.429389904903634, + "objective/train/tokens_used": 2027493856, + "theoretical_loss": 3.429389904903634, + "tokens_seen": 2007033856 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019796389167502508, + "loss": 2.7372, + "theoretical_loss": 3.4293852398341453, + "tokens_seen": 2007066624 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019795386158475426, + "loss": 2.6262, + "theoretical_loss": 3.42937590998763, + "tokens_seen": 2007132160 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019794383149448344, + "loss": 2.702, + "theoretical_loss": 3.429366580531038, + "tokens_seen": 2007197696 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019793380140421262, + "loss": 2.6035, + "theoretical_loss": 3.42935725146434, + "tokens_seen": 2007263232 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019792377131394183, + "loss": 2.3747, + "theoretical_loss": 3.4293479227875077, + "tokens_seen": 2007328768 + }, + { + "epoch": 6.07, + "learning_rate": 0.000197913741223671, + "loss": 2.6181, + "theoretical_loss": 3.4293385945005115, + "tokens_seen": 2007394304 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001979037111334002, + "loss": 2.5793, + "theoretical_loss": 3.4293292666033226, + "tokens_seen": 2007459840 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019789368104312937, + "loss": 2.5192, + "theoretical_loss": 3.4293199390959126, + "tokens_seen": 2007525376 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019788365095285858, + "loss": 2.647, + "theoretical_loss": 3.4293106119782513, + "tokens_seen": 2007590912 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019787362086258776, + "loss": 2.5456, + "theoretical_loss": 3.429301285250311, + "tokens_seen": 2007656448 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019786359077231694, + "loss": 2.4942, + "theoretical_loss": 3.429291958912062, + "tokens_seen": 2007721984 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019785356068204612, + "loss": 2.4142, + "theoretical_loss": 3.4292826329634747, + "tokens_seen": 2007787520 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001978435305917753, + "loss": 2.471, + "theoretical_loss": 3.4292733074045207, + "tokens_seen": 2007853056 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019783350050150451, + "loss": 2.4307, + "theoretical_loss": 3.4292639822351716, + "tokens_seen": 2007918592 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001978234704112337, + "loss": 2.5715, + "theoretical_loss": 3.429254657455398, + "tokens_seen": 2007984128 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001978134403209629, + "loss": 2.4501, + "theoretical_loss": 3.42924533306517, + "tokens_seen": 2008049664 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019780341023069208, + "loss": 2.6553, + "theoretical_loss": 3.42923600906446, + "tokens_seen": 2008115200 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001977933801404213, + "loss": 2.4929, + "theoretical_loss": 3.4292266854532376, + "tokens_seen": 2008180736 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019778335005015047, + "loss": 2.517, + "theoretical_loss": 3.4292173622314754, + "tokens_seen": 2008246272 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019777331995987965, + "loss": 2.4997, + "theoretical_loss": 3.429208039399143, + "tokens_seen": 2008311808 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019776328986960884, + "loss": 2.7793, + "theoretical_loss": 3.429198716956212, + "tokens_seen": 2008377344 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019775325977933802, + "loss": 2.5741, + "theoretical_loss": 3.4291893949026546, + "tokens_seen": 2008442880 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019774322968906723, + "loss": 2.3714, + "theoretical_loss": 3.42918007323844, + "tokens_seen": 2008508416 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001977331995987964, + "loss": 2.7592, + "theoretical_loss": 3.4291707519635395, + "tokens_seen": 2008573952 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001977231695085256, + "loss": 2.5333, + "theoretical_loss": 3.429161431077925, + "tokens_seen": 2008639488 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2212802, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6943588256835938, + "objective/train/theoretical_loss": 3.4291567707810904, + "objective/train/tokens_used": 2029132256, + "theoretical_loss": 3.4291567707810904, + "tokens_seen": 2008672256 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019771313941825477, + "loss": 2.4882, + "theoretical_loss": 3.429152110581567, + "tokens_seen": 2008705024 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019770310932798398, + "loss": 2.5904, + "theoretical_loss": 3.4291427904744367, + "tokens_seen": 2008770560 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019769307923771316, + "loss": 2.4968, + "theoretical_loss": 3.429133470756505, + "tokens_seen": 2008836096 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019768304914744234, + "loss": 2.69, + "theoretical_loss": 3.4291241514277435, + "tokens_seen": 2008901632 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019767301905717152, + "loss": 2.5312, + "theoretical_loss": 3.4291148324881227, + "tokens_seen": 2008967168 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001976629889669007, + "loss": 2.5895, + "theoretical_loss": 3.429105513937613, + "tokens_seen": 2009032704 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001976529588766299, + "loss": 2.6422, + "theoretical_loss": 3.4290961957761876, + "tokens_seen": 2009098240 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001976429287863591, + "loss": 2.5551, + "theoretical_loss": 3.4290868780038153, + "tokens_seen": 2009163776 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019763289869608827, + "loss": 2.678, + "theoretical_loss": 3.4290775606204686, + "tokens_seen": 2009229312 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019762286860581745, + "loss": 2.5931, + "theoretical_loss": 3.4290682436261175, + "tokens_seen": 2009294848 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019761283851554666, + "loss": 2.5192, + "theoretical_loss": 3.429058927020734, + "tokens_seen": 2009360384 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019760280842527584, + "loss": 2.6817, + "theoretical_loss": 3.4290496108042885, + "tokens_seen": 2009425920 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019759277833500502, + "loss": 2.3846, + "theoretical_loss": 3.4290402949767524, + "tokens_seen": 2009491456 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001975827482447342, + "loss": 2.7337, + "theoretical_loss": 3.429030979538097, + "tokens_seen": 2009556992 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019757271815446339, + "loss": 2.7914, + "theoretical_loss": 3.429021664488293, + "tokens_seen": 2009622528 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001975626880641926, + "loss": 2.6122, + "theoretical_loss": 3.4290123498273113, + "tokens_seen": 2009688064 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019755265797392178, + "loss": 2.6016, + "theoretical_loss": 3.429003035555124, + "tokens_seen": 2009753600 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019754262788365096, + "loss": 2.7229, + "theoretical_loss": 3.428993721671701, + "tokens_seen": 2009819136 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019753259779338014, + "loss": 2.6358, + "theoretical_loss": 3.4289844081770138, + "tokens_seen": 2009884672 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019752256770310935, + "loss": 2.6053, + "theoretical_loss": 3.428975095071034, + "tokens_seen": 2009950208 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019751253761283853, + "loss": 2.7536, + "theoretical_loss": 3.428965782353732, + "tokens_seen": 2010015744 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001975025075225677, + "loss": 2.8927, + "theoretical_loss": 3.4289564700250796, + "tokens_seen": 2010081280 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001974924774322969, + "loss": 2.3614, + "theoretical_loss": 3.4289471580850472, + "tokens_seen": 2010146816 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001974824473420261, + "loss": 2.6811, + "theoretical_loss": 3.4289378465336062, + "tokens_seen": 2010212352 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019747241725175528, + "loss": 2.5046, + "theoretical_loss": 3.428928535370728, + "tokens_seen": 2010277888 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2213466, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7084853649139404, + "objective/train/theoretical_loss": 3.428923879934991, + "objective/train/tokens_used": 2030770656, + "theoretical_loss": 3.428923879934991, + "tokens_seen": 2010310656 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019746238716148446, + "loss": 2.479, + "theoretical_loss": 3.4289192245963833, + "tokens_seen": 2010343424 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019745235707121364, + "loss": 2.6381, + "theoretical_loss": 3.4289099142105437, + "tokens_seen": 2010408960 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019744232698094282, + "loss": 2.4071, + "theoretical_loss": 3.4289006042131795, + "tokens_seen": 2010474496 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019743229689067203, + "loss": 2.5805, + "theoretical_loss": 3.428891294604263, + "tokens_seen": 2010540032 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001974222668004012, + "loss": 2.665, + "theoretical_loss": 3.4288819853837644, + "tokens_seen": 2010605568 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001974122367101304, + "loss": 2.6371, + "theoretical_loss": 3.428872676551655, + "tokens_seen": 2010671104 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019740220661985957, + "loss": 2.7296, + "theoretical_loss": 3.4288633681079057, + "tokens_seen": 2010736640 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019739217652958878, + "loss": 2.51, + "theoretical_loss": 3.4288540600524886, + "tokens_seen": 2010802176 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019738214643931796, + "loss": 2.3996, + "theoretical_loss": 3.4288447523853742, + "tokens_seen": 2010867712 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019737211634904714, + "loss": 2.5048, + "theoretical_loss": 3.4288354451065333, + "tokens_seen": 2010933248 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019736208625877632, + "loss": 2.3563, + "theoretical_loss": 3.4288261382159377, + "tokens_seen": 2010998784 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001973520561685055, + "loss": 2.6408, + "theoretical_loss": 3.4288168317135583, + "tokens_seen": 2011064320 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019734202607823471, + "loss": 2.2463, + "theoretical_loss": 3.428807525599366, + "tokens_seen": 2011129856 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001973319959879639, + "loss": 2.7768, + "theoretical_loss": 3.4287982198733324, + "tokens_seen": 2011195392 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019732196589769308, + "loss": 2.795, + "theoretical_loss": 3.4287889145354287, + "tokens_seen": 2011260928 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019731193580742226, + "loss": 2.6637, + "theoretical_loss": 3.4287796095856256, + "tokens_seen": 2011326464 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019730190571715147, + "loss": 2.6016, + "theoretical_loss": 3.4287703050238942, + "tokens_seen": 2011392000 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019729187562688065, + "loss": 2.5543, + "theoretical_loss": 3.4287610008502067, + "tokens_seen": 2011457536 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019728184553660983, + "loss": 2.5128, + "theoretical_loss": 3.428751697064533, + "tokens_seen": 2011523072 + }, + { + "epoch": 6.07, + "learning_rate": 0.000197271815446339, + "loss": 2.5473, + "theoretical_loss": 3.4287423936668446, + "tokens_seen": 2011588608 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001972617853560682, + "loss": 2.5576, + "theoretical_loss": 3.428733090657113, + "tokens_seen": 2011654144 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001972517552657974, + "loss": 2.545, + "theoretical_loss": 3.42872378803531, + "tokens_seen": 2011719680 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019724172517552658, + "loss": 2.496, + "theoretical_loss": 3.4287144858014056, + "tokens_seen": 2011785216 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019723169508525576, + "loss": 2.722, + "theoretical_loss": 3.4287051839553717, + "tokens_seen": 2011850752 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019722166499498494, + "loss": 2.5581, + "theoretical_loss": 3.4286958824971787, + "tokens_seen": 2011916288 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2214547, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.138517379760742, + "objective/train/theoretical_loss": 3.428691231913514, + "objective/train/tokens_used": 2032409056, + "theoretical_loss": 3.428691231913514, + "tokens_seen": 2011949056 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019721163490471415, + "loss": 2.2702, + "theoretical_loss": 3.428686581426799, + "tokens_seen": 2011981824 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019720160481444333, + "loss": 2.7822, + "theoretical_loss": 3.4286772807442025, + "tokens_seen": 2012047360 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001971915747241725, + "loss": 2.6523, + "theoretical_loss": 3.4286679804493616, + "tokens_seen": 2012112896 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001971815446339017, + "loss": 2.4413, + "theoretical_loss": 3.428658680542247, + "tokens_seen": 2012178432 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019717151454363087, + "loss": 2.7283, + "theoretical_loss": 3.4286493810228293, + "tokens_seen": 2012243968 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019716148445336008, + "loss": 2.4623, + "theoretical_loss": 3.428640081891081, + "tokens_seen": 2012309504 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019715145436308926, + "loss": 2.6768, + "theoretical_loss": 3.4286307831469722, + "tokens_seen": 2012375040 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019714142427281844, + "loss": 2.4476, + "theoretical_loss": 3.4286214847904746, + "tokens_seen": 2012440576 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019713139418254763, + "loss": 2.6295, + "theoretical_loss": 3.4286121868215593, + "tokens_seen": 2012506112 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019712136409227683, + "loss": 2.6025, + "theoretical_loss": 3.4286028892401976, + "tokens_seen": 2012571648 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019711133400200602, + "loss": 2.5821, + "theoretical_loss": 3.4285935920463606, + "tokens_seen": 2012637184 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001971013039117352, + "loss": 2.6283, + "theoretical_loss": 3.42858429524002, + "tokens_seen": 2012702720 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019709127382146438, + "loss": 2.6982, + "theoretical_loss": 3.428574998821146, + "tokens_seen": 2012768256 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019708124373119356, + "loss": 2.6272, + "theoretical_loss": 3.4285657027897107, + "tokens_seen": 2012833792 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001970712136409228, + "loss": 2.5992, + "theoretical_loss": 3.4285564071456855, + "tokens_seen": 2012899328 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019706118355065198, + "loss": 2.7673, + "theoretical_loss": 3.4285471118890407, + "tokens_seen": 2012964864 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019705115346038116, + "loss": 2.6581, + "theoretical_loss": 3.4285378170197482, + "tokens_seen": 2013030400 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019704112337011034, + "loss": 2.4843, + "theoretical_loss": 3.4285285225377793, + "tokens_seen": 2013095936 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019703109327983955, + "loss": 2.5184, + "theoretical_loss": 3.4285192284431054, + "tokens_seen": 2013161472 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019702106318956873, + "loss": 2.5366, + "theoretical_loss": 3.4285099347356973, + "tokens_seen": 2013227008 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001970110330992979, + "loss": 2.5556, + "theoretical_loss": 3.4285006414155266, + "tokens_seen": 2013292544 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001970010030090271, + "loss": 2.5664, + "theoretical_loss": 3.428491348482564, + "tokens_seen": 2013358080 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001969909729187563, + "loss": 2.7131, + "theoretical_loss": 3.4284820559367812, + "tokens_seen": 2013423616 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019698094282848548, + "loss": 2.7554, + "theoretical_loss": 3.4284727637781494, + "tokens_seen": 2013489152 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019697091273821466, + "loss": 2.513, + "theoretical_loss": 3.4284634720066403, + "tokens_seen": 2013554688 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2215127, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4114856719970703, + "objective/train/theoretical_loss": 3.4284588262660476, + "objective/train/tokens_used": 2034047456, + "theoretical_loss": 3.4284588262660476, + "tokens_seen": 2013587456 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019696088264794384, + "loss": 2.5091, + "theoretical_loss": 3.428454180622224, + "tokens_seen": 2013620224 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019695085255767302, + "loss": 2.6058, + "theoretical_loss": 3.428444889624873, + "tokens_seen": 2013685760 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019694082246740223, + "loss": 2.395, + "theoretical_loss": 3.4284355990145583, + "tokens_seen": 2013751296 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001969307923771314, + "loss": 2.6074, + "theoretical_loss": 3.428426308791251, + "tokens_seen": 2013816832 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001969207622868606, + "loss": 2.7796, + "theoretical_loss": 3.428417018954922, + "tokens_seen": 2013882368 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019691073219658977, + "loss": 2.5249, + "theoretical_loss": 3.4284077295055435, + "tokens_seen": 2013947904 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019690070210631898, + "loss": 2.4523, + "theoretical_loss": 3.428398440443086, + "tokens_seen": 2014013440 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019689067201604816, + "loss": 2.5872, + "theoretical_loss": 3.428389151767521, + "tokens_seen": 2014078976 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019688064192577734, + "loss": 2.5791, + "theoretical_loss": 3.4283798634788196, + "tokens_seen": 2014144512 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019687061183550652, + "loss": 2.6035, + "theoretical_loss": 3.428370575576954, + "tokens_seen": 2014210048 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001968605817452357, + "loss": 2.5395, + "theoretical_loss": 3.428361288061894, + "tokens_seen": 2014275584 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019685055165496491, + "loss": 2.3095, + "theoretical_loss": 3.4283520009336126, + "tokens_seen": 2014341120 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001968405215646941, + "loss": 2.4929, + "theoretical_loss": 3.4283427141920795, + "tokens_seen": 2014406656 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019683049147442328, + "loss": 2.5707, + "theoretical_loss": 3.4283334278372672, + "tokens_seen": 2014472192 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019682046138415246, + "loss": 2.7006, + "theoretical_loss": 3.4283241418691466, + "tokens_seen": 2014537728 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019681043129388167, + "loss": 2.6003, + "theoretical_loss": 3.428314856287689, + "tokens_seen": 2014603264 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019680040120361085, + "loss": 2.5532, + "theoretical_loss": 3.4283055710928663, + "tokens_seen": 2014668800 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019679037111334003, + "loss": 2.4538, + "theoretical_loss": 3.4282962862846484, + "tokens_seen": 2014734336 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001967803410230692, + "loss": 2.5013, + "theoretical_loss": 3.4282870018630076, + "tokens_seen": 2014799872 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001967703109327984, + "loss": 2.7054, + "theoretical_loss": 3.428277717827916, + "tokens_seen": 2014865408 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001967602808425276, + "loss": 2.5474, + "theoretical_loss": 3.4282684341793432, + "tokens_seen": 2014930944 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019675025075225678, + "loss": 2.5492, + "theoretical_loss": 3.4282591509172615, + "tokens_seen": 2014996480 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019674022066198596, + "loss": 2.5166, + "theoretical_loss": 3.4282498680416422, + "tokens_seen": 2015062016 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019673019057171514, + "loss": 2.7566, + "theoretical_loss": 3.4282405855524565, + "tokens_seen": 2015127552 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019672016048144435, + "loss": 2.8166, + "theoretical_loss": 3.4282313034496763, + "tokens_seen": 2015193088 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2216593, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6543381214141846, + "objective/train/theoretical_loss": 3.428226662543179, + "objective/train/tokens_used": 2035685856, + "theoretical_loss": 3.428226662543179, + "tokens_seen": 2015225856 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019671013039117353, + "loss": 2.496, + "theoretical_loss": 3.428222021733272, + "tokens_seen": 2015258624 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001967001003009027, + "loss": 2.6532, + "theoretical_loss": 3.4282127404032154, + "tokens_seen": 2015324160 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001966900702106319, + "loss": 2.4634, + "theoretical_loss": 3.4282034594594784, + "tokens_seen": 2015389696 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019668004012036107, + "loss": 2.7677, + "theoretical_loss": 3.4281941789020314, + "tokens_seen": 2015455232 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019667001003009028, + "loss": 2.4504, + "theoretical_loss": 3.428184898730846, + "tokens_seen": 2015520768 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019665997993981946, + "loss": 2.6598, + "theoretical_loss": 3.428175618945894, + "tokens_seen": 2015586304 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019664994984954865, + "loss": 2.8639, + "theoretical_loss": 3.428166339547147, + "tokens_seen": 2015651840 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019663991975927783, + "loss": 2.4768, + "theoretical_loss": 3.4281570605345757, + "tokens_seen": 2015717376 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019662988966900703, + "loss": 2.6725, + "theoretical_loss": 3.4281477819081516, + "tokens_seen": 2015782912 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019661985957873622, + "loss": 2.6827, + "theoretical_loss": 3.428138503667846, + "tokens_seen": 2015848448 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001966098294884654, + "loss": 2.6137, + "theoretical_loss": 3.4281292258136307, + "tokens_seen": 2015913984 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019659979939819458, + "loss": 2.6078, + "theoretical_loss": 3.4281199483454765, + "tokens_seen": 2015979520 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019658976930792376, + "loss": 2.6546, + "theoretical_loss": 3.4281106712633553, + "tokens_seen": 2016045056 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019657973921765297, + "loss": 2.5652, + "theoretical_loss": 3.4281013945672383, + "tokens_seen": 2016110592 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019656970912738215, + "loss": 2.7261, + "theoretical_loss": 3.428092118257097, + "tokens_seen": 2016176128 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019655967903711133, + "loss": 2.6092, + "theoretical_loss": 3.4280828423329024, + "tokens_seen": 2016241664 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001965496489468405, + "loss": 2.6507, + "theoretical_loss": 3.4280735667946267, + "tokens_seen": 2016307200 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019653961885656972, + "loss": 2.5, + "theoretical_loss": 3.42806429164224, + "tokens_seen": 2016372736 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001965295887662989, + "loss": 2.5191, + "theoretical_loss": 3.428055016875715, + "tokens_seen": 2016438272 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019651955867602808, + "loss": 2.8762, + "theoretical_loss": 3.428045742495023, + "tokens_seen": 2016503808 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019650952858575726, + "loss": 2.5111, + "theoretical_loss": 3.4280364685001343, + "tokens_seen": 2016569344 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019649949849548644, + "loss": 2.3789, + "theoretical_loss": 3.4280271948910213, + "tokens_seen": 2016634880 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019648946840521565, + "loss": 2.8205, + "theoretical_loss": 3.428017921667655, + "tokens_seen": 2016700416 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019647943831494483, + "loss": 2.7382, + "theoretical_loss": 3.428008648830007, + "tokens_seen": 2016765952 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019646940822467401, + "loss": 2.8756, + "theoretical_loss": 3.427999376378049, + "tokens_seen": 2016831488 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2217240, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.795468807220459, + "objective/train/theoretical_loss": 3.4279947402966946, + "objective/train/tokens_used": 2037324256, + "theoretical_loss": 3.4279947402966946, + "tokens_seen": 2016864256 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001964593781344032, + "loss": 2.702, + "theoretical_loss": 3.4279901043117516, + "tokens_seen": 2016897024 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001964493480441324, + "loss": 2.5434, + "theoretical_loss": 3.4279808326310874, + "tokens_seen": 2016962560 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019643931795386158, + "loss": 2.6022, + "theoretical_loss": 3.427971561336027, + "tokens_seen": 2017028096 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019642928786359077, + "loss": 2.5983, + "theoretical_loss": 3.4279622904265414, + "tokens_seen": 2017093632 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019641925777331995, + "loss": 2.7638, + "theoretical_loss": 3.427953019902603, + "tokens_seen": 2017159168 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019640922768304913, + "loss": 2.6794, + "theoretical_loss": 3.427943749764183, + "tokens_seen": 2017224704 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019639919759277834, + "loss": 2.7658, + "theoretical_loss": 3.4279344800112526, + "tokens_seen": 2017290240 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019638916750250752, + "loss": 2.4497, + "theoretical_loss": 3.4279252106437834, + "tokens_seen": 2017355776 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001963791374122367, + "loss": 2.3954, + "theoretical_loss": 3.4279159416617473, + "tokens_seen": 2017421312 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019636910732196588, + "loss": 2.5088, + "theoretical_loss": 3.427906673065115, + "tokens_seen": 2017486848 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001963590772316951, + "loss": 2.5956, + "theoretical_loss": 3.4278974048538577, + "tokens_seen": 2017552384 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019634904714142427, + "loss": 2.7124, + "theoretical_loss": 3.4278881370279475, + "tokens_seen": 2017617920 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019633901705115345, + "loss": 2.6299, + "theoretical_loss": 3.4278788695873565, + "tokens_seen": 2017683456 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019632898696088263, + "loss": 2.5168, + "theoretical_loss": 3.4278696025320547, + "tokens_seen": 2017748992 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019631895687061187, + "loss": 2.6088, + "theoretical_loss": 3.4278603358620146, + "tokens_seen": 2017814528 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019630892678034105, + "loss": 2.6595, + "theoretical_loss": 3.4278510695772075, + "tokens_seen": 2017880064 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019629889669007023, + "loss": 2.3272, + "theoretical_loss": 3.4278418036776044, + "tokens_seen": 2017945600 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001962888665997994, + "loss": 2.5523, + "theoretical_loss": 3.4278325381631776, + "tokens_seen": 2018011136 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001962788365095286, + "loss": 2.5834, + "theoretical_loss": 3.4278232730338973, + "tokens_seen": 2018076672 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001962688064192578, + "loss": 2.6435, + "theoretical_loss": 3.4278140082897366, + "tokens_seen": 2018142208 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019625877632898698, + "loss": 2.5815, + "theoretical_loss": 3.4278047439306656, + "tokens_seen": 2018207744 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019624874623871616, + "loss": 2.5336, + "theoretical_loss": 3.4277954799566563, + "tokens_seen": 2018273280 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019623871614844534, + "loss": 2.7866, + "theoretical_loss": 3.4277862163676804, + "tokens_seen": 2018338816 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019622868605817455, + "loss": 2.4711, + "theoretical_loss": 3.4277769531637095, + "tokens_seen": 2018404352 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019621865596790373, + "loss": 2.6196, + "theoretical_loss": 3.427767690344715, + "tokens_seen": 2018469888 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2218542, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5761263370513916, + "objective/train/theoretical_loss": 3.4277630590795747, + "objective/train/tokens_used": 2038962656, + "theoretical_loss": 3.4277630590795747, + "tokens_seen": 2018502656 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001962086258776329, + "loss": 2.5271, + "theoretical_loss": 3.4277584279106676, + "tokens_seen": 2018535424 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001961985957873621, + "loss": 2.5907, + "theoretical_loss": 3.42774916586154, + "tokens_seen": 2018600960 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019618856569709127, + "loss": 2.7287, + "theoretical_loss": 3.4277399041973027, + "tokens_seen": 2018666496 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019617853560682048, + "loss": 2.6314, + "theoretical_loss": 3.427730642917928, + "tokens_seen": 2018732032 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019616850551654966, + "loss": 2.5787, + "theoretical_loss": 3.427721382023387, + "tokens_seen": 2018797568 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019615847542627885, + "loss": 2.5696, + "theoretical_loss": 3.4277121215136512, + "tokens_seen": 2018863104 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019614844533600803, + "loss": 2.5292, + "theoretical_loss": 3.4277028613886924, + "tokens_seen": 2018928640 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019613841524573723, + "loss": 2.7014, + "theoretical_loss": 3.427693601648482, + "tokens_seen": 2018994176 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019612838515546642, + "loss": 2.6079, + "theoretical_loss": 3.4276843422929915, + "tokens_seen": 2019059712 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001961183550651956, + "loss": 2.459, + "theoretical_loss": 3.4276750833221916, + "tokens_seen": 2019125248 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019610832497492478, + "loss": 2.6182, + "theoretical_loss": 3.4276658247360556, + "tokens_seen": 2019190784 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019609829488465396, + "loss": 2.4912, + "theoretical_loss": 3.4276565665345538, + "tokens_seen": 2019256320 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019608826479438317, + "loss": 2.4646, + "theoretical_loss": 3.427647308717658, + "tokens_seen": 2019321856 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019607823470411235, + "loss": 2.6236, + "theoretical_loss": 3.4276380512853395, + "tokens_seen": 2019387392 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019606820461384153, + "loss": 2.6444, + "theoretical_loss": 3.4276287942375703, + "tokens_seen": 2019452928 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001960581745235707, + "loss": 2.7338, + "theoretical_loss": 3.4276195375743215, + "tokens_seen": 2019518464 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019604814443329992, + "loss": 2.5083, + "theoretical_loss": 3.427610281295565, + "tokens_seen": 2019584000 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001960381143430291, + "loss": 2.5852, + "theoretical_loss": 3.4276010254012723, + "tokens_seen": 2019649536 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019602808425275828, + "loss": 2.5344, + "theoretical_loss": 3.427591769891415, + "tokens_seen": 2019715072 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019601805416248746, + "loss": 2.6897, + "theoretical_loss": 3.427582514765964, + "tokens_seen": 2019780608 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019600802407221664, + "loss": 2.6179, + "theoretical_loss": 3.427573260024892, + "tokens_seen": 2019846144 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019599799398194585, + "loss": 2.641, + "theoretical_loss": 3.42756400566817, + "tokens_seen": 2019911680 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019598796389167503, + "loss": 2.2845, + "theoretical_loss": 3.4275547516957694, + "tokens_seen": 2019977216 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019597793380140421, + "loss": 2.5541, + "theoretical_loss": 3.427545498107662, + "tokens_seen": 2020042752 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001959679037111334, + "loss": 2.634, + "theoretical_loss": 3.427536244903819, + "tokens_seen": 2020108288 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2219399, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6442480087280273, + "objective/train/theoretical_loss": 3.427531618445988, + "objective/train/tokens_used": 2040601056, + "theoretical_loss": 3.427531618445988, + "tokens_seen": 2020141056 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001959578736208626, + "loss": 2.5326, + "theoretical_loss": 3.4275269920842124, + "tokens_seen": 2020173824 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019594784353059178, + "loss": 2.5681, + "theoretical_loss": 3.427517739648814, + "tokens_seen": 2020239360 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019593781344032097, + "loss": 2.7466, + "theoretical_loss": 3.4275084875975947, + "tokens_seen": 2020304896 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019592778335005015, + "loss": 2.5236, + "theoretical_loss": 3.4274992359305267, + "tokens_seen": 2020370432 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019591775325977933, + "loss": 2.7579, + "theoretical_loss": 3.427489984647581, + "tokens_seen": 2020435968 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019590772316950854, + "loss": 2.4481, + "theoretical_loss": 3.4274807337487294, + "tokens_seen": 2020501504 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019589769307923772, + "loss": 2.6024, + "theoretical_loss": 3.427471483233944, + "tokens_seen": 2020567040 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001958876629889669, + "loss": 2.7671, + "theoretical_loss": 3.4274622331031956, + "tokens_seen": 2020632576 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019587763289869608, + "loss": 2.6469, + "theoretical_loss": 3.4274529833564564, + "tokens_seen": 2020698112 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001958676028084253, + "loss": 2.5137, + "theoretical_loss": 3.427443733993698, + "tokens_seen": 2020763648 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019585757271815447, + "loss": 2.5891, + "theoretical_loss": 3.4274344850148917, + "tokens_seen": 2020829184 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019584754262788365, + "loss": 2.2111, + "theoretical_loss": 3.4274252364200093, + "tokens_seen": 2020894720 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019583751253761283, + "loss": 2.6259, + "theoretical_loss": 3.427415988209022, + "tokens_seen": 2020960256 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019582748244734204, + "loss": 2.6164, + "theoretical_loss": 3.427406740381902, + "tokens_seen": 2021025792 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019581745235707122, + "loss": 2.3906, + "theoretical_loss": 3.4273974929386206, + "tokens_seen": 2021091328 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001958074222668004, + "loss": 2.5582, + "theoretical_loss": 3.4273882458791496, + "tokens_seen": 2021156864 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019579739217652958, + "loss": 2.5075, + "theoretical_loss": 3.4273789992034605, + "tokens_seen": 2021222400 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019578736208625876, + "loss": 2.5946, + "theoretical_loss": 3.427369752911525, + "tokens_seen": 2021287936 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019577733199598797, + "loss": 2.6793, + "theoretical_loss": 3.4273605070033146, + "tokens_seen": 2021353472 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019576730190571715, + "loss": 2.6528, + "theoretical_loss": 3.427351261478801, + "tokens_seen": 2021419008 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019575727181544633, + "loss": 2.2541, + "theoretical_loss": 3.427342016337956, + "tokens_seen": 2021484544 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019574724172517552, + "loss": 2.3802, + "theoretical_loss": 3.427332771580751, + "tokens_seen": 2021550080 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019573721163490472, + "loss": 2.7394, + "theoretical_loss": 3.427323527207158, + "tokens_seen": 2021615616 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001957271815446339, + "loss": 2.6018, + "theoretical_loss": 3.427314283217148, + "tokens_seen": 2021681152 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019571715145436309, + "loss": 2.4578, + "theoretical_loss": 3.427305039610693, + "tokens_seen": 2021746688 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2220946, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.796506881713867, + "objective/train/theoretical_loss": 3.4273004179512903, + "objective/train/tokens_used": 2042239456, + "theoretical_loss": 3.4273004179512903, + "tokens_seen": 2021779456 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019570712136409227, + "loss": 2.6027, + "theoretical_loss": 3.427295796387765, + "tokens_seen": 2021812224 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019569709127382145, + "loss": 2.4899, + "theoretical_loss": 3.4272865535483352, + "tokens_seen": 2021877760 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019568706118355066, + "loss": 2.7016, + "theoretical_loss": 3.4272773110923755, + "tokens_seen": 2021943296 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019567703109327984, + "loss": 2.6204, + "theoretical_loss": 3.427268069019857, + "tokens_seen": 2022008832 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019566700100300902, + "loss": 2.4985, + "theoretical_loss": 3.4272588273307525, + "tokens_seen": 2022074368 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001956569709127382, + "loss": 2.7448, + "theoretical_loss": 3.4272495860250327, + "tokens_seen": 2022139904 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001956469408224674, + "loss": 2.4709, + "theoretical_loss": 3.42724034510267, + "tokens_seen": 2022205440 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001956369107321966, + "loss": 2.483, + "theoretical_loss": 3.427231104563635, + "tokens_seen": 2022270976 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019562688064192577, + "loss": 2.2475, + "theoretical_loss": 3.4272218644079, + "tokens_seen": 2022336512 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019561685055165495, + "loss": 2.6234, + "theoretical_loss": 3.4272126246354375, + "tokens_seen": 2022402048 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019560682046138413, + "loss": 2.68, + "theoretical_loss": 3.4272033852462176, + "tokens_seen": 2022467584 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019559679037111334, + "loss": 2.4605, + "theoretical_loss": 3.4271941462402133, + "tokens_seen": 2022533120 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019558676028084252, + "loss": 2.7428, + "theoretical_loss": 3.4271849076173955, + "tokens_seen": 2022598656 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001955767301905717, + "loss": 2.3718, + "theoretical_loss": 3.4271756693777364, + "tokens_seen": 2022664192 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001955667001003009, + "loss": 2.4838, + "theoretical_loss": 3.427166431521207, + "tokens_seen": 2022729728 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019555667001003012, + "loss": 2.1615, + "theoretical_loss": 3.42715719404778, + "tokens_seen": 2022795264 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001955466399197593, + "loss": 2.4748, + "theoretical_loss": 3.427147956957426, + "tokens_seen": 2022860800 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019553660982948848, + "loss": 2.5309, + "theoretical_loss": 3.4271387202501176, + "tokens_seen": 2022926336 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019552657973921766, + "loss": 2.5999, + "theoretical_loss": 3.427129483925826, + "tokens_seen": 2022991872 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019551654964894684, + "loss": 2.6014, + "theoretical_loss": 3.427120247984523, + "tokens_seen": 2023057408 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019550651955867605, + "loss": 2.726, + "theoretical_loss": 3.427111012426181, + "tokens_seen": 2023122944 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019549648946840523, + "loss": 2.4055, + "theoretical_loss": 3.4271017772507704, + "tokens_seen": 2023188480 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019548645937813441, + "loss": 2.4727, + "theoretical_loss": 3.427092542458264, + "tokens_seen": 2023254016 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001954764292878636, + "loss": 2.4225, + "theoretical_loss": 3.427083308048633, + "tokens_seen": 2023319552 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001954663991975928, + "loss": 2.6664, + "theoretical_loss": 3.4270740740218493, + "tokens_seen": 2023385088 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2221619, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2211577892303467, + "objective/train/theoretical_loss": 3.427069457152016, + "objective/train/tokens_used": 2043877856, + "theoretical_loss": 3.427069457152016, + "tokens_seen": 2023417856 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019545636910732198, + "loss": 2.4293, + "theoretical_loss": 3.4270648403778843, + "tokens_seen": 2023450624 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019544633901705117, + "loss": 2.8273, + "theoretical_loss": 3.42705560711671, + "tokens_seen": 2023516160 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019543630892678035, + "loss": 2.5525, + "theoretical_loss": 3.427046374238299, + "tokens_seen": 2023581696 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019542627883650953, + "loss": 2.7075, + "theoretical_loss": 3.4270371417426215, + "tokens_seen": 2023647232 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019541624874623874, + "loss": 2.5183, + "theoretical_loss": 3.42702790962965, + "tokens_seen": 2023712768 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019540621865596792, + "loss": 2.6549, + "theoretical_loss": 3.4270186778993565, + "tokens_seen": 2023778304 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001953961885656971, + "loss": 2.4644, + "theoretical_loss": 3.427009446551712, + "tokens_seen": 2023843840 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019538615847542628, + "loss": 2.4279, + "theoretical_loss": 3.4270002155866894, + "tokens_seen": 2023909376 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001953761283851555, + "loss": 2.4744, + "theoretical_loss": 3.4269909850042586, + "tokens_seen": 2023974912 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019536609829488467, + "loss": 2.6261, + "theoretical_loss": 3.4269817548043937, + "tokens_seen": 2024040448 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019535606820461385, + "loss": 2.5435, + "theoretical_loss": 3.4269725249870646, + "tokens_seen": 2024105984 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019534603811434303, + "loss": 2.7975, + "theoretical_loss": 3.4269632955522438, + "tokens_seen": 2024171520 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019533600802407224, + "loss": 2.74, + "theoretical_loss": 3.4269540664999028, + "tokens_seen": 2024237056 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019532597793380142, + "loss": 2.409, + "theoretical_loss": 3.4269448378300136, + "tokens_seen": 2024302592 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001953159478435306, + "loss": 2.5848, + "theoretical_loss": 3.4269356095425483, + "tokens_seen": 2024368128 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019530591775325978, + "loss": 2.4618, + "theoretical_loss": 3.426926381637478, + "tokens_seen": 2024433664 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019529588766298896, + "loss": 2.5457, + "theoretical_loss": 3.4269171541147747, + "tokens_seen": 2024499200 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019528585757271817, + "loss": 2.617, + "theoretical_loss": 3.42690792697441, + "tokens_seen": 2024564736 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019527582748244735, + "loss": 2.7204, + "theoretical_loss": 3.4268987002163565, + "tokens_seen": 2024630272 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019526579739217653, + "loss": 2.4672, + "theoretical_loss": 3.426889473840585, + "tokens_seen": 2024695808 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019525576730190572, + "loss": 2.4126, + "theoretical_loss": 3.426880247847068, + "tokens_seen": 2024761344 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019524573721163492, + "loss": 2.9284, + "theoretical_loss": 3.4268710222357766, + "tokens_seen": 2024826880 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001952357071213641, + "loss": 2.6603, + "theoretical_loss": 3.426861797006683, + "tokens_seen": 2024892416 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019522567703109329, + "loss": 2.7328, + "theoretical_loss": 3.426852572159759, + "tokens_seen": 2024957952 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019521564694082247, + "loss": 2.6567, + "theoretical_loss": 3.4268433476949767, + "tokens_seen": 2025023488 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2222826, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.895820379257202, + "objective/train/theoretical_loss": 3.42683873560588, + "objective/train/tokens_used": 2045516256, + "theoretical_loss": 3.42683873560588, + "tokens_seen": 2025056256 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019520561685055165, + "loss": 2.8989, + "theoretical_loss": 3.4268341236123074, + "tokens_seen": 2025089024 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019519558676028086, + "loss": 2.5819, + "theoretical_loss": 3.426824899911723, + "tokens_seen": 2025154560 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019518555667001004, + "loss": 2.5411, + "theoretical_loss": 3.4268156765931956, + "tokens_seen": 2025220096 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019517552657973922, + "loss": 2.9662, + "theoretical_loss": 3.4268064536566967, + "tokens_seen": 2025285632 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001951654964894684, + "loss": 2.586, + "theoretical_loss": 3.4267972311021984, + "tokens_seen": 2025351168 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001951554663991976, + "loss": 2.5704, + "theoretical_loss": 3.4267880089296723, + "tokens_seen": 2025416704 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001951454363089268, + "loss": 2.6985, + "theoretical_loss": 3.4267787871390896, + "tokens_seen": 2025482240 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019513540621865597, + "loss": 2.5905, + "theoretical_loss": 3.4267695657304236, + "tokens_seen": 2025547776 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019512537612838515, + "loss": 2.5204, + "theoretical_loss": 3.426760344703645, + "tokens_seen": 2025613312 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019511534603811433, + "loss": 2.488, + "theoretical_loss": 3.426751124058726, + "tokens_seen": 2025678848 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019510531594784354, + "loss": 2.4337, + "theoretical_loss": 3.426741903795638, + "tokens_seen": 2025744384 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019509528585757272, + "loss": 2.6002, + "theoretical_loss": 3.426732683914354, + "tokens_seen": 2025809920 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001950852557673019, + "loss": 2.5938, + "theoretical_loss": 3.4267234644148443, + "tokens_seen": 2025875456 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019507522567703108, + "loss": 2.8687, + "theoretical_loss": 3.426714245297082, + "tokens_seen": 2025940992 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001950651955867603, + "loss": 2.837, + "theoretical_loss": 3.426705026561038, + "tokens_seen": 2026006528 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019505516549648947, + "loss": 2.4771, + "theoretical_loss": 3.426695808206685, + "tokens_seen": 2026072064 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019504513540621865, + "loss": 2.3647, + "theoretical_loss": 3.4266865902339942, + "tokens_seen": 2026137600 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019503510531594784, + "loss": 2.7144, + "theoretical_loss": 3.4266773726429376, + "tokens_seen": 2026203136 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019502507522567702, + "loss": 2.6085, + "theoretical_loss": 3.4266681554334872, + "tokens_seen": 2026268672 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019501504513540622, + "loss": 2.8281, + "theoretical_loss": 3.426658938605615, + "tokens_seen": 2026334208 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001950050150451354, + "loss": 2.5049, + "theoretical_loss": 3.4266497221592926, + "tokens_seen": 2026399744 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001949949849548646, + "loss": 2.4063, + "theoretical_loss": 3.426640506094492, + "tokens_seen": 2026465280 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019498495486459377, + "loss": 2.5539, + "theoretical_loss": 3.426631290411185, + "tokens_seen": 2026530816 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019497492477432298, + "loss": 2.5692, + "theoretical_loss": 3.4266220751093432, + "tokens_seen": 2026596352 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019496489468405216, + "loss": 2.3331, + "theoretical_loss": 3.426612860188939, + "tokens_seen": 2026661888 + }, + { + "epoch": 6.07, + "objective/train/docs_used": 2223391, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7132108211517334, + "objective/train/theoretical_loss": 3.426608252871767, + "objective/train/tokens_used": 2047154656, + "theoretical_loss": 3.426608252871767, + "tokens_seen": 2026694656 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019495486459378134, + "loss": 2.662, + "theoretical_loss": 3.4266036456499442, + "tokens_seen": 2026727424 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019494483450351052, + "loss": 2.6785, + "theoretical_loss": 3.42659443149233, + "tokens_seen": 2026792960 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001949348044132397, + "loss": 2.6895, + "theoretical_loss": 3.4265852177160694, + "tokens_seen": 2026858496 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001949247743229689, + "loss": 2.4901, + "theoretical_loss": 3.426576004321133, + "tokens_seen": 2026924032 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001949147442326981, + "loss": 2.6898, + "theoretical_loss": 3.4265667913074935, + "tokens_seen": 2026989568 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019490471414242727, + "loss": 2.3856, + "theoretical_loss": 3.4265575786751232, + "tokens_seen": 2027055104 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019489468405215645, + "loss": 2.6132, + "theoretical_loss": 3.4265483664239933, + "tokens_seen": 2027120640 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019488465396188566, + "loss": 2.6881, + "theoretical_loss": 3.4265391545540758, + "tokens_seen": 2027186176 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019487462387161484, + "loss": 2.656, + "theoretical_loss": 3.4265299430653426, + "tokens_seen": 2027251712 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019486459378134402, + "loss": 2.3505, + "theoretical_loss": 3.4265207319577655, + "tokens_seen": 2027317248 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001948545636910732, + "loss": 2.5348, + "theoretical_loss": 3.426511521231317, + "tokens_seen": 2027382784 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019484453360080239, + "loss": 2.5066, + "theoretical_loss": 3.426502310885968, + "tokens_seen": 2027448320 + }, + { + "epoch": 6.07, + "learning_rate": 0.0001948345035105316, + "loss": 2.5822, + "theoretical_loss": 3.4264931009216912, + "tokens_seen": 2027513856 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019482447342026077, + "loss": 2.5276, + "theoretical_loss": 3.4264838913384583, + "tokens_seen": 2027579392 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019481444332998998, + "loss": 2.7382, + "theoretical_loss": 3.4264746821362415, + "tokens_seen": 2027644928 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019480441323971916, + "loss": 2.8132, + "theoretical_loss": 3.4264654733150124, + "tokens_seen": 2027710464 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019479438314944837, + "loss": 2.7733, + "theoretical_loss": 3.426456264874743, + "tokens_seen": 2027776000 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019478435305917755, + "loss": 2.7636, + "theoretical_loss": 3.4264470568154053, + "tokens_seen": 2027841536 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019477432296890673, + "loss": 2.5219, + "theoretical_loss": 3.4264378491369705, + "tokens_seen": 2027907072 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019476429287863592, + "loss": 2.4423, + "theoretical_loss": 3.426428641839412, + "tokens_seen": 2027972608 + }, + { + "epoch": 6.07, + "learning_rate": 0.00019475426278836512, + "loss": 2.7085, + "theoretical_loss": 3.4264194349227006, + "tokens_seen": 2028038144 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001947442326980943, + "loss": 2.7722, + "theoretical_loss": 3.4264102283868083, + "tokens_seen": 2028103680 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019473420260782349, + "loss": 2.6241, + "theoretical_loss": 3.4264010222317074, + "tokens_seen": 2028169216 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019472417251755267, + "loss": 2.3807, + "theoretical_loss": 3.42639181645737, + "tokens_seen": 2028234752 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019471414242728185, + "loss": 2.5009, + "theoretical_loss": 3.426382611063768, + "tokens_seen": 2028300288 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2223835, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4823851585388184, + "objective/train/theoretical_loss": 3.4263780085097335, + "objective/train/tokens_used": 2048793056, + "theoretical_loss": 3.4263780085097335, + "tokens_seen": 2028333056 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019470411233701106, + "loss": 2.6178, + "theoretical_loss": 3.4263734060508724, + "tokens_seen": 2028365824 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019469408224674024, + "loss": 2.8437, + "theoretical_loss": 3.4263642014186564, + "tokens_seen": 2028431360 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019468405215646942, + "loss": 2.4313, + "theoretical_loss": 3.426354997167091, + "tokens_seen": 2028496896 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001946740220661986, + "loss": 2.6243, + "theoretical_loss": 3.426345793296149, + "tokens_seen": 2028562432 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001946639919759278, + "loss": 2.525, + "theoretical_loss": 3.4263365898058025, + "tokens_seen": 2028627968 + }, + { + "epoch": 6.08, + "learning_rate": 0.000194653961885657, + "loss": 2.5613, + "theoretical_loss": 3.4263273866960224, + "tokens_seen": 2028693504 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019464393179538617, + "loss": 2.7174, + "theoretical_loss": 3.426318183966781, + "tokens_seen": 2028759040 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019463390170511535, + "loss": 2.3721, + "theoretical_loss": 3.4263089816180505, + "tokens_seen": 2028824576 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019462387161484453, + "loss": 2.4181, + "theoretical_loss": 3.4262997796498036, + "tokens_seen": 2028890112 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019461384152457374, + "loss": 2.4588, + "theoretical_loss": 3.426290578062011, + "tokens_seen": 2028955648 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019460381143430292, + "loss": 2.7036, + "theoretical_loss": 3.426281376854645, + "tokens_seen": 2029021184 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001945937813440321, + "loss": 2.6678, + "theoretical_loss": 3.4262721760276786, + "tokens_seen": 2029086720 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019458375125376128, + "loss": 2.5563, + "theoretical_loss": 3.426262975581082, + "tokens_seen": 2029152256 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001945737211634905, + "loss": 2.4738, + "theoretical_loss": 3.4262537755148292, + "tokens_seen": 2029217792 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019456369107321967, + "loss": 2.5416, + "theoretical_loss": 3.426244575828891, + "tokens_seen": 2029283328 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019455366098294885, + "loss": 2.6619, + "theoretical_loss": 3.4262353765232385, + "tokens_seen": 2029348864 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019454363089267804, + "loss": 2.5274, + "theoretical_loss": 3.426226177597846, + "tokens_seen": 2029414400 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019453360080240722, + "loss": 2.5692, + "theoretical_loss": 3.4262169790526835, + "tokens_seen": 2029479936 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019452357071213642, + "loss": 2.4777, + "theoretical_loss": 3.4262077808877245, + "tokens_seen": 2029545472 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001945135406218656, + "loss": 2.5478, + "theoretical_loss": 3.42619858310294, + "tokens_seen": 2029611008 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001945035105315948, + "loss": 2.441, + "theoretical_loss": 3.4261893856983026, + "tokens_seen": 2029676544 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019449348044132397, + "loss": 2.5245, + "theoretical_loss": 3.4261801886737837, + "tokens_seen": 2029742080 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019448345035105318, + "loss": 2.8162, + "theoretical_loss": 3.4261709920293555, + "tokens_seen": 2029807616 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019447342026078236, + "loss": 2.6256, + "theoretical_loss": 3.4261617957649904, + "tokens_seen": 2029873152 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019446339017051154, + "loss": 2.7774, + "theoretical_loss": 3.42615259988066, + "tokens_seen": 2029938688 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2225122, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0851826667785645, + "objective/train/theoretical_loss": 3.4261480020809993, + "objective/train/tokens_used": 2050431456, + "theoretical_loss": 3.4261480020809993, + "tokens_seen": 2029971456 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019445336008024072, + "loss": 2.8841, + "theoretical_loss": 3.4261434043763366, + "tokens_seen": 2030004224 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001944433299899699, + "loss": 2.3714, + "theoretical_loss": 3.426134209251992, + "tokens_seen": 2030069760 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001944332998996991, + "loss": 2.5417, + "theoretical_loss": 3.4261250145075985, + "tokens_seen": 2030135296 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001944232698094283, + "loss": 2.6934, + "theoretical_loss": 3.4261158201431288, + "tokens_seen": 2030200832 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019441323971915747, + "loss": 2.447, + "theoretical_loss": 3.4261066261585533, + "tokens_seen": 2030266368 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019440320962888665, + "loss": 2.472, + "theoretical_loss": 3.426097432553845, + "tokens_seen": 2030331904 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019439317953861586, + "loss": 2.3305, + "theoretical_loss": 3.426088239328976, + "tokens_seen": 2030397440 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019438314944834504, + "loss": 2.4122, + "theoretical_loss": 3.426079046483918, + "tokens_seen": 2030462976 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019437311935807422, + "loss": 2.7028, + "theoretical_loss": 3.4260698540186434, + "tokens_seen": 2030528512 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001943630892678034, + "loss": 2.5242, + "theoretical_loss": 3.426060661933124, + "tokens_seen": 2030594048 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019435305917753259, + "loss": 2.7517, + "theoretical_loss": 3.426051470227332, + "tokens_seen": 2030659584 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001943430290872618, + "loss": 2.633, + "theoretical_loss": 3.4260422789012392, + "tokens_seen": 2030725120 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019433299899699097, + "loss": 2.5746, + "theoretical_loss": 3.426033087954818, + "tokens_seen": 2030790656 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019432296890672016, + "loss": 2.6142, + "theoretical_loss": 3.4260238973880406, + "tokens_seen": 2030856192 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019431293881644934, + "loss": 2.7431, + "theoretical_loss": 3.4260147072008786, + "tokens_seen": 2030921728 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019430290872617855, + "loss": 2.448, + "theoretical_loss": 3.426005517393304, + "tokens_seen": 2030987264 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019429287863590773, + "loss": 2.5761, + "theoretical_loss": 3.42599632796529, + "tokens_seen": 2031052800 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001942828485456369, + "loss": 2.6718, + "theoretical_loss": 3.425987138916807, + "tokens_seen": 2031118336 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001942728184553661, + "loss": 2.7222, + "theoretical_loss": 3.4259779502478285, + "tokens_seen": 2031183872 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001942627883650953, + "loss": 2.5072, + "theoretical_loss": 3.4259687619583254, + "tokens_seen": 2031249408 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019425275827482448, + "loss": 2.3746, + "theoretical_loss": 3.425959574048271, + "tokens_seen": 2031314944 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019424272818455366, + "loss": 2.5063, + "theoretical_loss": 3.425950386517636, + "tokens_seen": 2031380480 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019423269809428284, + "loss": 2.416, + "theoretical_loss": 3.4259411993663935, + "tokens_seen": 2031446016 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019422266800401202, + "loss": 2.7255, + "theoretical_loss": 3.4259320125945156, + "tokens_seen": 2031511552 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019421263791374123, + "loss": 2.5213, + "theoretical_loss": 3.4259228262019743, + "tokens_seen": 2031577088 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2225934, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.481046438217163, + "objective/train/theoretical_loss": 3.4259182331479456, + "objective/train/tokens_used": 2052069856, + "theoretical_loss": 3.4259182331479456, + "tokens_seen": 2031609856 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001942026078234704, + "loss": 2.6629, + "theoretical_loss": 3.4259136401887407, + "tokens_seen": 2031642624 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001941925777331996, + "loss": 2.6418, + "theoretical_loss": 3.4259044545547885, + "tokens_seen": 2031708160 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019418254764292877, + "loss": 2.6521, + "theoretical_loss": 3.425895269300089, + "tokens_seen": 2031773696 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019417251755265798, + "loss": 2.4006, + "theoretical_loss": 3.425886084424614, + "tokens_seen": 2031839232 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019416248746238716, + "loss": 2.564, + "theoretical_loss": 3.4258768999283364, + "tokens_seen": 2031904768 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019415245737211634, + "loss": 2.4862, + "theoretical_loss": 3.4258677158112274, + "tokens_seen": 2031970304 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019414242728184552, + "loss": 2.4242, + "theoretical_loss": 3.4258585320732604, + "tokens_seen": 2032035840 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001941323971915747, + "loss": 2.816, + "theoretical_loss": 3.425849348714406, + "tokens_seen": 2032101376 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019412236710130391, + "loss": 2.475, + "theoretical_loss": 3.4258401657346376, + "tokens_seen": 2032166912 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001941123370110331, + "loss": 2.5888, + "theoretical_loss": 3.425830983133926, + "tokens_seen": 2032232448 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019410230692076228, + "loss": 2.5345, + "theoretical_loss": 3.425821800912245, + "tokens_seen": 2032297984 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019409227683049146, + "loss": 2.5152, + "theoretical_loss": 3.4258126190695655, + "tokens_seen": 2032363520 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019408224674022067, + "loss": 2.5594, + "theoretical_loss": 3.42580343760586, + "tokens_seen": 2032429056 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019407221664994985, + "loss": 2.583, + "theoretical_loss": 3.4257942565211, + "tokens_seen": 2032494592 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019406218655967905, + "loss": 2.8229, + "theoretical_loss": 3.4257850758152593, + "tokens_seen": 2032560128 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019405215646940824, + "loss": 2.5447, + "theoretical_loss": 3.425775895488308, + "tokens_seen": 2032625664 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019404212637913742, + "loss": 2.4659, + "theoretical_loss": 3.4257667155402203, + "tokens_seen": 2032691200 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019403209628886663, + "loss": 2.7017, + "theoretical_loss": 3.4257575359709667, + "tokens_seen": 2032756736 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001940220661985958, + "loss": 2.3144, + "theoretical_loss": 3.42574835678052, + "tokens_seen": 2032822272 + }, + { + "epoch": 6.08, + "learning_rate": 0.000194012036108325, + "loss": 2.5616, + "theoretical_loss": 3.425739177968852, + "tokens_seen": 2032887808 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019400200601805417, + "loss": 2.5881, + "theoretical_loss": 3.425729999535936, + "tokens_seen": 2032953344 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019399197592778338, + "loss": 2.5103, + "theoretical_loss": 3.4257208214817427, + "tokens_seen": 2033018880 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019398194583751256, + "loss": 2.3023, + "theoretical_loss": 3.425711643806245, + "tokens_seen": 2033084416 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019397191574724174, + "loss": 2.3112, + "theoretical_loss": 3.4257024665094153, + "tokens_seen": 2033149952 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019396188565697092, + "loss": 2.3049, + "theoretical_loss": 3.4256932895912247, + "tokens_seen": 2033215488 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2227203, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7987263202667236, + "objective/train/theoretical_loss": 3.425688701274111, + "objective/train/tokens_used": 2053708256, + "theoretical_loss": 3.425688701274111, + "tokens_seen": 2033248256 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001939518555667001, + "loss": 2.6619, + "theoretical_loss": 3.4256841130516467, + "tokens_seen": 2033281024 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001939418254764293, + "loss": 2.6613, + "theoretical_loss": 3.425674936890653, + "tokens_seen": 2033346560 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001939317953861585, + "loss": 2.7105, + "theoretical_loss": 3.425665761108215, + "tokens_seen": 2033412096 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019392176529588767, + "loss": 2.4832, + "theoretical_loss": 3.4256565857043064, + "tokens_seen": 2033477632 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019391173520561685, + "loss": 2.577, + "theoretical_loss": 3.425647410678898, + "tokens_seen": 2033543168 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019390170511534606, + "loss": 2.5484, + "theoretical_loss": 3.425638236031963, + "tokens_seen": 2033608704 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019389167502507524, + "loss": 2.5228, + "theoretical_loss": 3.4256290617634724, + "tokens_seen": 2033674240 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019388164493480442, + "loss": 2.9123, + "theoretical_loss": 3.4256198878734, + "tokens_seen": 2033739776 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001938716148445336, + "loss": 2.478, + "theoretical_loss": 3.4256107143617163, + "tokens_seen": 2033805312 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019386158475426279, + "loss": 2.5332, + "theoretical_loss": 3.425601541228395, + "tokens_seen": 2033870848 + }, + { + "epoch": 6.08, + "learning_rate": 0.000193851554663992, + "loss": 2.8192, + "theoretical_loss": 3.425592368473407, + "tokens_seen": 2033936384 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019384152457372117, + "loss": 2.4349, + "theoretical_loss": 3.425583196096725, + "tokens_seen": 2034001920 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019383149448345036, + "loss": 2.6362, + "theoretical_loss": 3.425574024098322, + "tokens_seen": 2034067456 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019382146439317954, + "loss": 2.658, + "theoretical_loss": 3.4255648524781694, + "tokens_seen": 2034132992 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019381143430290875, + "loss": 2.7842, + "theoretical_loss": 3.425555681236239, + "tokens_seen": 2034198528 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019380140421263793, + "loss": 2.668, + "theoretical_loss": 3.4255465103725045, + "tokens_seen": 2034264064 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001937913741223671, + "loss": 2.5498, + "theoretical_loss": 3.4255373398869366, + "tokens_seen": 2034329600 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001937813440320963, + "loss": 2.8376, + "theoretical_loss": 3.425528169779508, + "tokens_seen": 2034395136 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001937713139418255, + "loss": 2.6168, + "theoretical_loss": 3.4255190000501914, + "tokens_seen": 2034460672 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019376128385155468, + "loss": 2.5567, + "theoretical_loss": 3.4255098306989584, + "tokens_seen": 2034526208 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019375125376128386, + "loss": 2.3481, + "theoretical_loss": 3.425500661725782, + "tokens_seen": 2034591744 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019374122367101304, + "loss": 2.3187, + "theoretical_loss": 3.4254914931306333, + "tokens_seen": 2034657280 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019373119358074222, + "loss": 2.7667, + "theoretical_loss": 3.425482324913486, + "tokens_seen": 2034722816 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019372116349047143, + "loss": 2.6127, + "theoretical_loss": 3.425473157074311, + "tokens_seen": 2034788352 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001937111334002006, + "loss": 2.6685, + "theoretical_loss": 3.425463989613081, + "tokens_seen": 2034853888 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2227905, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4430534839630127, + "objective/train/theoretical_loss": 3.4254594060241867, + "objective/train/tokens_used": 2055346656, + "theoretical_loss": 3.4254594060241867, + "tokens_seen": 2034886656 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001937011033099298, + "loss": 2.3858, + "theoretical_loss": 3.4254548225297685, + "tokens_seen": 2034919424 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019369107321965897, + "loss": 2.5586, + "theoretical_loss": 3.425445655824346, + "tokens_seen": 2034984960 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019368104312938818, + "loss": 2.6116, + "theoretical_loss": 3.4254364894967844, + "tokens_seen": 2035050496 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019367101303911736, + "loss": 2.6818, + "theoretical_loss": 3.4254273235470576, + "tokens_seen": 2035116032 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019366098294884654, + "loss": 2.6034, + "theoretical_loss": 3.4254181579751366, + "tokens_seen": 2035181568 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019365095285857572, + "loss": 2.5145, + "theoretical_loss": 3.4254089927809943, + "tokens_seen": 2035247104 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001936409227683049, + "loss": 2.5667, + "theoretical_loss": 3.425399827964603, + "tokens_seen": 2035312640 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019363089267803411, + "loss": 2.5972, + "theoretical_loss": 3.4253906635259352, + "tokens_seen": 2035378176 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001936208625877633, + "loss": 2.3577, + "theoretical_loss": 3.4253814994649625, + "tokens_seen": 2035443712 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019361083249749248, + "loss": 2.6074, + "theoretical_loss": 3.4253723357816575, + "tokens_seen": 2035509248 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019360080240722166, + "loss": 2.5595, + "theoretical_loss": 3.4253631724759925, + "tokens_seen": 2035574784 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019359077231695087, + "loss": 2.5426, + "theoretical_loss": 3.42535400954794, + "tokens_seen": 2035640320 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019358074222668005, + "loss": 2.8326, + "theoretical_loss": 3.425344846997471, + "tokens_seen": 2035705856 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019357071213640923, + "loss": 2.7474, + "theoretical_loss": 3.42533568482456, + "tokens_seen": 2035771392 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001935606820461384, + "loss": 2.6425, + "theoretical_loss": 3.4253265230291774, + "tokens_seen": 2035836928 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001935506519558676, + "loss": 2.7061, + "theoretical_loss": 3.425317361611296, + "tokens_seen": 2035902464 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001935406218655968, + "loss": 2.5551, + "theoretical_loss": 3.425308200570889, + "tokens_seen": 2035968000 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019353059177532598, + "loss": 2.3384, + "theoretical_loss": 3.425299039907928, + "tokens_seen": 2036033536 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019352056168505516, + "loss": 2.6152, + "theoretical_loss": 3.425289879622385, + "tokens_seen": 2036099072 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019351053159478434, + "loss": 2.5647, + "theoretical_loss": 3.425280719714232, + "tokens_seen": 2036164608 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019350050150451355, + "loss": 2.7416, + "theoretical_loss": 3.425271560183443, + "tokens_seen": 2036230144 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019349047141424273, + "loss": 2.3163, + "theoretical_loss": 3.4252624010299884, + "tokens_seen": 2036295680 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001934804413239719, + "loss": 2.6346, + "theoretical_loss": 3.4252532422538415, + "tokens_seen": 2036361216 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001934704112337011, + "loss": 2.5161, + "theoretical_loss": 3.4252440838549747, + "tokens_seen": 2036426752 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019346038114343027, + "loss": 2.749, + "theoretical_loss": 3.42523492583336, + "tokens_seen": 2036492288 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2229485, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6127874851226807, + "objective/train/theoretical_loss": 3.425230346964013, + "objective/train/tokens_used": 2056985056, + "theoretical_loss": 3.425230346964013, + "tokens_seen": 2036525056 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019345035105315948, + "loss": 2.4141, + "theoretical_loss": 3.425225768188969, + "tokens_seen": 2036557824 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019344032096288866, + "loss": 2.7425, + "theoretical_loss": 3.4252166109217757, + "tokens_seen": 2036623360 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019343029087261784, + "loss": 2.5856, + "theoretical_loss": 3.425207454031751, + "tokens_seen": 2036688896 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019342026078234703, + "loss": 2.7823, + "theoretical_loss": 3.425198297518868, + "tokens_seen": 2036754432 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019341023069207623, + "loss": 2.7238, + "theoretical_loss": 3.4251891413830986, + "tokens_seen": 2036819968 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019340020060180542, + "loss": 2.356, + "theoretical_loss": 3.4251799856244154, + "tokens_seen": 2036885504 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001933901705115346, + "loss": 2.5381, + "theoretical_loss": 3.4251708302427906, + "tokens_seen": 2036951040 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019338014042126378, + "loss": 2.5728, + "theoretical_loss": 3.425161675238197, + "tokens_seen": 2037016576 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019337011033099296, + "loss": 2.7085, + "theoretical_loss": 3.425152520610606, + "tokens_seen": 2037082112 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019336008024072217, + "loss": 2.5224, + "theoretical_loss": 3.4251433663599906, + "tokens_seen": 2037147648 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019335005015045135, + "loss": 2.6934, + "theoretical_loss": 3.425134212486323, + "tokens_seen": 2037213184 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019334002006018053, + "loss": 2.5093, + "theoretical_loss": 3.425125058989576, + "tokens_seen": 2037278720 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001933299899699097, + "loss": 2.5908, + "theoretical_loss": 3.425115905869721, + "tokens_seen": 2037344256 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019331995987963892, + "loss": 2.6153, + "theoretical_loss": 3.425106753126731, + "tokens_seen": 2037409792 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019330992978936813, + "loss": 2.4715, + "theoretical_loss": 3.4250976007605782, + "tokens_seen": 2037475328 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001932998996990973, + "loss": 2.4864, + "theoretical_loss": 3.4250884487712354, + "tokens_seen": 2037540864 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001932898696088265, + "loss": 2.6548, + "theoretical_loss": 3.4250792971586743, + "tokens_seen": 2037606400 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001932798395185557, + "loss": 2.4282, + "theoretical_loss": 3.425070145922868, + "tokens_seen": 2037671936 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019326980942828488, + "loss": 2.4941, + "theoretical_loss": 3.425060995063788, + "tokens_seen": 2037737472 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019325977933801406, + "loss": 2.5741, + "theoretical_loss": 3.4250518445814073, + "tokens_seen": 2037803008 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019324974924774324, + "loss": 2.641, + "theoretical_loss": 3.4250426944756978, + "tokens_seen": 2037868544 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019323971915747242, + "loss": 2.5199, + "theoretical_loss": 3.4250335447466327, + "tokens_seen": 2037934080 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019322968906720163, + "loss": 2.6552, + "theoretical_loss": 3.4250243953941832, + "tokens_seen": 2037999616 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001932196589769308, + "loss": 2.4302, + "theoretical_loss": 3.425015246418323, + "tokens_seen": 2038065152 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019320962888666, + "loss": 2.7165, + "theoretical_loss": 3.4250060978190233, + "tokens_seen": 2038130688 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2230000, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8307676315307617, + "objective/train/theoretical_loss": 3.4250015236605753, + "objective/train/tokens_used": 2058623456, + "theoretical_loss": 3.4250015236605753, + "tokens_seen": 2038163456 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019319959879638917, + "loss": 2.6092, + "theoretical_loss": 3.424996949596257, + "tokens_seen": 2038196224 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019318956870611838, + "loss": 2.413, + "theoretical_loss": 3.424987801749997, + "tokens_seen": 2038261760 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019317953861584756, + "loss": 2.5025, + "theoretical_loss": 3.424978654280215, + "tokens_seen": 2038327296 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019316950852557674, + "loss": 2.5929, + "theoretical_loss": 3.4249695071868835, + "tokens_seen": 2038392832 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019315947843530592, + "loss": 2.5254, + "theoretical_loss": 3.424960360469975, + "tokens_seen": 2038458368 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001931494483450351, + "loss": 2.6288, + "theoretical_loss": 3.4249512141294622, + "tokens_seen": 2038523904 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019313941825476431, + "loss": 2.5196, + "theoretical_loss": 3.4249420681653175, + "tokens_seen": 2038589440 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001931293881644935, + "loss": 2.6967, + "theoretical_loss": 3.4249329225775127, + "tokens_seen": 2038654976 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019311935807422268, + "loss": 2.4379, + "theoretical_loss": 3.42492377736602, + "tokens_seen": 2038720512 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019310932798395186, + "loss": 2.6168, + "theoretical_loss": 3.4249146325308137, + "tokens_seen": 2038786048 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019309929789368107, + "loss": 2.6235, + "theoretical_loss": 3.4249054880718637, + "tokens_seen": 2038851584 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019308926780341025, + "loss": 2.4452, + "theoretical_loss": 3.424896343989144, + "tokens_seen": 2038917120 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019307923771313943, + "loss": 2.426, + "theoretical_loss": 3.4248872002826274, + "tokens_seen": 2038982656 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001930692076228686, + "loss": 2.6263, + "theoretical_loss": 3.424878056952285, + "tokens_seen": 2039048192 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001930591775325978, + "loss": 2.5642, + "theoretical_loss": 3.4248689139980897, + "tokens_seen": 2039113728 + }, + { + "epoch": 6.08, + "learning_rate": 0.000193049147442327, + "loss": 2.4626, + "theoretical_loss": 3.4248597714200146, + "tokens_seen": 2039179264 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019303911735205618, + "loss": 2.5183, + "theoretical_loss": 3.4248506292180316, + "tokens_seen": 2039244800 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019302908726178536, + "loss": 2.3188, + "theoretical_loss": 3.4248414873921127, + "tokens_seen": 2039310336 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019301905717151454, + "loss": 2.5064, + "theoretical_loss": 3.4248323459422307, + "tokens_seen": 2039375872 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019300902708124375, + "loss": 2.5126, + "theoretical_loss": 3.4248232048683587, + "tokens_seen": 2039441408 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019299899699097293, + "loss": 2.5694, + "theoretical_loss": 3.4248140641704685, + "tokens_seen": 2039506944 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001929889669007021, + "loss": 2.6799, + "theoretical_loss": 3.4248049238485327, + "tokens_seen": 2039572480 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001929789368104313, + "loss": 2.8109, + "theoretical_loss": 3.4247957839025234, + "tokens_seen": 2039638016 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019296890672016047, + "loss": 2.4058, + "theoretical_loss": 3.4247866443324133, + "tokens_seen": 2039703552 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019295887662988968, + "loss": 2.4997, + "theoretical_loss": 3.4247775051381755, + "tokens_seen": 2039769088 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2231327, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4359278678894043, + "objective/train/theoretical_loss": 3.424772935682, + "objective/train/tokens_used": 2060261856, + "theoretical_loss": 3.424772935682, + "tokens_seen": 2039801856 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019294884653961886, + "loss": 2.5662, + "theoretical_loss": 3.424768366319782, + "tokens_seen": 2039834624 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019293881644934804, + "loss": 2.532, + "theoretical_loss": 3.4247592278772045, + "tokens_seen": 2039900160 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019292878635907723, + "loss": 2.2277, + "theoretical_loss": 3.4247500898104164, + "tokens_seen": 2039965696 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019291875626880643, + "loss": 2.5011, + "theoretical_loss": 3.4247409521193903, + "tokens_seen": 2040031232 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019290872617853562, + "loss": 2.5451, + "theoretical_loss": 3.424731814804098, + "tokens_seen": 2040096768 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001928986960882648, + "loss": 2.5452, + "theoretical_loss": 3.424722677864512, + "tokens_seen": 2040162304 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019288866599799398, + "loss": 2.1722, + "theoretical_loss": 3.424713541300606, + "tokens_seen": 2040227840 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019287863590772316, + "loss": 2.4097, + "theoretical_loss": 3.4247044051123505, + "tokens_seen": 2040293376 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019286860581745237, + "loss": 2.6304, + "theoretical_loss": 3.4246952692997197, + "tokens_seen": 2040358912 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019285857572718155, + "loss": 2.546, + "theoretical_loss": 3.4246861338626853, + "tokens_seen": 2040424448 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019284854563691073, + "loss": 2.5186, + "theoretical_loss": 3.42467699880122, + "tokens_seen": 2040489984 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001928385155466399, + "loss": 2.4014, + "theoretical_loss": 3.424667864115296, + "tokens_seen": 2040555520 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019282848545636912, + "loss": 2.446, + "theoretical_loss": 3.424658729804886, + "tokens_seen": 2040621056 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001928184553660983, + "loss": 2.5318, + "theoretical_loss": 3.4246495958699628, + "tokens_seen": 2040686592 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019280842527582748, + "loss": 2.4585, + "theoretical_loss": 3.4246404623104985, + "tokens_seen": 2040752128 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019279839518555666, + "loss": 2.5491, + "theoretical_loss": 3.424631329126466, + "tokens_seen": 2040817664 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019278836509528584, + "loss": 2.5131, + "theoretical_loss": 3.4246221963178374, + "tokens_seen": 2040883200 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019277833500501505, + "loss": 2.6601, + "theoretical_loss": 3.4246130638845855, + "tokens_seen": 2040948736 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019276830491474423, + "loss": 2.5304, + "theoretical_loss": 3.4246039318266823, + "tokens_seen": 2041014272 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001927582748244734, + "loss": 2.4476, + "theoretical_loss": 3.424594800144101, + "tokens_seen": 2041079808 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001927482447342026, + "loss": 2.6851, + "theoretical_loss": 3.424585668836814, + "tokens_seen": 2041145344 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001927382146439318, + "loss": 2.3881, + "theoretical_loss": 3.4245765379047937, + "tokens_seen": 2041210880 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019272818455366098, + "loss": 2.4321, + "theoretical_loss": 3.4245674073480123, + "tokens_seen": 2041276416 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019271815446339017, + "loss": 2.6284, + "theoretical_loss": 3.424558277166443, + "tokens_seen": 2041341952 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019270812437311935, + "loss": 2.402, + "theoretical_loss": 3.4245491473600573, + "tokens_seen": 2041407488 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2231847, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.307051420211792, + "objective/train/theoretical_loss": 3.4245445825975507, + "objective/train/tokens_used": 2061900256, + "theoretical_loss": 3.4245445825975507, + "tokens_seen": 2041440256 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019269809428284853, + "loss": 2.6997, + "theoretical_loss": 3.4245400179288294, + "tokens_seen": 2041473024 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019268806419257774, + "loss": 2.4607, + "theoretical_loss": 3.42453088887273, + "tokens_seen": 2041538560 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019267803410230692, + "loss": 2.5117, + "theoretical_loss": 3.424521760191733, + "tokens_seen": 2041604096 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001926680040120361, + "loss": 2.3577, + "theoretical_loss": 3.4245126318858103, + "tokens_seen": 2041669632 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019265797392176528, + "loss": 2.5209, + "theoretical_loss": 3.4245035039549347, + "tokens_seen": 2041735168 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001926479438314945, + "loss": 2.5755, + "theoretical_loss": 3.4244943763990783, + "tokens_seen": 2041800704 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019263791374122367, + "loss": 2.4922, + "theoretical_loss": 3.4244852492182147, + "tokens_seen": 2041866240 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019262788365095285, + "loss": 2.7366, + "theoretical_loss": 3.4244761224123152, + "tokens_seen": 2041931776 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019261785356068203, + "loss": 2.5611, + "theoretical_loss": 3.424466995981353, + "tokens_seen": 2041997312 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019260782347041124, + "loss": 2.6114, + "theoretical_loss": 3.4244578699253005, + "tokens_seen": 2042062848 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019259779338014042, + "loss": 2.7541, + "theoretical_loss": 3.4244487442441307, + "tokens_seen": 2042128384 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001925877632898696, + "loss": 2.6377, + "theoretical_loss": 3.4244396189378157, + "tokens_seen": 2042193920 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019257773319959878, + "loss": 2.5882, + "theoretical_loss": 3.4244304940063284, + "tokens_seen": 2042259456 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019256770310932796, + "loss": 2.5705, + "theoretical_loss": 3.4244213694496404, + "tokens_seen": 2042324992 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001925576730190572, + "loss": 2.5837, + "theoretical_loss": 3.424412245267726, + "tokens_seen": 2042390528 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019254764292878638, + "loss": 2.6502, + "theoretical_loss": 3.424403121460556, + "tokens_seen": 2042456064 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019253761283851556, + "loss": 2.5624, + "theoretical_loss": 3.4243939980281044, + "tokens_seen": 2042521600 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019252758274824474, + "loss": 2.434, + "theoretical_loss": 3.424384874970343, + "tokens_seen": 2042587136 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019251755265797395, + "loss": 2.613, + "theoretical_loss": 3.424375752287245, + "tokens_seen": 2042652672 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019250752256770313, + "loss": 2.5999, + "theoretical_loss": 3.424366629978782, + "tokens_seen": 2042718208 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001924974924774323, + "loss": 2.571, + "theoretical_loss": 3.4243575080449276, + "tokens_seen": 2042783744 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001924874623871615, + "loss": 2.6554, + "theoretical_loss": 3.4243483864856543, + "tokens_seen": 2042849280 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019247743229689067, + "loss": 2.646, + "theoretical_loss": 3.424339265300934, + "tokens_seen": 2042914816 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019246740220661988, + "loss": 2.4833, + "theoretical_loss": 3.4243301444907397, + "tokens_seen": 2042980352 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019245737211634906, + "loss": 2.3277, + "theoretical_loss": 3.4243210240550437, + "tokens_seen": 2043045888 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2232909, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1624960899353027, + "objective/train/theoretical_loss": 3.4243164639776245, + "objective/train/tokens_used": 2063538656, + "theoretical_loss": 3.4243164639776245, + "tokens_seen": 2043078656 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019244734202607825, + "loss": 2.5234, + "theoretical_loss": 3.4243119039938192, + "tokens_seen": 2043111424 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019243731193580743, + "loss": 2.5487, + "theoretical_loss": 3.424302784307039, + "tokens_seen": 2043176960 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019242728184553663, + "loss": 2.7658, + "theoretical_loss": 3.4242936649946745, + "tokens_seen": 2043242496 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019241725175526582, + "loss": 2.5501, + "theoretical_loss": 3.4242845460566995, + "tokens_seen": 2043308032 + }, + { + "epoch": 6.08, + "learning_rate": 0.000192407221664995, + "loss": 2.5325, + "theoretical_loss": 3.424275427493086, + "tokens_seen": 2043373568 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019239719157472418, + "loss": 2.7016, + "theoretical_loss": 3.424266309303807, + "tokens_seen": 2043439104 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019238716148445336, + "loss": 2.4821, + "theoretical_loss": 3.4242571914888345, + "tokens_seen": 2043504640 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019237713139418257, + "loss": 2.4877, + "theoretical_loss": 3.4242480740481422, + "tokens_seen": 2043570176 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019236710130391175, + "loss": 2.5232, + "theoretical_loss": 3.4242389569817018, + "tokens_seen": 2043635712 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019235707121364093, + "loss": 2.773, + "theoretical_loss": 3.4242298402894864, + "tokens_seen": 2043701248 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001923470411233701, + "loss": 2.5783, + "theoretical_loss": 3.4242207239714686, + "tokens_seen": 2043766784 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019233701103309932, + "loss": 2.4239, + "theoretical_loss": 3.4242116080276204, + "tokens_seen": 2043832320 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001923269809428285, + "loss": 2.3469, + "theoretical_loss": 3.4242024924579155, + "tokens_seen": 2043897856 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019231695085255768, + "loss": 2.6236, + "theoretical_loss": 3.4241933772623256, + "tokens_seen": 2043963392 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019230692076228686, + "loss": 2.5615, + "theoretical_loss": 3.424184262440824, + "tokens_seen": 2044028928 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019229689067201604, + "loss": 2.6071, + "theoretical_loss": 3.4241751479933833, + "tokens_seen": 2044094464 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019228686058174525, + "loss": 2.5083, + "theoretical_loss": 3.4241660339199758, + "tokens_seen": 2044160000 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019227683049147443, + "loss": 2.6415, + "theoretical_loss": 3.4241569202205744, + "tokens_seen": 2044225536 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019226680040120361, + "loss": 2.559, + "theoretical_loss": 3.4241478068951516, + "tokens_seen": 2044291072 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001922567703109328, + "loss": 2.5689, + "theoretical_loss": 3.4241386939436804, + "tokens_seen": 2044356608 + }, + { + "epoch": 6.08, + "learning_rate": 0.000192246740220662, + "loss": 2.6658, + "theoretical_loss": 3.4241295813661328, + "tokens_seen": 2044422144 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019223671013039118, + "loss": 2.6055, + "theoretical_loss": 3.4241204691624825, + "tokens_seen": 2044487680 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019222668004012037, + "loss": 2.6344, + "theoretical_loss": 3.424111357332701, + "tokens_seen": 2044553216 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019221664994984955, + "loss": 2.7517, + "theoretical_loss": 3.424102245876762, + "tokens_seen": 2044618752 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019220661985957873, + "loss": 2.4626, + "theoretical_loss": 3.4240931347946377, + "tokens_seen": 2044684288 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2233511, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.661818742752075, + "objective/train/theoretical_loss": 3.4240885793937474, + "objective/train/tokens_used": 2065177056, + "theoretical_loss": 3.4240885793937474, + "tokens_seen": 2044717056 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019219658976930794, + "loss": 2.6262, + "theoretical_loss": 3.4240840240863006, + "tokens_seen": 2044749824 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019218655967903712, + "loss": 2.5349, + "theoretical_loss": 3.424074913751724, + "tokens_seen": 2044815360 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001921765295887663, + "loss": 2.4872, + "theoretical_loss": 3.4240658037908798, + "tokens_seen": 2044880896 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019216649949849548, + "loss": 2.6419, + "theoretical_loss": 3.4240566942037414, + "tokens_seen": 2044946432 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001921564694082247, + "loss": 2.5788, + "theoretical_loss": 3.424047584990281, + "tokens_seen": 2045011968 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019214643931795387, + "loss": 2.4212, + "theoretical_loss": 3.4240384761504714, + "tokens_seen": 2045077504 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019213640922768305, + "loss": 2.2524, + "theoretical_loss": 3.4240293676842857, + "tokens_seen": 2045143040 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019212637913741223, + "loss": 2.6085, + "theoretical_loss": 3.424020259591696, + "tokens_seen": 2045208576 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019211634904714144, + "loss": 2.5441, + "theoretical_loss": 3.424011151872676, + "tokens_seen": 2045274112 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019210631895687062, + "loss": 2.5859, + "theoretical_loss": 3.424002044527197, + "tokens_seen": 2045339648 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001920962888665998, + "loss": 2.5021, + "theoretical_loss": 3.4239929375552327, + "tokens_seen": 2045405184 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019208625877632898, + "loss": 2.6316, + "theoretical_loss": 3.423983830956755, + "tokens_seen": 2045470720 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019207622868605816, + "loss": 2.6284, + "theoretical_loss": 3.4239747247317376, + "tokens_seen": 2045536256 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019206619859578737, + "loss": 2.5193, + "theoretical_loss": 3.423965618880153, + "tokens_seen": 2045601792 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019205616850551655, + "loss": 2.4997, + "theoretical_loss": 3.423956513401973, + "tokens_seen": 2045667328 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019204613841524573, + "loss": 2.4017, + "theoretical_loss": 3.4239474082971717, + "tokens_seen": 2045732864 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019203610832497491, + "loss": 2.538, + "theoretical_loss": 3.4239383035657207, + "tokens_seen": 2045798400 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019202607823470412, + "loss": 2.4758, + "theoretical_loss": 3.4239291992075933, + "tokens_seen": 2045863936 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001920160481444333, + "loss": 2.6483, + "theoretical_loss": 3.423920095222762, + "tokens_seen": 2045929472 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019200601805416249, + "loss": 2.6636, + "theoretical_loss": 3.4239109916112, + "tokens_seen": 2045995008 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019199598796389167, + "loss": 2.3504, + "theoretical_loss": 3.4239018883728796, + "tokens_seen": 2046060544 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019198595787362085, + "loss": 2.3893, + "theoretical_loss": 3.4238927855077734, + "tokens_seen": 2046126080 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019197592778335006, + "loss": 2.7089, + "theoretical_loss": 3.4238836830158546, + "tokens_seen": 2046191616 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019196589769307924, + "loss": 2.442, + "theoretical_loss": 3.4238745808970954, + "tokens_seen": 2046257152 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019195586760280842, + "loss": 2.6986, + "theoretical_loss": 3.4238654791514693, + "tokens_seen": 2046322688 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2234682, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4722702503204346, + "objective/train/theoretical_loss": 3.4238609284185726, + "objective/train/tokens_used": 2066815456, + "theoretical_loss": 3.4238609284185726, + "tokens_seen": 2046355456 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001919458375125376, + "loss": 2.5769, + "theoretical_loss": 3.4238563777789484, + "tokens_seen": 2046388224 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001919358074222668, + "loss": 2.3951, + "theoretical_loss": 3.4238472767795054, + "tokens_seen": 2046453760 + }, + { + "epoch": 6.08, + "learning_rate": 0.000191925777331996, + "loss": 2.6412, + "theoretical_loss": 3.4238381761531143, + "tokens_seen": 2046519296 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019191574724172517, + "loss": 2.7789, + "theoretical_loss": 3.423829075899746, + "tokens_seen": 2046584832 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019190571715145435, + "loss": 2.6392, + "theoretical_loss": 3.4238199760193746, + "tokens_seen": 2046650368 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019189568706118353, + "loss": 2.5857, + "theoretical_loss": 3.423810876511972, + "tokens_seen": 2046715904 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019188565697091274, + "loss": 2.4938, + "theoretical_loss": 3.423801777377512, + "tokens_seen": 2046781440 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019187562688064192, + "loss": 2.4751, + "theoretical_loss": 3.4237926786159667, + "tokens_seen": 2046846976 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001918655967903711, + "loss": 2.6167, + "theoretical_loss": 3.4237835802273087, + "tokens_seen": 2046912512 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019185556670010028, + "loss": 2.4865, + "theoretical_loss": 3.423774482211511, + "tokens_seen": 2046978048 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001918455366098295, + "loss": 2.438, + "theoretical_loss": 3.4237653845685463, + "tokens_seen": 2047043584 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019183550651955867, + "loss": 2.5512, + "theoretical_loss": 3.4237562872983878, + "tokens_seen": 2047109120 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019182547642928785, + "loss": 2.525, + "theoretical_loss": 3.4237471904010084, + "tokens_seen": 2047174656 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019181544633901706, + "loss": 2.5457, + "theoretical_loss": 3.42373809387638, + "tokens_seen": 2047240192 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019180541624874624, + "loss": 2.6463, + "theoretical_loss": 3.423728997724476, + "tokens_seen": 2047305728 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019179538615847545, + "loss": 2.7023, + "theoretical_loss": 3.423719901945269, + "tokens_seen": 2047371264 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019178535606820463, + "loss": 2.5326, + "theoretical_loss": 3.4237108065387325, + "tokens_seen": 2047436800 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019177532597793381, + "loss": 2.7732, + "theoretical_loss": 3.423701711504838, + "tokens_seen": 2047502336 + }, + { + "epoch": 6.08, + "learning_rate": 0.000191765295887663, + "loss": 2.5148, + "theoretical_loss": 3.4236926168435593, + "tokens_seen": 2047567872 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001917552657973922, + "loss": 2.4427, + "theoretical_loss": 3.4236835225548683, + "tokens_seen": 2047633408 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019174523570712138, + "loss": 2.3265, + "theoretical_loss": 3.4236744286387393, + "tokens_seen": 2047698944 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019173520561685057, + "loss": 2.6963, + "theoretical_loss": 3.423665335095144, + "tokens_seen": 2047764480 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019172517552657975, + "loss": 2.6512, + "theoretical_loss": 3.4236562419240553, + "tokens_seen": 2047830016 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019171514543630893, + "loss": 2.4586, + "theoretical_loss": 3.423647149125446, + "tokens_seen": 2047895552 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019170511534603814, + "loss": 2.734, + "theoretical_loss": 3.4236380566992892, + "tokens_seen": 2047961088 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2235368, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5678956508636475, + "objective/train/theoretical_loss": 3.423633510625872, + "objective/train/tokens_used": 2068453856, + "theoretical_loss": 3.423633510625872, + "tokens_seen": 2047993856 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019169508525576732, + "loss": 2.4156, + "theoretical_loss": 3.423628964645558, + "tokens_seen": 2048026624 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001916850551654965, + "loss": 2.5092, + "theoretical_loss": 3.423619872964225, + "tokens_seen": 2048092160 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019167502507522568, + "loss": 2.4754, + "theoretical_loss": 3.4236107816552623, + "tokens_seen": 2048157696 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001916649949849549, + "loss": 2.5901, + "theoretical_loss": 3.4236016907186433, + "tokens_seen": 2048223232 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019165496489468407, + "loss": 2.4266, + "theoretical_loss": 3.4235926001543406, + "tokens_seen": 2048288768 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019164493480441325, + "loss": 2.5363, + "theoretical_loss": 3.423583509962328, + "tokens_seen": 2048354304 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019163490471414243, + "loss": 2.5573, + "theoretical_loss": 3.4235744201425775, + "tokens_seen": 2048419840 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019162487462387164, + "loss": 2.5465, + "theoretical_loss": 3.423565330695062, + "tokens_seen": 2048485376 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019161484453360082, + "loss": 2.455, + "theoretical_loss": 3.423556241619754, + "tokens_seen": 2048550912 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019160481444333, + "loss": 2.6167, + "theoretical_loss": 3.4235471529166275, + "tokens_seen": 2048616448 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019159478435305918, + "loss": 2.5538, + "theoretical_loss": 3.4235380645856544, + "tokens_seen": 2048681984 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019158475426278836, + "loss": 2.5371, + "theoretical_loss": 3.423528976626808, + "tokens_seen": 2048747520 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019157472417251757, + "loss": 2.6608, + "theoretical_loss": 3.4235198890400604, + "tokens_seen": 2048813056 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019156469408224675, + "loss": 2.4317, + "theoretical_loss": 3.423510801825385, + "tokens_seen": 2048878592 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019155466399197593, + "loss": 2.7994, + "theoretical_loss": 3.4235017149827556, + "tokens_seen": 2048944128 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019154463390170512, + "loss": 2.5181, + "theoretical_loss": 3.423492628512143, + "tokens_seen": 2049009664 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019153460381143432, + "loss": 2.3944, + "theoretical_loss": 3.423483542413522, + "tokens_seen": 2049075200 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001915245737211635, + "loss": 2.4379, + "theoretical_loss": 3.4234744566868645, + "tokens_seen": 2049140736 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019151454363089269, + "loss": 2.6642, + "theoretical_loss": 3.4234653713321435, + "tokens_seen": 2049206272 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019150451354062187, + "loss": 2.4137, + "theoretical_loss": 3.423456286349332, + "tokens_seen": 2049271808 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019149448345035105, + "loss": 2.5209, + "theoretical_loss": 3.423447201738403, + "tokens_seen": 2049337344 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019148445336008026, + "loss": 2.7819, + "theoretical_loss": 3.423438117499329, + "tokens_seen": 2049402880 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019147442326980944, + "loss": 2.704, + "theoretical_loss": 3.4234290336320834, + "tokens_seen": 2049468416 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019146439317953862, + "loss": 2.849, + "theoretical_loss": 3.4234199501366382, + "tokens_seen": 2049533952 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001914543630892678, + "loss": 2.4896, + "theoretical_loss": 3.4234108670129677, + "tokens_seen": 2049599488 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2236637, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.447277545928955, + "objective/train/theoretical_loss": 3.4234063255905385, + "objective/train/tokens_used": 2070092256, + "theoretical_loss": 3.4234063255905385, + "tokens_seen": 2049632256 + }, + { + "epoch": 6.08, + "learning_rate": 0.000191444332998997, + "loss": 2.6342, + "theoretical_loss": 3.4234017842610434, + "tokens_seen": 2049665024 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001914343029087262, + "loss": 2.6485, + "theoretical_loss": 3.423392701880839, + "tokens_seen": 2049730560 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019142427281845537, + "loss": 2.5928, + "theoretical_loss": 3.423383619872327, + "tokens_seen": 2049796096 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019141424272818455, + "loss": 2.4136, + "theoretical_loss": 3.423374538235481, + "tokens_seen": 2049861632 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019140421263791373, + "loss": 2.3783, + "theoretical_loss": 3.4233654569702727, + "tokens_seen": 2049927168 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019139418254764294, + "loss": 2.542, + "theoretical_loss": 3.4233563760766765, + "tokens_seen": 2049992704 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019138415245737212, + "loss": 2.5733, + "theoretical_loss": 3.423347295554664, + "tokens_seen": 2050058240 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001913741223671013, + "loss": 2.846, + "theoretical_loss": 3.423338215404209, + "tokens_seen": 2050123776 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019136409227683048, + "loss": 2.4889, + "theoretical_loss": 3.4233291356252837, + "tokens_seen": 2050189312 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001913540621865597, + "loss": 2.5135, + "theoretical_loss": 3.423320056217862, + "tokens_seen": 2050254848 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019134403209628887, + "loss": 2.7589, + "theoretical_loss": 3.4233109771819157, + "tokens_seen": 2050320384 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019133400200601805, + "loss": 2.6014, + "theoretical_loss": 3.423301898517418, + "tokens_seen": 2050385920 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019132397191574724, + "loss": 2.5976, + "theoretical_loss": 3.4232928202243427, + "tokens_seen": 2050451456 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019131394182547642, + "loss": 2.5187, + "theoretical_loss": 3.4232837423026616, + "tokens_seen": 2050516992 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019130391173520562, + "loss": 2.7904, + "theoretical_loss": 3.423274664752349, + "tokens_seen": 2050582528 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001912938816449348, + "loss": 2.4665, + "theoretical_loss": 3.4232655875733764, + "tokens_seen": 2050648064 + }, + { + "epoch": 6.08, + "learning_rate": 0.000191283851554664, + "loss": 2.4817, + "theoretical_loss": 3.423256510765717, + "tokens_seen": 2050713600 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019127382146439317, + "loss": 2.6005, + "theoretical_loss": 3.423247434329345, + "tokens_seen": 2050779136 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019126379137412238, + "loss": 2.6884, + "theoretical_loss": 3.4232383582642316, + "tokens_seen": 2050844672 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019125376128385156, + "loss": 2.4333, + "theoretical_loss": 3.4232292825703508, + "tokens_seen": 2050910208 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019124373119358074, + "loss": 2.3955, + "theoretical_loss": 3.4232202072476756, + "tokens_seen": 2050975744 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019123370110330992, + "loss": 2.7396, + "theoretical_loss": 3.4232111322961787, + "tokens_seen": 2051041280 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001912236710130391, + "loss": 2.4444, + "theoretical_loss": 3.423202057715833, + "tokens_seen": 2051106816 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001912136409227683, + "loss": 2.797, + "theoretical_loss": 3.423192983506611, + "tokens_seen": 2051172352 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001912036108324975, + "loss": 2.5691, + "theoretical_loss": 3.4231839096684866, + "tokens_seen": 2051237888 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2237224, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6253304481506348, + "objective/train/theoretical_loss": 3.423179372888577, + "objective/train/tokens_used": 2071730656, + "theoretical_loss": 3.423179372888577, + "tokens_seen": 2051270656 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019119358074222667, + "loss": 2.5497, + "theoretical_loss": 3.423174836201432, + "tokens_seen": 2051303424 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019118355065195585, + "loss": 2.22, + "theoretical_loss": 3.423165763105421, + "tokens_seen": 2051368960 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019117352056168506, + "loss": 2.5806, + "theoretical_loss": 3.4231566903804262, + "tokens_seen": 2051434496 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019116349047141424, + "loss": 2.4524, + "theoretical_loss": 3.42314761802642, + "tokens_seen": 2051500032 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019115346038114342, + "loss": 2.6546, + "theoretical_loss": 3.423138546043376, + "tokens_seen": 2051565568 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001911434302908726, + "loss": 2.5388, + "theoretical_loss": 3.423129474431267, + "tokens_seen": 2051631104 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019113340020060178, + "loss": 2.3591, + "theoretical_loss": 3.423120403190066, + "tokens_seen": 2051696640 + }, + { + "epoch": 6.08, + "learning_rate": 0.000191123370110331, + "loss": 2.5041, + "theoretical_loss": 3.4231113323197464, + "tokens_seen": 2051762176 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019111334002006017, + "loss": 2.5558, + "theoretical_loss": 3.42310226182028, + "tokens_seen": 2051827712 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019110330992978936, + "loss": 2.4548, + "theoretical_loss": 3.4230931916916414, + "tokens_seen": 2051893248 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019109327983951854, + "loss": 2.3418, + "theoretical_loss": 3.423084121933803, + "tokens_seen": 2051958784 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019108324974924774, + "loss": 2.3782, + "theoretical_loss": 3.4230750525467366, + "tokens_seen": 2052024320 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019107321965897693, + "loss": 2.376, + "theoretical_loss": 3.4230659835304165, + "tokens_seen": 2052089856 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019106318956870613, + "loss": 2.3563, + "theoretical_loss": 3.423056914884816, + "tokens_seen": 2052155392 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019105315947843532, + "loss": 2.7716, + "theoretical_loss": 3.4230478466099066, + "tokens_seen": 2052220928 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019104312938816452, + "loss": 2.4839, + "theoretical_loss": 3.4230387787056626, + "tokens_seen": 2052286464 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001910330992978937, + "loss": 2.4212, + "theoretical_loss": 3.423029711172057, + "tokens_seen": 2052352000 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019102306920762289, + "loss": 2.567, + "theoretical_loss": 3.423020644009062, + "tokens_seen": 2052417536 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019101303911735207, + "loss": 2.4772, + "theoretical_loss": 3.423011577216651, + "tokens_seen": 2052483072 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019100300902708125, + "loss": 2.3159, + "theoretical_loss": 3.423002510794797, + "tokens_seen": 2052548608 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019099297893681046, + "loss": 2.3906, + "theoretical_loss": 3.422993444743473, + "tokens_seen": 2052614144 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019098294884653964, + "loss": 2.6224, + "theoretical_loss": 3.422984379062653, + "tokens_seen": 2052679680 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019097291875626882, + "loss": 2.364, + "theoretical_loss": 3.4229753137523087, + "tokens_seen": 2052745216 + }, + { + "epoch": 6.08, + "learning_rate": 0.000190962888665998, + "loss": 2.524, + "theoretical_loss": 3.422966248812413, + "tokens_seen": 2052810752 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001909528585757272, + "loss": 2.6217, + "theoretical_loss": 3.4229571842429403, + "tokens_seen": 2052876288 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2237925, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4200751781463623, + "objective/train/theoretical_loss": 3.422952652097104, + "objective/train/tokens_used": 2073369056, + "theoretical_loss": 3.422952652097104, + "tokens_seen": 2052909056 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001909428284854564, + "loss": 2.4063, + "theoretical_loss": 3.4229481200438627, + "tokens_seen": 2052941824 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019093279839518557, + "loss": 2.2331, + "theoretical_loss": 3.4229390562151534, + "tokens_seen": 2053007360 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019092276830491475, + "loss": 2.4979, + "theoretical_loss": 3.422929992756785, + "tokens_seen": 2053072896 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019091273821464393, + "loss": 2.587, + "theoretical_loss": 3.4229209296687317, + "tokens_seen": 2053138432 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019090270812437314, + "loss": 2.459, + "theoretical_loss": 3.4229118669509653, + "tokens_seen": 2053203968 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019089267803410232, + "loss": 2.5007, + "theoretical_loss": 3.42290280460346, + "tokens_seen": 2053269504 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001908826479438315, + "loss": 2.6176, + "theoretical_loss": 3.422893742626188, + "tokens_seen": 2053335040 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019087261785356068, + "loss": 2.659, + "theoretical_loss": 3.4228846810191222, + "tokens_seen": 2053400576 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001908625877632899, + "loss": 2.5527, + "theoretical_loss": 3.4228756197822365, + "tokens_seen": 2053466112 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019085255767301907, + "loss": 2.7057, + "theoretical_loss": 3.422866558915503, + "tokens_seen": 2053531648 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019084252758274825, + "loss": 2.6479, + "theoretical_loss": 3.422857498418896, + "tokens_seen": 2053597184 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019083249749247744, + "loss": 2.4988, + "theoretical_loss": 3.4228484382923874, + "tokens_seen": 2053662720 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019082246740220662, + "loss": 2.512, + "theoretical_loss": 3.422839378535951, + "tokens_seen": 2053728256 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019081243731193582, + "loss": 2.4843, + "theoretical_loss": 3.42283031914956, + "tokens_seen": 2053793792 + }, + { + "epoch": 6.08, + "learning_rate": 0.000190802407221665, + "loss": 2.3986, + "theoretical_loss": 3.422821260133187, + "tokens_seen": 2053859328 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001907923771313942, + "loss": 2.5574, + "theoretical_loss": 3.4228122014868045, + "tokens_seen": 2053924864 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019078234704112337, + "loss": 2.2207, + "theoretical_loss": 3.4228031432103867, + "tokens_seen": 2053990400 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019077231695085258, + "loss": 2.4264, + "theoretical_loss": 3.4227940853039063, + "tokens_seen": 2054055936 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019076228686058176, + "loss": 2.3237, + "theoretical_loss": 3.422785027767336, + "tokens_seen": 2054121472 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019075225677031094, + "loss": 2.7426, + "theoretical_loss": 3.4227759706006498, + "tokens_seen": 2054187008 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019074222668004012, + "loss": 2.5079, + "theoretical_loss": 3.42276691380382, + "tokens_seen": 2054252544 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001907321965897693, + "loss": 2.5402, + "theoretical_loss": 3.42275785737682, + "tokens_seen": 2054318080 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001907221664994985, + "loss": 2.6162, + "theoretical_loss": 3.4227488013196226, + "tokens_seen": 2054383616 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001907121364092277, + "loss": 2.4864, + "theoretical_loss": 3.4227397456322013, + "tokens_seen": 2054449152 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019070210631895687, + "loss": 2.5751, + "theoretical_loss": 3.4227306903145287, + "tokens_seen": 2054514688 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2239247, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4290542602539062, + "objective/train/theoretical_loss": 3.4227261627943406, + "objective/train/tokens_used": 2075007456, + "theoretical_loss": 3.4227261627943406, + "tokens_seen": 2054547456 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019069207622868605, + "loss": 2.33, + "theoretical_loss": 3.422721635366579, + "tokens_seen": 2054580224 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019068204613841526, + "loss": 2.2246, + "theoretical_loss": 3.422712580788324, + "tokens_seen": 2054645760 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019067201604814444, + "loss": 2.5954, + "theoretical_loss": 3.4227035265797374, + "tokens_seen": 2054711296 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019066198595787362, + "loss": 2.2777, + "theoretical_loss": 3.422694472740792, + "tokens_seen": 2054776832 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001906519558676028, + "loss": 2.7523, + "theoretical_loss": 3.4226854192714624, + "tokens_seen": 2054842368 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019064192577733199, + "loss": 2.5804, + "theoretical_loss": 3.4226763661717197, + "tokens_seen": 2054907904 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001906318956870612, + "loss": 2.1276, + "theoretical_loss": 3.4226673134415377, + "tokens_seen": 2054973440 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019062186559679037, + "loss": 2.2056, + "theoretical_loss": 3.42265826108089, + "tokens_seen": 2055038976 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019061183550651956, + "loss": 2.3974, + "theoretical_loss": 3.4226492090897493, + "tokens_seen": 2055104512 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019060180541624874, + "loss": 2.2572, + "theoretical_loss": 3.4226401574680887, + "tokens_seen": 2055170048 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019059177532597794, + "loss": 2.3401, + "theoretical_loss": 3.422631106215882, + "tokens_seen": 2055235584 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019058174523570713, + "loss": 2.4152, + "theoretical_loss": 3.4226220553331013, + "tokens_seen": 2055301120 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001905717151454363, + "loss": 2.699, + "theoretical_loss": 3.4226130048197208, + "tokens_seen": 2055366656 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001905616850551655, + "loss": 2.5908, + "theoretical_loss": 3.4226039546757128, + "tokens_seen": 2055432192 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019055165496489467, + "loss": 2.5906, + "theoretical_loss": 3.422594904901051, + "tokens_seen": 2055497728 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019054162487462388, + "loss": 2.6575, + "theoretical_loss": 3.422585855495708, + "tokens_seen": 2055563264 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019053159478435306, + "loss": 2.4193, + "theoretical_loss": 3.4225768064596576, + "tokens_seen": 2055628800 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019052156469408224, + "loss": 2.1132, + "theoretical_loss": 3.422567757792873, + "tokens_seen": 2055694336 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019051153460381142, + "loss": 2.3188, + "theoretical_loss": 3.422558709495326, + "tokens_seen": 2055759872 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019050150451354063, + "loss": 2.5649, + "theoretical_loss": 3.4225496615669915, + "tokens_seen": 2055825408 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001904914744232698, + "loss": 2.5968, + "theoretical_loss": 3.4225406140078416, + "tokens_seen": 2055890944 + }, + { + "epoch": 6.08, + "learning_rate": 0.000190481444332999, + "loss": 2.1406, + "theoretical_loss": 3.4225315668178498, + "tokens_seen": 2055956480 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019047141424272817, + "loss": 2.5014, + "theoretical_loss": 3.4225225199969893, + "tokens_seen": 2056022016 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019046138415245738, + "loss": 2.5266, + "theoretical_loss": 3.4225134735452336, + "tokens_seen": 2056087552 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019045135406218656, + "loss": 2.4607, + "theoretical_loss": 3.422504427462555, + "tokens_seen": 2056153088 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.430849075317383, + "objective/train/theoretical_loss": 3.4224999045596114, + "objective/train/tokens_used": 2076645856, + "theoretical_loss": 3.4224999045596114, + "tokens_seen": 2056185856 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019044132397191574, + "loss": 2.6319, + "theoretical_loss": 3.422495381748927, + "tokens_seen": 2056218624 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019043129388164492, + "loss": 2.3775, + "theoretical_loss": 3.422486336404324, + "tokens_seen": 2056284160 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001904212637913741, + "loss": 2.3967, + "theoretical_loss": 3.4224772914287174, + "tokens_seen": 2056349696 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001904112337011033, + "loss": 2.4555, + "theoretical_loss": 3.4224682468220813, + "tokens_seen": 2056415232 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001904012036108325, + "loss": 2.3023, + "theoretical_loss": 3.422459202584389, + "tokens_seen": 2056480768 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019039117352056168, + "loss": 2.7111, + "theoretical_loss": 3.422450158715613, + "tokens_seen": 2056546304 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019038114343029086, + "loss": 2.4437, + "theoretical_loss": 3.4224411152157272, + "tokens_seen": 2056611840 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019037111334002007, + "loss": 2.315, + "theoretical_loss": 3.4224320720847046, + "tokens_seen": 2056677376 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019036108324974925, + "loss": 2.5453, + "theoretical_loss": 3.422423029322518, + "tokens_seen": 2056742912 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019035105315947843, + "loss": 2.6109, + "theoretical_loss": 3.4224139869291412, + "tokens_seen": 2056808448 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001903410230692076, + "loss": 2.5447, + "theoretical_loss": 3.422404944904547, + "tokens_seen": 2056873984 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001903309929789368, + "loss": 2.4414, + "theoretical_loss": 3.422395903248709, + "tokens_seen": 2056939520 + }, + { + "epoch": 6.08, + "learning_rate": 0.000190320962888666, + "loss": 2.5074, + "theoretical_loss": 3.4223868619616, + "tokens_seen": 2057005056 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001903109327983952, + "loss": 2.4426, + "theoretical_loss": 3.4223778210431934, + "tokens_seen": 2057070592 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001903009027081244, + "loss": 2.4296, + "theoretical_loss": 3.4223687804934624, + "tokens_seen": 2057136128 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019029087261785357, + "loss": 2.3161, + "theoretical_loss": 3.4223597403123804, + "tokens_seen": 2057201664 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019028084252758278, + "loss": 2.4263, + "theoretical_loss": 3.42235070049992, + "tokens_seen": 2057267200 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019027081243731196, + "loss": 2.5198, + "theoretical_loss": 3.422341661056055, + "tokens_seen": 2057332736 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019026078234704114, + "loss": 2.5225, + "theoretical_loss": 3.422332621980759, + "tokens_seen": 2057398272 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019025075225677032, + "loss": 2.3695, + "theoretical_loss": 3.422323583274004, + "tokens_seen": 2057463808 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001902407221664995, + "loss": 2.3669, + "theoretical_loss": 3.4223145449357646, + "tokens_seen": 2057529344 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001902306920762287, + "loss": 2.539, + "theoretical_loss": 3.422305506966013, + "tokens_seen": 2057594880 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001902206619859579, + "loss": 2.3575, + "theoretical_loss": 3.4222964693647233, + "tokens_seen": 2057660416 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019021063189568707, + "loss": 2.3674, + "theoretical_loss": 3.422287432131868, + "tokens_seen": 2057725952 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019020060180541625, + "loss": 2.3663, + "theoretical_loss": 3.422278395267421, + "tokens_seen": 2057791488 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4583640098571777, + "objective/train/theoretical_loss": 3.4222738769733416, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.4222738769733416, + "tokens_seen": 2057824256 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019019057171514546, + "loss": 2.5474, + "theoretical_loss": 3.4222693587713544, + "tokens_seen": 2057857024 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019018054162487464, + "loss": 2.419, + "theoretical_loss": 3.422260322643643, + "tokens_seen": 2057922560 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019017051153460382, + "loss": 2.411, + "theoretical_loss": 3.4222512868842587, + "tokens_seen": 2057988096 + }, + { + "epoch": 6.08, + "learning_rate": 0.000190160481444333, + "loss": 2.3117, + "theoretical_loss": 3.4222422514931754, + "tokens_seen": 2058053632 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019015045135406219, + "loss": 2.5112, + "theoretical_loss": 3.4222332164703673, + "tokens_seen": 2058119168 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001901404212637914, + "loss": 2.5345, + "theoretical_loss": 3.422224181815806, + "tokens_seen": 2058184704 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019013039117352057, + "loss": 2.3348, + "theoretical_loss": 3.422215147529465, + "tokens_seen": 2058250240 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019012036108324976, + "loss": 2.3462, + "theoretical_loss": 3.4222061136113187, + "tokens_seen": 2058315776 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019011033099297894, + "loss": 2.4937, + "theoretical_loss": 3.422197080061339, + "tokens_seen": 2058381312 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019010030090270815, + "loss": 2.3308, + "theoretical_loss": 3.4221880468795005, + "tokens_seen": 2058446848 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019009027081243733, + "loss": 2.4994, + "theoretical_loss": 3.4221790140657755, + "tokens_seen": 2058512384 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001900802407221665, + "loss": 2.2876, + "theoretical_loss": 3.422169981620138, + "tokens_seen": 2058577920 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001900702106318957, + "loss": 2.4519, + "theoretical_loss": 3.4221609495425604, + "tokens_seen": 2058643456 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019006018054162487, + "loss": 2.6685, + "theoretical_loss": 3.422151917833017, + "tokens_seen": 2058708992 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019005015045135408, + "loss": 2.6641, + "theoretical_loss": 3.4221428864914802, + "tokens_seen": 2058774528 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019004012036108326, + "loss": 2.5978, + "theoretical_loss": 3.4221338555179237, + "tokens_seen": 2058840064 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019003009027081244, + "loss": 2.4677, + "theoretical_loss": 3.422124824912321, + "tokens_seen": 2058905600 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019002006018054162, + "loss": 2.301, + "theoretical_loss": 3.422115794674645, + "tokens_seen": 2058971136 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019001003009027083, + "loss": 2.3754, + "theoretical_loss": 3.422106764804869, + "tokens_seen": 2059036672 + }, + { + "epoch": 6.08, + "learning_rate": 0.00019, + "loss": 2.3653, + "theoretical_loss": 3.422097735302967, + "tokens_seen": 2059102208 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001899899699097292, + "loss": 2.5775, + "theoretical_loss": 3.422088706168911, + "tokens_seen": 2059167744 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018997993981945837, + "loss": 2.4857, + "theoretical_loss": 3.4220796774026754, + "tokens_seen": 2059233280 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018996990972918758, + "loss": 2.7078, + "theoretical_loss": 3.422070649004233, + "tokens_seen": 2059298816 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018995987963891676, + "loss": 2.4787, + "theoretical_loss": 3.422061620973558, + "tokens_seen": 2059364352 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018994984954864594, + "loss": 2.3499, + "theoretical_loss": 3.4220525933106227, + "tokens_seen": 2059429888 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1419692039489746, + "objective/train/theoretical_loss": 3.422048079617049, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.422048079617049, + "tokens_seen": 2059462656 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018993981945837512, + "loss": 2.4219, + "theoretical_loss": 3.4220435660154003, + "tokens_seen": 2059495424 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001899297893681043, + "loss": 2.4005, + "theoretical_loss": 3.422034539087865, + "tokens_seen": 2059560960 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018991975927783351, + "loss": 2.2798, + "theoretical_loss": 3.4220255125279895, + "tokens_seen": 2059626496 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001899097291875627, + "loss": 2.475, + "theoretical_loss": 3.4220164863357474, + "tokens_seen": 2059692032 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018989969909729188, + "loss": 2.4172, + "theoretical_loss": 3.4220074605111117, + "tokens_seen": 2059757568 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018988966900702106, + "loss": 2.3652, + "theoretical_loss": 3.421998435054056, + "tokens_seen": 2059823104 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018987963891675027, + "loss": 2.1941, + "theoretical_loss": 3.421989409964554, + "tokens_seen": 2059888640 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018986960882647945, + "loss": 2.485, + "theoretical_loss": 3.4219803852425787, + "tokens_seen": 2059954176 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018985957873620863, + "loss": 2.7053, + "theoretical_loss": 3.421971360888103, + "tokens_seen": 2060019712 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001898495486459378, + "loss": 2.476, + "theoretical_loss": 3.4219623369011005, + "tokens_seen": 2060085248 + }, + { + "epoch": 6.08, + "learning_rate": 0.000189839518555667, + "loss": 2.4578, + "theoretical_loss": 3.421953313281545, + "tokens_seen": 2060150784 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001898294884653962, + "loss": 2.5017, + "theoretical_loss": 3.4219442900294093, + "tokens_seen": 2060216320 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018981945837512538, + "loss": 2.1545, + "theoretical_loss": 3.4219352671446672, + "tokens_seen": 2060281856 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018980942828485456, + "loss": 2.467, + "theoretical_loss": 3.421926244627292, + "tokens_seen": 2060347392 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018979939819458374, + "loss": 2.3308, + "theoretical_loss": 3.4219172224772567, + "tokens_seen": 2060412928 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018978936810431295, + "loss": 2.6139, + "theoretical_loss": 3.421908200694535, + "tokens_seen": 2060478464 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018977933801404213, + "loss": 2.3843, + "theoretical_loss": 3.4218991792791, + "tokens_seen": 2060544000 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001897693079237713, + "loss": 2.5371, + "theoretical_loss": 3.4218901582309247, + "tokens_seen": 2060609536 + }, + { + "epoch": 6.08, + "learning_rate": 0.0001897592778335005, + "loss": 2.3853, + "theoretical_loss": 3.4218811375499834, + "tokens_seen": 2060675072 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018974924774322967, + "loss": 2.5131, + "theoretical_loss": 3.4218721172362487, + "tokens_seen": 2060740608 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018973921765295888, + "loss": 2.2588, + "theoretical_loss": 3.421863097289695, + "tokens_seen": 2060806144 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018972918756268806, + "loss": 2.5686, + "theoretical_loss": 3.421854077710295, + "tokens_seen": 2060871680 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018971915747241724, + "loss": 2.5489, + "theoretical_loss": 3.4218450584980213, + "tokens_seen": 2060937216 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018970912738214643, + "loss": 2.568, + "theoretical_loss": 3.421836039652848, + "tokens_seen": 2061002752 + }, + { + "epoch": 6.08, + "learning_rate": 0.00018969909729187563, + "loss": 2.3999, + "theoretical_loss": 3.4218270211747495, + "tokens_seen": 2061068288 + }, + { + "epoch": 6.08, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1681628227233887, + "objective/train/theoretical_loss": 3.4218225120733443, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.4218225120733443, + "tokens_seen": 2061101056 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018968906720160481, + "loss": 2.3795, + "theoretical_loss": 3.4218180030636978, + "tokens_seen": 2061133824 + }, + { + "epoch": 6.09, + "learning_rate": 0.000189679037111334, + "loss": 2.3975, + "theoretical_loss": 3.4218089853196663, + "tokens_seen": 2061199360 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018966900702106318, + "loss": 2.5486, + "theoretical_loss": 3.421799967942629, + "tokens_seen": 2061264896 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018965897693079236, + "loss": 2.661, + "theoretical_loss": 3.421790950932559, + "tokens_seen": 2061330432 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018964894684052157, + "loss": 2.3767, + "theoretical_loss": 3.42178193428943, + "tokens_seen": 2061395968 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018963891675025075, + "loss": 2.3342, + "theoretical_loss": 3.421772918013215, + "tokens_seen": 2061461504 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018962888665997993, + "loss": 2.6058, + "theoretical_loss": 3.421763902103888, + "tokens_seen": 2061527040 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001896188565697091, + "loss": 2.3219, + "theoretical_loss": 3.4217548865614216, + "tokens_seen": 2061592576 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018960882647943832, + "loss": 2.7521, + "theoretical_loss": 3.42174587138579, + "tokens_seen": 2061658112 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001895987963891675, + "loss": 2.6784, + "theoretical_loss": 3.421736856576966, + "tokens_seen": 2061723648 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018958876629889668, + "loss": 2.5022, + "theoretical_loss": 3.4217278421349233, + "tokens_seen": 2061789184 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018957873620862586, + "loss": 2.5008, + "theoretical_loss": 3.4217188280596353, + "tokens_seen": 2061854720 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018956870611835504, + "loss": 2.3008, + "theoretical_loss": 3.4217098143510754, + "tokens_seen": 2061920256 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018955867602808428, + "loss": 2.3193, + "theoretical_loss": 3.4217008010092167, + "tokens_seen": 2061985792 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018954864593781346, + "loss": 2.4781, + "theoretical_loss": 3.4216917880340336, + "tokens_seen": 2062051328 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018953861584754264, + "loss": 2.6736, + "theoretical_loss": 3.4216827754254986, + "tokens_seen": 2062116864 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018952858575727182, + "loss": 2.4707, + "theoretical_loss": 3.421673763183585, + "tokens_seen": 2062182400 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018951855566700103, + "loss": 2.6442, + "theoretical_loss": 3.421664751308267, + "tokens_seen": 2062247936 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001895085255767302, + "loss": 2.5921, + "theoretical_loss": 3.421655739799518, + "tokens_seen": 2062313472 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001894984954864594, + "loss": 2.4704, + "theoretical_loss": 3.4216467286573105, + "tokens_seen": 2062379008 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018948846539618857, + "loss": 2.489, + "theoretical_loss": 3.421637717881619, + "tokens_seen": 2062444544 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018947843530591778, + "loss": 2.5684, + "theoretical_loss": 3.4216287074724163, + "tokens_seen": 2062510080 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018946840521564696, + "loss": 2.5811, + "theoretical_loss": 3.4216196974296764, + "tokens_seen": 2062575616 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018945837512537614, + "loss": 2.5987, + "theoretical_loss": 3.421610687753372, + "tokens_seen": 2062641152 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018944834503510532, + "loss": 2.6342, + "theoretical_loss": 3.4216016784434773, + "tokens_seen": 2062706688 + }, + { + "epoch": 6.09, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2309670448303223, + "objective/train/theoretical_loss": 3.421597173925925, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.421597173925925, + "tokens_seen": 2062739456 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001894383149448345, + "loss": 2.4457, + "theoretical_loss": 3.421592669499965, + "tokens_seen": 2062772224 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018942828485456371, + "loss": 2.4583, + "theoretical_loss": 3.4215836609228094, + "tokens_seen": 2062837760 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001894182547642929, + "loss": 2.3993, + "theoretical_loss": 3.4215746527119837, + "tokens_seen": 2062903296 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018940822467402208, + "loss": 2.3884, + "theoretical_loss": 3.421565644867461, + "tokens_seen": 2062968832 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018939819458375126, + "loss": 2.4028, + "theoretical_loss": 3.421556637389215, + "tokens_seen": 2063034368 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018938816449348047, + "loss": 2.3214, + "theoretical_loss": 3.421547630277219, + "tokens_seen": 2063099904 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018937813440320965, + "loss": 2.5917, + "theoretical_loss": 3.421538623531447, + "tokens_seen": 2063165440 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018936810431293883, + "loss": 2.5861, + "theoretical_loss": 3.4215296171518714, + "tokens_seen": 2063230976 + }, + { + "epoch": 6.09, + "learning_rate": 0.000189358074222668, + "loss": 2.3544, + "theoretical_loss": 3.421520611138467, + "tokens_seen": 2063296512 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001893480441323972, + "loss": 2.4849, + "theoretical_loss": 3.4215116054912067, + "tokens_seen": 2063362048 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001893380140421264, + "loss": 2.4324, + "theoretical_loss": 3.421502600210064, + "tokens_seen": 2063427584 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018932798395185558, + "loss": 2.376, + "theoretical_loss": 3.421493595295012, + "tokens_seen": 2063493120 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018931795386158476, + "loss": 2.5724, + "theoretical_loss": 3.421484590746025, + "tokens_seen": 2063558656 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018930792377131394, + "loss": 2.4675, + "theoretical_loss": 3.4214755865630755, + "tokens_seen": 2063624192 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018929789368104315, + "loss": 2.2705, + "theoretical_loss": 3.421466582746138, + "tokens_seen": 2063689728 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018928786359077233, + "loss": 2.3122, + "theoretical_loss": 3.4214575792951853, + "tokens_seen": 2063755264 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001892778335005015, + "loss": 2.5259, + "theoretical_loss": 3.4214485762101914, + "tokens_seen": 2063820800 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001892678034102307, + "loss": 2.3712, + "theoretical_loss": 3.4214395734911287, + "tokens_seen": 2063886336 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018925777331995987, + "loss": 2.4045, + "theoretical_loss": 3.4214305711379724, + "tokens_seen": 2063951872 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018924774322968908, + "loss": 2.5618, + "theoretical_loss": 3.421421569150695, + "tokens_seen": 2064017408 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018923771313941826, + "loss": 2.6009, + "theoretical_loss": 3.42141256752927, + "tokens_seen": 2064082944 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018922768304914744, + "loss": 2.438, + "theoretical_loss": 3.4214035662736713, + "tokens_seen": 2064148480 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018921765295887663, + "loss": 2.5738, + "theoretical_loss": 3.4213945653838724, + "tokens_seen": 2064214016 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018920762286860583, + "loss": 2.6122, + "theoretical_loss": 3.421385564859846, + "tokens_seen": 2064279552 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018919759277833502, + "loss": 2.512, + "theoretical_loss": 3.421376564701567, + "tokens_seen": 2064345088 + }, + { + "epoch": 6.09, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2449283599853516, + "objective/train/theoretical_loss": 3.4213720647595736, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.4213720647595736, + "tokens_seen": 2064377856 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001891875626880642, + "loss": 2.7295, + "theoretical_loss": 3.4213675649090076, + "tokens_seen": 2064410624 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018917753259779338, + "loss": 2.6114, + "theoretical_loss": 3.4213585654821417, + "tokens_seen": 2064476160 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018916750250752256, + "loss": 2.3615, + "theoretical_loss": 3.4213495664209437, + "tokens_seen": 2064541696 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018915747241725177, + "loss": 2.548, + "theoretical_loss": 3.4213405677253865, + "tokens_seen": 2064607232 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018914744232698095, + "loss": 2.3632, + "theoretical_loss": 3.421331569395443, + "tokens_seen": 2064672768 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018913741223671013, + "loss": 2.619, + "theoretical_loss": 3.4213225714310873, + "tokens_seen": 2064738304 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001891273821464393, + "loss": 2.4289, + "theoretical_loss": 3.4213135738322937, + "tokens_seen": 2064803840 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018911735205616852, + "loss": 2.4257, + "theoretical_loss": 3.421304576599035, + "tokens_seen": 2064869376 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001891073219658977, + "loss": 2.4398, + "theoretical_loss": 3.421295579731284, + "tokens_seen": 2064934912 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018909729187562688, + "loss": 2.4556, + "theoretical_loss": 3.4212865832290156, + "tokens_seen": 2065000448 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018908726178535606, + "loss": 2.7128, + "theoretical_loss": 3.421277587092203, + "tokens_seen": 2065065984 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018907723169508524, + "loss": 2.4233, + "theoretical_loss": 3.4212685913208194, + "tokens_seen": 2065131520 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018906720160481445, + "loss": 2.6515, + "theoretical_loss": 3.4212595959148384, + "tokens_seen": 2065197056 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018905717151454363, + "loss": 2.5642, + "theoretical_loss": 3.4212506008742336, + "tokens_seen": 2065262592 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001890471414242728, + "loss": 2.3588, + "theoretical_loss": 3.4212416061989788, + "tokens_seen": 2065328128 + }, + { + "epoch": 6.09, + "learning_rate": 0.000189037111334002, + "loss": 2.4718, + "theoretical_loss": 3.4212326118890477, + "tokens_seen": 2065393664 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001890270812437312, + "loss": 2.3923, + "theoretical_loss": 3.4212236179444133, + "tokens_seen": 2065459200 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018901705115346038, + "loss": 2.4145, + "theoretical_loss": 3.4212146243650494, + "tokens_seen": 2065524736 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018900702106318956, + "loss": 2.4764, + "theoretical_loss": 3.4212056311509293, + "tokens_seen": 2065590272 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018899699097291875, + "loss": 2.5497, + "theoretical_loss": 3.4211966383020274, + "tokens_seen": 2065655808 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018898696088264793, + "loss": 2.3762, + "theoretical_loss": 3.421187645818317, + "tokens_seen": 2065721344 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018897693079237714, + "loss": 2.5983, + "theoretical_loss": 3.421178653699771, + "tokens_seen": 2065786880 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018896690070210632, + "loss": 2.2854, + "theoretical_loss": 3.4211696619463634, + "tokens_seen": 2065852416 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001889568706118355, + "loss": 2.3763, + "theoretical_loss": 3.421160670558068, + "tokens_seen": 2065917952 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018894684052156468, + "loss": 2.4267, + "theoretical_loss": 3.4211516795348587, + "tokens_seen": 2065983488 + }, + { + "epoch": 6.09, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.296285390853882, + "objective/train/theoretical_loss": 3.4211471841601524, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.4211471841601524, + "tokens_seen": 2066016256 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001889368104312939, + "loss": 2.5289, + "theoretical_loss": 3.4211426888767082, + "tokens_seen": 2066049024 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018892678034102307, + "loss": 2.5439, + "theoretical_loss": 3.4211336985835903, + "tokens_seen": 2066114560 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018891675025075225, + "loss": 2.4723, + "theoretical_loss": 3.4211247086554795, + "tokens_seen": 2066180096 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018890672016048143, + "loss": 2.2804, + "theoretical_loss": 3.4211157190923487, + "tokens_seen": 2066245632 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018889669007021064, + "loss": 2.6178, + "theoretical_loss": 3.421106729894171, + "tokens_seen": 2066311168 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018888665997993982, + "loss": 2.5717, + "theoretical_loss": 3.4210977410609207, + "tokens_seen": 2066376704 + }, + { + "epoch": 6.09, + "learning_rate": 0.000188876629889669, + "loss": 2.5744, + "theoretical_loss": 3.4210887525925715, + "tokens_seen": 2066442240 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018886659979939818, + "loss": 2.4453, + "theoretical_loss": 3.4210797644890967, + "tokens_seen": 2066507776 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018885656970912736, + "loss": 2.4267, + "theoretical_loss": 3.4210707767504704, + "tokens_seen": 2066573312 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018884653961885657, + "loss": 2.5681, + "theoretical_loss": 3.4210617893766653, + "tokens_seen": 2066638848 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018883650952858575, + "loss": 2.478, + "theoretical_loss": 3.421052802367656, + "tokens_seen": 2066704384 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018882647943831493, + "loss": 2.5378, + "theoretical_loss": 3.421043815723415, + "tokens_seen": 2066769920 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018881644934804411, + "loss": 2.5065, + "theoretical_loss": 3.4210348294439172, + "tokens_seen": 2066835456 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018880641925777335, + "loss": 2.3072, + "theoretical_loss": 3.4210258435291356, + "tokens_seen": 2066900992 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018879638916750253, + "loss": 2.5015, + "theoretical_loss": 3.421016857979044, + "tokens_seen": 2066966528 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001887863590772317, + "loss": 2.6559, + "theoretical_loss": 3.4210078727936155, + "tokens_seen": 2067032064 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001887763289869609, + "loss": 2.234, + "theoretical_loss": 3.420998887972824, + "tokens_seen": 2067097600 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018876629889669007, + "loss": 2.2222, + "theoretical_loss": 3.4209899035166442, + "tokens_seen": 2067163136 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018875626880641928, + "loss": 2.4727, + "theoretical_loss": 3.4209809194250482, + "tokens_seen": 2067228672 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018874623871614846, + "loss": 2.5184, + "theoretical_loss": 3.4209719356980104, + "tokens_seen": 2067294208 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018873620862587764, + "loss": 2.4626, + "theoretical_loss": 3.420962952335504, + "tokens_seen": 2067359744 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018872617853560683, + "loss": 2.4714, + "theoretical_loss": 3.4209539693375035, + "tokens_seen": 2067425280 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018871614844533603, + "loss": 2.3264, + "theoretical_loss": 3.420944986703982, + "tokens_seen": 2067490816 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018870611835506522, + "loss": 2.5431, + "theoretical_loss": 3.420936004434913, + "tokens_seen": 2067556352 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001886960882647944, + "loss": 2.2144, + "theoretical_loss": 3.4209270225302704, + "tokens_seen": 2067621888 + }, + { + "epoch": 6.09, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.750241994857788, + "objective/train/theoretical_loss": 3.420922531714601, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.420922531714601, + "tokens_seen": 2067654656 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018868605817452358, + "loss": 2.7017, + "theoretical_loss": 3.4209180409900277, + "tokens_seen": 2067687424 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018867602808425276, + "loss": 2.5523, + "theoretical_loss": 3.420909059814159, + "tokens_seen": 2067752960 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018866599799398197, + "loss": 2.4303, + "theoretical_loss": 3.420900079002638, + "tokens_seen": 2067818496 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018865596790371115, + "loss": 2.31, + "theoretical_loss": 3.420891098555437, + "tokens_seen": 2067884032 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018864593781344033, + "loss": 2.2238, + "theoretical_loss": 3.420882118472531, + "tokens_seen": 2067949568 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001886359077231695, + "loss": 2.3946, + "theoretical_loss": 3.420873138753894, + "tokens_seen": 2068015104 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018862587763289872, + "loss": 2.4007, + "theoretical_loss": 3.420864159399499, + "tokens_seen": 2068080640 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001886158475426279, + "loss": 2.3624, + "theoretical_loss": 3.4208551804093195, + "tokens_seen": 2068146176 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018860581745235708, + "loss": 2.5771, + "theoretical_loss": 3.4208462017833297, + "tokens_seen": 2068211712 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018859578736208626, + "loss": 2.4406, + "theoretical_loss": 3.4208372235215023, + "tokens_seen": 2068277248 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018858575727181544, + "loss": 2.4539, + "theoretical_loss": 3.4208282456238126, + "tokens_seen": 2068342784 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018857572718154465, + "loss": 2.3242, + "theoretical_loss": 3.420819268090233, + "tokens_seen": 2068408320 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018856569709127383, + "loss": 2.5855, + "theoretical_loss": 3.4208102909207376, + "tokens_seen": 2068473856 + }, + { + "epoch": 6.09, + "learning_rate": 0.000188555667001003, + "loss": 2.3863, + "theoretical_loss": 3.4208013141153004, + "tokens_seen": 2068539392 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001885456369107322, + "loss": 2.603, + "theoretical_loss": 3.4207923376738947, + "tokens_seen": 2068604928 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001885356068204614, + "loss": 2.3251, + "theoretical_loss": 3.420783361596494, + "tokens_seen": 2068670464 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018852557673019058, + "loss": 2.5004, + "theoretical_loss": 3.420774385883073, + "tokens_seen": 2068736000 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018851554663991976, + "loss": 2.6119, + "theoretical_loss": 3.420765410533604, + "tokens_seen": 2068801536 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018850551654964895, + "loss": 2.299, + "theoretical_loss": 3.420756435548062, + "tokens_seen": 2068867072 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018849548645937813, + "loss": 2.3747, + "theoretical_loss": 3.42074746092642, + "tokens_seen": 2068932608 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018848545636910734, + "loss": 2.6736, + "theoretical_loss": 3.4207384866686517, + "tokens_seen": 2068998144 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018847542627883652, + "loss": 2.3143, + "theoretical_loss": 3.4207295127747313, + "tokens_seen": 2069063680 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001884653961885657, + "loss": 2.4344, + "theoretical_loss": 3.4207205392446323, + "tokens_seen": 2069129216 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018845536609829488, + "loss": 2.4931, + "theoretical_loss": 3.420711566078328, + "tokens_seen": 2069194752 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001884453360080241, + "loss": 2.3401, + "theoretical_loss": 3.420702593275793, + "tokens_seen": 2069260288 + }, + { + "epoch": 6.09, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.194164514541626, + "objective/train/theoretical_loss": 3.42069810701093, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.42069810701093, + "tokens_seen": 2069293056 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018843530591775327, + "loss": 2.3024, + "theoretical_loss": 3.420693620837, + "tokens_seen": 2069325824 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018842527582748245, + "loss": 2.4973, + "theoretical_loss": 3.420684648761924, + "tokens_seen": 2069391360 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018841524573721163, + "loss": 2.3125, + "theoretical_loss": 3.420675677050537, + "tokens_seen": 2069456896 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018840521564694084, + "loss": 2.4061, + "theoretical_loss": 3.420666705702814, + "tokens_seen": 2069522432 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018839518555667002, + "loss": 2.1241, + "theoretical_loss": 3.420657734718729, + "tokens_seen": 2069587968 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001883851554663992, + "loss": 2.5351, + "theoretical_loss": 3.420648764098255, + "tokens_seen": 2069653504 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018837512537612838, + "loss": 2.5485, + "theoretical_loss": 3.420639793841366, + "tokens_seen": 2069719040 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018836509528585756, + "loss": 2.2811, + "theoretical_loss": 3.4206308239480356, + "tokens_seen": 2069784576 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018835506519558677, + "loss": 2.2711, + "theoretical_loss": 3.420621854418238, + "tokens_seen": 2069850112 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018834503510531595, + "loss": 2.4541, + "theoretical_loss": 3.4206128852519466, + "tokens_seen": 2069915648 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018833500501504513, + "loss": 2.2661, + "theoretical_loss": 3.420603916449135, + "tokens_seen": 2069981184 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018832497492477431, + "loss": 2.4411, + "theoretical_loss": 3.420594948009777, + "tokens_seen": 2070046720 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018831494483450352, + "loss": 2.4481, + "theoretical_loss": 3.420585979933847, + "tokens_seen": 2070112256 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001883049147442327, + "loss": 2.6267, + "theoretical_loss": 3.420577012221318, + "tokens_seen": 2070177792 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018829488465396189, + "loss": 2.5558, + "theoretical_loss": 3.420568044872164, + "tokens_seen": 2070243328 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018828485456369107, + "loss": 2.5489, + "theoretical_loss": 3.4205590778863586, + "tokens_seen": 2070308864 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018827482447342025, + "loss": 2.4262, + "theoretical_loss": 3.4205501112638768, + "tokens_seen": 2070374400 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018826479438314946, + "loss": 2.4611, + "theoretical_loss": 3.4205411450046905, + "tokens_seen": 2070439936 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018825476429287864, + "loss": 2.2925, + "theoretical_loss": 3.4205321791087746, + "tokens_seen": 2070505472 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018824473420260782, + "loss": 2.529, + "theoretical_loss": 3.420523213576103, + "tokens_seen": 2070571008 + }, + { + "epoch": 6.09, + "learning_rate": 0.000188234704112337, + "loss": 2.4518, + "theoretical_loss": 3.420514248406649, + "tokens_seen": 2070636544 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001882246740220662, + "loss": 2.2678, + "theoretical_loss": 3.420505283600386, + "tokens_seen": 2070702080 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001882146439317954, + "loss": 2.5283, + "theoretical_loss": 3.420496319157289, + "tokens_seen": 2070767616 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018820461384152457, + "loss": 2.5603, + "theoretical_loss": 3.4204873550773307, + "tokens_seen": 2070833152 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018819458375125375, + "loss": 2.3326, + "theoretical_loss": 3.4204783913604855, + "tokens_seen": 2070898688 + }, + { + "epoch": 6.09, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5347697734832764, + "objective/train/theoretical_loss": 3.4204739096382224, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.4204739096382224, + "tokens_seen": 2070931456 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018818455366098293, + "loss": 2.5933, + "theoretical_loss": 3.4204694280067276, + "tokens_seen": 2070964224 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018817452357071214, + "loss": 2.4465, + "theoretical_loss": 3.4204604650160295, + "tokens_seen": 2071029760 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018816449348044132, + "loss": 2.61, + "theoretical_loss": 3.4204515023883664, + "tokens_seen": 2071095296 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001881544633901705, + "loss": 2.53, + "theoretical_loss": 3.4204425401237106, + "tokens_seen": 2071160832 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018814443329989968, + "loss": 2.7224, + "theoretical_loss": 3.4204335782220374, + "tokens_seen": 2071226368 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001881344032096289, + "loss": 2.3992, + "theoretical_loss": 3.4204246166833197, + "tokens_seen": 2071291904 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018812437311935807, + "loss": 2.2179, + "theoretical_loss": 3.4204156555075316, + "tokens_seen": 2071357440 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018811434302908725, + "loss": 2.7077, + "theoretical_loss": 3.4204066946946474, + "tokens_seen": 2071422976 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018810431293881643, + "loss": 2.53, + "theoretical_loss": 3.4203977342446397, + "tokens_seen": 2071488512 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018809428284854562, + "loss": 2.4338, + "theoretical_loss": 3.4203887741574834, + "tokens_seen": 2071554048 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018808425275827482, + "loss": 2.4914, + "theoretical_loss": 3.4203798144331516, + "tokens_seen": 2071619584 + }, + { + "epoch": 6.09, + "learning_rate": 0.000188074222668004, + "loss": 2.5207, + "theoretical_loss": 3.420370855071619, + "tokens_seen": 2071685120 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018806419257773319, + "loss": 2.4573, + "theoretical_loss": 3.4203618960728592, + "tokens_seen": 2071750656 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001880541624874624, + "loss": 2.4166, + "theoretical_loss": 3.4203529374368453, + "tokens_seen": 2071816192 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001880441323971916, + "loss": 2.4417, + "theoretical_loss": 3.420343979163552, + "tokens_seen": 2071881728 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018803410230692078, + "loss": 2.5734, + "theoretical_loss": 3.4203350212529524, + "tokens_seen": 2071947264 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018802407221664997, + "loss": 2.4261, + "theoretical_loss": 3.420326063705021, + "tokens_seen": 2072012800 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018801404212637915, + "loss": 2.4839, + "theoretical_loss": 3.420317106519731, + "tokens_seen": 2072078336 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018800401203610833, + "loss": 2.4338, + "theoretical_loss": 3.420308149697057, + "tokens_seen": 2072143872 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018799398194583754, + "loss": 2.5506, + "theoretical_loss": 3.420299193236972, + "tokens_seen": 2072209408 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018798395185556672, + "loss": 2.4282, + "theoretical_loss": 3.420290237139451, + "tokens_seen": 2072274944 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001879739217652959, + "loss": 2.3751, + "theoretical_loss": 3.4202812814044665, + "tokens_seen": 2072340480 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018796389167502508, + "loss": 2.3517, + "theoretical_loss": 3.420272326031993, + "tokens_seen": 2072406016 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001879538615847543, + "loss": 2.464, + "theoretical_loss": 3.4202633710220054, + "tokens_seen": 2072471552 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018794383149448347, + "loss": 2.345, + "theoretical_loss": 3.4202544163744752, + "tokens_seen": 2072537088 + }, + { + "epoch": 6.09, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5456392765045166, + "objective/train/theoretical_loss": 3.4202499391866246, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.4202499391866246, + "tokens_seen": 2072569856 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018793380140421265, + "loss": 2.4078, + "theoretical_loss": 3.4202454620893787, + "tokens_seen": 2072602624 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018792377131394183, + "loss": 2.4665, + "theoretical_loss": 3.4202365081666883, + "tokens_seen": 2072668160 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018791374122367104, + "loss": 2.539, + "theoretical_loss": 3.4202275546063783, + "tokens_seen": 2072733696 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018790371113340022, + "loss": 2.5321, + "theoretical_loss": 3.420218601408423, + "tokens_seen": 2072799232 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001878936810431294, + "loss": 2.5668, + "theoretical_loss": 3.4202096485727953, + "tokens_seen": 2072864768 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018788365095285858, + "loss": 2.4164, + "theoretical_loss": 3.4202006960994695, + "tokens_seen": 2072930304 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018787362086258776, + "loss": 2.5615, + "theoretical_loss": 3.4201917439884197, + "tokens_seen": 2072995840 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018786359077231697, + "loss": 2.3363, + "theoretical_loss": 3.42018279223962, + "tokens_seen": 2073061376 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018785356068204615, + "loss": 2.6778, + "theoretical_loss": 3.4201738408530438, + "tokens_seen": 2073126912 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018784353059177533, + "loss": 2.5883, + "theoretical_loss": 3.420164889828665, + "tokens_seen": 2073192448 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018783350050150451, + "loss": 2.3954, + "theoretical_loss": 3.4201559391664578, + "tokens_seen": 2073257984 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018782347041123372, + "loss": 2.3639, + "theoretical_loss": 3.4201469888663962, + "tokens_seen": 2073323520 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001878134403209629, + "loss": 2.1751, + "theoretical_loss": 3.4201380389284535, + "tokens_seen": 2073389056 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018780341023069209, + "loss": 2.5603, + "theoretical_loss": 3.420129089352604, + "tokens_seen": 2073454592 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018779338014042127, + "loss": 2.4947, + "theoretical_loss": 3.420120140138822, + "tokens_seen": 2073520128 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018778335005015045, + "loss": 2.34, + "theoretical_loss": 3.4201111912870807, + "tokens_seen": 2073585664 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018777331995987966, + "loss": 2.4242, + "theoretical_loss": 3.4201022427973546, + "tokens_seen": 2073651200 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018776328986960884, + "loss": 2.4202, + "theoretical_loss": 3.4200932946696168, + "tokens_seen": 2073716736 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018775325977933802, + "loss": 2.5021, + "theoretical_loss": 3.4200843469038418, + "tokens_seen": 2073782272 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001877432296890672, + "loss": 2.4758, + "theoretical_loss": 3.420075399500004, + "tokens_seen": 2073847808 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001877331995987964, + "loss": 2.5417, + "theoretical_loss": 3.420066452458076, + "tokens_seen": 2073913344 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001877231695085256, + "loss": 2.6268, + "theoretical_loss": 3.420057505778033, + "tokens_seen": 2073978880 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018771313941825477, + "loss": 2.4758, + "theoretical_loss": 3.4200485594598486, + "tokens_seen": 2074044416 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018770310932798395, + "loss": 2.6935, + "theoretical_loss": 3.420039613503496, + "tokens_seen": 2074109952 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018769307923771313, + "loss": 2.4409, + "theoretical_loss": 3.42003066790895, + "tokens_seen": 2074175488 + }, + { + "epoch": 6.09, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1351969242095947, + "objective/train/theoretical_loss": 3.4200261952473463, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.4200261952473463, + "tokens_seen": 2074208256 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018768304914744234, + "loss": 2.3668, + "theoretical_loss": 3.4200217226761844, + "tokens_seen": 2074241024 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018767301905717152, + "loss": 2.2536, + "theoretical_loss": 3.420012777805173, + "tokens_seen": 2074306560 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001876629889669007, + "loss": 2.6107, + "theoretical_loss": 3.4200038332958895, + "tokens_seen": 2074372096 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018765295887662988, + "loss": 2.5042, + "theoretical_loss": 3.419994889148308, + "tokens_seen": 2074437632 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001876429287863591, + "loss": 2.3223, + "theoretical_loss": 3.419985945362402, + "tokens_seen": 2074503168 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018763289869608827, + "loss": 2.4271, + "theoretical_loss": 3.419977001938147, + "tokens_seen": 2074568704 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018762286860581745, + "loss": 2.3236, + "theoretical_loss": 3.419968058875515, + "tokens_seen": 2074634240 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018761283851554663, + "loss": 2.4802, + "theoretical_loss": 3.4199591161744816, + "tokens_seen": 2074699776 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018760280842527582, + "loss": 2.5041, + "theoretical_loss": 3.4199501738350198, + "tokens_seen": 2074765312 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018759277833500502, + "loss": 2.5939, + "theoretical_loss": 3.4199412318571034, + "tokens_seen": 2074830848 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001875827482447342, + "loss": 2.5466, + "theoretical_loss": 3.419932290240707, + "tokens_seen": 2074896384 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001875727181544634, + "loss": 2.4668, + "theoretical_loss": 3.4199233489858045, + "tokens_seen": 2074961920 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018756268806419257, + "loss": 2.5723, + "theoretical_loss": 3.4199144080923696, + "tokens_seen": 2075027456 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018755265797392178, + "loss": 2.6544, + "theoretical_loss": 3.419905467560376, + "tokens_seen": 2075092992 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018754262788365096, + "loss": 2.339, + "theoretical_loss": 3.419896527389798, + "tokens_seen": 2075158528 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018753259779338014, + "loss": 2.4984, + "theoretical_loss": 3.41988758758061, + "tokens_seen": 2075224064 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018752256770310932, + "loss": 2.45, + "theoretical_loss": 3.4198786481327854, + "tokens_seen": 2075289600 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001875125376128385, + "loss": 2.3906, + "theoretical_loss": 3.419869709046299, + "tokens_seen": 2075355136 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001875025075225677, + "loss": 2.6234, + "theoretical_loss": 3.4198607703211232, + "tokens_seen": 2075420672 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001874924774322969, + "loss": 2.3475, + "theoretical_loss": 3.4198518319572333, + "tokens_seen": 2075486208 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018748244734202607, + "loss": 2.6139, + "theoretical_loss": 3.419842893954603, + "tokens_seen": 2075551744 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018747241725175525, + "loss": 2.6684, + "theoretical_loss": 3.419833956313206, + "tokens_seen": 2075617280 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018746238716148446, + "loss": 2.5256, + "theoretical_loss": 3.4198250190330164, + "tokens_seen": 2075682816 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018745235707121364, + "loss": 2.4975, + "theoretical_loss": 3.419816082114009, + "tokens_seen": 2075748352 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018744232698094282, + "loss": 2.6185, + "theoretical_loss": 3.4198071455561565, + "tokens_seen": 2075813888 + }, + { + "epoch": 6.09, + "objective/train/docs_used": 2244242, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.417935609817505, + "objective/train/theoretical_loss": 3.4198026774126555, + "objective/train/tokens_used": 2077316576, + "theoretical_loss": 3.4198026774126555, + "tokens_seen": 2075846656 + }, + { + "epoch": 6.09, + "learning_rate": 0.000187432296890672, + "loss": 2.3314, + "theoretical_loss": 3.4197982093594335, + "tokens_seen": 2075879424 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018742226680040118, + "loss": 2.5059, + "theoretical_loss": 3.419789273523814, + "tokens_seen": 2075944960 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001874122367101304, + "loss": 2.565, + "theoretical_loss": 3.4197803380492724, + "tokens_seen": 2076010496 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018740220661985957, + "loss": 2.5831, + "theoretical_loss": 3.4197714029357824, + "tokens_seen": 2076076032 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018739217652958876, + "loss": 2.4752, + "theoretical_loss": 3.4197624681833174, + "tokens_seen": 2076141568 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018738214643931794, + "loss": 2.4296, + "theoretical_loss": 3.4197535337918525, + "tokens_seen": 2076207104 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018737211634904714, + "loss": 2.3717, + "theoretical_loss": 3.419744599761361, + "tokens_seen": 2076272640 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018736208625877633, + "loss": 2.522, + "theoretical_loss": 3.419735666091817, + "tokens_seen": 2076338176 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001873520561685055, + "loss": 2.8317, + "theoretical_loss": 3.419726732783195, + "tokens_seen": 2076403712 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001873420260782347, + "loss": 2.393, + "theoretical_loss": 3.4197177998354684, + "tokens_seen": 2076469248 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018733199598796387, + "loss": 2.4189, + "theoretical_loss": 3.4197088672486116, + "tokens_seen": 2076534784 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018732196589769308, + "loss": 2.2387, + "theoretical_loss": 3.419699935022598, + "tokens_seen": 2076600320 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018731193580742229, + "loss": 2.2945, + "theoretical_loss": 3.4196910031574026, + "tokens_seen": 2076665856 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018730190571715147, + "loss": 2.5001, + "theoretical_loss": 3.419682071652999, + "tokens_seen": 2076731392 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018729187562688065, + "loss": 2.6005, + "theoretical_loss": 3.419673140509361, + "tokens_seen": 2076796928 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018728184553660986, + "loss": 2.3858, + "theoretical_loss": 3.4196642097264633, + "tokens_seen": 2076862464 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018727181544633904, + "loss": 2.2797, + "theoretical_loss": 3.41965527930428, + "tokens_seen": 2076928000 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018726178535606822, + "loss": 2.586, + "theoretical_loss": 3.4196463492427833, + "tokens_seen": 2076993536 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001872517552657974, + "loss": 2.3146, + "theoretical_loss": 3.4196374195419494, + "tokens_seen": 2077059072 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001872417251755266, + "loss": 2.3508, + "theoretical_loss": 3.419628490201752, + "tokens_seen": 2077124608 + }, + { + "epoch": 6.09, + "learning_rate": 0.0001872316950852558, + "loss": 2.3925, + "theoretical_loss": 3.4196195612221643, + "tokens_seen": 2077190144 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018722166499498497, + "loss": 2.4908, + "theoretical_loss": 3.4196106326031606, + "tokens_seen": 2077255680 + }, + { + "epoch": 6.09, + "learning_rate": 0.00018721163490471415, + "loss": 2.4534, + "theoretical_loss": 3.4196017043447156, + "tokens_seen": 2077321216 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018720160481444333, + "loss": 3.2909, + "theoretical_loss": 3.4195906840228787, + "tokens_seen": 2077402112 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018719157472417254, + "loss": 2.5885, + "theoretical_loss": 3.419581756569962, + "tokens_seen": 2077467648 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 2306938, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.543551445007324, + "objective/train/theoretical_loss": 3.4195795247630585, + "objective/train/tokens_used": 2097944032, + "theoretical_loss": 3.4195795247630585, + "tokens_seen": 2077484032 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018718154463390172, + "loss": 2.5536, + "theoretical_loss": 3.4195728294775205, + "tokens_seen": 2077533184 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001871715145436309, + "loss": 2.5311, + "theoretical_loss": 3.419563902745527, + "tokens_seen": 2077598720 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018716148445336008, + "loss": 2.587, + "theoretical_loss": 3.4195549763739557, + "tokens_seen": 2077664256 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001871514543630893, + "loss": 2.527, + "theoretical_loss": 3.4195460503627815, + "tokens_seen": 2077729792 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018714142427281847, + "loss": 2.6204, + "theoretical_loss": 3.4195371247119777, + "tokens_seen": 2077795328 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018713139418254765, + "loss": 2.5339, + "theoretical_loss": 3.419528199421519, + "tokens_seen": 2077860864 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018712136409227684, + "loss": 2.5326, + "theoretical_loss": 3.4195192744913787, + "tokens_seen": 2077926400 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018711133400200602, + "loss": 2.6886, + "theoretical_loss": 3.4195103499215316, + "tokens_seen": 2077991936 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018710130391173522, + "loss": 2.4732, + "theoretical_loss": 3.419501425711952, + "tokens_seen": 2078057472 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001870912738214644, + "loss": 2.405, + "theoretical_loss": 3.4194925018626128, + "tokens_seen": 2078123008 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001870812437311936, + "loss": 2.5218, + "theoretical_loss": 3.4194835783734896, + "tokens_seen": 2078188544 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018707121364092277, + "loss": 2.5885, + "theoretical_loss": 3.4194746552445556, + "tokens_seen": 2078254080 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018706118355065198, + "loss": 2.5196, + "theoretical_loss": 3.4194657324757847, + "tokens_seen": 2078319616 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018705115346038116, + "loss": 2.6843, + "theoretical_loss": 3.4194568100671514, + "tokens_seen": 2078385152 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018704112337011034, + "loss": 2.6349, + "theoretical_loss": 3.41944788801863, + "tokens_seen": 2078450688 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018703109327983952, + "loss": 2.5527, + "theoretical_loss": 3.4194389663301945, + "tokens_seen": 2078516224 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001870210631895687, + "loss": 2.5976, + "theoretical_loss": 3.419430045001819, + "tokens_seen": 2078581760 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001870110330992979, + "loss": 2.3245, + "theoretical_loss": 3.419421124033477, + "tokens_seen": 2078647296 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001870010030090271, + "loss": 2.3569, + "theoretical_loss": 3.4194122034251437, + "tokens_seen": 2078712832 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018699097291875627, + "loss": 2.4327, + "theoretical_loss": 3.419403283176792, + "tokens_seen": 2078778368 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018698094282848545, + "loss": 2.5871, + "theoretical_loss": 3.419394363288397, + "tokens_seen": 2078843904 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018697091273821466, + "loss": 2.5065, + "theoretical_loss": 3.419385443759933, + "tokens_seen": 2078909440 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018696088264794384, + "loss": 2.5471, + "theoretical_loss": 3.4193765245913736, + "tokens_seen": 2078974976 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018695085255767302, + "loss": 2.5587, + "theoretical_loss": 3.4193676057826927, + "tokens_seen": 2079040512 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001869408224674022, + "loss": 2.752, + "theoretical_loss": 3.4193586873338644, + "tokens_seen": 2079106048 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 2311722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5360188484191895, + "objective/train/theoretical_loss": 3.419356457777882, + "objective/train/tokens_used": 2099582432, + "theoretical_loss": 3.419356457777882, + "tokens_seen": 2079122432 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018693079237713138, + "loss": 2.5852, + "theoretical_loss": 3.419349769244864, + "tokens_seen": 2079171584 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001869207622868606, + "loss": 2.6475, + "theoretical_loss": 3.4193408515156642, + "tokens_seen": 2079237120 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018691073219658977, + "loss": 2.7285, + "theoretical_loss": 3.4193319341462396, + "tokens_seen": 2079302656 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018690070210631896, + "loss": 2.4223, + "theoretical_loss": 3.4193230171365645, + "tokens_seen": 2079368192 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018689067201604814, + "loss": 2.4385, + "theoretical_loss": 3.4193141004866137, + "tokens_seen": 2079433728 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018688064192577734, + "loss": 2.727, + "theoretical_loss": 3.41930518419636, + "tokens_seen": 2079499264 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018687061183550653, + "loss": 2.5858, + "theoretical_loss": 3.4192962682657786, + "tokens_seen": 2079564800 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001868605817452357, + "loss": 2.5435, + "theoretical_loss": 3.419287352694843, + "tokens_seen": 2079630336 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001868505516549649, + "loss": 2.7208, + "theoretical_loss": 3.419278437483528, + "tokens_seen": 2079695872 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018684052156469407, + "loss": 2.6443, + "theoretical_loss": 3.4192695226318075, + "tokens_seen": 2079761408 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018683049147442328, + "loss": 2.6378, + "theoretical_loss": 3.4192606081396555, + "tokens_seen": 2079826944 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018682046138415246, + "loss": 2.5207, + "theoretical_loss": 3.4192516940070456, + "tokens_seen": 2079892480 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018681043129388164, + "loss": 2.5719, + "theoretical_loss": 3.4192427802339536, + "tokens_seen": 2079958016 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018680040120361082, + "loss": 2.6226, + "theoretical_loss": 3.419233866820352, + "tokens_seen": 2080023552 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018679037111334003, + "loss": 2.5891, + "theoretical_loss": 3.419224953766216, + "tokens_seen": 2080089088 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001867803410230692, + "loss": 2.3265, + "theoretical_loss": 3.4192160410715196, + "tokens_seen": 2080154624 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001867703109327984, + "loss": 2.8092, + "theoretical_loss": 3.419207128736237, + "tokens_seen": 2080220160 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018676028084252757, + "loss": 2.6003, + "theoretical_loss": 3.419198216760342, + "tokens_seen": 2080285696 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018675025075225678, + "loss": 2.5165, + "theoretical_loss": 3.4191893051438083, + "tokens_seen": 2080351232 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018674022066198596, + "loss": 2.433, + "theoretical_loss": 3.4191803938866117, + "tokens_seen": 2080416768 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018673019057171514, + "loss": 2.4505, + "theoretical_loss": 3.419171482988725, + "tokens_seen": 2080482304 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018672016048144432, + "loss": 2.7156, + "theoretical_loss": 3.4191625724501233, + "tokens_seen": 2080547840 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001867101303911735, + "loss": 2.568, + "theoretical_loss": 3.41915366227078, + "tokens_seen": 2080613376 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001867001003009027, + "loss": 2.5608, + "theoretical_loss": 3.41914475245067, + "tokens_seen": 2080678912 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001866900702106319, + "loss": 2.6965, + "theoretical_loss": 3.4191358429897667, + "tokens_seen": 2080744448 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 2316975, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9159996509552, + "objective/train/theoretical_loss": 3.419133615680664, + "objective/train/tokens_used": 2101220832, + "theoretical_loss": 3.419133615680664, + "tokens_seen": 2080760832 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018668004012036108, + "loss": 2.7847, + "theoretical_loss": 3.4191269338880454, + "tokens_seen": 2080809984 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018667001003009026, + "loss": 2.7266, + "theoretical_loss": 3.4191180251454796, + "tokens_seen": 2080875520 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018665997993981946, + "loss": 2.6016, + "theoretical_loss": 3.4191091167620433, + "tokens_seen": 2080941056 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018664994984954865, + "loss": 2.6357, + "theoretical_loss": 3.419100208737711, + "tokens_seen": 2081006592 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018663991975927783, + "loss": 2.5449, + "theoretical_loss": 3.4190913010724575, + "tokens_seen": 2081072128 + }, + { + "epoch": 7.0, + "learning_rate": 0.000186629889669007, + "loss": 2.5736, + "theoretical_loss": 3.4190823937662556, + "tokens_seen": 2081137664 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001866198595787362, + "loss": 2.5685, + "theoretical_loss": 3.419073486819081, + "tokens_seen": 2081203200 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001866098294884654, + "loss": 2.6093, + "theoretical_loss": 3.4190645802309074, + "tokens_seen": 2081268736 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018659979939819458, + "loss": 2.5744, + "theoretical_loss": 3.419055674001709, + "tokens_seen": 2081334272 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018658976930792376, + "loss": 2.6067, + "theoretical_loss": 3.419046768131459, + "tokens_seen": 2081399808 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018657973921765294, + "loss": 2.5871, + "theoretical_loss": 3.4190378626201334, + "tokens_seen": 2081465344 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018656970912738215, + "loss": 2.4435, + "theoretical_loss": 3.419028957467705, + "tokens_seen": 2081530880 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018655967903711136, + "loss": 2.7229, + "theoretical_loss": 3.419020052674149, + "tokens_seen": 2081596416 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018654964894684054, + "loss": 2.7116, + "theoretical_loss": 3.4190111482394396, + "tokens_seen": 2081661952 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018653961885656972, + "loss": 2.4376, + "theoretical_loss": 3.4190022441635506, + "tokens_seen": 2081727488 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001865295887662989, + "loss": 2.6514, + "theoretical_loss": 3.4189933404464563, + "tokens_seen": 2081793024 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001865195586760281, + "loss": 2.5698, + "theoretical_loss": 3.418984437088131, + "tokens_seen": 2081858560 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001865095285857573, + "loss": 2.499, + "theoretical_loss": 3.4189755340885495, + "tokens_seen": 2081924096 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018649949849548647, + "loss": 2.6017, + "theoretical_loss": 3.418966631447685, + "tokens_seen": 2081989632 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018648946840521565, + "loss": 2.7118, + "theoretical_loss": 3.4189577291655127, + "tokens_seen": 2082055168 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018647943831494486, + "loss": 2.819, + "theoretical_loss": 3.418948827242006, + "tokens_seen": 2082120704 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018646940822467404, + "loss": 2.6656, + "theoretical_loss": 3.4189399256771393, + "tokens_seen": 2082186240 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018645937813440322, + "loss": 2.6731, + "theoretical_loss": 3.418931024470888, + "tokens_seen": 2082251776 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001864493480441324, + "loss": 2.6439, + "theoretical_loss": 3.4189221236232252, + "tokens_seen": 2082317312 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018643931795386159, + "loss": 2.6984, + "theoretical_loss": 3.4189132231341253, + "tokens_seen": 2082382848 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 2321848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5215673446655273, + "objective/train/theoretical_loss": 3.418910998067873, + "objective/train/tokens_used": 2102859232, + "theoretical_loss": 3.418910998067873, + "tokens_seen": 2082399232 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001864292878635908, + "loss": 2.6488, + "theoretical_loss": 3.418904323003563, + "tokens_seen": 2082448384 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018641925777331997, + "loss": 2.7589, + "theoretical_loss": 3.4188954232315125, + "tokens_seen": 2082513920 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018640922768304916, + "loss": 2.592, + "theoretical_loss": 3.4188865238179478, + "tokens_seen": 2082579456 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018639919759277834, + "loss": 2.5678, + "theoretical_loss": 3.4188776247628434, + "tokens_seen": 2082644992 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018638916750250754, + "loss": 2.5817, + "theoretical_loss": 3.418868726066173, + "tokens_seen": 2082710528 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018637913741223673, + "loss": 2.4563, + "theoretical_loss": 3.4188598277279123, + "tokens_seen": 2082776064 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001863691073219659, + "loss": 2.5705, + "theoretical_loss": 3.418850929748034, + "tokens_seen": 2082841600 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001863590772316951, + "loss": 2.548, + "theoretical_loss": 3.418842032126513, + "tokens_seen": 2082907136 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018634904714142427, + "loss": 2.647, + "theoretical_loss": 3.418833134863324, + "tokens_seen": 2082972672 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018633901705115348, + "loss": 2.7456, + "theoretical_loss": 3.418824237958441, + "tokens_seen": 2083038208 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018632898696088266, + "loss": 2.6323, + "theoretical_loss": 3.418815341411838, + "tokens_seen": 2083103744 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018631895687061184, + "loss": 2.7509, + "theoretical_loss": 3.418806445223489, + "tokens_seen": 2083169280 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018630892678034102, + "loss": 2.4868, + "theoretical_loss": 3.41879754939337, + "tokens_seen": 2083234816 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018629889669007023, + "loss": 2.6933, + "theoretical_loss": 3.4187886539214536, + "tokens_seen": 2083300352 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001862888665997994, + "loss": 2.673, + "theoretical_loss": 3.4187797588077147, + "tokens_seen": 2083365888 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001862788365095286, + "loss": 2.4999, + "theoretical_loss": 3.4187708640521275, + "tokens_seen": 2083431424 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018626880641925777, + "loss": 2.5897, + "theoretical_loss": 3.4187619696546667, + "tokens_seen": 2083496960 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018625877632898698, + "loss": 2.5974, + "theoretical_loss": 3.4187530756153057, + "tokens_seen": 2083562496 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018624874623871616, + "loss": 2.6684, + "theoretical_loss": 3.41874418193402, + "tokens_seen": 2083628032 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018623871614844534, + "loss": 2.7537, + "theoretical_loss": 3.418735288610783, + "tokens_seen": 2083693568 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018622868605817452, + "loss": 2.5332, + "theoretical_loss": 3.41872639564557, + "tokens_seen": 2083759104 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001862186559679037, + "loss": 2.5413, + "theoretical_loss": 3.418717503038354, + "tokens_seen": 2083824640 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001862086258776329, + "loss": 2.6924, + "theoretical_loss": 3.4187086107891105, + "tokens_seen": 2083890176 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001861985957873621, + "loss": 2.6602, + "theoretical_loss": 3.418699718897813, + "tokens_seen": 2083955712 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018618856569709128, + "loss": 2.578, + "theoretical_loss": 3.4186908273644363, + "tokens_seen": 2084021248 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 2326755, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6580069065093994, + "objective/train/theoretical_loss": 3.418688604537014, + "objective/train/tokens_used": 2104497632, + "theoretical_loss": 3.418688604537014, + "tokens_seen": 2084037632 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018617853560682046, + "loss": 2.5865, + "theoretical_loss": 3.4186819361889547, + "tokens_seen": 2084086784 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018616850551654966, + "loss": 2.676, + "theoretical_loss": 3.4186730453713423, + "tokens_seen": 2084152320 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018615847542627885, + "loss": 2.4652, + "theoretical_loss": 3.418664154911574, + "tokens_seen": 2084217856 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018614844533600803, + "loss": 2.8442, + "theoretical_loss": 3.4186552648096233, + "tokens_seen": 2084283392 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001861384152457372, + "loss": 2.6602, + "theoretical_loss": 3.4186463750654656, + "tokens_seen": 2084348928 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001861283851554664, + "loss": 2.5656, + "theoretical_loss": 3.418637485679074, + "tokens_seen": 2084414464 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001861183550651956, + "loss": 2.6324, + "theoretical_loss": 3.418628596650424, + "tokens_seen": 2084480000 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018610832497492478, + "loss": 2.5967, + "theoretical_loss": 3.4186197079794898, + "tokens_seen": 2084545536 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018609829488465396, + "loss": 2.599, + "theoretical_loss": 3.4186108196662444, + "tokens_seen": 2084611072 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018608826479438314, + "loss": 2.645, + "theoretical_loss": 3.418601931710664, + "tokens_seen": 2084676608 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018607823470411235, + "loss": 2.5033, + "theoretical_loss": 3.418593044112722, + "tokens_seen": 2084742144 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018606820461384153, + "loss": 2.542, + "theoretical_loss": 3.418584156872393, + "tokens_seen": 2084807680 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001860581745235707, + "loss": 2.8218, + "theoretical_loss": 3.418575269989651, + "tokens_seen": 2084873216 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001860481444332999, + "loss": 2.4888, + "theoretical_loss": 3.418566383464471, + "tokens_seen": 2084938752 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018603811434302907, + "loss": 2.4653, + "theoretical_loss": 3.418557497296827, + "tokens_seen": 2085004288 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018602808425275828, + "loss": 2.5351, + "theoretical_loss": 3.4185486114866936, + "tokens_seen": 2085069824 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018601805416248746, + "loss": 2.3834, + "theoretical_loss": 3.4185397260340444, + "tokens_seen": 2085135360 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018600802407221664, + "loss": 2.5422, + "theoretical_loss": 3.418530840938855, + "tokens_seen": 2085200896 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018599799398194583, + "loss": 2.5219, + "theoretical_loss": 3.418521956201099, + "tokens_seen": 2085266432 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018598796389167503, + "loss": 2.5758, + "theoretical_loss": 3.4185130718207506, + "tokens_seen": 2085331968 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018597793380140421, + "loss": 2.4273, + "theoretical_loss": 3.418504187797785, + "tokens_seen": 2085397504 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001859679037111334, + "loss": 2.8424, + "theoretical_loss": 3.418495304132176, + "tokens_seen": 2085463040 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018595787362086258, + "loss": 2.6155, + "theoretical_loss": 3.418486420823898, + "tokens_seen": 2085528576 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018594784353059176, + "loss": 2.5679, + "theoretical_loss": 3.418477537872926, + "tokens_seen": 2085594112 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018593781344032097, + "loss": 2.726, + "theoretical_loss": 3.4184686552792334, + "tokens_seen": 2085659648 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 2331809, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.565418243408203, + "objective/train/theoretical_loss": 3.418466434686632, + "objective/train/tokens_used": 2106136032, + "theoretical_loss": 3.418466434686632, + "tokens_seen": 2085676032 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018592778335005015, + "loss": 2.6452, + "theoretical_loss": 3.4184597730427955, + "tokens_seen": 2085725184 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018591775325977933, + "loss": 2.6272, + "theoretical_loss": 3.4184508911635865, + "tokens_seen": 2085790720 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001859077231695085, + "loss": 2.8037, + "theoretical_loss": 3.41844200964158, + "tokens_seen": 2085856256 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018589769307923772, + "loss": 2.4485, + "theoretical_loss": 3.4184331284767513, + "tokens_seen": 2085921792 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001858876629889669, + "loss": 2.6452, + "theoretical_loss": 3.4184242476690754, + "tokens_seen": 2085987328 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018587763289869608, + "loss": 2.5357, + "theoretical_loss": 3.418415367218525, + "tokens_seen": 2086052864 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018586760280842526, + "loss": 2.5852, + "theoretical_loss": 3.4184064871250754, + "tokens_seen": 2086118400 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018585757271815444, + "loss": 2.5984, + "theoretical_loss": 3.4183976073887017, + "tokens_seen": 2086183936 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018584754262788365, + "loss": 2.5313, + "theoretical_loss": 3.4183887280093774, + "tokens_seen": 2086249472 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018583751253761283, + "loss": 2.5462, + "theoretical_loss": 3.418379848987077, + "tokens_seen": 2086315008 + }, + { + "epoch": 7.0, + "learning_rate": 0.000185827482447342, + "loss": 2.6253, + "theoretical_loss": 3.4183709703217753, + "tokens_seen": 2086380544 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001858174523570712, + "loss": 2.6736, + "theoretical_loss": 3.4183620920134468, + "tokens_seen": 2086446080 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018580742226680043, + "loss": 2.5737, + "theoretical_loss": 3.4183532140620656, + "tokens_seen": 2086511616 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001857973921765296, + "loss": 2.5473, + "theoretical_loss": 3.418344336467606, + "tokens_seen": 2086577152 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001857873620862588, + "loss": 2.5858, + "theoretical_loss": 3.4183354592300423, + "tokens_seen": 2086642688 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018577733199598797, + "loss": 2.6686, + "theoretical_loss": 3.41832658234935, + "tokens_seen": 2086708224 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018576730190571718, + "loss": 2.4532, + "theoretical_loss": 3.418317705825503, + "tokens_seen": 2086773760 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018575727181544636, + "loss": 2.3079, + "theoretical_loss": 3.418308829658475, + "tokens_seen": 2086839296 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018574724172517554, + "loss": 2.5557, + "theoretical_loss": 3.4182999538482415, + "tokens_seen": 2086904832 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018573721163490472, + "loss": 2.5171, + "theoretical_loss": 3.418291078394776, + "tokens_seen": 2086970368 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001857271815446339, + "loss": 2.4748, + "theoretical_loss": 3.4182822032980544, + "tokens_seen": 2087035904 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018571715145436311, + "loss": 2.514, + "theoretical_loss": 3.4182733285580493, + "tokens_seen": 2087101440 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001857071213640923, + "loss": 2.5369, + "theoretical_loss": 3.4182644541747367, + "tokens_seen": 2087166976 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018569709127382148, + "loss": 2.4749, + "theoretical_loss": 3.4182555801480903, + "tokens_seen": 2087232512 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018568706118355066, + "loss": 2.5788, + "theoretical_loss": 3.418246706478085, + "tokens_seen": 2087298048 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 2336980, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6299309730529785, + "objective/train/theoretical_loss": 3.418244488116305, + "objective/train/tokens_used": 2107774432, + "theoretical_loss": 3.418244488116305, + "tokens_seen": 2087314432 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018567703109327987, + "loss": 2.6139, + "theoretical_loss": 3.418237833164695, + "tokens_seen": 2087363584 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018566700100300905, + "loss": 2.5092, + "theoretical_loss": 3.418228960207894, + "tokens_seen": 2087429120 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018565697091273823, + "loss": 2.629, + "theoretical_loss": 3.418220087607658, + "tokens_seen": 2087494656 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001856469408224674, + "loss": 2.6197, + "theoretical_loss": 3.4182112153639608, + "tokens_seen": 2087560192 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001856369107321966, + "loss": 2.4511, + "theoretical_loss": 3.418202343476776, + "tokens_seen": 2087625728 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001856268806419258, + "loss": 2.6577, + "theoretical_loss": 3.4181934719460796, + "tokens_seen": 2087691264 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018561685055165498, + "loss": 2.5835, + "theoretical_loss": 3.4181846007718453, + "tokens_seen": 2087756800 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018560682046138416, + "loss": 2.7601, + "theoretical_loss": 3.418175729954047, + "tokens_seen": 2087822336 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018559679037111334, + "loss": 2.4092, + "theoretical_loss": 3.418166859492661, + "tokens_seen": 2087887872 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018558676028084255, + "loss": 2.6296, + "theoretical_loss": 3.41815798938766, + "tokens_seen": 2087953408 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018557673019057173, + "loss": 2.6093, + "theoretical_loss": 3.4181491196390192, + "tokens_seen": 2088018944 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001855667001003009, + "loss": 2.3989, + "theoretical_loss": 3.4181402502467133, + "tokens_seen": 2088084480 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001855566700100301, + "loss": 2.6135, + "theoretical_loss": 3.418131381210716, + "tokens_seen": 2088150016 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018554663991975927, + "loss": 2.6321, + "theoretical_loss": 3.418122512531003, + "tokens_seen": 2088215552 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018553660982948848, + "loss": 2.5827, + "theoretical_loss": 3.4181136442075477, + "tokens_seen": 2088281088 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018552657973921766, + "loss": 2.6946, + "theoretical_loss": 3.4181047762403254, + "tokens_seen": 2088346624 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018551654964894684, + "loss": 2.5262, + "theoretical_loss": 3.41809590862931, + "tokens_seen": 2088412160 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018550651955867603, + "loss": 2.5287, + "theoretical_loss": 3.418087041374476, + "tokens_seen": 2088477696 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018549648946840523, + "loss": 2.6373, + "theoretical_loss": 3.418078174475799, + "tokens_seen": 2088543232 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018548645937813441, + "loss": 2.6081, + "theoretical_loss": 3.4180693079332523, + "tokens_seen": 2088608768 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001854764292878636, + "loss": 2.588, + "theoretical_loss": 3.4180604417468112, + "tokens_seen": 2088674304 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018546639919759278, + "loss": 2.68, + "theoretical_loss": 3.41805157591645, + "tokens_seen": 2088739840 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018545636910732196, + "loss": 2.6772, + "theoretical_loss": 3.4180427104421423, + "tokens_seen": 2088805376 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018544633901705117, + "loss": 2.4709, + "theoretical_loss": 3.4180338453238637, + "tokens_seen": 2088870912 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018543630892678035, + "loss": 2.4501, + "theoretical_loss": 3.4180249805615888, + "tokens_seen": 2088936448 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 2342028, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3815712928771973, + "objective/train/theoretical_loss": 3.4180227644266425, + "objective/train/tokens_used": 2109412832, + "theoretical_loss": 3.4180227644266425, + "tokens_seen": 2088952832 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018542627883650953, + "loss": 2.5115, + "theoretical_loss": 3.4180161161552913, + "tokens_seen": 2089001984 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001854162487462387, + "loss": 2.5951, + "theoretical_loss": 3.418007252104947, + "tokens_seen": 2089067520 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018540621865596792, + "loss": 2.5713, + "theoretical_loss": 3.417998388410529, + "tokens_seen": 2089133056 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001853961885656971, + "loss": 2.4986, + "theoretical_loss": 3.417989525072013, + "tokens_seen": 2089198592 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018538615847542628, + "loss": 2.5121, + "theoretical_loss": 3.4179806620893727, + "tokens_seen": 2089264128 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018537612838515546, + "loss": 2.7159, + "theoretical_loss": 3.417971799462583, + "tokens_seen": 2089329664 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018536609829488464, + "loss": 2.4015, + "theoretical_loss": 3.4179629371916187, + "tokens_seen": 2089395200 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018535606820461385, + "loss": 2.6145, + "theoretical_loss": 3.4179540752764543, + "tokens_seen": 2089460736 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018534603811434303, + "loss": 2.5617, + "theoretical_loss": 3.417945213717064, + "tokens_seen": 2089526272 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001853360080240722, + "loss": 2.4729, + "theoretical_loss": 3.4179363525134225, + "tokens_seen": 2089591808 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001853259779338014, + "loss": 2.5472, + "theoretical_loss": 3.4179274916655045, + "tokens_seen": 2089657344 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001853159478435306, + "loss": 2.4471, + "theoretical_loss": 3.4179186311732845, + "tokens_seen": 2089722880 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018530591775325978, + "loss": 2.4304, + "theoretical_loss": 3.4179097710367365, + "tokens_seen": 2089788416 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018529588766298896, + "loss": 2.4356, + "theoretical_loss": 3.4179009112558365, + "tokens_seen": 2089853952 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018528585757271815, + "loss": 2.6651, + "theoretical_loss": 3.4178920518305578, + "tokens_seen": 2089919488 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018527582748244733, + "loss": 2.6277, + "theoretical_loss": 3.417883192760875, + "tokens_seen": 2089985024 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018526579739217654, + "loss": 2.677, + "theoretical_loss": 3.4178743340467634, + "tokens_seen": 2090050560 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018525576730190572, + "loss": 2.6492, + "theoretical_loss": 3.417865475688197, + "tokens_seen": 2090116096 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001852457372116349, + "loss": 2.6156, + "theoretical_loss": 3.4178566176851506, + "tokens_seen": 2090181632 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018523570712136408, + "loss": 2.6596, + "theoretical_loss": 3.417847760037599, + "tokens_seen": 2090247168 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001852256770310933, + "loss": 2.6402, + "theoretical_loss": 3.4178389027455167, + "tokens_seen": 2090312704 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018521564694082247, + "loss": 2.4802, + "theoretical_loss": 3.417830045808878, + "tokens_seen": 2090378240 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018520561685055165, + "loss": 2.6393, + "theoretical_loss": 3.4178211892276575, + "tokens_seen": 2090443776 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018519558676028083, + "loss": 2.5611, + "theoretical_loss": 3.41781233300183, + "tokens_seen": 2090509312 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018518555667001004, + "loss": 2.5282, + "theoretical_loss": 3.41780347713137, + "tokens_seen": 2090574848 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 2346758, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.306332588195801, + "objective/train/theoretical_loss": 3.4178012632192782, + "objective/train/tokens_used": 2111051232, + "theoretical_loss": 3.4178012632192782, + "tokens_seen": 2090591232 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018517552657973922, + "loss": 2.4267, + "theoretical_loss": 3.4177946216162525, + "tokens_seen": 2090640384 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001851654964894684, + "loss": 2.452, + "theoretical_loss": 3.4177857664564515, + "tokens_seen": 2090705920 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018515546639919758, + "loss": 2.6185, + "theoretical_loss": 3.4177769116519414, + "tokens_seen": 2090771456 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018514543630892676, + "loss": 2.5768, + "theoretical_loss": 3.4177680572026983, + "tokens_seen": 2090836992 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018513540621865597, + "loss": 2.6181, + "theoretical_loss": 3.4177592031086945, + "tokens_seen": 2090902528 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018512537612838515, + "loss": 2.4589, + "theoretical_loss": 3.417750349369907, + "tokens_seen": 2090968064 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018511534603811433, + "loss": 2.7248, + "theoretical_loss": 3.417741495986309, + "tokens_seen": 2091033600 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018510531594784351, + "loss": 2.4593, + "theoretical_loss": 3.417732642957875, + "tokens_seen": 2091099136 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018509528585757272, + "loss": 2.5812, + "theoretical_loss": 3.4177237902845805, + "tokens_seen": 2091164672 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001850852557673019, + "loss": 2.573, + "theoretical_loss": 3.4177149379664, + "tokens_seen": 2091230208 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018507522567703108, + "loss": 2.4235, + "theoretical_loss": 3.4177060860033066, + "tokens_seen": 2091295744 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018506519558676027, + "loss": 2.69, + "theoretical_loss": 3.417697234395277, + "tokens_seen": 2091361280 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018505516549648947, + "loss": 2.5968, + "theoretical_loss": 3.417688383142285, + "tokens_seen": 2091426816 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018504513540621868, + "loss": 2.4594, + "theoretical_loss": 3.417679532244305, + "tokens_seen": 2091492352 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018503510531594786, + "loss": 2.6486, + "theoretical_loss": 3.4176706817013116, + "tokens_seen": 2091557888 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018502507522567704, + "loss": 2.6904, + "theoretical_loss": 3.4176618315132803, + "tokens_seen": 2091623424 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018501504513540623, + "loss": 2.6465, + "theoretical_loss": 3.4176529816801846, + "tokens_seen": 2091688960 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018500501504513543, + "loss": 2.6568, + "theoretical_loss": 3.4176441322019997, + "tokens_seen": 2091754496 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018499498495486462, + "loss": 2.6969, + "theoretical_loss": 3.4176352830787002, + "tokens_seen": 2091820032 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001849849548645938, + "loss": 2.5606, + "theoretical_loss": 3.417626434310261, + "tokens_seen": 2091885568 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018497492477432298, + "loss": 2.5666, + "theoretical_loss": 3.4176175858966564, + "tokens_seen": 2091951104 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018496489468405216, + "loss": 2.4845, + "theoretical_loss": 3.417608737837861, + "tokens_seen": 2092016640 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018495486459378137, + "loss": 2.5571, + "theoretical_loss": 3.41759989013385, + "tokens_seen": 2092082176 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018494483450351055, + "loss": 2.5577, + "theoretical_loss": 3.417591042784597, + "tokens_seen": 2092147712 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018493480441323973, + "loss": 2.4996, + "theoretical_loss": 3.417582195790078, + "tokens_seen": 2092213248 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 2351875, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2650232315063477, + "objective/train/theoretical_loss": 3.4175799840968724, + "objective/train/tokens_used": 2112689632, + "theoretical_loss": 3.4175799840968724, + "tokens_seen": 2092229632 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001849247743229689, + "loss": 2.4512, + "theoretical_loss": 3.417573349150267, + "tokens_seen": 2092278784 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018491474423269812, + "loss": 2.7394, + "theoretical_loss": 3.4175645028651385, + "tokens_seen": 2092344320 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001849047141424273, + "loss": 2.5095, + "theoretical_loss": 3.4175556569346672, + "tokens_seen": 2092409856 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018489468405215648, + "loss": 2.5034, + "theoretical_loss": 3.417546811358828, + "tokens_seen": 2092475392 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018488465396188566, + "loss": 2.6377, + "theoretical_loss": 3.417537966137596, + "tokens_seen": 2092540928 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018487462387161484, + "loss": 2.6553, + "theoretical_loss": 3.4175291212709444, + "tokens_seen": 2092606464 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018486459378134405, + "loss": 2.3635, + "theoretical_loss": 3.4175202767588497, + "tokens_seen": 2092672000 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018485456369107323, + "loss": 2.5819, + "theoretical_loss": 3.4175114326012856, + "tokens_seen": 2092737536 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001848445336008024, + "loss": 2.4671, + "theoretical_loss": 3.417502588798227, + "tokens_seen": 2092803072 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001848345035105316, + "loss": 2.7781, + "theoretical_loss": 3.417493745349648, + "tokens_seen": 2092868608 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001848244734202608, + "loss": 2.3511, + "theoretical_loss": 3.4174849022555245, + "tokens_seen": 2092934144 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018481444332998998, + "loss": 2.591, + "theoretical_loss": 3.4174760595158302, + "tokens_seen": 2092999680 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018480441323971916, + "loss": 2.4651, + "theoretical_loss": 3.41746721713054, + "tokens_seen": 2093065216 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018479438314944835, + "loss": 2.6861, + "theoretical_loss": 3.4174583750996286, + "tokens_seen": 2093130752 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018478435305917753, + "loss": 2.6149, + "theoretical_loss": 3.4174495334230715, + "tokens_seen": 2093196288 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018477432296890674, + "loss": 2.685, + "theoretical_loss": 3.4174406921008424, + "tokens_seen": 2093261824 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018476429287863592, + "loss": 2.5309, + "theoretical_loss": 3.417431851132916, + "tokens_seen": 2093327360 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001847542627883651, + "loss": 2.6313, + "theoretical_loss": 3.4174230105192676, + "tokens_seen": 2093392896 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018474423269809428, + "loss": 2.4922, + "theoretical_loss": 3.4174141702598715, + "tokens_seen": 2093458432 + }, + { + "epoch": 7.0, + "learning_rate": 0.0001847342026078235, + "loss": 2.6641, + "theoretical_loss": 3.4174053303547023, + "tokens_seen": 2093523968 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018472417251755267, + "loss": 2.6294, + "theoretical_loss": 3.4173964908037355, + "tokens_seen": 2093589504 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018471414242728185, + "loss": 2.5691, + "theoretical_loss": 3.417387651606945, + "tokens_seen": 2093655040 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018470411233701103, + "loss": 2.5277, + "theoretical_loss": 3.4173788127643063, + "tokens_seen": 2093720576 + }, + { + "epoch": 7.0, + "learning_rate": 0.00018469408224674024, + "loss": 2.6797, + "theoretical_loss": 3.4173699742757933, + "tokens_seen": 2093786112 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018468405215646942, + "loss": 2.6484, + "theoretical_loss": 3.4173611361413814, + "tokens_seen": 2093851648 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2356734, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2795612812042236, + "objective/train/theoretical_loss": 3.4173589266631037, + "objective/train/tokens_used": 2114328032, + "theoretical_loss": 3.4173589266631037, + "tokens_seen": 2093868032 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001846740220661986, + "loss": 2.3378, + "theoretical_loss": 3.4173522983610445, + "tokens_seen": 2093917184 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018466399197592778, + "loss": 2.6058, + "theoretical_loss": 3.4173434609347577, + "tokens_seen": 2093982720 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018465396188565696, + "loss": 2.5342, + "theoretical_loss": 3.4173346238624966, + "tokens_seen": 2094048256 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018464393179538617, + "loss": 2.6264, + "theoretical_loss": 3.417325787144235, + "tokens_seen": 2094113792 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018463390170511535, + "loss": 2.6335, + "theoretical_loss": 3.4173169507799477, + "tokens_seen": 2094179328 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018462387161484453, + "loss": 2.5521, + "theoretical_loss": 3.4173081147696096, + "tokens_seen": 2094244864 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018461384152457371, + "loss": 2.6245, + "theoretical_loss": 3.4172992791131955, + "tokens_seen": 2094310400 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018460381143430292, + "loss": 2.5337, + "theoretical_loss": 3.41729044381068, + "tokens_seen": 2094375936 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001845937813440321, + "loss": 2.6979, + "theoretical_loss": 3.417281608862038, + "tokens_seen": 2094441472 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018458375125376128, + "loss": 2.6857, + "theoretical_loss": 3.4172727742672446, + "tokens_seen": 2094507008 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018457372116349047, + "loss": 2.4098, + "theoretical_loss": 3.417263940026274, + "tokens_seen": 2094572544 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018456369107321965, + "loss": 2.5304, + "theoretical_loss": 3.417255106139101, + "tokens_seen": 2094638080 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018455366098294886, + "loss": 2.7174, + "theoretical_loss": 3.4172462726057002, + "tokens_seen": 2094703616 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018454363089267804, + "loss": 2.5831, + "theoretical_loss": 3.417237439426047, + "tokens_seen": 2094769152 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018453360080240722, + "loss": 2.6545, + "theoretical_loss": 3.4172286066001156, + "tokens_seen": 2094834688 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001845235707121364, + "loss": 2.4706, + "theoretical_loss": 3.417219774127881, + "tokens_seen": 2094900224 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001845135406218656, + "loss": 2.566, + "theoretical_loss": 3.4172109420093184, + "tokens_seen": 2094965760 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001845035105315948, + "loss": 2.4848, + "theoretical_loss": 3.4172021102444017, + "tokens_seen": 2095031296 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018449348044132397, + "loss": 2.3919, + "theoretical_loss": 3.4171932788331065, + "tokens_seen": 2095096832 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018448345035105315, + "loss": 2.6424, + "theoretical_loss": 3.4171844477754068, + "tokens_seen": 2095162368 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018447342026078233, + "loss": 2.5758, + "theoretical_loss": 3.417175617071278, + "tokens_seen": 2095227904 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018446339017051154, + "loss": 2.4096, + "theoretical_loss": 3.4171667867206947, + "tokens_seen": 2095293440 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018445336008024072, + "loss": 2.2801, + "theoretical_loss": 3.4171579567236314, + "tokens_seen": 2095358976 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001844433299899699, + "loss": 2.4911, + "theoretical_loss": 3.417149127080063, + "tokens_seen": 2095424512 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018443329989969908, + "loss": 2.5284, + "theoretical_loss": 3.417140297789965, + "tokens_seen": 2095490048 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2361955, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.291240692138672, + "objective/train/theoretical_loss": 3.417138090522667, + "objective/train/tokens_used": 2115966432, + "theoretical_loss": 3.417138090522667, + "tokens_seen": 2095506432 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001844232698094283, + "loss": 2.5549, + "theoretical_loss": 3.417131468853311, + "tokens_seen": 2095555584 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018441323971915747, + "loss": 2.6695, + "theoretical_loss": 3.4171226402700774, + "tokens_seen": 2095621120 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018440320962888665, + "loss": 2.6554, + "theoretical_loss": 3.4171138120402373, + "tokens_seen": 2095686656 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018439317953861583, + "loss": 2.4574, + "theoretical_loss": 3.4171049841637666, + "tokens_seen": 2095752192 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018438314944834502, + "loss": 2.6329, + "theoretical_loss": 3.417096156640639, + "tokens_seen": 2095817728 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018437311935807422, + "loss": 2.6702, + "theoretical_loss": 3.417087329470831, + "tokens_seen": 2095883264 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001843630892678034, + "loss": 2.4895, + "theoretical_loss": 3.417078502654316, + "tokens_seen": 2095948800 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018435305917753259, + "loss": 2.5076, + "theoretical_loss": 3.417069676191069, + "tokens_seen": 2096014336 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018434302908726177, + "loss": 2.6165, + "theoretical_loss": 3.4170608500810657, + "tokens_seen": 2096079872 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018433299899699098, + "loss": 2.7778, + "theoretical_loss": 3.41705202432428, + "tokens_seen": 2096145408 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018432296890672016, + "loss": 2.6067, + "theoretical_loss": 3.417043198920687, + "tokens_seen": 2096210944 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018431293881644934, + "loss": 2.694, + "theoretical_loss": 3.4170343738702615, + "tokens_seen": 2096276480 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018430290872617855, + "loss": 2.6312, + "theoretical_loss": 3.417025549172979, + "tokens_seen": 2096342016 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018429287863590773, + "loss": 2.702, + "theoretical_loss": 3.417016724828813, + "tokens_seen": 2096407552 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018428284854563694, + "loss": 2.4006, + "theoretical_loss": 3.417007900837739, + "tokens_seen": 2096473088 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018427281845536612, + "loss": 2.6314, + "theoretical_loss": 3.4169990771997325, + "tokens_seen": 2096538624 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001842627883650953, + "loss": 2.7762, + "theoretical_loss": 3.4169902539147667, + "tokens_seen": 2096604160 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018425275827482448, + "loss": 2.6869, + "theoretical_loss": 3.416981430982818, + "tokens_seen": 2096669696 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001842427281845537, + "loss": 2.5771, + "theoretical_loss": 3.416972608403861, + "tokens_seen": 2096735232 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001842427281845537, + "loss": 2.4553, + "theoretical_loss": 3.4169637861778703, + "tokens_seen": 2096800768 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018423269809428287, + "loss": 2.6509, + "theoretical_loss": 3.4169549643048205, + "tokens_seen": 2096866304 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018422266800401205, + "loss": 2.5045, + "theoretical_loss": 3.4169461427846866, + "tokens_seen": 2096931840 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018421263791374123, + "loss": 2.7359, + "theoretical_loss": 3.4169373216174432, + "tokens_seen": 2096997376 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018420260782347044, + "loss": 2.4689, + "theoretical_loss": 3.416928500803066, + "tokens_seen": 2097062912 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018419257773319962, + "loss": 2.5499, + "theoretical_loss": 3.416919680341529, + "tokens_seen": 2097128448 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2366959, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.542247772216797, + "objective/train/theoretical_loss": 3.4169174752812728, + "objective/train/tokens_used": 2117604832, + "theoretical_loss": 3.4169174752812728, + "tokens_seen": 2097144832 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001841825476429288, + "loss": 2.5599, + "theoretical_loss": 3.4169108602328073, + "tokens_seen": 2097193984 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018417251755265798, + "loss": 2.4574, + "theoretical_loss": 3.416902040476876, + "tokens_seen": 2097259520 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018416248746238716, + "loss": 2.7255, + "theoretical_loss": 3.4168932210737095, + "tokens_seen": 2097325056 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018415245737211637, + "loss": 2.6396, + "theoretical_loss": 3.4168844020232836, + "tokens_seen": 2097390592 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018414242728184555, + "loss": 2.4955, + "theoretical_loss": 3.416875583325572, + "tokens_seen": 2097456128 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018413239719157473, + "loss": 2.5378, + "theoretical_loss": 3.4168667649805498, + "tokens_seen": 2097521664 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018412236710130391, + "loss": 2.6408, + "theoretical_loss": 3.416857946988193, + "tokens_seen": 2097587200 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018411233701103312, + "loss": 2.576, + "theoretical_loss": 3.416849129348475, + "tokens_seen": 2097652736 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001841023069207623, + "loss": 2.5168, + "theoretical_loss": 3.416840312061372, + "tokens_seen": 2097718272 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018409227683049149, + "loss": 2.6763, + "theoretical_loss": 3.4168314951268575, + "tokens_seen": 2097783808 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018408224674022067, + "loss": 2.583, + "theoretical_loss": 3.416822678544907, + "tokens_seen": 2097849344 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018407221664994985, + "loss": 2.7235, + "theoretical_loss": 3.4168138623154967, + "tokens_seen": 2097914880 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018406218655967906, + "loss": 2.5024, + "theoretical_loss": 3.4168050464385993, + "tokens_seen": 2097980416 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018405215646940824, + "loss": 2.5439, + "theoretical_loss": 3.4167962309141906, + "tokens_seen": 2098045952 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018404212637913742, + "loss": 2.6396, + "theoretical_loss": 3.416787415742246, + "tokens_seen": 2098111488 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001840320962888666, + "loss": 2.524, + "theoretical_loss": 3.41677860092274, + "tokens_seen": 2098177024 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001840220661985958, + "loss": 2.4445, + "theoretical_loss": 3.4167697864556477, + "tokens_seen": 2098242560 + }, + { + "epoch": 7.01, + "learning_rate": 0.000184012036108325, + "loss": 2.7832, + "theoretical_loss": 3.4167609723409433, + "tokens_seen": 2098308096 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018400200601805417, + "loss": 2.6976, + "theoretical_loss": 3.4167521585786025, + "tokens_seen": 2098373632 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018399197592778335, + "loss": 2.4393, + "theoretical_loss": 3.4167433451686, + "tokens_seen": 2098439168 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018398194583751253, + "loss": 2.554, + "theoretical_loss": 3.41673453211091, + "tokens_seen": 2098504704 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018397191574724174, + "loss": 2.6012, + "theoretical_loss": 3.4167257194055085, + "tokens_seen": 2098570240 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018396188565697092, + "loss": 2.6525, + "theoretical_loss": 3.4167169070523697, + "tokens_seen": 2098635776 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001839518555667001, + "loss": 2.6464, + "theoretical_loss": 3.4167080950514688, + "tokens_seen": 2098701312 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018394182547642928, + "loss": 2.585, + "theoretical_loss": 3.4166992834027807, + "tokens_seen": 2098766848 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2372196, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5404465198516846, + "objective/train/theoretical_loss": 3.4166970805456396, + "objective/train/tokens_used": 2119243232, + "theoretical_loss": 3.4166970805456396, + "tokens_seen": 2098783232 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001839317953861585, + "loss": 2.6847, + "theoretical_loss": 3.4166904721062807, + "tokens_seen": 2098832384 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018392176529588767, + "loss": 2.7132, + "theoretical_loss": 3.416681661161943, + "tokens_seen": 2098897920 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018391173520561685, + "loss": 2.6502, + "theoretical_loss": 3.416672850569743, + "tokens_seen": 2098963456 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018390170511534603, + "loss": 2.7373, + "theoretical_loss": 3.416664040329655, + "tokens_seen": 2099028992 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018389167502507522, + "loss": 2.7364, + "theoretical_loss": 3.4166552304416555, + "tokens_seen": 2099094528 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018388164493480442, + "loss": 2.4247, + "theoretical_loss": 3.4166464209057175, + "tokens_seen": 2099160064 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001838716148445336, + "loss": 2.5782, + "theoretical_loss": 3.416637611721817, + "tokens_seen": 2099225600 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018386158475426279, + "loss": 2.586, + "theoretical_loss": 3.416628802889929, + "tokens_seen": 2099291136 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018385155466399197, + "loss": 2.5696, + "theoretical_loss": 3.416619994410028, + "tokens_seen": 2099356672 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018384152457372118, + "loss": 2.3529, + "theoretical_loss": 3.416611186282089, + "tokens_seen": 2099422208 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018383149448345036, + "loss": 2.5885, + "theoretical_loss": 3.416602378506087, + "tokens_seen": 2099487744 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018382146439317954, + "loss": 2.5841, + "theoretical_loss": 3.4165935710819975, + "tokens_seen": 2099553280 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018381143430290872, + "loss": 2.4646, + "theoretical_loss": 3.416584764009795, + "tokens_seen": 2099618816 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001838014042126379, + "loss": 2.6602, + "theoretical_loss": 3.4165759572894543, + "tokens_seen": 2099684352 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001837913741223671, + "loss": 2.7923, + "theoretical_loss": 3.41656715092095, + "tokens_seen": 2099749888 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001837813440320963, + "loss": 2.581, + "theoretical_loss": 3.4165583449042587, + "tokens_seen": 2099815424 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018377131394182547, + "loss": 2.6346, + "theoretical_loss": 3.4165495392393535, + "tokens_seen": 2099880960 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018376128385155465, + "loss": 2.3773, + "theoretical_loss": 3.4165407339262104, + "tokens_seen": 2099946496 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018375125376128386, + "loss": 2.5005, + "theoretical_loss": 3.4165319289648037, + "tokens_seen": 2100012032 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018374122367101304, + "loss": 2.4803, + "theoretical_loss": 3.416523124355109, + "tokens_seen": 2100077568 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018373119358074222, + "loss": 2.5201, + "theoretical_loss": 3.416514320097101, + "tokens_seen": 2100143104 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001837211634904714, + "loss": 2.5293, + "theoretical_loss": 3.416505516190755, + "tokens_seen": 2100208640 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018371113340020058, + "loss": 2.3189, + "theoretical_loss": 3.4164967126360457, + "tokens_seen": 2100274176 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001837011033099298, + "loss": 2.6519, + "theoretical_loss": 3.4164879094329477, + "tokens_seen": 2100339712 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018369107321965897, + "loss": 2.5874, + "theoretical_loss": 3.416479106581437, + "tokens_seen": 2100405248 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2374980, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4069173336029053, + "objective/train/theoretical_loss": 3.416476905923491, + "objective/train/tokens_used": 2120881632, + "theoretical_loss": 3.416476905923491, + "tokens_seen": 2100421632 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018368104312938815, + "loss": 2.5701, + "theoretical_loss": 3.4164703040814874, + "tokens_seen": 2100470784 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018367101303911734, + "loss": 2.6777, + "theoretical_loss": 3.4164615019330746, + "tokens_seen": 2100536320 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018366098294884654, + "loss": 2.5337, + "theoretical_loss": 3.4164527001361735, + "tokens_seen": 2100601856 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018365095285857573, + "loss": 2.6214, + "theoretical_loss": 3.416443898690759, + "tokens_seen": 2100667392 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001836409227683049, + "loss": 2.5131, + "theoretical_loss": 3.416435097596806, + "tokens_seen": 2100732928 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001836308926780341, + "loss": 2.5476, + "theoretical_loss": 3.41642629685429, + "tokens_seen": 2100798464 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018362086258776327, + "loss": 2.6949, + "theoretical_loss": 3.4164174964631853, + "tokens_seen": 2100864000 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018361083249749248, + "loss": 2.4228, + "theoretical_loss": 3.416408696423467, + "tokens_seen": 2100929536 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018360080240722166, + "loss": 2.6106, + "theoretical_loss": 3.4163998967351112, + "tokens_seen": 2100995072 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018359077231695084, + "loss": 2.5298, + "theoretical_loss": 3.4163910973980913, + "tokens_seen": 2101060608 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018358074222668002, + "loss": 2.4712, + "theoretical_loss": 3.4163822984123833, + "tokens_seen": 2101126144 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018357071213640923, + "loss": 2.5495, + "theoretical_loss": 3.4163734997779622, + "tokens_seen": 2101191680 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001835606820461384, + "loss": 2.4837, + "theoretical_loss": 3.416364701494803, + "tokens_seen": 2101257216 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018355065195586762, + "loss": 2.6087, + "theoretical_loss": 3.41635590356288, + "tokens_seen": 2101322752 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001835406218655968, + "loss": 2.5402, + "theoretical_loss": 3.4163471059821693, + "tokens_seen": 2101388288 + }, + { + "epoch": 7.01, + "learning_rate": 0.000183530591775326, + "loss": 2.5212, + "theoretical_loss": 3.4163383087526453, + "tokens_seen": 2101453824 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001835205616850552, + "loss": 2.5596, + "theoretical_loss": 3.4163295118742827, + "tokens_seen": 2101519360 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018351053159478437, + "loss": 2.5065, + "theoretical_loss": 3.4163207153470574, + "tokens_seen": 2101584896 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018350050150451355, + "loss": 2.5555, + "theoretical_loss": 3.416311919170944, + "tokens_seen": 2101650432 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018349047141424273, + "loss": 2.3445, + "theoretical_loss": 3.416303123345917, + "tokens_seen": 2101715968 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018348044132397194, + "loss": 2.4941, + "theoretical_loss": 3.4162943278719524, + "tokens_seen": 2101781504 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018347041123370112, + "loss": 2.4724, + "theoretical_loss": 3.4162855327490247, + "tokens_seen": 2101847040 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001834603811434303, + "loss": 2.5137, + "theoretical_loss": 3.4162767379771095, + "tokens_seen": 2101912576 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018345035105315948, + "loss": 2.5969, + "theoretical_loss": 3.416267943556181, + "tokens_seen": 2101978112 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001834403209628887, + "loss": 2.4885, + "theoretical_loss": 3.4162591494862147, + "tokens_seen": 2102043648 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2375764, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.589604616165161, + "objective/train/theoretical_loss": 3.4162569510235583, + "objective/train/tokens_used": 2122520032, + "theoretical_loss": 3.4162569510235583, + "tokens_seen": 2102060032 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018343029087261787, + "loss": 2.6912, + "theoretical_loss": 3.4162503557671857, + "tokens_seen": 2102109184 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018342026078234705, + "loss": 2.5532, + "theoretical_loss": 3.416241562399069, + "tokens_seen": 2102174720 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018341023069207623, + "loss": 2.5673, + "theoretical_loss": 3.416232769381839, + "tokens_seen": 2102240256 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018340020060180542, + "loss": 2.5831, + "theoretical_loss": 3.4162239767154725, + "tokens_seen": 2102305792 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018339017051153462, + "loss": 2.5529, + "theoretical_loss": 3.4162151843999427, + "tokens_seen": 2102371328 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001833801404212638, + "loss": 2.6466, + "theoretical_loss": 3.4162063924352255, + "tokens_seen": 2102436864 + }, + { + "epoch": 7.01, + "learning_rate": 0.000183370110330993, + "loss": 2.4849, + "theoretical_loss": 3.416197600821296, + "tokens_seen": 2102502400 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018336008024072217, + "loss": 2.5212, + "theoretical_loss": 3.416188809558129, + "tokens_seen": 2102567936 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018335005015045138, + "loss": 2.5702, + "theoretical_loss": 3.4161800186457, + "tokens_seen": 2102633472 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018334002006018056, + "loss": 2.3584, + "theoretical_loss": 3.4161712280839835, + "tokens_seen": 2102699008 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018332998996990974, + "loss": 2.6583, + "theoretical_loss": 3.416162437872955, + "tokens_seen": 2102764544 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018331995987963892, + "loss": 2.6595, + "theoretical_loss": 3.41615364801259, + "tokens_seen": 2102830080 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001833099297893681, + "loss": 2.6848, + "theoretical_loss": 3.4161448585028618, + "tokens_seen": 2102895616 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001832998996990973, + "loss": 2.645, + "theoretical_loss": 3.4161360693437475, + "tokens_seen": 2102961152 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001832898696088265, + "loss": 2.6288, + "theoretical_loss": 3.416127280535221, + "tokens_seen": 2103026688 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018327983951855567, + "loss": 2.5745, + "theoretical_loss": 3.416118492077259, + "tokens_seen": 2103092224 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018326980942828485, + "loss": 2.5519, + "theoretical_loss": 3.416109703969834, + "tokens_seen": 2103157760 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018325977933801406, + "loss": 2.6205, + "theoretical_loss": 3.4161009162129234, + "tokens_seen": 2103223296 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018324974924774324, + "loss": 2.6997, + "theoretical_loss": 3.416092128806501, + "tokens_seen": 2103288832 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018323971915747242, + "loss": 2.6943, + "theoretical_loss": 3.416083341750542, + "tokens_seen": 2103354368 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001832296890672016, + "loss": 2.4898, + "theoretical_loss": 3.4160745550450224, + "tokens_seen": 2103419904 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018321965897693078, + "loss": 2.4297, + "theoretical_loss": 3.4160657686899167, + "tokens_seen": 2103485440 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018320962888666, + "loss": 2.4921, + "theoretical_loss": 3.4160569826851996, + "tokens_seen": 2103550976 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018319959879638917, + "loss": 2.6804, + "theoretical_loss": 3.4160481970308467, + "tokens_seen": 2103616512 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018318956870611836, + "loss": 2.628, + "theoretical_loss": 3.416039411726833, + "tokens_seen": 2103682048 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2376888, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.270189046859741, + "objective/train/theoretical_loss": 3.4160372154555674, + "objective/train/tokens_used": 2124158432, + "theoretical_loss": 3.4160372154555674, + "tokens_seen": 2103698432 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018317953861584754, + "loss": 2.5802, + "theoretical_loss": 3.416030626773134, + "tokens_seen": 2103747584 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018316950852557674, + "loss": 2.7228, + "theoretical_loss": 3.4160218421697244, + "tokens_seen": 2103813120 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018315947843530593, + "loss": 2.5548, + "theoretical_loss": 3.416013057916579, + "tokens_seen": 2103878656 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001831494483450351, + "loss": 2.5891, + "theoretical_loss": 3.4160042740136736, + "tokens_seen": 2103944192 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001831394182547643, + "loss": 2.4421, + "theoretical_loss": 3.415995490460983, + "tokens_seen": 2104009728 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018312938816449347, + "loss": 2.6413, + "theoretical_loss": 3.415986707258482, + "tokens_seen": 2104075264 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018311935807422268, + "loss": 2.6682, + "theoretical_loss": 3.415977924406147, + "tokens_seen": 2104140800 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018310932798395186, + "loss": 2.6914, + "theoretical_loss": 3.4159691419039517, + "tokens_seen": 2104206336 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018309929789368104, + "loss": 2.6175, + "theoretical_loss": 3.415960359751872, + "tokens_seen": 2104271872 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018308926780341022, + "loss": 2.6022, + "theoretical_loss": 3.4159515779498824, + "tokens_seen": 2104337408 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018307923771313943, + "loss": 2.4999, + "theoretical_loss": 3.415942796497959, + "tokens_seen": 2104402944 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001830692076228686, + "loss": 2.7872, + "theoretical_loss": 3.415934015396076, + "tokens_seen": 2104468480 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001830591775325978, + "loss": 2.479, + "theoretical_loss": 3.415925234644209, + "tokens_seen": 2104534016 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018304914744232697, + "loss": 2.537, + "theoretical_loss": 3.4159164542423333, + "tokens_seen": 2104599552 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018303911735205618, + "loss": 2.5763, + "theoretical_loss": 3.4159076741904237, + "tokens_seen": 2104665088 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018302908726178536, + "loss": 2.6327, + "theoretical_loss": 3.415898894488455, + "tokens_seen": 2104730624 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018301905717151454, + "loss": 2.6085, + "theoretical_loss": 3.4158901151364036, + "tokens_seen": 2104796160 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018300902708124372, + "loss": 2.6289, + "theoretical_loss": 3.4158813361342437, + "tokens_seen": 2104861696 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001829989969909729, + "loss": 2.574, + "theoretical_loss": 3.415872557481951, + "tokens_seen": 2104927232 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001829889669007021, + "loss": 2.646, + "theoretical_loss": 3.4158637791795, + "tokens_seen": 2104992768 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001829789368104313, + "loss": 2.6066, + "theoretical_loss": 3.4158550012268662, + "tokens_seen": 2105058304 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018296890672016048, + "loss": 2.6374, + "theoretical_loss": 3.415846223624025, + "tokens_seen": 2105123840 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018295887662988966, + "loss": 2.5385, + "theoretical_loss": 3.415837446370951, + "tokens_seen": 2105189376 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018294884653961886, + "loss": 2.6068, + "theoretical_loss": 3.4158286694676203, + "tokens_seen": 2105254912 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018293881644934805, + "loss": 2.628, + "theoretical_loss": 3.4158198929140067, + "tokens_seen": 2105320448 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2377469, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.387984037399292, + "objective/train/theoretical_loss": 3.415817698830244, + "objective/train/tokens_used": 2125796832, + "theoretical_loss": 3.415817698830244, + "tokens_seen": 2105336832 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018292878635907723, + "loss": 2.5586, + "theoretical_loss": 3.4158111167100866, + "tokens_seen": 2105385984 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001829187562688064, + "loss": 2.6161, + "theoretical_loss": 3.415802340855835, + "tokens_seen": 2105451520 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001829087261785356, + "loss": 2.8107, + "theoretical_loss": 3.4157935653512266, + "tokens_seen": 2105517056 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001828986960882648, + "loss": 2.5427, + "theoretical_loss": 3.415784790196237, + "tokens_seen": 2105582592 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018288866599799398, + "loss": 2.4387, + "theoretical_loss": 3.4157760153908407, + "tokens_seen": 2105648128 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018287863590772316, + "loss": 2.4092, + "theoretical_loss": 3.415767240935014, + "tokens_seen": 2105713664 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018286860581745234, + "loss": 2.5494, + "theoretical_loss": 3.4157584668287315, + "tokens_seen": 2105779200 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018285857572718155, + "loss": 2.4591, + "theoretical_loss": 3.4157496930719686, + "tokens_seen": 2105844736 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018284854563691073, + "loss": 2.5789, + "theoretical_loss": 3.4157409196646995, + "tokens_seen": 2105910272 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001828385155466399, + "loss": 2.5116, + "theoretical_loss": 3.415732146606901, + "tokens_seen": 2105975808 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001828284854563691, + "loss": 2.5761, + "theoretical_loss": 3.4157233738985475, + "tokens_seen": 2106041344 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018281845536609827, + "loss": 2.2672, + "theoretical_loss": 3.415714601539614, + "tokens_seen": 2106106880 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018280842527582748, + "loss": 2.6292, + "theoretical_loss": 3.4157058295300757, + "tokens_seen": 2106172416 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001827983951855567, + "loss": 2.6975, + "theoretical_loss": 3.415697057869908, + "tokens_seen": 2106237952 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018278836509528587, + "loss": 2.6097, + "theoretical_loss": 3.415688286559087, + "tokens_seen": 2106303488 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018277833500501505, + "loss": 2.6386, + "theoretical_loss": 3.4156795155975863, + "tokens_seen": 2106369024 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018276830491474426, + "loss": 2.6162, + "theoretical_loss": 3.4156707449853823, + "tokens_seen": 2106434560 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018275827482447344, + "loss": 2.6353, + "theoretical_loss": 3.4156619747224495, + "tokens_seen": 2106500096 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018274824473420262, + "loss": 2.5724, + "theoretical_loss": 3.4156532048087636, + "tokens_seen": 2106565632 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001827382146439318, + "loss": 2.576, + "theoretical_loss": 3.4156444352442996, + "tokens_seen": 2106631168 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018272818455366098, + "loss": 2.6011, + "theoretical_loss": 3.4156356660290337, + "tokens_seen": 2106696704 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001827181544633902, + "loss": 2.5883, + "theoretical_loss": 3.415626897162939, + "tokens_seen": 2106762240 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018270812437311937, + "loss": 2.6243, + "theoretical_loss": 3.415618128645993, + "tokens_seen": 2106827776 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018269809428284856, + "loss": 2.6609, + "theoretical_loss": 3.415609360478169, + "tokens_seen": 2106893312 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018268806419257774, + "loss": 2.6197, + "theoretical_loss": 3.4156005926594437, + "tokens_seen": 2106958848 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2378664, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.795783281326294, + "objective/train/theoretical_loss": 3.415598400759306, + "objective/train/tokens_used": 2127435232, + "theoretical_loss": 3.415598400759306, + "tokens_seen": 2106975232 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018267803410230694, + "loss": 2.692, + "theoretical_loss": 3.4155918251897917, + "tokens_seen": 2107024384 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018266800401203613, + "loss": 2.4451, + "theoretical_loss": 3.4155830580691884, + "tokens_seen": 2107089920 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001826579739217653, + "loss": 2.6765, + "theoretical_loss": 3.4155742912976086, + "tokens_seen": 2107155456 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001826479438314945, + "loss": 2.5124, + "theoretical_loss": 3.4155655248750287, + "tokens_seen": 2107220992 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018263791374122367, + "loss": 2.6285, + "theoretical_loss": 3.4155567588014226, + "tokens_seen": 2107286528 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018262788365095288, + "loss": 2.5932, + "theoretical_loss": 3.4155479930767663, + "tokens_seen": 2107352064 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018261785356068206, + "loss": 2.5341, + "theoretical_loss": 3.415539227701035, + "tokens_seen": 2107417600 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018260782347041124, + "loss": 2.4823, + "theoretical_loss": 3.415530462674204, + "tokens_seen": 2107483136 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018259779338014042, + "loss": 2.7062, + "theoretical_loss": 3.4155216979962484, + "tokens_seen": 2107548672 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018258776328986963, + "loss": 2.5859, + "theoretical_loss": 3.4155129336671433, + "tokens_seen": 2107614208 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001825777331995988, + "loss": 2.4755, + "theoretical_loss": 3.415504169686864, + "tokens_seen": 2107679744 + }, + { + "epoch": 7.01, + "learning_rate": 0.000182567703109328, + "loss": 2.3585, + "theoretical_loss": 3.4154954060553866, + "tokens_seen": 2107745280 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018255767301905717, + "loss": 2.6922, + "theoretical_loss": 3.4154866427726853, + "tokens_seen": 2107810816 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018254764292878638, + "loss": 2.4035, + "theoretical_loss": 3.4154778798387357, + "tokens_seen": 2107876352 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018253761283851556, + "loss": 2.6631, + "theoretical_loss": 3.4154691172535134, + "tokens_seen": 2107941888 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018252758274824474, + "loss": 2.6011, + "theoretical_loss": 3.415460355016993, + "tokens_seen": 2108007424 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018251755265797392, + "loss": 2.6906, + "theoretical_loss": 3.415451593129151, + "tokens_seen": 2108072960 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001825075225677031, + "loss": 2.4661, + "theoretical_loss": 3.415442831589961, + "tokens_seen": 2108138496 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001824974924774323, + "loss": 2.6536, + "theoretical_loss": 3.4154340703994004, + "tokens_seen": 2108204032 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001824874623871615, + "loss": 2.6209, + "theoretical_loss": 3.4154253095574423, + "tokens_seen": 2108269568 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018247743229689068, + "loss": 2.4655, + "theoretical_loss": 3.415416549064063, + "tokens_seen": 2108335104 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018246740220661986, + "loss": 2.6564, + "theoretical_loss": 3.4154077889192385, + "tokens_seen": 2108400640 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018245737211634906, + "loss": 2.5286, + "theoretical_loss": 3.415399029122943, + "tokens_seen": 2108466176 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018244734202607825, + "loss": 2.4385, + "theoretical_loss": 3.415390269675152, + "tokens_seen": 2108531712 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018243731193580743, + "loss": 2.4843, + "theoretical_loss": 3.415381510575841, + "tokens_seen": 2108597248 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2379466, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8361635208129883, + "objective/train/theoretical_loss": 3.415379320855461, + "objective/train/tokens_used": 2129073632, + "theoretical_loss": 3.415379320855461, + "tokens_seen": 2108613632 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001824272818455366, + "loss": 2.5025, + "theoretical_loss": 3.415372751824986, + "tokens_seen": 2108662784 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001824172517552658, + "loss": 2.6067, + "theoretical_loss": 3.4153639934225613, + "tokens_seen": 2108728320 + }, + { + "epoch": 7.01, + "learning_rate": 0.000182407221664995, + "loss": 2.5564, + "theoretical_loss": 3.4153552353685424, + "tokens_seen": 2108793856 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018239719157472418, + "loss": 2.5548, + "theoretical_loss": 3.4153464776629043, + "tokens_seen": 2108859392 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018238716148445336, + "loss": 2.5553, + "theoretical_loss": 3.415337720305623, + "tokens_seen": 2108924928 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018237713139418254, + "loss": 2.4826, + "theoretical_loss": 3.415328963296674, + "tokens_seen": 2108990464 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018236710130391175, + "loss": 2.6086, + "theoretical_loss": 3.4153202066360318, + "tokens_seen": 2109056000 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018235707121364093, + "loss": 2.3711, + "theoretical_loss": 3.415311450323672, + "tokens_seen": 2109121536 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001823470411233701, + "loss": 2.5063, + "theoretical_loss": 3.4153026943595703, + "tokens_seen": 2109187072 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001823370110330993, + "loss": 2.7648, + "theoretical_loss": 3.4152939387437016, + "tokens_seen": 2109252608 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018232698094282847, + "loss": 2.6031, + "theoretical_loss": 3.415285183476042, + "tokens_seen": 2109318144 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018231695085255768, + "loss": 2.3386, + "theoretical_loss": 3.4152764285565658, + "tokens_seen": 2109383680 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018230692076228686, + "loss": 2.4616, + "theoretical_loss": 3.4152676739852486, + "tokens_seen": 2109449216 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018229689067201604, + "loss": 2.3788, + "theoretical_loss": 3.415258919762066, + "tokens_seen": 2109514752 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018228686058174523, + "loss": 2.5234, + "theoretical_loss": 3.4152501658869934, + "tokens_seen": 2109580288 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018227683049147443, + "loss": 2.5948, + "theoretical_loss": 3.4152414123600057, + "tokens_seen": 2109645824 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018226680040120361, + "loss": 2.3946, + "theoretical_loss": 3.4152326591810787, + "tokens_seen": 2109711360 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001822567703109328, + "loss": 2.5822, + "theoretical_loss": 3.415223906350188, + "tokens_seen": 2109776896 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018224674022066198, + "loss": 2.3705, + "theoretical_loss": 3.4152151538673077, + "tokens_seen": 2109842432 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018223671013039116, + "loss": 2.4482, + "theoretical_loss": 3.4152064017324144, + "tokens_seen": 2109907968 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018222668004012037, + "loss": 2.5579, + "theoretical_loss": 3.4151976499454832, + "tokens_seen": 2109973504 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018221664994984955, + "loss": 2.543, + "theoretical_loss": 3.4151888985064893, + "tokens_seen": 2110039040 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018220661985957873, + "loss": 2.4051, + "theoretical_loss": 3.415180147415408, + "tokens_seen": 2110104576 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001821965897693079, + "loss": 2.4532, + "theoretical_loss": 3.4151713966722146, + "tokens_seen": 2110170112 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018218655967903712, + "loss": 2.4918, + "theoretical_loss": 3.415162646276885, + "tokens_seen": 2110235648 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2381063, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5806539058685303, + "objective/train/theoretical_loss": 3.415160458732403, + "objective/train/tokens_used": 2130712032, + "theoretical_loss": 3.415160458732403, + "tokens_seen": 2110252032 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001821765295887663, + "loss": 2.6499, + "theoretical_loss": 3.415153896229394, + "tokens_seen": 2110301184 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018216649949849548, + "loss": 2.6849, + "theoretical_loss": 3.4151451465297167, + "tokens_seen": 2110366720 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018215646940822466, + "loss": 2.5273, + "theoretical_loss": 3.415136397177829, + "tokens_seen": 2110432256 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018214643931795384, + "loss": 2.4647, + "theoretical_loss": 3.4151276481737067, + "tokens_seen": 2110497792 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018213640922768305, + "loss": 2.6264, + "theoretical_loss": 3.4151188995173243, + "tokens_seen": 2110563328 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018212637913741223, + "loss": 2.5175, + "theoretical_loss": 3.4151101512086575, + "tokens_seen": 2110628864 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001821163490471414, + "loss": 2.5324, + "theoretical_loss": 3.415101403247682, + "tokens_seen": 2110694400 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001821063189568706, + "loss": 2.4516, + "theoretical_loss": 3.415092655634373, + "tokens_seen": 2110759936 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001820962888665998, + "loss": 2.3799, + "theoretical_loss": 3.4150839083687057, + "tokens_seen": 2110825472 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018208625877632898, + "loss": 2.5181, + "theoretical_loss": 3.4150751614506554, + "tokens_seen": 2110891008 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018207622868605816, + "loss": 2.6177, + "theoretical_loss": 3.4150664148801977, + "tokens_seen": 2110956544 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018206619859578735, + "loss": 2.7595, + "theoretical_loss": 3.415057668657308, + "tokens_seen": 2111022080 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018205616850551658, + "loss": 2.5862, + "theoretical_loss": 3.415048922781962, + "tokens_seen": 2111087616 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018204613841524576, + "loss": 2.5921, + "theoretical_loss": 3.4150401772541343, + "tokens_seen": 2111153152 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018203610832497494, + "loss": 2.4102, + "theoretical_loss": 3.4150314320738016, + "tokens_seen": 2111218688 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018202607823470412, + "loss": 2.6297, + "theoretical_loss": 3.4150226872409375, + "tokens_seen": 2111284224 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001820160481444333, + "loss": 2.6018, + "theoretical_loss": 3.415013942755519, + "tokens_seen": 2111349760 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001820060180541625, + "loss": 2.4874, + "theoretical_loss": 3.415005198617521, + "tokens_seen": 2111415296 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001819959879638917, + "loss": 2.5416, + "theoretical_loss": 3.4149964548269187, + "tokens_seen": 2111480832 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018198595787362088, + "loss": 2.5728, + "theoretical_loss": 3.414987711383688, + "tokens_seen": 2111546368 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018197592778335006, + "loss": 2.5681, + "theoretical_loss": 3.4149789682878033, + "tokens_seen": 2111611904 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018196589769307926, + "loss": 2.3723, + "theoretical_loss": 3.414970225539241, + "tokens_seen": 2111677440 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018195586760280845, + "loss": 2.4716, + "theoretical_loss": 3.414961483137976, + "tokens_seen": 2111742976 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018194583751253763, + "loss": 2.4663, + "theoretical_loss": 3.4149527410839844, + "tokens_seen": 2111808512 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001819358074222668, + "loss": 2.628, + "theoretical_loss": 3.414943999377241, + "tokens_seen": 2111874048 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2381797, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.571430206298828, + "objective/train/theoretical_loss": 3.41494181400481, + "objective/train/tokens_used": 2132350432, + "theoretical_loss": 3.41494181400481, + "tokens_seen": 2111890432 + }, + { + "epoch": 7.01, + "learning_rate": 0.000181925777331996, + "loss": 2.4593, + "theoretical_loss": 3.4149352580177217, + "tokens_seen": 2111939584 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001819157472417252, + "loss": 2.7065, + "theoretical_loss": 3.414926517005401, + "tokens_seen": 2112005120 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018190571715145438, + "loss": 2.3608, + "theoretical_loss": 3.414917776340255, + "tokens_seen": 2112070656 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018189568706118356, + "loss": 2.4616, + "theoretical_loss": 3.4149090360222596, + "tokens_seen": 2112136192 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018188565697091274, + "loss": 2.6525, + "theoretical_loss": 3.41490029605139, + "tokens_seen": 2112201728 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018187562688064195, + "loss": 2.6073, + "theoretical_loss": 3.4148915564276203, + "tokens_seen": 2112267264 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018186559679037113, + "loss": 2.6188, + "theoretical_loss": 3.414882817150928, + "tokens_seen": 2112332800 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001818555667001003, + "loss": 2.5228, + "theoretical_loss": 3.4148740782212874, + "tokens_seen": 2112398336 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001818455366098295, + "loss": 2.7502, + "theoretical_loss": 3.414865339638674, + "tokens_seen": 2112463872 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018183550651955867, + "loss": 2.5434, + "theoretical_loss": 3.4148566014030637, + "tokens_seen": 2112529408 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018182547642928788, + "loss": 2.5442, + "theoretical_loss": 3.4148478635144315, + "tokens_seen": 2112594944 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018181544633901706, + "loss": 2.3078, + "theoretical_loss": 3.414839125972753, + "tokens_seen": 2112660480 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018180541624874624, + "loss": 2.7308, + "theoretical_loss": 3.4148303887780034, + "tokens_seen": 2112726016 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018179538615847543, + "loss": 2.8082, + "theoretical_loss": 3.4148216519301586, + "tokens_seen": 2112791552 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018178535606820463, + "loss": 2.5305, + "theoretical_loss": 3.414812915429194, + "tokens_seen": 2112857088 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018177532597793381, + "loss": 2.4508, + "theoretical_loss": 3.4148041792750847, + "tokens_seen": 2112922624 + }, + { + "epoch": 7.01, + "learning_rate": 0.000181765295887663, + "loss": 2.461, + "theoretical_loss": 3.414795443467807, + "tokens_seen": 2112988160 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018175526579739218, + "loss": 2.5504, + "theoretical_loss": 3.4147867080073353, + "tokens_seen": 2113053696 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018174523570712136, + "loss": 2.6899, + "theoretical_loss": 3.414777972893646, + "tokens_seen": 2113119232 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018173520561685057, + "loss": 2.4797, + "theoretical_loss": 3.414769238126714, + "tokens_seen": 2113184768 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018172517552657975, + "loss": 2.4619, + "theoretical_loss": 3.414760503706515, + "tokens_seen": 2113250304 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018171514543630893, + "loss": 2.4306, + "theoretical_loss": 3.4147517696330243, + "tokens_seen": 2113315840 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001817051153460381, + "loss": 2.576, + "theoretical_loss": 3.414743035906218, + "tokens_seen": 2113381376 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018169508525576732, + "loss": 2.5234, + "theoretical_loss": 3.4147343025260706, + "tokens_seen": 2113446912 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001816850551654965, + "loss": 2.6586, + "theoretical_loss": 3.4147255694925582, + "tokens_seen": 2113512448 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2383128, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0670626163482666, + "objective/train/theoretical_loss": 3.414723386288339, + "objective/train/tokens_used": 2133988832, + "theoretical_loss": 3.414723386288339, + "tokens_seen": 2113528832 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018167502507522568, + "loss": 2.5338, + "theoretical_loss": 3.414716836805656, + "tokens_seen": 2113577984 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018166499498495486, + "loss": 2.4081, + "theoretical_loss": 3.4147081044653405, + "tokens_seen": 2113643520 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018165496489468404, + "loss": 2.5988, + "theoretical_loss": 3.4146993724715857, + "tokens_seen": 2113709056 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018164493480441325, + "loss": 2.5153, + "theoretical_loss": 3.414690640824368, + "tokens_seen": 2113774592 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018163490471414243, + "loss": 2.4788, + "theoretical_loss": 3.4146819095236625, + "tokens_seen": 2113840128 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001816248746238716, + "loss": 2.5782, + "theoretical_loss": 3.4146731785694455, + "tokens_seen": 2113905664 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001816148445336008, + "loss": 2.4118, + "theoretical_loss": 3.4146644479616914, + "tokens_seen": 2113971200 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018160481444333, + "loss": 2.5447, + "theoretical_loss": 3.4146557177003762, + "tokens_seen": 2114036736 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018159478435305918, + "loss": 2.5074, + "theoretical_loss": 3.414646987785476, + "tokens_seen": 2114102272 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018158475426278836, + "loss": 2.7804, + "theoretical_loss": 3.414638258216965, + "tokens_seen": 2114167808 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018157472417251755, + "loss": 2.3573, + "theoretical_loss": 3.41462952899482, + "tokens_seen": 2114233344 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018156469408224673, + "loss": 2.599, + "theoretical_loss": 3.414620800119016, + "tokens_seen": 2114298880 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018155466399197593, + "loss": 2.4266, + "theoretical_loss": 3.4146120715895285, + "tokens_seen": 2114364416 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018154463390170512, + "loss": 2.3268, + "theoretical_loss": 3.414603343406333, + "tokens_seen": 2114429952 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001815346038114343, + "loss": 2.4465, + "theoretical_loss": 3.4145946155694054, + "tokens_seen": 2114495488 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018152457372116348, + "loss": 2.4741, + "theoretical_loss": 3.4145858880787205, + "tokens_seen": 2114561024 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018151454363089269, + "loss": 2.5861, + "theoretical_loss": 3.414577160934254, + "tokens_seen": 2114626560 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018150451354062187, + "loss": 2.5166, + "theoretical_loss": 3.4145684341359823, + "tokens_seen": 2114692096 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018149448345035105, + "loss": 2.5255, + "theoretical_loss": 3.41455970768388, + "tokens_seen": 2114757632 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018148445336008023, + "loss": 2.473, + "theoretical_loss": 3.414550981577923, + "tokens_seen": 2114823168 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001814744232698094, + "loss": 2.6244, + "theoretical_loss": 3.4145422558180867, + "tokens_seen": 2114888704 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018146439317953862, + "loss": 2.6086, + "theoretical_loss": 3.4145335304043467, + "tokens_seen": 2114954240 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001814543630892678, + "loss": 2.5223, + "theoretical_loss": 3.414524805336679, + "tokens_seen": 2115019776 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018144433299899698, + "loss": 2.5581, + "theoretical_loss": 3.4145160806150585, + "tokens_seen": 2115085312 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018143430290872616, + "loss": 2.6515, + "theoretical_loss": 3.4145073562394606, + "tokens_seen": 2115150848 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2383795, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7746195793151855, + "objective/train/theoretical_loss": 3.4145051751996247, + "objective/train/tokens_used": 2135627232, + "theoretical_loss": 3.4145051751996247, + "tokens_seen": 2115167232 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018142427281845537, + "loss": 2.4599, + "theoretical_loss": 3.4144986322098623, + "tokens_seen": 2115216384 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018141424272818455, + "loss": 2.3289, + "theoretical_loss": 3.414489908526237, + "tokens_seen": 2115281920 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018140421263791373, + "loss": 2.5132, + "theoretical_loss": 3.414481185188562, + "tokens_seen": 2115347456 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018139418254764291, + "loss": 2.3336, + "theoretical_loss": 3.4144724621968123, + "tokens_seen": 2115412992 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018138415245737212, + "loss": 2.413, + "theoretical_loss": 3.414463739550963, + "tokens_seen": 2115478528 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001813741223671013, + "loss": 2.488, + "theoretical_loss": 3.4144550172509907, + "tokens_seen": 2115544064 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018136409227683048, + "loss": 2.6921, + "theoretical_loss": 3.41444629529687, + "tokens_seen": 2115609600 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018135406218655967, + "loss": 2.8189, + "theoretical_loss": 3.414437573688577, + "tokens_seen": 2115675136 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018134403209628885, + "loss": 2.603, + "theoretical_loss": 3.4144288524260866, + "tokens_seen": 2115740672 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018133400200601805, + "loss": 2.7074, + "theoretical_loss": 3.414420131509375, + "tokens_seen": 2115806208 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018132397191574724, + "loss": 2.6366, + "theoretical_loss": 3.414411410938418, + "tokens_seen": 2115871744 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018131394182547642, + "loss": 2.6307, + "theoretical_loss": 3.4144026907131906, + "tokens_seen": 2115937280 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018130391173520563, + "loss": 2.5594, + "theoretical_loss": 3.414393970833669, + "tokens_seen": 2116002816 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018129388164493483, + "loss": 2.4242, + "theoretical_loss": 3.4143852512998283, + "tokens_seen": 2116068352 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018128385155466401, + "loss": 2.5224, + "theoretical_loss": 3.414376532111644, + "tokens_seen": 2116133888 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001812738214643932, + "loss": 2.2968, + "theoretical_loss": 3.414367813269092, + "tokens_seen": 2116199424 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018126379137412238, + "loss": 2.6919, + "theoretical_loss": 3.4143590947721476, + "tokens_seen": 2116264960 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018125376128385156, + "loss": 2.4704, + "theoretical_loss": 3.4143503766207868, + "tokens_seen": 2116330496 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018124373119358077, + "loss": 2.352, + "theoretical_loss": 3.414341658814985, + "tokens_seen": 2116396032 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018123370110330995, + "loss": 2.7685, + "theoretical_loss": 3.4143329413547177, + "tokens_seen": 2116461568 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018122367101303913, + "loss": 2.3649, + "theoretical_loss": 3.414324224239961, + "tokens_seen": 2116527104 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001812136409227683, + "loss": 2.4179, + "theoretical_loss": 3.4143155074706897, + "tokens_seen": 2116592640 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018120361083249752, + "loss": 2.6813, + "theoretical_loss": 3.4143067910468803, + "tokens_seen": 2116658176 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001811935807422267, + "loss": 2.4458, + "theoretical_loss": 3.4142980749685075, + "tokens_seen": 2116723712 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018118355065195588, + "loss": 2.4784, + "theoretical_loss": 3.414289359235547, + "tokens_seen": 2116789248 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2385327, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8846988677978516, + "objective/train/theoretical_loss": 3.414287180356275, + "objective/train/tokens_used": 2137265632, + "theoretical_loss": 3.414287180356275, + "tokens_seen": 2116805632 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018117352056168506, + "loss": 2.6953, + "theoretical_loss": 3.4142806438479756, + "tokens_seen": 2116854784 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018116349047141424, + "loss": 2.6359, + "theoretical_loss": 3.4142719288057677, + "tokens_seen": 2116920320 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018115346038114345, + "loss": 2.6596, + "theoretical_loss": 3.414263214108899, + "tokens_seen": 2116985856 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018114343029087263, + "loss": 2.4877, + "theoretical_loss": 3.414254499757346, + "tokens_seen": 2117051392 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001811334002006018, + "loss": 2.5685, + "theoretical_loss": 3.4142457857510835, + "tokens_seen": 2117116928 + }, + { + "epoch": 7.01, + "learning_rate": 0.000181123370110331, + "loss": 2.358, + "theoretical_loss": 3.4142370720900876, + "tokens_seen": 2117182464 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001811133400200602, + "loss": 2.4018, + "theoretical_loss": 3.4142283587743334, + "tokens_seen": 2117248000 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018110330992978938, + "loss": 2.328, + "theoretical_loss": 3.414219645803797, + "tokens_seen": 2117313536 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018109327983951856, + "loss": 2.4702, + "theoretical_loss": 3.4142109331784543, + "tokens_seen": 2117379072 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018108324974924775, + "loss": 2.4035, + "theoretical_loss": 3.4142022208982796, + "tokens_seen": 2117444608 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018107321965897693, + "loss": 2.6994, + "theoretical_loss": 3.4141935089632502, + "tokens_seen": 2117510144 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018106318956870613, + "loss": 2.5863, + "theoretical_loss": 3.4141847973733412, + "tokens_seen": 2117575680 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018105315947843532, + "loss": 2.4547, + "theoretical_loss": 3.4141760861285277, + "tokens_seen": 2117641216 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001810431293881645, + "loss": 2.4293, + "theoretical_loss": 3.4141673752287858, + "tokens_seen": 2117706752 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018103309929789368, + "loss": 2.4692, + "theoretical_loss": 3.4141586646740913, + "tokens_seen": 2117772288 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001810230692076229, + "loss": 2.5699, + "theoretical_loss": 3.4141499544644196, + "tokens_seen": 2117837824 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018101303911735207, + "loss": 2.6265, + "theoretical_loss": 3.414141244599746, + "tokens_seen": 2117903360 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018100300902708125, + "loss": 2.5108, + "theoretical_loss": 3.414132535080047, + "tokens_seen": 2117968896 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018099297893681043, + "loss": 2.6404, + "theoretical_loss": 3.414123825905298, + "tokens_seen": 2118034432 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001809829488465396, + "loss": 2.4612, + "theoretical_loss": 3.4141151170754735, + "tokens_seen": 2118099968 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018097291875626882, + "loss": 2.4334, + "theoretical_loss": 3.414106408590551, + "tokens_seen": 2118165504 + }, + { + "epoch": 7.01, + "learning_rate": 0.000180962888665998, + "loss": 2.4661, + "theoretical_loss": 3.4140977004505055, + "tokens_seen": 2118231040 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018095285857572718, + "loss": 2.4247, + "theoretical_loss": 3.414088992655312, + "tokens_seen": 2118296576 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018094282848545636, + "loss": 2.2405, + "theoretical_loss": 3.414080285204947, + "tokens_seen": 2118362112 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018093279839518557, + "loss": 2.5177, + "theoretical_loss": 3.4140715780993856, + "tokens_seen": 2118427648 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2386066, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.109576463699341, + "objective/train/theoretical_loss": 3.4140694013768678, + "objective/train/tokens_used": 2138904032, + "theoretical_loss": 3.4140694013768678, + "tokens_seen": 2118444032 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018092276830491475, + "loss": 2.6017, + "theoretical_loss": 3.4140628713386034, + "tokens_seen": 2118493184 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018091273821464393, + "loss": 2.4209, + "theoretical_loss": 3.414054164922577, + "tokens_seen": 2118558720 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018090270812437311, + "loss": 2.3596, + "theoretical_loss": 3.414045458851281, + "tokens_seen": 2118624256 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018089267803410232, + "loss": 2.6975, + "theoretical_loss": 3.4140367531246922, + "tokens_seen": 2118689792 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001808826479438315, + "loss": 2.4986, + "theoretical_loss": 3.414028047742786, + "tokens_seen": 2118755328 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018087261785356068, + "loss": 2.5378, + "theoretical_loss": 3.4140193427055365, + "tokens_seen": 2118820864 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018086258776328987, + "loss": 2.3145, + "theoretical_loss": 3.4140106380129214, + "tokens_seen": 2118886400 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018085255767301905, + "loss": 2.6195, + "theoretical_loss": 3.414001933664916, + "tokens_seen": 2118951936 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018084252758274826, + "loss": 2.5342, + "theoretical_loss": 3.4139932296614948, + "tokens_seen": 2119017472 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018083249749247744, + "loss": 2.3709, + "theoretical_loss": 3.413984526002635, + "tokens_seen": 2119083008 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018082246740220662, + "loss": 2.5574, + "theoretical_loss": 3.4139758226883115, + "tokens_seen": 2119148544 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001808124373119358, + "loss": 2.5273, + "theoretical_loss": 3.4139671197185, + "tokens_seen": 2119214080 + }, + { + "epoch": 7.01, + "learning_rate": 0.000180802407221665, + "loss": 2.7271, + "theoretical_loss": 3.4139584170931765, + "tokens_seen": 2119279616 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001807923771313942, + "loss": 2.6878, + "theoretical_loss": 3.4139497148123166, + "tokens_seen": 2119345152 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018078234704112337, + "loss": 2.4596, + "theoretical_loss": 3.4139410128758962, + "tokens_seen": 2119410688 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018077231695085255, + "loss": 2.6672, + "theoretical_loss": 3.4139323112838906, + "tokens_seen": 2119476224 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018076228686058173, + "loss": 2.6532, + "theoretical_loss": 3.4139236100362758, + "tokens_seen": 2119541760 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018075225677031094, + "loss": 2.5302, + "theoretical_loss": 3.4139149091330276, + "tokens_seen": 2119607296 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018074222668004012, + "loss": 2.6498, + "theoretical_loss": 3.4139062085741214, + "tokens_seen": 2119672832 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001807321965897693, + "loss": 2.4591, + "theoretical_loss": 3.413897508359533, + "tokens_seen": 2119738368 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018072216649949848, + "loss": 2.4544, + "theoretical_loss": 3.4138888084892383, + "tokens_seen": 2119803904 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001807121364092277, + "loss": 2.5812, + "theoretical_loss": 3.4138801089632134, + "tokens_seen": 2119869440 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018070210631895687, + "loss": 2.5454, + "theoretical_loss": 3.413871409781433, + "tokens_seen": 2119934976 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018069207622868605, + "loss": 2.5917, + "theoretical_loss": 3.413862710943874, + "tokens_seen": 2120000512 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018068204613841523, + "loss": 2.5789, + "theoretical_loss": 3.4138540124505115, + "tokens_seen": 2120066048 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2387687, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.261141777038574, + "objective/train/theoretical_loss": 3.4138518378809484, + "objective/train/tokens_used": 2140542432, + "theoretical_loss": 3.4138518378809484, + "tokens_seen": 2120082432 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018067201604814442, + "loss": 2.5424, + "theoretical_loss": 3.413845314301321, + "tokens_seen": 2120131584 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018066198595787362, + "loss": 2.7159, + "theoretical_loss": 3.413836616496279, + "tokens_seen": 2120197120 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001806519558676028, + "loss": 2.409, + "theoretical_loss": 3.41382791903536, + "tokens_seen": 2120262656 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018064192577733199, + "loss": 2.3312, + "theoretical_loss": 3.4138192219185415, + "tokens_seen": 2120328192 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018063189568706117, + "loss": 2.7992, + "theoretical_loss": 3.4138105251457977, + "tokens_seen": 2120393728 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018062186559679038, + "loss": 2.5864, + "theoretical_loss": 3.413801828717106, + "tokens_seen": 2120459264 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018061183550651956, + "loss": 2.5174, + "theoretical_loss": 3.41379313263244, + "tokens_seen": 2120524800 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018060180541624874, + "loss": 2.6103, + "theoretical_loss": 3.4137844368917767, + "tokens_seen": 2120590336 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018059177532597792, + "loss": 2.5445, + "theoretical_loss": 3.413775741495092, + "tokens_seen": 2120655872 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001805817452357071, + "loss": 2.6561, + "theoretical_loss": 3.413767046442362, + "tokens_seen": 2120721408 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001805717151454363, + "loss": 2.6245, + "theoretical_loss": 3.413758351733561, + "tokens_seen": 2120786944 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001805616850551655, + "loss": 2.4965, + "theoretical_loss": 3.4137496573686663, + "tokens_seen": 2120852480 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001805516549648947, + "loss": 2.4162, + "theoretical_loss": 3.4137409633476525, + "tokens_seen": 2120918016 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018054162487462388, + "loss": 2.5758, + "theoretical_loss": 3.413732269670496, + "tokens_seen": 2120983552 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001805315947843531, + "loss": 2.5278, + "theoretical_loss": 3.4137235763371727, + "tokens_seen": 2121049088 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018052156469408227, + "loss": 2.6752, + "theoretical_loss": 3.413714883347658, + "tokens_seen": 2121114624 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018051153460381145, + "loss": 2.6832, + "theoretical_loss": 3.4137061907019275, + "tokens_seen": 2121180160 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018050150451354063, + "loss": 2.6145, + "theoretical_loss": 3.4136974983999577, + "tokens_seen": 2121245696 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001804914744232698, + "loss": 2.6429, + "theoretical_loss": 3.413688806441724, + "tokens_seen": 2121311232 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018048144433299902, + "loss": 2.5686, + "theoretical_loss": 3.4136801148272022, + "tokens_seen": 2121376768 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001804714142427282, + "loss": 2.4892, + "theoretical_loss": 3.4136714235563685, + "tokens_seen": 2121442304 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018046138415245738, + "loss": 2.5585, + "theoretical_loss": 3.4136627326291977, + "tokens_seen": 2121507840 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018045135406218656, + "loss": 2.4973, + "theoretical_loss": 3.413654042045666, + "tokens_seen": 2121573376 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018044132397191577, + "loss": 2.3226, + "theoretical_loss": 3.4136453518057497, + "tokens_seen": 2121638912 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018043129388164495, + "loss": 2.5376, + "theoretical_loss": 3.4136366619094245, + "tokens_seen": 2121704448 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2388389, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.552199602127075, + "objective/train/theoretical_loss": 3.4136344894890263, + "objective/train/tokens_used": 2142180832, + "theoretical_loss": 3.4136344894890263, + "tokens_seen": 2121720832 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018042126379137413, + "loss": 2.4449, + "theoretical_loss": 3.4136279723566654, + "tokens_seen": 2121769984 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018041123370110331, + "loss": 2.4756, + "theoretical_loss": 3.4136192831474492, + "tokens_seen": 2121835520 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018040120361083252, + "loss": 2.5792, + "theoretical_loss": 3.413610594281751, + "tokens_seen": 2121901056 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001803911735205617, + "loss": 2.7169, + "theoretical_loss": 3.4136019057595473, + "tokens_seen": 2121966592 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018038114343029088, + "loss": 2.5856, + "theoretical_loss": 3.4135932175808126, + "tokens_seen": 2122032128 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018037111334002007, + "loss": 2.6853, + "theoretical_loss": 3.4135845297455245, + "tokens_seen": 2122097664 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018036108324974925, + "loss": 2.3778, + "theoretical_loss": 3.4135758422536577, + "tokens_seen": 2122163200 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018035105315947846, + "loss": 2.4258, + "theoretical_loss": 3.4135671551051883, + "tokens_seen": 2122228736 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018034102306920764, + "loss": 2.5639, + "theoretical_loss": 3.413558468300092, + "tokens_seen": 2122294272 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018033099297893682, + "loss": 2.3817, + "theoretical_loss": 3.413549781838345, + "tokens_seen": 2122359808 + }, + { + "epoch": 7.01, + "learning_rate": 0.000180320962888666, + "loss": 2.4018, + "theoretical_loss": 3.413541095719922, + "tokens_seen": 2122425344 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001803109327983952, + "loss": 2.3246, + "theoretical_loss": 3.413532409944801, + "tokens_seen": 2122490880 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001803009027081244, + "loss": 2.6682, + "theoretical_loss": 3.413523724512955, + "tokens_seen": 2122556416 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018029087261785357, + "loss": 2.554, + "theoretical_loss": 3.4135150394243623, + "tokens_seen": 2122621952 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018028084252758275, + "loss": 2.5737, + "theoretical_loss": 3.413506354678997, + "tokens_seen": 2122687488 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018027081243731193, + "loss": 2.6743, + "theoretical_loss": 3.4134976702768363, + "tokens_seen": 2122753024 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018026078234704114, + "loss": 2.5442, + "theoretical_loss": 3.413488986217856, + "tokens_seen": 2122818560 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018025075225677032, + "loss": 2.4533, + "theoretical_loss": 3.4134803025020304, + "tokens_seen": 2122884096 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001802407221664995, + "loss": 2.6092, + "theoretical_loss": 3.4134716191293366, + "tokens_seen": 2122949632 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018023069207622868, + "loss": 2.3158, + "theoretical_loss": 3.4134629360997506, + "tokens_seen": 2123015168 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001802206619859579, + "loss": 2.7701, + "theoretical_loss": 3.4134542534132475, + "tokens_seen": 2123080704 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018021063189568707, + "loss": 2.4785, + "theoretical_loss": 3.4134455710698033, + "tokens_seen": 2123146240 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018020060180541625, + "loss": 2.4197, + "theoretical_loss": 3.413436889069394, + "tokens_seen": 2123211776 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018019057171514543, + "loss": 2.4526, + "theoretical_loss": 3.413428207411996, + "tokens_seen": 2123277312 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018018054162487462, + "loss": 2.4657, + "theoretical_loss": 3.4134195260975844, + "tokens_seen": 2123342848 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2389084, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3248581886291504, + "objective/train/theoretical_loss": 3.4134173558225704, + "objective/train/tokens_used": 2143819232, + "theoretical_loss": 3.4134173558225704, + "tokens_seen": 2123359232 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018017051153460382, + "loss": 2.2586, + "theoretical_loss": 3.413410845126135, + "tokens_seen": 2123408384 + }, + { + "epoch": 7.01, + "learning_rate": 0.000180160481444333, + "loss": 2.6453, + "theoretical_loss": 3.413402164497625, + "tokens_seen": 2123473920 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018015045135406219, + "loss": 2.3263, + "theoretical_loss": 3.413393484212029, + "tokens_seen": 2123539456 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018014042126379137, + "loss": 2.4194, + "theoretical_loss": 3.413384804269322, + "tokens_seen": 2123604992 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018013039117352058, + "loss": 2.4192, + "theoretical_loss": 3.4133761246694823, + "tokens_seen": 2123670528 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018012036108324976, + "loss": 2.3334, + "theoretical_loss": 3.413367445412484, + "tokens_seen": 2123736064 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018011033099297894, + "loss": 2.6724, + "theoretical_loss": 3.4133587664983036, + "tokens_seen": 2123801600 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018010030090270812, + "loss": 2.4439, + "theoretical_loss": 3.4133500879269167, + "tokens_seen": 2123867136 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001800902708124373, + "loss": 2.6018, + "theoretical_loss": 3.4133414096982992, + "tokens_seen": 2123932672 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001800802407221665, + "loss": 2.5375, + "theoretical_loss": 3.413332731812427, + "tokens_seen": 2123998208 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001800702106318957, + "loss": 2.5439, + "theoretical_loss": 3.413324054269277, + "tokens_seen": 2124063744 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018006018054162487, + "loss": 2.681, + "theoretical_loss": 3.413315377068823, + "tokens_seen": 2124129280 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018005015045135405, + "loss": 2.3332, + "theoretical_loss": 3.4133067002110433, + "tokens_seen": 2124194816 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018004012036108326, + "loss": 2.4606, + "theoretical_loss": 3.413298023695912, + "tokens_seen": 2124260352 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018003009027081244, + "loss": 2.7636, + "theoretical_loss": 3.4132893475234054, + "tokens_seen": 2124325888 + }, + { + "epoch": 7.01, + "learning_rate": 0.00018002006018054162, + "loss": 2.3242, + "theoretical_loss": 3.4132806716935, + "tokens_seen": 2124391424 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001800100300902708, + "loss": 2.4928, + "theoretical_loss": 3.413271996206171, + "tokens_seen": 2124456960 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017999999999999998, + "loss": 2.477, + "theoretical_loss": 3.413263321061395, + "tokens_seen": 2124522496 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001799899699097292, + "loss": 2.5043, + "theoretical_loss": 3.413254646259147, + "tokens_seen": 2124588032 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017997993981945837, + "loss": 2.4606, + "theoretical_loss": 3.4132459717994035, + "tokens_seen": 2124653568 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017996990972918755, + "loss": 2.4555, + "theoretical_loss": 3.413237297682141, + "tokens_seen": 2124719104 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017995987963891674, + "loss": 2.4352, + "theoretical_loss": 3.413228623907334, + "tokens_seen": 2124784640 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017994984954864594, + "loss": 2.3616, + "theoretical_loss": 3.413219950474959, + "tokens_seen": 2124850176 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017993981945837513, + "loss": 2.4241, + "theoretical_loss": 3.4132112773849923, + "tokens_seen": 2124915712 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001799297893681043, + "loss": 2.5619, + "theoretical_loss": 3.41320260463741, + "tokens_seen": 2124981248 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2390381, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.788762331008911, + "objective/train/theoretical_loss": 3.4132004365040087, + "objective/train/tokens_used": 2145457632, + "theoretical_loss": 3.4132004365040087, + "tokens_seen": 2124997632 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001799197592778335, + "loss": 2.5733, + "theoretical_loss": 3.413193932232187, + "tokens_seen": 2125046784 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017990972918756267, + "loss": 2.5679, + "theoretical_loss": 3.4131852601693007, + "tokens_seen": 2125112320 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017989969909729188, + "loss": 2.4584, + "theoretical_loss": 3.4131765884487257, + "tokens_seen": 2125177856 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017988966900702106, + "loss": 2.6, + "theoretical_loss": 3.4131679170704383, + "tokens_seen": 2125243392 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017987963891675024, + "loss": 2.4475, + "theoretical_loss": 3.4131592460344145, + "tokens_seen": 2125308928 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017986960882647942, + "loss": 2.6812, + "theoretical_loss": 3.413150575340631, + "tokens_seen": 2125374464 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017985957873620863, + "loss": 2.65, + "theoretical_loss": 3.4131419049890623, + "tokens_seen": 2125440000 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001798495486459378, + "loss": 2.5139, + "theoretical_loss": 3.4131332349796857, + "tokens_seen": 2125505536 + }, + { + "epoch": 7.01, + "learning_rate": 0.000179839518555667, + "loss": 2.747, + "theoretical_loss": 3.4131245653124758, + "tokens_seen": 2125571072 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017982948846539617, + "loss": 2.4409, + "theoretical_loss": 3.41311589598741, + "tokens_seen": 2125636608 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017981945837512538, + "loss": 2.6485, + "theoretical_loss": 3.413107227004463, + "tokens_seen": 2125702144 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017980942828485456, + "loss": 2.4273, + "theoretical_loss": 3.413098558363611, + "tokens_seen": 2125767680 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017979939819458377, + "loss": 2.4764, + "theoretical_loss": 3.413089890064831, + "tokens_seen": 2125833216 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017978936810431295, + "loss": 2.4084, + "theoretical_loss": 3.4130812221080977, + "tokens_seen": 2125898752 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017977933801404213, + "loss": 2.5487, + "theoretical_loss": 3.4130725544933878, + "tokens_seen": 2125964288 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017976930792377134, + "loss": 2.4049, + "theoretical_loss": 3.4130638872206767, + "tokens_seen": 2126029824 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017975927783350052, + "loss": 2.737, + "theoretical_loss": 3.413055220289941, + "tokens_seen": 2126095360 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001797492477432297, + "loss": 2.5743, + "theoretical_loss": 3.413046553701156, + "tokens_seen": 2126160896 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017973921765295888, + "loss": 2.4361, + "theoretical_loss": 3.413037887454298, + "tokens_seen": 2126226432 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001797291875626881, + "loss": 2.3477, + "theoretical_loss": 3.413029221549343, + "tokens_seen": 2126291968 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017971915747241727, + "loss": 2.4319, + "theoretical_loss": 3.413020555986267, + "tokens_seen": 2126357504 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017970912738214645, + "loss": 2.5689, + "theoretical_loss": 3.4130118907650466, + "tokens_seen": 2126423040 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017969909729187563, + "loss": 2.7105, + "theoretical_loss": 3.413003225885656, + "tokens_seen": 2126488576 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017968906720160482, + "loss": 2.453, + "theoretical_loss": 3.412994561348073, + "tokens_seen": 2126554112 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017967903711133402, + "loss": 2.4506, + "theoretical_loss": 3.4129858971522724, + "tokens_seen": 2126619648 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 2391921, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5840580463409424, + "objective/train/theoretical_loss": 3.4129837311567233, + "objective/train/tokens_used": 2147096032, + "theoretical_loss": 3.4129837311567233, + "tokens_seen": 2126636032 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001796690070210632, + "loss": 2.4481, + "theoretical_loss": 3.412977233298231, + "tokens_seen": 2126685184 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017965897693079239, + "loss": 2.5934, + "theoretical_loss": 3.4129685697859244, + "tokens_seen": 2126750720 + }, + { + "epoch": 7.01, + "learning_rate": 0.00017964894684052157, + "loss": 2.5368, + "theoretical_loss": 3.4129599066153284, + "tokens_seen": 2126816256 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017963891675025078, + "loss": 2.6252, + "theoretical_loss": 3.4129512437864196, + "tokens_seen": 2126881792 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017962888665997996, + "loss": 2.6648, + "theoretical_loss": 3.4129425812991734, + "tokens_seen": 2126947328 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017961885656970914, + "loss": 2.6119, + "theoretical_loss": 3.4129339191535664, + "tokens_seen": 2127012864 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017960882647943832, + "loss": 2.667, + "theoretical_loss": 3.4129252573495736, + "tokens_seen": 2127078400 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001795987963891675, + "loss": 2.5008, + "theoretical_loss": 3.4129165958871717, + "tokens_seen": 2127143936 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001795887662988967, + "loss": 2.5035, + "theoretical_loss": 3.412907934766337, + "tokens_seen": 2127209472 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001795787362086259, + "loss": 2.4751, + "theoretical_loss": 3.412899273987045, + "tokens_seen": 2127275008 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017956870611835507, + "loss": 2.667, + "theoretical_loss": 3.412890613549272, + "tokens_seen": 2127340544 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017955867602808425, + "loss": 2.4158, + "theoretical_loss": 3.412881953452993, + "tokens_seen": 2127406080 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017954864593781346, + "loss": 2.5268, + "theoretical_loss": 3.412873293698186, + "tokens_seen": 2127471616 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017953861584754264, + "loss": 2.5105, + "theoretical_loss": 3.412864634284825, + "tokens_seen": 2127537152 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017952858575727182, + "loss": 2.5331, + "theoretical_loss": 3.4128559752128877, + "tokens_seen": 2127602688 + }, + { + "epoch": 7.02, + "learning_rate": 0.000179518555667001, + "loss": 2.3807, + "theoretical_loss": 3.4128473164823485, + "tokens_seen": 2127668224 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017950852557673018, + "loss": 2.4996, + "theoretical_loss": 3.412838658093185, + "tokens_seen": 2127733760 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001794984954864594, + "loss": 2.5431, + "theoretical_loss": 3.412830000045372, + "tokens_seen": 2127799296 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017948846539618857, + "loss": 2.4717, + "theoretical_loss": 3.4128213423388862, + "tokens_seen": 2127864832 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017947843530591775, + "loss": 2.5594, + "theoretical_loss": 3.4128126849737033, + "tokens_seen": 2127930368 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017946840521564694, + "loss": 2.5085, + "theoretical_loss": 3.4128040279497993, + "tokens_seen": 2127995904 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017945837512537614, + "loss": 2.3308, + "theoretical_loss": 3.4127953712671513, + "tokens_seen": 2128061440 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017944834503510533, + "loss": 2.4545, + "theoretical_loss": 3.4127867149257334, + "tokens_seen": 2128126976 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001794383149448345, + "loss": 2.4402, + "theoretical_loss": 3.4127780589255234, + "tokens_seen": 2128192512 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001794282848545637, + "loss": 2.3708, + "theoretical_loss": 3.4127694032664966, + "tokens_seen": 2128258048 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2392693, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5955708026885986, + "objective/train/theoretical_loss": 3.4127672394050466, + "objective/train/tokens_used": 2148734432, + "theoretical_loss": 3.4127672394050466, + "tokens_seen": 2128274432 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017941825476429287, + "loss": 2.5765, + "theoretical_loss": 3.412760747948629, + "tokens_seen": 2128323584 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017940822467402208, + "loss": 2.4445, + "theoretical_loss": 3.412752092971896, + "tokens_seen": 2128389120 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017939819458375126, + "loss": 2.5669, + "theoretical_loss": 3.412743438336275, + "tokens_seen": 2128454656 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017938816449348044, + "loss": 2.5609, + "theoretical_loss": 3.4127347840417417, + "tokens_seen": 2128520192 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017937813440320962, + "loss": 2.6275, + "theoretical_loss": 3.412726130088272, + "tokens_seen": 2128585728 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017936810431293883, + "loss": 2.5745, + "theoretical_loss": 3.412717476475841, + "tokens_seen": 2128651264 + }, + { + "epoch": 7.02, + "learning_rate": 0.000179358074222668, + "loss": 2.6615, + "theoretical_loss": 3.4127088232044263, + "tokens_seen": 2128716800 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001793480441323972, + "loss": 2.6483, + "theoretical_loss": 3.412700170274003, + "tokens_seen": 2128782336 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017933801404212637, + "loss": 2.4508, + "theoretical_loss": 3.4126915176845474, + "tokens_seen": 2128847872 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017932798395185558, + "loss": 2.6335, + "theoretical_loss": 3.412682865436036, + "tokens_seen": 2128913408 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017931795386158476, + "loss": 2.4028, + "theoretical_loss": 3.4126742135284442, + "tokens_seen": 2128978944 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017930792377131394, + "loss": 2.6348, + "theoretical_loss": 3.4126655619617483, + "tokens_seen": 2129044480 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017929789368104312, + "loss": 2.4914, + "theoretical_loss": 3.4126569107359246, + "tokens_seen": 2129110016 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001792878635907723, + "loss": 2.6069, + "theoretical_loss": 3.4126482598509487, + "tokens_seen": 2129175552 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001792778335005015, + "loss": 2.4966, + "theoretical_loss": 3.4126396093067974, + "tokens_seen": 2129241088 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001792678034102307, + "loss": 2.5292, + "theoretical_loss": 3.412630959103446, + "tokens_seen": 2129306624 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017925777331995988, + "loss": 2.5283, + "theoretical_loss": 3.4126223092408714, + "tokens_seen": 2129372160 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017924774322968906, + "loss": 2.5528, + "theoretical_loss": 3.4126136597190486, + "tokens_seen": 2129437696 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017923771313941826, + "loss": 2.6329, + "theoretical_loss": 3.412605010537955, + "tokens_seen": 2129503232 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017922768304914745, + "loss": 2.7534, + "theoretical_loss": 3.412596361697566, + "tokens_seen": 2129568768 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017921765295887663, + "loss": 2.3817, + "theoretical_loss": 3.4125877131978575, + "tokens_seen": 2129634304 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001792076228686058, + "loss": 2.7482, + "theoretical_loss": 3.4125790650388055, + "tokens_seen": 2129699840 + }, + { + "epoch": 7.02, + "learning_rate": 0.000179197592778335, + "loss": 2.3997, + "theoretical_loss": 3.412570417220387, + "tokens_seen": 2129765376 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001791875626880642, + "loss": 2.6196, + "theoretical_loss": 3.4125617697425774, + "tokens_seen": 2129830912 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017917753259779338, + "loss": 2.3384, + "theoretical_loss": 3.4125531226053525, + "tokens_seen": 2129896448 + }, + { + "debugging/Self-BLEU-5": 0.5780644449118214, + "debugging/distinct-1-grams": 0.7692140282720603, + "debugging/distinct-2-grams": 0.9478638479934645, + "debugging/entropy-1-grams": 6.194560159806141, + "debugging/entropy-2-grams": 7.355767246334368, + "debugging/length": 485.03846153846155, + "debugging/num_segments": 26, + "debugging/score": 0.0009396677529348355, + "debugging/score_std": 0.0017049085980240229, + "epoch": 7.02, + "objective/train/docs_used": 2393414, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.685330629348755, + "objective/train/theoretical_loss": 3.4125509608742597, + "objective/train/tokens_used": 2150372832, + "theoretical_loss": 3.4125509608742597, + "tokens_seen": 2129912832 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017916750250752256, + "loss": 2.4491, + "theoretical_loss": 3.412544475808689, + "tokens_seen": 2129961984 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017915747241725174, + "loss": 2.5455, + "theoretical_loss": 3.4125358293525627, + "tokens_seen": 2130027520 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017914744232698095, + "loss": 2.5059, + "theoretical_loss": 3.4125271832369504, + "tokens_seen": 2130093056 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017913741223671013, + "loss": 2.7456, + "theoretical_loss": 3.4125185374618274, + "tokens_seen": 2130158592 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001791273821464393, + "loss": 2.5444, + "theoretical_loss": 3.4125098920271704, + "tokens_seen": 2130224128 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001791173520561685, + "loss": 2.5994, + "theoretical_loss": 3.4125012469329548, + "tokens_seen": 2130289664 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017910732196589767, + "loss": 2.455, + "theoretical_loss": 3.4124926021791566, + "tokens_seen": 2130355200 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017909729187562688, + "loss": 2.5194, + "theoretical_loss": 3.4124839577657533, + "tokens_seen": 2130420736 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017908726178535606, + "loss": 2.5228, + "theoretical_loss": 3.41247531369272, + "tokens_seen": 2130486272 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017907723169508524, + "loss": 2.5231, + "theoretical_loss": 3.4124666699600326, + "tokens_seen": 2130551808 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017906720160481442, + "loss": 2.4305, + "theoretical_loss": 3.412458026567668, + "tokens_seen": 2130617344 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017905717151454363, + "loss": 2.5617, + "theoretical_loss": 3.412449383515602, + "tokens_seen": 2130682880 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017904714142427284, + "loss": 2.6146, + "theoretical_loss": 3.4124407408038104, + "tokens_seen": 2130748416 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017903711133400202, + "loss": 2.4336, + "theoretical_loss": 3.4124320984322702, + "tokens_seen": 2130813952 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001790270812437312, + "loss": 2.4181, + "theoretical_loss": 3.4124234564009566, + "tokens_seen": 2130879488 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017901705115346038, + "loss": 2.3691, + "theoretical_loss": 3.4124148147098463, + "tokens_seen": 2130945024 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001790070210631896, + "loss": 2.7253, + "theoretical_loss": 3.412406173358915, + "tokens_seen": 2131010560 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017899699097291877, + "loss": 2.765, + "theoretical_loss": 3.4123975323481393, + "tokens_seen": 2131076096 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017898696088264796, + "loss": 2.589, + "theoretical_loss": 3.4123888916774954, + "tokens_seen": 2131141632 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017897693079237714, + "loss": 2.3239, + "theoretical_loss": 3.412380251346959, + "tokens_seen": 2131207168 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017896690070210634, + "loss": 2.4992, + "theoretical_loss": 3.412371611356506, + "tokens_seen": 2131272704 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017895687061183553, + "loss": 2.3037, + "theoretical_loss": 3.4123629717061137, + "tokens_seen": 2131338240 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001789468405215647, + "loss": 2.5073, + "theoretical_loss": 3.4123543323957573, + "tokens_seen": 2131403776 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001789368104312939, + "loss": 2.3137, + "theoretical_loss": 3.4123456934254133, + "tokens_seen": 2131469312 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017892678034102307, + "loss": 2.6743, + "theoretical_loss": 3.4123370547950573, + "tokens_seen": 2131534848 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2394683, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.9020442962646484, + "objective/train/theoretical_loss": 3.412334895190589, + "objective/train/tokens_used": 2152011232, + "theoretical_loss": 3.412334895190589, + "tokens_seen": 2131551232 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017891675025075228, + "loss": 2.22, + "theoretical_loss": 3.412328416504667, + "tokens_seen": 2131600384 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017890672016048146, + "loss": 2.6275, + "theoretical_loss": 3.4123197785542168, + "tokens_seen": 2131665920 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017889669007021064, + "loss": 2.5273, + "theoretical_loss": 3.412311140943684, + "tokens_seen": 2131731456 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017888665997993982, + "loss": 2.7012, + "theoretical_loss": 3.412302503673044, + "tokens_seen": 2131796992 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017887662988966903, + "loss": 2.6138, + "theoretical_loss": 3.412293866742274, + "tokens_seen": 2131862528 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001788665997993982, + "loss": 2.4054, + "theoretical_loss": 3.4122852301513493, + "tokens_seen": 2131928064 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001788565697091274, + "loss": 2.6141, + "theoretical_loss": 3.412276593900246, + "tokens_seen": 2131993600 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017884653961885657, + "loss": 2.4204, + "theoretical_loss": 3.4122679579889414, + "tokens_seen": 2132059136 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017883650952858578, + "loss": 2.785, + "theoretical_loss": 3.41225932241741, + "tokens_seen": 2132124672 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017882647943831496, + "loss": 2.4517, + "theoretical_loss": 3.4122506871856295, + "tokens_seen": 2132190208 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017881644934804414, + "loss": 2.3207, + "theoretical_loss": 3.4122420522935752, + "tokens_seen": 2132255744 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017880641925777332, + "loss": 2.7074, + "theoretical_loss": 3.4122334177412235, + "tokens_seen": 2132321280 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001787963891675025, + "loss": 2.6167, + "theoretical_loss": 3.412224783528551, + "tokens_seen": 2132386816 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001787863590772317, + "loss": 2.7056, + "theoretical_loss": 3.412216149655533, + "tokens_seen": 2132452352 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001787763289869609, + "loss": 2.5731, + "theoretical_loss": 3.4122075161221472, + "tokens_seen": 2132517888 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017876629889669008, + "loss": 2.4097, + "theoretical_loss": 3.4121988829283687, + "tokens_seen": 2132583424 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017875626880641926, + "loss": 2.3843, + "theoretical_loss": 3.4121902500741736, + "tokens_seen": 2132648960 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017874623871614846, + "loss": 2.4421, + "theoretical_loss": 3.412181617559538, + "tokens_seen": 2132714496 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017873620862587765, + "loss": 2.4413, + "theoretical_loss": 3.412172985384439, + "tokens_seen": 2132780032 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017872617853560683, + "loss": 2.4957, + "theoretical_loss": 3.412164353548852, + "tokens_seen": 2132845568 + }, + { + "epoch": 7.02, + "learning_rate": 0.000178716148445336, + "loss": 2.5543, + "theoretical_loss": 3.412155722052754, + "tokens_seen": 2132911104 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001787061183550652, + "loss": 2.5735, + "theoretical_loss": 3.4121470908961204, + "tokens_seen": 2132976640 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001786960882647944, + "loss": 2.4059, + "theoretical_loss": 3.4121384600789275, + "tokens_seen": 2133042176 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017868605817452358, + "loss": 2.5933, + "theoretical_loss": 3.4121298296011524, + "tokens_seen": 2133107712 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017867602808425276, + "loss": 2.5251, + "theoretical_loss": 3.4121211994627707, + "tokens_seen": 2133173248 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2395294, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.331108570098877, + "objective/train/theoretical_loss": 3.412119041981202, + "objective/train/tokens_used": 2153649632, + "theoretical_loss": 3.412119041981202, + "tokens_seen": 2133189632 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017866599799398194, + "loss": 2.5092, + "theoretical_loss": 3.4121125696637584, + "tokens_seen": 2133238784 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017865596790371115, + "loss": 2.469, + "theoretical_loss": 3.4121039402040916, + "tokens_seen": 2133304320 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017864593781344033, + "loss": 2.4121, + "theoretical_loss": 3.4120953110837475, + "tokens_seen": 2133369856 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001786359077231695, + "loss": 2.558, + "theoretical_loss": 3.4120866823027014, + "tokens_seen": 2133435392 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001786258776328987, + "loss": 2.6409, + "theoretical_loss": 3.41207805386093, + "tokens_seen": 2133500928 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017861584754262787, + "loss": 2.5681, + "theoretical_loss": 3.4120694257584097, + "tokens_seen": 2133566464 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017860581745235708, + "loss": 2.6484, + "theoretical_loss": 3.412060797995116, + "tokens_seen": 2133632000 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017859578736208626, + "loss": 2.4195, + "theoretical_loss": 3.412052170571026, + "tokens_seen": 2133697536 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017858575727181544, + "loss": 2.3959, + "theoretical_loss": 3.412043543486115, + "tokens_seen": 2133763072 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017857572718154462, + "loss": 2.6663, + "theoretical_loss": 3.4120349167403603, + "tokens_seen": 2133828608 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017856569709127383, + "loss": 2.4926, + "theoretical_loss": 3.4120262903337375, + "tokens_seen": 2133894144 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017855566700100301, + "loss": 2.4712, + "theoretical_loss": 3.412017664266223, + "tokens_seen": 2133959680 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001785456369107322, + "loss": 2.5968, + "theoretical_loss": 3.412009038537793, + "tokens_seen": 2134025216 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017853560682046138, + "loss": 2.4654, + "theoretical_loss": 3.412000413148424, + "tokens_seen": 2134090752 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017852557673019056, + "loss": 2.7613, + "theoretical_loss": 3.4119917880980912, + "tokens_seen": 2134156288 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017851554663991977, + "loss": 2.4522, + "theoretical_loss": 3.4119831633867728, + "tokens_seen": 2134221824 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017850551654964895, + "loss": 2.5328, + "theoretical_loss": 3.4119745390144436, + "tokens_seen": 2134287360 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017849548645937813, + "loss": 2.5419, + "theoretical_loss": 3.41196591498108, + "tokens_seen": 2134352896 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001784854563691073, + "loss": 2.5436, + "theoretical_loss": 3.411957291286659, + "tokens_seen": 2134418432 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017847542627883652, + "loss": 2.5708, + "theoretical_loss": 3.4119486679311564, + "tokens_seen": 2134483968 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001784653961885657, + "loss": 2.4682, + "theoretical_loss": 3.411940044914548, + "tokens_seen": 2134549504 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017845536609829488, + "loss": 2.5701, + "theoretical_loss": 3.411931422236811, + "tokens_seen": 2134615040 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017844533600802406, + "loss": 2.438, + "theoretical_loss": 3.4119227998979214, + "tokens_seen": 2134680576 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017843530591775324, + "loss": 2.4359, + "theoretical_loss": 3.411914177897855, + "tokens_seen": 2134746112 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017842527582748245, + "loss": 2.6921, + "theoretical_loss": 3.4119055562365883, + "tokens_seen": 2134811648 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2396857, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2103617191314697, + "objective/train/theoretical_loss": 3.4119034008742064, + "objective/train/tokens_used": 2155288032, + "theoretical_loss": 3.4119034008742064, + "tokens_seen": 2134828032 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017841524573721163, + "loss": 2.4878, + "theoretical_loss": 3.4118969349140977, + "tokens_seen": 2134877184 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001784052156469408, + "loss": 2.5679, + "theoretical_loss": 3.41188831393036, + "tokens_seen": 2134942720 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017839518555667, + "loss": 2.7064, + "theoretical_loss": 3.411879693285351, + "tokens_seen": 2135008256 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001783851554663992, + "loss": 2.2935, + "theoretical_loss": 3.4118710729790465, + "tokens_seen": 2135073792 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017837512537612838, + "loss": 2.4977, + "theoretical_loss": 3.411862453011423, + "tokens_seen": 2135139328 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017836509528585756, + "loss": 2.259, + "theoretical_loss": 3.4118538333824575, + "tokens_seen": 2135204864 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017835506519558675, + "loss": 2.5873, + "theoretical_loss": 3.411845214092126, + "tokens_seen": 2135270400 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017834503510531593, + "loss": 2.3719, + "theoretical_loss": 3.4118365951404046, + "tokens_seen": 2135335936 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017833500501504513, + "loss": 2.5504, + "theoretical_loss": 3.4118279765272694, + "tokens_seen": 2135401472 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017832497492477432, + "loss": 2.5789, + "theoretical_loss": 3.411819358252697, + "tokens_seen": 2135467008 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001783149448345035, + "loss": 2.3634, + "theoretical_loss": 3.411810740316664, + "tokens_seen": 2135532544 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017830491474423268, + "loss": 2.496, + "theoretical_loss": 3.411802122719146, + "tokens_seen": 2135598080 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001782948846539619, + "loss": 2.7935, + "theoretical_loss": 3.4117935054601203, + "tokens_seen": 2135663616 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001782848545636911, + "loss": 2.6067, + "theoretical_loss": 3.411784888539562, + "tokens_seen": 2135729152 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017827482447342028, + "loss": 2.4004, + "theoretical_loss": 3.4117762719574483, + "tokens_seen": 2135794688 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017826479438314946, + "loss": 2.4555, + "theoretical_loss": 3.4117676557137555, + "tokens_seen": 2135860224 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017825476429287866, + "loss": 2.7207, + "theoretical_loss": 3.4117590398084596, + "tokens_seen": 2135925760 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017824473420260785, + "loss": 2.4721, + "theoretical_loss": 3.411750424241537, + "tokens_seen": 2135991296 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017823470411233703, + "loss": 2.4252, + "theoretical_loss": 3.4117418090129643, + "tokens_seen": 2136056832 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001782246740220662, + "loss": 2.3903, + "theoretical_loss": 3.411733194122717, + "tokens_seen": 2136122368 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001782146439317954, + "loss": 2.2234, + "theoretical_loss": 3.4117245795707727, + "tokens_seen": 2136187904 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001782046138415246, + "loss": 2.6066, + "theoretical_loss": 3.4117159653571068, + "tokens_seen": 2136253440 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017819458375125378, + "loss": 2.6177, + "theoretical_loss": 3.411707351481696, + "tokens_seen": 2136318976 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017818455366098296, + "loss": 2.8105, + "theoretical_loss": 3.4116987379445165, + "tokens_seen": 2136384512 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017817452357071214, + "loss": 2.5195, + "theoretical_loss": 3.4116901247455447, + "tokens_seen": 2136450048 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2397487, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.489830732345581, + "objective/train/theoretical_loss": 3.411687971498644, + "objective/train/tokens_used": 2156926432, + "theoretical_loss": 3.411687971498644, + "tokens_seen": 2136466432 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017816449348044135, + "loss": 2.4359, + "theoretical_loss": 3.4116815118847565, + "tokens_seen": 2136515584 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017815446339017053, + "loss": 2.5381, + "theoretical_loss": 3.411672899362129, + "tokens_seen": 2136581120 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001781444332998997, + "loss": 2.7308, + "theoretical_loss": 3.4116642871776386, + "tokens_seen": 2136646656 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001781344032096289, + "loss": 2.322, + "theoretical_loss": 3.411655675331261, + "tokens_seen": 2136712192 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017812437311935807, + "loss": 2.509, + "theoretical_loss": 3.411647063822973, + "tokens_seen": 2136777728 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017811434302908728, + "loss": 2.531, + "theoretical_loss": 3.411638452652751, + "tokens_seen": 2136843264 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017810431293881646, + "loss": 2.428, + "theoretical_loss": 3.41162984182057, + "tokens_seen": 2136908800 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017809428284854564, + "loss": 2.885, + "theoretical_loss": 3.411621231326409, + "tokens_seen": 2136974336 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017808425275827483, + "loss": 2.5861, + "theoretical_loss": 3.411612621170242, + "tokens_seen": 2137039872 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017807422266800403, + "loss": 2.5649, + "theoretical_loss": 3.4116040113520465, + "tokens_seen": 2137105408 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017806419257773321, + "loss": 2.4704, + "theoretical_loss": 3.411595401871799, + "tokens_seen": 2137170944 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001780541624874624, + "loss": 2.3792, + "theoretical_loss": 3.4115867927294747, + "tokens_seen": 2137236480 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017804413239719158, + "loss": 2.5176, + "theoretical_loss": 3.4115781839250516, + "tokens_seen": 2137302016 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017803410230692076, + "loss": 2.5573, + "theoretical_loss": 3.411569575458505, + "tokens_seen": 2137367552 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017802407221664997, + "loss": 2.4015, + "theoretical_loss": 3.4115609673298115, + "tokens_seen": 2137433088 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017801404212637915, + "loss": 2.3605, + "theoretical_loss": 3.4115523595389474, + "tokens_seen": 2137498624 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017800401203610833, + "loss": 2.5328, + "theoretical_loss": 3.4115437520858896, + "tokens_seen": 2137564160 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001779939819458375, + "loss": 2.5333, + "theoretical_loss": 3.4115351449706135, + "tokens_seen": 2137629696 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017798395185556672, + "loss": 2.4909, + "theoretical_loss": 3.4115265381930966, + "tokens_seen": 2137695232 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001779739217652959, + "loss": 2.556, + "theoretical_loss": 3.4115179317533144, + "tokens_seen": 2137760768 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017796389167502508, + "loss": 2.4426, + "theoretical_loss": 3.411509325651244, + "tokens_seen": 2137826304 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017795386158475426, + "loss": 2.6881, + "theoretical_loss": 3.4115007198868614, + "tokens_seen": 2137891840 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017794383149448344, + "loss": 2.6499, + "theoretical_loss": 3.4114921144601427, + "tokens_seen": 2137957376 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017793380140421265, + "loss": 2.4935, + "theoretical_loss": 3.411483509371065, + "tokens_seen": 2138022912 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017792377131394183, + "loss": 2.5771, + "theoretical_loss": 3.4114749046196042, + "tokens_seen": 2138088448 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2398844, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.478889226913452, + "objective/train/theoretical_loss": 3.411472753484489, + "objective/train/tokens_used": 2158564832, + "theoretical_loss": 3.411472753484489, + "tokens_seen": 2138104832 + }, + { + "epoch": 7.02, + "learning_rate": 0.000177913741223671, + "loss": 2.5824, + "theoretical_loss": 3.411466300205737, + "tokens_seen": 2138153984 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001779037111334002, + "loss": 2.4355, + "theoretical_loss": 3.41145769612944, + "tokens_seen": 2138219520 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001778936810431294, + "loss": 2.6232, + "theoretical_loss": 3.4114490923906886, + "tokens_seen": 2138285056 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017788365095285858, + "loss": 2.5285, + "theoretical_loss": 3.4114404889894603, + "tokens_seen": 2138350592 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017787362086258776, + "loss": 2.632, + "theoretical_loss": 3.4114318859257313, + "tokens_seen": 2138416128 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017786359077231695, + "loss": 2.4681, + "theoretical_loss": 3.4114232831994773, + "tokens_seen": 2138481664 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017785356068204613, + "loss": 2.5891, + "theoretical_loss": 3.411414680810676, + "tokens_seen": 2138547200 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017784353059177533, + "loss": 2.2514, + "theoretical_loss": 3.4114060787593026, + "tokens_seen": 2138612736 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017783350050150452, + "loss": 2.6213, + "theoretical_loss": 3.411397477045334, + "tokens_seen": 2138678272 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001778234704112337, + "loss": 2.5377, + "theoretical_loss": 3.411388875668747, + "tokens_seen": 2138743808 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017781344032096288, + "loss": 2.3683, + "theoretical_loss": 3.4113802746295168, + "tokens_seen": 2138809344 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017780341023069209, + "loss": 2.3649, + "theoretical_loss": 3.4113716739276216, + "tokens_seen": 2138874880 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017779338014042127, + "loss": 2.3816, + "theoretical_loss": 3.4113630735630363, + "tokens_seen": 2138940416 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017778335005015045, + "loss": 2.5758, + "theoretical_loss": 3.411354473535739, + "tokens_seen": 2139005952 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017777331995987963, + "loss": 2.4829, + "theoretical_loss": 3.411345873845704, + "tokens_seen": 2139071488 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001777632898696088, + "loss": 2.3607, + "theoretical_loss": 3.411337274492909, + "tokens_seen": 2139137024 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017775325977933802, + "loss": 2.7108, + "theoretical_loss": 3.4113286754773307, + "tokens_seen": 2139202560 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001777432296890672, + "loss": 2.5505, + "theoretical_loss": 3.411320076798945, + "tokens_seen": 2139268096 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017773319959879638, + "loss": 2.5193, + "theoretical_loss": 3.411311478457728, + "tokens_seen": 2139333632 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017772316950852556, + "loss": 2.6744, + "theoretical_loss": 3.4113028804536576, + "tokens_seen": 2139399168 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017771313941825477, + "loss": 2.4483, + "theoretical_loss": 3.4112942827867085, + "tokens_seen": 2139464704 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017770310932798395, + "loss": 2.5497, + "theoretical_loss": 3.4112856854568583, + "tokens_seen": 2139530240 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017769307923771313, + "loss": 2.3169, + "theoretical_loss": 3.4112770884640833, + "tokens_seen": 2139595776 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017768304914744231, + "loss": 2.4063, + "theoretical_loss": 3.411268491808359, + "tokens_seen": 2139661312 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017767301905717152, + "loss": 2.3223, + "theoretical_loss": 3.411259895489663, + "tokens_seen": 2139726848 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2399407, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3298182487487793, + "objective/train/theoretical_loss": 3.4112577464626472, + "objective/train/tokens_used": 2160203232, + "theoretical_loss": 3.4112577464626472, + "tokens_seen": 2139743232 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001776629889669007, + "loss": 2.3507, + "theoretical_loss": 3.411251299507972, + "tokens_seen": 2139792384 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017765295887662988, + "loss": 2.5886, + "theoretical_loss": 3.411242703863261, + "tokens_seen": 2139857920 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017764292878635907, + "loss": 2.7023, + "theoretical_loss": 3.411234108555508, + "tokens_seen": 2139923456 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017763289869608825, + "loss": 2.6424, + "theoretical_loss": 3.4112255135846885, + "tokens_seen": 2139988992 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017762286860581745, + "loss": 2.5415, + "theoretical_loss": 3.411216918950779, + "tokens_seen": 2140054528 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017761283851554664, + "loss": 2.3646, + "theoretical_loss": 3.4112083246537566, + "tokens_seen": 2140120064 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017760280842527582, + "loss": 2.5852, + "theoretical_loss": 3.411199730693597, + "tokens_seen": 2140185600 + }, + { + "epoch": 7.02, + "learning_rate": 0.000177592778335005, + "loss": 2.5588, + "theoretical_loss": 3.411191137070278, + "tokens_seen": 2140251136 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001775827482447342, + "loss": 2.6798, + "theoretical_loss": 3.4111825437837746, + "tokens_seen": 2140316672 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001775727181544634, + "loss": 2.619, + "theoretical_loss": 3.4111739508340637, + "tokens_seen": 2140382208 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017756268806419257, + "loss": 2.3189, + "theoretical_loss": 3.4111653582211225, + "tokens_seen": 2140447744 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017755265797392175, + "loss": 2.7718, + "theoretical_loss": 3.4111567659449262, + "tokens_seen": 2140513280 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017754262788365096, + "loss": 2.628, + "theoretical_loss": 3.4111481740054526, + "tokens_seen": 2140578816 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017753259779338017, + "loss": 2.613, + "theoretical_loss": 3.4111395824026776, + "tokens_seen": 2140644352 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017752256770310935, + "loss": 2.6408, + "theoretical_loss": 3.411130991136578, + "tokens_seen": 2140709888 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017751253761283853, + "loss": 2.5016, + "theoretical_loss": 3.4111224002071294, + "tokens_seen": 2140775424 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001775025075225677, + "loss": 2.6689, + "theoretical_loss": 3.411113809614309, + "tokens_seen": 2140840960 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017749247743229692, + "loss": 2.5894, + "theoretical_loss": 3.411105219358094, + "tokens_seen": 2140906496 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001774824473420261, + "loss": 2.4645, + "theoretical_loss": 3.41109662943846, + "tokens_seen": 2140972032 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017747241725175528, + "loss": 2.5633, + "theoretical_loss": 3.4110880398553833, + "tokens_seen": 2141037568 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017746238716148446, + "loss": 2.455, + "theoretical_loss": 3.411079450608841, + "tokens_seen": 2141103104 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017745235707121364, + "loss": 2.3731, + "theoretical_loss": 3.411070861698809, + "tokens_seen": 2141168640 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017744232698094285, + "loss": 2.52, + "theoretical_loss": 3.411062273125265, + "tokens_seen": 2141234176 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017743229689067203, + "loss": 2.6776, + "theoretical_loss": 3.4110536848881843, + "tokens_seen": 2141299712 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001774222668004012, + "loss": 2.5815, + "theoretical_loss": 3.4110450969875434, + "tokens_seen": 2141365248 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2400889, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.823261022567749, + "objective/train/theoretical_loss": 3.4110429500649495, + "objective/train/tokens_used": 2161841632, + "theoretical_loss": 3.4110429500649495, + "tokens_seen": 2141381632 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001774122367101304, + "loss": 2.6692, + "theoretical_loss": 3.4110365094233197, + "tokens_seen": 2141430784 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001774022066198596, + "loss": 2.6624, + "theoretical_loss": 3.4110279221954896, + "tokens_seen": 2141496320 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017739217652958878, + "loss": 2.5251, + "theoretical_loss": 3.4110193353040286, + "tokens_seen": 2141561856 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017738214643931796, + "loss": 2.5704, + "theoretical_loss": 3.4110107487489145, + "tokens_seen": 2141627392 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017737211634904715, + "loss": 2.2809, + "theoretical_loss": 3.4110021625301234, + "tokens_seen": 2141692928 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017736208625877633, + "loss": 2.4456, + "theoretical_loss": 3.4109935766476314, + "tokens_seen": 2141758464 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017735205616850553, + "loss": 2.4886, + "theoretical_loss": 3.410984991101416, + "tokens_seen": 2141824000 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017734202607823472, + "loss": 2.4463, + "theoretical_loss": 3.4109764058914527, + "tokens_seen": 2141889536 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001773319959879639, + "loss": 2.4821, + "theoretical_loss": 3.410967821017718, + "tokens_seen": 2141955072 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017732196589769308, + "loss": 2.6568, + "theoretical_loss": 3.4109592364801893, + "tokens_seen": 2142020608 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017731193580742229, + "loss": 2.4482, + "theoretical_loss": 3.410950652278843, + "tokens_seen": 2142086144 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017730190571715147, + "loss": 2.6374, + "theoretical_loss": 3.410942068413655, + "tokens_seen": 2142151680 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017729187562688065, + "loss": 2.6423, + "theoretical_loss": 3.4109334848846027, + "tokens_seen": 2142217216 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017728184553660983, + "loss": 2.667, + "theoretical_loss": 3.410924901691662, + "tokens_seen": 2142282752 + }, + { + "epoch": 7.02, + "learning_rate": 0.000177271815446339, + "loss": 2.3315, + "theoretical_loss": 3.4109163188348095, + "tokens_seen": 2142348288 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017726178535606822, + "loss": 2.5662, + "theoretical_loss": 3.410907736314022, + "tokens_seen": 2142413824 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001772517552657974, + "loss": 2.5799, + "theoretical_loss": 3.410899154129276, + "tokens_seen": 2142479360 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017724172517552658, + "loss": 2.4375, + "theoretical_loss": 3.410890572280548, + "tokens_seen": 2142544896 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017723169508525576, + "loss": 2.4626, + "theoretical_loss": 3.410881990767815, + "tokens_seen": 2142610432 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017722166499498497, + "loss": 2.7091, + "theoretical_loss": 3.4108734095910527, + "tokens_seen": 2142675968 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017721163490471415, + "loss": 2.4796, + "theoretical_loss": 3.4108648287502383, + "tokens_seen": 2142741504 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017720160481444333, + "loss": 2.3291, + "theoretical_loss": 3.4108562482453486, + "tokens_seen": 2142807040 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017719157472417251, + "loss": 2.5571, + "theoretical_loss": 3.410847668076359, + "tokens_seen": 2142872576 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017718154463390172, + "loss": 2.7483, + "theoretical_loss": 3.4108390882432476, + "tokens_seen": 2142938112 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001771715145436309, + "loss": 2.6117, + "theoretical_loss": 3.41083050874599, + "tokens_seen": 2143003648 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2401731, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.869143009185791, + "objective/train/theoretical_loss": 3.41082836392415, + "objective/train/tokens_used": 2163480032, + "theoretical_loss": 3.41082836392415, + "tokens_seen": 2143020032 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017716148445336008, + "loss": 2.697, + "theoretical_loss": 3.4108219295845634, + "tokens_seen": 2143069184 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017715145436308927, + "loss": 2.5967, + "theoretical_loss": 3.4108133507589438, + "tokens_seen": 2143134720 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017714142427281845, + "loss": 2.7742, + "theoretical_loss": 3.410804772269108, + "tokens_seen": 2143200256 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017713139418254765, + "loss": 2.4043, + "theoretical_loss": 3.410796194115032, + "tokens_seen": 2143265792 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017712136409227684, + "loss": 2.5846, + "theoretical_loss": 3.4107876162966937, + "tokens_seen": 2143331328 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017711133400200602, + "loss": 2.4789, + "theoretical_loss": 3.410779038814069, + "tokens_seen": 2143396864 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001771013039117352, + "loss": 2.7235, + "theoretical_loss": 3.410770461667134, + "tokens_seen": 2143462400 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001770912738214644, + "loss": 2.4698, + "theoretical_loss": 3.4107618848558667, + "tokens_seen": 2143527936 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001770812437311936, + "loss": 2.3621, + "theoretical_loss": 3.410753308380242, + "tokens_seen": 2143593472 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017707121364092277, + "loss": 2.438, + "theoretical_loss": 3.4107447322402376, + "tokens_seen": 2143659008 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017706118355065195, + "loss": 2.4921, + "theoretical_loss": 3.4107361564358296, + "tokens_seen": 2143724544 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017705115346038113, + "loss": 2.4935, + "theoretical_loss": 3.410727580966995, + "tokens_seen": 2143790080 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017704112337011034, + "loss": 2.6835, + "theoretical_loss": 3.4107190058337107, + "tokens_seen": 2143855616 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017703109327983952, + "loss": 2.4639, + "theoretical_loss": 3.410710431035952, + "tokens_seen": 2143921152 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001770210631895687, + "loss": 2.6445, + "theoretical_loss": 3.410701856573697, + "tokens_seen": 2143986688 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017701103309929788, + "loss": 2.4527, + "theoretical_loss": 3.410693282446921, + "tokens_seen": 2144052224 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001770010030090271, + "loss": 2.4351, + "theoretical_loss": 3.410684708655602, + "tokens_seen": 2144117760 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017699097291875627, + "loss": 2.4845, + "theoretical_loss": 3.410676135199716, + "tokens_seen": 2144183296 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017698094282848545, + "loss": 2.4586, + "theoretical_loss": 3.410667562079239, + "tokens_seen": 2144248832 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017697091273821463, + "loss": 2.5081, + "theoretical_loss": 3.4106589892941477, + "tokens_seen": 2144314368 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017696088264794382, + "loss": 2.6296, + "theoretical_loss": 3.4106504168444203, + "tokens_seen": 2144379904 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017695085255767302, + "loss": 2.8459, + "theoretical_loss": 3.4106418447300317, + "tokens_seen": 2144445440 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001769408224674022, + "loss": 2.4157, + "theoretical_loss": 3.4106332729509594, + "tokens_seen": 2144510976 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017693079237713139, + "loss": 2.6314, + "theoretical_loss": 3.41062470150718, + "tokens_seen": 2144576512 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017692076228686057, + "loss": 2.3643, + "theoretical_loss": 3.4106161303986693, + "tokens_seen": 2144642048 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2403087, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.166632652282715, + "objective/train/theoretical_loss": 3.4106139876739245, + "objective/train/tokens_used": 2165118432, + "theoretical_loss": 3.4106139876739245, + "tokens_seen": 2144658432 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017691073219658978, + "loss": 2.4327, + "theoretical_loss": 3.410607559625405, + "tokens_seen": 2144707584 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017690070210631896, + "loss": 2.4478, + "theoretical_loss": 3.4105989891873634, + "tokens_seen": 2144773120 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017689067201604814, + "loss": 2.5275, + "theoretical_loss": 3.410590419084521, + "tokens_seen": 2144838656 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017688064192577732, + "loss": 2.6009, + "theoretical_loss": 3.4105818493168543, + "tokens_seen": 2144904192 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001768706118355065, + "loss": 2.572, + "theoretical_loss": 3.41057327988434, + "tokens_seen": 2144969728 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001768605817452357, + "loss": 2.4561, + "theoretical_loss": 3.4105647107869554, + "tokens_seen": 2145035264 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001768505516549649, + "loss": 2.7079, + "theoretical_loss": 3.4105561420246766, + "tokens_seen": 2145100800 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017684052156469407, + "loss": 2.5578, + "theoretical_loss": 3.41054757359748, + "tokens_seen": 2145166336 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017683049147442325, + "loss": 2.5507, + "theoretical_loss": 3.4105390055053433, + "tokens_seen": 2145231872 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017682046138415246, + "loss": 2.5855, + "theoretical_loss": 3.4105304377482417, + "tokens_seen": 2145297408 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017681043129388164, + "loss": 2.386, + "theoretical_loss": 3.4105218703261535, + "tokens_seen": 2145362944 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017680040120361085, + "loss": 2.7283, + "theoretical_loss": 3.4105133032390538, + "tokens_seen": 2145428480 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017679037111334003, + "loss": 2.6471, + "theoretical_loss": 3.41050473648692, + "tokens_seen": 2145494016 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001767803410230692, + "loss": 2.7268, + "theoretical_loss": 3.410496170069729, + "tokens_seen": 2145559552 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017677031093279842, + "loss": 2.4896, + "theoretical_loss": 3.410487603987457, + "tokens_seen": 2145625088 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001767602808425276, + "loss": 2.5397, + "theoretical_loss": 3.410479038240081, + "tokens_seen": 2145690624 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017675025075225678, + "loss": 2.5076, + "theoretical_loss": 3.410470472827577, + "tokens_seen": 2145756160 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017674022066198596, + "loss": 2.5776, + "theoretical_loss": 3.410461907749923, + "tokens_seen": 2145821696 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017673019057171517, + "loss": 2.5306, + "theoretical_loss": 3.410453343007095, + "tokens_seen": 2145887232 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017672016048144435, + "loss": 2.7759, + "theoretical_loss": 3.4104447785990692, + "tokens_seen": 2145952768 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017671013039117353, + "loss": 2.4757, + "theoretical_loss": 3.4104362145258227, + "tokens_seen": 2146018304 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017670010030090271, + "loss": 2.7373, + "theoretical_loss": 3.410427650787333, + "tokens_seen": 2146083840 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017669007021063192, + "loss": 2.7976, + "theoretical_loss": 3.4104190873835747, + "tokens_seen": 2146149376 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001766800401203611, + "loss": 2.5736, + "theoretical_loss": 3.4104105243145266, + "tokens_seen": 2146214912 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017667001003009028, + "loss": 2.5735, + "theoretical_loss": 3.410401961580164, + "tokens_seen": 2146280448 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2404412, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.564814329147339, + "objective/train/theoretical_loss": 3.4103998209488657, + "objective/train/tokens_used": 2166756832, + "theoretical_loss": 3.4103998209488657, + "tokens_seen": 2146296832 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017665997993981947, + "loss": 2.4816, + "theoretical_loss": 3.4103933991804647, + "tokens_seen": 2146345984 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017664994984954865, + "loss": 2.6518, + "theoretical_loss": 3.410384837115405, + "tokens_seen": 2146411520 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017663991975927786, + "loss": 2.6211, + "theoretical_loss": 3.4103762753849614, + "tokens_seen": 2146477056 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017662988966900704, + "loss": 2.5583, + "theoretical_loss": 3.4103677139891104, + "tokens_seen": 2146542592 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017661985957873622, + "loss": 2.5391, + "theoretical_loss": 3.410359152927829, + "tokens_seen": 2146608128 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001766098294884654, + "loss": 2.3571, + "theoretical_loss": 3.4103505922010937, + "tokens_seen": 2146673664 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001765997993981946, + "loss": 2.5706, + "theoretical_loss": 3.4103420318088817, + "tokens_seen": 2146739200 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001765897693079238, + "loss": 2.5773, + "theoretical_loss": 3.41033347175117, + "tokens_seen": 2146804736 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017657973921765297, + "loss": 2.6391, + "theoretical_loss": 3.410324912027934, + "tokens_seen": 2146870272 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017656970912738215, + "loss": 2.3381, + "theoretical_loss": 3.4103163526391516, + "tokens_seen": 2146935808 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017655967903711133, + "loss": 2.5497, + "theoretical_loss": 3.410307793584799, + "tokens_seen": 2147001344 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017654964894684054, + "loss": 2.6113, + "theoretical_loss": 3.410299234864853, + "tokens_seen": 2147066880 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017653961885656972, + "loss": 2.3307, + "theoretical_loss": 3.41029067647929, + "tokens_seen": 2147132416 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001765295887662989, + "loss": 2.4814, + "theoretical_loss": 3.410282118428088, + "tokens_seen": 2147197952 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017651955867602808, + "loss": 2.5155, + "theoretical_loss": 3.4102735607112225, + "tokens_seen": 2147263488 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001765095285857573, + "loss": 2.5381, + "theoretical_loss": 3.4102650033286697, + "tokens_seen": 2147329024 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017649949849548647, + "loss": 2.6204, + "theoretical_loss": 3.410256446280408, + "tokens_seen": 2147394560 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017648946840521565, + "loss": 2.5085, + "theoretical_loss": 3.4102478895664134, + "tokens_seen": 2147460096 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017647943831494483, + "loss": 2.5501, + "theoretical_loss": 3.410239333186662, + "tokens_seen": 2147525632 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017646940822467402, + "loss": 2.597, + "theoretical_loss": 3.4102307771411318, + "tokens_seen": 2147591168 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017645937813440322, + "loss": 2.5889, + "theoretical_loss": 3.4102222214297986, + "tokens_seen": 2147656704 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001764493480441324, + "loss": 2.6034, + "theoretical_loss": 3.4102136660526394, + "tokens_seen": 2147722240 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017643931795386159, + "loss": 2.6646, + "theoretical_loss": 3.4102051110096308, + "tokens_seen": 2147787776 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017642928786359077, + "loss": 2.3768, + "theoretical_loss": 3.41019655630075, + "tokens_seen": 2147853312 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017641925777331998, + "loss": 2.5911, + "theoretical_loss": 3.410188001925973, + "tokens_seen": 2147918848 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2405087, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.652845859527588, + "objective/train/theoretical_loss": 3.41018586338448, + "objective/train/tokens_used": 2168395232, + "theoretical_loss": 3.41018586338448, + "tokens_seen": 2147935232 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017640922768304916, + "loss": 2.8381, + "theoretical_loss": 3.4101794478852776, + "tokens_seen": 2147984384 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017639919759277834, + "loss": 2.5703, + "theoretical_loss": 3.4101708941786395, + "tokens_seen": 2148049920 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017638916750250752, + "loss": 2.6487, + "theoretical_loss": 3.4101623408060364, + "tokens_seen": 2148115456 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001763791374122367, + "loss": 2.495, + "theoretical_loss": 3.4101537877674444, + "tokens_seen": 2148180992 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001763691073219659, + "loss": 2.5075, + "theoretical_loss": 3.4101452350628403, + "tokens_seen": 2148246528 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001763590772316951, + "loss": 2.7205, + "theoretical_loss": 3.410136682692202, + "tokens_seen": 2148312064 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017634904714142427, + "loss": 2.5611, + "theoretical_loss": 3.4101281306555045, + "tokens_seen": 2148377600 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017633901705115345, + "loss": 2.5213, + "theoretical_loss": 3.410119578952725, + "tokens_seen": 2148443136 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017632898696088266, + "loss": 2.7356, + "theoretical_loss": 3.4101110275838415, + "tokens_seen": 2148508672 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017631895687061184, + "loss": 2.2895, + "theoretical_loss": 3.41010247654883, + "tokens_seen": 2148574208 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017630892678034102, + "loss": 2.4999, + "theoretical_loss": 3.410093925847667, + "tokens_seen": 2148639744 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001762988966900702, + "loss": 2.5566, + "theoretical_loss": 3.410085375480329, + "tokens_seen": 2148705280 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017628886659979938, + "loss": 2.569, + "theoretical_loss": 3.4100768254467937, + "tokens_seen": 2148770816 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001762788365095286, + "loss": 2.4906, + "theoretical_loss": 3.410068275747038, + "tokens_seen": 2148836352 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017626880641925777, + "loss": 2.6894, + "theoretical_loss": 3.4100597263810375, + "tokens_seen": 2148901888 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017625877632898695, + "loss": 2.7075, + "theoretical_loss": 3.41005117734877, + "tokens_seen": 2148967424 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017624874623871614, + "loss": 2.4781, + "theoretical_loss": 3.4100426286502117, + "tokens_seen": 2149032960 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017623871614844534, + "loss": 2.5912, + "theoretical_loss": 3.41003408028534, + "tokens_seen": 2149098496 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017622868605817452, + "loss": 2.7192, + "theoretical_loss": 3.4100255322541315, + "tokens_seen": 2149164032 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001762186559679037, + "loss": 2.515, + "theoretical_loss": 3.4100169845565627, + "tokens_seen": 2149229568 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001762086258776329, + "loss": 2.6706, + "theoretical_loss": 3.4100084371926105, + "tokens_seen": 2149295104 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017619859578736207, + "loss": 2.5821, + "theoretical_loss": 3.4099998901622524, + "tokens_seen": 2149360640 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017618856569709128, + "loss": 2.5532, + "theoretical_loss": 3.409991343465464, + "tokens_seen": 2149426176 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017617853560682046, + "loss": 2.5092, + "theoretical_loss": 3.4099827971022227, + "tokens_seen": 2149491712 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017616850551654964, + "loss": 2.5747, + "theoretical_loss": 3.409974251072505, + "tokens_seen": 2149557248 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2406322, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.338921546936035, + "objective/train/theoretical_loss": 3.4099721146171866, + "objective/train/tokens_used": 2170033632, + "theoretical_loss": 3.4099721146171866, + "tokens_seen": 2149573632 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017615847542627882, + "loss": 2.4759, + "theoretical_loss": 3.4099657053762886, + "tokens_seen": 2149622784 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017614844533600803, + "loss": 2.6318, + "theoretical_loss": 3.40995716001355, + "tokens_seen": 2149688320 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001761384152457372, + "loss": 2.5135, + "theoretical_loss": 3.4099486149842653, + "tokens_seen": 2149753856 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001761283851554664, + "loss": 2.4024, + "theoretical_loss": 3.4099400702884117, + "tokens_seen": 2149819392 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017611835506519557, + "loss": 2.6163, + "theoretical_loss": 3.4099315259259666, + "tokens_seen": 2149884928 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017610832497492478, + "loss": 2.3588, + "theoretical_loss": 3.409922981896906, + "tokens_seen": 2149950464 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017609829488465396, + "loss": 2.4638, + "theoretical_loss": 3.4099144382012074, + "tokens_seen": 2150016000 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017608826479438314, + "loss": 2.372, + "theoretical_loss": 3.4099058948388468, + "tokens_seen": 2150081536 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017607823470411232, + "loss": 2.1089, + "theoretical_loss": 3.409897351809802, + "tokens_seen": 2150147072 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001760682046138415, + "loss": 2.5839, + "theoretical_loss": 3.409888809114049, + "tokens_seen": 2150212608 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001760581745235707, + "loss": 2.5006, + "theoretical_loss": 3.409880266751566, + "tokens_seen": 2150278144 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017604814443329992, + "loss": 2.5149, + "theoretical_loss": 3.4098717247223282, + "tokens_seen": 2150343680 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001760381143430291, + "loss": 2.568, + "theoretical_loss": 3.409863183026313, + "tokens_seen": 2150409216 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017602808425275828, + "loss": 2.6649, + "theoretical_loss": 3.4098546416634976, + "tokens_seen": 2150474752 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001760180541624875, + "loss": 2.6671, + "theoretical_loss": 3.4098461006338585, + "tokens_seen": 2150540288 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017600802407221667, + "loss": 2.7443, + "theoretical_loss": 3.4098375599373725, + "tokens_seen": 2150605824 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017599799398194585, + "loss": 2.3265, + "theoretical_loss": 3.409829019574017, + "tokens_seen": 2150671360 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017598796389167503, + "loss": 2.7039, + "theoretical_loss": 3.409820479543768, + "tokens_seen": 2150736896 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017597793380140422, + "loss": 2.642, + "theoretical_loss": 3.409811939846603, + "tokens_seen": 2150802432 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017596790371113342, + "loss": 2.7735, + "theoretical_loss": 3.409803400482499, + "tokens_seen": 2150867968 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001759578736208626, + "loss": 2.5436, + "theoretical_loss": 3.4097948614514326, + "tokens_seen": 2150933504 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017594784353059179, + "loss": 2.6639, + "theoretical_loss": 3.40978632275338, + "tokens_seen": 2150999040 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017593781344032097, + "loss": 2.529, + "theoretical_loss": 3.4097777843883192, + "tokens_seen": 2151064576 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017592778335005018, + "loss": 2.3743, + "theoretical_loss": 3.4097692463562264, + "tokens_seen": 2151130112 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017591775325977936, + "loss": 2.3937, + "theoretical_loss": 3.4097607086570787, + "tokens_seen": 2151195648 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2406733, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4008755683898926, + "objective/train/theoretical_loss": 3.4097585742843117, + "objective/train/tokens_used": 2171672032, + "theoretical_loss": 3.4097585742843117, + "tokens_seen": 2151212032 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017590772316950854, + "loss": 2.4127, + "theoretical_loss": 3.4097521712908527, + "tokens_seen": 2151261184 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017589769307923772, + "loss": 2.5364, + "theoretical_loss": 3.4097436342575254, + "tokens_seen": 2151326720 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001758876629889669, + "loss": 2.4703, + "theoretical_loss": 3.4097350975570744, + "tokens_seen": 2151392256 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001758776328986961, + "loss": 2.6452, + "theoretical_loss": 3.4097265611894754, + "tokens_seen": 2151457792 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001758676028084253, + "loss": 2.3446, + "theoretical_loss": 3.409718025154706, + "tokens_seen": 2151523328 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017585757271815447, + "loss": 2.6889, + "theoretical_loss": 3.4097094894527427, + "tokens_seen": 2151588864 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017584754262788365, + "loss": 2.7873, + "theoretical_loss": 3.409700954083563, + "tokens_seen": 2151654400 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017583751253761286, + "loss": 2.6705, + "theoretical_loss": 3.4096924190471434, + "tokens_seen": 2151719936 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017582748244734204, + "loss": 2.3987, + "theoretical_loss": 3.40968388434346, + "tokens_seen": 2151785472 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017581745235707122, + "loss": 2.4609, + "theoretical_loss": 3.409675349972491, + "tokens_seen": 2151851008 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001758074222668004, + "loss": 2.5088, + "theoretical_loss": 3.409666815934213, + "tokens_seen": 2151916544 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017579739217652958, + "loss": 2.625, + "theoretical_loss": 3.4096582822286026, + "tokens_seen": 2151982080 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001757873620862588, + "loss": 2.6174, + "theoretical_loss": 3.4096497488556365, + "tokens_seen": 2152047616 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017577733199598797, + "loss": 2.3546, + "theoretical_loss": 3.409641215815292, + "tokens_seen": 2152113152 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017576730190571715, + "loss": 2.4596, + "theoretical_loss": 3.409632683107546, + "tokens_seen": 2152178688 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017575727181544634, + "loss": 2.4778, + "theoretical_loss": 3.409624150732375, + "tokens_seen": 2152244224 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017574724172517554, + "loss": 2.6007, + "theoretical_loss": 3.4096156186897564, + "tokens_seen": 2152309760 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017573721163490473, + "loss": 2.5691, + "theoretical_loss": 3.409607086979667, + "tokens_seen": 2152375296 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001757271815446339, + "loss": 2.5826, + "theoretical_loss": 3.4095985556020834, + "tokens_seen": 2152440832 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001757171514543631, + "loss": 2.499, + "theoretical_loss": 3.409590024556983, + "tokens_seen": 2152506368 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017570712136409227, + "loss": 2.6922, + "theoretical_loss": 3.4095814938443425, + "tokens_seen": 2152571904 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017569709127382148, + "loss": 2.7505, + "theoretical_loss": 3.4095729634641385, + "tokens_seen": 2152637440 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017568706118355066, + "loss": 2.683, + "theoretical_loss": 3.409564433416348, + "tokens_seen": 2152702976 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017567703109327984, + "loss": 2.8491, + "theoretical_loss": 3.409555903700949, + "tokens_seen": 2152768512 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017566700100300902, + "loss": 2.2025, + "theoretical_loss": 3.409547374317917, + "tokens_seen": 2152834048 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2408000, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5167782306671143, + "objective/train/theoretical_loss": 3.4095452420240884, + "objective/train/tokens_used": 2173310432, + "theoretical_loss": 3.4095452420240884, + "tokens_seen": 2152850432 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017565697091273823, + "loss": 2.6845, + "theoretical_loss": 3.4095388452672295, + "tokens_seen": 2152899584 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001756469408224674, + "loss": 2.5372, + "theoretical_loss": 3.4095303165488637, + "tokens_seen": 2152965120 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001756369107321966, + "loss": 2.4097, + "theoretical_loss": 3.409521788162796, + "tokens_seen": 2153030656 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017562688064192577, + "loss": 2.5502, + "theoretical_loss": 3.4095132601090032, + "tokens_seen": 2153096192 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017561685055165498, + "loss": 2.5803, + "theoretical_loss": 3.4095047323874637, + "tokens_seen": 2153161728 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017560682046138416, + "loss": 2.6696, + "theoretical_loss": 3.4094962049981525, + "tokens_seen": 2153227264 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017559679037111334, + "loss": 2.3788, + "theoretical_loss": 3.4094876779410477, + "tokens_seen": 2153292800 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017558676028084252, + "loss": 2.5387, + "theoretical_loss": 3.409479151216126, + "tokens_seen": 2153358336 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001755767301905717, + "loss": 2.3347, + "theoretical_loss": 3.409470624823365, + "tokens_seen": 2153423872 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001755667001003009, + "loss": 2.4877, + "theoretical_loss": 3.40946209876274, + "tokens_seen": 2153489408 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001755566700100301, + "loss": 2.5063, + "theoretical_loss": 3.40945357303423, + "tokens_seen": 2153554944 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017554663991975927, + "loss": 2.584, + "theoretical_loss": 3.40944504763781, + "tokens_seen": 2153620480 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017553660982948846, + "loss": 2.3214, + "theoretical_loss": 3.4094365225734578, + "tokens_seen": 2153686016 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017552657973921766, + "loss": 2.5754, + "theoretical_loss": 3.409427997841151, + "tokens_seen": 2153751552 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017551654964894685, + "loss": 2.6613, + "theoretical_loss": 3.409419473440866, + "tokens_seen": 2153817088 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017550651955867603, + "loss": 2.6803, + "theoretical_loss": 3.409410949372579, + "tokens_seen": 2153882624 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001754964894684052, + "loss": 2.4971, + "theoretical_loss": 3.4094024256362685, + "tokens_seen": 2153948160 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001754864593781344, + "loss": 2.5539, + "theoretical_loss": 3.4093939022319106, + "tokens_seen": 2154013696 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001754764292878636, + "loss": 2.494, + "theoretical_loss": 3.4093853791594824, + "tokens_seen": 2154079232 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017546639919759278, + "loss": 2.6554, + "theoretical_loss": 3.4093768564189606, + "tokens_seen": 2154144768 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017545636910732196, + "loss": 2.5532, + "theoretical_loss": 3.4093683340103222, + "tokens_seen": 2154210304 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017544633901705114, + "loss": 2.5898, + "theoretical_loss": 3.409359811933545, + "tokens_seen": 2154275840 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017543630892678035, + "loss": 2.5853, + "theoretical_loss": 3.409351290188605, + "tokens_seen": 2154341376 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017542627883650953, + "loss": 2.4871, + "theoretical_loss": 3.40934276877548, + "tokens_seen": 2154406912 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001754162487462387, + "loss": 2.3633, + "theoretical_loss": 3.4093342476941464, + "tokens_seen": 2154472448 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2408684, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3143258094787598, + "objective/train/theoretical_loss": 3.4093321174756523, + "objective/train/tokens_used": 2174948832, + "theoretical_loss": 3.4093321174756523, + "tokens_seen": 2154488832 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001754062186559679, + "loss": 2.4726, + "theoretical_loss": 3.409325726944581, + "tokens_seen": 2154537984 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017539618856569707, + "loss": 2.4028, + "theoretical_loss": 3.409317206526761, + "tokens_seen": 2154603520 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017538615847542628, + "loss": 2.6641, + "theoretical_loss": 3.409308686440664, + "tokens_seen": 2154669056 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017537612838515546, + "loss": 2.4296, + "theoretical_loss": 3.4093001666862666, + "tokens_seen": 2154734592 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017536609829488464, + "loss": 2.6972, + "theoretical_loss": 3.4092916472635455, + "tokens_seen": 2154800128 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017535606820461382, + "loss": 2.4694, + "theoretical_loss": 3.409283128172478, + "tokens_seen": 2154865664 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017534603811434303, + "loss": 2.5393, + "theoretical_loss": 3.409274609413041, + "tokens_seen": 2154931200 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017533600802407221, + "loss": 2.3116, + "theoretical_loss": 3.409266090985211, + "tokens_seen": 2154996736 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001753259779338014, + "loss": 2.6766, + "theoretical_loss": 3.4092575728889662, + "tokens_seen": 2155062272 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017531594784353058, + "loss": 2.5314, + "theoretical_loss": 3.409249055124283, + "tokens_seen": 2155127808 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017530591775325976, + "loss": 2.5956, + "theoretical_loss": 3.4092405376911383, + "tokens_seen": 2155193344 + }, + { + "epoch": 7.02, + "learning_rate": 0.000175295887662989, + "loss": 2.5734, + "theoretical_loss": 3.409232020589509, + "tokens_seen": 2155258880 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017528585757271817, + "loss": 2.5415, + "theoretical_loss": 3.409223503819372, + "tokens_seen": 2155324416 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017527582748244735, + "loss": 2.4057, + "theoretical_loss": 3.409214987380705, + "tokens_seen": 2155389952 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017526579739217654, + "loss": 2.4128, + "theoretical_loss": 3.4092064712734844, + "tokens_seen": 2155455488 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017525576730190574, + "loss": 2.6727, + "theoretical_loss": 3.409197955497688, + "tokens_seen": 2155521024 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017524573721163493, + "loss": 2.421, + "theoretical_loss": 3.409189440053292, + "tokens_seen": 2155586560 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001752357071213641, + "loss": 2.5713, + "theoretical_loss": 3.409180924940274, + "tokens_seen": 2155652096 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001752256770310933, + "loss": 2.5493, + "theoretical_loss": 3.40917241015861, + "tokens_seen": 2155717632 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017521564694082247, + "loss": 2.6047, + "theoretical_loss": 3.409163895708278, + "tokens_seen": 2155783168 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017520561685055168, + "loss": 2.5566, + "theoretical_loss": 3.4091553815892555, + "tokens_seen": 2155848704 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017519558676028086, + "loss": 2.6549, + "theoretical_loss": 3.409146867801518, + "tokens_seen": 2155914240 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017518555667001004, + "loss": 2.6378, + "theoretical_loss": 3.4091383543450435, + "tokens_seen": 2155979776 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017517552657973922, + "loss": 2.7986, + "theoretical_loss": 3.4091298412198094, + "tokens_seen": 2156045312 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017516549648946843, + "loss": 2.6029, + "theoretical_loss": 3.409121328425792, + "tokens_seen": 2156110848 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2409451, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.921675443649292, + "objective/train/theoretical_loss": 3.409119200279038, + "objective/train/tokens_used": 2176587232, + "theoretical_loss": 3.409119200279038, + "tokens_seen": 2156127232 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001751554663991976, + "loss": 2.647, + "theoretical_loss": 3.4091128159629687, + "tokens_seen": 2156176384 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001751454363089268, + "loss": 2.5492, + "theoretical_loss": 3.4091043038313167, + "tokens_seen": 2156241920 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017513540621865597, + "loss": 2.5875, + "theoretical_loss": 3.4090957920308123, + "tokens_seen": 2156307456 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017512537612838518, + "loss": 2.5745, + "theoretical_loss": 3.4090872805614336, + "tokens_seen": 2156372992 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017511534603811436, + "loss": 2.4977, + "theoretical_loss": 3.4090787694231572, + "tokens_seen": 2156438528 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017510531594784354, + "loss": 2.6105, + "theoretical_loss": 3.4090702586159596, + "tokens_seen": 2156504064 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017509528585757272, + "loss": 2.3399, + "theoretical_loss": 3.4090617481398184, + "tokens_seen": 2156569600 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001750852557673019, + "loss": 2.6786, + "theoretical_loss": 3.4090532379947107, + "tokens_seen": 2156635136 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001750752256770311, + "loss": 2.8697, + "theoretical_loss": 3.4090447281806133, + "tokens_seen": 2156700672 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001750651955867603, + "loss": 2.5577, + "theoretical_loss": 3.409036218697504, + "tokens_seen": 2156766208 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017505516549648947, + "loss": 2.5094, + "theoretical_loss": 3.409027709545359, + "tokens_seen": 2156831744 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017504513540621866, + "loss": 2.624, + "theoretical_loss": 3.4090192007241558, + "tokens_seen": 2156897280 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017503510531594786, + "loss": 2.7536, + "theoretical_loss": 3.409010692233871, + "tokens_seen": 2156962816 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017502507522567705, + "loss": 2.3881, + "theoretical_loss": 3.4090021840744824, + "tokens_seen": 2157028352 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017501504513540623, + "loss": 2.4729, + "theoretical_loss": 3.4089936762459665, + "tokens_seen": 2157093888 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001750050150451354, + "loss": 2.4308, + "theoretical_loss": 3.408985168748301, + "tokens_seen": 2157159424 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001749949849548646, + "loss": 2.6624, + "theoretical_loss": 3.408976661581462, + "tokens_seen": 2157224960 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001749849548645938, + "loss": 2.7673, + "theoretical_loss": 3.408968154745428, + "tokens_seen": 2157290496 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017497492477432298, + "loss": 2.4748, + "theoretical_loss": 3.4089596482401743, + "tokens_seen": 2157356032 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017496489468405216, + "loss": 2.4835, + "theoretical_loss": 3.4089511420656793, + "tokens_seen": 2157421568 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017495486459378134, + "loss": 2.4369, + "theoretical_loss": 3.4089426362219197, + "tokens_seen": 2157487104 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017494483450351055, + "loss": 2.6871, + "theoretical_loss": 3.4089341307088725, + "tokens_seen": 2157552640 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017493480441323973, + "loss": 2.491, + "theoretical_loss": 3.4089256255265155, + "tokens_seen": 2157618176 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001749247743229689, + "loss": 2.6301, + "theoretical_loss": 3.4089171206748246, + "tokens_seen": 2157683712 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001749147442326981, + "loss": 2.4773, + "theoretical_loss": 3.408908616153778, + "tokens_seen": 2157749248 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2410237, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.177361488342285, + "objective/train/theoretical_loss": 3.408906490075177, + "objective/train/tokens_used": 2178225632, + "theoretical_loss": 3.408906490075177, + "tokens_seen": 2157765632 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017490471414242727, + "loss": 2.436, + "theoretical_loss": 3.408900111963352, + "tokens_seen": 2157814784 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017489468405215648, + "loss": 2.7082, + "theoretical_loss": 3.408891608103524, + "tokens_seen": 2157880320 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017488465396188566, + "loss": 2.6063, + "theoretical_loss": 3.4088831045742713, + "tokens_seen": 2157945856 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017487462387161484, + "loss": 2.6402, + "theoretical_loss": 3.408874601375571, + "tokens_seen": 2158011392 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017486459378134402, + "loss": 2.6123, + "theoretical_loss": 3.4088660985074, + "tokens_seen": 2158076928 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017485456369107323, + "loss": 2.3855, + "theoretical_loss": 3.408857595969735, + "tokens_seen": 2158142464 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017484453360080241, + "loss": 2.3494, + "theoretical_loss": 3.408849093762554, + "tokens_seen": 2158208000 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001748345035105316, + "loss": 2.6079, + "theoretical_loss": 3.408840591885834, + "tokens_seen": 2158273536 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017482447342026078, + "loss": 2.6757, + "theoretical_loss": 3.4088320903395517, + "tokens_seen": 2158339072 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017481444332998996, + "loss": 2.5398, + "theoretical_loss": 3.4088235891236844, + "tokens_seen": 2158404608 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017480441323971917, + "loss": 2.399, + "theoretical_loss": 3.408815088238209, + "tokens_seen": 2158470144 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017479438314944835, + "loss": 2.5076, + "theoretical_loss": 3.408806587683103, + "tokens_seen": 2158535680 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017478435305917753, + "loss": 2.7827, + "theoretical_loss": 3.408798087458343, + "tokens_seen": 2158601216 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001747743229689067, + "loss": 2.7367, + "theoretical_loss": 3.4087895875639065, + "tokens_seen": 2158666752 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017476429287863592, + "loss": 2.7773, + "theoretical_loss": 3.408781087999771, + "tokens_seen": 2158732288 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001747542627883651, + "loss": 2.5913, + "theoretical_loss": 3.4087725887659133, + "tokens_seen": 2158797824 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017474423269809428, + "loss": 2.4727, + "theoretical_loss": 3.40876408986231, + "tokens_seen": 2158863360 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017473420260782346, + "loss": 2.7178, + "theoretical_loss": 3.408755591288939, + "tokens_seen": 2158928896 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017472417251755264, + "loss": 2.6164, + "theoretical_loss": 3.4087470930457773, + "tokens_seen": 2158994432 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017471414242728185, + "loss": 2.7302, + "theoretical_loss": 3.4087385951328018, + "tokens_seen": 2159059968 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017470411233701103, + "loss": 2.6479, + "theoretical_loss": 3.4087300975499897, + "tokens_seen": 2159125504 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001746940822467402, + "loss": 2.5601, + "theoretical_loss": 3.4087216002973184, + "tokens_seen": 2159191040 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001746840521564694, + "loss": 2.8099, + "theoretical_loss": 3.408713103374765, + "tokens_seen": 2159256576 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001746740220661986, + "loss": 2.5994, + "theoretical_loss": 3.4087046067823064, + "tokens_seen": 2159322112 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017466399197592778, + "loss": 2.6593, + "theoretical_loss": 3.40869611051992, + "tokens_seen": 2159387648 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 2411485, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6026055812835693, + "objective/train/theoretical_loss": 3.4086939865058943, + "objective/train/tokens_used": 2179864032, + "theoretical_loss": 3.4086939865058943, + "tokens_seen": 2159404032 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017465396188565696, + "loss": 2.688, + "theoretical_loss": 3.4086876145875826, + "tokens_seen": 2159453184 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017464393179538614, + "loss": 2.7377, + "theoretical_loss": 3.4086791189852716, + "tokens_seen": 2159518720 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017463390170511533, + "loss": 2.6004, + "theoretical_loss": 3.4086706237129647, + "tokens_seen": 2159584256 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017462387161484453, + "loss": 2.4113, + "theoretical_loss": 3.4086621287706382, + "tokens_seen": 2159649792 + }, + { + "epoch": 7.02, + "learning_rate": 0.00017461384152457372, + "loss": 2.7033, + "theoretical_loss": 3.4086536341582696, + "tokens_seen": 2159715328 + }, + { + "epoch": 7.02, + "learning_rate": 0.0001746038114343029, + "loss": 2.4492, + "theoretical_loss": 3.408645139875836, + "tokens_seen": 2159780864 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017459378134403208, + "loss": 2.5712, + "theoretical_loss": 3.408636645923315, + "tokens_seen": 2159846400 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017458375125376129, + "loss": 2.527, + "theoretical_loss": 3.4086281523006834, + "tokens_seen": 2159911936 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017457372116349047, + "loss": 2.7034, + "theoretical_loss": 3.4086196590079183, + "tokens_seen": 2159977472 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017456369107321965, + "loss": 2.4615, + "theoretical_loss": 3.408611166044997, + "tokens_seen": 2160043008 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017455366098294883, + "loss": 2.5681, + "theoretical_loss": 3.4086026734118966, + "tokens_seen": 2160108544 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017454363089267806, + "loss": 2.4608, + "theoretical_loss": 3.4085941811085947, + "tokens_seen": 2160174080 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017453360080240725, + "loss": 2.6312, + "theoretical_loss": 3.4085856891350677, + "tokens_seen": 2160239616 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017452357071213643, + "loss": 2.7846, + "theoretical_loss": 3.4085771974912937, + "tokens_seen": 2160305152 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001745135406218656, + "loss": 2.7045, + "theoretical_loss": 3.4085687061772494, + "tokens_seen": 2160370688 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001745035105315948, + "loss": 2.4384, + "theoretical_loss": 3.408560215192912, + "tokens_seen": 2160436224 + }, + { + "epoch": 7.03, + "learning_rate": 0.000174493480441324, + "loss": 2.6606, + "theoretical_loss": 3.4085517245382584, + "tokens_seen": 2160501760 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017448345035105318, + "loss": 2.5484, + "theoretical_loss": 3.4085432342132664, + "tokens_seen": 2160567296 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017447342026078236, + "loss": 2.8467, + "theoretical_loss": 3.408534744217913, + "tokens_seen": 2160632832 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017446339017051154, + "loss": 2.7059, + "theoretical_loss": 3.4085262545521755, + "tokens_seen": 2160698368 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017445336008024075, + "loss": 2.2717, + "theoretical_loss": 3.408517765216031, + "tokens_seen": 2160763904 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017444332998996993, + "loss": 2.7703, + "theoretical_loss": 3.408509276209456, + "tokens_seen": 2160829440 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001744332998996991, + "loss": 2.4738, + "theoretical_loss": 3.4085007875324287, + "tokens_seen": 2160894976 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001744232698094283, + "loss": 2.837, + "theoretical_loss": 3.4084922991849265, + "tokens_seen": 2160960512 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017441323971915747, + "loss": 2.4541, + "theoretical_loss": 3.4084838111669256, + "tokens_seen": 2161026048 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2412087, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.742750644683838, + "objective/train/theoretical_loss": 3.4084816892139074, + "objective/train/tokens_used": 2181502432, + "theoretical_loss": 3.4084816892139074, + "tokens_seen": 2161042432 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017440320962888668, + "loss": 2.6362, + "theoretical_loss": 3.4084753234784033, + "tokens_seen": 2161091584 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017439317953861586, + "loss": 2.3747, + "theoretical_loss": 3.408466836119338, + "tokens_seen": 2161157120 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017438314944834504, + "loss": 2.5258, + "theoretical_loss": 3.408458349089706, + "tokens_seen": 2161222656 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017437311935807422, + "loss": 2.4951, + "theoretical_loss": 3.4084498623894843, + "tokens_seen": 2161288192 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017436308926780343, + "loss": 2.649, + "theoretical_loss": 3.408441376018651, + "tokens_seen": 2161353728 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017435305917753261, + "loss": 2.6774, + "theoretical_loss": 3.4084328899771825, + "tokens_seen": 2161419264 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001743430290872618, + "loss": 2.643, + "theoretical_loss": 3.4084244042650567, + "tokens_seen": 2161484800 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017433299899699098, + "loss": 2.5864, + "theoretical_loss": 3.40841591888225, + "tokens_seen": 2161550336 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017432296890672016, + "loss": 2.4373, + "theoretical_loss": 3.4084074338287405, + "tokens_seen": 2161615872 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017431293881644937, + "loss": 2.5499, + "theoretical_loss": 3.4083989491045052, + "tokens_seen": 2161681408 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017430290872617855, + "loss": 2.655, + "theoretical_loss": 3.408390464709521, + "tokens_seen": 2161746944 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017429287863590773, + "loss": 2.6311, + "theoretical_loss": 3.408381980643765, + "tokens_seen": 2161812480 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001742828485456369, + "loss": 2.4183, + "theoretical_loss": 3.4083734969072155, + "tokens_seen": 2161878016 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017427281845536612, + "loss": 2.2683, + "theoretical_loss": 3.4083650134998487, + "tokens_seen": 2161943552 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001742627883650953, + "loss": 2.798, + "theoretical_loss": 3.408356530421642, + "tokens_seen": 2162009088 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017425275827482448, + "loss": 2.4568, + "theoretical_loss": 3.408348047672573, + "tokens_seen": 2162074624 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017424272818455366, + "loss": 2.5496, + "theoretical_loss": 3.408339565252619, + "tokens_seen": 2162140160 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017423269809428284, + "loss": 2.6147, + "theoretical_loss": 3.408331083161757, + "tokens_seen": 2162205696 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017422266800401205, + "loss": 2.605, + "theoretical_loss": 3.4083226013999646, + "tokens_seen": 2162271232 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017421263791374123, + "loss": 2.5229, + "theoretical_loss": 3.4083141199672182, + "tokens_seen": 2162336768 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001742026078234704, + "loss": 2.4297, + "theoretical_loss": 3.408305638863496, + "tokens_seen": 2162402304 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001741925777331996, + "loss": 2.6811, + "theoretical_loss": 3.4082971580887746, + "tokens_seen": 2162467840 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001741825476429288, + "loss": 2.3424, + "theoretical_loss": 3.4082886776430317, + "tokens_seen": 2162533376 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017417251755265798, + "loss": 2.6019, + "theoretical_loss": 3.4082801975262447, + "tokens_seen": 2162598912 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017416248746238716, + "loss": 2.5735, + "theoretical_loss": 3.40827171773839, + "tokens_seen": 2162664448 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2413473, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.364506244659424, + "objective/train/theoretical_loss": 3.4082695978428195, + "objective/train/tokens_used": 2183140832, + "theoretical_loss": 3.4082695978428195, + "tokens_seen": 2162680832 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017415245737211634, + "loss": 2.5987, + "theoretical_loss": 3.408263238279446, + "tokens_seen": 2162729984 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017414242728184553, + "loss": 2.7611, + "theoretical_loss": 3.4082547591493895, + "tokens_seen": 2162795520 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017413239719157473, + "loss": 2.4869, + "theoretical_loss": 3.4082462803481977, + "tokens_seen": 2162861056 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017412236710130392, + "loss": 2.4118, + "theoretical_loss": 3.4082378018758477, + "tokens_seen": 2162926592 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001741123370110331, + "loss": 2.4866, + "theoretical_loss": 3.408229323732317, + "tokens_seen": 2162992128 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017410230692076228, + "loss": 2.4614, + "theoretical_loss": 3.408220845917583, + "tokens_seen": 2163057664 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017409227683049149, + "loss": 2.5036, + "theoretical_loss": 3.408212368431623, + "tokens_seen": 2163123200 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017408224674022067, + "loss": 2.2997, + "theoretical_loss": 3.4082038912744137, + "tokens_seen": 2163188736 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017407221664994985, + "loss": 2.7316, + "theoretical_loss": 3.408195414445933, + "tokens_seen": 2163254272 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017406218655967903, + "loss": 2.7028, + "theoretical_loss": 3.4081869379461587, + "tokens_seen": 2163319808 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001740521564694082, + "loss": 2.7847, + "theoretical_loss": 3.408178461775067, + "tokens_seen": 2163385344 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017404212637913742, + "loss": 2.6509, + "theoretical_loss": 3.4081699859326355, + "tokens_seen": 2163450880 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001740320962888666, + "loss": 2.4698, + "theoretical_loss": 3.408161510418842, + "tokens_seen": 2163516416 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017402206619859578, + "loss": 2.6885, + "theoretical_loss": 3.408153035233663, + "tokens_seen": 2163581952 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017401203610832496, + "loss": 2.539, + "theoretical_loss": 3.408144560377077, + "tokens_seen": 2163647488 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017400200601805417, + "loss": 2.6441, + "theoretical_loss": 3.4081360858490597, + "tokens_seen": 2163713024 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017399197592778335, + "loss": 2.7228, + "theoretical_loss": 3.4081276116495896, + "tokens_seen": 2163778560 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017398194583751253, + "loss": 2.5885, + "theoretical_loss": 3.4081191377786437, + "tokens_seen": 2163844096 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017397191574724171, + "loss": 2.2504, + "theoretical_loss": 3.4081106642361996, + "tokens_seen": 2163909632 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017396188565697092, + "loss": 2.7454, + "theoretical_loss": 3.408102191022234, + "tokens_seen": 2163975168 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001739518555667001, + "loss": 2.5046, + "theoretical_loss": 3.4080937181367243, + "tokens_seen": 2164040704 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017394182547642928, + "loss": 2.6252, + "theoretical_loss": 3.408085245579649, + "tokens_seen": 2164106240 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017393179538615847, + "loss": 2.6272, + "theoretical_loss": 3.4080767733509836, + "tokens_seen": 2164171776 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017392176529588765, + "loss": 2.6221, + "theoretical_loss": 3.4080683014507063, + "tokens_seen": 2164237312 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017391173520561685, + "loss": 2.867, + "theoretical_loss": 3.4080598298787947, + "tokens_seen": 2164302848 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2414141, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.689037322998047, + "objective/train/theoretical_loss": 3.4080577120371216, + "objective/train/tokens_used": 2184779232, + "theoretical_loss": 3.4080577120371216, + "tokens_seen": 2164319232 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017390170511534604, + "loss": 2.7066, + "theoretical_loss": 3.408051358635226, + "tokens_seen": 2164368384 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017389167502507522, + "loss": 2.6534, + "theoretical_loss": 3.4080428877199775, + "tokens_seen": 2164433920 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001738816449348044, + "loss": 2.7305, + "theoretical_loss": 3.408034417133026, + "tokens_seen": 2164499456 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001738716148445336, + "loss": 2.5517, + "theoretical_loss": 3.4080259468743495, + "tokens_seen": 2164564992 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001738615847542628, + "loss": 2.5463, + "theoretical_loss": 3.4080174769439253, + "tokens_seen": 2164630528 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017385155466399197, + "loss": 2.5119, + "theoretical_loss": 3.40800900734173, + "tokens_seen": 2164696064 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017384152457372115, + "loss": 2.6188, + "theoretical_loss": 3.408000538067742, + "tokens_seen": 2164761600 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017383149448345033, + "loss": 2.7385, + "theoretical_loss": 3.407992069121938, + "tokens_seen": 2164827136 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017382146439317954, + "loss": 2.4745, + "theoretical_loss": 3.4079836005042954, + "tokens_seen": 2164892672 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017381143430290872, + "loss": 2.7103, + "theoretical_loss": 3.4079751322147915, + "tokens_seen": 2164958208 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001738014042126379, + "loss": 2.6764, + "theoretical_loss": 3.4079666642534043, + "tokens_seen": 2165023744 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001737913741223671, + "loss": 2.6494, + "theoretical_loss": 3.4079581966201102, + "tokens_seen": 2165089280 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017378134403209632, + "loss": 2.6947, + "theoretical_loss": 3.407949729314887, + "tokens_seen": 2165154816 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001737713139418255, + "loss": 2.497, + "theoretical_loss": 3.4079412623377126, + "tokens_seen": 2165220352 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017376128385155468, + "loss": 2.5939, + "theoretical_loss": 3.407932795688563, + "tokens_seen": 2165285888 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017375125376128386, + "loss": 2.7241, + "theoretical_loss": 3.4079243293674173, + "tokens_seen": 2165351424 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017374122367101304, + "loss": 2.6726, + "theoretical_loss": 3.4079158633742512, + "tokens_seen": 2165416960 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017373119358074225, + "loss": 2.5526, + "theoretical_loss": 3.4079073977090433, + "tokens_seen": 2165482496 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017372116349047143, + "loss": 2.5356, + "theoretical_loss": 3.4078989323717703, + "tokens_seen": 2165548032 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001737111334002006, + "loss": 2.8555, + "theoretical_loss": 3.4078904673624097, + "tokens_seen": 2165613568 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001737011033099298, + "loss": 2.6253, + "theoretical_loss": 3.4078820026809393, + "tokens_seen": 2165679104 + }, + { + "epoch": 7.03, + "learning_rate": 0.000173691073219659, + "loss": 2.4634, + "theoretical_loss": 3.407873538327336, + "tokens_seen": 2165744640 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017368104312938818, + "loss": 2.6088, + "theoretical_loss": 3.407865074301577, + "tokens_seen": 2165810176 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017367101303911736, + "loss": 2.635, + "theoretical_loss": 3.4078566106036403, + "tokens_seen": 2165875712 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017366098294884655, + "loss": 2.5451, + "theoretical_loss": 3.4078481472335027, + "tokens_seen": 2165941248 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2415028, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6646692752838135, + "objective/train/theoretical_loss": 3.407846031442184, + "objective/train/tokens_used": 2186417632, + "theoretical_loss": 3.407846031442184, + "tokens_seen": 2165957632 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017365095285857573, + "loss": 2.4675, + "theoretical_loss": 3.407839684191142, + "tokens_seen": 2166006784 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017364092276830493, + "loss": 2.5574, + "theoretical_loss": 3.4078312214765356, + "tokens_seen": 2166072320 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017363089267803412, + "loss": 2.7948, + "theoretical_loss": 3.4078227590896604, + "tokens_seen": 2166137856 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001736208625877633, + "loss": 2.7734, + "theoretical_loss": 3.407814297030494, + "tokens_seen": 2166203392 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017361083249749248, + "loss": 2.6098, + "theoretical_loss": 3.4078058352990146, + "tokens_seen": 2166268928 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017360080240722169, + "loss": 2.645, + "theoretical_loss": 3.4077973738951983, + "tokens_seen": 2166334464 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017359077231695087, + "loss": 2.6986, + "theoretical_loss": 3.4077889128190235, + "tokens_seen": 2166400000 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017358074222668005, + "loss": 2.6262, + "theoretical_loss": 3.407780452070467, + "tokens_seen": 2166465536 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017357071213640923, + "loss": 2.8734, + "theoretical_loss": 3.407771991649507, + "tokens_seen": 2166531072 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001735606820461384, + "loss": 2.7662, + "theoretical_loss": 3.407763531556119, + "tokens_seen": 2166596608 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017355065195586762, + "loss": 2.78, + "theoretical_loss": 3.407755071790283, + "tokens_seen": 2166662144 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001735406218655968, + "loss": 2.4576, + "theoretical_loss": 3.407746612351975, + "tokens_seen": 2166727680 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017353059177532598, + "loss": 2.6398, + "theoretical_loss": 3.4077381532411724, + "tokens_seen": 2166793216 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017352056168505516, + "loss": 2.7455, + "theoretical_loss": 3.4077296944578523, + "tokens_seen": 2166858752 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017351053159478437, + "loss": 2.72, + "theoretical_loss": 3.4077212360019935, + "tokens_seen": 2166924288 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017350050150451355, + "loss": 2.6024, + "theoretical_loss": 3.407712777873572, + "tokens_seen": 2166989824 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017349047141424273, + "loss": 2.7666, + "theoretical_loss": 3.4077043200725656, + "tokens_seen": 2167055360 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017348044132397191, + "loss": 2.9556, + "theoretical_loss": 3.4076958625989517, + "tokens_seen": 2167120896 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017347041123370112, + "loss": 2.6005, + "theoretical_loss": 3.407687405452709, + "tokens_seen": 2167186432 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001734603811434303, + "loss": 2.4858, + "theoretical_loss": 3.4076789486338126, + "tokens_seen": 2167251968 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017345035105315948, + "loss": 2.5232, + "theoretical_loss": 3.4076704921422416, + "tokens_seen": 2167317504 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017344032096288867, + "loss": 2.6358, + "theoretical_loss": 3.407662035977973, + "tokens_seen": 2167383040 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017343029087261785, + "loss": 2.4416, + "theoretical_loss": 3.4076535801409844, + "tokens_seen": 2167448576 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017342026078234705, + "loss": 2.653, + "theoretical_loss": 3.407645124631253, + "tokens_seen": 2167514112 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017341023069207624, + "loss": 2.4991, + "theoretical_loss": 3.407636669448756, + "tokens_seen": 2167579648 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2415982, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.918199062347412, + "objective/train/theoretical_loss": 3.4076345557042598, + "objective/train/tokens_used": 2188056032, + "theoretical_loss": 3.4076345557042598, + "tokens_seen": 2167596032 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017340020060180542, + "loss": 2.6409, + "theoretical_loss": 3.4076282145934713, + "tokens_seen": 2167645184 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001733901705115346, + "loss": 2.7603, + "theoretical_loss": 3.4076197600653764, + "tokens_seen": 2167710720 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001733801404212638, + "loss": 2.6902, + "theoretical_loss": 3.4076113058644486, + "tokens_seen": 2167776256 + }, + { + "epoch": 7.03, + "learning_rate": 0.000173370110330993, + "loss": 2.5476, + "theoretical_loss": 3.407602851990665, + "tokens_seen": 2167841792 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017336008024072217, + "loss": 2.5406, + "theoretical_loss": 3.4075943984440036, + "tokens_seen": 2167907328 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017335005015045135, + "loss": 2.765, + "theoretical_loss": 3.407585945224441, + "tokens_seen": 2167972864 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017334002006018053, + "loss": 2.6917, + "theoretical_loss": 3.407577492331956, + "tokens_seen": 2168038400 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017332998996990974, + "loss": 2.7801, + "theoretical_loss": 3.407569039766525, + "tokens_seen": 2168103936 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017331995987963892, + "loss": 2.6522, + "theoretical_loss": 3.407560587528126, + "tokens_seen": 2168169472 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001733099297893681, + "loss": 2.6276, + "theoretical_loss": 3.4075521356167355, + "tokens_seen": 2168235008 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017329989969909728, + "loss": 2.4506, + "theoretical_loss": 3.407543684032332, + "tokens_seen": 2168300544 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001732898696088265, + "loss": 2.418, + "theoretical_loss": 3.407535232774893, + "tokens_seen": 2168366080 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017327983951855567, + "loss": 2.5034, + "theoretical_loss": 3.4075267818443953, + "tokens_seen": 2168431616 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017326980942828485, + "loss": 2.5175, + "theoretical_loss": 3.407518331240817, + "tokens_seen": 2168497152 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017325977933801403, + "loss": 2.5659, + "theoretical_loss": 3.407509880964135, + "tokens_seen": 2168562688 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017324974924774322, + "loss": 2.6654, + "theoretical_loss": 3.4075014310143272, + "tokens_seen": 2168628224 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017323971915747242, + "loss": 2.7126, + "theoretical_loss": 3.4074929813913712, + "tokens_seen": 2168693760 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001732296890672016, + "loss": 2.5093, + "theoretical_loss": 3.4074845320952436, + "tokens_seen": 2168759296 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017321965897693079, + "loss": 2.69, + "theoretical_loss": 3.407476083125923, + "tokens_seen": 2168824832 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017320962888665997, + "loss": 2.6825, + "theoretical_loss": 3.4074676344833863, + "tokens_seen": 2168890368 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017319959879638917, + "loss": 2.6431, + "theoretical_loss": 3.407459186167611, + "tokens_seen": 2168955904 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017318956870611836, + "loss": 2.575, + "theoretical_loss": 3.4074507381785746, + "tokens_seen": 2169021440 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017317953861584754, + "loss": 2.5599, + "theoretical_loss": 3.4074422905162547, + "tokens_seen": 2169086976 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017316950852557672, + "loss": 2.7489, + "theoretical_loss": 3.4074338431806286, + "tokens_seen": 2169152512 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001731594784353059, + "loss": 2.6487, + "theoretical_loss": 3.407425396171674, + "tokens_seen": 2169218048 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2415982, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0726399421691895, + "objective/train/theoretical_loss": 3.4074232844704753, + "objective/train/tokens_used": 2189694432, + "theoretical_loss": 3.4074232844704753, + "tokens_seen": 2169234432 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001731494483450351, + "loss": 2.7137, + "theoretical_loss": 3.4074169494893685, + "tokens_seen": 2169283584 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001731394182547643, + "loss": 2.6832, + "theoretical_loss": 3.4074085031336896, + "tokens_seen": 2169349120 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017312938816449347, + "loss": 2.6508, + "theoretical_loss": 3.4074000571046144, + "tokens_seen": 2169414656 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017311935807422265, + "loss": 2.7971, + "theoretical_loss": 3.4073916114021205, + "tokens_seen": 2169480192 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017310932798395186, + "loss": 2.7632, + "theoretical_loss": 3.4073831660261855, + "tokens_seen": 2169545728 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017309929789368104, + "loss": 2.7691, + "theoretical_loss": 3.4073747209767875, + "tokens_seen": 2169611264 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017308926780341022, + "loss": 2.5956, + "theoretical_loss": 3.407366276253903, + "tokens_seen": 2169676800 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001730792377131394, + "loss": 2.78, + "theoretical_loss": 3.40735783185751, + "tokens_seen": 2169742336 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017306920762286858, + "loss": 2.6099, + "theoretical_loss": 3.407349387787586, + "tokens_seen": 2169807872 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001730591775325978, + "loss": 2.4864, + "theoretical_loss": 3.4073409440441087, + "tokens_seen": 2169873408 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017304914744232697, + "loss": 2.8618, + "theoretical_loss": 3.4073325006270556, + "tokens_seen": 2169938944 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017303911735205618, + "loss": 2.777, + "theoretical_loss": 3.4073240575364037, + "tokens_seen": 2170004480 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017302908726178536, + "loss": 2.8189, + "theoretical_loss": 3.407315614772131, + "tokens_seen": 2170070016 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017301905717151457, + "loss": 2.671, + "theoretical_loss": 3.4073071723342148, + "tokens_seen": 2170135552 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017300902708124375, + "loss": 2.4726, + "theoretical_loss": 3.407298730222633, + "tokens_seen": 2170201088 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017299899699097293, + "loss": 2.6876, + "theoretical_loss": 3.407290288437363, + "tokens_seen": 2170266624 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017298896690070211, + "loss": 2.6898, + "theoretical_loss": 3.407281846978382, + "tokens_seen": 2170332160 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017297893681043132, + "loss": 2.4628, + "theoretical_loss": 3.4072734058456677, + "tokens_seen": 2170397696 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001729689067201605, + "loss": 2.8105, + "theoretical_loss": 3.4072649650391975, + "tokens_seen": 2170463232 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017295887662988968, + "loss": 2.709, + "theoretical_loss": 3.4072565245589495, + "tokens_seen": 2170528768 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017294884653961887, + "loss": 2.659, + "theoretical_loss": 3.407248084404901, + "tokens_seen": 2170594304 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017293881644934805, + "loss": 2.8532, + "theoretical_loss": 3.4072396445770288, + "tokens_seen": 2170659840 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017292878635907725, + "loss": 2.7838, + "theoretical_loss": 3.4072312050753113, + "tokens_seen": 2170725376 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017291875626880644, + "loss": 2.5954, + "theoretical_loss": 3.407222765899726, + "tokens_seen": 2170790912 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017290872617853562, + "loss": 2.5574, + "theoretical_loss": 3.40721432705025, + "tokens_seen": 2170856448 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2415982, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.528714656829834, + "objective/train/theoretical_loss": 3.407212217388833, + "objective/train/tokens_used": 2191332832, + "theoretical_loss": 3.407212217388833, + "tokens_seen": 2170872832 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001728986960882648, + "loss": 2.8463, + "theoretical_loss": 3.4072058885268617, + "tokens_seen": 2170921984 + }, + { + "epoch": 7.03, + "learning_rate": 0.000172888665997994, + "loss": 2.7924, + "theoretical_loss": 3.4071974503295377, + "tokens_seen": 2170987520 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001728786359077232, + "loss": 2.854, + "theoretical_loss": 3.4071890124582556, + "tokens_seen": 2171053056 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017286860581745237, + "loss": 2.8219, + "theoretical_loss": 3.4071805749129935, + "tokens_seen": 2171118592 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017285857572718155, + "loss": 2.496, + "theoretical_loss": 3.407172137693729, + "tokens_seen": 2171184128 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017284854563691073, + "loss": 2.7062, + "theoretical_loss": 3.407163700800439, + "tokens_seen": 2171249664 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017283851554663994, + "loss": 2.9251, + "theoretical_loss": 3.407155264233102, + "tokens_seen": 2171315200 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017282848545636912, + "loss": 2.7007, + "theoretical_loss": 3.4071468279916948, + "tokens_seen": 2171380736 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001728184553660983, + "loss": 2.7834, + "theoretical_loss": 3.4071383920761953, + "tokens_seen": 2171446272 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017280842527582748, + "loss": 2.7097, + "theoretical_loss": 3.407129956486581, + "tokens_seen": 2171511808 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001727983951855567, + "loss": 2.7007, + "theoretical_loss": 3.4071215212228294, + "tokens_seen": 2171577344 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017278836509528587, + "loss": 2.6901, + "theoretical_loss": 3.407113086284918, + "tokens_seen": 2171642880 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017277833500501505, + "loss": 2.7558, + "theoretical_loss": 3.407104651672825, + "tokens_seen": 2171708416 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017276830491474423, + "loss": 2.9363, + "theoretical_loss": 3.4070962173865276, + "tokens_seen": 2171773952 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017275827482447342, + "loss": 2.6712, + "theoretical_loss": 3.407087783426003, + "tokens_seen": 2171839488 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017274824473420262, + "loss": 2.7397, + "theoretical_loss": 3.407079349791229, + "tokens_seen": 2171905024 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001727382146439318, + "loss": 2.731, + "theoretical_loss": 3.407070916482183, + "tokens_seen": 2171970560 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017272818455366099, + "loss": 2.7854, + "theoretical_loss": 3.4070624834988434, + "tokens_seen": 2172036096 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017271815446339017, + "loss": 2.7114, + "theoretical_loss": 3.407054050841187, + "tokens_seen": 2172101632 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017270812437311937, + "loss": 2.8054, + "theoretical_loss": 3.407045618509192, + "tokens_seen": 2172167168 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017269809428284856, + "loss": 2.8181, + "theoretical_loss": 3.4070371865028353, + "tokens_seen": 2172232704 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017268806419257774, + "loss": 2.7337, + "theoretical_loss": 3.407028754822095, + "tokens_seen": 2172298240 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017267803410230692, + "loss": 2.6873, + "theoretical_loss": 3.4070203234669485, + "tokens_seen": 2172363776 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001726680040120361, + "loss": 2.8449, + "theoretical_loss": 3.4070118924373736, + "tokens_seen": 2172429312 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001726579739217653, + "loss": 2.6152, + "theoretical_loss": 3.407003461733348, + "tokens_seen": 2172494848 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2417552, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8164074420928955, + "objective/train/theoretical_loss": 3.407001354108206, + "objective/train/tokens_used": 2192971232, + "theoretical_loss": 3.407001354108206, + "tokens_seen": 2172511232 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001726479438314945, + "loss": 2.817, + "theoretical_loss": 3.4069950313548487, + "tokens_seen": 2172560384 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017263791374122367, + "loss": 2.432, + "theoretical_loss": 3.406986601301854, + "tokens_seen": 2172625920 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017262788365095285, + "loss": 2.5548, + "theoretical_loss": 3.406978171574341, + "tokens_seen": 2172691456 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017261785356068206, + "loss": 2.6235, + "theoretical_loss": 3.4069697421722873, + "tokens_seen": 2172756992 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017260782347041124, + "loss": 2.4683, + "theoretical_loss": 3.406961313095671, + "tokens_seen": 2172822528 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017259779338014042, + "loss": 2.8263, + "theoretical_loss": 3.4069528843444696, + "tokens_seen": 2172888064 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001725877632898696, + "loss": 2.8601, + "theoretical_loss": 3.4069444559186604, + "tokens_seen": 2172953600 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017257773319959878, + "loss": 2.7532, + "theoretical_loss": 3.4069360278182215, + "tokens_seen": 2173019136 + }, + { + "epoch": 7.03, + "learning_rate": 0.000172567703109328, + "loss": 2.7987, + "theoretical_loss": 3.40692760004313, + "tokens_seen": 2173084672 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017255767301905717, + "loss": 2.6614, + "theoretical_loss": 3.406919172593364, + "tokens_seen": 2173150208 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017254764292878635, + "loss": 2.6233, + "theoretical_loss": 3.4069107454689007, + "tokens_seen": 2173215744 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017253761283851554, + "loss": 2.7504, + "theoretical_loss": 3.406902318669718, + "tokens_seen": 2173281280 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017252758274824474, + "loss": 2.7002, + "theoretical_loss": 3.4068938921957934, + "tokens_seen": 2173346816 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017251755265797392, + "loss": 2.7004, + "theoretical_loss": 3.406885466047105, + "tokens_seen": 2173412352 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001725075225677031, + "loss": 2.8941, + "theoretical_loss": 3.4068770402236295, + "tokens_seen": 2173477888 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001724974924774323, + "loss": 2.6559, + "theoretical_loss": 3.406868614725345, + "tokens_seen": 2173543424 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017248746238716147, + "loss": 2.9264, + "theoretical_loss": 3.40686018955223, + "tokens_seen": 2173608960 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017247743229689068, + "loss": 2.4264, + "theoretical_loss": 3.406851764704261, + "tokens_seen": 2173674496 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017246740220661986, + "loss": 2.6842, + "theoretical_loss": 3.4068433401814158, + "tokens_seen": 2173740032 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017245737211634904, + "loss": 2.7788, + "theoretical_loss": 3.4068349159836724, + "tokens_seen": 2173805568 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017244734202607822, + "loss": 2.7304, + "theoretical_loss": 3.4068264921110085, + "tokens_seen": 2173871104 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017243731193580743, + "loss": 2.6841, + "theoretical_loss": 3.4068180685634015, + "tokens_seen": 2173936640 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001724272818455366, + "loss": 2.8102, + "theoretical_loss": 3.406809645340829, + "tokens_seen": 2174002176 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001724172517552658, + "loss": 2.6133, + "theoretical_loss": 3.4068012224432693, + "tokens_seen": 2174067712 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017240722166499497, + "loss": 2.8133, + "theoretical_loss": 3.4067927998706993, + "tokens_seen": 2174133248 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2418178, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.524359941482544, + "objective/train/theoretical_loss": 3.406790694278334, + "objective/train/tokens_used": 2194609632, + "theoretical_loss": 3.406790694278334, + "tokens_seen": 2174149632 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017239719157472418, + "loss": 2.7252, + "theoretical_loss": 3.406784377623097, + "tokens_seen": 2174198784 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017238716148445336, + "loss": 2.7878, + "theoretical_loss": 3.4067759557004402, + "tokens_seen": 2174264320 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017237713139418254, + "loss": 2.6634, + "theoretical_loss": 3.406767534102706, + "tokens_seen": 2174329856 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017236710130391172, + "loss": 2.8591, + "theoretical_loss": 3.4067591128298726, + "tokens_seen": 2174395392 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001723570712136409, + "loss": 2.6878, + "theoretical_loss": 3.4067506918819177, + "tokens_seen": 2174460928 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001723470411233701, + "loss": 2.7846, + "theoretical_loss": 3.4067422712588185, + "tokens_seen": 2174526464 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001723370110330993, + "loss": 2.8115, + "theoretical_loss": 3.406733850960553, + "tokens_seen": 2174592000 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017232698094282847, + "loss": 2.5932, + "theoretical_loss": 3.406725430987099, + "tokens_seen": 2174657536 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017231695085255766, + "loss": 2.4564, + "theoretical_loss": 3.4067170113384337, + "tokens_seen": 2174723072 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017230692076228686, + "loss": 2.6879, + "theoretical_loss": 3.406708592014536, + "tokens_seen": 2174788608 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017229689067201607, + "loss": 2.9597, + "theoretical_loss": 3.406700173015382, + "tokens_seen": 2174854144 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017228686058174525, + "loss": 2.7908, + "theoretical_loss": 3.40669175434095, + "tokens_seen": 2174919680 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017227683049147443, + "loss": 2.6191, + "theoretical_loss": 3.406683335991218, + "tokens_seen": 2174985216 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017226680040120362, + "loss": 2.5427, + "theoretical_loss": 3.406674917966164, + "tokens_seen": 2175050752 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017225677031093282, + "loss": 2.575, + "theoretical_loss": 3.406666500265765, + "tokens_seen": 2175116288 + }, + { + "epoch": 7.03, + "learning_rate": 0.000172246740220662, + "loss": 2.952, + "theoretical_loss": 3.406658082889998, + "tokens_seen": 2175181824 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017223671013039119, + "loss": 2.6522, + "theoretical_loss": 3.4066496658388425, + "tokens_seen": 2175247360 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017222668004012037, + "loss": 2.6672, + "theoretical_loss": 3.406641249112275, + "tokens_seen": 2175312896 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017221664994984958, + "loss": 2.6896, + "theoretical_loss": 3.4066328327102733, + "tokens_seen": 2175378432 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017220661985957876, + "loss": 2.813, + "theoretical_loss": 3.4066244166328157, + "tokens_seen": 2175443968 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017219658976930794, + "loss": 2.668, + "theoretical_loss": 3.4066160008798794, + "tokens_seen": 2175509504 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017218655967903712, + "loss": 2.8252, + "theoretical_loss": 3.406607585451442, + "tokens_seen": 2175575040 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001721765295887663, + "loss": 2.7793, + "theoretical_loss": 3.406599170347482, + "tokens_seen": 2175640576 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001721664994984955, + "loss": 2.8098, + "theoretical_loss": 3.4065907555679757, + "tokens_seen": 2175706112 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001721564694082247, + "loss": 2.462, + "theoretical_loss": 3.406582341112902, + "tokens_seen": 2175771648 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2419379, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.512960195541382, + "objective/train/theoretical_loss": 3.4065802375498233, + "objective/train/tokens_used": 2196248032, + "theoretical_loss": 3.4065802375498233, + "tokens_seen": 2175788032 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017214643931795387, + "loss": 2.7043, + "theoretical_loss": 3.4065739269822384, + "tokens_seen": 2175837184 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017213640922768305, + "loss": 2.7457, + "theoretical_loss": 3.4065655131759627, + "tokens_seen": 2175902720 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017212637913741226, + "loss": 2.5919, + "theoretical_loss": 3.406557099694052, + "tokens_seen": 2175968256 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017211634904714144, + "loss": 2.6806, + "theoretical_loss": 3.4065486865364845, + "tokens_seen": 2176033792 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017210631895687062, + "loss": 2.6604, + "theoretical_loss": 3.4065402737032384, + "tokens_seen": 2176099328 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001720962888665998, + "loss": 2.6932, + "theoretical_loss": 3.4065318611942903, + "tokens_seen": 2176164864 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017208625877632898, + "loss": 2.7284, + "theoretical_loss": 3.406523449009619, + "tokens_seen": 2176230400 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001720762286860582, + "loss": 2.6897, + "theoretical_loss": 3.4065150371492012, + "tokens_seen": 2176295936 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017206619859578737, + "loss": 2.6566, + "theoretical_loss": 3.406506625613016, + "tokens_seen": 2176361472 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017205616850551655, + "loss": 2.5886, + "theoretical_loss": 3.4064982144010396, + "tokens_seen": 2176427008 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017204613841524574, + "loss": 2.6256, + "theoretical_loss": 3.406489803513251, + "tokens_seen": 2176492544 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017203610832497494, + "loss": 2.5797, + "theoretical_loss": 3.4064813929496274, + "tokens_seen": 2176558080 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017202607823470412, + "loss": 2.7255, + "theoretical_loss": 3.4064729827101465, + "tokens_seen": 2176623616 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001720160481444333, + "loss": 2.6932, + "theoretical_loss": 3.406464572794786, + "tokens_seen": 2176689152 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001720060180541625, + "loss": 2.449, + "theoretical_loss": 3.406456163203524, + "tokens_seen": 2176754688 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017199598796389167, + "loss": 2.3505, + "theoretical_loss": 3.406447753936338, + "tokens_seen": 2176820224 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017198595787362088, + "loss": 2.9087, + "theoretical_loss": 3.4064393449932058, + "tokens_seen": 2176885760 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017197592778335006, + "loss": 2.5905, + "theoretical_loss": 3.4064309363741048, + "tokens_seen": 2176951296 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017196589769307924, + "loss": 2.5813, + "theoretical_loss": 3.4064225280790135, + "tokens_seen": 2177016832 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017195586760280842, + "loss": 2.558, + "theoretical_loss": 3.4064141201079092, + "tokens_seen": 2177082368 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017194583751253763, + "loss": 2.7218, + "theoretical_loss": 3.40640571246077, + "tokens_seen": 2177147904 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001719358074222668, + "loss": 2.6796, + "theoretical_loss": 3.406397305137573, + "tokens_seen": 2177213440 + }, + { + "epoch": 7.03, + "learning_rate": 0.000171925777331996, + "loss": 2.6738, + "theoretical_loss": 3.406388898138297, + "tokens_seen": 2177278976 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017191574724172517, + "loss": 2.562, + "theoretical_loss": 3.4063804914629183, + "tokens_seen": 2177344512 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017190571715145438, + "loss": 2.5498, + "theoretical_loss": 3.4063720851114163, + "tokens_seen": 2177410048 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2420039, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.81219482421875, + "objective/train/theoretical_loss": 3.4063699835741437, + "objective/train/tokens_used": 2197886432, + "theoretical_loss": 3.4063699835741437, + "tokens_seen": 2177426432 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017189568706118356, + "loss": 2.6393, + "theoretical_loss": 3.4063636790837677, + "tokens_seen": 2177475584 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017188565697091274, + "loss": 2.707, + "theoretical_loss": 3.4063552733799503, + "tokens_seen": 2177541120 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017187562688064192, + "loss": 2.9957, + "theoretical_loss": 3.4063468679999422, + "tokens_seen": 2177606656 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001718655967903711, + "loss": 2.7062, + "theoretical_loss": 3.4063384629437214, + "tokens_seen": 2177672192 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001718555667001003, + "loss": 2.8179, + "theoretical_loss": 3.406330058211265, + "tokens_seen": 2177737728 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001718455366098295, + "loss": 2.8567, + "theoretical_loss": 3.406321653802552, + "tokens_seen": 2177803264 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017183550651955867, + "loss": 2.6001, + "theoretical_loss": 3.4063132497175586, + "tokens_seen": 2177868800 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017182547642928786, + "loss": 2.6072, + "theoretical_loss": 3.4063048459562637, + "tokens_seen": 2177934336 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017181544633901706, + "loss": 2.5299, + "theoretical_loss": 3.406296442518645, + "tokens_seen": 2177999872 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017180541624874625, + "loss": 2.855, + "theoretical_loss": 3.4062880394046795, + "tokens_seen": 2178065408 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017179538615847543, + "loss": 2.6596, + "theoretical_loss": 3.406279636614346, + "tokens_seen": 2178130944 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001717853560682046, + "loss": 2.7667, + "theoretical_loss": 3.406271234147622, + "tokens_seen": 2178196480 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001717753259779338, + "loss": 2.642, + "theoretical_loss": 3.4062628320044848, + "tokens_seen": 2178262016 + }, + { + "epoch": 7.03, + "learning_rate": 0.000171765295887663, + "loss": 2.9291, + "theoretical_loss": 3.4062544301849127, + "tokens_seen": 2178327552 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017175526579739218, + "loss": 2.6525, + "theoretical_loss": 3.4062460286888836, + "tokens_seen": 2178393088 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017174523570712136, + "loss": 2.5964, + "theoretical_loss": 3.406237627516375, + "tokens_seen": 2178458624 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017173520561685054, + "loss": 2.6352, + "theoretical_loss": 3.406229226667364, + "tokens_seen": 2178524160 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017172517552657975, + "loss": 2.8153, + "theoretical_loss": 3.40622082614183, + "tokens_seen": 2178589696 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017171514543630893, + "loss": 2.7204, + "theoretical_loss": 3.40621242593975, + "tokens_seen": 2178655232 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001717051153460381, + "loss": 2.4631, + "theoretical_loss": 3.4062040260611015, + "tokens_seen": 2178720768 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001716950852557673, + "loss": 2.6459, + "theoretical_loss": 3.406195626505863, + "tokens_seen": 2178786304 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017168505516549647, + "loss": 2.6511, + "theoretical_loss": 3.4061872272740117, + "tokens_seen": 2178851840 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017167502507522568, + "loss": 2.3384, + "theoretical_loss": 3.406178828365526, + "tokens_seen": 2178917376 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017166499498495486, + "loss": 2.7383, + "theoretical_loss": 3.406170429780383, + "tokens_seen": 2178982912 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017165496489468404, + "loss": 2.5376, + "theoretical_loss": 3.4061620315185612, + "tokens_seen": 2179048448 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2423691, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4639205932617188, + "objective/train/theoretical_loss": 3.4061599320036224, + "objective/train/tokens_used": 2199524832, + "theoretical_loss": 3.4061599320036224, + "tokens_seen": 2179064832 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017164493480441322, + "loss": 2.6913, + "theoretical_loss": 3.4061536335800384, + "tokens_seen": 2179113984 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017163490471414243, + "loss": 2.7721, + "theoretical_loss": 3.406145235964792, + "tokens_seen": 2179179520 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017162487462387161, + "loss": 2.4005, + "theoretical_loss": 3.4061368386728, + "tokens_seen": 2179245056 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001716148445336008, + "loss": 2.7406, + "theoretical_loss": 3.4061284417040403, + "tokens_seen": 2179310592 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017160481444332998, + "loss": 2.6665, + "theoretical_loss": 3.4061200450584908, + "tokens_seen": 2179376128 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017159478435305916, + "loss": 2.7642, + "theoretical_loss": 3.406111648736129, + "tokens_seen": 2179441664 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017158475426278837, + "loss": 2.6683, + "theoretical_loss": 3.4061032527369335, + "tokens_seen": 2179507200 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017157472417251755, + "loss": 2.4823, + "theoretical_loss": 3.4060948570608813, + "tokens_seen": 2179572736 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017156469408224673, + "loss": 2.6754, + "theoretical_loss": 3.4060864617079503, + "tokens_seen": 2179638272 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001715546639919759, + "loss": 2.6414, + "theoretical_loss": 3.406078066678119, + "tokens_seen": 2179703808 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017154463390170514, + "loss": 2.5888, + "theoretical_loss": 3.4060696719713652, + "tokens_seen": 2179769344 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017153460381143433, + "loss": 2.6578, + "theoretical_loss": 3.406061277587666, + "tokens_seen": 2179834880 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001715245737211635, + "loss": 2.6502, + "theoretical_loss": 3.406052883527, + "tokens_seen": 2179900416 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001715145436308927, + "loss": 2.5036, + "theoretical_loss": 3.4060444897893443, + "tokens_seen": 2179965952 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017150451354062187, + "loss": 2.6583, + "theoretical_loss": 3.4060360963746774, + "tokens_seen": 2180031488 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017149448345035108, + "loss": 2.7375, + "theoretical_loss": 3.4060277032829775, + "tokens_seen": 2180097024 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017148445336008026, + "loss": 2.6991, + "theoretical_loss": 3.406019310514221, + "tokens_seen": 2180162560 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017147442326980944, + "loss": 2.5516, + "theoretical_loss": 3.4060109180683877, + "tokens_seen": 2180228096 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017146439317953862, + "loss": 2.7587, + "theoretical_loss": 3.406002525945454, + "tokens_seen": 2180293632 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017145436308926783, + "loss": 2.6834, + "theoretical_loss": 3.405994134145398, + "tokens_seen": 2180359168 + }, + { + "epoch": 7.03, + "learning_rate": 0.000171444332998997, + "loss": 2.7574, + "theoretical_loss": 3.4059857426681983, + "tokens_seen": 2180424704 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001714343029087262, + "loss": 2.8516, + "theoretical_loss": 3.4059773515138323, + "tokens_seen": 2180490240 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017142427281845537, + "loss": 2.7291, + "theoretical_loss": 3.4059689606822774, + "tokens_seen": 2180555776 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017141424272818455, + "loss": 2.6277, + "theoretical_loss": 3.4059605701735123, + "tokens_seen": 2180621312 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017140421263791376, + "loss": 2.8033, + "theoretical_loss": 3.4059521799875148, + "tokens_seen": 2180686848 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2428968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8116869926452637, + "objective/train/theoretical_loss": 3.405950082491445, + "objective/train/tokens_used": 2201163232, + "theoretical_loss": 3.405950082491445, + "tokens_seen": 2180703232 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017139418254764294, + "loss": 2.5361, + "theoretical_loss": 3.405943790124262, + "tokens_seen": 2180752384 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017138415245737212, + "loss": 2.6895, + "theoretical_loss": 3.405935400583733, + "tokens_seen": 2180817920 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001713741223671013, + "loss": 2.7182, + "theoretical_loss": 3.4059270113659044, + "tokens_seen": 2180883456 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001713640922768305, + "loss": 2.8269, + "theoretical_loss": 3.4059186224707547, + "tokens_seen": 2180948992 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001713540621865597, + "loss": 2.5077, + "theoretical_loss": 3.405910233898262, + "tokens_seen": 2181014528 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017134403209628887, + "loss": 2.8396, + "theoretical_loss": 3.405901845648404, + "tokens_seen": 2181080064 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017133400200601806, + "loss": 2.8007, + "theoretical_loss": 3.4058934577211586, + "tokens_seen": 2181145600 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017132397191574726, + "loss": 2.8151, + "theoretical_loss": 3.4058850701165038, + "tokens_seen": 2181211136 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017131394182547645, + "loss": 2.7426, + "theoretical_loss": 3.405876682834417, + "tokens_seen": 2181276672 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017130391173520563, + "loss": 2.6608, + "theoretical_loss": 3.4058682958748765, + "tokens_seen": 2181342208 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001712938816449348, + "loss": 2.6709, + "theoretical_loss": 3.4058599092378605, + "tokens_seen": 2181407744 + }, + { + "epoch": 7.03, + "learning_rate": 0.000171283851554664, + "loss": 2.5398, + "theoretical_loss": 3.4058515229233466, + "tokens_seen": 2181473280 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001712738214643932, + "loss": 2.7622, + "theoretical_loss": 3.4058431369313125, + "tokens_seen": 2181538816 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017126379137412238, + "loss": 2.6424, + "theoretical_loss": 3.4058347512617364, + "tokens_seen": 2181604352 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017125376128385156, + "loss": 2.4858, + "theoretical_loss": 3.405826365914596, + "tokens_seen": 2181669888 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017124373119358074, + "loss": 2.7455, + "theoretical_loss": 3.4058179808898696, + "tokens_seen": 2181735424 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017123370110330995, + "loss": 2.4358, + "theoretical_loss": 3.4058095961875345, + "tokens_seen": 2181800960 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017122367101303913, + "loss": 2.7801, + "theoretical_loss": 3.4058012118075696, + "tokens_seen": 2181866496 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001712136409227683, + "loss": 2.6842, + "theoretical_loss": 3.405792827749952, + "tokens_seen": 2181932032 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001712036108324975, + "loss": 2.6182, + "theoretical_loss": 3.4057844440146594, + "tokens_seen": 2181997568 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017119358074222667, + "loss": 2.6557, + "theoretical_loss": 3.4057760606016707, + "tokens_seen": 2182063104 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017118355065195588, + "loss": 2.74, + "theoretical_loss": 3.405767677510963, + "tokens_seen": 2182128640 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017117352056168506, + "loss": 2.816, + "theoretical_loss": 3.4057592947425146, + "tokens_seen": 2182194176 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017116349047141424, + "loss": 2.511, + "theoretical_loss": 3.4057509122963032, + "tokens_seen": 2182259712 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017115346038114342, + "loss": 2.556, + "theoretical_loss": 3.405742530172307, + "tokens_seen": 2182325248 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2433838, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7705180644989014, + "objective/train/theoretical_loss": 3.4057404346916518, + "objective/train/tokens_used": 2202801632, + "theoretical_loss": 3.4057404346916518, + "tokens_seen": 2182341632 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017114343029087263, + "loss": 2.6836, + "theoretical_loss": 3.405734148370504, + "tokens_seen": 2182390784 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017113340020060181, + "loss": 2.5067, + "theoretical_loss": 3.405725766890872, + "tokens_seen": 2182456320 + }, + { + "epoch": 7.03, + "learning_rate": 0.000171123370110331, + "loss": 2.5808, + "theoretical_loss": 3.405717385733389, + "tokens_seen": 2182521856 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017111334002006018, + "loss": 2.6608, + "theoretical_loss": 3.405709004898033, + "tokens_seen": 2182587392 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017110330992978936, + "loss": 2.6806, + "theoretical_loss": 3.405700624384781, + "tokens_seen": 2182652928 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017109327983951857, + "loss": 2.6774, + "theoretical_loss": 3.4056922441936126, + "tokens_seen": 2182718464 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017108324974924775, + "loss": 2.5753, + "theoretical_loss": 3.4056838643245047, + "tokens_seen": 2182784000 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017107321965897693, + "loss": 2.79, + "theoretical_loss": 3.405675484777435, + "tokens_seen": 2182849536 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001710631895687061, + "loss": 2.6314, + "theoretical_loss": 3.4056671055523826, + "tokens_seen": 2182915072 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017105315947843532, + "loss": 2.5858, + "theoretical_loss": 3.4056587266493246, + "tokens_seen": 2182980608 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001710431293881645, + "loss": 2.6777, + "theoretical_loss": 3.4056503480682387, + "tokens_seen": 2183046144 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017103309929789368, + "loss": 2.7534, + "theoretical_loss": 3.405641969809104, + "tokens_seen": 2183111680 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017102306920762286, + "loss": 2.7194, + "theoretical_loss": 3.405633591871897, + "tokens_seen": 2183177216 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017101303911735204, + "loss": 2.7716, + "theoretical_loss": 3.405625214256597, + "tokens_seen": 2183242752 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017100300902708125, + "loss": 2.5834, + "theoretical_loss": 3.4056168369631816, + "tokens_seen": 2183308288 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017099297893681043, + "loss": 2.6627, + "theoretical_loss": 3.4056084599916283, + "tokens_seen": 2183373824 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001709829488465396, + "loss": 2.5507, + "theoretical_loss": 3.4056000833419153, + "tokens_seen": 2183439360 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001709729187562688, + "loss": 2.8618, + "theoretical_loss": 3.4055917070140205, + "tokens_seen": 2183504896 + }, + { + "epoch": 7.03, + "learning_rate": 0.000170962888665998, + "loss": 2.6766, + "theoretical_loss": 3.4055833310079224, + "tokens_seen": 2183570432 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017095285857572718, + "loss": 2.8023, + "theoretical_loss": 3.405574955323598, + "tokens_seen": 2183635968 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017094282848545636, + "loss": 2.7716, + "theoretical_loss": 3.405566579961026, + "tokens_seen": 2183701504 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017093279839518554, + "loss": 2.3896, + "theoretical_loss": 3.405558204920185, + "tokens_seen": 2183767040 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017092276830491473, + "loss": 2.7451, + "theoretical_loss": 3.4055498302010516, + "tokens_seen": 2183832576 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017091273821464393, + "loss": 2.4879, + "theoretical_loss": 3.4055414558036046, + "tokens_seen": 2183898112 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017090270812437312, + "loss": 2.8447, + "theoretical_loss": 3.4055330817278215, + "tokens_seen": 2183963648 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2436298, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8084614276885986, + "objective/train/theoretical_loss": 3.4055309882591334, + "objective/train/tokens_used": 2204440032, + "theoretical_loss": 3.4055309882591334, + "tokens_seen": 2183980032 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001708926780341023, + "loss": 2.793, + "theoretical_loss": 3.4055247079736812, + "tokens_seen": 2184029184 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017088264794383148, + "loss": 2.8939, + "theoretical_loss": 3.4055163345411605, + "tokens_seen": 2184094720 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017087261785356069, + "loss": 2.6107, + "theoretical_loss": 3.4055079614302386, + "tokens_seen": 2184160256 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017086258776328987, + "loss": 2.7291, + "theoretical_loss": 3.4054995886408923, + "tokens_seen": 2184225792 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017085255767301905, + "loss": 2.6644, + "theoretical_loss": 3.405491216173101, + "tokens_seen": 2184291328 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017084252758274823, + "loss": 2.6251, + "theoretical_loss": 3.4054828440268414, + "tokens_seen": 2184356864 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001708324974924774, + "loss": 2.6273, + "theoretical_loss": 3.4054744722020915, + "tokens_seen": 2184422400 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017082246740220662, + "loss": 2.8744, + "theoretical_loss": 3.4054661006988307, + "tokens_seen": 2184487936 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001708124373119358, + "loss": 2.5519, + "theoretical_loss": 3.4054577295170354, + "tokens_seen": 2184553472 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017080240722166498, + "loss": 2.6364, + "theoretical_loss": 3.405449358656685, + "tokens_seen": 2184619008 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001707923771313942, + "loss": 2.8119, + "theoretical_loss": 3.4054409881177565, + "tokens_seen": 2184684544 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001707823470411234, + "loss": 2.7427, + "theoretical_loss": 3.4054326179002286, + "tokens_seen": 2184750080 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017077231695085258, + "loss": 2.6103, + "theoretical_loss": 3.4054242480040786, + "tokens_seen": 2184815616 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017076228686058176, + "loss": 2.6628, + "theoretical_loss": 3.4054158784292854, + "tokens_seen": 2184881152 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017075225677031094, + "loss": 2.6704, + "theoretical_loss": 3.4054075091758262, + "tokens_seen": 2184946688 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017074222668004015, + "loss": 2.4788, + "theoretical_loss": 3.405399140243679, + "tokens_seen": 2185012224 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017073219658976933, + "loss": 2.4024, + "theoretical_loss": 3.4053907716328227, + "tokens_seen": 2185077760 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001707221664994985, + "loss": 2.7431, + "theoretical_loss": 3.4053824033432347, + "tokens_seen": 2185143296 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001707121364092277, + "loss": 2.6823, + "theoretical_loss": 3.4053740353748934, + "tokens_seen": 2185208832 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017070210631895687, + "loss": 2.7335, + "theoretical_loss": 3.405365667727776, + "tokens_seen": 2185274368 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017069207622868608, + "loss": 2.6007, + "theoretical_loss": 3.4053573004018616, + "tokens_seen": 2185339904 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017068204613841526, + "loss": 2.5925, + "theoretical_loss": 3.4053489333971276, + "tokens_seen": 2185405440 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017067201604814444, + "loss": 2.5822, + "theoretical_loss": 3.4053405667135523, + "tokens_seen": 2185470976 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017066198595787362, + "loss": 2.6442, + "theoretical_loss": 3.4053322003511135, + "tokens_seen": 2185536512 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017065195586760283, + "loss": 2.66, + "theoretical_loss": 3.4053238343097894, + "tokens_seen": 2185602048 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2441473, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6622517108917236, + "objective/train/theoretical_loss": 3.40532174284963, + "objective/train/tokens_used": 2206078432, + "theoretical_loss": 3.40532174284963, + "tokens_seen": 2185618432 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017064192577733201, + "loss": 2.7217, + "theoretical_loss": 3.405315468589558, + "tokens_seen": 2185667584 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001706318956870612, + "loss": 2.777, + "theoretical_loss": 3.405307103190398, + "tokens_seen": 2185733120 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017062186559679038, + "loss": 2.5742, + "theoretical_loss": 3.405298738112286, + "tokens_seen": 2185798656 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017061183550651956, + "loss": 2.6928, + "theoretical_loss": 3.4052903733552013, + "tokens_seen": 2185864192 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017060180541624877, + "loss": 2.6821, + "theoretical_loss": 3.4052820089191216, + "tokens_seen": 2185929728 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017059177532597795, + "loss": 2.6711, + "theoretical_loss": 3.4052736448040246, + "tokens_seen": 2185995264 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017058174523570713, + "loss": 2.5533, + "theoretical_loss": 3.405265281009889, + "tokens_seen": 2186060800 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001705717151454363, + "loss": 2.6077, + "theoretical_loss": 3.4052569175366925, + "tokens_seen": 2186126336 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017056168505516552, + "loss": 2.3686, + "theoretical_loss": 3.405248554384413, + "tokens_seen": 2186191872 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001705516549648947, + "loss": 2.5465, + "theoretical_loss": 3.4052401915530286, + "tokens_seen": 2186257408 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017054162487462388, + "loss": 2.7368, + "theoretical_loss": 3.405231829042518, + "tokens_seen": 2186322944 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017053159478435306, + "loss": 2.5824, + "theoretical_loss": 3.4052234668528585, + "tokens_seen": 2186388480 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017052156469408224, + "loss": 2.5025, + "theoretical_loss": 3.4052151049840287, + "tokens_seen": 2186454016 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017051153460381145, + "loss": 2.5839, + "theoretical_loss": 3.405206743436006, + "tokens_seen": 2186519552 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017050150451354063, + "loss": 2.4682, + "theoretical_loss": 3.405198382208769, + "tokens_seen": 2186585088 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001704914744232698, + "loss": 2.7268, + "theoretical_loss": 3.405190021302296, + "tokens_seen": 2186650624 + }, + { + "epoch": 7.03, + "learning_rate": 0.000170481444332999, + "loss": 2.765, + "theoretical_loss": 3.405181660716565, + "tokens_seen": 2186716160 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001704714142427282, + "loss": 2.7023, + "theoretical_loss": 3.405173300451553, + "tokens_seen": 2186781696 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017046138415245738, + "loss": 2.6825, + "theoretical_loss": 3.4051649405072397, + "tokens_seen": 2186847232 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017045135406218656, + "loss": 2.6172, + "theoretical_loss": 3.4051565808836024, + "tokens_seen": 2186912768 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017044132397191574, + "loss": 2.7249, + "theoretical_loss": 3.405148221580619, + "tokens_seen": 2186978304 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017043129388164493, + "loss": 2.7173, + "theoretical_loss": 3.4051398625982676, + "tokens_seen": 2187043840 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017042126379137413, + "loss": 2.5586, + "theoretical_loss": 3.405131503936527, + "tokens_seen": 2187109376 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017041123370110332, + "loss": 2.5364, + "theoretical_loss": 3.4051231455953745, + "tokens_seen": 2187174912 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001704012036108325, + "loss": 2.6742, + "theoretical_loss": 3.4051147875747887, + "tokens_seen": 2187240448 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2442561, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.914799928665161, + "objective/train/theoretical_loss": 3.405112698119728, + "objective/train/tokens_used": 2207716832, + "theoretical_loss": 3.405112698119728, + "tokens_seen": 2187256832 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017039117352056168, + "loss": 2.6864, + "theoretical_loss": 3.4051064298747473, + "tokens_seen": 2187305984 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017038114343029089, + "loss": 2.549, + "theoretical_loss": 3.4050980724952287, + "tokens_seen": 2187371520 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017037111334002007, + "loss": 2.7992, + "theoretical_loss": 3.405089715436211, + "tokens_seen": 2187437056 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017036108324974925, + "loss": 2.5015, + "theoretical_loss": 3.405081358697672, + "tokens_seen": 2187502592 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017035105315947843, + "loss": 2.529, + "theoretical_loss": 3.40507300227959, + "tokens_seen": 2187568128 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001703410230692076, + "loss": 2.53, + "theoretical_loss": 3.405064646181944, + "tokens_seen": 2187633664 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017033099297893682, + "loss": 2.4696, + "theoretical_loss": 3.40505629040471, + "tokens_seen": 2187699200 + }, + { + "epoch": 7.03, + "learning_rate": 0.000170320962888666, + "loss": 2.4772, + "theoretical_loss": 3.405047934947868, + "tokens_seen": 2187764736 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017031093279839518, + "loss": 2.5849, + "theoretical_loss": 3.4050395798113957, + "tokens_seen": 2187830272 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017030090270812436, + "loss": 2.5613, + "theoretical_loss": 3.4050312249952706, + "tokens_seen": 2187895808 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017029087261785357, + "loss": 2.5915, + "theoretical_loss": 3.4050228704994714, + "tokens_seen": 2187961344 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017028084252758275, + "loss": 2.7027, + "theoretical_loss": 3.4050145163239764, + "tokens_seen": 2188026880 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017027081243731193, + "loss": 2.5124, + "theoretical_loss": 3.405006162468763, + "tokens_seen": 2188092416 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001702607823470411, + "loss": 2.5143, + "theoretical_loss": 3.4049978089338095, + "tokens_seen": 2188157952 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017025075225677032, + "loss": 2.7057, + "theoretical_loss": 3.404989455719095, + "tokens_seen": 2188223488 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001702407221664995, + "loss": 2.7247, + "theoretical_loss": 3.404981102824596, + "tokens_seen": 2188289024 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017023069207622868, + "loss": 2.7462, + "theoretical_loss": 3.404972750250292, + "tokens_seen": 2188354560 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017022066198595786, + "loss": 2.6133, + "theoretical_loss": 3.4049643979961606, + "tokens_seen": 2188420096 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017021063189568705, + "loss": 2.6348, + "theoretical_loss": 3.40495604606218, + "tokens_seen": 2188485632 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017020060180541625, + "loss": 2.6567, + "theoretical_loss": 3.4049476944483286, + "tokens_seen": 2188551168 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017019057171514544, + "loss": 2.7619, + "theoretical_loss": 3.4049393431545836, + "tokens_seen": 2188616704 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017018054162487462, + "loss": 2.7434, + "theoretical_loss": 3.4049309921809243, + "tokens_seen": 2188682240 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001701705115346038, + "loss": 2.5113, + "theoretical_loss": 3.4049226415273286, + "tokens_seen": 2188747776 + }, + { + "epoch": 7.03, + "learning_rate": 0.000170160481444333, + "loss": 2.6131, + "theoretical_loss": 3.404914291193774, + "tokens_seen": 2188813312 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001701504513540622, + "loss": 2.7786, + "theoretical_loss": 3.4049059411802394, + "tokens_seen": 2188878848 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2443260, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.345069169998169, + "objective/train/theoretical_loss": 3.4049038537268563, + "objective/train/tokens_used": 2209355232, + "theoretical_loss": 3.4049038537268563, + "tokens_seen": 2188895232 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017014042126379137, + "loss": 2.791, + "theoretical_loss": 3.4048975914867023, + "tokens_seen": 2188944384 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017013039117352055, + "loss": 2.6914, + "theoretical_loss": 3.4048892421131414, + "tokens_seen": 2189009920 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017012036108324973, + "loss": 2.6163, + "theoretical_loss": 3.4048808930595347, + "tokens_seen": 2189075456 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017011033099297894, + "loss": 2.6019, + "theoretical_loss": 3.40487254432586, + "tokens_seen": 2189140992 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017010030090270812, + "loss": 2.6604, + "theoretical_loss": 3.4048641959120958, + "tokens_seen": 2189206528 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001700902708124373, + "loss": 2.7422, + "theoretical_loss": 3.4048558478182205, + "tokens_seen": 2189272064 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017008024072216648, + "loss": 2.804, + "theoretical_loss": 3.404847500044212, + "tokens_seen": 2189337600 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001700702106318957, + "loss": 2.726, + "theoretical_loss": 3.404839152590048, + "tokens_seen": 2189403136 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017006018054162487, + "loss": 2.5844, + "theoretical_loss": 3.404830805455708, + "tokens_seen": 2189468672 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017005015045135405, + "loss": 2.7247, + "theoretical_loss": 3.404822458641169, + "tokens_seen": 2189534208 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017004012036108326, + "loss": 2.5824, + "theoretical_loss": 3.4048141121464086, + "tokens_seen": 2189599744 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017003009027081244, + "loss": 2.6017, + "theoretical_loss": 3.404805765971407, + "tokens_seen": 2189665280 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017002006018054165, + "loss": 2.8634, + "theoretical_loss": 3.404797420116141, + "tokens_seen": 2189730816 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017001003009027083, + "loss": 2.5665, + "theoretical_loss": 3.4047890745805884, + "tokens_seen": 2189796352 + }, + { + "epoch": 7.03, + "learning_rate": 0.00017, + "loss": 2.6352, + "theoretical_loss": 3.4047807293647288, + "tokens_seen": 2189861888 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001699899699097292, + "loss": 2.9747, + "theoretical_loss": 3.404772384468539, + "tokens_seen": 2189927424 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001699799398194584, + "loss": 2.7916, + "theoretical_loss": 3.4047640398919983, + "tokens_seen": 2189992960 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016996990972918758, + "loss": 2.8333, + "theoretical_loss": 3.404755695635084, + "tokens_seen": 2190058496 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016995987963891676, + "loss": 2.7403, + "theoretical_loss": 3.404747351697775, + "tokens_seen": 2190124032 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016994984954864594, + "loss": 2.5412, + "theoretical_loss": 3.404739008080049, + "tokens_seen": 2190189568 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016993981945837513, + "loss": 2.5278, + "theoretical_loss": 3.4047306647818845, + "tokens_seen": 2190255104 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016992978936810433, + "loss": 2.4758, + "theoretical_loss": 3.404722321803259, + "tokens_seen": 2190320640 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016991975927783352, + "loss": 2.6285, + "theoretical_loss": 3.404713979144152, + "tokens_seen": 2190386176 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001699097291875627, + "loss": 2.5944, + "theoretical_loss": 3.4047056368045405, + "tokens_seen": 2190451712 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016989969909729188, + "loss": 2.73, + "theoretical_loss": 3.4046972947844036, + "tokens_seen": 2190517248 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2444516, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0443437099456787, + "objective/train/theoretical_loss": 3.404695209329285, + "objective/train/tokens_used": 2210993632, + "theoretical_loss": 3.404695209329285, + "tokens_seen": 2190533632 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016988966900702109, + "loss": 2.822, + "theoretical_loss": 3.4046889530837188, + "tokens_seen": 2190582784 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016987963891675027, + "loss": 2.5296, + "theoretical_loss": 3.4046806117024646, + "tokens_seen": 2190648320 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016986960882647945, + "loss": 2.579, + "theoretical_loss": 3.4046722706406194, + "tokens_seen": 2190713856 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016985957873620863, + "loss": 2.7152, + "theoretical_loss": 3.4046639298981614, + "tokens_seen": 2190779392 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001698495486459378, + "loss": 2.772, + "theoretical_loss": 3.4046555894750683, + "tokens_seen": 2190844928 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016983951855566702, + "loss": 2.7255, + "theoretical_loss": 3.4046472493713185, + "tokens_seen": 2190910464 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001698294884653962, + "loss": 2.4884, + "theoretical_loss": 3.4046389095868905, + "tokens_seen": 2190976000 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016981945837512538, + "loss": 2.6286, + "theoretical_loss": 3.4046305701217627, + "tokens_seen": 2191041536 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016980942828485456, + "loss": 2.5881, + "theoretical_loss": 3.404622230975913, + "tokens_seen": 2191107072 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016979939819458377, + "loss": 2.5778, + "theoretical_loss": 3.404613892149319, + "tokens_seen": 2191172608 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016978936810431295, + "loss": 2.6923, + "theoretical_loss": 3.4046055536419604, + "tokens_seen": 2191238144 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016977933801404213, + "loss": 2.7194, + "theoretical_loss": 3.4045972154538147, + "tokens_seen": 2191303680 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001697693079237713, + "loss": 2.6243, + "theoretical_loss": 3.4045888775848594, + "tokens_seen": 2191369216 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016975927783350052, + "loss": 2.6113, + "theoretical_loss": 3.4045805400350737, + "tokens_seen": 2191434752 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001697492477432297, + "loss": 2.8532, + "theoretical_loss": 3.4045722028044354, + "tokens_seen": 2191500288 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016973921765295888, + "loss": 2.4322, + "theoretical_loss": 3.404563865892923, + "tokens_seen": 2191565824 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016972918756268807, + "loss": 2.2768, + "theoretical_loss": 3.4045555293005147, + "tokens_seen": 2191631360 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016971915747241725, + "loss": 2.7925, + "theoretical_loss": 3.404547193027188, + "tokens_seen": 2191696896 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016970912738214645, + "loss": 2.7215, + "theoretical_loss": 3.4045388570729225, + "tokens_seen": 2191762432 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016969909729187564, + "loss": 2.5702, + "theoretical_loss": 3.404530521437696, + "tokens_seen": 2191827968 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016968906720160482, + "loss": 2.8867, + "theoretical_loss": 3.404522186121486, + "tokens_seen": 2191893504 + }, + { + "epoch": 7.03, + "learning_rate": 0.000169679037111334, + "loss": 2.9948, + "theoretical_loss": 3.404513851124271, + "tokens_seen": 2191959040 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001696690070210632, + "loss": 2.5146, + "theoretical_loss": 3.40450551644603, + "tokens_seen": 2192024576 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001696589769307924, + "loss": 2.4715, + "theoretical_loss": 3.4044971820867405, + "tokens_seen": 2192090112 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016964894684052157, + "loss": 2.6907, + "theoretical_loss": 3.4044888480463813, + "tokens_seen": 2192155648 + }, + { + "epoch": 7.03, + "objective/train/docs_used": 2445201, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.247011423110962, + "objective/train/theoretical_loss": 3.4044867645861214, + "objective/train/tokens_used": 2212632032, + "theoretical_loss": 3.4044867645861214, + "tokens_seen": 2192172032 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016963891675025075, + "loss": 2.4854, + "theoretical_loss": 3.40448051432493, + "tokens_seen": 2192221184 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016962888665997993, + "loss": 2.744, + "theoretical_loss": 3.4044721809223653, + "tokens_seen": 2192286720 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016961885656970914, + "loss": 2.5372, + "theoretical_loss": 3.4044638478386657, + "tokens_seen": 2192352256 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016960882647943832, + "loss": 2.6591, + "theoretical_loss": 3.404455515073809, + "tokens_seen": 2192417792 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001695987963891675, + "loss": 2.4238, + "theoretical_loss": 3.4044471826277736, + "tokens_seen": 2192483328 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016958876629889668, + "loss": 2.4842, + "theoretical_loss": 3.4044388505005383, + "tokens_seen": 2192548864 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001695787362086259, + "loss": 2.5719, + "theoretical_loss": 3.4044305186920805, + "tokens_seen": 2192614400 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016956870611835507, + "loss": 2.7274, + "theoretical_loss": 3.4044221872023788, + "tokens_seen": 2192679936 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016955867602808425, + "loss": 2.8612, + "theoretical_loss": 3.404413856031412, + "tokens_seen": 2192745472 + }, + { + "epoch": 7.03, + "learning_rate": 0.00016954864593781343, + "loss": 2.6268, + "theoretical_loss": 3.404405525179157, + "tokens_seen": 2192811008 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016953861584754261, + "loss": 2.6612, + "theoretical_loss": 3.4043971946455938, + "tokens_seen": 2192876544 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016952858575727182, + "loss": 2.4999, + "theoretical_loss": 3.4043888644307, + "tokens_seen": 2192942080 + }, + { + "epoch": 7.04, + "learning_rate": 0.000169518555667001, + "loss": 2.4474, + "theoretical_loss": 3.4043805345344533, + "tokens_seen": 2193007616 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016950852557673019, + "loss": 2.4193, + "theoretical_loss": 3.4043722049568323, + "tokens_seen": 2193073152 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016949849548645937, + "loss": 2.7628, + "theoretical_loss": 3.404363875697816, + "tokens_seen": 2193138688 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016948846539618857, + "loss": 2.5125, + "theoretical_loss": 3.4043555467573823, + "tokens_seen": 2193204224 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016947843530591776, + "loss": 2.4539, + "theoretical_loss": 3.404347218135509, + "tokens_seen": 2193269760 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016946840521564694, + "loss": 2.602, + "theoretical_loss": 3.4043388898321747, + "tokens_seen": 2193335296 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016945837512537612, + "loss": 2.691, + "theoretical_loss": 3.404330561847358, + "tokens_seen": 2193400832 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001694483450351053, + "loss": 2.637, + "theoretical_loss": 3.404322234181037, + "tokens_seen": 2193466368 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001694383149448345, + "loss": 2.5846, + "theoretical_loss": 3.4043139068331896, + "tokens_seen": 2193531904 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001694282848545637, + "loss": 2.8032, + "theoretical_loss": 3.4043055798037947, + "tokens_seen": 2193597440 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016941825476429287, + "loss": 2.7005, + "theoretical_loss": 3.4042972530928304, + "tokens_seen": 2193662976 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016940822467402205, + "loss": 2.8874, + "theoretical_loss": 3.404288926700275, + "tokens_seen": 2193728512 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016939819458375126, + "loss": 2.6652, + "theoretical_loss": 3.4042806006261066, + "tokens_seen": 2193794048 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2446519, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.953860282897949, + "objective/train/theoretical_loss": 3.40427851915731, + "objective/train/tokens_used": 2214270432, + "theoretical_loss": 3.40427851915731, + "tokens_seen": 2193810432 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016938816449348044, + "loss": 2.6521, + "theoretical_loss": 3.404272274870304, + "tokens_seen": 2193859584 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016937813440320962, + "loss": 2.4085, + "theoretical_loss": 3.404263949432845, + "tokens_seen": 2193925120 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001693681043129388, + "loss": 2.8485, + "theoretical_loss": 3.4042556243137083, + "tokens_seen": 2193990656 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016935807422266798, + "loss": 2.3354, + "theoretical_loss": 3.404247299512872, + "tokens_seen": 2194056192 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001693480441323972, + "loss": 2.6869, + "theoretical_loss": 3.4042389750303146, + "tokens_seen": 2194121728 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016933801404212637, + "loss": 2.7472, + "theoretical_loss": 3.404230650866014, + "tokens_seen": 2194187264 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016932798395185555, + "loss": 2.7508, + "theoretical_loss": 3.404222327019949, + "tokens_seen": 2194252800 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016931795386158473, + "loss": 2.6946, + "theoretical_loss": 3.404214003492098, + "tokens_seen": 2194318336 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016930792377131394, + "loss": 2.8967, + "theoretical_loss": 3.404205680282439, + "tokens_seen": 2194383872 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016929789368104312, + "loss": 2.6896, + "theoretical_loss": 3.4041973573909505, + "tokens_seen": 2194449408 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016928786359077233, + "loss": 2.6595, + "theoretical_loss": 3.4041890348176107, + "tokens_seen": 2194514944 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016927783350050151, + "loss": 2.5022, + "theoretical_loss": 3.4041807125623977, + "tokens_seen": 2194580480 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016926780341023072, + "loss": 2.6002, + "theoretical_loss": 3.4041723906252903, + "tokens_seen": 2194646016 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001692577733199599, + "loss": 2.5964, + "theoretical_loss": 3.404164069006267, + "tokens_seen": 2194711552 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016924774322968908, + "loss": 2.4975, + "theoretical_loss": 3.404155747705306, + "tokens_seen": 2194777088 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016923771313941827, + "loss": 2.4973, + "theoretical_loss": 3.404147426722385, + "tokens_seen": 2194842624 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016922768304914745, + "loss": 2.5933, + "theoretical_loss": 3.404139106057483, + "tokens_seen": 2194908160 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016921765295887665, + "loss": 2.5953, + "theoretical_loss": 3.4041307857105783, + "tokens_seen": 2194973696 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016920762286860584, + "loss": 2.6291, + "theoretical_loss": 3.404122465681649, + "tokens_seen": 2195039232 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016919759277833502, + "loss": 2.7024, + "theoretical_loss": 3.4041141459706736, + "tokens_seen": 2195104768 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001691875626880642, + "loss": 2.5853, + "theoretical_loss": 3.4041058265776303, + "tokens_seen": 2195170304 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001691775325977934, + "loss": 2.3575, + "theoretical_loss": 3.404097507502498, + "tokens_seen": 2195235840 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001691675025075226, + "loss": 2.4595, + "theoretical_loss": 3.4040891887452545, + "tokens_seen": 2195301376 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016915747241725177, + "loss": 2.7669, + "theoretical_loss": 3.404080870305878, + "tokens_seen": 2195366912 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016914744232698095, + "loss": 2.5209, + "theoretical_loss": 3.404072552184348, + "tokens_seen": 2195432448 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2447330, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5352206230163574, + "objective/train/theoretical_loss": 3.404070472703626, + "objective/train/tokens_used": 2215908832, + "theoretical_loss": 3.404070472703626, + "tokens_seen": 2195448832 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016913741223671013, + "loss": 2.6086, + "theoretical_loss": 3.404064234380641, + "tokens_seen": 2195497984 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016912738214643934, + "loss": 2.6127, + "theoretical_loss": 3.4040559168947375, + "tokens_seen": 2195563520 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016911735205616852, + "loss": 2.6527, + "theoretical_loss": 3.4040475997266144, + "tokens_seen": 2195629056 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001691073219658977, + "loss": 2.4897, + "theoretical_loss": 3.4040392828762505, + "tokens_seen": 2195694592 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016909729187562688, + "loss": 2.4711, + "theoretical_loss": 3.404030966343624, + "tokens_seen": 2195760128 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001690872617853561, + "loss": 2.6409, + "theoretical_loss": 3.4040226501287134, + "tokens_seen": 2195825664 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016907723169508527, + "loss": 2.3818, + "theoretical_loss": 3.4040143342314972, + "tokens_seen": 2195891200 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016906720160481445, + "loss": 2.624, + "theoretical_loss": 3.4040060186519536, + "tokens_seen": 2195956736 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016905717151454363, + "loss": 2.6744, + "theoretical_loss": 3.4039977033900612, + "tokens_seen": 2196022272 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016904714142427281, + "loss": 2.637, + "theoretical_loss": 3.4039893884457983, + "tokens_seen": 2196087808 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016903711133400202, + "loss": 2.6093, + "theoretical_loss": 3.4039810738191436, + "tokens_seen": 2196153344 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001690270812437312, + "loss": 2.5872, + "theoretical_loss": 3.403972759510075, + "tokens_seen": 2196218880 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016901705115346039, + "loss": 2.6355, + "theoretical_loss": 3.4039644455185707, + "tokens_seen": 2196284416 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016900702106318957, + "loss": 2.6684, + "theoretical_loss": 3.4039561318446094, + "tokens_seen": 2196349952 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016899699097291877, + "loss": 2.6191, + "theoretical_loss": 3.40394781848817, + "tokens_seen": 2196415488 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016898696088264796, + "loss": 2.558, + "theoretical_loss": 3.4039395054492303, + "tokens_seen": 2196481024 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016897693079237714, + "loss": 2.6916, + "theoretical_loss": 3.403931192727769, + "tokens_seen": 2196546560 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016896690070210632, + "loss": 2.5594, + "theoretical_loss": 3.4039228803237638, + "tokens_seen": 2196612096 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001689568706118355, + "loss": 2.6232, + "theoretical_loss": 3.403914568237194, + "tokens_seen": 2196677632 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001689468405215647, + "loss": 2.6358, + "theoretical_loss": 3.4039062564680376, + "tokens_seen": 2196743168 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001689368104312939, + "loss": 2.4576, + "theoretical_loss": 3.403897945016273, + "tokens_seen": 2196808704 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016892678034102307, + "loss": 2.4176, + "theoretical_loss": 3.403889633881879, + "tokens_seen": 2196874240 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016891675025075225, + "loss": 2.717, + "theoretical_loss": 3.4038813230648333, + "tokens_seen": 2196939776 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016890672016048146, + "loss": 2.6701, + "theoretical_loss": 3.4038730125651147, + "tokens_seen": 2197005312 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016889669007021064, + "loss": 2.6691, + "theoretical_loss": 3.403864702382702, + "tokens_seen": 2197070848 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2448362, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.901463508605957, + "objective/train/theoretical_loss": 3.403862624886675, + "objective/train/tokens_used": 2217547232, + "theoretical_loss": 3.403862624886675, + "tokens_seen": 2197087232 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016888665997993982, + "loss": 2.7616, + "theoretical_loss": 3.403856392517573, + "tokens_seen": 2197136384 + }, + { + "epoch": 7.04, + "learning_rate": 0.000168876629889669, + "loss": 2.618, + "theoretical_loss": 3.4038480829697066, + "tokens_seen": 2197201920 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016886659979939818, + "loss": 2.6937, + "theoretical_loss": 3.4038397737390804, + "tokens_seen": 2197267456 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001688565697091274, + "loss": 2.6413, + "theoretical_loss": 3.4038314648256742, + "tokens_seen": 2197332992 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016884653961885657, + "loss": 2.6245, + "theoretical_loss": 3.403823156229465, + "tokens_seen": 2197398528 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016883650952858575, + "loss": 2.6156, + "theoretical_loss": 3.403814847950432, + "tokens_seen": 2197464064 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016882647943831494, + "loss": 2.4258, + "theoretical_loss": 3.4038065399885538, + "tokens_seen": 2197529600 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016881644934804414, + "loss": 2.478, + "theoretical_loss": 3.4037982323438087, + "tokens_seen": 2197595136 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016880641925777332, + "loss": 2.5075, + "theoretical_loss": 3.4037899250161745, + "tokens_seen": 2197660672 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001687963891675025, + "loss": 2.6594, + "theoretical_loss": 3.4037816180056306, + "tokens_seen": 2197726208 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001687863590772317, + "loss": 2.4728, + "theoretical_loss": 3.403773311312155, + "tokens_seen": 2197791744 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016877632898696087, + "loss": 2.6497, + "theoretical_loss": 3.4037650049357255, + "tokens_seen": 2197857280 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016876629889669008, + "loss": 2.5498, + "theoretical_loss": 3.4037566988763217, + "tokens_seen": 2197922816 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016875626880641926, + "loss": 2.4331, + "theoretical_loss": 3.403748393133921, + "tokens_seen": 2197988352 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016874623871614844, + "loss": 2.7533, + "theoretical_loss": 3.403740087708503, + "tokens_seen": 2198053888 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016873620862587762, + "loss": 2.5425, + "theoretical_loss": 3.4037317826000453, + "tokens_seen": 2198119424 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016872617853560683, + "loss": 2.6373, + "theoretical_loss": 3.4037234778085264, + "tokens_seen": 2198184960 + }, + { + "epoch": 7.04, + "learning_rate": 0.000168716148445336, + "loss": 2.6511, + "theoretical_loss": 3.403715173333925, + "tokens_seen": 2198250496 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001687061183550652, + "loss": 2.528, + "theoretical_loss": 3.4037068691762196, + "tokens_seen": 2198316032 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016869608826479437, + "loss": 2.5932, + "theoretical_loss": 3.4036985653353886, + "tokens_seen": 2198381568 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016868605817452355, + "loss": 2.4989, + "theoretical_loss": 3.40369026181141, + "tokens_seen": 2198447104 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016867602808425276, + "loss": 2.5247, + "theoretical_loss": 3.4036819586042633, + "tokens_seen": 2198512640 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016866599799398194, + "loss": 2.7343, + "theoretical_loss": 3.4036736557139253, + "tokens_seen": 2198578176 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016865596790371112, + "loss": 2.5436, + "theoretical_loss": 3.4036653531403767, + "tokens_seen": 2198643712 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001686459378134403, + "loss": 2.5518, + "theoretical_loss": 3.403657050883594, + "tokens_seen": 2198709248 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2449144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7241549491882324, + "objective/train/theoretical_loss": 3.4036549753688905, + "objective/train/tokens_used": 2219185632, + "theoretical_loss": 3.4036549753688905, + "tokens_seen": 2198725632 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001686359077231695, + "loss": 2.6861, + "theoretical_loss": 3.4036487489435565, + "tokens_seen": 2198774784 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001686258776328987, + "loss": 2.4303, + "theoretical_loss": 3.403640447320243, + "tokens_seen": 2198840320 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016861584754262787, + "loss": 2.563, + "theoretical_loss": 3.4036321460136314, + "tokens_seen": 2198905856 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016860581745235706, + "loss": 2.4006, + "theoretical_loss": 3.4036238450237004, + "tokens_seen": 2198971392 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016859578736208626, + "loss": 2.7463, + "theoretical_loss": 3.4036155443504286, + "tokens_seen": 2199036928 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016858575727181544, + "loss": 2.7098, + "theoretical_loss": 3.403607243993794, + "tokens_seen": 2199102464 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016857572718154463, + "loss": 2.7042, + "theoretical_loss": 3.4035989439537757, + "tokens_seen": 2199168000 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001685656970912738, + "loss": 2.5046, + "theoretical_loss": 3.4035906442303516, + "tokens_seen": 2199233536 + }, + { + "epoch": 7.04, + "learning_rate": 0.000168555667001003, + "loss": 2.6671, + "theoretical_loss": 3.403582344823501, + "tokens_seen": 2199299072 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001685456369107322, + "loss": 2.8131, + "theoretical_loss": 3.4035740457332015, + "tokens_seen": 2199364608 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001685356068204614, + "loss": 2.6293, + "theoretical_loss": 3.4035657469594325, + "tokens_seen": 2199430144 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016852557673019059, + "loss": 2.6994, + "theoretical_loss": 3.4035574485021716, + "tokens_seen": 2199495680 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016851554663991977, + "loss": 2.8173, + "theoretical_loss": 3.403549150361398, + "tokens_seen": 2199561216 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016850551654964897, + "loss": 2.6475, + "theoretical_loss": 3.4035408525370894, + "tokens_seen": 2199626752 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016849548645937816, + "loss": 2.6163, + "theoretical_loss": 3.403532555029225, + "tokens_seen": 2199692288 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016848545636910734, + "loss": 2.4508, + "theoretical_loss": 3.403524257837783, + "tokens_seen": 2199757824 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016847542627883652, + "loss": 2.7064, + "theoretical_loss": 3.4035159609627423, + "tokens_seen": 2199823360 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001684653961885657, + "loss": 2.6692, + "theoretical_loss": 3.403507664404081, + "tokens_seen": 2199888896 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001684553660982949, + "loss": 2.743, + "theoretical_loss": 3.4034993681617776, + "tokens_seen": 2199954432 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001684453360080241, + "loss": 2.6262, + "theoretical_loss": 3.403491072235811, + "tokens_seen": 2200019968 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016843530591775327, + "loss": 2.5075, + "theoretical_loss": 3.4034827766261593, + "tokens_seen": 2200085504 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016842527582748245, + "loss": 2.657, + "theoretical_loss": 3.4034744813328013, + "tokens_seen": 2200151040 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016841524573721166, + "loss": 2.5803, + "theoretical_loss": 3.403466186355715, + "tokens_seen": 2200216576 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016840521564694084, + "loss": 2.5003, + "theoretical_loss": 3.4034578916948797, + "tokens_seen": 2200282112 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016839518555667002, + "loss": 2.4417, + "theoretical_loss": 3.4034495973502734, + "tokens_seen": 2200347648 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2450574, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7152175903320312, + "objective/train/theoretical_loss": 3.40344752381353, + "objective/train/tokens_used": 2220824032, + "theoretical_loss": 3.40344752381353, + "tokens_seen": 2200364032 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001683851554663992, + "loss": 2.6054, + "theoretical_loss": 3.403441303321875, + "tokens_seen": 2200413184 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016837512537612838, + "loss": 2.7825, + "theoretical_loss": 3.4034330096096626, + "tokens_seen": 2200478720 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001683650952858576, + "loss": 2.5118, + "theoretical_loss": 3.403424716213615, + "tokens_seen": 2200544256 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016835506519558677, + "loss": 2.8578, + "theoretical_loss": 3.4034164231337103, + "tokens_seen": 2200609792 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016834503510531595, + "loss": 2.6592, + "theoretical_loss": 3.403408130369928, + "tokens_seen": 2200675328 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016833500501504514, + "loss": 2.6539, + "theoretical_loss": 3.4033998379222457, + "tokens_seen": 2200740864 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016832497492477434, + "loss": 2.7133, + "theoretical_loss": 3.403391545790642, + "tokens_seen": 2200806400 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016831494483450352, + "loss": 2.5158, + "theoretical_loss": 3.403383253975096, + "tokens_seen": 2200871936 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001683049147442327, + "loss": 2.6787, + "theoretical_loss": 3.403374962475586, + "tokens_seen": 2200937472 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001682948846539619, + "loss": 2.68, + "theoretical_loss": 3.4033666712920905, + "tokens_seen": 2201003008 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016828485456369107, + "loss": 2.5093, + "theoretical_loss": 3.4033583804245877, + "tokens_seen": 2201068544 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016827482447342028, + "loss": 2.7096, + "theoretical_loss": 3.4033500898730566, + "tokens_seen": 2201134080 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016826479438314946, + "loss": 2.577, + "theoretical_loss": 3.403341799637476, + "tokens_seen": 2201199616 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016825476429287864, + "loss": 2.6586, + "theoretical_loss": 3.4033335097178234, + "tokens_seen": 2201265152 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016824473420260782, + "loss": 2.3959, + "theoretical_loss": 3.4033252201140787, + "tokens_seen": 2201330688 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016823470411233703, + "loss": 2.7135, + "theoretical_loss": 3.4033169308262194, + "tokens_seen": 2201396224 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001682246740220662, + "loss": 2.5531, + "theoretical_loss": 3.403308641854225, + "tokens_seen": 2201461760 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001682146439317954, + "loss": 2.3739, + "theoretical_loss": 3.4033003531980732, + "tokens_seen": 2201527296 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016820461384152457, + "loss": 2.7958, + "theoretical_loss": 3.4032920648577427, + "tokens_seen": 2201592832 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016819458375125375, + "loss": 2.4854, + "theoretical_loss": 3.4032837768332125, + "tokens_seen": 2201658368 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016818455366098296, + "loss": 2.6914, + "theoretical_loss": 3.4032754891244608, + "tokens_seen": 2201723904 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016817452357071214, + "loss": 2.6297, + "theoretical_loss": 3.4032672017314667, + "tokens_seen": 2201789440 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016816449348044132, + "loss": 2.5963, + "theoretical_loss": 3.4032589146542076, + "tokens_seen": 2201854976 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001681544633901705, + "loss": 2.6235, + "theoretical_loss": 3.4032506278926635, + "tokens_seen": 2201920512 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001681444332998997, + "loss": 2.7011, + "theoretical_loss": 3.403242341446812, + "tokens_seen": 2201986048 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2451411, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.740835189819336, + "objective/train/theoretical_loss": 3.4032402698846735, + "objective/train/tokens_used": 2222462432, + "theoretical_loss": 3.4032402698846735, + "tokens_seen": 2202002432 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001681344032096289, + "loss": 2.2894, + "theoretical_loss": 3.403234055316632, + "tokens_seen": 2202051584 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016812437311935807, + "loss": 2.5711, + "theoretical_loss": 3.403225769502102, + "tokens_seen": 2202117120 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016811434302908726, + "loss": 2.6126, + "theoretical_loss": 3.403217484003201, + "tokens_seen": 2202182656 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016810431293881646, + "loss": 2.5529, + "theoretical_loss": 3.4032091988199067, + "tokens_seen": 2202248192 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016809428284854564, + "loss": 2.55, + "theoretical_loss": 3.403200913952199, + "tokens_seen": 2202313728 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016808425275827483, + "loss": 2.5428, + "theoretical_loss": 3.403192629400055, + "tokens_seen": 2202379264 + }, + { + "epoch": 7.04, + "learning_rate": 0.000168074222668004, + "loss": 2.5518, + "theoretical_loss": 3.4031843451634542, + "tokens_seen": 2202444800 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001680641925777332, + "loss": 2.5642, + "theoretical_loss": 3.403176061242375, + "tokens_seen": 2202510336 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001680541624874624, + "loss": 2.3439, + "theoretical_loss": 3.403167777636796, + "tokens_seen": 2202575872 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016804413239719158, + "loss": 2.4388, + "theoretical_loss": 3.4031594943466956, + "tokens_seen": 2202641408 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016803410230692076, + "loss": 2.505, + "theoretical_loss": 3.403151211372053, + "tokens_seen": 2202706944 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016802407221664994, + "loss": 2.6645, + "theoretical_loss": 3.403142928712846, + "tokens_seen": 2202772480 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016801404212637915, + "loss": 2.313, + "theoretical_loss": 3.403134646369054, + "tokens_seen": 2202838016 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016800401203610833, + "loss": 2.5844, + "theoretical_loss": 3.403126364340655, + "tokens_seen": 2202903552 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001679939819458375, + "loss": 2.6112, + "theoretical_loss": 3.403118082627628, + "tokens_seen": 2202969088 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001679839518555667, + "loss": 2.6732, + "theoretical_loss": 3.403109801229951, + "tokens_seen": 2203034624 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016797392176529587, + "loss": 2.6055, + "theoretical_loss": 3.4031015201476036, + "tokens_seen": 2203100160 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016796389167502508, + "loss": 2.6237, + "theoretical_loss": 3.4030932393805635, + "tokens_seen": 2203165696 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016795386158475426, + "loss": 2.6749, + "theoretical_loss": 3.4030849589288095, + "tokens_seen": 2203231232 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016794383149448344, + "loss": 2.6899, + "theoretical_loss": 3.4030766787923206, + "tokens_seen": 2203296768 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016793380140421262, + "loss": 2.5979, + "theoretical_loss": 3.403068398971075, + "tokens_seen": 2203362304 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016792377131394183, + "loss": 2.5797, + "theoretical_loss": 3.4030601194650516, + "tokens_seen": 2203427840 + }, + { + "epoch": 7.04, + "learning_rate": 0.000167913741223671, + "loss": 2.8528, + "theoretical_loss": 3.403051840274229, + "tokens_seen": 2203493376 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001679037111334002, + "loss": 2.6355, + "theoretical_loss": 3.4030435613985857, + "tokens_seen": 2203558912 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016789368104312938, + "loss": 2.527, + "theoretical_loss": 3.4030352828381005, + "tokens_seen": 2203624448 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2452802, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.769700527191162, + "objective/train/theoretical_loss": 3.40303321324722, + "objective/train/tokens_used": 2224100832, + "theoretical_loss": 3.40303321324722, + "tokens_seen": 2203640832 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016788365095285856, + "loss": 2.4368, + "theoretical_loss": 3.4030270045927518, + "tokens_seen": 2203689984 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016787362086258776, + "loss": 2.783, + "theoretical_loss": 3.403018726662518, + "tokens_seen": 2203755520 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016786359077231695, + "loss": 2.716, + "theoretical_loss": 3.403010449047379, + "tokens_seen": 2203821056 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016785356068204613, + "loss": 2.3504, + "theoretical_loss": 3.403002171747312, + "tokens_seen": 2203886592 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001678435305917753, + "loss": 2.5799, + "theoretical_loss": 3.4029938947622957, + "tokens_seen": 2203952128 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016783350050150452, + "loss": 2.3999, + "theoretical_loss": 3.40298561809231, + "tokens_seen": 2204017664 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001678234704112337, + "loss": 2.6708, + "theoretical_loss": 3.4029773417373326, + "tokens_seen": 2204083200 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016781344032096288, + "loss": 2.7486, + "theoretical_loss": 3.402969065697342, + "tokens_seen": 2204148736 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016780341023069206, + "loss": 2.7086, + "theoretical_loss": 3.402960789972317, + "tokens_seen": 2204214272 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016779338014042124, + "loss": 2.3989, + "theoretical_loss": 3.402952514562237, + "tokens_seen": 2204279808 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016778335005015048, + "loss": 2.5156, + "theoretical_loss": 3.4029442394670797, + "tokens_seen": 2204345344 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016777331995987966, + "loss": 2.6708, + "theoretical_loss": 3.402935964686824, + "tokens_seen": 2204410880 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016776328986960884, + "loss": 2.575, + "theoretical_loss": 3.402927690221449, + "tokens_seen": 2204476416 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016775325977933802, + "loss": 2.4404, + "theoretical_loss": 3.4029194160709326, + "tokens_seen": 2204541952 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016774322968906723, + "loss": 2.6569, + "theoretical_loss": 3.402911142235254, + "tokens_seen": 2204607488 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001677331995987964, + "loss": 2.5937, + "theoretical_loss": 3.4029028687143916, + "tokens_seen": 2204673024 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001677231695085256, + "loss": 2.6754, + "theoretical_loss": 3.4028945955083243, + "tokens_seen": 2204738560 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016771313941825477, + "loss": 2.5876, + "theoretical_loss": 3.402886322617031, + "tokens_seen": 2204804096 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016770310932798395, + "loss": 2.4474, + "theoretical_loss": 3.4028780500404894, + "tokens_seen": 2204869632 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016769307923771316, + "loss": 2.7058, + "theoretical_loss": 3.4028697777786796, + "tokens_seen": 2204935168 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016768304914744234, + "loss": 2.5599, + "theoretical_loss": 3.402861505831579, + "tokens_seen": 2205000704 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016767301905717152, + "loss": 2.4977, + "theoretical_loss": 3.4028532341991666, + "tokens_seen": 2205066240 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001676629889669007, + "loss": 2.7768, + "theoretical_loss": 3.4028449628814212, + "tokens_seen": 2205131776 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001676529588766299, + "loss": 2.432, + "theoretical_loss": 3.4028366918783215, + "tokens_seen": 2205197312 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001676429287863591, + "loss": 2.5788, + "theoretical_loss": 3.4028284211898465, + "tokens_seen": 2205262848 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2453688, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5808918476104736, + "objective/train/theoretical_loss": 3.4028263535668852, + "objective/train/tokens_used": 2225739232, + "theoretical_loss": 3.4028263535668852, + "tokens_seen": 2205279232 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016763289869608827, + "loss": 2.5682, + "theoretical_loss": 3.4028201508159746, + "tokens_seen": 2205328384 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016762286860581746, + "loss": 2.4742, + "theoretical_loss": 3.4028118807566843, + "tokens_seen": 2205393920 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016761283851554666, + "loss": 2.5198, + "theoretical_loss": 3.4028036110119544, + "tokens_seen": 2205459456 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016760280842527584, + "loss": 2.4465, + "theoretical_loss": 3.4027953415817636, + "tokens_seen": 2205524992 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016759277833500503, + "loss": 2.6358, + "theoretical_loss": 3.4027870724660905, + "tokens_seen": 2205590528 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001675827482447342, + "loss": 2.5532, + "theoretical_loss": 3.402778803664914, + "tokens_seen": 2205656064 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001675727181544634, + "loss": 2.6149, + "theoretical_loss": 3.4027705351782127, + "tokens_seen": 2205721600 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001675626880641926, + "loss": 2.4623, + "theoretical_loss": 3.4027622670059654, + "tokens_seen": 2205787136 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016755265797392178, + "loss": 2.3602, + "theoretical_loss": 3.4027539991481506, + "tokens_seen": 2205852672 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016754262788365096, + "loss": 2.5289, + "theoretical_loss": 3.402745731604747, + "tokens_seen": 2205918208 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016753259779338014, + "loss": 2.6739, + "theoretical_loss": 3.4027374643757335, + "tokens_seen": 2205983744 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016752256770310935, + "loss": 2.7367, + "theoretical_loss": 3.4027291974610887, + "tokens_seen": 2206049280 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016751253761283853, + "loss": 2.7092, + "theoretical_loss": 3.4027209308607915, + "tokens_seen": 2206114816 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001675025075225677, + "loss": 2.6945, + "theoretical_loss": 3.40271266457482, + "tokens_seen": 2206180352 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001674924774322969, + "loss": 2.5806, + "theoretical_loss": 3.4027043986031535, + "tokens_seen": 2206245888 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016748244734202607, + "loss": 2.3428, + "theoretical_loss": 3.4026961329457706, + "tokens_seen": 2206311424 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016747241725175528, + "loss": 2.396, + "theoretical_loss": 3.40268786760265, + "tokens_seen": 2206376960 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016746238716148446, + "loss": 2.4322, + "theoretical_loss": 3.4026796025737704, + "tokens_seen": 2206442496 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016745235707121364, + "loss": 2.5367, + "theoretical_loss": 3.4026713378591102, + "tokens_seen": 2206508032 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016744232698094282, + "loss": 2.6214, + "theoretical_loss": 3.4026630734586485, + "tokens_seen": 2206573568 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016743229689067203, + "loss": 2.5108, + "theoretical_loss": 3.4026548093723643, + "tokens_seen": 2206639104 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001674222668004012, + "loss": 2.5522, + "theoretical_loss": 3.402646545600236, + "tokens_seen": 2206704640 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001674122367101304, + "loss": 2.5297, + "theoretical_loss": 3.402638282142242, + "tokens_seen": 2206770176 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016740220661985958, + "loss": 2.455, + "theoretical_loss": 3.4026300189983614, + "tokens_seen": 2206835712 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016739217652958876, + "loss": 2.4566, + "theoretical_loss": 3.402621756168573, + "tokens_seen": 2206901248 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2455273, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.248014211654663, + "objective/train/theoretical_loss": 3.4026196905102, + "objective/train/tokens_used": 2227377632, + "theoretical_loss": 3.4026196905102, + "tokens_seen": 2206917632 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016738214643931797, + "loss": 2.5464, + "theoretical_loss": 3.402613493652855, + "tokens_seen": 2206966784 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016737211634904715, + "loss": 2.6351, + "theoretical_loss": 3.402605231451187, + "tokens_seen": 2207032320 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016736208625877633, + "loss": 2.7325, + "theoretical_loss": 3.4025969695635467, + "tokens_seen": 2207097856 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001673520561685055, + "loss": 2.4161, + "theoretical_loss": 3.402588707989914, + "tokens_seen": 2207163392 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016734202607823472, + "loss": 2.5919, + "theoretical_loss": 3.4025804467302665, + "tokens_seen": 2207228928 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001673319959879639, + "loss": 2.6639, + "theoretical_loss": 3.4025721857845843, + "tokens_seen": 2207294464 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016732196589769308, + "loss": 2.4751, + "theoretical_loss": 3.402563925152845, + "tokens_seen": 2207360000 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016731193580742226, + "loss": 2.4959, + "theoretical_loss": 3.402555664835027, + "tokens_seen": 2207425536 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016730190571715144, + "loss": 2.4434, + "theoretical_loss": 3.40254740483111, + "tokens_seen": 2207491072 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016729187562688065, + "loss": 2.59, + "theoretical_loss": 3.4025391451410734, + "tokens_seen": 2207556608 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016728184553660983, + "loss": 2.5319, + "theoretical_loss": 3.402530885764894, + "tokens_seen": 2207622144 + }, + { + "epoch": 7.04, + "learning_rate": 0.000167271815446339, + "loss": 2.4408, + "theoretical_loss": 3.4025226267025523, + "tokens_seen": 2207687680 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001672617853560682, + "loss": 2.7386, + "theoretical_loss": 3.4025143679540264, + "tokens_seen": 2207753216 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001672517552657974, + "loss": 2.6093, + "theoretical_loss": 3.4025061095192943, + "tokens_seen": 2207818752 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016724172517552658, + "loss": 2.3719, + "theoretical_loss": 3.402497851398336, + "tokens_seen": 2207884288 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016723169508525576, + "loss": 2.627, + "theoretical_loss": 3.4024895935911297, + "tokens_seen": 2207949824 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016722166499498494, + "loss": 2.579, + "theoretical_loss": 3.4024813360976545, + "tokens_seen": 2208015360 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016721163490471413, + "loss": 2.5005, + "theoretical_loss": 3.4024730789178883, + "tokens_seen": 2208080896 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016720160481444333, + "loss": 2.5765, + "theoretical_loss": 3.4024648220518108, + "tokens_seen": 2208146432 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016719157472417251, + "loss": 2.4252, + "theoretical_loss": 3.4024565654994006, + "tokens_seen": 2208211968 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001671815446339017, + "loss": 2.5633, + "theoretical_loss": 3.402448309260636, + "tokens_seen": 2208277504 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016717151454363088, + "loss": 2.5723, + "theoretical_loss": 3.4024400533354964, + "tokens_seen": 2208343040 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016716148445336009, + "loss": 2.7255, + "theoretical_loss": 3.4024317977239598, + "tokens_seen": 2208408576 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016715145436308927, + "loss": 2.4389, + "theoretical_loss": 3.4024235424260056, + "tokens_seen": 2208474112 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016714142427281845, + "loss": 2.6946, + "theoretical_loss": 3.4024152874416127, + "tokens_seen": 2208539648 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2455889, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3239023685455322, + "objective/train/theoretical_loss": 3.4024132237445057, + "objective/train/tokens_used": 2229016032, + "theoretical_loss": 3.4024132237445057, + "tokens_seen": 2208556032 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016713139418254763, + "loss": 2.7356, + "theoretical_loss": 3.4024070327707596, + "tokens_seen": 2208605184 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001671213640922768, + "loss": 2.5707, + "theoretical_loss": 3.402398778413425, + "tokens_seen": 2208670720 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016711133400200602, + "loss": 2.6044, + "theoretical_loss": 3.4023905243695878, + "tokens_seen": 2208736256 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001671013039117352, + "loss": 2.5912, + "theoretical_loss": 3.402382270639227, + "tokens_seen": 2208801792 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016709127382146438, + "loss": 2.5383, + "theoretical_loss": 3.4023740172223205, + "tokens_seen": 2208867328 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016708124373119356, + "loss": 2.58, + "theoretical_loss": 3.4023657641188483, + "tokens_seen": 2208932864 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016707121364092277, + "loss": 2.5145, + "theoretical_loss": 3.402357511328789, + "tokens_seen": 2208998400 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016706118355065195, + "loss": 2.1819, + "theoretical_loss": 3.4023492588521207, + "tokens_seen": 2209063936 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016705115346038113, + "loss": 2.5791, + "theoretical_loss": 3.4023410066888227, + "tokens_seen": 2209129472 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016704112337011034, + "loss": 2.7027, + "theoretical_loss": 3.4023327548388735, + "tokens_seen": 2209195008 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016703109327983955, + "loss": 2.561, + "theoretical_loss": 3.402324503302252, + "tokens_seen": 2209260544 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016702106318956873, + "loss": 2.4145, + "theoretical_loss": 3.402316252078937, + "tokens_seen": 2209326080 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001670110330992979, + "loss": 2.4277, + "theoretical_loss": 3.402308001168908, + "tokens_seen": 2209391616 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001670010030090271, + "loss": 2.634, + "theoretical_loss": 3.4022997505721433, + "tokens_seen": 2209457152 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016699097291875627, + "loss": 2.4463, + "theoretical_loss": 3.402291500288621, + "tokens_seen": 2209522688 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016698094282848548, + "loss": 2.5919, + "theoretical_loss": 3.402283250318321, + "tokens_seen": 2209588224 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016697091273821466, + "loss": 2.2184, + "theoretical_loss": 3.4022750006612217, + "tokens_seen": 2209653760 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016696088264794384, + "loss": 2.4962, + "theoretical_loss": 3.4022667513173017, + "tokens_seen": 2209719296 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016695085255767302, + "loss": 2.5836, + "theoretical_loss": 3.4022585022865397, + "tokens_seen": 2209784832 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016694082246740223, + "loss": 2.5739, + "theoretical_loss": 3.402250253568915, + "tokens_seen": 2209850368 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016693079237713141, + "loss": 2.7622, + "theoretical_loss": 3.402242005164407, + "tokens_seen": 2209915904 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001669207622868606, + "loss": 2.5216, + "theoretical_loss": 3.402233757072993, + "tokens_seen": 2209981440 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016691073219658978, + "loss": 2.7068, + "theoretical_loss": 3.402225509294653, + "tokens_seen": 2210046976 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016690070210631896, + "loss": 2.6774, + "theoretical_loss": 3.4022172618293656, + "tokens_seen": 2210112512 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016689067201604817, + "loss": 2.5983, + "theoretical_loss": 3.4022090146771093, + "tokens_seen": 2210178048 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2456643, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2588746547698975, + "objective/train/theoretical_loss": 3.4022069529379535, + "objective/train/tokens_used": 2230654432, + "theoretical_loss": 3.4022069529379535, + "tokens_seen": 2210194432 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016688064192577735, + "loss": 2.4119, + "theoretical_loss": 3.402200767837863, + "tokens_seen": 2210243584 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016687061183550653, + "loss": 2.4376, + "theoretical_loss": 3.4021925213116058, + "tokens_seen": 2210309120 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001668605817452357, + "loss": 2.4519, + "theoretical_loss": 3.4021842750983167, + "tokens_seen": 2210374656 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016685055165496492, + "loss": 2.498, + "theoretical_loss": 3.4021760291979737, + "tokens_seen": 2210440192 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001668405215646941, + "loss": 2.3935, + "theoretical_loss": 3.4021677836105564, + "tokens_seen": 2210505728 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016683049147442328, + "loss": 2.7198, + "theoretical_loss": 3.402159538336044, + "tokens_seen": 2210571264 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016682046138415246, + "loss": 2.7678, + "theoretical_loss": 3.402151293374414, + "tokens_seen": 2210636800 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016681043129388164, + "loss": 2.6771, + "theoretical_loss": 3.4021430487256463, + "tokens_seen": 2210702336 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016680040120361085, + "loss": 2.6874, + "theoretical_loss": 3.40213480438972, + "tokens_seen": 2210767872 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016679037111334003, + "loss": 2.7675, + "theoretical_loss": 3.402126560366613, + "tokens_seen": 2210833408 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001667803410230692, + "loss": 2.5438, + "theoretical_loss": 3.402118316656305, + "tokens_seen": 2210898944 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001667703109327984, + "loss": 2.6287, + "theoretical_loss": 3.402110073258774, + "tokens_seen": 2210964480 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001667602808425276, + "loss": 2.647, + "theoretical_loss": 3.402101830174, + "tokens_seen": 2211030016 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016675025075225678, + "loss": 2.5081, + "theoretical_loss": 3.4020935874019607, + "tokens_seen": 2211095552 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016674022066198596, + "loss": 2.5544, + "theoretical_loss": 3.4020853449426354, + "tokens_seen": 2211161088 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016673019057171514, + "loss": 2.7483, + "theoretical_loss": 3.4020771027960035, + "tokens_seen": 2211226624 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016672016048144433, + "loss": 2.5713, + "theoretical_loss": 3.402068860962043, + "tokens_seen": 2211292160 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016671013039117353, + "loss": 2.4542, + "theoretical_loss": 3.4020606194407335, + "tokens_seen": 2211357696 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016670010030090271, + "loss": 2.7571, + "theoretical_loss": 3.4020523782320535, + "tokens_seen": 2211423232 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001666900702106319, + "loss": 2.7118, + "theoretical_loss": 3.4020441373359818, + "tokens_seen": 2211488768 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016668004012036108, + "loss": 2.4706, + "theoretical_loss": 3.4020358967524977, + "tokens_seen": 2211554304 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016667001003009029, + "loss": 2.5572, + "theoretical_loss": 3.4020276564815797, + "tokens_seen": 2211619840 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016665997993981947, + "loss": 2.5637, + "theoretical_loss": 3.4020194165232067, + "tokens_seen": 2211685376 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016664994984954865, + "loss": 2.4876, + "theoretical_loss": 3.4020111768773575, + "tokens_seen": 2211750912 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016663991975927783, + "loss": 2.5664, + "theoretical_loss": 3.4020029375440113, + "tokens_seen": 2211816448 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2457924, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2814316749572754, + "objective/train/theoretical_loss": 3.4020008777595008, + "objective/train/tokens_used": 2232292832, + "theoretical_loss": 3.4020008777595008, + "tokens_seen": 2211832832 + }, + { + "epoch": 7.04, + "learning_rate": 0.000166629889669007, + "loss": 2.4015, + "theoretical_loss": 3.401994698523147, + "tokens_seen": 2211881984 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016661985957873622, + "loss": 2.3189, + "theoretical_loss": 3.401986459814743, + "tokens_seen": 2211947520 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001666098294884654, + "loss": 2.5887, + "theoretical_loss": 3.401978221418779, + "tokens_seen": 2212013056 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016659979939819458, + "loss": 2.5265, + "theoretical_loss": 3.401969983335233, + "tokens_seen": 2212078592 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016658976930792376, + "loss": 2.5356, + "theoretical_loss": 3.4019617455640847, + "tokens_seen": 2212144128 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016657973921765297, + "loss": 2.3703, + "theoretical_loss": 3.4019535081053123, + "tokens_seen": 2212209664 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016656970912738215, + "loss": 2.5743, + "theoretical_loss": 3.4019452709588953, + "tokens_seen": 2212275200 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016655967903711133, + "loss": 2.6498, + "theoretical_loss": 3.401937034124812, + "tokens_seen": 2212340736 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001665496489468405, + "loss": 2.9023, + "theoretical_loss": 3.401928797603042, + "tokens_seen": 2212406272 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016653961885656972, + "loss": 2.4225, + "theoretical_loss": 3.4019205613935637, + "tokens_seen": 2212471808 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001665295887662989, + "loss": 2.8004, + "theoretical_loss": 3.4019123254963564, + "tokens_seen": 2212537344 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016651955867602808, + "loss": 2.5755, + "theoretical_loss": 3.4019040899113984, + "tokens_seen": 2212602880 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016650952858575726, + "loss": 2.5946, + "theoretical_loss": 3.401895854638669, + "tokens_seen": 2212668416 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016649949849548645, + "loss": 2.4909, + "theoretical_loss": 3.401887619678147, + "tokens_seen": 2212733952 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016648946840521565, + "loss": 2.5153, + "theoretical_loss": 3.4018793850298117, + "tokens_seen": 2212799488 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016647943831494484, + "loss": 2.4923, + "theoretical_loss": 3.4018711506936414, + "tokens_seen": 2212865024 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016646940822467402, + "loss": 2.6103, + "theoretical_loss": 3.4018629166696157, + "tokens_seen": 2212930560 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001664593781344032, + "loss": 2.5391, + "theoretical_loss": 3.4018546829577128, + "tokens_seen": 2212996096 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001664493480441324, + "loss": 2.7263, + "theoretical_loss": 3.401846449557912, + "tokens_seen": 2213061632 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001664393179538616, + "loss": 2.7556, + "theoretical_loss": 3.4018382164701926, + "tokens_seen": 2213127168 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016642928786359077, + "loss": 2.4956, + "theoretical_loss": 3.401829983694533, + "tokens_seen": 2213192704 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016641925777331995, + "loss": 2.3784, + "theoretical_loss": 3.401821751230912, + "tokens_seen": 2213258240 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016640922768304913, + "loss": 2.3937, + "theoretical_loss": 3.4018135190793095, + "tokens_seen": 2213323776 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016639919759277834, + "loss": 2.4079, + "theoretical_loss": 3.4018052872397035, + "tokens_seen": 2213389312 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016638916750250752, + "loss": 2.3871, + "theoretical_loss": 3.401797055712073, + "tokens_seen": 2213454848 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2458756, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6560356616973877, + "objective/train/theoretical_loss": 3.401794997878909, + "objective/train/tokens_used": 2233931232, + "theoretical_loss": 3.401794997878909, + "tokens_seen": 2213471232 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001663791374122367, + "loss": 2.3816, + "theoretical_loss": 3.401788824496397, + "tokens_seen": 2213520384 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016636910732196588, + "loss": 2.6143, + "theoretical_loss": 3.4017805935926546, + "tokens_seen": 2213585920 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001663590772316951, + "loss": 2.7237, + "theoretical_loss": 3.401772363000825, + "tokens_seen": 2213651456 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016634904714142427, + "loss": 2.5025, + "theoretical_loss": 3.4017641327208867, + "tokens_seen": 2213716992 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016633901705115345, + "loss": 2.6712, + "theoretical_loss": 3.401755902752819, + "tokens_seen": 2213782528 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016632898696088263, + "loss": 2.6891, + "theoretical_loss": 3.4017476730966005, + "tokens_seen": 2213848064 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016631895687061181, + "loss": 2.6098, + "theoretical_loss": 3.40173944375221, + "tokens_seen": 2213913600 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016630892678034102, + "loss": 2.6186, + "theoretical_loss": 3.4017312147196277, + "tokens_seen": 2213979136 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001662988966900702, + "loss": 2.5599, + "theoretical_loss": 3.401722985998831, + "tokens_seen": 2214044672 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001662888665997994, + "loss": 2.3798, + "theoretical_loss": 3.4017147575897995, + "tokens_seen": 2214110208 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001662788365095286, + "loss": 2.3763, + "theoretical_loss": 3.401706529492512, + "tokens_seen": 2214175744 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001662688064192578, + "loss": 2.6777, + "theoretical_loss": 3.401698301706948, + "tokens_seen": 2214241280 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016625877632898698, + "loss": 2.3836, + "theoretical_loss": 3.401690074233086, + "tokens_seen": 2214306816 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016624874623871616, + "loss": 2.52, + "theoretical_loss": 3.401681847070905, + "tokens_seen": 2214372352 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016623871614844534, + "loss": 2.3195, + "theoretical_loss": 3.401673620220383, + "tokens_seen": 2214437888 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016622868605817453, + "loss": 2.5106, + "theoretical_loss": 3.4016653936815016, + "tokens_seen": 2214503424 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016621865596790373, + "loss": 2.6908, + "theoretical_loss": 3.4016571674542373, + "tokens_seen": 2214568960 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016620862587763292, + "loss": 2.7578, + "theoretical_loss": 3.40164894153857, + "tokens_seen": 2214634496 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001661985957873621, + "loss": 2.4554, + "theoretical_loss": 3.401640715934479, + "tokens_seen": 2214700032 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016618856569709128, + "loss": 2.3114, + "theoretical_loss": 3.4016324906419424, + "tokens_seen": 2214765568 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016617853560682049, + "loss": 2.6341, + "theoretical_loss": 3.40162426566094, + "tokens_seen": 2214831104 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016616850551654967, + "loss": 2.3368, + "theoretical_loss": 3.40161604099145, + "tokens_seen": 2214896640 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016615847542627885, + "loss": 2.6292, + "theoretical_loss": 3.4016078166334522, + "tokens_seen": 2214962176 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016614844533600803, + "loss": 2.4793, + "theoretical_loss": 3.4015995925869253, + "tokens_seen": 2215027712 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001661384152457372, + "loss": 2.6835, + "theoretical_loss": 3.4015913688518484, + "tokens_seen": 2215093248 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2459987, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4145164489746094, + "objective/train/theoretical_loss": 3.4015893129667405, + "objective/train/tokens_used": 2235569632, + "theoretical_loss": 3.4015893129667405, + "tokens_seen": 2215109632 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016612838515546642, + "loss": 2.6175, + "theoretical_loss": 3.4015831454281997, + "tokens_seen": 2215158784 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001661183550651956, + "loss": 2.4597, + "theoretical_loss": 3.4015749223159593, + "tokens_seen": 2215224320 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016610832497492478, + "loss": 2.5505, + "theoretical_loss": 3.4015666995151053, + "tokens_seen": 2215289856 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016609829488465396, + "loss": 2.5549, + "theoretical_loss": 3.401558477025617, + "tokens_seen": 2215355392 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016608826479438317, + "loss": 2.4762, + "theoretical_loss": 3.401550254847474, + "tokens_seen": 2215420928 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016607823470411235, + "loss": 2.4648, + "theoretical_loss": 3.4015420329806543, + "tokens_seen": 2215486464 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016606820461384153, + "loss": 2.8312, + "theoretical_loss": 3.401533811425138, + "tokens_seen": 2215552000 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001660581745235707, + "loss": 2.3514, + "theoretical_loss": 3.4015255901809027, + "tokens_seen": 2215617536 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016604814443329992, + "loss": 2.4702, + "theoretical_loss": 3.401517369247929, + "tokens_seen": 2215683072 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001660381143430291, + "loss": 2.5528, + "theoretical_loss": 3.4015091486261944, + "tokens_seen": 2215748608 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016602808425275828, + "loss": 2.6168, + "theoretical_loss": 3.401500928315679, + "tokens_seen": 2215814144 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016601805416248746, + "loss": 2.4598, + "theoretical_loss": 3.4014927083163613, + "tokens_seen": 2215879680 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016600802407221665, + "loss": 2.6333, + "theoretical_loss": 3.4014844886282205, + "tokens_seen": 2215945216 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016599799398194585, + "loss": 2.5168, + "theoretical_loss": 3.4014762692512357, + "tokens_seen": 2216010752 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016598796389167504, + "loss": 2.6447, + "theoretical_loss": 3.401468050185386, + "tokens_seen": 2216076288 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016597793380140422, + "loss": 2.8025, + "theoretical_loss": 3.4014598314306497, + "tokens_seen": 2216141824 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001659679037111334, + "loss": 2.6577, + "theoretical_loss": 3.4014516129870067, + "tokens_seen": 2216207360 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001659578736208626, + "loss": 2.463, + "theoretical_loss": 3.4014433948544354, + "tokens_seen": 2216272896 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001659478435305918, + "loss": 2.4963, + "theoretical_loss": 3.4014351770329156, + "tokens_seen": 2216338432 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016593781344032097, + "loss": 2.3855, + "theoretical_loss": 3.401426959522425, + "tokens_seen": 2216403968 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016592778335005015, + "loss": 2.3715, + "theoretical_loss": 3.401418742322944, + "tokens_seen": 2216469504 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016591775325977933, + "loss": 2.55, + "theoretical_loss": 3.401410525434451, + "tokens_seen": 2216535040 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016590772316950854, + "loss": 2.4355, + "theoretical_loss": 3.401402308856925, + "tokens_seen": 2216600576 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016589769307923772, + "loss": 2.5176, + "theoretical_loss": 3.4013940925903454, + "tokens_seen": 2216666112 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001658876629889669, + "loss": 2.5627, + "theoretical_loss": 3.401385876634691, + "tokens_seen": 2216731648 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2460785, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.487323760986328, + "objective/train/theoretical_loss": 3.401383822694357, + "objective/train/tokens_used": 2237208032, + "theoretical_loss": 3.401383822694357, + "tokens_seen": 2216748032 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016587763289869608, + "loss": 2.5157, + "theoretical_loss": 3.4013776609899407, + "tokens_seen": 2216797184 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001658676028084253, + "loss": 2.801, + "theoretical_loss": 3.4013694456560737, + "tokens_seen": 2216862720 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016585757271815447, + "loss": 2.6338, + "theoretical_loss": 3.4013612306330687, + "tokens_seen": 2216928256 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016584754262788365, + "loss": 2.4821, + "theoretical_loss": 3.4013530159209058, + "tokens_seen": 2216993792 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016583751253761283, + "loss": 2.4242, + "theoretical_loss": 3.4013448015195626, + "tokens_seen": 2217059328 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016582748244734201, + "loss": 2.6045, + "theoretical_loss": 3.4013365874290193, + "tokens_seen": 2217124864 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016581745235707122, + "loss": 2.6094, + "theoretical_loss": 3.4013283736492546, + "tokens_seen": 2217190400 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001658074222668004, + "loss": 2.4569, + "theoretical_loss": 3.401320160180247, + "tokens_seen": 2217255936 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016579739217652959, + "loss": 2.6809, + "theoretical_loss": 3.4013119470219766, + "tokens_seen": 2217321472 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016578736208625877, + "loss": 2.5658, + "theoretical_loss": 3.4013037341744217, + "tokens_seen": 2217387008 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016577733199598797, + "loss": 2.5389, + "theoretical_loss": 3.4012955216375618, + "tokens_seen": 2217452544 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016576730190571716, + "loss": 2.7178, + "theoretical_loss": 3.4012873094113756, + "tokens_seen": 2217518080 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016575727181544634, + "loss": 2.4499, + "theoretical_loss": 3.401279097495842, + "tokens_seen": 2217583616 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016574724172517552, + "loss": 2.6289, + "theoretical_loss": 3.4012708858909404, + "tokens_seen": 2217649152 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001657372116349047, + "loss": 2.7239, + "theoretical_loss": 3.40126267459665, + "tokens_seen": 2217714688 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001657271815446339, + "loss": 2.479, + "theoretical_loss": 3.40125446361295, + "tokens_seen": 2217780224 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001657171514543631, + "loss": 2.5005, + "theoretical_loss": 3.401246252939819, + "tokens_seen": 2217845760 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016570712136409227, + "loss": 2.6772, + "theoretical_loss": 3.401238042577236, + "tokens_seen": 2217911296 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016569709127382145, + "loss": 2.5782, + "theoretical_loss": 3.4012298325251806, + "tokens_seen": 2217976832 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016568706118355066, + "loss": 2.5029, + "theoretical_loss": 3.401221622783632, + "tokens_seen": 2218042368 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016567703109327984, + "loss": 2.5519, + "theoretical_loss": 3.4012134133525684, + "tokens_seen": 2218107904 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016566700100300902, + "loss": 2.5916, + "theoretical_loss": 3.401205204231969, + "tokens_seen": 2218173440 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001656569709127382, + "loss": 2.6518, + "theoretical_loss": 3.401196995421814, + "tokens_seen": 2218238976 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016564694082246738, + "loss": 2.7503, + "theoretical_loss": 3.4011887869220816, + "tokens_seen": 2218304512 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001656369107321966, + "loss": 2.6563, + "theoretical_loss": 3.401180578732751, + "tokens_seen": 2218370048 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2462178, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6930065155029297, + "objective/train/theoretical_loss": 3.4011785267339163, + "objective/train/tokens_used": 2238846432, + "theoretical_loss": 3.4011785267339163, + "tokens_seen": 2218386432 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016562688064192577, + "loss": 2.5342, + "theoretical_loss": 3.4011723708538013, + "tokens_seen": 2218435584 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016561685055165495, + "loss": 2.4845, + "theoretical_loss": 3.4011641632852117, + "tokens_seen": 2218501120 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016560682046138413, + "loss": 2.6233, + "theoretical_loss": 3.4011559560269613, + "tokens_seen": 2218566656 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016559679037111334, + "loss": 2.462, + "theoretical_loss": 3.401147749079029, + "tokens_seen": 2218632192 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016558676028084252, + "loss": 2.4868, + "theoretical_loss": 3.401139542441394, + "tokens_seen": 2218697728 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001655767301905717, + "loss": 2.4718, + "theoretical_loss": 3.4011313361140356, + "tokens_seen": 2218763264 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016556670010030089, + "loss": 2.579, + "theoretical_loss": 3.4011231300969333, + "tokens_seen": 2218828800 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016555667001003007, + "loss": 2.4977, + "theoretical_loss": 3.4011149243900647, + "tokens_seen": 2218894336 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016554663991975928, + "loss": 2.5163, + "theoretical_loss": 3.40110671899341, + "tokens_seen": 2218959872 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016553660982948848, + "loss": 2.3464, + "theoretical_loss": 3.4010985139069487, + "tokens_seen": 2219025408 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016552657973921767, + "loss": 2.5308, + "theoretical_loss": 3.4010903091306592, + "tokens_seen": 2219090944 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016551654964894685, + "loss": 2.5806, + "theoretical_loss": 3.4010821046645208, + "tokens_seen": 2219156480 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016550651955867605, + "loss": 2.5003, + "theoretical_loss": 3.4010739005085124, + "tokens_seen": 2219222016 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016549648946840524, + "loss": 2.3237, + "theoretical_loss": 3.401065696662614, + "tokens_seen": 2219287552 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016548645937813442, + "loss": 2.4951, + "theoretical_loss": 3.401057493126803, + "tokens_seen": 2219353088 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001654764292878636, + "loss": 2.6057, + "theoretical_loss": 3.40104928990106, + "tokens_seen": 2219418624 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001654663991975928, + "loss": 2.5045, + "theoretical_loss": 3.401041086985364, + "tokens_seen": 2219484160 + }, + { + "epoch": 7.04, + "learning_rate": 0.000165456369107322, + "loss": 2.6923, + "theoretical_loss": 3.4010328843796938, + "tokens_seen": 2219549696 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016544633901705117, + "loss": 2.7211, + "theoretical_loss": 3.401024682084029, + "tokens_seen": 2219615232 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016543630892678035, + "loss": 2.5536, + "theoretical_loss": 3.401016480098347, + "tokens_seen": 2219680768 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016542627883650953, + "loss": 2.338, + "theoretical_loss": 3.4010082784226294, + "tokens_seen": 2219746304 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016541624874623874, + "loss": 2.5667, + "theoretical_loss": 3.401000077056854, + "tokens_seen": 2219811840 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016540621865596792, + "loss": 2.5449, + "theoretical_loss": 3.4009918760009996, + "tokens_seen": 2219877376 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001653961885656971, + "loss": 2.3786, + "theoretical_loss": 3.400983675255046, + "tokens_seen": 2219942912 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016538615847542628, + "loss": 2.425, + "theoretical_loss": 3.4009754748189724, + "tokens_seen": 2220008448 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2462814, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1375701427459717, + "objective/train/theoretical_loss": 3.4009734247583703, + "objective/train/tokens_used": 2240484832, + "theoretical_loss": 3.4009734247583703, + "tokens_seen": 2220024832 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001653761283851555, + "loss": 2.5803, + "theoretical_loss": 3.400967274692758, + "tokens_seen": 2220073984 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016536609829488467, + "loss": 2.693, + "theoretical_loss": 3.400959074876381, + "tokens_seen": 2220139520 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016535606820461385, + "loss": 2.3121, + "theoretical_loss": 3.4009508753698214, + "tokens_seen": 2220205056 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016534603811434303, + "loss": 2.4068, + "theoretical_loss": 3.4009426761730586, + "tokens_seen": 2220270592 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016533600802407221, + "loss": 2.6275, + "theoretical_loss": 3.400934477286071, + "tokens_seen": 2220336128 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016532597793380142, + "loss": 2.3195, + "theoretical_loss": 3.400926278708838, + "tokens_seen": 2220401664 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001653159478435306, + "loss": 2.5358, + "theoretical_loss": 3.400918080441339, + "tokens_seen": 2220467200 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016530591775325979, + "loss": 2.6111, + "theoretical_loss": 3.400909882483553, + "tokens_seen": 2220532736 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016529588766298897, + "loss": 2.3019, + "theoretical_loss": 3.4009016848354587, + "tokens_seen": 2220598272 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016528585757271817, + "loss": 2.5709, + "theoretical_loss": 3.4008934874970365, + "tokens_seen": 2220663808 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016527582748244736, + "loss": 2.6308, + "theoretical_loss": 3.4008852904682643, + "tokens_seen": 2220729344 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016526579739217654, + "loss": 2.4775, + "theoretical_loss": 3.4008770937491217, + "tokens_seen": 2220794880 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016525576730190572, + "loss": 2.3557, + "theoretical_loss": 3.400868897339588, + "tokens_seen": 2220860416 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001652457372116349, + "loss": 2.4597, + "theoretical_loss": 3.400860701239642, + "tokens_seen": 2220925952 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001652357071213641, + "loss": 2.4289, + "theoretical_loss": 3.4008525054492633, + "tokens_seen": 2220991488 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001652256770310933, + "loss": 2.4908, + "theoretical_loss": 3.4008443099684316, + "tokens_seen": 2221057024 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016521564694082247, + "loss": 2.4226, + "theoretical_loss": 3.4008361147971247, + "tokens_seen": 2221122560 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016520561685055165, + "loss": 2.5864, + "theoretical_loss": 3.400827919935322, + "tokens_seen": 2221188096 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016519558676028086, + "loss": 2.4662, + "theoretical_loss": 3.400819725383004, + "tokens_seen": 2221253632 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016518555667001004, + "loss": 2.6602, + "theoretical_loss": 3.400811531140149, + "tokens_seen": 2221319168 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016517552657973922, + "loss": 2.6246, + "theoretical_loss": 3.400803337206736, + "tokens_seen": 2221384704 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001651654964894684, + "loss": 2.8023, + "theoretical_loss": 3.4007951435827444, + "tokens_seen": 2221450240 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016515546639919758, + "loss": 2.4013, + "theoretical_loss": 3.4007869502681536, + "tokens_seen": 2221515776 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001651454363089268, + "loss": 2.4047, + "theoretical_loss": 3.400778757262942, + "tokens_seen": 2221581312 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016513540621865597, + "loss": 2.4227, + "theoretical_loss": 3.40077056456709, + "tokens_seen": 2221646848 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2464160, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5602309703826904, + "objective/train/theoretical_loss": 3.400768516441462, + "objective/train/tokens_used": 2242123232, + "theoretical_loss": 3.400768516441462, + "tokens_seen": 2221663232 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016512537612838515, + "loss": 2.6445, + "theoretical_loss": 3.400762372180576, + "tokens_seen": 2221712384 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016511534603811433, + "loss": 2.6289, + "theoretical_loss": 3.4007541801033794, + "tokens_seen": 2221777920 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016510531594784354, + "loss": 2.7099, + "theoretical_loss": 3.4007459883354794, + "tokens_seen": 2221843456 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016509528585757272, + "loss": 2.5973, + "theoretical_loss": 3.4007377968768546, + "tokens_seen": 2221908992 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001650852557673019, + "loss": 2.5124, + "theoretical_loss": 3.400729605727485, + "tokens_seen": 2221974528 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001650752256770311, + "loss": 2.556, + "theoretical_loss": 3.40072141488735, + "tokens_seen": 2222040064 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016506519558676027, + "loss": 2.5906, + "theoretical_loss": 3.400713224356428, + "tokens_seen": 2222105600 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016505516549648948, + "loss": 2.5106, + "theoretical_loss": 3.400705034134699, + "tokens_seen": 2222171136 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016504513540621866, + "loss": 2.6098, + "theoretical_loss": 3.4006968442221415, + "tokens_seen": 2222236672 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016503510531594784, + "loss": 2.3191, + "theoretical_loss": 3.400688654618735, + "tokens_seen": 2222302208 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016502507522567702, + "loss": 2.5532, + "theoretical_loss": 3.4006804653244584, + "tokens_seen": 2222367744 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016501504513540623, + "loss": 2.7025, + "theoretical_loss": 3.400672276339292, + "tokens_seen": 2222433280 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001650050150451354, + "loss": 2.5905, + "theoretical_loss": 3.4006640876632135, + "tokens_seen": 2222498816 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001649949849548646, + "loss": 2.6712, + "theoretical_loss": 3.4006558992962033, + "tokens_seen": 2222564352 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016498495486459377, + "loss": 2.7824, + "theoretical_loss": 3.40064771123824, + "tokens_seen": 2222629888 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016497492477432295, + "loss": 2.4235, + "theoretical_loss": 3.400639523489303, + "tokens_seen": 2222695424 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016496489468405216, + "loss": 2.4845, + "theoretical_loss": 3.4006313360493716, + "tokens_seen": 2222760960 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016495486459378134, + "loss": 2.4989, + "theoretical_loss": 3.400623148918425, + "tokens_seen": 2222826496 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016494483450351052, + "loss": 2.2728, + "theoretical_loss": 3.4006149620964425, + "tokens_seen": 2222892032 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001649348044132397, + "loss": 2.8237, + "theoretical_loss": 3.400606775583403, + "tokens_seen": 2222957568 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001649247743229689, + "loss": 2.5746, + "theoretical_loss": 3.4005985893792863, + "tokens_seen": 2223023104 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001649147442326981, + "loss": 2.7854, + "theoretical_loss": 3.400590403484071, + "tokens_seen": 2223088640 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016490471414242727, + "loss": 2.5664, + "theoretical_loss": 3.4005822178977367, + "tokens_seen": 2223154176 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016489468405215646, + "loss": 2.1162, + "theoretical_loss": 3.4005740326202627, + "tokens_seen": 2223219712 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016488465396188566, + "loss": 2.7239, + "theoretical_loss": 3.400565847651628, + "tokens_seen": 2223285248 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2464951, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.437429428100586, + "objective/train/theoretical_loss": 3.400563801457723, + "objective/train/tokens_used": 2243761632, + "theoretical_loss": 3.400563801457723, + "tokens_seen": 2223301632 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016487462387161484, + "loss": 2.6034, + "theoretical_loss": 3.4005576629918117, + "tokens_seen": 2223350784 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016486459378134403, + "loss": 2.4235, + "theoretical_loss": 3.4005494786407935, + "tokens_seen": 2223416320 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001648545636910732, + "loss": 2.295, + "theoretical_loss": 3.4005412945985527, + "tokens_seen": 2223481856 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001648445336008024, + "loss": 2.536, + "theoretical_loss": 3.4005331108650685, + "tokens_seen": 2223547392 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001648345035105316, + "loss": 2.5298, + "theoretical_loss": 3.4005249274403195, + "tokens_seen": 2223612928 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016482447342026078, + "loss": 2.356, + "theoretical_loss": 3.4005167443242854, + "tokens_seen": 2223678464 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016481444332998996, + "loss": 2.3328, + "theoretical_loss": 3.4005085615169457, + "tokens_seen": 2223744000 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016480441323971914, + "loss": 2.3784, + "theoretical_loss": 3.400500379018279, + "tokens_seen": 2223809536 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016479438314944835, + "loss": 2.543, + "theoretical_loss": 3.4004921968282655, + "tokens_seen": 2223875072 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016478435305917756, + "loss": 2.672, + "theoretical_loss": 3.400484014946884, + "tokens_seen": 2223940608 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016477432296890674, + "loss": 2.4813, + "theoretical_loss": 3.4004758333741134, + "tokens_seen": 2224006144 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016476429287863592, + "loss": 2.5149, + "theoretical_loss": 3.400467652109933, + "tokens_seen": 2224071680 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001647542627883651, + "loss": 2.4967, + "theoretical_loss": 3.4004594711543232, + "tokens_seen": 2224137216 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001647442326980943, + "loss": 2.5658, + "theoretical_loss": 3.4004512905072617, + "tokens_seen": 2224202752 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001647342026078235, + "loss": 2.684, + "theoretical_loss": 3.4004431101687285, + "tokens_seen": 2224268288 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016472417251755267, + "loss": 2.5506, + "theoretical_loss": 3.4004349301387036, + "tokens_seen": 2224333824 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016471414242728185, + "loss": 2.5758, + "theoretical_loss": 3.400426750417165, + "tokens_seen": 2224399360 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016470411233701106, + "loss": 2.5682, + "theoretical_loss": 3.400418571004092, + "tokens_seen": 2224464896 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016469408224674024, + "loss": 2.6355, + "theoretical_loss": 3.4004103918994653, + "tokens_seen": 2224530432 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016468405215646942, + "loss": 2.7683, + "theoretical_loss": 3.4004022131032627, + "tokens_seen": 2224595968 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001646740220661986, + "loss": 2.3118, + "theoretical_loss": 3.4003940346154646, + "tokens_seen": 2224661504 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016466399197592778, + "loss": 2.6209, + "theoretical_loss": 3.400385856436049, + "tokens_seen": 2224727040 + }, + { + "epoch": 7.04, + "learning_rate": 0.000164653961885657, + "loss": 2.4357, + "theoretical_loss": 3.400377678564997, + "tokens_seen": 2224792576 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016464393179538617, + "loss": 2.7265, + "theoretical_loss": 3.400369501002286, + "tokens_seen": 2224858112 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016463390170511535, + "loss": 2.6239, + "theoretical_loss": 3.4003613237478962, + "tokens_seen": 2224923648 + }, + { + "epoch": 7.04, + "objective/train/docs_used": 2466575, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6779794692993164, + "objective/train/theoretical_loss": 3.4003592794824717, + "objective/train/tokens_used": 2245400032, + "theoretical_loss": 3.4003592794824717, + "tokens_seen": 2224940032 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016462387161484454, + "loss": 2.4948, + "theoretical_loss": 3.400353146801807, + "tokens_seen": 2224989184 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016461384152457374, + "loss": 2.5879, + "theoretical_loss": 3.4003449701639976, + "tokens_seen": 2225054720 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016460381143430292, + "loss": 2.5365, + "theoretical_loss": 3.400336793834447, + "tokens_seen": 2225120256 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001645937813440321, + "loss": 2.4454, + "theoretical_loss": 3.400328617813135, + "tokens_seen": 2225185792 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001645837512537613, + "loss": 2.5732, + "theoretical_loss": 3.40032044210004, + "tokens_seen": 2225251328 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016457372116349047, + "loss": 2.5562, + "theoretical_loss": 3.4003122666951424, + "tokens_seen": 2225316864 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016456369107321968, + "loss": 2.5306, + "theoretical_loss": 3.400304091598421, + "tokens_seen": 2225382400 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016455366098294886, + "loss": 2.469, + "theoretical_loss": 3.400295916809855, + "tokens_seen": 2225447936 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016454363089267804, + "loss": 2.5462, + "theoretical_loss": 3.4002877423294238, + "tokens_seen": 2225513472 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016453360080240722, + "loss": 2.5415, + "theoretical_loss": 3.400279568157107, + "tokens_seen": 2225579008 + }, + { + "epoch": 7.04, + "learning_rate": 0.00016452357071213643, + "loss": 2.512, + "theoretical_loss": 3.4002713942928833, + "tokens_seen": 2225644544 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001645135406218656, + "loss": 2.5341, + "theoretical_loss": 3.4002632207367327, + "tokens_seen": 2225710080 + }, + { + "epoch": 7.04, + "learning_rate": 0.0001645035105315948, + "loss": 2.5635, + "theoretical_loss": 3.400255047488634, + "tokens_seen": 2225775616 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016449348044132397, + "loss": 2.4937, + "theoretical_loss": 3.400246874548567, + "tokens_seen": 2225841152 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016448345035105315, + "loss": 2.2818, + "theoretical_loss": 3.4002387019165106, + "tokens_seen": 2225906688 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016447342026078236, + "loss": 2.4462, + "theoretical_loss": 3.400230529592444, + "tokens_seen": 2225972224 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016446339017051154, + "loss": 2.5255, + "theoretical_loss": 3.400222357576347, + "tokens_seen": 2226037760 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016445336008024072, + "loss": 2.522, + "theoretical_loss": 3.400214185868199, + "tokens_seen": 2226103296 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001644433299899699, + "loss": 2.3445, + "theoretical_loss": 3.4002060144679787, + "tokens_seen": 2226168832 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001644332998996991, + "loss": 2.4582, + "theoretical_loss": 3.400197843375666, + "tokens_seen": 2226234368 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001644232698094283, + "loss": 2.552, + "theoretical_loss": 3.4001896725912397, + "tokens_seen": 2226299904 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016441323971915747, + "loss": 2.5936, + "theoretical_loss": 3.40018150211468, + "tokens_seen": 2226365440 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016440320962888666, + "loss": 2.3367, + "theoretical_loss": 3.400173331945965, + "tokens_seen": 2226430976 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016439317953861586, + "loss": 2.6799, + "theoretical_loss": 3.4001651620850755, + "tokens_seen": 2226496512 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016438314944834504, + "loss": 2.4653, + "theoretical_loss": 3.400156992531989, + "tokens_seen": 2226562048 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2467215, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7138404846191406, + "objective/train/theoretical_loss": 3.40015495019181, + "objective/train/tokens_used": 2247038432, + "theoretical_loss": 3.40015495019181, + "tokens_seen": 2226578432 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016437311935807423, + "loss": 2.4192, + "theoretical_loss": 3.400148823286687, + "tokens_seen": 2226627584 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001643630892678034, + "loss": 2.5863, + "theoretical_loss": 3.4001406543491473, + "tokens_seen": 2226693120 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001643530591775326, + "loss": 2.5194, + "theoretical_loss": 3.4001324857193493, + "tokens_seen": 2226758656 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001643430290872618, + "loss": 2.5512, + "theoretical_loss": 3.4001243173972737, + "tokens_seen": 2226824192 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016433299899699098, + "loss": 2.5186, + "theoretical_loss": 3.400116149382898, + "tokens_seen": 2226889728 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016432296890672016, + "loss": 2.4675, + "theoretical_loss": 3.400107981676203, + "tokens_seen": 2226955264 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016431293881644934, + "loss": 2.5111, + "theoretical_loss": 3.4000998142771675, + "tokens_seen": 2227020800 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016430290872617855, + "loss": 2.408, + "theoretical_loss": 3.4000916471857705, + "tokens_seen": 2227086336 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016429287863590773, + "loss": 2.4639, + "theoretical_loss": 3.4000834804019924, + "tokens_seen": 2227151872 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001642828485456369, + "loss": 2.5817, + "theoretical_loss": 3.4000753139258113, + "tokens_seen": 2227217408 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001642728184553661, + "loss": 2.4353, + "theoretical_loss": 3.4000671477572073, + "tokens_seen": 2227282944 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016426278836509527, + "loss": 2.7358, + "theoretical_loss": 3.40005898189616, + "tokens_seen": 2227348480 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016425275827482448, + "loss": 2.366, + "theoretical_loss": 3.400050816342648, + "tokens_seen": 2227414016 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016424272818455366, + "loss": 2.7103, + "theoretical_loss": 3.4000426510966513, + "tokens_seen": 2227479552 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016423269809428284, + "loss": 2.366, + "theoretical_loss": 3.4000344861581486, + "tokens_seen": 2227545088 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016422266800401202, + "loss": 2.5835, + "theoretical_loss": 3.40002632152712, + "tokens_seen": 2227610624 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016421263791374123, + "loss": 2.701, + "theoretical_loss": 3.4000181572035446, + "tokens_seen": 2227676160 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001642026078234704, + "loss": 2.5845, + "theoretical_loss": 3.400009993187402, + "tokens_seen": 2227741696 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001641925777331996, + "loss": 2.4863, + "theoretical_loss": 3.400001829478671, + "tokens_seen": 2227807232 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016418254764292878, + "loss": 2.5996, + "theoretical_loss": 3.3999936660773313, + "tokens_seen": 2227872768 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016417251755265796, + "loss": 2.5761, + "theoretical_loss": 3.3999855029833626, + "tokens_seen": 2227938304 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016416248746238716, + "loss": 2.4479, + "theoretical_loss": 3.3999773401967444, + "tokens_seen": 2228003840 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016415245737211635, + "loss": 2.451, + "theoretical_loss": 3.399969177717455, + "tokens_seen": 2228069376 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016414242728184553, + "loss": 2.6271, + "theoretical_loss": 3.3999610155454745, + "tokens_seen": 2228134912 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001641323971915747, + "loss": 2.6263, + "theoretical_loss": 3.3999528536807824, + "tokens_seen": 2228200448 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2468498, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3341238498687744, + "objective/train/theoretical_loss": 3.399950813262621, + "objective/train/tokens_used": 2248676832, + "theoretical_loss": 3.399950813262621, + "tokens_seen": 2228216832 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016412236710130392, + "loss": 2.5895, + "theoretical_loss": 3.3999446921233583, + "tokens_seen": 2228265984 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001641123370110331, + "loss": 2.5387, + "theoretical_loss": 3.399936530873181, + "tokens_seen": 2228331520 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016410230692076228, + "loss": 2.6196, + "theoretical_loss": 3.3999283699302296, + "tokens_seen": 2228397056 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016409227683049146, + "loss": 2.5861, + "theoretical_loss": 3.3999202092944847, + "tokens_seen": 2228462592 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016408224674022064, + "loss": 2.3674, + "theoretical_loss": 3.399912048965925, + "tokens_seen": 2228528128 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016407221664994985, + "loss": 2.5949, + "theoretical_loss": 3.3999038889445297, + "tokens_seen": 2228593664 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016406218655967903, + "loss": 2.8083, + "theoretical_loss": 3.3998957292302787, + "tokens_seen": 2228659200 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001640521564694082, + "loss": 2.3291, + "theoretical_loss": 3.3998875698231514, + "tokens_seen": 2228724736 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001640421263791374, + "loss": 2.5446, + "theoretical_loss": 3.3998794107231265, + "tokens_seen": 2228790272 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016403209628886663, + "loss": 2.6207, + "theoretical_loss": 3.3998712519301844, + "tokens_seen": 2228855808 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001640220661985958, + "loss": 2.5168, + "theoretical_loss": 3.3998630934443037, + "tokens_seen": 2228921344 + }, + { + "epoch": 7.05, + "learning_rate": 0.000164012036108325, + "loss": 2.4427, + "theoretical_loss": 3.399854935265464, + "tokens_seen": 2228986880 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016400200601805417, + "loss": 2.3897, + "theoretical_loss": 3.399846777393645, + "tokens_seen": 2229052416 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016399197592778335, + "loss": 2.6505, + "theoretical_loss": 3.399838619828826, + "tokens_seen": 2229117952 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016398194583751256, + "loss": 2.8211, + "theoretical_loss": 3.3998304625709865, + "tokens_seen": 2229183488 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016397191574724174, + "loss": 2.3272, + "theoretical_loss": 3.3998223056201056, + "tokens_seen": 2229249024 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016396188565697092, + "loss": 2.4998, + "theoretical_loss": 3.399814148976163, + "tokens_seen": 2229314560 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001639518555667001, + "loss": 2.5928, + "theoretical_loss": 3.3998059926391377, + "tokens_seen": 2229380096 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001639418254764293, + "loss": 2.6122, + "theoretical_loss": 3.39979783660901, + "tokens_seen": 2229445632 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001639317953861585, + "loss": 2.5906, + "theoretical_loss": 3.399789680885759, + "tokens_seen": 2229511168 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016392176529588767, + "loss": 2.3937, + "theoretical_loss": 3.3997815254693635, + "tokens_seen": 2229576704 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016391173520561686, + "loss": 2.6078, + "theoretical_loss": 3.3997733703598034, + "tokens_seen": 2229642240 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016390170511534606, + "loss": 2.6053, + "theoretical_loss": 3.3997652155570584, + "tokens_seen": 2229707776 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016389167502507524, + "loss": 2.4851, + "theoretical_loss": 3.3997570610611074, + "tokens_seen": 2229773312 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016388164493480443, + "loss": 2.6072, + "theoretical_loss": 3.39974890687193, + "tokens_seen": 2229838848 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2468500, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.782400131225586, + "objective/train/theoretical_loss": 3.399746868372567, + "objective/train/tokens_used": 2250315232, + "theoretical_loss": 3.399746868372567, + "tokens_seen": 2229855232 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001638716148445336, + "loss": 2.7137, + "theoretical_loss": 3.3997407529895063, + "tokens_seen": 2229904384 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001638615847542628, + "loss": 2.5723, + "theoretical_loss": 3.399732599413815, + "tokens_seen": 2229969920 + }, + { + "epoch": 7.05, + "learning_rate": 0.000163851554663992, + "loss": 2.5844, + "theoretical_loss": 3.3997244461448357, + "tokens_seen": 2230035456 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016384152457372118, + "loss": 2.4591, + "theoretical_loss": 3.3997162931825478, + "tokens_seen": 2230100992 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016383149448345036, + "loss": 2.503, + "theoretical_loss": 3.399708140526931, + "tokens_seen": 2230166528 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016382146439317954, + "loss": 2.5943, + "theoretical_loss": 3.3996999881779644, + "tokens_seen": 2230232064 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016381143430290875, + "loss": 2.5993, + "theoretical_loss": 3.3996918361356276, + "tokens_seen": 2230297600 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016380140421263793, + "loss": 2.51, + "theoretical_loss": 3.3996836843999003, + "tokens_seen": 2230363136 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001637913741223671, + "loss": 2.6185, + "theoretical_loss": 3.3996755329707615, + "tokens_seen": 2230428672 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001637813440320963, + "loss": 2.5673, + "theoretical_loss": 3.3996673818481913, + "tokens_seen": 2230494208 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016377131394182547, + "loss": 2.4846, + "theoretical_loss": 3.3996592310321687, + "tokens_seen": 2230559744 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016376128385155468, + "loss": 2.7065, + "theoretical_loss": 3.3996510805226734, + "tokens_seen": 2230625280 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016375125376128386, + "loss": 2.6306, + "theoretical_loss": 3.3996429303196845, + "tokens_seen": 2230690816 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016374122367101304, + "loss": 2.5108, + "theoretical_loss": 3.3996347804231815, + "tokens_seen": 2230756352 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016373119358074222, + "loss": 2.4893, + "theoretical_loss": 3.3996266308331444, + "tokens_seen": 2230821888 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016372116349047143, + "loss": 2.7361, + "theoretical_loss": 3.3996184815495525, + "tokens_seen": 2230887424 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001637111334002006, + "loss": 2.5527, + "theoretical_loss": 3.3996103325723848, + "tokens_seen": 2230952960 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001637011033099298, + "loss": 2.6152, + "theoretical_loss": 3.399602183901621, + "tokens_seen": 2231018496 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001637011033099298, + "loss": 2.6495, + "theoretical_loss": 3.3995940355372407, + "tokens_seen": 2231084032 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016369107321965898, + "loss": 2.6318, + "theoretical_loss": 3.3995858874792235, + "tokens_seen": 2231149568 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016368104312938816, + "loss": 2.6683, + "theoretical_loss": 3.399577739727549, + "tokens_seen": 2231215104 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016367101303911736, + "loss": 2.4301, + "theoretical_loss": 3.399569592282196, + "tokens_seen": 2231280640 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016366098294884655, + "loss": 2.4458, + "theoretical_loss": 3.399561445143145, + "tokens_seen": 2231346176 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016365095285857573, + "loss": 2.5351, + "theoretical_loss": 3.399553298310374, + "tokens_seen": 2231411712 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001636409227683049, + "loss": 2.4278, + "theoretical_loss": 3.399545151783864, + "tokens_seen": 2231477248 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2468500, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5570969581604004, + "objective/train/theoretical_loss": 3.3995431152000872, + "objective/train/tokens_used": 2251953632, + "theoretical_loss": 3.3995431152000872, + "tokens_seen": 2231493632 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016363089267803412, + "loss": 2.624, + "theoretical_loss": 3.3995370055635936, + "tokens_seen": 2231542784 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001636208625877633, + "loss": 2.4999, + "theoretical_loss": 3.3995288596495428, + "tokens_seen": 2231608320 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016361083249749248, + "loss": 2.62, + "theoretical_loss": 3.3995207140416905, + "tokens_seen": 2231673856 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016360080240722166, + "loss": 2.5908, + "theoretical_loss": 3.3995125687400174, + "tokens_seen": 2231739392 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016359077231695084, + "loss": 2.6336, + "theoretical_loss": 3.399504423744501, + "tokens_seen": 2231804928 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016358074222668005, + "loss": 2.3843, + "theoretical_loss": 3.399496279055123, + "tokens_seen": 2231870464 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016357071213640923, + "loss": 2.5515, + "theoretical_loss": 3.3994881346718615, + "tokens_seen": 2231936000 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001635606820461384, + "loss": 2.5386, + "theoretical_loss": 3.3994799905946964, + "tokens_seen": 2232001536 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001635506519558676, + "loss": 2.5528, + "theoretical_loss": 3.399471846823607, + "tokens_seen": 2232067072 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001635406218655968, + "loss": 2.5487, + "theoretical_loss": 3.399463703358573, + "tokens_seen": 2232132608 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016353059177532598, + "loss": 2.7037, + "theoretical_loss": 3.3994555601995744, + "tokens_seen": 2232198144 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016352056168505516, + "loss": 2.6138, + "theoretical_loss": 3.39944741734659, + "tokens_seen": 2232263680 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016351053159478434, + "loss": 2.6, + "theoretical_loss": 3.399439274799599, + "tokens_seen": 2232329216 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016350050150451353, + "loss": 2.2841, + "theoretical_loss": 3.3994311325585826, + "tokens_seen": 2232394752 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016349047141424273, + "loss": 2.4638, + "theoretical_loss": 3.3994229906235183, + "tokens_seen": 2232460288 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016348044132397191, + "loss": 2.7795, + "theoretical_loss": 3.399414848994387, + "tokens_seen": 2232525824 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001634704112337011, + "loss": 2.409, + "theoretical_loss": 3.3994067076711674, + "tokens_seen": 2232591360 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016346038114343028, + "loss": 2.5132, + "theoretical_loss": 3.399398566653839, + "tokens_seen": 2232656896 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016345035105315949, + "loss": 2.5935, + "theoretical_loss": 3.3993904259423826, + "tokens_seen": 2232722432 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016344032096288867, + "loss": 2.5717, + "theoretical_loss": 3.399382285536776, + "tokens_seen": 2232787968 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016343029087261785, + "loss": 2.5361, + "theoretical_loss": 3.399374145437, + "tokens_seen": 2232853504 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016342026078234703, + "loss": 2.481, + "theoretical_loss": 3.3993660056430337, + "tokens_seen": 2232919040 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001634102306920762, + "loss": 2.3355, + "theoretical_loss": 3.3993578661548565, + "tokens_seen": 2232984576 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016340020060180542, + "loss": 2.494, + "theoretical_loss": 3.3993497269724484, + "tokens_seen": 2233050112 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001633901705115346, + "loss": 2.6795, + "theoretical_loss": 3.3993415880957882, + "tokens_seen": 2233115648 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2470005, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.795356273651123, + "objective/train/theoretical_loss": 3.3993395534243938, + "objective/train/tokens_used": 2253592032, + "theoretical_loss": 3.3993395534243938, + "tokens_seen": 2233132032 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016338014042126378, + "loss": 2.6391, + "theoretical_loss": 3.399333449524856, + "tokens_seen": 2233181184 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016337011033099296, + "loss": 2.6536, + "theoretical_loss": 3.399325311259631, + "tokens_seen": 2233246720 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016336008024072217, + "loss": 2.4637, + "theoretical_loss": 3.3993171733000933, + "tokens_seen": 2233312256 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016335005015045135, + "loss": 2.6439, + "theoretical_loss": 3.3993090356462217, + "tokens_seen": 2233377792 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016334002006018053, + "loss": 2.5419, + "theoretical_loss": 3.399300898297996, + "tokens_seen": 2233443328 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001633299899699097, + "loss": 2.512, + "theoretical_loss": 3.3992927612553965, + "tokens_seen": 2233508864 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016331995987963892, + "loss": 2.8751, + "theoretical_loss": 3.3992846245184016, + "tokens_seen": 2233574400 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001633099297893681, + "loss": 2.6682, + "theoretical_loss": 3.3992764880869917, + "tokens_seen": 2233639936 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016329989969909728, + "loss": 2.4985, + "theoretical_loss": 3.3992683519611457, + "tokens_seen": 2233705472 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016328986960882646, + "loss": 2.5462, + "theoretical_loss": 3.3992602161408443, + "tokens_seen": 2233771008 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016327983951855567, + "loss": 2.5535, + "theoretical_loss": 3.3992520806260655, + "tokens_seen": 2233836544 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016326980942828488, + "loss": 2.5491, + "theoretical_loss": 3.39924394541679, + "tokens_seen": 2233902080 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016325977933801406, + "loss": 2.6774, + "theoretical_loss": 3.399235810512997, + "tokens_seen": 2233967616 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016324974924774324, + "loss": 2.6009, + "theoretical_loss": 3.399227675914666, + "tokens_seen": 2234033152 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016323971915747242, + "loss": 2.6578, + "theoretical_loss": 3.3992195416217763, + "tokens_seen": 2234098688 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016322968906720163, + "loss": 2.7501, + "theoretical_loss": 3.3992114076343083, + "tokens_seen": 2234164224 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001632196589769308, + "loss": 2.4936, + "theoretical_loss": 3.399203273952241, + "tokens_seen": 2234229760 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016320962888666, + "loss": 2.6222, + "theoretical_loss": 3.399195140575554, + "tokens_seen": 2234295296 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016319959879638918, + "loss": 2.4416, + "theoretical_loss": 3.399187007504227, + "tokens_seen": 2234360832 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016318956870611836, + "loss": 2.6611, + "theoretical_loss": 3.399178874738239, + "tokens_seen": 2234426368 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016317953861584757, + "loss": 2.5053, + "theoretical_loss": 3.3991707422775708, + "tokens_seen": 2234491904 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016316950852557675, + "loss": 2.3979, + "theoretical_loss": 3.3991626101222012, + "tokens_seen": 2234557440 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016315947843530593, + "loss": 2.4044, + "theoretical_loss": 3.3991544782721097, + "tokens_seen": 2234622976 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001631494483450351, + "loss": 2.4792, + "theoretical_loss": 3.3991463467272762, + "tokens_seen": 2234688512 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016313941825476432, + "loss": 2.668, + "theoretical_loss": 3.39913821548768, + "tokens_seen": 2234754048 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2470559, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.702888011932373, + "objective/train/theoretical_loss": 3.3991361827254716, + "objective/train/tokens_used": 2255230432, + "theoretical_loss": 3.3991361827254716, + "tokens_seen": 2234770432 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001631293881644935, + "loss": 2.6022, + "theoretical_loss": 3.399130084553301, + "tokens_seen": 2234819584 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016311935807422268, + "loss": 2.4774, + "theoretical_loss": 3.399121953924118, + "tokens_seen": 2234885120 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016310932798395186, + "loss": 2.4132, + "theoretical_loss": 3.399113823600112, + "tokens_seen": 2234950656 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016309929789368104, + "loss": 2.4818, + "theoretical_loss": 3.3991056935812614, + "tokens_seen": 2235016192 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016308926780341025, + "loss": 2.5401, + "theoretical_loss": 3.3990975638675462, + "tokens_seen": 2235081728 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016307923771313943, + "loss": 2.7219, + "theoretical_loss": 3.399089434458946, + "tokens_seen": 2235147264 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001630692076228686, + "loss": 2.5073, + "theoretical_loss": 3.3990813053554407, + "tokens_seen": 2235212800 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001630591775325978, + "loss": 2.604, + "theoretical_loss": 3.3990731765570095, + "tokens_seen": 2235278336 + }, + { + "epoch": 7.05, + "learning_rate": 0.000163049147442327, + "loss": 2.6538, + "theoretical_loss": 3.3990650480636324, + "tokens_seen": 2235343872 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016303911735205618, + "loss": 2.6586, + "theoretical_loss": 3.3990569198752887, + "tokens_seen": 2235409408 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016302908726178536, + "loss": 2.4986, + "theoretical_loss": 3.399048791991958, + "tokens_seen": 2235474944 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016301905717151454, + "loss": 2.5009, + "theoretical_loss": 3.3990406644136195, + "tokens_seen": 2235540480 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016300902708124373, + "loss": 2.5207, + "theoretical_loss": 3.3990325371402537, + "tokens_seen": 2235606016 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016299899699097293, + "loss": 2.4737, + "theoretical_loss": 3.39902441017184, + "tokens_seen": 2235671552 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016298896690070211, + "loss": 2.6817, + "theoretical_loss": 3.3990162835083573, + "tokens_seen": 2235737088 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001629789368104313, + "loss": 2.509, + "theoretical_loss": 3.399008157149786, + "tokens_seen": 2235802624 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016296890672016048, + "loss": 2.5338, + "theoretical_loss": 3.3990000310961057, + "tokens_seen": 2235868160 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016295887662988969, + "loss": 2.4804, + "theoretical_loss": 3.3989919053472955, + "tokens_seen": 2235933696 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016294884653961887, + "loss": 2.628, + "theoretical_loss": 3.3989837799033356, + "tokens_seen": 2235999232 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016293881644934805, + "loss": 2.4206, + "theoretical_loss": 3.3989756547642047, + "tokens_seen": 2236064768 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016292878635907723, + "loss": 2.6093, + "theoretical_loss": 3.398967529929884, + "tokens_seen": 2236130304 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001629187562688064, + "loss": 2.7743, + "theoretical_loss": 3.3989594054003516, + "tokens_seen": 2236195840 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016290872617853562, + "loss": 2.5053, + "theoretical_loss": 3.398951281175588, + "tokens_seen": 2236261376 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001628986960882648, + "loss": 2.5481, + "theoretical_loss": 3.398943157255572, + "tokens_seen": 2236326912 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016288866599799398, + "loss": 2.5194, + "theoretical_loss": 3.3989350336402846, + "tokens_seen": 2236392448 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2471831, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.507685422897339, + "objective/train/theoretical_loss": 3.398933002784074, + "objective/train/tokens_used": 2256868832, + "theoretical_loss": 3.398933002784074, + "tokens_seen": 2236408832 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016287863590772316, + "loss": 2.4382, + "theoretical_loss": 3.3989269103297044, + "tokens_seen": 2236457984 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016286860581745237, + "loss": 2.6608, + "theoretical_loss": 3.398918787323811, + "tokens_seen": 2236523520 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016285857572718155, + "loss": 2.4912, + "theoretical_loss": 3.398910664622585, + "tokens_seen": 2236589056 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016284854563691073, + "loss": 2.5075, + "theoretical_loss": 3.3989025422260046, + "tokens_seen": 2236654592 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001628385155466399, + "loss": 2.6413, + "theoretical_loss": 3.3988944201340505, + "tokens_seen": 2236720128 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016282848545636912, + "loss": 2.4228, + "theoretical_loss": 3.3988862983467025, + "tokens_seen": 2236785664 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001628184553660983, + "loss": 2.4591, + "theoretical_loss": 3.3988781768639393, + "tokens_seen": 2236851200 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016280842527582748, + "loss": 2.5281, + "theoretical_loss": 3.3988700556857414, + "tokens_seen": 2236916736 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016279839518555666, + "loss": 2.3792, + "theoretical_loss": 3.3988619348120883, + "tokens_seen": 2236982272 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016278836509528585, + "loss": 2.7398, + "theoretical_loss": 3.398853814242959, + "tokens_seen": 2237047808 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016277833500501505, + "loss": 2.563, + "theoretical_loss": 3.398845693978334, + "tokens_seen": 2237113344 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016276830491474423, + "loss": 2.456, + "theoretical_loss": 3.398837574018193, + "tokens_seen": 2237178880 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016275827482447342, + "loss": 2.526, + "theoretical_loss": 3.398829454362515, + "tokens_seen": 2237244416 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001627482447342026, + "loss": 2.7816, + "theoretical_loss": 3.39882133501128, + "tokens_seen": 2237309952 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001627382146439318, + "loss": 2.5496, + "theoretical_loss": 3.3988132159644673, + "tokens_seen": 2237375488 + }, + { + "epoch": 7.05, + "learning_rate": 0.000162728184553661, + "loss": 2.1864, + "theoretical_loss": 3.398805097222057, + "tokens_seen": 2237441024 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016271815446339017, + "loss": 2.4996, + "theoretical_loss": 3.398796978784029, + "tokens_seen": 2237506560 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016270812437311935, + "loss": 2.5829, + "theoretical_loss": 3.3987888606503622, + "tokens_seen": 2237572096 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016269809428284853, + "loss": 2.6916, + "theoretical_loss": 3.398780742821037, + "tokens_seen": 2237637632 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016268806419257774, + "loss": 2.5186, + "theoretical_loss": 3.398772625296033, + "tokens_seen": 2237703168 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016267803410230692, + "loss": 2.6568, + "theoretical_loss": 3.3987645080753293, + "tokens_seen": 2237768704 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001626680040120361, + "loss": 2.5907, + "theoretical_loss": 3.398756391158906, + "tokens_seen": 2237834240 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016265797392176528, + "loss": 2.6888, + "theoretical_loss": 3.3987482745467426, + "tokens_seen": 2237899776 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001626479438314945, + "loss": 2.5174, + "theoretical_loss": 3.3987401582388195, + "tokens_seen": 2237965312 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016263791374122367, + "loss": 2.3748, + "theoretical_loss": 3.3987320422351153, + "tokens_seen": 2238030848 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2472586, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.850743532180786, + "objective/train/theoretical_loss": 3.398730013281721, + "objective/train/tokens_used": 2258507232, + "theoretical_loss": 3.398730013281721, + "tokens_seen": 2238047232 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016262788365095285, + "loss": 2.4631, + "theoretical_loss": 3.3987239265356104, + "tokens_seen": 2238096384 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016261785356068203, + "loss": 2.3854, + "theoretical_loss": 3.398715811140284, + "tokens_seen": 2238161920 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016260782347041121, + "loss": 2.3619, + "theoretical_loss": 3.3987076960491165, + "tokens_seen": 2238227456 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016259779338014042, + "loss": 2.6527, + "theoretical_loss": 3.398699581262087, + "tokens_seen": 2238292992 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001625877632898696, + "loss": 2.5726, + "theoretical_loss": 3.3986914667791757, + "tokens_seen": 2238358528 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016257773319959878, + "loss": 2.7731, + "theoretical_loss": 3.3986833526003615, + "tokens_seen": 2238424064 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016256770310932797, + "loss": 2.4365, + "theoretical_loss": 3.398675238725625, + "tokens_seen": 2238489600 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016255767301905717, + "loss": 2.48, + "theoretical_loss": 3.3986671251549447, + "tokens_seen": 2238555136 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016254764292878636, + "loss": 2.7582, + "theoretical_loss": 3.3986590118883018, + "tokens_seen": 2238620672 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016253761283851554, + "loss": 2.5672, + "theoretical_loss": 3.398650898925675, + "tokens_seen": 2238686208 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016252758274824474, + "loss": 2.4471, + "theoretical_loss": 3.3986427862670445, + "tokens_seen": 2238751744 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016251755265797393, + "loss": 2.5957, + "theoretical_loss": 3.39863467391239, + "tokens_seen": 2238817280 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016250752256770313, + "loss": 2.7445, + "theoretical_loss": 3.3986265618616907, + "tokens_seen": 2238882816 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016249749247743231, + "loss": 2.5863, + "theoretical_loss": 3.3986184501149266, + "tokens_seen": 2238948352 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001624874623871615, + "loss": 2.574, + "theoretical_loss": 3.3986103386720776, + "tokens_seen": 2239013888 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016247743229689068, + "loss": 2.6588, + "theoretical_loss": 3.3986022275331234, + "tokens_seen": 2239079424 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016246740220661989, + "loss": 2.551, + "theoretical_loss": 3.3985941166980433, + "tokens_seen": 2239144960 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016245737211634907, + "loss": 2.5548, + "theoretical_loss": 3.398586006166817, + "tokens_seen": 2239210496 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016244734202607825, + "loss": 2.594, + "theoretical_loss": 3.3985778959394253, + "tokens_seen": 2239276032 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016243731193580743, + "loss": 2.537, + "theoretical_loss": 3.398569786015847, + "tokens_seen": 2239341568 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001624272818455366, + "loss": 2.653, + "theoretical_loss": 3.3985616763960618, + "tokens_seen": 2239407104 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016241725175526582, + "loss": 2.5005, + "theoretical_loss": 3.3985535670800497, + "tokens_seen": 2239472640 + }, + { + "epoch": 7.05, + "learning_rate": 0.000162407221664995, + "loss": 2.458, + "theoretical_loss": 3.3985454580677903, + "tokens_seen": 2239538176 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016239719157472418, + "loss": 2.6284, + "theoretical_loss": 3.3985373493592634, + "tokens_seen": 2239603712 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016238716148445336, + "loss": 2.4283, + "theoretical_loss": 3.3985292409544487, + "tokens_seen": 2239669248 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2473901, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7133913040161133, + "objective/train/theoretical_loss": 3.3985272139006977, + "objective/train/tokens_used": 2260145632, + "theoretical_loss": 3.3985272139006977, + "tokens_seen": 2239685632 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016237713139418257, + "loss": 2.5781, + "theoretical_loss": 3.3985211328533262, + "tokens_seen": 2239734784 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016236710130391175, + "loss": 2.5601, + "theoretical_loss": 3.398513025055875, + "tokens_seen": 2239800320 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016235707121364093, + "loss": 2.407, + "theoretical_loss": 3.3985049175620756, + "tokens_seen": 2239865856 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001623470411233701, + "loss": 2.4348, + "theoretical_loss": 3.398496810371907, + "tokens_seen": 2239931392 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016233701103309932, + "loss": 2.6019, + "theoretical_loss": 3.39848870348535, + "tokens_seen": 2239996928 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001623269809428285, + "loss": 2.4602, + "theoretical_loss": 3.3984805969023832, + "tokens_seen": 2240062464 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016231695085255768, + "loss": 2.6333, + "theoretical_loss": 3.398472490622987, + "tokens_seen": 2240128000 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016230692076228686, + "loss": 2.4427, + "theoretical_loss": 3.398464384647141, + "tokens_seen": 2240193536 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016229689067201605, + "loss": 2.4138, + "theoretical_loss": 3.398456278974825, + "tokens_seen": 2240259072 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016228686058174525, + "loss": 2.2662, + "theoretical_loss": 3.3984481736060186, + "tokens_seen": 2240324608 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016227683049147444, + "loss": 2.4961, + "theoretical_loss": 3.3984400685407015, + "tokens_seen": 2240390144 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016226680040120362, + "loss": 2.6154, + "theoretical_loss": 3.3984319637788536, + "tokens_seen": 2240455680 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001622567703109328, + "loss": 2.5567, + "theoretical_loss": 3.398423859320455, + "tokens_seen": 2240521216 + }, + { + "epoch": 7.05, + "learning_rate": 0.000162246740220662, + "loss": 2.762, + "theoretical_loss": 3.3984157551654848, + "tokens_seen": 2240586752 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001622367101303912, + "loss": 2.5139, + "theoretical_loss": 3.398407651313923, + "tokens_seen": 2240652288 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016222668004012037, + "loss": 2.5415, + "theoretical_loss": 3.3983995477657496, + "tokens_seen": 2240717824 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016221664994984955, + "loss": 2.501, + "theoretical_loss": 3.3983914445209447, + "tokens_seen": 2240783360 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016220661985957873, + "loss": 2.6882, + "theoretical_loss": 3.3983833415794873, + "tokens_seen": 2240848896 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016219658976930794, + "loss": 2.4454, + "theoretical_loss": 3.398375238941357, + "tokens_seen": 2240914432 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016218655967903712, + "loss": 2.6263, + "theoretical_loss": 3.3983671366065344, + "tokens_seen": 2240979968 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001621765295887663, + "loss": 2.5805, + "theoretical_loss": 3.398359034574999, + "tokens_seen": 2241045504 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016216649949849548, + "loss": 2.5671, + "theoretical_loss": 3.3983509328467307, + "tokens_seen": 2241111040 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001621564694082247, + "loss": 2.7138, + "theoretical_loss": 3.398342831421709, + "tokens_seen": 2241176576 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016214643931795387, + "loss": 2.531, + "theoretical_loss": 3.3983347302999136, + "tokens_seen": 2241242112 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016213640922768305, + "loss": 2.5621, + "theoretical_loss": 3.3983266294813244, + "tokens_seen": 2241307648 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2474546, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.218095064163208, + "objective/train/theoretical_loss": 3.398324604324051, + "objective/train/tokens_used": 2261784032, + "theoretical_loss": 3.398324604324051, + "tokens_seen": 2241324032 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016212637913741223, + "loss": 2.4963, + "theoretical_loss": 3.3983185289659215, + "tokens_seen": 2241373184 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016211634904714141, + "loss": 2.6598, + "theoretical_loss": 3.3983104287536845, + "tokens_seen": 2241438720 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016210631895687062, + "loss": 2.601, + "theoretical_loss": 3.3983023288445926, + "tokens_seen": 2241504256 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001620962888665998, + "loss": 2.4092, + "theoretical_loss": 3.3982942292386267, + "tokens_seen": 2241569792 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016208625877632898, + "loss": 2.5182, + "theoretical_loss": 3.398286129935766, + "tokens_seen": 2241635328 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016207622868605817, + "loss": 2.6751, + "theoretical_loss": 3.3982780309359897, + "tokens_seen": 2241700864 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016206619859578737, + "loss": 2.6696, + "theoretical_loss": 3.3982699322392786, + "tokens_seen": 2241766400 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016205616850551656, + "loss": 2.2897, + "theoretical_loss": 3.398261833845612, + "tokens_seen": 2241831936 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016204613841524574, + "loss": 2.3596, + "theoretical_loss": 3.39825373575497, + "tokens_seen": 2241897472 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016203610832497492, + "loss": 2.4412, + "theoretical_loss": 3.398245637967332, + "tokens_seen": 2241963008 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001620260782347041, + "loss": 2.5879, + "theoretical_loss": 3.398237540482678, + "tokens_seen": 2242028544 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001620160481444333, + "loss": 2.5687, + "theoretical_loss": 3.398229443300988, + "tokens_seen": 2242094080 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001620060180541625, + "loss": 2.5155, + "theoretical_loss": 3.3982213464222415, + "tokens_seen": 2242159616 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016199598796389167, + "loss": 2.6192, + "theoretical_loss": 3.3982132498464184, + "tokens_seen": 2242225152 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016198595787362085, + "loss": 2.5076, + "theoretical_loss": 3.3982051535734987, + "tokens_seen": 2242290688 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016197592778335006, + "loss": 2.4531, + "theoretical_loss": 3.398197057603462, + "tokens_seen": 2242356224 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016196589769307924, + "loss": 2.8021, + "theoretical_loss": 3.398188961936288, + "tokens_seen": 2242421760 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016195586760280842, + "loss": 2.6634, + "theoretical_loss": 3.3981808665719573, + "tokens_seen": 2242487296 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001619458375125376, + "loss": 2.5988, + "theoretical_loss": 3.3981727715104486, + "tokens_seen": 2242552832 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016193580742226678, + "loss": 2.5537, + "theoretical_loss": 3.3981646767517426, + "tokens_seen": 2242618368 + }, + { + "epoch": 7.05, + "learning_rate": 0.000161925777331996, + "loss": 2.6312, + "theoretical_loss": 3.3981565822958184, + "tokens_seen": 2242683904 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016191574724172517, + "loss": 2.5248, + "theoretical_loss": 3.3981484881426565, + "tokens_seen": 2242749440 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016190571715145435, + "loss": 2.6848, + "theoretical_loss": 3.3981403942922364, + "tokens_seen": 2242814976 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016189568706118353, + "loss": 2.6274, + "theoretical_loss": 3.398132300744538, + "tokens_seen": 2242880512 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016188565697091274, + "loss": 2.5252, + "theoretical_loss": 3.398124207499541, + "tokens_seen": 2242946048 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2475891, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8257968425750732, + "objective/train/theoretical_loss": 3.398122184235586, + "objective/train/tokens_used": 2263422432, + "theoretical_loss": 3.398122184235586, + "tokens_seen": 2242962432 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016187562688064192, + "loss": 2.5885, + "theoretical_loss": 3.398116114557225, + "tokens_seen": 2243011584 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001618655967903711, + "loss": 2.687, + "theoretical_loss": 3.3981080219175706, + "tokens_seen": 2243077120 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016185556670010029, + "loss": 2.6817, + "theoretical_loss": 3.398099929580557, + "tokens_seen": 2243142656 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016184553660982947, + "loss": 2.4879, + "theoretical_loss": 3.3980918375461644, + "tokens_seen": 2243208192 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016183550651955868, + "loss": 2.6783, + "theoretical_loss": 3.3980837458143727, + "tokens_seen": 2243273728 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016182547642928786, + "loss": 2.6669, + "theoretical_loss": 3.398075654385161, + "tokens_seen": 2243339264 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016181544633901704, + "loss": 2.2992, + "theoretical_loss": 3.3980675632585102, + "tokens_seen": 2243404800 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016180541624874622, + "loss": 2.5807, + "theoretical_loss": 3.398059472434399, + "tokens_seen": 2243470336 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016179538615847543, + "loss": 2.4818, + "theoretical_loss": 3.3980513819128086, + "tokens_seen": 2243535872 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016178535606820464, + "loss": 2.4686, + "theoretical_loss": 3.398043291693718, + "tokens_seen": 2243601408 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016177532597793382, + "loss": 2.4262, + "theoretical_loss": 3.398035201777107, + "tokens_seen": 2243666944 + }, + { + "epoch": 7.05, + "learning_rate": 0.000161765295887663, + "loss": 2.7186, + "theoretical_loss": 3.3980271121629553, + "tokens_seen": 2243732480 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001617552657973922, + "loss": 2.5869, + "theoretical_loss": 3.3980190228512437, + "tokens_seen": 2243798016 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001617452357071214, + "loss": 2.4694, + "theoretical_loss": 3.398010933841951, + "tokens_seen": 2243863552 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016173520561685057, + "loss": 2.4924, + "theoretical_loss": 3.398002845135058, + "tokens_seen": 2243929088 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016172517552657975, + "loss": 2.6144, + "theoretical_loss": 3.3979947567305437, + "tokens_seen": 2243994624 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016171514543630893, + "loss": 2.6366, + "theoretical_loss": 3.3979866686283886, + "tokens_seen": 2244060160 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016170511534603814, + "loss": 2.512, + "theoretical_loss": 3.397978580828572, + "tokens_seen": 2244125696 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016169508525576732, + "loss": 2.651, + "theoretical_loss": 3.3979704933310746, + "tokens_seen": 2244191232 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001616850551654965, + "loss": 2.5567, + "theoretical_loss": 3.3979624061358753, + "tokens_seen": 2244256768 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016167502507522568, + "loss": 2.5347, + "theoretical_loss": 3.3979543192429547, + "tokens_seen": 2244322304 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001616649949849549, + "loss": 2.5854, + "theoretical_loss": 3.3979462326522922, + "tokens_seen": 2244387840 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016165496489468407, + "loss": 2.446, + "theoretical_loss": 3.397938146363868, + "tokens_seen": 2244453376 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016164493480441325, + "loss": 2.4394, + "theoretical_loss": 3.3979300603776617, + "tokens_seen": 2244518912 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016163490471414243, + "loss": 2.3227, + "theoretical_loss": 3.397921974693654, + "tokens_seen": 2244584448 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2476481, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7694172859191895, + "objective/train/theoretical_loss": 3.3979199533198674, + "objective/train/tokens_used": 2265060832, + "theoretical_loss": 3.3979199533198674, + "tokens_seen": 2244600832 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016162487462387161, + "loss": 2.5079, + "theoretical_loss": 3.3979138893118233, + "tokens_seen": 2244649984 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016161484453360082, + "loss": 2.571, + "theoretical_loss": 3.397905804232151, + "tokens_seen": 2244715520 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016160481444333, + "loss": 2.6098, + "theoretical_loss": 3.397897719454616, + "tokens_seen": 2244781056 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016159478435305918, + "loss": 2.4994, + "theoretical_loss": 3.3978896349791983, + "tokens_seen": 2244846592 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016158475426278837, + "loss": 2.6186, + "theoretical_loss": 3.397881550805878, + "tokens_seen": 2244912128 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016157472417251757, + "loss": 2.6671, + "theoretical_loss": 3.397873466934635, + "tokens_seen": 2244977664 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016156469408224676, + "loss": 2.592, + "theoretical_loss": 3.3978653833654495, + "tokens_seen": 2245043200 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016155466399197594, + "loss": 2.4369, + "theoretical_loss": 3.3978573000983006, + "tokens_seen": 2245108736 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016154463390170512, + "loss": 2.6841, + "theoretical_loss": 3.397849217133169, + "tokens_seen": 2245174272 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001615346038114343, + "loss": 2.6281, + "theoretical_loss": 3.3978411344700343, + "tokens_seen": 2245239808 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001615245737211635, + "loss": 2.3323, + "theoretical_loss": 3.397833052108876, + "tokens_seen": 2245305344 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001615145436308927, + "loss": 2.3768, + "theoretical_loss": 3.3978249700496748, + "tokens_seen": 2245370880 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016150451354062187, + "loss": 2.5437, + "theoretical_loss": 3.39781688829241, + "tokens_seen": 2245436416 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016149448345035105, + "loss": 2.5375, + "theoretical_loss": 3.3978088068370615, + "tokens_seen": 2245501952 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016148445336008026, + "loss": 2.5858, + "theoretical_loss": 3.39780072568361, + "tokens_seen": 2245567488 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016147442326980944, + "loss": 2.6953, + "theoretical_loss": 3.397792644832034, + "tokens_seen": 2245633024 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016146439317953862, + "loss": 2.4748, + "theoretical_loss": 3.397784564282315, + "tokens_seen": 2245698560 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001614543630892678, + "loss": 2.8421, + "theoretical_loss": 3.3977764840344316, + "tokens_seen": 2245764096 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016144433299899698, + "loss": 2.3646, + "theoretical_loss": 3.3977684040883642, + "tokens_seen": 2245829632 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001614343029087262, + "loss": 2.8264, + "theoretical_loss": 3.3977603244440933, + "tokens_seen": 2245895168 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016142427281845537, + "loss": 2.5887, + "theoretical_loss": 3.397752245101598, + "tokens_seen": 2245960704 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016141424272818455, + "loss": 2.4087, + "theoretical_loss": 3.3977441660608583, + "tokens_seen": 2246026240 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016140421263791373, + "loss": 2.5377, + "theoretical_loss": 3.3977360873218547, + "tokens_seen": 2246091776 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016139418254764294, + "loss": 2.5812, + "theoretical_loss": 3.3977280088845663, + "tokens_seen": 2246157312 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016138415245737212, + "loss": 2.6282, + "theoretical_loss": 3.397719930748974, + "tokens_seen": 2246222848 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2477877, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.438322067260742, + "objective/train/theoretical_loss": 3.3977179112622133, + "objective/train/tokens_used": 2266699232, + "theoretical_loss": 3.3977179112622133, + "tokens_seen": 2246239232 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001613741223671013, + "loss": 2.6526, + "theoretical_loss": 3.397711852915057, + "tokens_seen": 2246288384 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016136409227683049, + "loss": 2.5271, + "theoretical_loss": 3.3977037753827957, + "tokens_seen": 2246353920 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016135406218655967, + "loss": 2.6503, + "theoretical_loss": 3.3976956981521695, + "tokens_seen": 2246419456 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016134403209628888, + "loss": 2.643, + "theoretical_loss": 3.3976876212231586, + "tokens_seen": 2246484992 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016133400200601806, + "loss": 2.6142, + "theoretical_loss": 3.3976795445957433, + "tokens_seen": 2246550528 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016132397191574724, + "loss": 2.4848, + "theoretical_loss": 3.3976714682699027, + "tokens_seen": 2246616064 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016131394182547642, + "loss": 2.7203, + "theoretical_loss": 3.397663392245618, + "tokens_seen": 2246681600 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016130391173520563, + "loss": 2.7638, + "theoretical_loss": 3.397655316522868, + "tokens_seen": 2246747136 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001612938816449348, + "loss": 2.3941, + "theoretical_loss": 3.397647241101633, + "tokens_seen": 2246812672 + }, + { + "epoch": 7.05, + "learning_rate": 0.000161283851554664, + "loss": 2.4234, + "theoretical_loss": 3.397639165981893, + "tokens_seen": 2246878208 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016127382146439317, + "loss": 2.5635, + "theoretical_loss": 3.3976310911636283, + "tokens_seen": 2246943744 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016126379137412235, + "loss": 2.5848, + "theoretical_loss": 3.397623016646818, + "tokens_seen": 2247009280 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016125376128385156, + "loss": 2.637, + "theoretical_loss": 3.397614942431443, + "tokens_seen": 2247074816 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016124373119358074, + "loss": 2.5988, + "theoretical_loss": 3.3976068685174825, + "tokens_seen": 2247140352 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016123370110330992, + "loss": 2.5647, + "theoretical_loss": 3.397598794904917, + "tokens_seen": 2247205888 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001612236710130391, + "loss": 2.5931, + "theoretical_loss": 3.3975907215937258, + "tokens_seen": 2247271424 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001612136409227683, + "loss": 2.6196, + "theoretical_loss": 3.3975826485838896, + "tokens_seen": 2247336960 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001612036108324975, + "loss": 2.4291, + "theoretical_loss": 3.397574575875388, + "tokens_seen": 2247402496 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016119358074222667, + "loss": 2.5148, + "theoretical_loss": 3.397566503468201, + "tokens_seen": 2247468032 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016118355065195585, + "loss": 2.4341, + "theoretical_loss": 3.3975584313623086, + "tokens_seen": 2247533568 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016117352056168506, + "loss": 2.4703, + "theoretical_loss": 3.397550359557691, + "tokens_seen": 2247599104 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016116349047141424, + "loss": 2.5469, + "theoretical_loss": 3.3975422880543276, + "tokens_seen": 2247664640 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016115346038114343, + "loss": 2.5402, + "theoretical_loss": 3.3975342168521987, + "tokens_seen": 2247730176 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001611434302908726, + "loss": 2.8045, + "theoretical_loss": 3.397526145951285, + "tokens_seen": 2247795712 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001611334002006018, + "loss": 2.6102, + "theoretical_loss": 3.397518075351565, + "tokens_seen": 2247861248 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2478650, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4053664207458496, + "objective/train/theoretical_loss": 3.3975160577486943, + "objective/train/tokens_used": 2268337632, + "theoretical_loss": 3.3975160577486943, + "tokens_seen": 2247877632 + }, + { + "epoch": 7.05, + "learning_rate": 0.000161123370110331, + "loss": 2.6626, + "theoretical_loss": 3.39751000505302, + "tokens_seen": 2247926784 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016111334002006018, + "loss": 2.6397, + "theoretical_loss": 3.3975019350556286, + "tokens_seen": 2247992320 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016110330992978936, + "loss": 2.6139, + "theoretical_loss": 3.397493865359372, + "tokens_seen": 2248057856 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016109327983951854, + "loss": 2.697, + "theoretical_loss": 3.3974857959642297, + "tokens_seen": 2248123392 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016108324974924775, + "loss": 2.5528, + "theoretical_loss": 3.397477726870182, + "tokens_seen": 2248188928 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016107321965897693, + "loss": 2.7912, + "theoretical_loss": 3.397469658077209, + "tokens_seen": 2248254464 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001610631895687061, + "loss": 2.8034, + "theoretical_loss": 3.3974615895852893, + "tokens_seen": 2248320000 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001610531594784353, + "loss": 2.534, + "theoretical_loss": 3.3974535213944046, + "tokens_seen": 2248385536 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016104312938816447, + "loss": 2.6768, + "theoretical_loss": 3.397445453504534, + "tokens_seen": 2248451072 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001610330992978937, + "loss": 2.551, + "theoretical_loss": 3.397437385915658, + "tokens_seen": 2248516608 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001610230692076229, + "loss": 2.7737, + "theoretical_loss": 3.3974293186277564, + "tokens_seen": 2248582144 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016101303911735207, + "loss": 2.6779, + "theoretical_loss": 3.397421251640809, + "tokens_seen": 2248647680 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016100300902708125, + "loss": 2.4844, + "theoretical_loss": 3.3974131849547957, + "tokens_seen": 2248713216 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016099297893681046, + "loss": 2.6119, + "theoretical_loss": 3.397405118569697, + "tokens_seen": 2248778752 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016098294884653964, + "loss": 2.762, + "theoretical_loss": 3.3973970524854926, + "tokens_seen": 2248844288 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016097291875626882, + "loss": 2.5794, + "theoretical_loss": 3.3973889867021625, + "tokens_seen": 2248909824 + }, + { + "epoch": 7.05, + "learning_rate": 0.000160962888665998, + "loss": 2.5048, + "theoretical_loss": 3.397380921219687, + "tokens_seen": 2248975360 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016095285857572718, + "loss": 2.7627, + "theoretical_loss": 3.3973728560380456, + "tokens_seen": 2249040896 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001609428284854564, + "loss": 2.7116, + "theoretical_loss": 3.3973647911572185, + "tokens_seen": 2249106432 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016093279839518557, + "loss": 2.7964, + "theoretical_loss": 3.3973567265771862, + "tokens_seen": 2249171968 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016092276830491475, + "loss": 2.4235, + "theoretical_loss": 3.397348662297928, + "tokens_seen": 2249237504 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016091273821464393, + "loss": 2.5437, + "theoretical_loss": 3.397340598319424, + "tokens_seen": 2249303040 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016090270812437314, + "loss": 2.5245, + "theoretical_loss": 3.397332534641655, + "tokens_seen": 2249368576 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016089267803410232, + "loss": 2.547, + "theoretical_loss": 3.3973244712646, + "tokens_seen": 2249434112 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001608826479438315, + "loss": 2.6584, + "theoretical_loss": 3.39731640818824, + "tokens_seen": 2249499648 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2479889, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.682476758956909, + "objective/train/theoretical_loss": 3.3973143924661313, + "objective/train/tokens_used": 2269976032, + "theoretical_loss": 3.3973143924661313, + "tokens_seen": 2249516032 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016087261785356069, + "loss": 2.505, + "theoretical_loss": 3.3973083454125543, + "tokens_seen": 2249565184 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016086258776328987, + "loss": 2.3544, + "theoretical_loss": 3.397300282937523, + "tokens_seen": 2249630720 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016085255767301908, + "loss": 2.6783, + "theoretical_loss": 3.3972922207631266, + "tokens_seen": 2249696256 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016084252758274826, + "loss": 2.5402, + "theoretical_loss": 3.3972841588893448, + "tokens_seen": 2249761792 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016083249749247744, + "loss": 2.6184, + "theoretical_loss": 3.3972760973161575, + "tokens_seen": 2249827328 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016082246740220662, + "loss": 2.4218, + "theoretical_loss": 3.3972680360435445, + "tokens_seen": 2249892864 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016081243731193583, + "loss": 2.5958, + "theoretical_loss": 3.397259975071487, + "tokens_seen": 2249958400 + }, + { + "epoch": 7.05, + "learning_rate": 0.000160802407221665, + "loss": 2.5545, + "theoretical_loss": 3.397251914399964, + "tokens_seen": 2250023936 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001607923771313942, + "loss": 2.6406, + "theoretical_loss": 3.3972438540289556, + "tokens_seen": 2250089472 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016078234704112337, + "loss": 2.4919, + "theoretical_loss": 3.3972357939584423, + "tokens_seen": 2250155008 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016077231695085255, + "loss": 2.3683, + "theoretical_loss": 3.397227734188404, + "tokens_seen": 2250220544 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016076228686058176, + "loss": 2.5905, + "theoretical_loss": 3.39721967471882, + "tokens_seen": 2250286080 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016075225677031094, + "loss": 2.6031, + "theoretical_loss": 3.3972116155496717, + "tokens_seen": 2250351616 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016074222668004012, + "loss": 2.2772, + "theoretical_loss": 3.3972035566809384, + "tokens_seen": 2250417152 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001607321965897693, + "loss": 2.5603, + "theoretical_loss": 3.3971954981126, + "tokens_seen": 2250482688 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001607221664994985, + "loss": 2.4345, + "theoretical_loss": 3.397187439844637, + "tokens_seen": 2250548224 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001607121364092277, + "loss": 2.6963, + "theoretical_loss": 3.397179381877029, + "tokens_seen": 2250613760 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016070210631895687, + "loss": 2.5385, + "theoretical_loss": 3.397171324209756, + "tokens_seen": 2250679296 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016069207622868605, + "loss": 2.5614, + "theoretical_loss": 3.397163266842799, + "tokens_seen": 2250744832 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016068204613841526, + "loss": 2.5456, + "theoretical_loss": 3.397155209776137, + "tokens_seen": 2250810368 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016067201604814444, + "loss": 2.5124, + "theoretical_loss": 3.39714715300975, + "tokens_seen": 2250875904 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016066198595787363, + "loss": 2.5564, + "theoretical_loss": 3.3971390965436195, + "tokens_seen": 2250941440 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001606519558676028, + "loss": 2.4776, + "theoretical_loss": 3.397131040377724, + "tokens_seen": 2251006976 + }, + { + "epoch": 7.05, + "learning_rate": 0.000160641925777332, + "loss": 2.3529, + "theoretical_loss": 3.3971229845120448, + "tokens_seen": 2251072512 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001606318956870612, + "loss": 2.6086, + "theoretical_loss": 3.3971149289465608, + "tokens_seen": 2251138048 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2480382, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2003512382507324, + "objective/train/theoretical_loss": 3.3971129151020927, + "objective/train/tokens_used": 2271614432, + "theoretical_loss": 3.3971129151020927, + "tokens_seen": 2251154432 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016062186559679038, + "loss": 2.504, + "theoretical_loss": 3.3971068736812526, + "tokens_seen": 2251203584 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016061183550651956, + "loss": 2.5586, + "theoretical_loss": 3.3970988187161004, + "tokens_seen": 2251269120 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016060180541624874, + "loss": 2.606, + "theoretical_loss": 3.397090764051084, + "tokens_seen": 2251334656 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016059177532597795, + "loss": 2.5156, + "theoretical_loss": 3.397082709686184, + "tokens_seen": 2251400192 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016058174523570713, + "loss": 2.4044, + "theoretical_loss": 3.3970746556213802, + "tokens_seen": 2251465728 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001605717151454363, + "loss": 2.3804, + "theoretical_loss": 3.397066601856652, + "tokens_seen": 2251531264 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001605616850551655, + "loss": 2.5085, + "theoretical_loss": 3.397058548391981, + "tokens_seen": 2251596800 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016055165496489467, + "loss": 2.6078, + "theoretical_loss": 3.397050495227346, + "tokens_seen": 2251662336 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016054162487462388, + "loss": 2.7765, + "theoretical_loss": 3.3970424423627277, + "tokens_seen": 2251727872 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016053159478435306, + "loss": 2.6129, + "theoretical_loss": 3.3970343897981055, + "tokens_seen": 2251793408 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016052156469408224, + "loss": 2.5291, + "theoretical_loss": 3.39702633753346, + "tokens_seen": 2251858944 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016051153460381142, + "loss": 2.8485, + "theoretical_loss": 3.3970182855687714, + "tokens_seen": 2251924480 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016050150451354063, + "loss": 2.7542, + "theoretical_loss": 3.39701023390402, + "tokens_seen": 2251990016 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001604914744232698, + "loss": 2.3546, + "theoretical_loss": 3.397002182539185, + "tokens_seen": 2252055552 + }, + { + "epoch": 7.05, + "learning_rate": 0.000160481444332999, + "loss": 2.5229, + "theoretical_loss": 3.396994131474248, + "tokens_seen": 2252121088 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016047141424272818, + "loss": 2.6252, + "theoretical_loss": 3.396986080709187, + "tokens_seen": 2252186624 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016046138415245736, + "loss": 2.6454, + "theoretical_loss": 3.396978030243984, + "tokens_seen": 2252252160 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016045135406218656, + "loss": 2.6574, + "theoretical_loss": 3.396969980078618, + "tokens_seen": 2252317696 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016044132397191575, + "loss": 2.7322, + "theoretical_loss": 3.39696193021307, + "tokens_seen": 2252383232 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016043129388164493, + "loss": 2.568, + "theoretical_loss": 3.3969538806473194, + "tokens_seen": 2252448768 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001604212637913741, + "loss": 2.5229, + "theoretical_loss": 3.396945831381346, + "tokens_seen": 2252514304 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016041123370110332, + "loss": 2.6098, + "theoretical_loss": 3.3969377824151312, + "tokens_seen": 2252579840 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001604012036108325, + "loss": 2.5463, + "theoretical_loss": 3.396929733748654, + "tokens_seen": 2252645376 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016039117352056168, + "loss": 2.6817, + "theoretical_loss": 3.3969216853818947, + "tokens_seen": 2252710912 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016038114343029086, + "loss": 2.7097, + "theoretical_loss": 3.3969136373148334, + "tokens_seen": 2252776448 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2481730, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3523752689361572, + "objective/train/theoretical_loss": 3.396911625344894, + "objective/train/tokens_used": 2273252832, + "theoretical_loss": 3.396911625344894, + "tokens_seen": 2252792832 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016037111334002004, + "loss": 2.6592, + "theoretical_loss": 3.396905589547451, + "tokens_seen": 2252841984 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016036108324974925, + "loss": 2.7348, + "theoretical_loss": 3.396897542079727, + "tokens_seen": 2252907520 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016035105315947843, + "loss": 2.5796, + "theoretical_loss": 3.3968894949116413, + "tokens_seen": 2252973056 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001603410230692076, + "loss": 2.5154, + "theoretical_loss": 3.396881448043174, + "tokens_seen": 2253038592 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001603309929789368, + "loss": 2.551, + "theoretical_loss": 3.3968734014743056, + "tokens_seen": 2253104128 + }, + { + "epoch": 7.05, + "learning_rate": 0.000160320962888666, + "loss": 2.6398, + "theoretical_loss": 3.3968653552050165, + "tokens_seen": 2253169664 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016031093279839518, + "loss": 2.5444, + "theoretical_loss": 3.3968573092352865, + "tokens_seen": 2253235200 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016030090270812436, + "loss": 2.6139, + "theoretical_loss": 3.3968492635650955, + "tokens_seen": 2253300736 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016029087261785354, + "loss": 2.4053, + "theoretical_loss": 3.396841218194424, + "tokens_seen": 2253366272 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016028084252758275, + "loss": 2.5113, + "theoretical_loss": 3.3968331731232517, + "tokens_seen": 2253431808 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016027081243731196, + "loss": 2.5772, + "theoretical_loss": 3.3968251283515594, + "tokens_seen": 2253497344 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016026078234704114, + "loss": 2.641, + "theoretical_loss": 3.3968170838793266, + "tokens_seen": 2253562880 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016025075225677032, + "loss": 2.5541, + "theoretical_loss": 3.396809039706534, + "tokens_seen": 2253628416 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001602407221664995, + "loss": 2.6657, + "theoretical_loss": 3.396800995833161, + "tokens_seen": 2253693952 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001602306920762287, + "loss": 2.6461, + "theoretical_loss": 3.3967929522591884, + "tokens_seen": 2253759488 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001602206619859579, + "loss": 2.7277, + "theoretical_loss": 3.3967849089845963, + "tokens_seen": 2253825024 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016021063189568707, + "loss": 2.6866, + "theoretical_loss": 3.3967768660093647, + "tokens_seen": 2253890560 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016020060180541626, + "loss": 2.6461, + "theoretical_loss": 3.396768823333474, + "tokens_seen": 2253956096 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016019057171514546, + "loss": 2.5206, + "theoretical_loss": 3.396760780956904, + "tokens_seen": 2254021632 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016018054162487464, + "loss": 2.69, + "theoretical_loss": 3.3967527388796346, + "tokens_seen": 2254087168 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016017051153460383, + "loss": 2.5521, + "theoretical_loss": 3.3967446971016466, + "tokens_seen": 2254152704 + }, + { + "epoch": 7.05, + "learning_rate": 0.000160160481444333, + "loss": 2.7638, + "theoretical_loss": 3.3967366556229197, + "tokens_seen": 2254218240 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001601504513540622, + "loss": 2.6365, + "theoretical_loss": 3.3967286144434343, + "tokens_seen": 2254283776 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001601404212637914, + "loss": 2.6584, + "theoretical_loss": 3.396720573563171, + "tokens_seen": 2254349312 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016013039117352058, + "loss": 2.718, + "theoretical_loss": 3.396712532982109, + "tokens_seen": 2254414848 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2482388, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.854142904281616, + "objective/train/theoretical_loss": 3.3967105228835917, + "objective/train/tokens_used": 2274891232, + "theoretical_loss": 3.3967105228835917, + "tokens_seen": 2254431232 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016012036108324976, + "loss": 2.5955, + "theoretical_loss": 3.3967044927002292, + "tokens_seen": 2254480384 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016011033099297894, + "loss": 2.7615, + "theoretical_loss": 3.396696452717512, + "tokens_seen": 2254545920 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016010030090270815, + "loss": 2.5678, + "theoretical_loss": 3.396688413033936, + "tokens_seen": 2254611456 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016009027081243733, + "loss": 2.5296, + "theoretical_loss": 3.3966803736494833, + "tokens_seen": 2254676992 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001600802407221665, + "loss": 2.5985, + "theoretical_loss": 3.396672334564133, + "tokens_seen": 2254742528 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001600702106318957, + "loss": 2.7545, + "theoretical_loss": 3.3966642957778657, + "tokens_seen": 2254808064 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016006018054162487, + "loss": 2.5354, + "theoretical_loss": 3.3966562572906613, + "tokens_seen": 2254873600 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016005015045135408, + "loss": 2.763, + "theoretical_loss": 3.3966482191025, + "tokens_seen": 2254939136 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016004012036108326, + "loss": 2.6548, + "theoretical_loss": 3.3966401812133626, + "tokens_seen": 2255004672 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016003009027081244, + "loss": 2.8273, + "theoretical_loss": 3.3966321436232283, + "tokens_seen": 2255070208 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016002006018054162, + "loss": 2.802, + "theoretical_loss": 3.396624106332078, + "tokens_seen": 2255135744 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016001003009027083, + "loss": 2.6629, + "theoretical_loss": 3.3966160693398915, + "tokens_seen": 2255201280 + }, + { + "epoch": 7.05, + "learning_rate": 0.00016, + "loss": 2.6947, + "theoretical_loss": 3.396608032646649, + "tokens_seen": 2255266816 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001599899699097292, + "loss": 2.4606, + "theoretical_loss": 3.3965999962523314, + "tokens_seen": 2255332352 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015997993981945838, + "loss": 2.5299, + "theoretical_loss": 3.396591960156918, + "tokens_seen": 2255397888 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015996990972918756, + "loss": 2.6106, + "theoretical_loss": 3.3965839243603893, + "tokens_seen": 2255463424 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015995987963891676, + "loss": 2.7287, + "theoretical_loss": 3.3965758888627255, + "tokens_seen": 2255528960 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015994984954864595, + "loss": 2.6319, + "theoretical_loss": 3.396567853663907, + "tokens_seen": 2255594496 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015993981945837513, + "loss": 2.4423, + "theoretical_loss": 3.3965598187639134, + "tokens_seen": 2255660032 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001599297893681043, + "loss": 2.692, + "theoretical_loss": 3.396551784162726, + "tokens_seen": 2255725568 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015991975927783352, + "loss": 2.5551, + "theoretical_loss": 3.396543749860324, + "tokens_seen": 2255791104 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001599097291875627, + "loss": 2.7107, + "theoretical_loss": 3.396535715856688, + "tokens_seen": 2255856640 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015989969909729188, + "loss": 2.7447, + "theoretical_loss": 3.396527682151798, + "tokens_seen": 2255922176 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015988966900702106, + "loss": 2.626, + "theoretical_loss": 3.3965196487456346, + "tokens_seen": 2255987712 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015987963891675024, + "loss": 2.5922, + "theoretical_loss": 3.396511615638178, + "tokens_seen": 2256053248 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2483695, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6994292736053467, + "objective/train/theoretical_loss": 3.3965096074079844, + "objective/train/tokens_used": 2276529632, + "theoretical_loss": 3.3965096074079844, + "tokens_seen": 2256069632 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015986960882647945, + "loss": 2.902, + "theoretical_loss": 3.396503582829408, + "tokens_seen": 2256118784 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015985957873620863, + "loss": 2.8516, + "theoretical_loss": 3.3964955503193046, + "tokens_seen": 2256184320 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001598495486459378, + "loss": 2.604, + "theoretical_loss": 3.396487518107849, + "tokens_seen": 2256249856 + }, + { + "epoch": 7.05, + "learning_rate": 0.000159839518555667, + "loss": 2.594, + "theoretical_loss": 3.396479486195021, + "tokens_seen": 2256315392 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001598294884653962, + "loss": 2.4509, + "theoretical_loss": 3.3964714545808006, + "tokens_seen": 2256380928 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015981945837512538, + "loss": 2.6713, + "theoretical_loss": 3.396463423265168, + "tokens_seen": 2256446464 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015980942828485456, + "loss": 2.6256, + "theoretical_loss": 3.3964553922481033, + "tokens_seen": 2256512000 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015979939819458374, + "loss": 2.6198, + "theoretical_loss": 3.3964473615295874, + "tokens_seen": 2256577536 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015978936810431293, + "loss": 2.5258, + "theoretical_loss": 3.3964393311095997, + "tokens_seen": 2256643072 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015977933801404213, + "loss": 2.4746, + "theoretical_loss": 3.3964313009881213, + "tokens_seen": 2256708608 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015976930792377131, + "loss": 2.5937, + "theoretical_loss": 3.3964232711651317, + "tokens_seen": 2256774144 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001597592778335005, + "loss": 2.6072, + "theoretical_loss": 3.3964152416406117, + "tokens_seen": 2256839680 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015974924774322968, + "loss": 2.5061, + "theoretical_loss": 3.396407212414541, + "tokens_seen": 2256905216 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015973921765295888, + "loss": 2.6723, + "theoretical_loss": 3.3963991834869, + "tokens_seen": 2256970752 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015972918756268807, + "loss": 2.6422, + "theoretical_loss": 3.3963911548576693, + "tokens_seen": 2257036288 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015971915747241725, + "loss": 2.6743, + "theoretical_loss": 3.396383126526829, + "tokens_seen": 2257101824 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015970912738214643, + "loss": 2.5908, + "theoretical_loss": 3.3963750984943593, + "tokens_seen": 2257167360 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001596990972918756, + "loss": 2.6795, + "theoretical_loss": 3.39636707076024, + "tokens_seen": 2257232896 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015968906720160482, + "loss": 2.6473, + "theoretical_loss": 3.396359043324452, + "tokens_seen": 2257298432 + }, + { + "epoch": 7.05, + "learning_rate": 0.000159679037111334, + "loss": 2.5001, + "theoretical_loss": 3.3963510161869754, + "tokens_seen": 2257363968 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015966900702106318, + "loss": 2.5656, + "theoretical_loss": 3.39634298934779, + "tokens_seen": 2257429504 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015965897693079236, + "loss": 2.2451, + "theoretical_loss": 3.3963349628068764, + "tokens_seen": 2257495040 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015964894684052157, + "loss": 2.3947, + "theoretical_loss": 3.396326936564215, + "tokens_seen": 2257560576 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015963891675025075, + "loss": 2.6309, + "theoretical_loss": 3.3963189106197857, + "tokens_seen": 2257626112 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015962888665997993, + "loss": 2.5496, + "theoretical_loss": 3.3963108849735693, + "tokens_seen": 2257691648 + }, + { + "epoch": 7.05, + "objective/train/docs_used": 2484343, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.956984043121338, + "objective/train/theoretical_loss": 3.3963088786086084, + "objective/train/tokens_used": 2278168032, + "theoretical_loss": 3.3963088786086084, + "tokens_seen": 2257708032 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001596188565697091, + "loss": 2.7285, + "theoretical_loss": 3.396302859625546, + "tokens_seen": 2257757184 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001596088264794383, + "loss": 2.5657, + "theoretical_loss": 3.396294834575695, + "tokens_seen": 2257822720 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001595987963891675, + "loss": 2.698, + "theoretical_loss": 3.396286809823998, + "tokens_seen": 2257888256 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015958876629889668, + "loss": 2.474, + "theoretical_loss": 3.3962787853704346, + "tokens_seen": 2257953792 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015957873620862586, + "loss": 2.4884, + "theoretical_loss": 3.3962707612149847, + "tokens_seen": 2258019328 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015956870611835505, + "loss": 2.9764, + "theoretical_loss": 3.3962627373576297, + "tokens_seen": 2258084864 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015955867602808425, + "loss": 2.6596, + "theoretical_loss": 3.3962547137983483, + "tokens_seen": 2258150400 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015954864593781343, + "loss": 2.5068, + "theoretical_loss": 3.3962466905371222, + "tokens_seen": 2258215936 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015953861584754262, + "loss": 2.672, + "theoretical_loss": 3.396238667573931, + "tokens_seen": 2258281472 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015952858575727182, + "loss": 2.4275, + "theoretical_loss": 3.3962306449087554, + "tokens_seen": 2258347008 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015951855566700103, + "loss": 2.5878, + "theoretical_loss": 3.396222622541575, + "tokens_seen": 2258412544 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001595085255767302, + "loss": 2.7259, + "theoretical_loss": 3.3962146004723706, + "tokens_seen": 2258478080 + }, + { + "epoch": 7.05, + "learning_rate": 0.0001594984954864594, + "loss": 2.7238, + "theoretical_loss": 3.3962065787011224, + "tokens_seen": 2258543616 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015948846539618858, + "loss": 2.5432, + "theoretical_loss": 3.3961985572278106, + "tokens_seen": 2258609152 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015947843530591776, + "loss": 2.4862, + "theoretical_loss": 3.3961905360524156, + "tokens_seen": 2258674688 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015946840521564696, + "loss": 2.3796, + "theoretical_loss": 3.3961825151749174, + "tokens_seen": 2258740224 + }, + { + "epoch": 7.05, + "learning_rate": 0.00015945837512537615, + "loss": 2.523, + "theoretical_loss": 3.3961744945952965, + "tokens_seen": 2258805760 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015944834503510533, + "loss": 2.6604, + "theoretical_loss": 3.3961664743135334, + "tokens_seen": 2258871296 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001594383149448345, + "loss": 2.4927, + "theoretical_loss": 3.3961584543296084, + "tokens_seen": 2258936832 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015942828485456372, + "loss": 2.5778, + "theoretical_loss": 3.396150434643501, + "tokens_seen": 2259002368 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001594182547642929, + "loss": 2.6345, + "theoretical_loss": 3.3961424152551922, + "tokens_seen": 2259067904 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015940822467402208, + "loss": 2.4512, + "theoretical_loss": 3.396134396164663, + "tokens_seen": 2259133440 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015939819458375126, + "loss": 2.6055, + "theoretical_loss": 3.396126377371892, + "tokens_seen": 2259198976 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015938816449348044, + "loss": 2.6688, + "theoretical_loss": 3.396118358876861, + "tokens_seen": 2259264512 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015937813440320965, + "loss": 2.6364, + "theoretical_loss": 3.3961103406795496, + "tokens_seen": 2259330048 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2485654, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.902060031890869, + "objective/train/theoretical_loss": 3.396108336176738, + "objective/train/tokens_used": 2279806432, + "theoretical_loss": 3.396108336176738, + "tokens_seen": 2259346432 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015936810431293883, + "loss": 2.704, + "theoretical_loss": 3.3961023227799383, + "tokens_seen": 2259395584 + }, + { + "epoch": 7.06, + "learning_rate": 0.000159358074222668, + "loss": 2.7011, + "theoretical_loss": 3.396094305178007, + "tokens_seen": 2259461120 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001593480441323972, + "loss": 2.5714, + "theoretical_loss": 3.396086287873737, + "tokens_seen": 2259526656 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001593380140421264, + "loss": 2.8891, + "theoretical_loss": 3.3960782708671076, + "tokens_seen": 2259592192 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015932798395185558, + "loss": 2.6387, + "theoretical_loss": 3.3960702541580994, + "tokens_seen": 2259657728 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015931795386158476, + "loss": 2.5204, + "theoretical_loss": 3.396062237746693, + "tokens_seen": 2259723264 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015930792377131394, + "loss": 2.5644, + "theoretical_loss": 3.3960542216328684, + "tokens_seen": 2259788800 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015929789368104313, + "loss": 2.5784, + "theoretical_loss": 3.396046205816606, + "tokens_seen": 2259854336 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015928786359077233, + "loss": 2.3804, + "theoretical_loss": 3.3960381902978867, + "tokens_seen": 2259919872 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015927783350050151, + "loss": 2.5967, + "theoretical_loss": 3.3960301750766897, + "tokens_seen": 2259985408 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001592678034102307, + "loss": 2.8137, + "theoretical_loss": 3.3960221601529965, + "tokens_seen": 2260050944 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015925777331995988, + "loss": 2.6023, + "theoretical_loss": 3.3960141455267863, + "tokens_seen": 2260116480 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015924774322968909, + "loss": 2.4909, + "theoretical_loss": 3.3960061311980407, + "tokens_seen": 2260182016 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015923771313941827, + "loss": 2.4372, + "theoretical_loss": 3.395998117166739, + "tokens_seen": 2260247552 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015922768304914745, + "loss": 2.4012, + "theoretical_loss": 3.3959901034328617, + "tokens_seen": 2260313088 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015921765295887663, + "loss": 2.5908, + "theoretical_loss": 3.3959820899963895, + "tokens_seen": 2260378624 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001592076228686058, + "loss": 2.6266, + "theoretical_loss": 3.3959740768573026, + "tokens_seen": 2260444160 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015919759277833502, + "loss": 2.5775, + "theoretical_loss": 3.3959660640155813, + "tokens_seen": 2260509696 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001591875626880642, + "loss": 2.5518, + "theoretical_loss": 3.3959580514712058, + "tokens_seen": 2260575232 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015917753259779338, + "loss": 2.4847, + "theoretical_loss": 3.395950039224157, + "tokens_seen": 2260640768 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015916750250752256, + "loss": 2.6645, + "theoretical_loss": 3.3959420272744145, + "tokens_seen": 2260706304 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015915747241725177, + "loss": 2.724, + "theoretical_loss": 3.395934015621959, + "tokens_seen": 2260771840 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015914744232698095, + "loss": 2.6796, + "theoretical_loss": 3.395926004266771, + "tokens_seen": 2260837376 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015913741223671013, + "loss": 2.5852, + "theoretical_loss": 3.3959179932088306, + "tokens_seen": 2260902912 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001591273821464393, + "loss": 2.517, + "theoretical_loss": 3.395909982448118, + "tokens_seen": 2260968448 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2486143, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5272669792175293, + "objective/train/theoretical_loss": 3.3959079798043796, + "objective/train/tokens_used": 2281444832, + "theoretical_loss": 3.3959079798043796, + "tokens_seen": 2260984832 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001591173520561685, + "loss": 2.4859, + "theoretical_loss": 3.3959019719846144, + "tokens_seen": 2261033984 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001591073219658977, + "loss": 2.7316, + "theoretical_loss": 3.3958939618182993, + "tokens_seen": 2261099520 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015909729187562688, + "loss": 2.3724, + "theoretical_loss": 3.395885951949153, + "tokens_seen": 2261165056 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015908726178535606, + "loss": 2.6557, + "theoretical_loss": 3.395877942377157, + "tokens_seen": 2261230592 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015907723169508525, + "loss": 2.7533, + "theoretical_loss": 3.39586993310229, + "tokens_seen": 2261296128 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015906720160481445, + "loss": 2.7963, + "theoretical_loss": 3.395861924124534, + "tokens_seen": 2261361664 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015905717151454363, + "loss": 2.7513, + "theoretical_loss": 3.395853915443868, + "tokens_seen": 2261427200 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015904714142427282, + "loss": 2.509, + "theoretical_loss": 3.3958459070602727, + "tokens_seen": 2261492736 + }, + { + "epoch": 7.06, + "learning_rate": 0.000159037111334002, + "loss": 2.7581, + "theoretical_loss": 3.3958378989737295, + "tokens_seen": 2261558272 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001590270812437312, + "loss": 2.7955, + "theoretical_loss": 3.395829891184218, + "tokens_seen": 2261623808 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015901705115346039, + "loss": 2.6917, + "theoretical_loss": 3.395821883691718, + "tokens_seen": 2261689344 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015900702106318957, + "loss": 2.3817, + "theoretical_loss": 3.3958138764962107, + "tokens_seen": 2261754880 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015899699097291875, + "loss": 2.6505, + "theoretical_loss": 3.3958058695976763, + "tokens_seen": 2261820416 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015898696088264793, + "loss": 2.4814, + "theoretical_loss": 3.395797862996095, + "tokens_seen": 2261885952 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015897693079237714, + "loss": 2.5558, + "theoretical_loss": 3.3957898566914477, + "tokens_seen": 2261951488 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015896690070210632, + "loss": 2.6454, + "theoretical_loss": 3.395781850683714, + "tokens_seen": 2262017024 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001589568706118355, + "loss": 2.5206, + "theoretical_loss": 3.395773844972875, + "tokens_seen": 2262082560 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015894684052156468, + "loss": 2.7428, + "theoretical_loss": 3.3957658395589103, + "tokens_seen": 2262148096 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001589368104312939, + "loss": 2.406, + "theoretical_loss": 3.3957578344418016, + "tokens_seen": 2262213632 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015892678034102307, + "loss": 2.6705, + "theoretical_loss": 3.395749829621528, + "tokens_seen": 2262279168 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015891675025075225, + "loss": 2.5262, + "theoretical_loss": 3.39574182509807, + "tokens_seen": 2262344704 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015890672016048143, + "loss": 2.7768, + "theoretical_loss": 3.3957338208714085, + "tokens_seen": 2262410240 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015889669007021061, + "loss": 2.555, + "theoretical_loss": 3.395725816941524, + "tokens_seen": 2262475776 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015888665997993982, + "loss": 2.6162, + "theoretical_loss": 3.3957178133083965, + "tokens_seen": 2262541312 + }, + { + "epoch": 7.06, + "learning_rate": 0.000158876629889669, + "loss": 2.4325, + "theoretical_loss": 3.395709809972007, + "tokens_seen": 2262606848 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2487779, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.023876190185547, + "objective/train/theoretical_loss": 3.3957078091842723, + "objective/train/tokens_used": 2283083232, + "theoretical_loss": 3.3957078091842723, + "tokens_seen": 2262623232 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015886659979939818, + "loss": 2.388, + "theoretical_loss": 3.395701806932335, + "tokens_seen": 2262672384 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015885656970912737, + "loss": 2.5428, + "theoretical_loss": 3.395693804189361, + "tokens_seen": 2262737920 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015884653961885657, + "loss": 2.5073, + "theoretical_loss": 3.3956858017430664, + "tokens_seen": 2262803456 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015883650952858575, + "loss": 2.5398, + "theoretical_loss": 3.3956777995934306, + "tokens_seen": 2262868992 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015882647943831494, + "loss": 2.4557, + "theoretical_loss": 3.3956697977404344, + "tokens_seen": 2262934528 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015881644934804412, + "loss": 2.51, + "theoretical_loss": 3.3956617961840583, + "tokens_seen": 2263000064 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001588064192577733, + "loss": 2.6236, + "theoretical_loss": 3.395653794924283, + "tokens_seen": 2263065600 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001587963891675025, + "loss": 2.5496, + "theoretical_loss": 3.395645793961088, + "tokens_seen": 2263131136 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001587863590772317, + "loss": 2.5153, + "theoretical_loss": 3.3956377932944544, + "tokens_seen": 2263196672 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001587763289869609, + "loss": 2.7132, + "theoretical_loss": 3.3956297929243626, + "tokens_seen": 2263262208 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015876629889669008, + "loss": 2.6234, + "theoretical_loss": 3.395621792850793, + "tokens_seen": 2263327744 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015875626880641929, + "loss": 2.2668, + "theoretical_loss": 3.3956137930737254, + "tokens_seen": 2263393280 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015874623871614847, + "loss": 2.3621, + "theoretical_loss": 3.395605793593141, + "tokens_seen": 2263458816 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015873620862587765, + "loss": 2.4648, + "theoretical_loss": 3.39559779440902, + "tokens_seen": 2263524352 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015872617853560683, + "loss": 2.5965, + "theoretical_loss": 3.3955897955213428, + "tokens_seen": 2263589888 + }, + { + "epoch": 7.06, + "learning_rate": 0.000158716148445336, + "loss": 2.6574, + "theoretical_loss": 3.39558179693009, + "tokens_seen": 2263655424 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015870611835506522, + "loss": 2.5386, + "theoretical_loss": 3.3955737986352417, + "tokens_seen": 2263720960 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001586960882647944, + "loss": 2.5862, + "theoretical_loss": 3.3955658006367786, + "tokens_seen": 2263786496 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015868605817452358, + "loss": 2.6595, + "theoretical_loss": 3.3955578029346807, + "tokens_seen": 2263852032 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015867602808425276, + "loss": 2.6506, + "theoretical_loss": 3.395549805528929, + "tokens_seen": 2263917568 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015866599799398197, + "loss": 2.5618, + "theoretical_loss": 3.395541808419504, + "tokens_seen": 2263983104 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015865596790371115, + "loss": 2.7309, + "theoretical_loss": 3.3955338116063856, + "tokens_seen": 2264048640 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015864593781344033, + "loss": 2.6256, + "theoretical_loss": 3.395525815089554, + "tokens_seen": 2264114176 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001586359077231695, + "loss": 2.7425, + "theoretical_loss": 3.395517818868991, + "tokens_seen": 2264179712 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001586258776328987, + "loss": 2.7432, + "theoretical_loss": 3.395509822944676, + "tokens_seen": 2264245248 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2490945, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.436795711517334, + "objective/train/theoretical_loss": 3.3955078240098837, + "objective/train/tokens_used": 2284721632, + "theoretical_loss": 3.3955078240098837, + "tokens_seen": 2264261632 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001586158475426279, + "loss": 2.4135, + "theoretical_loss": 3.39550182731659, + "tokens_seen": 2264310784 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015860581745235708, + "loss": 2.8202, + "theoretical_loss": 3.3954938319847123, + "tokens_seen": 2264376320 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015859578736208626, + "loss": 2.4812, + "theoretical_loss": 3.3954858369490246, + "tokens_seen": 2264441856 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015858575727181545, + "loss": 2.8167, + "theoretical_loss": 3.3954778422095067, + "tokens_seen": 2264507392 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015857572718154465, + "loss": 2.6876, + "theoretical_loss": 3.3954698477661394, + "tokens_seen": 2264572928 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015856569709127383, + "loss": 2.5511, + "theoretical_loss": 3.395461853618903, + "tokens_seen": 2264638464 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015855566700100302, + "loss": 2.723, + "theoretical_loss": 3.395453859767778, + "tokens_seen": 2264704000 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001585456369107322, + "loss": 2.4874, + "theoretical_loss": 3.395445866212745, + "tokens_seen": 2264769536 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001585356068204614, + "loss": 2.4726, + "theoretical_loss": 3.395437872953784, + "tokens_seen": 2264835072 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001585255767301906, + "loss": 2.6634, + "theoretical_loss": 3.395429879990876, + "tokens_seen": 2264900608 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015851554663991977, + "loss": 2.5216, + "theoretical_loss": 3.3954218873240016, + "tokens_seen": 2264966144 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015850551654964895, + "loss": 2.5465, + "theoretical_loss": 3.3954138949531405, + "tokens_seen": 2265031680 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015849548645937813, + "loss": 2.6512, + "theoretical_loss": 3.3954059028782737, + "tokens_seen": 2265097216 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015848545636910734, + "loss": 2.6257, + "theoretical_loss": 3.3953979110993817, + "tokens_seen": 2265162752 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015847542627883652, + "loss": 2.5778, + "theoretical_loss": 3.3953899196164445, + "tokens_seen": 2265228288 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001584653961885657, + "loss": 2.6563, + "theoretical_loss": 3.395381928429443, + "tokens_seen": 2265293824 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015845536609829488, + "loss": 2.688, + "theoretical_loss": 3.3953739375383583, + "tokens_seen": 2265359360 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001584453360080241, + "loss": 2.481, + "theoretical_loss": 3.395365946943169, + "tokens_seen": 2265424896 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015843530591775327, + "loss": 2.556, + "theoretical_loss": 3.395357956643858, + "tokens_seen": 2265490432 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015842527582748245, + "loss": 2.5563, + "theoretical_loss": 3.3953499666404037, + "tokens_seen": 2265555968 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015841524573721163, + "loss": 2.7174, + "theoretical_loss": 3.3953419769327877, + "tokens_seen": 2265621504 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015840521564694081, + "loss": 2.5582, + "theoretical_loss": 3.3953339875209907, + "tokens_seen": 2265687040 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015839518555667002, + "loss": 2.6985, + "theoretical_loss": 3.395325998404992, + "tokens_seen": 2265752576 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001583851554663992, + "loss": 2.738, + "theoretical_loss": 3.3953180095847735, + "tokens_seen": 2265818112 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015837512537612838, + "loss": 2.5113, + "theoretical_loss": 3.3953100210603147, + "tokens_seen": 2265883648 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2495901, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.793480634689331, + "objective/train/theoretical_loss": 3.39530802397541, + "objective/train/tokens_used": 2286360032, + "theoretical_loss": 3.39530802397541, + "tokens_seen": 2265900032 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015836509528585757, + "loss": 2.6105, + "theoretical_loss": 3.395302032831596, + "tokens_seen": 2265949184 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015835506519558677, + "loss": 2.5321, + "theoretical_loss": 3.395294044898599, + "tokens_seen": 2266014720 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015834503510531596, + "loss": 2.4819, + "theoretical_loss": 3.395286057261303, + "tokens_seen": 2266080256 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015833500501504514, + "loss": 2.5852, + "theoretical_loss": 3.3952780699196894, + "tokens_seen": 2266145792 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015832497492477432, + "loss": 2.5228, + "theoretical_loss": 3.395270082873738, + "tokens_seen": 2266211328 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001583149448345035, + "loss": 2.6404, + "theoretical_loss": 3.39526209612343, + "tokens_seen": 2266276864 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001583049147442327, + "loss": 2.4648, + "theoretical_loss": 3.395254109668745, + "tokens_seen": 2266342400 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001582948846539619, + "loss": 2.7444, + "theoretical_loss": 3.3952461235096645, + "tokens_seen": 2266407936 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015828485456369107, + "loss": 2.6121, + "theoretical_loss": 3.395238137646168, + "tokens_seen": 2266473472 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015827482447342025, + "loss": 2.6039, + "theoretical_loss": 3.3952301520782373, + "tokens_seen": 2266539008 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015826479438314946, + "loss": 2.5424, + "theoretical_loss": 3.395222166805852, + "tokens_seen": 2266604544 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015825476429287864, + "loss": 2.4854, + "theoretical_loss": 3.3952141818289925, + "tokens_seen": 2266670080 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015824473420260782, + "loss": 2.524, + "theoretical_loss": 3.39520619714764, + "tokens_seen": 2266735616 + }, + { + "epoch": 7.06, + "learning_rate": 0.000158234704112337, + "loss": 2.6019, + "theoretical_loss": 3.395198212761774, + "tokens_seen": 2266801152 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015822467402206618, + "loss": 2.8238, + "theoretical_loss": 3.3951902286713764, + "tokens_seen": 2266866688 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001582146439317954, + "loss": 2.735, + "theoretical_loss": 3.395182244876427, + "tokens_seen": 2266932224 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015820461384152457, + "loss": 2.6147, + "theoretical_loss": 3.395174261376906, + "tokens_seen": 2266997760 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015819458375125375, + "loss": 2.6125, + "theoretical_loss": 3.395166278172794, + "tokens_seen": 2267063296 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015818455366098293, + "loss": 2.6286, + "theoretical_loss": 3.395158295264072, + "tokens_seen": 2267128832 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015817452357071214, + "loss": 2.6591, + "theoretical_loss": 3.39515031265072, + "tokens_seen": 2267194368 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015816449348044132, + "loss": 2.5735, + "theoretical_loss": 3.3951423303327193, + "tokens_seen": 2267259904 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001581544633901705, + "loss": 2.4882, + "theoretical_loss": 3.39513434831005, + "tokens_seen": 2267325440 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015814443329989969, + "loss": 2.7928, + "theoretical_loss": 3.3951263665826925, + "tokens_seen": 2267390976 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015813440320962887, + "loss": 2.6187, + "theoretical_loss": 3.395118385150627, + "tokens_seen": 2267456512 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015812437311935808, + "loss": 2.5667, + "theoretical_loss": 3.395110404013835, + "tokens_seen": 2267522048 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2500793, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3790395259857178, + "objective/train/theoretical_loss": 3.395108408775771, + "objective/train/tokens_used": 2287998432, + "theoretical_loss": 3.395108408775771, + "tokens_seen": 2267538432 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015811434302908726, + "loss": 2.5545, + "theoretical_loss": 3.3951024231722964, + "tokens_seen": 2267587584 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015810431293881644, + "loss": 2.4768, + "theoretical_loss": 3.395094442625992, + "tokens_seen": 2267653120 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015809428284854562, + "loss": 2.5655, + "theoretical_loss": 3.395086462374902, + "tokens_seen": 2267718656 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015808425275827483, + "loss": 2.3018, + "theoretical_loss": 3.3950784824190072, + "tokens_seen": 2267784192 + }, + { + "epoch": 7.06, + "learning_rate": 0.000158074222668004, + "loss": 2.6356, + "theoretical_loss": 3.3950705027582884, + "tokens_seen": 2267849728 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001580641925777332, + "loss": 2.4569, + "theoretical_loss": 3.3950625233927254, + "tokens_seen": 2267915264 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015805416248746237, + "loss": 2.5668, + "theoretical_loss": 3.3950545443222997, + "tokens_seen": 2267980800 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015804413239719155, + "loss": 2.5163, + "theoretical_loss": 3.3950465655469912, + "tokens_seen": 2268046336 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015803410230692076, + "loss": 2.5225, + "theoretical_loss": 3.3950385870667805, + "tokens_seen": 2268111872 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015802407221664997, + "loss": 2.653, + "theoretical_loss": 3.3950306088816484, + "tokens_seen": 2268177408 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015801404212637915, + "loss": 2.4007, + "theoretical_loss": 3.3950226309915754, + "tokens_seen": 2268242944 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015800401203610833, + "loss": 2.5894, + "theoretical_loss": 3.395014653396542, + "tokens_seen": 2268308480 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015799398194583754, + "loss": 2.6681, + "theoretical_loss": 3.395006676096529, + "tokens_seen": 2268374016 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015798395185556672, + "loss": 2.7175, + "theoretical_loss": 3.3949986990915164, + "tokens_seen": 2268439552 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001579739217652959, + "loss": 2.5525, + "theoretical_loss": 3.3949907223814852, + "tokens_seen": 2268505088 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015796389167502508, + "loss": 2.5171, + "theoretical_loss": 3.3949827459664164, + "tokens_seen": 2268570624 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001579538615847543, + "loss": 2.6121, + "theoretical_loss": 3.3949747698462893, + "tokens_seen": 2268636160 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015794383149448347, + "loss": 2.5737, + "theoretical_loss": 3.394966794021086, + "tokens_seen": 2268701696 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015793380140421265, + "loss": 2.6451, + "theoretical_loss": 3.3949588184907857, + "tokens_seen": 2268767232 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015792377131394183, + "loss": 2.6211, + "theoretical_loss": 3.39495084325537, + "tokens_seen": 2268832768 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015791374122367101, + "loss": 2.6272, + "theoretical_loss": 3.3949428683148186, + "tokens_seen": 2268898304 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015790371113340022, + "loss": 2.61, + "theoretical_loss": 3.394934893669113, + "tokens_seen": 2268963840 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001578936810431294, + "loss": 2.5504, + "theoretical_loss": 3.394926919318233, + "tokens_seen": 2269029376 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015788365095285858, + "loss": 2.3819, + "theoretical_loss": 3.3949189452621598, + "tokens_seen": 2269094912 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015787362086258777, + "loss": 2.7259, + "theoretical_loss": 3.394910971500874, + "tokens_seen": 2269160448 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2505848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0692059993743896, + "objective/train/theoretical_loss": 3.3949089781066104, + "objective/train/tokens_used": 2289636832, + "theoretical_loss": 3.3949089781066104, + "tokens_seen": 2269176832 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015786359077231697, + "loss": 2.822, + "theoretical_loss": 3.3949029980343552, + "tokens_seen": 2269225984 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015785356068204616, + "loss": 2.5282, + "theoretical_loss": 3.394895024862585, + "tokens_seen": 2269291520 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015784353059177534, + "loss": 2.7114, + "theoretical_loss": 3.3948870519855436, + "tokens_seen": 2269357056 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015783350050150452, + "loss": 2.3602, + "theoretical_loss": 3.394879079403212, + "tokens_seen": 2269422592 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001578234704112337, + "loss": 2.674, + "theoretical_loss": 3.3948711071155704, + "tokens_seen": 2269488128 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001578134403209629, + "loss": 2.5431, + "theoretical_loss": 3.3948631351225993, + "tokens_seen": 2269553664 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001578034102306921, + "loss": 2.5711, + "theoretical_loss": 3.3948551634242796, + "tokens_seen": 2269619200 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015779338014042127, + "loss": 2.6279, + "theoretical_loss": 3.394847192020592, + "tokens_seen": 2269684736 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015778335005015045, + "loss": 2.6633, + "theoretical_loss": 3.3948392209115164, + "tokens_seen": 2269750272 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015777331995987966, + "loss": 2.558, + "theoretical_loss": 3.3948312500970337, + "tokens_seen": 2269815808 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015776328986960884, + "loss": 2.5414, + "theoretical_loss": 3.394823279577125, + "tokens_seen": 2269881344 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015775325977933802, + "loss": 2.735, + "theoretical_loss": 3.394815309351771, + "tokens_seen": 2269946880 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001577432296890672, + "loss": 2.6422, + "theoretical_loss": 3.3948073394209515, + "tokens_seen": 2270012416 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015773319959879638, + "loss": 2.7253, + "theoretical_loss": 3.3947993697846477, + "tokens_seen": 2270077952 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001577231695085256, + "loss": 2.6253, + "theoretical_loss": 3.39479140044284, + "tokens_seen": 2270143488 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015771313941825477, + "loss": 2.8799, + "theoretical_loss": 3.394783431395509, + "tokens_seen": 2270209024 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015770310932798395, + "loss": 2.501, + "theoretical_loss": 3.3947754626426354, + "tokens_seen": 2270274560 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015769307923771313, + "loss": 2.5175, + "theoretical_loss": 3.3947674941841997, + "tokens_seen": 2270340096 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015768304914744234, + "loss": 2.6989, + "theoretical_loss": 3.3947595260201826, + "tokens_seen": 2270405632 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015767301905717152, + "loss": 2.48, + "theoretical_loss": 3.394751558150565, + "tokens_seen": 2270471168 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001576629889669007, + "loss": 2.6571, + "theoretical_loss": 3.394743590575327, + "tokens_seen": 2270536704 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015765295887662989, + "loss": 2.5902, + "theoretical_loss": 3.3947356232944497, + "tokens_seen": 2270602240 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015764292878635907, + "loss": 2.5127, + "theoretical_loss": 3.3947276563079134, + "tokens_seen": 2270667776 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015763289869608828, + "loss": 2.5543, + "theoretical_loss": 3.394719689615699, + "tokens_seen": 2270733312 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015762286860581746, + "loss": 2.6656, + "theoretical_loss": 3.394711723217787, + "tokens_seen": 2270798848 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2510931, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.811091184616089, + "objective/train/theoretical_loss": 3.394709731664291, + "objective/train/tokens_used": 2291275232, + "theoretical_loss": 3.394709731664291, + "tokens_seen": 2270815232 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015761283851554664, + "loss": 2.6615, + "theoretical_loss": 3.394703757114158, + "tokens_seen": 2270864384 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015760280842527582, + "loss": 2.5582, + "theoretical_loss": 3.394695791304792, + "tokens_seen": 2270929920 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015759277833500503, + "loss": 2.4832, + "theoretical_loss": 3.394687825789671, + "tokens_seen": 2270995456 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001575827482447342, + "loss": 2.6027, + "theoretical_loss": 3.394679860568775, + "tokens_seen": 2271060992 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001575727181544634, + "loss": 2.5871, + "theoretical_loss": 3.3946718956420843, + "tokens_seen": 2271126528 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015756268806419257, + "loss": 2.7603, + "theoretical_loss": 3.3946639310095796, + "tokens_seen": 2271192064 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015755265797392175, + "loss": 2.5339, + "theoretical_loss": 3.394655966671242, + "tokens_seen": 2271257600 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015754262788365096, + "loss": 2.6995, + "theoretical_loss": 3.394648002627052, + "tokens_seen": 2271323136 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015753259779338014, + "loss": 2.5199, + "theoretical_loss": 3.3946400388769904, + "tokens_seen": 2271388672 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015752256770310932, + "loss": 2.5705, + "theoretical_loss": 3.3946320754210375, + "tokens_seen": 2271454208 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001575125376128385, + "loss": 2.6965, + "theoretical_loss": 3.394624112259174, + "tokens_seen": 2271519744 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001575025075225677, + "loss": 2.5403, + "theoretical_loss": 3.3946161493913802, + "tokens_seen": 2271585280 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001574924774322969, + "loss": 2.3983, + "theoretical_loss": 3.3946081868176377, + "tokens_seen": 2271650816 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015748244734202607, + "loss": 2.4587, + "theoretical_loss": 3.3946002245379265, + "tokens_seen": 2271716352 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015747241725175525, + "loss": 2.575, + "theoretical_loss": 3.3945922625522273, + "tokens_seen": 2271781888 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015746238716148446, + "loss": 2.3677, + "theoretical_loss": 3.394584300860521, + "tokens_seen": 2271847424 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015745235707121364, + "loss": 2.4598, + "theoretical_loss": 3.3945763394627884, + "tokens_seen": 2271912960 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015744232698094283, + "loss": 2.5039, + "theoretical_loss": 3.3945683783590095, + "tokens_seen": 2271978496 + }, + { + "epoch": 7.06, + "learning_rate": 0.000157432296890672, + "loss": 2.5021, + "theoretical_loss": 3.3945604175491653, + "tokens_seen": 2272044032 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001574222668004012, + "loss": 2.7328, + "theoretical_loss": 3.3945524570332366, + "tokens_seen": 2272109568 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001574122367101304, + "loss": 2.5934, + "theoretical_loss": 3.394544496811204, + "tokens_seen": 2272175104 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015740220661985958, + "loss": 2.3989, + "theoretical_loss": 3.3945365368830482, + "tokens_seen": 2272240640 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015739217652958876, + "loss": 2.6258, + "theoretical_loss": 3.3945285772487503, + "tokens_seen": 2272306176 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015738214643931794, + "loss": 2.539, + "theoretical_loss": 3.39452061790829, + "tokens_seen": 2272371712 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015737211634904715, + "loss": 2.5477, + "theoretical_loss": 3.3945126588616485, + "tokens_seen": 2272437248 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2513872, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5320253372192383, + "objective/train/theoretical_loss": 3.394510669145895, + "objective/train/tokens_used": 2292913632, + "theoretical_loss": 3.394510669145895, + "tokens_seen": 2272453632 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015736208625877633, + "loss": 2.6277, + "theoretical_loss": 3.394504700108807, + "tokens_seen": 2272502784 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001573520561685055, + "loss": 2.6377, + "theoretical_loss": 3.394496741649745, + "tokens_seen": 2272568320 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001573420260782347, + "loss": 2.6132, + "theoretical_loss": 3.3944887834844444, + "tokens_seen": 2272633856 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015733199598796387, + "loss": 2.5568, + "theoretical_loss": 3.394480825612885, + "tokens_seen": 2272699392 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015732196589769308, + "loss": 2.5959, + "theoretical_loss": 3.3944728680350478, + "tokens_seen": 2272764928 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015731193580742226, + "loss": 2.4615, + "theoretical_loss": 3.394464910750914, + "tokens_seen": 2272830464 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015730190571715144, + "loss": 2.5746, + "theoretical_loss": 3.3944569537604634, + "tokens_seen": 2272896000 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015729187562688062, + "loss": 2.6553, + "theoretical_loss": 3.394448997063677, + "tokens_seen": 2272961536 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015728184553660986, + "loss": 2.6453, + "theoretical_loss": 3.394441040660536, + "tokens_seen": 2273027072 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015727181544633904, + "loss": 2.5487, + "theoretical_loss": 3.39443308455102, + "tokens_seen": 2273092608 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015726178535606822, + "loss": 2.5705, + "theoretical_loss": 3.394425128735111, + "tokens_seen": 2273158144 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001572517552657974, + "loss": 2.579, + "theoretical_loss": 3.394417173212789, + "tokens_seen": 2273223680 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015724172517552658, + "loss": 2.7293, + "theoretical_loss": 3.394409217984035, + "tokens_seen": 2273289216 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001572316950852558, + "loss": 2.4021, + "theoretical_loss": 3.394401263048829, + "tokens_seen": 2273354752 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015722166499498497, + "loss": 2.6861, + "theoretical_loss": 3.3943933084071523, + "tokens_seen": 2273420288 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015721163490471415, + "loss": 2.5008, + "theoretical_loss": 3.394385354058986, + "tokens_seen": 2273485824 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015720160481444333, + "loss": 2.3493, + "theoretical_loss": 3.3943774000043097, + "tokens_seen": 2273551360 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015719157472417254, + "loss": 2.7541, + "theoretical_loss": 3.394369446243105, + "tokens_seen": 2273616896 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015718154463390172, + "loss": 2.571, + "theoretical_loss": 3.3943614927753525, + "tokens_seen": 2273682432 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001571715145436309, + "loss": 2.5607, + "theoretical_loss": 3.394353539601033, + "tokens_seen": 2273747968 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015716148445336009, + "loss": 2.7334, + "theoretical_loss": 3.394345586720126, + "tokens_seen": 2273813504 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015715145436308927, + "loss": 2.8081, + "theoretical_loss": 3.394337634132614, + "tokens_seen": 2273879040 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015714142427281848, + "loss": 2.517, + "theoretical_loss": 3.394329681838477, + "tokens_seen": 2273944576 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015713139418254766, + "loss": 2.4713, + "theoretical_loss": 3.3943217298376953, + "tokens_seen": 2274010112 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015712136409227684, + "loss": 2.5502, + "theoretical_loss": 3.39431377813025, + "tokens_seen": 2274075648 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2514553, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.435678482055664, + "objective/train/theoretical_loss": 3.3943117902492204, + "objective/train/tokens_used": 2294552032, + "theoretical_loss": 3.3943117902492204, + "tokens_seen": 2274092032 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015711133400200602, + "loss": 2.5433, + "theoretical_loss": 3.394305826716122, + "tokens_seen": 2274141184 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015710130391173523, + "loss": 2.4097, + "theoretical_loss": 3.3942978755952917, + "tokens_seen": 2274206720 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001570912738214644, + "loss": 2.7832, + "theoretical_loss": 3.39428992476774, + "tokens_seen": 2274272256 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001570812437311936, + "loss": 2.5984, + "theoretical_loss": 3.3942819742334476, + "tokens_seen": 2274337792 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015707121364092277, + "loss": 2.6236, + "theoretical_loss": 3.394274023992395, + "tokens_seen": 2274403328 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015706118355065195, + "loss": 2.6568, + "theoretical_loss": 3.3942660740445634, + "tokens_seen": 2274468864 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015705115346038116, + "loss": 2.5306, + "theoretical_loss": 3.394258124389933, + "tokens_seen": 2274534400 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015704112337011034, + "loss": 2.4596, + "theoretical_loss": 3.394250175028485, + "tokens_seen": 2274599936 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015703109327983952, + "loss": 2.6455, + "theoretical_loss": 3.3942422259602, + "tokens_seen": 2274665472 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001570210631895687, + "loss": 2.5982, + "theoretical_loss": 3.394234277185059, + "tokens_seen": 2274731008 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001570110330992979, + "loss": 2.319, + "theoretical_loss": 3.394226328703042, + "tokens_seen": 2274796544 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001570010030090271, + "loss": 2.5994, + "theoretical_loss": 3.3942183805141304, + "tokens_seen": 2274862080 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015699097291875627, + "loss": 2.6635, + "theoretical_loss": 3.394210432618305, + "tokens_seen": 2274927616 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015698094282848545, + "loss": 2.4822, + "theoretical_loss": 3.394202485015546, + "tokens_seen": 2274993152 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015697091273821466, + "loss": 2.5537, + "theoretical_loss": 3.3941945377058342, + "tokens_seen": 2275058688 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015696088264794384, + "loss": 2.4065, + "theoretical_loss": 3.394186590689151, + "tokens_seen": 2275124224 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015695085255767303, + "loss": 2.4707, + "theoretical_loss": 3.3941786439654766, + "tokens_seen": 2275189760 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001569408224674022, + "loss": 2.5829, + "theoretical_loss": 3.3941706975347916, + "tokens_seen": 2275255296 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001569307923771314, + "loss": 2.5916, + "theoretical_loss": 3.3941627513970776, + "tokens_seen": 2275320832 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001569207622868606, + "loss": 2.6667, + "theoretical_loss": 3.394154805552315, + "tokens_seen": 2275386368 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015691073219658978, + "loss": 2.6645, + "theoretical_loss": 3.394146860000484, + "tokens_seen": 2275451904 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015690070210631896, + "loss": 2.494, + "theoretical_loss": 3.394138914741566, + "tokens_seen": 2275517440 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015689067201604814, + "loss": 2.6027, + "theoretical_loss": 3.3941309697755413, + "tokens_seen": 2275582976 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015688064192577735, + "loss": 2.5773, + "theoretical_loss": 3.394123025102391, + "tokens_seen": 2275648512 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015687061183550653, + "loss": 2.6234, + "theoretical_loss": 3.3941150807220954, + "tokens_seen": 2275714048 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2516002, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.45017147064209, + "objective/train/theoretical_loss": 3.394113094672778, + "objective/train/tokens_used": 2296190432, + "theoretical_loss": 3.394113094672778, + "tokens_seen": 2275730432 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001568605817452357, + "loss": 2.3656, + "theoretical_loss": 3.394107136634636, + "tokens_seen": 2275779584 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001568505516549649, + "loss": 2.6447, + "theoretical_loss": 3.3940991928399935, + "tokens_seen": 2275845120 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015684052156469407, + "loss": 2.6014, + "theoretical_loss": 3.394091249338148, + "tokens_seen": 2275910656 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015683049147442328, + "loss": 2.5922, + "theoretical_loss": 3.3940833061290805, + "tokens_seen": 2275976192 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015682046138415246, + "loss": 2.67, + "theoretical_loss": 3.3940753632127723, + "tokens_seen": 2276041728 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015681043129388164, + "loss": 2.6585, + "theoretical_loss": 3.394067420589204, + "tokens_seen": 2276107264 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015680040120361082, + "loss": 2.5458, + "theoretical_loss": 3.394059478258356, + "tokens_seen": 2276172800 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015679037111334003, + "loss": 2.3671, + "theoretical_loss": 3.394051536220209, + "tokens_seen": 2276238336 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001567803410230692, + "loss": 2.7415, + "theoretical_loss": 3.3940435944747445, + "tokens_seen": 2276303872 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001567703109327984, + "loss": 2.5849, + "theoretical_loss": 3.3940356530219424, + "tokens_seen": 2276369408 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015676028084252757, + "loss": 2.6123, + "theoretical_loss": 3.3940277118617845, + "tokens_seen": 2276434944 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015675025075225676, + "loss": 2.6521, + "theoretical_loss": 3.394019770994251, + "tokens_seen": 2276500480 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015674022066198596, + "loss": 2.629, + "theoretical_loss": 3.394011830419322, + "tokens_seen": 2276566016 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015673019057171515, + "loss": 2.5574, + "theoretical_loss": 3.39400389013698, + "tokens_seen": 2276631552 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015672016048144433, + "loss": 2.6168, + "theoretical_loss": 3.3939959501472043, + "tokens_seen": 2276697088 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001567101303911735, + "loss": 2.5833, + "theoretical_loss": 3.3939880104499762, + "tokens_seen": 2276762624 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015670010030090272, + "loss": 2.716, + "theoretical_loss": 3.3939800710452768, + "tokens_seen": 2276828160 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001566900702106319, + "loss": 2.5778, + "theoretical_loss": 3.3939721319330864, + "tokens_seen": 2276893696 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015668004012036108, + "loss": 2.4515, + "theoretical_loss": 3.393964193113386, + "tokens_seen": 2276959232 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015667001003009026, + "loss": 2.5564, + "theoretical_loss": 3.393956254586157, + "tokens_seen": 2277024768 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015665997993981944, + "loss": 2.6204, + "theoretical_loss": 3.3939483163513793, + "tokens_seen": 2277090304 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015664994984954865, + "loss": 2.4742, + "theoretical_loss": 3.3939403784090336, + "tokens_seen": 2277155840 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015663991975927783, + "loss": 2.5911, + "theoretical_loss": 3.3939324407591016, + "tokens_seen": 2277221376 + }, + { + "epoch": 7.06, + "learning_rate": 0.000156629889669007, + "loss": 2.5927, + "theoretical_loss": 3.393924503401564, + "tokens_seen": 2277286912 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001566198595787362, + "loss": 2.5387, + "theoretical_loss": 3.393916566336401, + "tokens_seen": 2277352448 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2516796, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6286563873291016, + "objective/train/theoretical_loss": 3.3939145821157917, + "objective/train/tokens_used": 2297828832, + "theoretical_loss": 3.3939145821157917, + "tokens_seen": 2277368832 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001566098294884654, + "loss": 2.5961, + "theoretical_loss": 3.393908629563594, + "tokens_seen": 2277417984 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015659979939819458, + "loss": 2.4931, + "theoretical_loss": 3.393900693083123, + "tokens_seen": 2277483520 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015658976930792376, + "loss": 2.3184, + "theoretical_loss": 3.39389275689497, + "tokens_seen": 2277549056 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015657973921765294, + "loss": 2.4584, + "theoretical_loss": 3.3938848209991144, + "tokens_seen": 2277614592 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015656970912738212, + "loss": 2.5691, + "theoretical_loss": 3.3938768853955383, + "tokens_seen": 2277680128 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015655967903711133, + "loss": 2.6013, + "theoretical_loss": 3.393868950084222, + "tokens_seen": 2277745664 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015654964894684051, + "loss": 2.6334, + "theoretical_loss": 3.3938610150651463, + "tokens_seen": 2277811200 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001565396188565697, + "loss": 2.2335, + "theoretical_loss": 3.393853080338292, + "tokens_seen": 2277876736 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001565295887662989, + "loss": 2.5701, + "theoretical_loss": 3.39384514590364, + "tokens_seen": 2277942272 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001565195586760281, + "loss": 2.4275, + "theoretical_loss": 3.3938372117611717, + "tokens_seen": 2278007808 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001565095285857573, + "loss": 2.3073, + "theoretical_loss": 3.393829277910867, + "tokens_seen": 2278073344 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015649949849548647, + "loss": 2.6336, + "theoretical_loss": 3.393821344352707, + "tokens_seen": 2278138880 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015648946840521565, + "loss": 2.8236, + "theoretical_loss": 3.393813411086673, + "tokens_seen": 2278204416 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015647943831494486, + "loss": 2.6091, + "theoretical_loss": 3.3938054781127454, + "tokens_seen": 2278269952 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015646940822467404, + "loss": 2.5737, + "theoretical_loss": 3.3937975454309046, + "tokens_seen": 2278335488 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015645937813440323, + "loss": 2.5227, + "theoretical_loss": 3.3937896130411325, + "tokens_seen": 2278401024 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001564493480441324, + "loss": 2.5068, + "theoretical_loss": 3.3937816809434094, + "tokens_seen": 2278466560 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001564393179538616, + "loss": 2.4429, + "theoretical_loss": 3.3937737491377162, + "tokens_seen": 2278532096 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001564292878635908, + "loss": 2.6087, + "theoretical_loss": 3.3937658176240335, + "tokens_seen": 2278597632 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015641925777331998, + "loss": 2.7213, + "theoretical_loss": 3.393757886402343, + "tokens_seen": 2278663168 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015640922768304916, + "loss": 2.4405, + "theoretical_loss": 3.3937499554726243, + "tokens_seen": 2278728704 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015639919759277834, + "loss": 2.6626, + "theoretical_loss": 3.393742024834859, + "tokens_seen": 2278794240 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015638916750250755, + "loss": 2.5306, + "theoretical_loss": 3.3937340944890284, + "tokens_seen": 2278859776 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015637913741223673, + "loss": 2.5031, + "theoretical_loss": 3.393726164435112, + "tokens_seen": 2278925312 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001563691073219659, + "loss": 2.5143, + "theoretical_loss": 3.3937182346730923, + "tokens_seen": 2278990848 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2518288, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.9803285598754883, + "objective/train/theoretical_loss": 3.3937162522781934, + "objective/train/tokens_used": 2299467232, + "theoretical_loss": 3.3937162522781934, + "tokens_seen": 2279007232 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001563590772316951, + "loss": 2.4092, + "theoretical_loss": 3.3937103052029487, + "tokens_seen": 2279056384 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015634904714142427, + "loss": 2.672, + "theoretical_loss": 3.393702376024663, + "tokens_seen": 2279121920 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015633901705115348, + "loss": 2.5283, + "theoretical_loss": 3.3936944471382158, + "tokens_seen": 2279187456 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015632898696088266, + "loss": 2.544, + "theoretical_loss": 3.393686518543588, + "tokens_seen": 2279252992 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015631895687061184, + "loss": 2.5925, + "theoretical_loss": 3.39367859024076, + "tokens_seen": 2279318528 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015630892678034102, + "loss": 2.3722, + "theoretical_loss": 3.393670662229713, + "tokens_seen": 2279384064 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015629889669007023, + "loss": 2.5457, + "theoretical_loss": 3.3936627345104284, + "tokens_seen": 2279449600 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001562888665997994, + "loss": 2.5592, + "theoretical_loss": 3.3936548070828865, + "tokens_seen": 2279515136 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001562788365095286, + "loss": 2.5523, + "theoretical_loss": 3.3936468799470685, + "tokens_seen": 2279580672 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015626880641925778, + "loss": 2.7762, + "theoretical_loss": 3.3936389531029545, + "tokens_seen": 2279646208 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015625877632898696, + "loss": 2.4763, + "theoretical_loss": 3.3936310265505263, + "tokens_seen": 2279711744 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015624874623871616, + "loss": 2.5681, + "theoretical_loss": 3.3936231002897648, + "tokens_seen": 2279777280 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015623871614844535, + "loss": 2.7866, + "theoretical_loss": 3.39361517432065, + "tokens_seen": 2279842816 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015622868605817453, + "loss": 2.6148, + "theoretical_loss": 3.393607248643163, + "tokens_seen": 2279908352 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001562186559679037, + "loss": 2.6403, + "theoretical_loss": 3.393599323257286, + "tokens_seen": 2279973888 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015620862587763292, + "loss": 2.6411, + "theoretical_loss": 3.393591398162998, + "tokens_seen": 2280039424 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001561985957873621, + "loss": 2.7552, + "theoretical_loss": 3.393583473360281, + "tokens_seen": 2280104960 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015618856569709128, + "loss": 2.4253, + "theoretical_loss": 3.3935755488491157, + "tokens_seen": 2280170496 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015617853560682046, + "loss": 2.6842, + "theoretical_loss": 3.3935676246294832, + "tokens_seen": 2280236032 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015616850551654964, + "loss": 2.6671, + "theoretical_loss": 3.3935597007013643, + "tokens_seen": 2280301568 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015615847542627885, + "loss": 2.5359, + "theoretical_loss": 3.393551777064739, + "tokens_seen": 2280367104 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015614844533600803, + "loss": 2.3373, + "theoretical_loss": 3.39354385371959, + "tokens_seen": 2280432640 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001561384152457372, + "loss": 2.4936, + "theoretical_loss": 3.3935359306658963, + "tokens_seen": 2280498176 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001561283851554664, + "loss": 2.4877, + "theoretical_loss": 3.3935280079036403, + "tokens_seen": 2280563712 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001561183550651956, + "loss": 2.4184, + "theoretical_loss": 3.393520085432802, + "tokens_seen": 2280629248 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2518926, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.308356523513794, + "objective/train/theoretical_loss": 3.3935181048606236, + "objective/train/tokens_used": 2301105632, + "theoretical_loss": 3.3935181048606236, + "tokens_seen": 2280645632 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015610832497492478, + "loss": 2.4045, + "theoretical_loss": 3.3935121632533622, + "tokens_seen": 2280694784 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015609829488465396, + "loss": 2.6016, + "theoretical_loss": 3.3935042413653025, + "tokens_seen": 2280760320 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015608826479438314, + "loss": 2.7696, + "theoretical_loss": 3.3934963197686034, + "tokens_seen": 2280825856 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015607823470411232, + "loss": 2.6313, + "theoretical_loss": 3.393488398463246, + "tokens_seen": 2280891392 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015606820461384153, + "loss": 2.6656, + "theoretical_loss": 3.393480477449211, + "tokens_seen": 2280956928 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015605817452357071, + "loss": 2.3947, + "theoretical_loss": 3.3934725567264796, + "tokens_seen": 2281022464 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001560481444332999, + "loss": 2.6738, + "theoretical_loss": 3.3934646362950325, + "tokens_seen": 2281088000 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015603811434302908, + "loss": 2.5319, + "theoretical_loss": 3.3934567161548506, + "tokens_seen": 2281153536 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015602808425275828, + "loss": 2.6804, + "theoretical_loss": 3.3934487963059152, + "tokens_seen": 2281219072 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015601805416248747, + "loss": 2.6343, + "theoretical_loss": 3.3934408767482065, + "tokens_seen": 2281284608 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015600802407221665, + "loss": 2.7164, + "theoretical_loss": 3.393432957481706, + "tokens_seen": 2281350144 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015599799398194583, + "loss": 2.3735, + "theoretical_loss": 3.3934250385063947, + "tokens_seen": 2281415680 + }, + { + "epoch": 7.06, + "learning_rate": 0.000155987963891675, + "loss": 2.4442, + "theoretical_loss": 3.393417119822253, + "tokens_seen": 2281481216 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015597793380140422, + "loss": 2.7156, + "theoretical_loss": 3.3934092014292627, + "tokens_seen": 2281546752 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001559679037111334, + "loss": 2.3828, + "theoretical_loss": 3.393401283327403, + "tokens_seen": 2281612288 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015595787362086258, + "loss": 2.7602, + "theoretical_loss": 3.393393365516657, + "tokens_seen": 2281677824 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015594784353059176, + "loss": 2.5136, + "theoretical_loss": 3.3933854479970047, + "tokens_seen": 2281743360 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015593781344032097, + "loss": 2.6984, + "theoretical_loss": 3.3933775307684266, + "tokens_seen": 2281808896 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015592778335005015, + "loss": 2.5324, + "theoretical_loss": 3.393369613830904, + "tokens_seen": 2281874432 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015591775325977933, + "loss": 2.5753, + "theoretical_loss": 3.393361697184418, + "tokens_seen": 2281939968 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001559077231695085, + "loss": 2.4322, + "theoretical_loss": 3.3933537808289493, + "tokens_seen": 2282005504 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001558976930792377, + "loss": 2.5216, + "theoretical_loss": 3.3933458647644787, + "tokens_seen": 2282071040 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001558876629889669, + "loss": 2.6468, + "theoretical_loss": 3.393337948990988, + "tokens_seen": 2282136576 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015587763289869608, + "loss": 2.5568, + "theoretical_loss": 3.393330033508457, + "tokens_seen": 2282202112 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015586760280842526, + "loss": 2.5456, + "theoretical_loss": 3.3933221183168674, + "tokens_seen": 2282267648 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2520101, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.50203275680542, + "objective/train/theoretical_loss": 3.3933201395644272, + "objective/train/tokens_used": 2302744032, + "theoretical_loss": 3.3933201395644272, + "tokens_seen": 2282284032 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015585757271815444, + "loss": 2.4424, + "theoretical_loss": 3.3933142034162, + "tokens_seen": 2282333184 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015584754262788365, + "loss": 2.3659, + "theoretical_loss": 3.3933062888064356, + "tokens_seen": 2282398720 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015583751253761283, + "loss": 2.533, + "theoretical_loss": 3.3932983744875553, + "tokens_seen": 2282464256 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015582748244734202, + "loss": 2.3596, + "theoretical_loss": 3.39329046045954, + "tokens_seen": 2282529792 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001558174523570712, + "loss": 2.5568, + "theoretical_loss": 3.3932825467223706, + "tokens_seen": 2282595328 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001558074222668004, + "loss": 2.6501, + "theoretical_loss": 3.393274633276028, + "tokens_seen": 2282660864 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015579739217652959, + "loss": 2.4876, + "theoretical_loss": 3.3932667201204936, + "tokens_seen": 2282726400 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015578736208625877, + "loss": 2.4731, + "theoretical_loss": 3.393258807255748, + "tokens_seen": 2282791936 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015577733199598798, + "loss": 2.7815, + "theoretical_loss": 3.393250894681772, + "tokens_seen": 2282857472 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015576730190571716, + "loss": 2.2802, + "theoretical_loss": 3.3932429823985473, + "tokens_seen": 2282923008 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015575727181544636, + "loss": 2.5461, + "theoretical_loss": 3.3932350704060537, + "tokens_seen": 2282988544 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015574724172517555, + "loss": 2.5715, + "theoretical_loss": 3.3932271587042733, + "tokens_seen": 2283054080 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015573721163490473, + "loss": 2.5641, + "theoretical_loss": 3.393219247293186, + "tokens_seen": 2283119616 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001557271815446339, + "loss": 2.4159, + "theoretical_loss": 3.393211336172774, + "tokens_seen": 2283185152 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015571715145436312, + "loss": 2.6435, + "theoretical_loss": 3.3932034253430174, + "tokens_seen": 2283250688 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001557071213640923, + "loss": 2.4866, + "theoretical_loss": 3.3931955148038977, + "tokens_seen": 2283316224 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015569709127382148, + "loss": 2.6414, + "theoretical_loss": 3.3931876045553953, + "tokens_seen": 2283381760 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015568706118355066, + "loss": 2.5191, + "theoretical_loss": 3.393179694597492, + "tokens_seen": 2283447296 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015567703109327984, + "loss": 2.4417, + "theoretical_loss": 3.3931717849301677, + "tokens_seen": 2283512832 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015566700100300905, + "loss": 2.5803, + "theoretical_loss": 3.3931638755534044, + "tokens_seen": 2283578368 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015565697091273823, + "loss": 2.6513, + "theoretical_loss": 3.3931559664671824, + "tokens_seen": 2283643904 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001556469408224674, + "loss": 2.5821, + "theoretical_loss": 3.393148057671483, + "tokens_seen": 2283709440 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001556369107321966, + "loss": 2.3296, + "theoretical_loss": 3.393140149166287, + "tokens_seen": 2283774976 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001556268806419258, + "loss": 2.3669, + "theoretical_loss": 3.393132240951576, + "tokens_seen": 2283840512 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015561685055165498, + "loss": 2.5447, + "theoretical_loss": 3.3931243330273304, + "tokens_seen": 2283906048 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2520708, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.767819881439209, + "objective/train/theoretical_loss": 3.3931223560916517, + "objective/train/tokens_used": 2304382432, + "theoretical_loss": 3.3931223560916517, + "tokens_seen": 2283922432 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015560682046138416, + "loss": 2.564, + "theoretical_loss": 3.393116425393531, + "tokens_seen": 2283971584 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015559679037111334, + "loss": 2.2437, + "theoretical_loss": 3.3931085180501595, + "tokens_seen": 2284037120 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015558676028084252, + "loss": 2.2309, + "theoretical_loss": 3.393100610997196, + "tokens_seen": 2284102656 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015557673019057173, + "loss": 2.6695, + "theoretical_loss": 3.3930927042346224, + "tokens_seen": 2284168192 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015556670010030091, + "loss": 2.5211, + "theoretical_loss": 3.3930847977624197, + "tokens_seen": 2284233728 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001555566700100301, + "loss": 2.7031, + "theoretical_loss": 3.393076891580568, + "tokens_seen": 2284299264 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015554663991975928, + "loss": 2.5715, + "theoretical_loss": 3.393068985689049, + "tokens_seen": 2284364800 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015553660982948848, + "loss": 2.5059, + "theoretical_loss": 3.3930610800878434, + "tokens_seen": 2284430336 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015552657973921767, + "loss": 2.4876, + "theoretical_loss": 3.3930531747769326, + "tokens_seen": 2284495872 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015551654964894685, + "loss": 2.4592, + "theoretical_loss": 3.393045269756297, + "tokens_seen": 2284561408 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015550651955867603, + "loss": 2.6385, + "theoretical_loss": 3.3930373650259185, + "tokens_seen": 2284626944 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001554964894684052, + "loss": 2.5694, + "theoretical_loss": 3.3930294605857774, + "tokens_seen": 2284692480 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015548645937813442, + "loss": 2.6933, + "theoretical_loss": 3.393021556435855, + "tokens_seen": 2284758016 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001554764292878636, + "loss": 2.4982, + "theoretical_loss": 3.393013652576132, + "tokens_seen": 2284823552 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015546639919759278, + "loss": 2.5074, + "theoretical_loss": 3.3930057490065897, + "tokens_seen": 2284889088 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015545636910732196, + "loss": 2.3835, + "theoretical_loss": 3.3929978457272094, + "tokens_seen": 2284954624 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015544633901705117, + "loss": 2.4232, + "theoretical_loss": 3.3929899427379717, + "tokens_seen": 2285020160 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015543630892678035, + "loss": 2.5695, + "theoretical_loss": 3.392982040038858, + "tokens_seen": 2285085696 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015542627883650953, + "loss": 2.4998, + "theoretical_loss": 3.3929741376298486, + "tokens_seen": 2285151232 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001554162487462387, + "loss": 2.5834, + "theoretical_loss": 3.3929662355109254, + "tokens_seen": 2285216768 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001554062186559679, + "loss": 2.6471, + "theoretical_loss": 3.392958333682069, + "tokens_seen": 2285282304 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001553961885656971, + "loss": 2.5818, + "theoretical_loss": 3.3929504321432598, + "tokens_seen": 2285347840 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015538615847542628, + "loss": 2.6906, + "theoretical_loss": 3.39294253089448, + "tokens_seen": 2285413376 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015537612838515546, + "loss": 2.4813, + "theoretical_loss": 3.39293462993571, + "tokens_seen": 2285478912 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015536609829488465, + "loss": 2.4822, + "theoretical_loss": 3.3929267292669314, + "tokens_seen": 2285544448 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2522182, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.496152400970459, + "objective/train/theoretical_loss": 3.392924754145046, + "objective/train/tokens_used": 2306020832, + "theoretical_loss": 3.392924754145046, + "tokens_seen": 2285560832 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015535606820461385, + "loss": 2.4117, + "theoretical_loss": 3.392918828888125, + "tokens_seen": 2285609984 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015534603811434303, + "loss": 2.4752, + "theoretical_loss": 3.3929109287992714, + "tokens_seen": 2285675520 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015533600802407222, + "loss": 2.3661, + "theoretical_loss": 3.392903029000352, + "tokens_seen": 2285741056 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001553259779338014, + "loss": 2.5005, + "theoretical_loss": 3.392895129491347, + "tokens_seen": 2285806592 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001553159478435306, + "loss": 2.5866, + "theoretical_loss": 3.392887230272239, + "tokens_seen": 2285872128 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015530591775325979, + "loss": 2.581, + "theoretical_loss": 3.392879331343008, + "tokens_seen": 2285937664 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015529588766298897, + "loss": 2.5091, + "theoretical_loss": 3.3928714327036356, + "tokens_seen": 2286003200 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015528585757271815, + "loss": 2.4297, + "theoretical_loss": 3.3928635343541025, + "tokens_seen": 2286068736 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015527582748244733, + "loss": 2.4065, + "theoretical_loss": 3.3928556362943896, + "tokens_seen": 2286134272 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015526579739217654, + "loss": 2.3627, + "theoretical_loss": 3.392847738524478, + "tokens_seen": 2286199808 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015525576730190572, + "loss": 2.5279, + "theoretical_loss": 3.392839841044349, + "tokens_seen": 2286265344 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001552457372116349, + "loss": 2.5726, + "theoretical_loss": 3.3928319438539845, + "tokens_seen": 2286330880 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015523570712136408, + "loss": 2.4352, + "theoretical_loss": 3.392824046953364, + "tokens_seen": 2286396416 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001552256770310933, + "loss": 2.6599, + "theoretical_loss": 3.392816150342469, + "tokens_seen": 2286461952 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015521564694082247, + "loss": 2.4949, + "theoretical_loss": 3.392808254021281, + "tokens_seen": 2286527488 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015520561685055165, + "loss": 2.5169, + "theoretical_loss": 3.392800357989781, + "tokens_seen": 2286593024 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015519558676028083, + "loss": 2.5426, + "theoretical_loss": 3.39279246224795, + "tokens_seen": 2286658560 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015518555667001001, + "loss": 2.7257, + "theoretical_loss": 3.3927845667957692, + "tokens_seen": 2286724096 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015517552657973922, + "loss": 2.3688, + "theoretical_loss": 3.392776671633219, + "tokens_seen": 2286789632 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001551654964894684, + "loss": 2.3783, + "theoretical_loss": 3.392768776760281, + "tokens_seen": 2286855168 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015515546639919758, + "loss": 2.3908, + "theoretical_loss": 3.3927608821769364, + "tokens_seen": 2286920704 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015514543630892677, + "loss": 2.5, + "theoretical_loss": 3.392752987883166, + "tokens_seen": 2286986240 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015513540621865597, + "loss": 2.6588, + "theoretical_loss": 3.3927450938789514, + "tokens_seen": 2287051776 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015512537612838515, + "loss": 2.3995, + "theoretical_loss": 3.392737200164273, + "tokens_seen": 2287117312 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015511534603811434, + "loss": 2.3764, + "theoretical_loss": 3.392729306739112, + "tokens_seen": 2287182848 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2522834, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9174721240997314, + "objective/train/theoretical_loss": 3.3927273334280565, + "objective/train/tokens_used": 2307659232, + "theoretical_loss": 3.3927273334280565, + "tokens_seen": 2287199232 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015510531594784352, + "loss": 2.7531, + "theoretical_loss": 3.39272141360345, + "tokens_seen": 2287248384 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001550952858575727, + "loss": 2.5825, + "theoretical_loss": 3.392713520757267, + "tokens_seen": 2287313920 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001550852557673019, + "loss": 2.5146, + "theoretical_loss": 3.3927056282005457, + "tokens_seen": 2287379456 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001550752256770311, + "loss": 2.6495, + "theoretical_loss": 3.392697735933266, + "tokens_seen": 2287444992 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015506519558676027, + "loss": 2.5518, + "theoretical_loss": 3.3926898439554094, + "tokens_seen": 2287510528 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015505516549648945, + "loss": 2.4284, + "theoretical_loss": 3.3926819522669565, + "tokens_seen": 2287576064 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015504513540621866, + "loss": 2.3623, + "theoretical_loss": 3.3926740608678894, + "tokens_seen": 2287641600 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015503510531594784, + "loss": 2.3869, + "theoretical_loss": 3.392666169758188, + "tokens_seen": 2287707136 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015502507522567705, + "loss": 2.54, + "theoretical_loss": 3.3926582789378346, + "tokens_seen": 2287772672 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015501504513540623, + "loss": 2.4534, + "theoretical_loss": 3.3926503884068095, + "tokens_seen": 2287838208 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001550050150451354, + "loss": 2.4214, + "theoretical_loss": 3.3926424981650936, + "tokens_seen": 2287903744 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015499498495486462, + "loss": 2.4369, + "theoretical_loss": 3.392634608212669, + "tokens_seen": 2287969280 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001549849548645938, + "loss": 2.7511, + "theoretical_loss": 3.392626718549516, + "tokens_seen": 2288034816 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015497492477432298, + "loss": 2.1936, + "theoretical_loss": 3.392618829175616, + "tokens_seen": 2288100352 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015496489468405216, + "loss": 2.393, + "theoretical_loss": 3.39261094009095, + "tokens_seen": 2288165888 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015495486459378137, + "loss": 2.4035, + "theoretical_loss": 3.3926030512954988, + "tokens_seen": 2288231424 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015494483450351055, + "loss": 2.6408, + "theoretical_loss": 3.3925951627892443, + "tokens_seen": 2288296960 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015493480441323973, + "loss": 2.4817, + "theoretical_loss": 3.392587274572167, + "tokens_seen": 2288362496 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001549247743229689, + "loss": 2.649, + "theoretical_loss": 3.3925793866442486, + "tokens_seen": 2288428032 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001549147442326981, + "loss": 2.635, + "theoretical_loss": 3.392571499005469, + "tokens_seen": 2288493568 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001549047141424273, + "loss": 2.4138, + "theoretical_loss": 3.392563611655811, + "tokens_seen": 2288559104 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015489468405215648, + "loss": 2.3359, + "theoretical_loss": 3.3925557245952547, + "tokens_seen": 2288624640 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015488465396188566, + "loss": 2.4397, + "theoretical_loss": 3.392547837823781, + "tokens_seen": 2288690176 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015487462387161485, + "loss": 2.5209, + "theoretical_loss": 3.392539951341372, + "tokens_seen": 2288755712 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015486459378134405, + "loss": 2.5724, + "theoretical_loss": 3.3925320651480075, + "tokens_seen": 2288821248 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2523561, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.159848213195801, + "objective/train/theoretical_loss": 3.3925300936448277, + "objective/train/tokens_used": 2309297632, + "theoretical_loss": 3.3925300936448277, + "tokens_seen": 2288837632 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015485456369107323, + "loss": 2.443, + "theoretical_loss": 3.39252417924367, + "tokens_seen": 2288886784 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015484453360080242, + "loss": 2.2486, + "theoretical_loss": 3.39251629362834, + "tokens_seen": 2288952320 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001548345035105316, + "loss": 2.5026, + "theoretical_loss": 3.3925084083019983, + "tokens_seen": 2289017856 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001548244734202608, + "loss": 2.4057, + "theoretical_loss": 3.3925005232646264, + "tokens_seen": 2289083392 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015481444332998999, + "loss": 2.6299, + "theoretical_loss": 3.3924926385162055, + "tokens_seen": 2289148928 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015480441323971917, + "loss": 2.5352, + "theoretical_loss": 3.392484754056717, + "tokens_seen": 2289214464 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015479438314944835, + "loss": 2.6108, + "theoretical_loss": 3.392476869886141, + "tokens_seen": 2289280000 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015478435305917753, + "loss": 2.2632, + "theoretical_loss": 3.39246898600446, + "tokens_seen": 2289345536 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015477432296890674, + "loss": 2.4426, + "theoretical_loss": 3.392461102411654, + "tokens_seen": 2289411072 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015476429287863592, + "loss": 2.747, + "theoretical_loss": 3.392453219107705, + "tokens_seen": 2289476608 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001547542627883651, + "loss": 2.3805, + "theoretical_loss": 3.392445336092594, + "tokens_seen": 2289542144 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015474423269809428, + "loss": 2.6976, + "theoretical_loss": 3.3924374533663015, + "tokens_seen": 2289607680 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001547342026078235, + "loss": 2.4948, + "theoretical_loss": 3.392429570928809, + "tokens_seen": 2289673216 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015472417251755267, + "loss": 2.6024, + "theoretical_loss": 3.392421688780098, + "tokens_seen": 2289738752 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015471414242728185, + "loss": 2.2456, + "theoretical_loss": 3.3924138069201497, + "tokens_seen": 2289804288 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015470411233701103, + "loss": 2.5596, + "theoretical_loss": 3.3924059253489443, + "tokens_seen": 2289869824 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015469408224674021, + "loss": 2.5633, + "theoretical_loss": 3.392398044066464, + "tokens_seen": 2289935360 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015468405215646942, + "loss": 2.2594, + "theoretical_loss": 3.3923901630726894, + "tokens_seen": 2290000896 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001546740220661986, + "loss": 2.6334, + "theoretical_loss": 3.3923822823676018, + "tokens_seen": 2290066432 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015466399197592778, + "loss": 2.5656, + "theoretical_loss": 3.392374401951183, + "tokens_seen": 2290131968 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015465396188565697, + "loss": 2.596, + "theoretical_loss": 3.392366521823413, + "tokens_seen": 2290197504 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015464393179538617, + "loss": 2.5879, + "theoretical_loss": 3.392358641984273, + "tokens_seen": 2290263040 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015463390170511535, + "loss": 2.5112, + "theoretical_loss": 3.3923507624337454, + "tokens_seen": 2290328576 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015462387161484454, + "loss": 2.2299, + "theoretical_loss": 3.3923428831718105, + "tokens_seen": 2290394112 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015461384152457372, + "loss": 2.5961, + "theoretical_loss": 3.3923350041984497, + "tokens_seen": 2290459648 + }, + { + "epoch": 7.06, + "objective/train/docs_used": 2524841, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2457592487335205, + "objective/train/theoretical_loss": 3.392333034500197, + "objective/train/tokens_used": 2310936032, + "theoretical_loss": 3.392333034500197, + "tokens_seen": 2290476032 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001546038114343029, + "loss": 2.4401, + "theoretical_loss": 3.3923271255136442, + "tokens_seen": 2290525184 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001545937813440321, + "loss": 2.3832, + "theoretical_loss": 3.392319247117375, + "tokens_seen": 2290590720 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001545837512537613, + "loss": 2.3542, + "theoretical_loss": 3.3923113690096236, + "tokens_seen": 2290656256 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015457372116349047, + "loss": 2.4833, + "theoretical_loss": 3.3923034911903707, + "tokens_seen": 2290721792 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015456369107321965, + "loss": 2.396, + "theoretical_loss": 3.3922956136595976, + "tokens_seen": 2290787328 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015455366098294886, + "loss": 2.6818, + "theoretical_loss": 3.392287736417286, + "tokens_seen": 2290852864 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015454363089267804, + "loss": 2.6303, + "theoretical_loss": 3.3922798594634163, + "tokens_seen": 2290918400 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015453360080240722, + "loss": 2.2986, + "theoretical_loss": 3.3922719827979706, + "tokens_seen": 2290983936 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001545235707121364, + "loss": 2.5959, + "theoretical_loss": 3.3922641064209293, + "tokens_seen": 2291049472 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015451354062186558, + "loss": 2.6256, + "theoretical_loss": 3.392256230332274, + "tokens_seen": 2291115008 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001545035105315948, + "loss": 2.4866, + "theoretical_loss": 3.3922483545319855, + "tokens_seen": 2291180544 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015449348044132397, + "loss": 2.4934, + "theoretical_loss": 3.392240479020045, + "tokens_seen": 2291246080 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015448345035105315, + "loss": 2.4031, + "theoretical_loss": 3.3922326037964345, + "tokens_seen": 2291311616 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015447342026078233, + "loss": 2.4776, + "theoretical_loss": 3.3922247288611347, + "tokens_seen": 2291377152 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015446339017051154, + "loss": 2.5949, + "theoretical_loss": 3.3922168542141264, + "tokens_seen": 2291442688 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015445336008024072, + "loss": 2.3809, + "theoretical_loss": 3.3922089798553916, + "tokens_seen": 2291508224 + }, + { + "epoch": 7.06, + "learning_rate": 0.0001544433299899699, + "loss": 2.4033, + "theoretical_loss": 3.3922011057849106, + "tokens_seen": 2291573760 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015443329989969909, + "loss": 2.4132, + "theoretical_loss": 3.3921932320026653, + "tokens_seen": 2291639296 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015442326980942827, + "loss": 2.6213, + "theoretical_loss": 3.3921853585086366, + "tokens_seen": 2291704832 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015441323971915747, + "loss": 2.5458, + "theoretical_loss": 3.3921774853028053, + "tokens_seen": 2291770368 + }, + { + "epoch": 7.06, + "learning_rate": 0.00015440320962888666, + "loss": 2.7394, + "theoretical_loss": 3.3921696123851537, + "tokens_seen": 2291835904 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015439317953861584, + "loss": 2.5274, + "theoretical_loss": 3.3921617397556623, + "tokens_seen": 2291901440 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015438314944834502, + "loss": 2.6616, + "theoretical_loss": 3.3921538674143124, + "tokens_seen": 2291966976 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015437311935807423, + "loss": 2.3572, + "theoretical_loss": 3.3921459953610853, + "tokens_seen": 2292032512 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001543630892678034, + "loss": 2.5817, + "theoretical_loss": 3.392138123595962, + "tokens_seen": 2292098048 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2525323, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.681734800338745, + "objective/train/theoretical_loss": 3.392136155699695, + "objective/train/tokens_used": 2312574432, + "theoretical_loss": 3.392136155699695, + "tokens_seen": 2292114432 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001543530591775326, + "loss": 2.6215, + "theoretical_loss": 3.392130252118924, + "tokens_seen": 2292163584 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015434302908726177, + "loss": 2.4552, + "theoretical_loss": 3.392122380929952, + "tokens_seen": 2292229120 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015433299899699095, + "loss": 2.6421, + "theoretical_loss": 3.3921145100290278, + "tokens_seen": 2292294656 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015432296890672016, + "loss": 2.7319, + "theoretical_loss": 3.3921066394161326, + "tokens_seen": 2292360192 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015431293881644934, + "loss": 2.3605, + "theoretical_loss": 3.3920987690912474, + "tokens_seen": 2292425728 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015430290872617852, + "loss": 2.4467, + "theoretical_loss": 3.3920908990543532, + "tokens_seen": 2292491264 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001542928786359077, + "loss": 2.7336, + "theoretical_loss": 3.392083029305432, + "tokens_seen": 2292556800 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001542828485456369, + "loss": 2.6067, + "theoretical_loss": 3.392075159844464, + "tokens_seen": 2292622336 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015427281845536612, + "loss": 2.5427, + "theoretical_loss": 3.3920672906714313, + "tokens_seen": 2292687872 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001542627883650953, + "loss": 2.5561, + "theoretical_loss": 3.392059421786315, + "tokens_seen": 2292753408 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015425275827482448, + "loss": 2.464, + "theoretical_loss": 3.392051553189096, + "tokens_seen": 2292818944 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001542427281845537, + "loss": 2.3029, + "theoretical_loss": 3.3920436848797553, + "tokens_seen": 2292884480 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015423269809428287, + "loss": 2.5588, + "theoretical_loss": 3.3920358168582747, + "tokens_seen": 2292950016 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015422266800401205, + "loss": 2.4756, + "theoretical_loss": 3.3920279491246355, + "tokens_seen": 2293015552 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015421263791374123, + "loss": 2.5639, + "theoretical_loss": 3.392020081678819, + "tokens_seen": 2293081088 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015420260782347041, + "loss": 2.5343, + "theoretical_loss": 3.3920122145208054, + "tokens_seen": 2293146624 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015419257773319962, + "loss": 2.4799, + "theoretical_loss": 3.392004347650577, + "tokens_seen": 2293212160 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001541825476429288, + "loss": 2.4552, + "theoretical_loss": 3.391996481068115, + "tokens_seen": 2293277696 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015417251755265798, + "loss": 2.7615, + "theoretical_loss": 3.3919886147733997, + "tokens_seen": 2293343232 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015416248746238717, + "loss": 2.6436, + "theoretical_loss": 3.3919807487664135, + "tokens_seen": 2293408768 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015415245737211637, + "loss": 2.4674, + "theoretical_loss": 3.3919728830471376, + "tokens_seen": 2293474304 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015414242728184555, + "loss": 2.8303, + "theoretical_loss": 3.3919650176155525, + "tokens_seen": 2293539840 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015413239719157474, + "loss": 2.7707, + "theoretical_loss": 3.39195715247164, + "tokens_seen": 2293605376 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015412236710130392, + "loss": 2.2497, + "theoretical_loss": 3.3919492876153807, + "tokens_seen": 2293670912 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001541123370110331, + "loss": 2.5234, + "theoretical_loss": 3.391941423046757, + "tokens_seen": 2293736448 + }, + { + "debugging/Self-BLEU-5": 0.2582188847678782, + "debugging/distinct-1-grams": 0.864913555406513, + "debugging/distinct-2-grams": 0.978549995128763, + "debugging/entropy-1-grams": 5.367368966295116, + "debugging/entropy-2-grams": 5.739559976929851, + "debugging/length": 456.4, + "debugging/num_segments": 5, + "debugging/score": 0.018665992170809432, + "debugging/score_std": 0.008130755540144809, + "epoch": 7.07, + "objective/train/docs_used": 2526621, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8920140266418457, + "objective/train/theoretical_loss": 3.3919394569495416, + "objective/train/tokens_used": 2314212832, + "theoretical_loss": 3.3919394569495416, + "tokens_seen": 2293752832 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001541023069207623, + "loss": 2.6266, + "theoretical_loss": 3.391933558765749, + "tokens_seen": 2293801984 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001540922768304915, + "loss": 2.562, + "theoretical_loss": 3.391925694772339, + "tokens_seen": 2293867520 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015408224674022067, + "loss": 2.7452, + "theoretical_loss": 3.391917831066507, + "tokens_seen": 2293933056 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015407221664994985, + "loss": 2.7048, + "theoretical_loss": 3.3919099676482354, + "tokens_seen": 2293998592 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015406218655967906, + "loss": 2.5325, + "theoretical_loss": 3.391902104517505, + "tokens_seen": 2294064128 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015405215646940824, + "loss": 2.5141, + "theoretical_loss": 3.391894241674297, + "tokens_seen": 2294129664 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015404212637913742, + "loss": 2.4459, + "theoretical_loss": 3.3918863791185934, + "tokens_seen": 2294195200 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001540320962888666, + "loss": 2.5195, + "theoretical_loss": 3.3918785168503742, + "tokens_seen": 2294260736 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015402206619859578, + "loss": 2.3254, + "theoretical_loss": 3.3918706548696216, + "tokens_seen": 2294326272 + }, + { + "epoch": 7.07, + "learning_rate": 0.000154012036108325, + "loss": 2.5684, + "theoretical_loss": 3.391862793176317, + "tokens_seen": 2294391808 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015400200601805417, + "loss": 2.6141, + "theoretical_loss": 3.3918549317704407, + "tokens_seen": 2294457344 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015399197592778335, + "loss": 2.5873, + "theoretical_loss": 3.391847070651975, + "tokens_seen": 2294522880 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015398194583751253, + "loss": 2.5334, + "theoretical_loss": 3.391839209820901, + "tokens_seen": 2294588416 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015397191574724174, + "loss": 2.5963, + "theoretical_loss": 3.3918313492771994, + "tokens_seen": 2294653952 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015396188565697092, + "loss": 2.5704, + "theoretical_loss": 3.391823489020852, + "tokens_seen": 2294719488 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001539518555667001, + "loss": 2.4662, + "theoretical_loss": 3.39181562905184, + "tokens_seen": 2294785024 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015394182547642929, + "loss": 2.501, + "theoretical_loss": 3.3918077693701445, + "tokens_seen": 2294850560 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015393179538615847, + "loss": 2.4627, + "theoretical_loss": 3.391799909975747, + "tokens_seen": 2294916096 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015392176529588768, + "loss": 2.6045, + "theoretical_loss": 3.391792050868629, + "tokens_seen": 2294981632 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015391173520561686, + "loss": 2.491, + "theoretical_loss": 3.391784192048771, + "tokens_seen": 2295047168 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015390170511534604, + "loss": 2.5994, + "theoretical_loss": 3.3917763335161553, + "tokens_seen": 2295112704 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015389167502507522, + "loss": 2.2183, + "theoretical_loss": 3.3917684752707626, + "tokens_seen": 2295178240 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015388164493480443, + "loss": 2.3257, + "theoretical_loss": 3.391760617312574, + "tokens_seen": 2295243776 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001538716148445336, + "loss": 2.4489, + "theoretical_loss": 3.3917527596415717, + "tokens_seen": 2295309312 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001538615847542628, + "loss": 2.5253, + "theoretical_loss": 3.391744902257736, + "tokens_seen": 2295374848 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2527287, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8029632568359375, + "objective/train/theoretical_loss": 3.3917429379566446, + "objective/train/tokens_used": 2315851232, + "theoretical_loss": 3.3917429379566446, + "tokens_seen": 2295391232 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015385155466399197, + "loss": 2.6715, + "theoretical_loss": 3.391737045161049, + "tokens_seen": 2295440384 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015384152457372115, + "loss": 2.5627, + "theoretical_loss": 3.3917291883514915, + "tokens_seen": 2295505920 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015383149448345036, + "loss": 2.2014, + "theoretical_loss": 3.3917213318290447, + "tokens_seen": 2295571456 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015382146439317954, + "loss": 2.638, + "theoretical_loss": 3.3917134755936904, + "tokens_seen": 2295636992 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015381143430290872, + "loss": 2.3781, + "theoretical_loss": 3.3917056196454096, + "tokens_seen": 2295702528 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001538014042126379, + "loss": 2.4533, + "theoretical_loss": 3.391697763984184, + "tokens_seen": 2295768064 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001537913741223671, + "loss": 2.3586, + "theoretical_loss": 3.3916899086099943, + "tokens_seen": 2295833600 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001537813440320963, + "loss": 2.4175, + "theoretical_loss": 3.391682053522822, + "tokens_seen": 2295899136 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015377131394182547, + "loss": 2.4243, + "theoretical_loss": 3.391674198722649, + "tokens_seen": 2295964672 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015376128385155465, + "loss": 2.69, + "theoretical_loss": 3.391666344209456, + "tokens_seen": 2296030208 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015375125376128386, + "loss": 2.5434, + "theoretical_loss": 3.3916584899832243, + "tokens_seen": 2296095744 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015374122367101304, + "loss": 2.532, + "theoretical_loss": 3.391650636043936, + "tokens_seen": 2296161280 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015373119358074222, + "loss": 2.5632, + "theoretical_loss": 3.391642782391571, + "tokens_seen": 2296226816 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001537211634904714, + "loss": 2.5529, + "theoretical_loss": 3.3916349290261123, + "tokens_seen": 2296292352 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001537111334002006, + "loss": 2.3573, + "theoretical_loss": 3.39162707594754, + "tokens_seen": 2296357888 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001537011033099298, + "loss": 2.7184, + "theoretical_loss": 3.391619223155836, + "tokens_seen": 2296423424 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015369107321965898, + "loss": 2.5577, + "theoretical_loss": 3.391611370650981, + "tokens_seen": 2296488960 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015368104312938816, + "loss": 2.5207, + "theoretical_loss": 3.391603518432958, + "tokens_seen": 2296554496 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015367101303911734, + "loss": 2.4134, + "theoretical_loss": 3.391595666501746, + "tokens_seen": 2296620032 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015366098294884655, + "loss": 2.6083, + "theoretical_loss": 3.3915878148573277, + "tokens_seen": 2296685568 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015365095285857573, + "loss": 2.5329, + "theoretical_loss": 3.3915799634996846, + "tokens_seen": 2296751104 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001536409227683049, + "loss": 2.3822, + "theoretical_loss": 3.3915721124287974, + "tokens_seen": 2296816640 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001536308926780341, + "loss": 2.4167, + "theoretical_loss": 3.391564261644648, + "tokens_seen": 2296882176 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015362086258776327, + "loss": 2.2733, + "theoretical_loss": 3.391556411147217, + "tokens_seen": 2296947712 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015361083249749248, + "loss": 2.4808, + "theoretical_loss": 3.391548560936487, + "tokens_seen": 2297013248 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2528366, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6413795948028564, + "objective/train/theoretical_loss": 3.391546598428599, + "objective/train/tokens_used": 2317489632, + "theoretical_loss": 3.391546598428599, + "tokens_seen": 2297029632 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015360080240722166, + "loss": 2.5454, + "theoretical_loss": 3.3915407110124383, + "tokens_seen": 2297078784 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015359077231695084, + "loss": 2.3544, + "theoretical_loss": 3.3915328613750524, + "tokens_seen": 2297144320 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015358074222668002, + "loss": 2.4905, + "theoretical_loss": 3.3915250120243106, + "tokens_seen": 2297209856 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015357071213640923, + "loss": 2.6643, + "theoretical_loss": 3.3915171629601946, + "tokens_seen": 2297275392 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001535606820461384, + "loss": 2.4836, + "theoretical_loss": 3.3915093141826858, + "tokens_seen": 2297340928 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001535506519558676, + "loss": 2.5618, + "theoretical_loss": 3.391501465691765, + "tokens_seen": 2297406464 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015354062186559677, + "loss": 2.3954, + "theoretical_loss": 3.3914936174874137, + "tokens_seen": 2297472000 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015353059177532596, + "loss": 2.506, + "theoretical_loss": 3.391485769569614, + "tokens_seen": 2297537536 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001535205616850552, + "loss": 2.5975, + "theoretical_loss": 3.3914779219383466, + "tokens_seen": 2297603072 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015351053159478437, + "loss": 2.3971, + "theoretical_loss": 3.391470074593593, + "tokens_seen": 2297668608 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015350050150451355, + "loss": 2.4237, + "theoretical_loss": 3.3914622275353348, + "tokens_seen": 2297734144 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015349047141424273, + "loss": 2.6008, + "theoretical_loss": 3.3914543807635527, + "tokens_seen": 2297799680 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015348044132397194, + "loss": 2.6927, + "theoretical_loss": 3.391446534278229, + "tokens_seen": 2297865216 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015347041123370112, + "loss": 2.4282, + "theoretical_loss": 3.391438688079344, + "tokens_seen": 2297930752 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001534603811434303, + "loss": 2.3753, + "theoretical_loss": 3.39143084216688, + "tokens_seen": 2297996288 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015345035105315949, + "loss": 2.5955, + "theoretical_loss": 3.3914229965408182, + "tokens_seen": 2298061824 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015344032096288867, + "loss": 2.671, + "theoretical_loss": 3.39141515120114, + "tokens_seen": 2298127360 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015343029087261788, + "loss": 2.3932, + "theoretical_loss": 3.391407306147826, + "tokens_seen": 2298192896 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015342026078234706, + "loss": 2.4362, + "theoretical_loss": 3.3913994613808587, + "tokens_seen": 2298258432 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015341023069207624, + "loss": 2.1919, + "theoretical_loss": 3.3913916169002185, + "tokens_seen": 2298323968 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015340020060180542, + "loss": 2.3314, + "theoretical_loss": 3.391383772705887, + "tokens_seen": 2298389504 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015339017051153463, + "loss": 2.6232, + "theoretical_loss": 3.3913759287978467, + "tokens_seen": 2298455040 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001533801404212638, + "loss": 2.6566, + "theoretical_loss": 3.3913680851760777, + "tokens_seen": 2298520576 + }, + { + "epoch": 7.07, + "learning_rate": 0.000153370110330993, + "loss": 2.5466, + "theoretical_loss": 3.3913602418405615, + "tokens_seen": 2298586112 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015336008024072217, + "loss": 2.271, + "theoretical_loss": 3.3913523987912804, + "tokens_seen": 2298651648 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2529672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.497499465942383, + "objective/train/theoretical_loss": 3.3913504380736823, + "objective/train/tokens_used": 2319128032, + "theoretical_loss": 3.3913504380736823, + "tokens_seen": 2298668032 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015335005015045135, + "loss": 2.3455, + "theoretical_loss": 3.3913445560282147, + "tokens_seen": 2298717184 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015334002006018056, + "loss": 2.8312, + "theoretical_loss": 3.391336713551347, + "tokens_seen": 2298782720 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015332998996990974, + "loss": 2.4217, + "theoretical_loss": 3.391328871360657, + "tokens_seen": 2298848256 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015331995987963892, + "loss": 2.5275, + "theoretical_loss": 3.391321029456128, + "tokens_seen": 2298913792 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001533099297893681, + "loss": 2.4035, + "theoretical_loss": 3.3913131878377403, + "tokens_seen": 2298979328 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001532998996990973, + "loss": 2.59, + "theoretical_loss": 3.391305346505475, + "tokens_seen": 2299044864 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001532898696088265, + "loss": 2.4576, + "theoretical_loss": 3.391297505459314, + "tokens_seen": 2299110400 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015327983951855567, + "loss": 2.6095, + "theoretical_loss": 3.391289664699239, + "tokens_seen": 2299175936 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015326980942828485, + "loss": 2.6936, + "theoretical_loss": 3.3912818242252314, + "tokens_seen": 2299241472 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015325977933801406, + "loss": 2.3631, + "theoretical_loss": 3.391273984037272, + "tokens_seen": 2299307008 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015324974924774324, + "loss": 2.616, + "theoretical_loss": 3.3912661441353427, + "tokens_seen": 2299372544 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015323971915747243, + "loss": 2.627, + "theoretical_loss": 3.391258304519425, + "tokens_seen": 2299438080 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001532296890672016, + "loss": 2.5348, + "theoretical_loss": 3.3912504651894997, + "tokens_seen": 2299503616 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001532196589769308, + "loss": 2.6447, + "theoretical_loss": 3.3912426261455484, + "tokens_seen": 2299569152 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015320962888666, + "loss": 2.4365, + "theoretical_loss": 3.391234787387553, + "tokens_seen": 2299634688 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015319959879638918, + "loss": 2.4928, + "theoretical_loss": 3.3912269489154943, + "tokens_seen": 2299700224 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015318956870611836, + "loss": 2.5123, + "theoretical_loss": 3.3912191107293546, + "tokens_seen": 2299765760 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015317953861584754, + "loss": 2.3907, + "theoretical_loss": 3.391211272829114, + "tokens_seen": 2299831296 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015316950852557675, + "loss": 2.6213, + "theoretical_loss": 3.3912034352147553, + "tokens_seen": 2299896832 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015315947843530593, + "loss": 2.3835, + "theoretical_loss": 3.3911955978862593, + "tokens_seen": 2299962368 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001531494483450351, + "loss": 2.4172, + "theoretical_loss": 3.391187760843607, + "tokens_seen": 2300027904 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001531394182547643, + "loss": 2.4688, + "theoretical_loss": 3.3911799240867806, + "tokens_seen": 2300093440 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015312938816449347, + "loss": 2.4999, + "theoretical_loss": 3.3911720876157614, + "tokens_seen": 2300158976 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015311935807422268, + "loss": 2.4238, + "theoretical_loss": 3.39116425143053, + "tokens_seen": 2300224512 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015310932798395186, + "loss": 2.5049, + "theoretical_loss": 3.391156415531069, + "tokens_seen": 2300290048 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2530368, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.661353588104248, + "objective/train/theoretical_loss": 3.3911544566008534, + "objective/train/tokens_used": 2320766432, + "theoretical_loss": 3.3911544566008534, + "tokens_seen": 2300306432 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015309929789368104, + "loss": 2.6269, + "theoretical_loss": 3.3911485799173593, + "tokens_seen": 2300355584 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015308926780341022, + "loss": 2.5221, + "theoretical_loss": 3.391140744589382, + "tokens_seen": 2300421120 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015307923771313943, + "loss": 2.447, + "theoretical_loss": 3.391132909547119, + "tokens_seen": 2300486656 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001530692076228686, + "loss": 2.3627, + "theoretical_loss": 3.391125074790552, + "tokens_seen": 2300552192 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001530591775325978, + "loss": 2.4111, + "theoretical_loss": 3.391117240319662, + "tokens_seen": 2300617728 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015304914744232697, + "loss": 2.6053, + "theoretical_loss": 3.3911094061344302, + "tokens_seen": 2300683264 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015303911735205616, + "loss": 2.4682, + "theoretical_loss": 3.391101572234839, + "tokens_seen": 2300748800 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015302908726178536, + "loss": 2.497, + "theoretical_loss": 3.3910937386208686, + "tokens_seen": 2300814336 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015301905717151455, + "loss": 2.4848, + "theoretical_loss": 3.3910859052925013, + "tokens_seen": 2300879872 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015300902708124373, + "loss": 2.2398, + "theoretical_loss": 3.391078072249718, + "tokens_seen": 2300945408 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001529989969909729, + "loss": 2.367, + "theoretical_loss": 3.391070239492501, + "tokens_seen": 2301010944 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015298896690070212, + "loss": 2.5441, + "theoretical_loss": 3.391062407020831, + "tokens_seen": 2301076480 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001529789368104313, + "loss": 2.4574, + "theoretical_loss": 3.3910545748346896, + "tokens_seen": 2301142016 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015296890672016048, + "loss": 2.4031, + "theoretical_loss": 3.3910467429340585, + "tokens_seen": 2301207552 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015295887662988966, + "loss": 2.4234, + "theoretical_loss": 3.391038911318919, + "tokens_seen": 2301273088 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015294884653961884, + "loss": 2.5007, + "theoretical_loss": 3.3910310799892525, + "tokens_seen": 2301338624 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015293881644934805, + "loss": 2.5518, + "theoretical_loss": 3.3910232489450407, + "tokens_seen": 2301404160 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015292878635907723, + "loss": 2.3426, + "theoretical_loss": 3.3910154181862646, + "tokens_seen": 2301469696 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001529187562688064, + "loss": 2.4701, + "theoretical_loss": 3.3910075877129064, + "tokens_seen": 2301535232 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001529087261785356, + "loss": 2.3448, + "theoretical_loss": 3.3909997575249466, + "tokens_seen": 2301600768 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001528986960882648, + "loss": 2.6644, + "theoretical_loss": 3.390991927622368, + "tokens_seen": 2301666304 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015288866599799398, + "loss": 2.5708, + "theoretical_loss": 3.3909840980051507, + "tokens_seen": 2301731840 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015287863590772316, + "loss": 2.6108, + "theoretical_loss": 3.3909762686732767, + "tokens_seen": 2301797376 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015286860581745234, + "loss": 2.3312, + "theoretical_loss": 3.390968439626728, + "tokens_seen": 2301862912 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015285857572718152, + "loss": 2.5501, + "theoretical_loss": 3.3909606108654855, + "tokens_seen": 2301928448 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2531737, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.259066581726074, + "objective/train/theoretical_loss": 3.3909586537197516, + "objective/train/tokens_used": 2322404832, + "theoretical_loss": 3.3909586537197516, + "tokens_seen": 2301944832 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015284854563691073, + "loss": 2.4526, + "theoretical_loss": 3.390952782389531, + "tokens_seen": 2301993984 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015283851554663991, + "loss": 2.3693, + "theoretical_loss": 3.3909449541988455, + "tokens_seen": 2302059520 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001528284854563691, + "loss": 2.715, + "theoretical_loss": 3.3909371262934105, + "tokens_seen": 2302125056 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015281845536609828, + "loss": 2.2086, + "theoretical_loss": 3.3909292986732082, + "tokens_seen": 2302190592 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015280842527582748, + "loss": 2.358, + "theoretical_loss": 3.3909214713382196, + "tokens_seen": 2302256128 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015279839518555667, + "loss": 2.5592, + "theoretical_loss": 3.390913644288426, + "tokens_seen": 2302321664 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015278836509528585, + "loss": 2.3709, + "theoretical_loss": 3.3909058175238096, + "tokens_seen": 2302387200 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015277833500501503, + "loss": 2.7, + "theoretical_loss": 3.390897991044351, + "tokens_seen": 2302452736 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015276830491474426, + "loss": 2.6206, + "theoretical_loss": 3.390890164850032, + "tokens_seen": 2302518272 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015275827482447344, + "loss": 2.6223, + "theoretical_loss": 3.3908823389408345, + "tokens_seen": 2302583808 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015274824473420263, + "loss": 2.4272, + "theoretical_loss": 3.3908745133167395, + "tokens_seen": 2302649344 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001527382146439318, + "loss": 2.3545, + "theoretical_loss": 3.390866687977729, + "tokens_seen": 2302714880 + }, + { + "epoch": 7.07, + "learning_rate": 0.000152728184553661, + "loss": 2.5571, + "theoretical_loss": 3.390858862923784, + "tokens_seen": 2302780416 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001527181544633902, + "loss": 2.5473, + "theoretical_loss": 3.3908510381548864, + "tokens_seen": 2302845952 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015270812437311938, + "loss": 2.5759, + "theoretical_loss": 3.390843213671017, + "tokens_seen": 2302911488 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015269809428284856, + "loss": 2.4689, + "theoretical_loss": 3.3908353894721586, + "tokens_seen": 2302977024 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015268806419257774, + "loss": 2.469, + "theoretical_loss": 3.3908275655582916, + "tokens_seen": 2303042560 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015267803410230695, + "loss": 2.3692, + "theoretical_loss": 3.390819741929398, + "tokens_seen": 2303108096 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015266800401203613, + "loss": 2.4037, + "theoretical_loss": 3.3908119185854586, + "tokens_seen": 2303173632 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001526579739217653, + "loss": 2.2999, + "theoretical_loss": 3.3908040955264562, + "tokens_seen": 2303239168 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001526479438314945, + "loss": 2.4103, + "theoretical_loss": 3.390796272752371, + "tokens_seen": 2303304704 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015263791374122367, + "loss": 2.5022, + "theoretical_loss": 3.3907884502631855, + "tokens_seen": 2303370240 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015262788365095288, + "loss": 2.5552, + "theoretical_loss": 3.3907806280588804, + "tokens_seen": 2303435776 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015261785356068206, + "loss": 2.4951, + "theoretical_loss": 3.390772806139438, + "tokens_seen": 2303501312 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015260782347041124, + "loss": 2.5454, + "theoretical_loss": 3.3907649845048393, + "tokens_seen": 2303566848 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2532385, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.412231922149658, + "objective/train/theoretical_loss": 3.3907630291406945, + "objective/train/tokens_used": 2324043232, + "theoretical_loss": 3.3907630291406945, + "tokens_seen": 2303583232 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015259779338014042, + "loss": 2.7698, + "theoretical_loss": 3.390757163155066, + "tokens_seen": 2303632384 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015258776328986963, + "loss": 2.3674, + "theoretical_loss": 3.3907493420901, + "tokens_seen": 2303697920 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001525777331995988, + "loss": 2.6264, + "theoretical_loss": 3.3907415213099217, + "tokens_seen": 2303763456 + }, + { + "epoch": 7.07, + "learning_rate": 0.000152567703109328, + "loss": 2.4956, + "theoretical_loss": 3.390733700814514, + "tokens_seen": 2303828992 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015255767301905717, + "loss": 2.1748, + "theoretical_loss": 3.3907258806038576, + "tokens_seen": 2303894528 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015254764292878636, + "loss": 2.5676, + "theoretical_loss": 3.3907180606779344, + "tokens_seen": 2303960064 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015253761283851556, + "loss": 2.5047, + "theoretical_loss": 3.390710241036725, + "tokens_seen": 2304025600 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015252758274824475, + "loss": 2.4989, + "theoretical_loss": 3.3907024216802126, + "tokens_seen": 2304091136 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015251755265797393, + "loss": 2.1943, + "theoretical_loss": 3.3906946026083773, + "tokens_seen": 2304156672 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001525075225677031, + "loss": 2.2833, + "theoretical_loss": 3.3906867838212014, + "tokens_seen": 2304222208 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015249749247743232, + "loss": 2.6285, + "theoretical_loss": 3.390678965318666, + "tokens_seen": 2304287744 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001524874623871615, + "loss": 2.4864, + "theoretical_loss": 3.390671147100753, + "tokens_seen": 2304353280 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015247743229689068, + "loss": 2.4981, + "theoretical_loss": 3.3906633291674444, + "tokens_seen": 2304418816 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015246740220661986, + "loss": 2.6993, + "theoretical_loss": 3.3906555115187205, + "tokens_seen": 2304484352 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015245737211634904, + "loss": 2.5552, + "theoretical_loss": 3.3906476941545636, + "tokens_seen": 2304549888 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015244734202607825, + "loss": 2.3208, + "theoretical_loss": 3.390639877074955, + "tokens_seen": 2304615424 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015243731193580743, + "loss": 2.2834, + "theoretical_loss": 3.3906320602798767, + "tokens_seen": 2304680960 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001524272818455366, + "loss": 2.6143, + "theoretical_loss": 3.39062424376931, + "tokens_seen": 2304746496 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001524172517552658, + "loss": 2.437, + "theoretical_loss": 3.390616427543236, + "tokens_seen": 2304812032 + }, + { + "epoch": 7.07, + "learning_rate": 0.000152407221664995, + "loss": 2.426, + "theoretical_loss": 3.390608611601637, + "tokens_seen": 2304877568 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015239719157472418, + "loss": 2.4501, + "theoretical_loss": 3.3906007959444944, + "tokens_seen": 2304943104 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015238716148445336, + "loss": 2.2815, + "theoretical_loss": 3.390592980571789, + "tokens_seen": 2305008640 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015237713139418254, + "loss": 2.6586, + "theoretical_loss": 3.3905851654835035, + "tokens_seen": 2305074176 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015236710130391172, + "loss": 2.5588, + "theoretical_loss": 3.3905773506796186, + "tokens_seen": 2305139712 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015235707121364093, + "loss": 2.5693, + "theoretical_loss": 3.3905695361601165, + "tokens_seen": 2305205248 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2532779, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4602081775665283, + "objective/train/theoretical_loss": 3.3905675825746733, + "objective/train/tokens_used": 2325681632, + "theoretical_loss": 3.3905675825746733, + "tokens_seen": 2305221632 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015234704112337011, + "loss": 2.5607, + "theoretical_loss": 3.390561721924978, + "tokens_seen": 2305270784 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001523370110330993, + "loss": 2.4422, + "theoretical_loss": 3.3905539079741853, + "tokens_seen": 2305336320 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015232698094282848, + "loss": 2.6359, + "theoretical_loss": 3.3905460943077195, + "tokens_seen": 2305401856 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015231695085255768, + "loss": 2.6393, + "theoretical_loss": 3.390538280925563, + "tokens_seen": 2305467392 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015230692076228687, + "loss": 2.5202, + "theoretical_loss": 3.3905304678276966, + "tokens_seen": 2305532928 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015229689067201605, + "loss": 2.5757, + "theoretical_loss": 3.390522655014102, + "tokens_seen": 2305598464 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015228686058174523, + "loss": 2.4172, + "theoretical_loss": 3.390514842484761, + "tokens_seen": 2305664000 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001522768304914744, + "loss": 2.5276, + "theoretical_loss": 3.3905070302396547, + "tokens_seen": 2305729536 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015226680040120362, + "loss": 2.6633, + "theoretical_loss": 3.3904992182787654, + "tokens_seen": 2305795072 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001522567703109328, + "loss": 2.5106, + "theoretical_loss": 3.3904914066020746, + "tokens_seen": 2305860608 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015224674022066198, + "loss": 2.5461, + "theoretical_loss": 3.390483595209563, + "tokens_seen": 2305926144 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015223671013039116, + "loss": 2.4919, + "theoretical_loss": 3.390475784101213, + "tokens_seen": 2305991680 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015222668004012037, + "loss": 2.5497, + "theoretical_loss": 3.390467973277006, + "tokens_seen": 2306057216 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015221664994984955, + "loss": 2.237, + "theoretical_loss": 3.390460162736924, + "tokens_seen": 2306122752 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015220661985957873, + "loss": 2.4156, + "theoretical_loss": 3.390452352480948, + "tokens_seen": 2306188288 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001521965897693079, + "loss": 2.5554, + "theoretical_loss": 3.3904445425090595, + "tokens_seen": 2306253824 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001521865596790371, + "loss": 2.644, + "theoretical_loss": 3.39043673282124, + "tokens_seen": 2306319360 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001521765295887663, + "loss": 2.5453, + "theoretical_loss": 3.390428923417472, + "tokens_seen": 2306384896 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015216649949849548, + "loss": 2.4594, + "theoretical_loss": 3.390421114297736, + "tokens_seen": 2306450432 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015215646940822466, + "loss": 2.4921, + "theoretical_loss": 3.3904133054620145, + "tokens_seen": 2306515968 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015214643931795384, + "loss": 2.5957, + "theoretical_loss": 3.390405496910289, + "tokens_seen": 2306581504 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015213640922768305, + "loss": 2.3769, + "theoretical_loss": 3.3903976886425404, + "tokens_seen": 2306647040 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015212637913741223, + "loss": 2.4128, + "theoretical_loss": 3.3903898806587507, + "tokens_seen": 2306712576 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015211634904714142, + "loss": 2.5339, + "theoretical_loss": 3.390382072958902, + "tokens_seen": 2306778112 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001521063189568706, + "loss": 2.6121, + "theoretical_loss": 3.390374265542975, + "tokens_seen": 2306843648 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2534072, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3961236476898193, + "objective/train/theoretical_loss": 3.390372313733354, + "objective/train/tokens_used": 2327320032, + "theoretical_loss": 3.390372313733354, + "tokens_seen": 2306860032 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001520962888665998, + "loss": 2.4167, + "theoretical_loss": 3.390366458410952, + "tokens_seen": 2306909184 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015208625877632899, + "loss": 2.7464, + "theoretical_loss": 3.3903586515628144, + "tokens_seen": 2306974720 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015207622868605817, + "loss": 2.1534, + "theoretical_loss": 3.3903508449985438, + "tokens_seen": 2307040256 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015206619859578735, + "loss": 2.6014, + "theoretical_loss": 3.3903430387181217, + "tokens_seen": 2307105792 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015205616850551653, + "loss": 2.6327, + "theoretical_loss": 3.39033523272153, + "tokens_seen": 2307171328 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015204613841524574, + "loss": 2.538, + "theoretical_loss": 3.3903274270087502, + "tokens_seen": 2307236864 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015203610832497492, + "loss": 2.5019, + "theoretical_loss": 3.3903196215797635, + "tokens_seen": 2307302400 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015202607823470413, + "loss": 2.7305, + "theoretical_loss": 3.390311816434552, + "tokens_seen": 2307367936 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001520160481444333, + "loss": 2.3736, + "theoretical_loss": 3.3903040115730976, + "tokens_seen": 2307433472 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015200601805416252, + "loss": 2.5345, + "theoretical_loss": 3.390296206995381, + "tokens_seen": 2307499008 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001519959879638917, + "loss": 2.6409, + "theoretical_loss": 3.3902884027013847, + "tokens_seen": 2307564544 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015198595787362088, + "loss": 2.3847, + "theoretical_loss": 3.3902805986910898, + "tokens_seen": 2307630080 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015197592778335006, + "loss": 2.4229, + "theoretical_loss": 3.390272794964478, + "tokens_seen": 2307695616 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015196589769307924, + "loss": 2.674, + "theoretical_loss": 3.3902649915215313, + "tokens_seen": 2307761152 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015195586760280845, + "loss": 2.6901, + "theoretical_loss": 3.3902571883622312, + "tokens_seen": 2307826688 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015194583751253763, + "loss": 2.2882, + "theoretical_loss": 3.390249385486559, + "tokens_seen": 2307892224 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001519358074222668, + "loss": 2.4832, + "theoretical_loss": 3.3902415828944967, + "tokens_seen": 2307957760 + }, + { + "epoch": 7.07, + "learning_rate": 0.000151925777331996, + "loss": 2.3968, + "theoretical_loss": 3.3902337805860254, + "tokens_seen": 2308023296 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001519157472417252, + "loss": 2.5442, + "theoretical_loss": 3.3902259785611277, + "tokens_seen": 2308088832 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015190571715145438, + "loss": 2.4745, + "theoretical_loss": 3.3902181768197845, + "tokens_seen": 2308154368 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015189568706118356, + "loss": 2.656, + "theoretical_loss": 3.3902103753619777, + "tokens_seen": 2308219904 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015188565697091274, + "loss": 2.7144, + "theoretical_loss": 3.390202574187689, + "tokens_seen": 2308285440 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015187562688064192, + "loss": 2.3307, + "theoretical_loss": 3.3901947732968996, + "tokens_seen": 2308350976 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015186559679037113, + "loss": 2.4991, + "theoretical_loss": 3.390186972689591, + "tokens_seen": 2308416512 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015185556670010031, + "loss": 2.6892, + "theoretical_loss": 3.390179172365746, + "tokens_seen": 2308482048 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2534646, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5717384815216064, + "objective/train/theoretical_loss": 3.390177222329074, + "objective/train/tokens_used": 2328958432, + "theoretical_loss": 3.390177222329074, + "tokens_seen": 2308498432 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001518455366098295, + "loss": 2.5781, + "theoretical_loss": 3.3901713723253453, + "tokens_seen": 2308547584 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015183550651955868, + "loss": 2.3144, + "theoretical_loss": 3.390163572568371, + "tokens_seen": 2308613120 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015182547642928788, + "loss": 2.2319, + "theoretical_loss": 3.3901557730948046, + "tokens_seen": 2308678656 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015181544633901707, + "loss": 2.4606, + "theoretical_loss": 3.390147973904628, + "tokens_seen": 2308744192 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015180541624874625, + "loss": 2.4607, + "theoretical_loss": 3.3901401749978217, + "tokens_seen": 2308809728 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015179538615847543, + "loss": 2.4363, + "theoretical_loss": 3.390132376374369, + "tokens_seen": 2308875264 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001517853560682046, + "loss": 2.5284, + "theoretical_loss": 3.390124578034251, + "tokens_seen": 2308940800 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015177532597793382, + "loss": 2.6464, + "theoretical_loss": 3.3901167799774488, + "tokens_seen": 2309006336 + }, + { + "epoch": 7.07, + "learning_rate": 0.000151765295887663, + "loss": 2.5511, + "theoretical_loss": 3.390108982203944, + "tokens_seen": 2309071872 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015175526579739218, + "loss": 2.4423, + "theoretical_loss": 3.3901011847137195, + "tokens_seen": 2309137408 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015174523570712136, + "loss": 2.4597, + "theoretical_loss": 3.390093387506756, + "tokens_seen": 2309202944 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015173520561685057, + "loss": 2.4706, + "theoretical_loss": 3.390085590583035, + "tokens_seen": 2309268480 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015172517552657975, + "loss": 2.5099, + "theoretical_loss": 3.3900777939425386, + "tokens_seen": 2309334016 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015171514543630893, + "loss": 2.4778, + "theoretical_loss": 3.3900699975852486, + "tokens_seen": 2309399552 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001517051153460381, + "loss": 2.5366, + "theoretical_loss": 3.390062201511147, + "tokens_seen": 2309465088 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001516950852557673, + "loss": 2.7261, + "theoretical_loss": 3.390054405720214, + "tokens_seen": 2309530624 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001516850551654965, + "loss": 2.6948, + "theoretical_loss": 3.3900466102124325, + "tokens_seen": 2309596160 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015167502507522568, + "loss": 2.3711, + "theoretical_loss": 3.390038814987784, + "tokens_seen": 2309661696 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015166499498495486, + "loss": 2.5481, + "theoretical_loss": 3.3900310200462505, + "tokens_seen": 2309727232 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015165496489468404, + "loss": 2.4893, + "theoretical_loss": 3.390023225387813, + "tokens_seen": 2309792768 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015164493480441325, + "loss": 2.4873, + "theoretical_loss": 3.3900154310124533, + "tokens_seen": 2309858304 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015163490471414243, + "loss": 2.6118, + "theoretical_loss": 3.3900076369201537, + "tokens_seen": 2309923840 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015162487462387162, + "loss": 2.7218, + "theoretical_loss": 3.3899998431108953, + "tokens_seen": 2309989376 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001516148445336008, + "loss": 2.395, + "theoretical_loss": 3.3899920495846594, + "tokens_seen": 2310054912 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015160481444333, + "loss": 2.3933, + "theoretical_loss": 3.389984256341429, + "tokens_seen": 2310120448 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2535733, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7114014625549316, + "objective/train/theoretical_loss": 3.3899823080748384, + "objective/train/tokens_used": 2330596832, + "theoretical_loss": 3.3899823080748384, + "tokens_seen": 2310136832 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015159478435305919, + "loss": 2.7261, + "theoretical_loss": 3.3899764633811844, + "tokens_seen": 2310185984 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015158475426278837, + "loss": 2.3578, + "theoretical_loss": 3.3899686707039085, + "tokens_seen": 2310251520 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015157472417251755, + "loss": 2.6809, + "theoretical_loss": 3.389960878309582, + "tokens_seen": 2310317056 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015156469408224673, + "loss": 2.7582, + "theoretical_loss": 3.3899530861981875, + "tokens_seen": 2310382592 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015155466399197594, + "loss": 2.6198, + "theoretical_loss": 3.3899452943697055, + "tokens_seen": 2310448128 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015154463390170512, + "loss": 2.628, + "theoretical_loss": 3.389937502824119, + "tokens_seen": 2310513664 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001515346038114343, + "loss": 2.633, + "theoretical_loss": 3.3899297115614093, + "tokens_seen": 2310579200 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015152457372116348, + "loss": 2.4108, + "theoretical_loss": 3.389921920581558, + "tokens_seen": 2310644736 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001515145436308927, + "loss": 2.4269, + "theoretical_loss": 3.389914129884546, + "tokens_seen": 2310710272 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015150451354062187, + "loss": 2.5168, + "theoretical_loss": 3.389906339470356, + "tokens_seen": 2310775808 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015149448345035105, + "loss": 2.6363, + "theoretical_loss": 3.38989854933897, + "tokens_seen": 2310841344 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015148445336008023, + "loss": 2.5891, + "theoretical_loss": 3.3898907594903687, + "tokens_seen": 2310906880 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001514744232698094, + "loss": 2.504, + "theoretical_loss": 3.3898829699245345, + "tokens_seen": 2310972416 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015146439317953862, + "loss": 2.4125, + "theoretical_loss": 3.3898751806414493, + "tokens_seen": 2311037952 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001514543630892678, + "loss": 2.7126, + "theoretical_loss": 3.389867391641094, + "tokens_seen": 2311103488 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015144433299899698, + "loss": 2.6852, + "theoretical_loss": 3.389859602923451, + "tokens_seen": 2311169024 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015143430290872617, + "loss": 2.4281, + "theoretical_loss": 3.3898518144885017, + "tokens_seen": 2311234560 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015142427281845537, + "loss": 2.8096, + "theoretical_loss": 3.3898440263362275, + "tokens_seen": 2311300096 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015141424272818455, + "loss": 2.6206, + "theoretical_loss": 3.389836238466611, + "tokens_seen": 2311365632 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015140421263791374, + "loss": 2.5448, + "theoretical_loss": 3.3898284508796337, + "tokens_seen": 2311431168 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015139418254764292, + "loss": 2.5701, + "theoretical_loss": 3.389820663575277, + "tokens_seen": 2311496704 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001513841524573721, + "loss": 2.6717, + "theoretical_loss": 3.3898128765535223, + "tokens_seen": 2311562240 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001513741223671013, + "loss": 2.3836, + "theoretical_loss": 3.389805089814352, + "tokens_seen": 2311627776 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001513640922768305, + "loss": 2.501, + "theoretical_loss": 3.3897973033577475, + "tokens_seen": 2311693312 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015135406218655967, + "loss": 2.5078, + "theoretical_loss": 3.3897895171836905, + "tokens_seen": 2311758848 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2537199, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7662432193756104, + "objective/train/theoretical_loss": 3.389787570684322, + "objective/train/tokens_used": 2332235232, + "theoretical_loss": 3.389787570684322, + "tokens_seen": 2311775232 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015134403209628885, + "loss": 2.6411, + "theoretical_loss": 3.389781731292163, + "tokens_seen": 2311824384 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015133400200601806, + "loss": 2.5748, + "theoretical_loss": 3.389773945683147, + "tokens_seen": 2311889920 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015132397191574724, + "loss": 2.5432, + "theoretical_loss": 3.3897661603566234, + "tokens_seen": 2311955456 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015131394182547642, + "loss": 2.6448, + "theoretical_loss": 3.3897583753125744, + "tokens_seen": 2312020992 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001513039117352056, + "loss": 2.5989, + "theoretical_loss": 3.3897505905509817, + "tokens_seen": 2312086528 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015129388164493478, + "loss": 2.5931, + "theoretical_loss": 3.389742806071827, + "tokens_seen": 2312152064 + }, + { + "epoch": 7.07, + "learning_rate": 0.000151283851554664, + "loss": 2.5672, + "theoretical_loss": 3.389735021875093, + "tokens_seen": 2312217600 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001512738214643932, + "loss": 2.498, + "theoretical_loss": 3.3897272379607597, + "tokens_seen": 2312283136 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015126379137412238, + "loss": 2.6588, + "theoretical_loss": 3.3897194543288096, + "tokens_seen": 2312348672 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015125376128385156, + "loss": 2.5567, + "theoretical_loss": 3.389711670979225, + "tokens_seen": 2312414208 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015124373119358077, + "loss": 2.4307, + "theoretical_loss": 3.3897038879119874, + "tokens_seen": 2312479744 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015123370110330995, + "loss": 2.6314, + "theoretical_loss": 3.3896961051270784, + "tokens_seen": 2312545280 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015122367101303913, + "loss": 2.5916, + "theoretical_loss": 3.3896883226244796, + "tokens_seen": 2312610816 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001512136409227683, + "loss": 2.6238, + "theoretical_loss": 3.3896805404041728, + "tokens_seen": 2312676352 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001512036108324975, + "loss": 2.4079, + "theoretical_loss": 3.3896727584661397, + "tokens_seen": 2312741888 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001511935807422267, + "loss": 2.5239, + "theoretical_loss": 3.3896649768103626, + "tokens_seen": 2312807424 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015118355065195588, + "loss": 2.5721, + "theoretical_loss": 3.389657195436823, + "tokens_seen": 2312872960 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015117352056168506, + "loss": 2.6134, + "theoretical_loss": 3.3896494143455023, + "tokens_seen": 2312938496 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015116349047141425, + "loss": 2.5401, + "theoretical_loss": 3.3896416335363826, + "tokens_seen": 2313004032 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015115346038114345, + "loss": 2.3878, + "theoretical_loss": 3.389633853009445, + "tokens_seen": 2313069568 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015114343029087263, + "loss": 2.5213, + "theoretical_loss": 3.389626072764673, + "tokens_seen": 2313135104 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015113340020060182, + "loss": 2.4836, + "theoretical_loss": 3.3896182928020466, + "tokens_seen": 2313200640 + }, + { + "epoch": 7.07, + "learning_rate": 0.000151123370110331, + "loss": 2.6217, + "theoretical_loss": 3.3896105131215486, + "tokens_seen": 2313266176 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001511133400200602, + "loss": 2.4399, + "theoretical_loss": 3.38960273372316, + "tokens_seen": 2313331712 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015110330992978939, + "loss": 2.51, + "theoretical_loss": 3.389594954606863, + "tokens_seen": 2313397248 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2537846, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.431040048599243, + "objective/train/theoretical_loss": 3.389593009871864, + "objective/train/tokens_used": 2333873632, + "theoretical_loss": 3.389593009871864, + "tokens_seen": 2313413632 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015109327983951857, + "loss": 2.4049, + "theoretical_loss": 3.3895871757726397, + "tokens_seen": 2313462784 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015108324974924775, + "loss": 2.3334, + "theoretical_loss": 3.389579397220471, + "tokens_seen": 2313528320 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015107321965897693, + "loss": 2.7466, + "theoretical_loss": 3.38957161895034, + "tokens_seen": 2313593856 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015106318956870614, + "loss": 2.6756, + "theoretical_loss": 3.3895638409622273, + "tokens_seen": 2313659392 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015105315947843532, + "loss": 2.4459, + "theoretical_loss": 3.389556063256115, + "tokens_seen": 2313724928 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001510431293881645, + "loss": 2.6898, + "theoretical_loss": 3.389548285831985, + "tokens_seen": 2313790464 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015103309929789368, + "loss": 2.6192, + "theoretical_loss": 3.3895405086898194, + "tokens_seen": 2313856000 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001510230692076229, + "loss": 2.2466, + "theoretical_loss": 3.3895327318295996, + "tokens_seen": 2313921536 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015101303911735207, + "loss": 2.4612, + "theoretical_loss": 3.389524955251307, + "tokens_seen": 2313987072 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015100300902708125, + "loss": 2.6094, + "theoretical_loss": 3.3895171789549243, + "tokens_seen": 2314052608 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015099297893681043, + "loss": 2.6696, + "theoretical_loss": 3.3895094029404325, + "tokens_seen": 2314118144 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015098294884653961, + "loss": 2.51, + "theoretical_loss": 3.389501627207814, + "tokens_seen": 2314183680 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015097291875626882, + "loss": 2.6117, + "theoretical_loss": 3.3894938517570505, + "tokens_seen": 2314249216 + }, + { + "epoch": 7.07, + "learning_rate": 0.000150962888665998, + "loss": 2.5269, + "theoretical_loss": 3.389486076588123, + "tokens_seen": 2314314752 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015095285857572718, + "loss": 2.7119, + "theoretical_loss": 3.389478301701015, + "tokens_seen": 2314380288 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015094282848545637, + "loss": 2.5618, + "theoretical_loss": 3.3894705270957064, + "tokens_seen": 2314445824 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015093279839518557, + "loss": 2.4502, + "theoretical_loss": 3.3894627527721806, + "tokens_seen": 2314511360 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015092276830491475, + "loss": 2.8467, + "theoretical_loss": 3.3894549787304182, + "tokens_seen": 2314576896 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015091273821464394, + "loss": 2.5797, + "theoretical_loss": 3.389447204970401, + "tokens_seen": 2314642432 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015090270812437312, + "loss": 2.4363, + "theoretical_loss": 3.389439431492112, + "tokens_seen": 2314707968 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001508926780341023, + "loss": 2.6874, + "theoretical_loss": 3.3894316582955324, + "tokens_seen": 2314773504 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001508826479438315, + "loss": 2.7959, + "theoretical_loss": 3.3894238853806438, + "tokens_seen": 2314839040 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001508726178535607, + "loss": 2.5815, + "theoretical_loss": 3.389416112747428, + "tokens_seen": 2314904576 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015086258776328987, + "loss": 2.5208, + "theoretical_loss": 3.389408340395867, + "tokens_seen": 2314970112 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015085255767301905, + "loss": 2.3516, + "theoretical_loss": 3.389400568325943, + "tokens_seen": 2315035648 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2539148, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.589704751968384, + "objective/train/theoretical_loss": 3.3893986253524653, + "objective/train/tokens_used": 2335512032, + "theoretical_loss": 3.3893986253524653, + "tokens_seen": 2315052032 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015084252758274826, + "loss": 2.6353, + "theoretical_loss": 3.3893927965376367, + "tokens_seen": 2315101184 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015083249749247744, + "loss": 2.4942, + "theoretical_loss": 3.389385025030931, + "tokens_seen": 2315166720 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015082246740220662, + "loss": 2.532, + "theoretical_loss": 3.389377253805807, + "tokens_seen": 2315232256 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001508124373119358, + "loss": 2.5435, + "theoretical_loss": 3.3893694828622474, + "tokens_seen": 2315297792 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015080240722166498, + "loss": 2.533, + "theoretical_loss": 3.3893617122002335, + "tokens_seen": 2315363328 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001507923771313942, + "loss": 2.3967, + "theoretical_loss": 3.389353941819747, + "tokens_seen": 2315428864 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015078234704112337, + "loss": 2.6383, + "theoretical_loss": 3.38934617172077, + "tokens_seen": 2315494400 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015077231695085255, + "loss": 2.7238, + "theoretical_loss": 3.389338401903284, + "tokens_seen": 2315559936 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015076228686058173, + "loss": 2.4724, + "theoretical_loss": 3.389330632367271, + "tokens_seen": 2315625472 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015075225677031094, + "loss": 2.5562, + "theoretical_loss": 3.3893228631127132, + "tokens_seen": 2315691008 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015074222668004012, + "loss": 2.4366, + "theoretical_loss": 3.389315094139592, + "tokens_seen": 2315756544 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001507321965897693, + "loss": 2.6958, + "theoretical_loss": 3.3893073254478896, + "tokens_seen": 2315822080 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015072216649949849, + "loss": 2.472, + "theoretical_loss": 3.389299557037587, + "tokens_seen": 2315887616 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015071213640922767, + "loss": 2.3209, + "theoretical_loss": 3.3892917889086673, + "tokens_seen": 2315953152 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015070210631895687, + "loss": 2.4252, + "theoretical_loss": 3.389284021061111, + "tokens_seen": 2316018688 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015069207622868606, + "loss": 2.6029, + "theoretical_loss": 3.3892762534949012, + "tokens_seen": 2316084224 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015068204613841524, + "loss": 2.7117, + "theoretical_loss": 3.3892684862100193, + "tokens_seen": 2316149760 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015067201604814442, + "loss": 2.6079, + "theoretical_loss": 3.3892607192064466, + "tokens_seen": 2316215296 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015066198595787363, + "loss": 2.4526, + "theoretical_loss": 3.389252952484166, + "tokens_seen": 2316280832 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001506519558676028, + "loss": 2.5296, + "theoretical_loss": 3.3892451860431585, + "tokens_seen": 2316346368 + }, + { + "epoch": 7.07, + "learning_rate": 0.000150641925777332, + "loss": 2.7493, + "theoretical_loss": 3.389237419883406, + "tokens_seen": 2316411904 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015063189568706117, + "loss": 2.6838, + "theoretical_loss": 3.3892296540048905, + "tokens_seen": 2316477440 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015062186559679035, + "loss": 2.6169, + "theoretical_loss": 3.389221888407594, + "tokens_seen": 2316542976 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015061183550651956, + "loss": 2.5645, + "theoretical_loss": 3.3892141230914987, + "tokens_seen": 2316608512 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015060180541624874, + "loss": 2.5668, + "theoretical_loss": 3.389206358056586, + "tokens_seen": 2316674048 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2540005, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5352425575256348, + "objective/train/theoretical_loss": 3.3892044168417903, + "objective/train/tokens_used": 2337150432, + "theoretical_loss": 3.3892044168417903, + "tokens_seen": 2316690432 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015059177532597792, + "loss": 2.4892, + "theoretical_loss": 3.3891985933028375, + "tokens_seen": 2316739584 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001505817452357071, + "loss": 2.319, + "theoretical_loss": 3.3891908288302353, + "tokens_seen": 2316805120 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001505717151454363, + "loss": 2.3139, + "theoretical_loss": 3.3891830646387615, + "tokens_seen": 2316870656 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001505616850551655, + "loss": 2.3897, + "theoretical_loss": 3.3891753007283976, + "tokens_seen": 2316936192 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015055165496489467, + "loss": 2.5469, + "theoretical_loss": 3.389167537099126, + "tokens_seen": 2317001728 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015054162487462385, + "loss": 2.7781, + "theoretical_loss": 3.3891597737509285, + "tokens_seen": 2317067264 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015053159478435304, + "loss": 2.55, + "theoretical_loss": 3.3891520106837865, + "tokens_seen": 2317132800 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015052156469408227, + "loss": 2.6173, + "theoretical_loss": 3.389144247897682, + "tokens_seen": 2317198336 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015051153460381145, + "loss": 2.2508, + "theoretical_loss": 3.389136485392597, + "tokens_seen": 2317263872 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015050150451354063, + "loss": 2.5321, + "theoretical_loss": 3.3891287231685134, + "tokens_seen": 2317329408 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015049147442326981, + "loss": 2.5457, + "theoretical_loss": 3.389120961225413, + "tokens_seen": 2317394944 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015048144433299902, + "loss": 2.4202, + "theoretical_loss": 3.389113199563278, + "tokens_seen": 2317460480 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001504714142427282, + "loss": 2.7963, + "theoretical_loss": 3.38910543818209, + "tokens_seen": 2317526016 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015046138415245738, + "loss": 2.5634, + "theoretical_loss": 3.3890976770818306, + "tokens_seen": 2317591552 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015045135406218657, + "loss": 2.5479, + "theoretical_loss": 3.389089916262482, + "tokens_seen": 2317657088 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015044132397191577, + "loss": 2.7075, + "theoretical_loss": 3.3890821557240267, + "tokens_seen": 2317722624 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015043129388164495, + "loss": 2.5746, + "theoretical_loss": 3.3890743954664453, + "tokens_seen": 2317788160 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015042126379137414, + "loss": 2.5705, + "theoretical_loss": 3.3890666354897205, + "tokens_seen": 2317853696 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015041123370110332, + "loss": 2.6358, + "theoretical_loss": 3.3890588757938342, + "tokens_seen": 2317919232 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001504012036108325, + "loss": 2.7096, + "theoretical_loss": 3.389051116378768, + "tokens_seen": 2317984768 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001503911735205617, + "loss": 2.621, + "theoretical_loss": 3.3890433572445042, + "tokens_seen": 2318050304 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001503811434302909, + "loss": 2.6961, + "theoretical_loss": 3.3890355983910245, + "tokens_seen": 2318115840 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015037111334002007, + "loss": 2.6743, + "theoretical_loss": 3.389027839818311, + "tokens_seen": 2318181376 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015036108324974925, + "loss": 2.6026, + "theoretical_loss": 3.3890200815263447, + "tokens_seen": 2318246912 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015035105315947846, + "loss": 2.6179, + "theoretical_loss": 3.3890123235151086, + "tokens_seen": 2318312448 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2541552, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.267181634902954, + "objective/train/theoretical_loss": 3.3890103840561614, + "objective/train/tokens_used": 2338788832, + "theoretical_loss": 3.3890103840561614, + "tokens_seen": 2318328832 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015034102306920764, + "loss": 2.5686, + "theoretical_loss": 3.389004565784584, + "tokens_seen": 2318377984 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015033099297893682, + "loss": 2.5138, + "theoretical_loss": 3.388996808334753, + "tokens_seen": 2318443520 + }, + { + "epoch": 7.07, + "learning_rate": 0.000150320962888666, + "loss": 2.4776, + "theoretical_loss": 3.3889890511655976, + "tokens_seen": 2318509056 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015031093279839518, + "loss": 2.5479, + "theoretical_loss": 3.3889812942770994, + "tokens_seen": 2318574592 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001503009027081244, + "loss": 2.4878, + "theoretical_loss": 3.3889735376692407, + "tokens_seen": 2318640128 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015029087261785357, + "loss": 2.392, + "theoretical_loss": 3.3889657813420033, + "tokens_seen": 2318705664 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015028084252758275, + "loss": 2.733, + "theoretical_loss": 3.388958025295369, + "tokens_seen": 2318771200 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015027081243731193, + "loss": 2.6916, + "theoretical_loss": 3.3889502695293197, + "tokens_seen": 2318836736 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015026078234704114, + "loss": 2.595, + "theoretical_loss": 3.3889425140438374, + "tokens_seen": 2318902272 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015025075225677032, + "loss": 2.5068, + "theoretical_loss": 3.388934758838904, + "tokens_seen": 2318967808 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001502407221664995, + "loss": 2.6379, + "theoretical_loss": 3.3889270039145014, + "tokens_seen": 2319033344 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015023069207622869, + "loss": 2.5487, + "theoretical_loss": 3.3889192492706117, + "tokens_seen": 2319098880 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015022066198595787, + "loss": 2.6234, + "theoretical_loss": 3.3889114949072168, + "tokens_seen": 2319164416 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015021063189568707, + "loss": 2.571, + "theoretical_loss": 3.3889037408242983, + "tokens_seen": 2319229952 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015020060180541626, + "loss": 2.3125, + "theoretical_loss": 3.3888959870218383, + "tokens_seen": 2319295488 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015019057171514544, + "loss": 2.628, + "theoretical_loss": 3.3888882334998187, + "tokens_seen": 2319361024 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015018054162487462, + "loss": 2.7237, + "theoretical_loss": 3.388880480258222, + "tokens_seen": 2319426560 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015017051153460383, + "loss": 2.5311, + "theoretical_loss": 3.388872727297029, + "tokens_seen": 2319492096 + }, + { + "epoch": 7.07, + "learning_rate": 0.000150160481444333, + "loss": 2.6315, + "theoretical_loss": 3.3888649746162227, + "tokens_seen": 2319557632 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001501504513540622, + "loss": 2.6158, + "theoretical_loss": 3.3888572222157842, + "tokens_seen": 2319623168 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015014042126379137, + "loss": 2.5484, + "theoretical_loss": 3.388849470095696, + "tokens_seen": 2319688704 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015013039117352055, + "loss": 2.5195, + "theoretical_loss": 3.3888417182559403, + "tokens_seen": 2319754240 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015012036108324976, + "loss": 2.5381, + "theoretical_loss": 3.388833966696499, + "tokens_seen": 2319819776 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015011033099297894, + "loss": 2.6387, + "theoretical_loss": 3.3888262154173527, + "tokens_seen": 2319885312 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015010030090270812, + "loss": 2.3987, + "theoretical_loss": 3.3888184644184847, + "tokens_seen": 2319950848 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2542225, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.81160306930542, + "objective/train/theoretical_loss": 3.388816526712559, + "objective/train/tokens_used": 2340427232, + "theoretical_loss": 3.388816526712559, + "tokens_seen": 2319967232 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001500902708124373, + "loss": 2.642, + "theoretical_loss": 3.3888107136998764, + "tokens_seen": 2320016384 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001500802407221665, + "loss": 2.5171, + "theoretical_loss": 3.38880296326151, + "tokens_seen": 2320081920 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001500702106318957, + "loss": 2.5238, + "theoretical_loss": 3.388795213103368, + "tokens_seen": 2320147456 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015006018054162487, + "loss": 2.2928, + "theoretical_loss": 3.388787463225431, + "tokens_seen": 2320212992 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015005015045135405, + "loss": 2.5443, + "theoretical_loss": 3.388779713627682, + "tokens_seen": 2320278528 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015004012036108324, + "loss": 2.6126, + "theoretical_loss": 3.3887719643101026, + "tokens_seen": 2320344064 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015003009027081244, + "loss": 2.6055, + "theoretical_loss": 3.388764215272675, + "tokens_seen": 2320409600 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015002006018054162, + "loss": 2.6107, + "theoretical_loss": 3.388756466515381, + "tokens_seen": 2320475136 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001500100300902708, + "loss": 2.661, + "theoretical_loss": 3.388748718038202, + "tokens_seen": 2320540672 + }, + { + "epoch": 7.07, + "learning_rate": 0.00015, + "loss": 2.4008, + "theoretical_loss": 3.388740969841121, + "tokens_seen": 2320606208 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001499899699097292, + "loss": 2.5422, + "theoretical_loss": 3.3887332219241193, + "tokens_seen": 2320671744 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014997993981945838, + "loss": 2.5156, + "theoretical_loss": 3.388725474287179, + "tokens_seen": 2320737280 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014996990972918756, + "loss": 2.6198, + "theoretical_loss": 3.388717726930282, + "tokens_seen": 2320802816 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014995987963891674, + "loss": 2.5488, + "theoretical_loss": 3.3887099798534104, + "tokens_seen": 2320868352 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014994984954864595, + "loss": 2.3517, + "theoretical_loss": 3.3887022330565464, + "tokens_seen": 2320933888 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014993981945837513, + "loss": 2.4199, + "theoretical_loss": 3.3886944865396713, + "tokens_seen": 2320999424 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001499297893681043, + "loss": 2.7259, + "theoretical_loss": 3.388686740302768, + "tokens_seen": 2321064960 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001499197592778335, + "loss": 2.4907, + "theoretical_loss": 3.3886789943458173, + "tokens_seen": 2321130496 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014990972918756267, + "loss": 2.673, + "theoretical_loss": 3.3886712486688024, + "tokens_seen": 2321196032 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014989969909729188, + "loss": 2.3699, + "theoretical_loss": 3.3886635032717045, + "tokens_seen": 2321261568 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014988966900702106, + "loss": 2.5826, + "theoretical_loss": 3.388655758154506, + "tokens_seen": 2321327104 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014987963891675024, + "loss": 2.3294, + "theoretical_loss": 3.388648013317188, + "tokens_seen": 2321392640 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014986960882647942, + "loss": 2.5058, + "theoretical_loss": 3.388640268759734, + "tokens_seen": 2321458176 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014985957873620863, + "loss": 2.4127, + "theoretical_loss": 3.3886325244821247, + "tokens_seen": 2321523712 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001498495486459378, + "loss": 2.6171, + "theoretical_loss": 3.388624780484343, + "tokens_seen": 2321589248 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2543432, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8942110538482666, + "objective/train/theoretical_loss": 3.3886228445286184, + "objective/train/tokens_used": 2342065632, + "theoretical_loss": 3.3886228445286184, + "tokens_seen": 2321605632 + }, + { + "epoch": 7.07, + "learning_rate": 0.000149839518555667, + "loss": 2.543, + "theoretical_loss": 3.38861703676637, + "tokens_seen": 2321654784 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014982948846539617, + "loss": 2.7034, + "theoretical_loss": 3.388609293328188, + "tokens_seen": 2321720320 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014981945837512536, + "loss": 2.3242, + "theoretical_loss": 3.3886015501697795, + "tokens_seen": 2321785856 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014980942828485456, + "loss": 2.4378, + "theoretical_loss": 3.388593807291126, + "tokens_seen": 2321851392 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014979939819458374, + "loss": 2.6971, + "theoretical_loss": 3.3885860646922095, + "tokens_seen": 2321916928 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014978936810431293, + "loss": 2.5459, + "theoretical_loss": 3.3885783223730126, + "tokens_seen": 2321982464 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001497793380140421, + "loss": 2.4491, + "theoretical_loss": 3.388570580333516, + "tokens_seen": 2322048000 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014976930792377134, + "loss": 2.5411, + "theoretical_loss": 3.388562838573703, + "tokens_seen": 2322113536 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014975927783350052, + "loss": 2.4848, + "theoretical_loss": 3.388555097093555, + "tokens_seen": 2322179072 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001497492477432297, + "loss": 2.3651, + "theoretical_loss": 3.3885473558930546, + "tokens_seen": 2322244608 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014973921765295889, + "loss": 2.503, + "theoretical_loss": 3.3885396149721827, + "tokens_seen": 2322310144 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014972918756268807, + "loss": 2.5131, + "theoretical_loss": 3.3885318743309223, + "tokens_seen": 2322375680 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014971915747241728, + "loss": 2.4954, + "theoretical_loss": 3.3885241339692547, + "tokens_seen": 2322441216 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014970912738214646, + "loss": 2.8086, + "theoretical_loss": 3.3885163938871625, + "tokens_seen": 2322506752 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014969909729187564, + "loss": 2.5876, + "theoretical_loss": 3.388508654084627, + "tokens_seen": 2322572288 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014968906720160482, + "loss": 2.3854, + "theoretical_loss": 3.3885009145616314, + "tokens_seen": 2322637824 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014967903711133403, + "loss": 2.372, + "theoretical_loss": 3.388493175318157, + "tokens_seen": 2322703360 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001496690070210632, + "loss": 2.7432, + "theoretical_loss": 3.3884854363541854, + "tokens_seen": 2322768896 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001496589769307924, + "loss": 2.6183, + "theoretical_loss": 3.388477697669699, + "tokens_seen": 2322834432 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014964894684052157, + "loss": 2.5622, + "theoretical_loss": 3.38846995926468, + "tokens_seen": 2322899968 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014963891675025075, + "loss": 2.4547, + "theoretical_loss": 3.38846222113911, + "tokens_seen": 2322965504 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014962888665997996, + "loss": 2.1438, + "theoretical_loss": 3.388454483292972, + "tokens_seen": 2323031040 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014961885656970914, + "loss": 2.3972, + "theoretical_loss": 3.388446745726247, + "tokens_seen": 2323096576 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014960882647943832, + "loss": 2.5674, + "theoretical_loss": 3.3884390084389175, + "tokens_seen": 2323162112 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001495987963891675, + "loss": 2.5763, + "theoretical_loss": 3.388431271430965, + "tokens_seen": 2323227648 + }, + { + "epoch": 7.07, + "objective/train/docs_used": 2543997, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.434436321258545, + "objective/train/theoretical_loss": 3.388429337222628, + "objective/train/tokens_used": 2343704032, + "theoretical_loss": 3.388429337222628, + "tokens_seen": 2323244032 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001495887662988967, + "loss": 2.3956, + "theoretical_loss": 3.3884235347023726, + "tokens_seen": 2323293184 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001495787362086259, + "loss": 2.3757, + "theoretical_loss": 3.388415798253121, + "tokens_seen": 2323358720 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014956870611835507, + "loss": 2.7765, + "theoretical_loss": 3.388408062083193, + "tokens_seen": 2323424256 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014955867602808425, + "loss": 2.689, + "theoretical_loss": 3.3884003261925706, + "tokens_seen": 2323489792 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014954864593781344, + "loss": 2.5666, + "theoretical_loss": 3.388392590581236, + "tokens_seen": 2323555328 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014953861584754264, + "loss": 2.5719, + "theoretical_loss": 3.388384855249171, + "tokens_seen": 2323620864 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014952858575727182, + "loss": 2.5691, + "theoretical_loss": 3.3883771201963575, + "tokens_seen": 2323686400 + }, + { + "epoch": 7.07, + "learning_rate": 0.000149518555667001, + "loss": 2.4406, + "theoretical_loss": 3.3883693854227777, + "tokens_seen": 2323751936 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001495085255767302, + "loss": 2.6408, + "theoretical_loss": 3.3883616509284136, + "tokens_seen": 2323817472 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001494984954864594, + "loss": 2.4157, + "theoretical_loss": 3.388353916713247, + "tokens_seen": 2323883008 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014948846539618858, + "loss": 2.7102, + "theoretical_loss": 3.388346182777261, + "tokens_seen": 2323948544 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014947843530591776, + "loss": 2.4586, + "theoretical_loss": 3.3883384491204365, + "tokens_seen": 2324014080 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014946840521564694, + "loss": 2.4947, + "theoretical_loss": 3.3883307157427556, + "tokens_seen": 2324079616 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014945837512537615, + "loss": 2.5521, + "theoretical_loss": 3.388322982644201, + "tokens_seen": 2324145152 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014944834503510533, + "loss": 2.7966, + "theoretical_loss": 3.3883152498247546, + "tokens_seen": 2324210688 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001494383149448345, + "loss": 2.767, + "theoretical_loss": 3.388307517284398, + "tokens_seen": 2324276224 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001494282848545637, + "loss": 2.526, + "theoretical_loss": 3.388299785023114, + "tokens_seen": 2324341760 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014941825476429287, + "loss": 2.5058, + "theoretical_loss": 3.3882920530408835, + "tokens_seen": 2324407296 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014940822467402208, + "loss": 2.4447, + "theoretical_loss": 3.3882843213376903, + "tokens_seen": 2324472832 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014939819458375126, + "loss": 2.6909, + "theoretical_loss": 3.3882765899135148, + "tokens_seen": 2324538368 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014938816449348044, + "loss": 2.6542, + "theoretical_loss": 3.3882688587683396, + "tokens_seen": 2324603904 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014937813440320962, + "loss": 2.512, + "theoretical_loss": 3.388261127902147, + "tokens_seen": 2324669440 + }, + { + "epoch": 7.07, + "learning_rate": 0.00014936810431293883, + "loss": 2.3659, + "theoretical_loss": 3.3882533973149194, + "tokens_seen": 2324734976 + }, + { + "epoch": 7.07, + "learning_rate": 0.000149358074222668, + "loss": 2.3323, + "theoretical_loss": 3.388245667006638, + "tokens_seen": 2324800512 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001493480441323972, + "loss": 2.5828, + "theoretical_loss": 3.3882379369772853, + "tokens_seen": 2324866048 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2544441, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7727210521698, + "objective/train/theoretical_loss": 3.3882360045135274, + "objective/train/tokens_used": 2345342432, + "theoretical_loss": 3.3882360045135274, + "tokens_seen": 2324882432 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014933801404212637, + "loss": 2.624, + "theoretical_loss": 3.3882302072268433, + "tokens_seen": 2324931584 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014932798395185556, + "loss": 2.5175, + "theoretical_loss": 3.388222477755294, + "tokens_seen": 2324997120 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014931795386158476, + "loss": 2.3707, + "theoretical_loss": 3.38821474856262, + "tokens_seen": 2325062656 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014930792377131394, + "loss": 2.2929, + "theoretical_loss": 3.388207019648803, + "tokens_seen": 2325128192 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014929789368104313, + "loss": 2.4374, + "theoretical_loss": 3.3881992910138248, + "tokens_seen": 2325193728 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001492878635907723, + "loss": 2.4956, + "theoretical_loss": 3.388191562657668, + "tokens_seen": 2325259264 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014927783350050152, + "loss": 2.5793, + "theoretical_loss": 3.388183834580314, + "tokens_seen": 2325324800 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001492678034102307, + "loss": 2.6036, + "theoretical_loss": 3.3881761067817457, + "tokens_seen": 2325390336 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014925777331995988, + "loss": 2.5484, + "theoretical_loss": 3.388168379261945, + "tokens_seen": 2325455872 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014924774322968906, + "loss": 2.5221, + "theoretical_loss": 3.3881606520208933, + "tokens_seen": 2325521408 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014923771313941824, + "loss": 2.6571, + "theoretical_loss": 3.3881529250585736, + "tokens_seen": 2325586944 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014922768304914745, + "loss": 2.4433, + "theoretical_loss": 3.388145198374967, + "tokens_seen": 2325652480 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014921765295887663, + "loss": 2.4772, + "theoretical_loss": 3.3881374719700568, + "tokens_seen": 2325718016 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001492076228686058, + "loss": 2.6488, + "theoretical_loss": 3.388129745843824, + "tokens_seen": 2325783552 + }, + { + "epoch": 7.08, + "learning_rate": 0.000149197592778335, + "loss": 2.765, + "theoretical_loss": 3.3881220199962514, + "tokens_seen": 2325849088 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001491875626880642, + "loss": 2.2501, + "theoretical_loss": 3.388114294427321, + "tokens_seen": 2325914624 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014917753259779338, + "loss": 2.4681, + "theoretical_loss": 3.3881065691370145, + "tokens_seen": 2325980160 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014916750250752256, + "loss": 2.6719, + "theoretical_loss": 3.3880988441253144, + "tokens_seen": 2326045696 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014915747241725174, + "loss": 2.3992, + "theoretical_loss": 3.3880911193922025, + "tokens_seen": 2326111232 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014914744232698092, + "loss": 2.5526, + "theoretical_loss": 3.388083394937661, + "tokens_seen": 2326176768 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014913741223671013, + "loss": 2.4569, + "theoretical_loss": 3.3880756707616726, + "tokens_seen": 2326242304 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001491273821464393, + "loss": 2.4819, + "theoretical_loss": 3.3880679468642185, + "tokens_seen": 2326307840 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001491173520561685, + "loss": 2.4418, + "theoretical_loss": 3.388060223245281, + "tokens_seen": 2326373376 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014910732196589768, + "loss": 2.4223, + "theoretical_loss": 3.388052499904843, + "tokens_seen": 2326438912 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014909729187562688, + "loss": 2.6099, + "theoretical_loss": 3.3880447768428854, + "tokens_seen": 2326504448 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2545728, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.887075662612915, + "objective/train/theoretical_loss": 3.3880428461209067, + "objective/train/tokens_used": 2346980832, + "theoretical_loss": 3.3880428461209067, + "tokens_seen": 2326520832 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014908726178535607, + "loss": 2.5388, + "theoretical_loss": 3.3880370540593914, + "tokens_seen": 2326569984 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014907723169508525, + "loss": 2.7626, + "theoretical_loss": 3.3880293315543426, + "tokens_seen": 2326635520 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014906720160481443, + "loss": 2.5315, + "theoretical_loss": 3.388021609327721, + "tokens_seen": 2326701056 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001490571715145436, + "loss": 2.5467, + "theoretical_loss": 3.3880138873795085, + "tokens_seen": 2326766592 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014904714142427282, + "loss": 2.6448, + "theoretical_loss": 3.388006165709688, + "tokens_seen": 2326832128 + }, + { + "epoch": 7.08, + "learning_rate": 0.000149037111334002, + "loss": 2.5908, + "theoretical_loss": 3.3879984443182414, + "tokens_seen": 2326897664 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014902708124373118, + "loss": 2.644, + "theoretical_loss": 3.3879907232051503, + "tokens_seen": 2326963200 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001490170511534604, + "loss": 2.7328, + "theoretical_loss": 3.3879830023703974, + "tokens_seen": 2327028736 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001490070210631896, + "loss": 2.6188, + "theoretical_loss": 3.3879752818139646, + "tokens_seen": 2327094272 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014899699097291878, + "loss": 2.609, + "theoretical_loss": 3.387967561535834, + "tokens_seen": 2327159808 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014898696088264796, + "loss": 2.7502, + "theoretical_loss": 3.387959841535988, + "tokens_seen": 2327225344 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014897693079237714, + "loss": 2.3968, + "theoretical_loss": 3.387952121814408, + "tokens_seen": 2327290880 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014896690070210635, + "loss": 2.7974, + "theoretical_loss": 3.387944402371077, + "tokens_seen": 2327356416 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014895687061183553, + "loss": 2.5138, + "theoretical_loss": 3.387936683205977, + "tokens_seen": 2327421952 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001489468405215647, + "loss": 2.615, + "theoretical_loss": 3.387928964319089, + "tokens_seen": 2327487488 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001489368104312939, + "loss": 2.4109, + "theoretical_loss": 3.387921245710397, + "tokens_seen": 2327553024 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014892678034102307, + "loss": 2.5488, + "theoretical_loss": 3.3879135273798817, + "tokens_seen": 2327618560 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014891675025075228, + "loss": 2.3907, + "theoretical_loss": 3.387905809327526, + "tokens_seen": 2327684096 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014890672016048146, + "loss": 2.5178, + "theoretical_loss": 3.3878980915533115, + "tokens_seen": 2327749632 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014889669007021064, + "loss": 2.7289, + "theoretical_loss": 3.387890374057221, + "tokens_seen": 2327815168 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014888665997993982, + "loss": 2.3899, + "theoretical_loss": 3.3878826568392357, + "tokens_seen": 2327880704 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014887662988966903, + "loss": 2.5658, + "theoretical_loss": 3.387874939899339, + "tokens_seen": 2327946240 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001488665997993982, + "loss": 2.6668, + "theoretical_loss": 3.387867223237512, + "tokens_seen": 2328011776 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001488565697091274, + "loss": 2.6277, + "theoretical_loss": 3.3878595068537374, + "tokens_seen": 2328077312 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014884653961885657, + "loss": 2.4785, + "theoretical_loss": 3.387851790747997, + "tokens_seen": 2328142848 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2546540, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.597400426864624, + "objective/train/theoretical_loss": 3.3878498617650026, + "objective/train/tokens_used": 2348619232, + "theoretical_loss": 3.3878498617650026, + "tokens_seen": 2328159232 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014883650952858576, + "loss": 2.6908, + "theoretical_loss": 3.387844074920273, + "tokens_seen": 2328208384 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014882647943831496, + "loss": 2.5599, + "theoretical_loss": 3.3878363593705476, + "tokens_seen": 2328273920 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014881644934804415, + "loss": 2.3932, + "theoretical_loss": 3.3878286440988035, + "tokens_seen": 2328339456 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014880641925777333, + "loss": 2.6704, + "theoretical_loss": 3.3878209291050223, + "tokens_seen": 2328404992 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001487963891675025, + "loss": 2.6783, + "theoretical_loss": 3.387813214389186, + "tokens_seen": 2328470528 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014878635907723172, + "loss": 2.5596, + "theoretical_loss": 3.387805499951277, + "tokens_seen": 2328536064 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001487763289869609, + "loss": 2.4627, + "theoretical_loss": 3.387797785791278, + "tokens_seen": 2328601600 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014876629889669008, + "loss": 2.6367, + "theoretical_loss": 3.3877900719091705, + "tokens_seen": 2328667136 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014875626880641926, + "loss": 2.6874, + "theoretical_loss": 3.3877823583049365, + "tokens_seen": 2328732672 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014874623871614844, + "loss": 2.6673, + "theoretical_loss": 3.387774644978559, + "tokens_seen": 2328798208 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014873620862587765, + "loss": 2.5535, + "theoretical_loss": 3.3877669319300194, + "tokens_seen": 2328863744 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014872617853560683, + "loss": 2.6282, + "theoretical_loss": 3.3877592191593, + "tokens_seen": 2328929280 + }, + { + "epoch": 7.08, + "learning_rate": 0.000148716148445336, + "loss": 2.5624, + "theoretical_loss": 3.3877515066663832, + "tokens_seen": 2328994816 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001487061183550652, + "loss": 2.4081, + "theoretical_loss": 3.3877437944512514, + "tokens_seen": 2329060352 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001486960882647944, + "loss": 2.4536, + "theoretical_loss": 3.3877360825138862, + "tokens_seen": 2329125888 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014868605817452358, + "loss": 2.6028, + "theoretical_loss": 3.38772837085427, + "tokens_seen": 2329191424 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014867602808425276, + "loss": 2.4756, + "theoretical_loss": 3.3877206594723854, + "tokens_seen": 2329256960 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014866599799398194, + "loss": 2.5223, + "theoretical_loss": 3.387712948368214, + "tokens_seen": 2329322496 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014865596790371112, + "loss": 2.7597, + "theoretical_loss": 3.3877052375417382, + "tokens_seen": 2329388032 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014864593781344033, + "loss": 2.4855, + "theoretical_loss": 3.3876975269929406, + "tokens_seen": 2329453568 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014863590772316951, + "loss": 2.7701, + "theoretical_loss": 3.3876898167218026, + "tokens_seen": 2329519104 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001486258776328987, + "loss": 2.4992, + "theoretical_loss": 3.387682106728307, + "tokens_seen": 2329584640 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014861584754262788, + "loss": 2.6285, + "theoretical_loss": 3.3876743970124354, + "tokens_seen": 2329650176 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014860581745235708, + "loss": 2.513, + "theoretical_loss": 3.3876666875741708, + "tokens_seen": 2329715712 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014859578736208627, + "loss": 2.5486, + "theoretical_loss": 3.3876589784134947, + "tokens_seen": 2329781248 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2547809, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.430067300796509, + "objective/train/theoretical_loss": 3.3876570511666966, + "objective/train/tokens_used": 2350257632, + "theoretical_loss": 3.3876570511666966, + "tokens_seen": 2329797632 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014858575727181545, + "loss": 2.5092, + "theoretical_loss": 3.3876512695303895, + "tokens_seen": 2329846784 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014857572718154463, + "loss": 2.6256, + "theoretical_loss": 3.387643560924838, + "tokens_seen": 2329912320 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001485656970912738, + "loss": 2.4763, + "theoretical_loss": 3.387635852596821, + "tokens_seen": 2329977856 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014855566700100302, + "loss": 2.4083, + "theoretical_loss": 3.387628144546322, + "tokens_seen": 2330043392 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001485456369107322, + "loss": 2.6019, + "theoretical_loss": 3.387620436773323, + "tokens_seen": 2330108928 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014853560682046138, + "loss": 2.4662, + "theoretical_loss": 3.387612729277806, + "tokens_seen": 2330174464 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014852557673019056, + "loss": 2.5539, + "theoretical_loss": 3.3876050220597524, + "tokens_seen": 2330240000 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014851554663991977, + "loss": 2.5429, + "theoretical_loss": 3.387597315119146, + "tokens_seen": 2330305536 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014850551654964895, + "loss": 2.7006, + "theoretical_loss": 3.387589608455968, + "tokens_seen": 2330371072 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014849548645937813, + "loss": 2.5583, + "theoretical_loss": 3.3875819020702007, + "tokens_seen": 2330436608 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001484854563691073, + "loss": 2.6269, + "theoretical_loss": 3.387574195961826, + "tokens_seen": 2330502144 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001484754262788365, + "loss": 2.4441, + "theoretical_loss": 3.3875664901308276, + "tokens_seen": 2330567680 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001484653961885657, + "loss": 2.654, + "theoretical_loss": 3.3875587845771857, + "tokens_seen": 2330633216 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014845536609829488, + "loss": 2.4892, + "theoretical_loss": 3.3875510793008843, + "tokens_seen": 2330698752 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014844533600802406, + "loss": 2.4773, + "theoretical_loss": 3.387543374301904, + "tokens_seen": 2330764288 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014843530591775324, + "loss": 2.6203, + "theoretical_loss": 3.3875356695802283, + "tokens_seen": 2330829824 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014842527582748245, + "loss": 2.6316, + "theoretical_loss": 3.3875279651358388, + "tokens_seen": 2330895360 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014841524573721163, + "loss": 2.5141, + "theoretical_loss": 3.387520260968718, + "tokens_seen": 2330960896 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014840521564694081, + "loss": 2.6174, + "theoretical_loss": 3.3875125570788476, + "tokens_seen": 2331026432 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014839518555667, + "loss": 2.2199, + "theoretical_loss": 3.3875048534662104, + "tokens_seen": 2331091968 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001483851554663992, + "loss": 2.7224, + "theoretical_loss": 3.3874971501307884, + "tokens_seen": 2331157504 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014837512537612839, + "loss": 2.5373, + "theoretical_loss": 3.3874894470725643, + "tokens_seen": 2331223040 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014836509528585757, + "loss": 2.619, + "theoretical_loss": 3.3874817442915197, + "tokens_seen": 2331288576 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014835506519558675, + "loss": 2.6754, + "theoretical_loss": 3.387474041787637, + "tokens_seen": 2331354112 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014834503510531593, + "loss": 2.5688, + "theoretical_loss": 3.3874663395608984, + "tokens_seen": 2331419648 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2548511, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5613021850585938, + "objective/train/theoretical_loss": 3.3874644140475154, + "objective/train/tokens_used": 2351896032, + "theoretical_loss": 3.3874644140475154, + "tokens_seen": 2331436032 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014833500501504514, + "loss": 2.6417, + "theoretical_loss": 3.387458637611286, + "tokens_seen": 2331485184 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014832497492477432, + "loss": 2.5907, + "theoretical_loss": 3.3874509359387828, + "tokens_seen": 2331550720 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001483149448345035, + "loss": 2.7458, + "theoretical_loss": 3.3874432345433703, + "tokens_seen": 2331616256 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014830491474423268, + "loss": 2.433, + "theoretical_loss": 3.387435533425031, + "tokens_seen": 2331681792 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001482948846539619, + "loss": 2.4464, + "theoretical_loss": 3.3874278325837466, + "tokens_seen": 2331747328 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014828485456369107, + "loss": 2.3877, + "theoretical_loss": 3.3874201320195003, + "tokens_seen": 2331812864 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014827482447342025, + "loss": 2.6217, + "theoretical_loss": 3.387412431732274, + "tokens_seen": 2331878400 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014826479438314946, + "loss": 2.4545, + "theoretical_loss": 3.3874047317220493, + "tokens_seen": 2331943936 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014825476429287864, + "loss": 2.4029, + "theoretical_loss": 3.3873970319888094, + "tokens_seen": 2332009472 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014824473420260785, + "loss": 2.4007, + "theoretical_loss": 3.387389332532536, + "tokens_seen": 2332075008 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014823470411233703, + "loss": 2.693, + "theoretical_loss": 3.3873816333532116, + "tokens_seen": 2332140544 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001482246740220662, + "loss": 2.5893, + "theoretical_loss": 3.3873739344508182, + "tokens_seen": 2332206080 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001482146439317954, + "loss": 2.4238, + "theoretical_loss": 3.3873662358253385, + "tokens_seen": 2332271616 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001482046138415246, + "loss": 2.5253, + "theoretical_loss": 3.387358537476754, + "tokens_seen": 2332337152 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014819458375125378, + "loss": 2.4827, + "theoretical_loss": 3.3873508394050478, + "tokens_seen": 2332402688 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014818455366098296, + "loss": 2.5384, + "theoretical_loss": 3.3873431416102013, + "tokens_seen": 2332468224 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014817452357071214, + "loss": 2.6515, + "theoretical_loss": 3.387335444092198, + "tokens_seen": 2332533760 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014816449348044132, + "loss": 2.5892, + "theoretical_loss": 3.387327746851019, + "tokens_seen": 2332599296 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014815446339017053, + "loss": 2.709, + "theoretical_loss": 3.3873200498866467, + "tokens_seen": 2332664832 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014814443329989971, + "loss": 2.2626, + "theoretical_loss": 3.3873123531990643, + "tokens_seen": 2332730368 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001481344032096289, + "loss": 2.6588, + "theoretical_loss": 3.387304656788253, + "tokens_seen": 2332795904 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014812437311935808, + "loss": 2.7088, + "theoretical_loss": 3.3872969606541954, + "tokens_seen": 2332861440 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014811434302908728, + "loss": 2.3859, + "theoretical_loss": 3.387289264796874, + "tokens_seen": 2332926976 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014810431293881647, + "loss": 2.4651, + "theoretical_loss": 3.387281569216271, + "tokens_seen": 2332992512 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014809428284854565, + "loss": 2.713, + "theoretical_loss": 3.3872738739123687, + "tokens_seen": 2333058048 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2550091, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.815211534500122, + "objective/train/theoretical_loss": 3.387271950129626, + "objective/train/tokens_used": 2353534432, + "theoretical_loss": 3.387271950129626, + "tokens_seen": 2333074432 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014808425275827483, + "loss": 2.548, + "theoretical_loss": 3.3872661788851497, + "tokens_seen": 2333123584 + }, + { + "epoch": 7.08, + "learning_rate": 0.000148074222668004, + "loss": 2.5499, + "theoretical_loss": 3.387258484134595, + "tokens_seen": 2333189120 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014806419257773322, + "loss": 2.6349, + "theoretical_loss": 3.3872507896606883, + "tokens_seen": 2333254656 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001480541624874624, + "loss": 2.4596, + "theoretical_loss": 3.387243095463411, + "tokens_seen": 2333320192 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014804413239719158, + "loss": 2.4498, + "theoretical_loss": 3.387235401542746, + "tokens_seen": 2333385728 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014803410230692076, + "loss": 2.5036, + "theoretical_loss": 3.3872277078986754, + "tokens_seen": 2333451264 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014802407221664997, + "loss": 2.4502, + "theoretical_loss": 3.3872200145311817, + "tokens_seen": 2333516800 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014801404212637915, + "loss": 2.7349, + "theoretical_loss": 3.3872123214402463, + "tokens_seen": 2333582336 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014800401203610833, + "loss": 2.5058, + "theoretical_loss": 3.3872046286258524, + "tokens_seen": 2333647872 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001479939819458375, + "loss": 2.5251, + "theoretical_loss": 3.3871969360879817, + "tokens_seen": 2333713408 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001479839518555667, + "loss": 2.5893, + "theoretical_loss": 3.387189243826617, + "tokens_seen": 2333778944 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001479739217652959, + "loss": 2.5492, + "theoretical_loss": 3.3871815518417403, + "tokens_seen": 2333844480 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014796389167502508, + "loss": 2.4726, + "theoretical_loss": 3.387173860133334, + "tokens_seen": 2333910016 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014795386158475426, + "loss": 2.3618, + "theoretical_loss": 3.3871661687013805, + "tokens_seen": 2333975552 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014794383149448344, + "loss": 2.6773, + "theoretical_loss": 3.387158477545862, + "tokens_seen": 2334041088 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014793380140421265, + "loss": 2.6481, + "theoretical_loss": 3.3871507866667607, + "tokens_seen": 2334106624 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014792377131394183, + "loss": 2.3666, + "theoretical_loss": 3.387143096064059, + "tokens_seen": 2334172160 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014791374122367102, + "loss": 2.6593, + "theoretical_loss": 3.387135405737739, + "tokens_seen": 2334237696 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001479037111334002, + "loss": 2.5572, + "theoretical_loss": 3.3871277156877833, + "tokens_seen": 2334303232 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001478936810431294, + "loss": 2.6448, + "theoretical_loss": 3.3871200259141743, + "tokens_seen": 2334368768 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014788365095285859, + "loss": 2.6002, + "theoretical_loss": 3.387112336416894, + "tokens_seen": 2334434304 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014787362086258777, + "loss": 2.6434, + "theoretical_loss": 3.3871046471959247, + "tokens_seen": 2334499840 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014786359077231695, + "loss": 2.5189, + "theoretical_loss": 3.387096958251249, + "tokens_seen": 2334565376 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014785356068204613, + "loss": 2.6074, + "theoretical_loss": 3.387089269582849, + "tokens_seen": 2334630912 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014784353059177534, + "loss": 2.4641, + "theoretical_loss": 3.3870815811907073, + "tokens_seen": 2334696448 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2550606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.713803291320801, + "objective/train/theoretical_loss": 3.3870796591358348, + "objective/train/tokens_used": 2355172832, + "theoretical_loss": 3.3870796591358348, + "tokens_seen": 2334712832 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014783350050150452, + "loss": 2.6161, + "theoretical_loss": 3.3870738930748057, + "tokens_seen": 2334761984 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001478234704112337, + "loss": 2.6399, + "theoretical_loss": 3.3870662052351266, + "tokens_seen": 2334827520 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014781344032096288, + "loss": 2.6785, + "theoretical_loss": 3.3870585176716532, + "tokens_seen": 2334893056 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001478034102306921, + "loss": 2.436, + "theoretical_loss": 3.387050830384367, + "tokens_seen": 2334958592 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014779338014042127, + "loss": 2.6425, + "theoretical_loss": 3.3870431433732504, + "tokens_seen": 2335024128 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014778335005015045, + "loss": 2.4532, + "theoretical_loss": 3.3870354566382854, + "tokens_seen": 2335089664 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014777331995987963, + "loss": 2.4668, + "theoretical_loss": 3.3870277701794556, + "tokens_seen": 2335155200 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001477632898696088, + "loss": 2.5156, + "theoretical_loss": 3.3870200839967417, + "tokens_seen": 2335220736 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014775325977933802, + "loss": 2.5824, + "theoretical_loss": 3.3870123980901274, + "tokens_seen": 2335286272 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001477432296890672, + "loss": 2.2948, + "theoretical_loss": 3.387004712459594, + "tokens_seen": 2335351808 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014773319959879638, + "loss": 2.5296, + "theoretical_loss": 3.3869970271051244, + "tokens_seen": 2335417344 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014772316950852556, + "loss": 2.3835, + "theoretical_loss": 3.386989342026701, + "tokens_seen": 2335482880 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014771313941825477, + "loss": 2.5329, + "theoretical_loss": 3.386981657224306, + "tokens_seen": 2335548416 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014770310932798395, + "loss": 2.5395, + "theoretical_loss": 3.3869739726979216, + "tokens_seen": 2335613952 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014769307923771314, + "loss": 2.4231, + "theoretical_loss": 3.3869662884475304, + "tokens_seen": 2335679488 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014768304914744232, + "loss": 2.5335, + "theoretical_loss": 3.386958604473114, + "tokens_seen": 2335745024 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001476730190571715, + "loss": 2.5473, + "theoretical_loss": 3.386950920774656, + "tokens_seen": 2335810560 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001476629889669007, + "loss": 2.3125, + "theoretical_loss": 3.386943237352138, + "tokens_seen": 2335876096 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001476529588766299, + "loss": 2.5878, + "theoretical_loss": 3.386935554205542, + "tokens_seen": 2335941632 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014764292878635907, + "loss": 2.4437, + "theoretical_loss": 3.3869278713348514, + "tokens_seen": 2336007168 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014763289869608825, + "loss": 2.58, + "theoretical_loss": 3.3869201887400475, + "tokens_seen": 2336072704 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014762286860581746, + "loss": 2.5531, + "theoretical_loss": 3.3869125064211136, + "tokens_seen": 2336138240 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014761283851554664, + "loss": 2.4391, + "theoretical_loss": 3.386904824378031, + "tokens_seen": 2336203776 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014760280842527582, + "loss": 2.3444, + "theoretical_loss": 3.386897142610783, + "tokens_seen": 2336269312 + }, + { + "epoch": 7.08, + "learning_rate": 0.000147592778335005, + "loss": 2.585, + "theoretical_loss": 3.386889461119351, + "tokens_seen": 2336334848 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2551933, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6039879322052, + "objective/train/theoretical_loss": 3.3868875407895875, + "objective/train/tokens_used": 2356811232, + "theoretical_loss": 3.3868875407895875, + "tokens_seen": 2336351232 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014758274824473418, + "loss": 2.6065, + "theoretical_loss": 3.3868817799037183, + "tokens_seen": 2336400384 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001475727181544634, + "loss": 2.437, + "theoretical_loss": 3.386874098963867, + "tokens_seen": 2336465920 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014756268806419257, + "loss": 2.5642, + "theoretical_loss": 3.386866418299779, + "tokens_seen": 2336531456 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014755265797392175, + "loss": 2.5701, + "theoretical_loss": 3.3868587379114374, + "tokens_seen": 2336596992 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014754262788365093, + "loss": 2.5634, + "theoretical_loss": 3.386851057798824, + "tokens_seen": 2336662528 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014753259779338014, + "loss": 2.4819, + "theoretical_loss": 3.3868433779619216, + "tokens_seen": 2336728064 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014752256770310932, + "loss": 2.4594, + "theoretical_loss": 3.386835698400712, + "tokens_seen": 2336793600 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014751253761283853, + "loss": 2.4609, + "theoretical_loss": 3.386828019115178, + "tokens_seen": 2336859136 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001475025075225677, + "loss": 2.4215, + "theoretical_loss": 3.386820340105302, + "tokens_seen": 2336924672 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001474924774322969, + "loss": 2.4348, + "theoretical_loss": 3.386812661371066, + "tokens_seen": 2336990208 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001474824473420261, + "loss": 2.2851, + "theoretical_loss": 3.3868049829124525, + "tokens_seen": 2337055744 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014747241725175528, + "loss": 2.5601, + "theoretical_loss": 3.3867973047294444, + "tokens_seen": 2337121280 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014746238716148446, + "loss": 2.8247, + "theoretical_loss": 3.3867896268220234, + "tokens_seen": 2337186816 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014745235707121364, + "loss": 2.6791, + "theoretical_loss": 3.386781949190172, + "tokens_seen": 2337252352 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014744232698094285, + "loss": 2.7733, + "theoretical_loss": 3.386774271833873, + "tokens_seen": 2337317888 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014743229689067203, + "loss": 2.3898, + "theoretical_loss": 3.3867665947531087, + "tokens_seen": 2337383424 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014742226680040122, + "loss": 2.2727, + "theoretical_loss": 3.3867589179478608, + "tokens_seen": 2337448960 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001474122367101304, + "loss": 2.5065, + "theoretical_loss": 3.3867512414181125, + "tokens_seen": 2337514496 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001474022066198596, + "loss": 2.2996, + "theoretical_loss": 3.386743565163846, + "tokens_seen": 2337580032 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014739217652958879, + "loss": 2.7209, + "theoretical_loss": 3.386735889185043, + "tokens_seen": 2337645568 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014738214643931797, + "loss": 2.4113, + "theoretical_loss": 3.386728213481687, + "tokens_seen": 2337711104 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014737211634904715, + "loss": 2.7011, + "theoretical_loss": 3.38672053805376, + "tokens_seen": 2337776640 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014736208625877633, + "loss": 2.4663, + "theoretical_loss": 3.386712862901244, + "tokens_seen": 2337842176 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014735205616850554, + "loss": 2.3659, + "theoretical_loss": 3.3867051880241217, + "tokens_seen": 2337907712 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014734202607823472, + "loss": 2.7348, + "theoretical_loss": 3.3866975134223756, + "tokens_seen": 2337973248 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2552453, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.304367780685425, + "objective/train/theoretical_loss": 3.3866955948149644, + "objective/train/tokens_used": 2358449632, + "theoretical_loss": 3.3866955948149644, + "tokens_seen": 2337989632 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001473319959879639, + "loss": 2.4949, + "theoretical_loss": 3.386689839095988, + "tokens_seen": 2338038784 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014732196589769308, + "loss": 2.5417, + "theoretical_loss": 3.386682165044941, + "tokens_seen": 2338104320 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001473119358074223, + "loss": 2.7562, + "theoretical_loss": 3.386674491269217, + "tokens_seen": 2338169856 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014730190571715147, + "loss": 2.6826, + "theoretical_loss": 3.3866668177687993, + "tokens_seen": 2338235392 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014729187562688065, + "loss": 2.6086, + "theoretical_loss": 3.3866591445436693, + "tokens_seen": 2338300928 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014728184553660983, + "loss": 2.5815, + "theoretical_loss": 3.38665147159381, + "tokens_seen": 2338366464 + }, + { + "epoch": 7.08, + "learning_rate": 0.000147271815446339, + "loss": 2.486, + "theoretical_loss": 3.386643798919203, + "tokens_seen": 2338432000 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014726178535606822, + "loss": 2.4016, + "theoretical_loss": 3.386636126519832, + "tokens_seen": 2338497536 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001472517552657974, + "loss": 2.3867, + "theoretical_loss": 3.3866284543956784, + "tokens_seen": 2338563072 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014724172517552658, + "loss": 2.6857, + "theoretical_loss": 3.386620782546725, + "tokens_seen": 2338628608 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014723169508525577, + "loss": 2.6145, + "theoretical_loss": 3.386613110972954, + "tokens_seen": 2338694144 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014722166499498497, + "loss": 2.727, + "theoretical_loss": 3.3866054396743484, + "tokens_seen": 2338759680 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014721163490471415, + "loss": 2.3976, + "theoretical_loss": 3.3865977686508897, + "tokens_seen": 2338825216 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014720160481444334, + "loss": 2.557, + "theoretical_loss": 3.386590097902561, + "tokens_seen": 2338890752 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014719157472417252, + "loss": 2.4824, + "theoretical_loss": 3.386582427429345, + "tokens_seen": 2338956288 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001471815446339017, + "loss": 2.7339, + "theoretical_loss": 3.386574757231223, + "tokens_seen": 2339021824 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001471715145436309, + "loss": 2.324, + "theoretical_loss": 3.386567087308179, + "tokens_seen": 2339087360 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001471614844533601, + "loss": 2.6031, + "theoretical_loss": 3.386559417660193, + "tokens_seen": 2339152896 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014715145436308927, + "loss": 2.6237, + "theoretical_loss": 3.3865517482872503, + "tokens_seen": 2339218432 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014714142427281845, + "loss": 2.4506, + "theoretical_loss": 3.3865440791893313, + "tokens_seen": 2339283968 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014713139418254766, + "loss": 2.3732, + "theoretical_loss": 3.3865364103664195, + "tokens_seen": 2339349504 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014712136409227684, + "loss": 2.6548, + "theoretical_loss": 3.3865287418184966, + "tokens_seen": 2339415040 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014711133400200602, + "loss": 2.4095, + "theoretical_loss": 3.3865210735455453, + "tokens_seen": 2339480576 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001471013039117352, + "loss": 2.6433, + "theoretical_loss": 3.3865134055475483, + "tokens_seen": 2339546112 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014709127382146438, + "loss": 2.4369, + "theoretical_loss": 3.3865057378244883, + "tokens_seen": 2339611648 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2553515, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3645668029785156, + "objective/train/theoretical_loss": 3.38650382093668, + "objective/train/tokens_used": 2360088032, + "theoretical_loss": 3.38650382093668, + "tokens_seen": 2339628032 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001470812437311936, + "loss": 2.5471, + "theoretical_loss": 3.386498070376347, + "tokens_seen": 2339677184 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014707121364092277, + "loss": 2.6719, + "theoretical_loss": 3.3864904032031067, + "tokens_seen": 2339742720 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014706118355065195, + "loss": 2.415, + "theoretical_loss": 3.3864827363047505, + "tokens_seen": 2339808256 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014705115346038113, + "loss": 2.4285, + "theoretical_loss": 3.386475069681261, + "tokens_seen": 2339873792 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014704112337011034, + "loss": 2.6311, + "theoretical_loss": 3.3864674033326203, + "tokens_seen": 2339939328 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014703109327983952, + "loss": 2.5565, + "theoretical_loss": 3.3864597372588103, + "tokens_seen": 2340004864 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001470210631895687, + "loss": 2.4431, + "theoretical_loss": 3.3864520714598143, + "tokens_seen": 2340070400 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014701103309929789, + "loss": 2.2054, + "theoretical_loss": 3.3864444059356145, + "tokens_seen": 2340135936 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014700100300902707, + "loss": 2.5754, + "theoretical_loss": 3.386436740686193, + "tokens_seen": 2340201472 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014699097291875627, + "loss": 2.5354, + "theoretical_loss": 3.386429075711533, + "tokens_seen": 2340267008 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014698094282848546, + "loss": 2.3781, + "theoretical_loss": 3.386421411011616, + "tokens_seen": 2340332544 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014697091273821464, + "loss": 2.4615, + "theoretical_loss": 3.3864137465864257, + "tokens_seen": 2340398080 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014696088264794382, + "loss": 2.4367, + "theoretical_loss": 3.386406082435943, + "tokens_seen": 2340463616 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014695085255767303, + "loss": 2.3836, + "theoretical_loss": 3.3863984185601517, + "tokens_seen": 2340529152 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001469408224674022, + "loss": 2.6525, + "theoretical_loss": 3.3863907549590335, + "tokens_seen": 2340594688 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001469307923771314, + "loss": 2.3676, + "theoretical_loss": 3.3863830916325712, + "tokens_seen": 2340660224 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014692076228686057, + "loss": 2.6384, + "theoretical_loss": 3.386375428580747, + "tokens_seen": 2340725760 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014691073219658975, + "loss": 2.5897, + "theoretical_loss": 3.386367765803544, + "tokens_seen": 2340791296 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014690070210631896, + "loss": 2.4887, + "theoretical_loss": 3.3863601033009436, + "tokens_seen": 2340856832 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014689067201604814, + "loss": 2.3817, + "theoretical_loss": 3.3863524410729293, + "tokens_seen": 2340922368 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014688064192577732, + "loss": 2.3065, + "theoretical_loss": 3.3863447791194834, + "tokens_seen": 2340987904 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001468706118355065, + "loss": 2.3932, + "theoretical_loss": 3.3863371174405876, + "tokens_seen": 2341053440 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001468605817452357, + "loss": 2.3674, + "theoretical_loss": 3.386329456036225, + "tokens_seen": 2341118976 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001468505516549649, + "loss": 2.5387, + "theoretical_loss": 3.386321794906378, + "tokens_seen": 2341184512 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014684052156469407, + "loss": 2.3953, + "theoretical_loss": 3.3863141340510294, + "tokens_seen": 2341250048 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2554117, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5200552940368652, + "objective/train/theoretical_loss": 3.3863122188800805, + "objective/train/tokens_used": 2361726432, + "theoretical_loss": 3.3863122188800805, + "tokens_seen": 2341266432 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014683049147442325, + "loss": 2.4023, + "theoretical_loss": 3.386306473470161, + "tokens_seen": 2341315584 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014682046138415243, + "loss": 2.2706, + "theoretical_loss": 3.386298813163756, + "tokens_seen": 2341381120 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014681043129388164, + "loss": 2.6533, + "theoretical_loss": 3.386291153131796, + "tokens_seen": 2341446656 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014680040120361082, + "loss": 2.4997, + "theoretical_loss": 3.3862834933742647, + "tokens_seen": 2341512192 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014679037111334, + "loss": 2.6555, + "theoretical_loss": 3.3862758338911436, + "tokens_seen": 2341577728 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001467803410230692, + "loss": 2.3033, + "theoretical_loss": 3.3862681746824155, + "tokens_seen": 2341643264 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014677031093279842, + "loss": 2.3541, + "theoretical_loss": 3.3862605157480625, + "tokens_seen": 2341708800 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001467602808425276, + "loss": 2.4468, + "theoretical_loss": 3.386252857088068, + "tokens_seen": 2341774336 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014675025075225678, + "loss": 2.364, + "theoretical_loss": 3.3862451987024134, + "tokens_seen": 2341839872 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014674022066198597, + "loss": 2.6473, + "theoretical_loss": 3.3862375405910825, + "tokens_seen": 2341905408 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014673019057171517, + "loss": 2.42, + "theoretical_loss": 3.3862298827540567, + "tokens_seen": 2341970944 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014672016048144435, + "loss": 2.3738, + "theoretical_loss": 3.3862222251913185, + "tokens_seen": 2342036480 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014671013039117354, + "loss": 2.6367, + "theoretical_loss": 3.3862145679028512, + "tokens_seen": 2342102016 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014670010030090272, + "loss": 2.2179, + "theoretical_loss": 3.3862069108886366, + "tokens_seen": 2342167552 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001466900702106319, + "loss": 2.6099, + "theoretical_loss": 3.3861992541486576, + "tokens_seen": 2342233088 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001466800401203611, + "loss": 2.4544, + "theoretical_loss": 3.3861915976828962, + "tokens_seen": 2342298624 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001466700100300903, + "loss": 2.4488, + "theoretical_loss": 3.386183941491336, + "tokens_seen": 2342364160 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014665997993981947, + "loss": 2.4424, + "theoretical_loss": 3.386176285573958, + "tokens_seen": 2342429696 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014664994984954865, + "loss": 2.2303, + "theoretical_loss": 3.3861686299307463, + "tokens_seen": 2342495232 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014663991975927786, + "loss": 2.3041, + "theoretical_loss": 3.3861609745616823, + "tokens_seen": 2342560768 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014662988966900704, + "loss": 2.4863, + "theoretical_loss": 3.3861533194667484, + "tokens_seen": 2342626304 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014661985957873622, + "loss": 2.5438, + "theoretical_loss": 3.386145664645928, + "tokens_seen": 2342691840 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001466098294884654, + "loss": 2.4595, + "theoretical_loss": 3.386138010099203, + "tokens_seen": 2342757376 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014659979939819458, + "loss": 2.3983, + "theoretical_loss": 3.3861303558265563, + "tokens_seen": 2342822912 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001465897693079238, + "loss": 2.495, + "theoretical_loss": 3.38612270182797, + "tokens_seen": 2342888448 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2555288, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7491304874420166, + "objective/train/theoretical_loss": 3.3861207883711435, + "objective/train/tokens_used": 2363364832, + "theoretical_loss": 3.3861207883711435, + "tokens_seen": 2342904832 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014657973921765297, + "loss": 2.4445, + "theoretical_loss": 3.386115048103427, + "tokens_seen": 2342953984 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014656970912738215, + "loss": 2.4605, + "theoretical_loss": 3.386107394652909, + "tokens_seen": 2343019520 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014655967903711133, + "loss": 2.6751, + "theoretical_loss": 3.3860997414763996, + "tokens_seen": 2343085056 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014654964894684054, + "loss": 2.49, + "theoretical_loss": 3.386092088573881, + "tokens_seen": 2343150592 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014653961885656972, + "loss": 2.7323, + "theoretical_loss": 3.3860844359453353, + "tokens_seen": 2343216128 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001465295887662989, + "loss": 2.413, + "theoretical_loss": 3.3860767835907457, + "tokens_seen": 2343281664 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014651955867602809, + "loss": 2.5624, + "theoretical_loss": 3.3860691315100944, + "tokens_seen": 2343347200 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014650952858575727, + "loss": 2.5455, + "theoretical_loss": 3.3860614797033635, + "tokens_seen": 2343412736 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014649949849548647, + "loss": 2.5293, + "theoretical_loss": 3.386053828170536, + "tokens_seen": 2343478272 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014648946840521566, + "loss": 2.5651, + "theoretical_loss": 3.3860461769115946, + "tokens_seen": 2343543808 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014647943831494484, + "loss": 2.4748, + "theoretical_loss": 3.3860385259265215, + "tokens_seen": 2343609344 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014646940822467402, + "loss": 2.3689, + "theoretical_loss": 3.3860308752152997, + "tokens_seen": 2343674880 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014645937813440323, + "loss": 2.6834, + "theoretical_loss": 3.386023224777911, + "tokens_seen": 2343740416 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001464493480441324, + "loss": 2.4252, + "theoretical_loss": 3.3860155746143388, + "tokens_seen": 2343805952 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001464393179538616, + "loss": 2.6639, + "theoretical_loss": 3.3860079247245647, + "tokens_seen": 2343871488 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014642928786359077, + "loss": 2.7539, + "theoretical_loss": 3.3860002751085716, + "tokens_seen": 2343937024 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014641925777331995, + "loss": 2.4088, + "theoretical_loss": 3.3859926257663426, + "tokens_seen": 2344002560 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014640922768304916, + "loss": 2.276, + "theoretical_loss": 3.3859849766978596, + "tokens_seen": 2344068096 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014639919759277834, + "loss": 2.3293, + "theoretical_loss": 3.3859773279031056, + "tokens_seen": 2344133632 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014638916750250752, + "loss": 2.4218, + "theoretical_loss": 3.385969679382063, + "tokens_seen": 2344199168 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001463791374122367, + "loss": 2.5344, + "theoretical_loss": 3.3859620311347136, + "tokens_seen": 2344264704 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001463691073219659, + "loss": 2.5969, + "theoretical_loss": 3.3859543831610415, + "tokens_seen": 2344330240 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001463590772316951, + "loss": 2.5207, + "theoretical_loss": 3.3859467354610278, + "tokens_seen": 2344395776 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014634904714142427, + "loss": 2.5571, + "theoretical_loss": 3.3859390880346556, + "tokens_seen": 2344461312 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014633901705115345, + "loss": 2.4629, + "theoretical_loss": 3.385931440881908, + "tokens_seen": 2344526848 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2555974, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2718100547790527, + "objective/train/theoretical_loss": 3.385929529136473, + "objective/train/tokens_used": 2365003232, + "theoretical_loss": 3.385929529136473, + "tokens_seen": 2344543232 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014632898696088264, + "loss": 2.4535, + "theoretical_loss": 3.3859237940027667, + "tokens_seen": 2344592384 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014631895687061184, + "loss": 2.5948, + "theoretical_loss": 3.385916147397215, + "tokens_seen": 2344657920 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014630892678034102, + "loss": 2.5481, + "theoretical_loss": 3.3859085010652348, + "tokens_seen": 2344723456 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001462988966900702, + "loss": 2.4887, + "theoretical_loss": 3.385900855006809, + "tokens_seen": 2344788992 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001462888665997994, + "loss": 2.6158, + "theoretical_loss": 3.38589320922192, + "tokens_seen": 2344854528 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001462788365095286, + "loss": 2.4898, + "theoretical_loss": 3.3858855637105507, + "tokens_seen": 2344920064 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014626880641925778, + "loss": 2.3966, + "theoretical_loss": 3.3858779184726835, + "tokens_seen": 2344985600 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014625877632898696, + "loss": 2.5251, + "theoretical_loss": 3.385870273508301, + "tokens_seen": 2345051136 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014624874623871614, + "loss": 2.5592, + "theoretical_loss": 3.3858626288173856, + "tokens_seen": 2345116672 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014623871614844535, + "loss": 2.536, + "theoretical_loss": 3.38585498439992, + "tokens_seen": 2345182208 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014622868605817453, + "loss": 2.7096, + "theoretical_loss": 3.3858473402558866, + "tokens_seen": 2345247744 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001462186559679037, + "loss": 2.581, + "theoretical_loss": 3.3858396963852684, + "tokens_seen": 2345313280 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001462086258776329, + "loss": 2.7019, + "theoretical_loss": 3.385832052788048, + "tokens_seen": 2345378816 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014619859578736207, + "loss": 2.4993, + "theoretical_loss": 3.385824409464207, + "tokens_seen": 2345444352 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014618856569709128, + "loss": 2.7609, + "theoretical_loss": 3.385816766413729, + "tokens_seen": 2345509888 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014617853560682046, + "loss": 2.5574, + "theoretical_loss": 3.3858091236365966, + "tokens_seen": 2345575424 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014616850551654964, + "loss": 2.5604, + "theoretical_loss": 3.385801481132792, + "tokens_seen": 2345640960 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014615847542627882, + "loss": 2.4002, + "theoretical_loss": 3.3857938389022975, + "tokens_seen": 2345706496 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014614844533600803, + "loss": 2.5081, + "theoretical_loss": 3.3857861969450966, + "tokens_seen": 2345772032 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001461384152457372, + "loss": 2.3, + "theoretical_loss": 3.385778555261171, + "tokens_seen": 2345837568 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001461283851554664, + "loss": 2.4966, + "theoretical_loss": 3.3857709138505037, + "tokens_seen": 2345903104 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014611835506519557, + "loss": 2.5144, + "theoretical_loss": 3.3857632727130773, + "tokens_seen": 2345968640 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014610832497492476, + "loss": 2.4091, + "theoretical_loss": 3.385755631848874, + "tokens_seen": 2346034176 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014609829488465396, + "loss": 2.3472, + "theoretical_loss": 3.385747991257877, + "tokens_seen": 2346099712 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014608826479438314, + "loss": 2.5817, + "theoretical_loss": 3.3857403509400688, + "tokens_seen": 2346165248 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2557243, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.81683087348938, + "objective/train/theoretical_loss": 3.3857384409033005, + "objective/train/tokens_used": 2366641632, + "theoretical_loss": 3.3857384409033005, + "tokens_seen": 2346181632 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014607823470411233, + "loss": 2.4441, + "theoretical_loss": 3.3857327108954314, + "tokens_seen": 2346230784 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001460682046138415, + "loss": 2.5174, + "theoretical_loss": 3.3857250711239484, + "tokens_seen": 2346296320 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014605817452357072, + "loss": 2.3606, + "theoretical_loss": 3.3857174316256016, + "tokens_seen": 2346361856 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001460481444332999, + "loss": 2.6204, + "theoretical_loss": 3.3857097924003736, + "tokens_seen": 2346427392 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014603811434302908, + "loss": 2.4616, + "theoretical_loss": 3.385702153448247, + "tokens_seen": 2346492928 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014602808425275826, + "loss": 2.6192, + "theoretical_loss": 3.3856945147692055, + "tokens_seen": 2346558464 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014601805416248747, + "loss": 2.6572, + "theoretical_loss": 3.3856868763632306, + "tokens_seen": 2346624000 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014600802407221667, + "loss": 2.4623, + "theoretical_loss": 3.385679238230305, + "tokens_seen": 2346689536 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014599799398194586, + "loss": 2.4528, + "theoretical_loss": 3.3856716003704115, + "tokens_seen": 2346755072 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014598796389167504, + "loss": 2.5355, + "theoretical_loss": 3.385663962783533, + "tokens_seen": 2346820608 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014597793380140422, + "loss": 2.6841, + "theoretical_loss": 3.3856563254696512, + "tokens_seen": 2346886144 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014596790371113343, + "loss": 2.5761, + "theoretical_loss": 3.38564868842875, + "tokens_seen": 2346951680 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001459578736208626, + "loss": 2.5321, + "theoretical_loss": 3.3856410516608113, + "tokens_seen": 2347017216 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001459478435305918, + "loss": 2.5285, + "theoretical_loss": 3.3856334151658176, + "tokens_seen": 2347082752 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014593781344032097, + "loss": 2.5947, + "theoretical_loss": 3.385625778943752, + "tokens_seen": 2347148288 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014592778335005015, + "loss": 2.6455, + "theoretical_loss": 3.385618142994596, + "tokens_seen": 2347213824 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014591775325977936, + "loss": 2.4594, + "theoretical_loss": 3.385610507318334, + "tokens_seen": 2347279360 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014590772316950854, + "loss": 2.5381, + "theoretical_loss": 3.3856028719149474, + "tokens_seen": 2347344896 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014589769307923772, + "loss": 2.5603, + "theoretical_loss": 3.385595236784419, + "tokens_seen": 2347410432 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001458876629889669, + "loss": 2.5438, + "theoretical_loss": 3.3855876019267317, + "tokens_seen": 2347475968 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001458776328986961, + "loss": 2.337, + "theoretical_loss": 3.3855799673418683, + "tokens_seen": 2347541504 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001458676028084253, + "loss": 2.4882, + "theoretical_loss": 3.3855723330298106, + "tokens_seen": 2347607040 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014585757271815447, + "loss": 2.5354, + "theoretical_loss": 3.385564698990542, + "tokens_seen": 2347672576 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014584754262788365, + "loss": 2.5916, + "theoretical_loss": 3.385557065224045, + "tokens_seen": 2347738112 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014583751253761284, + "loss": 2.5581, + "theoretical_loss": 3.385549431730302, + "tokens_seen": 2347803648 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2557830, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.239716053009033, + "objective/train/theoretical_loss": 3.385547523399482, + "objective/train/tokens_used": 2368280032, + "theoretical_loss": 3.385547523399482, + "tokens_seen": 2347820032 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014582748244734204, + "loss": 2.5643, + "theoretical_loss": 3.385541798509296, + "tokens_seen": 2347869184 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014581745235707122, + "loss": 2.3792, + "theoretical_loss": 3.385534165561009, + "tokens_seen": 2347934720 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001458074222668004, + "loss": 2.5834, + "theoretical_loss": 3.3855265328854243, + "tokens_seen": 2348000256 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001457973921765296, + "loss": 2.4596, + "theoretical_loss": 3.3855189004825244, + "tokens_seen": 2348065792 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001457873620862588, + "loss": 2.4648, + "theoretical_loss": 3.385511268352292, + "tokens_seen": 2348131328 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014577733199598798, + "loss": 2.5768, + "theoretical_loss": 3.3855036364947093, + "tokens_seen": 2348196864 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014576730190571716, + "loss": 2.3404, + "theoretical_loss": 3.3854960049097595, + "tokens_seen": 2348262400 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014575727181544634, + "loss": 2.739, + "theoretical_loss": 3.385488373597425, + "tokens_seen": 2348327936 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014574724172517555, + "loss": 2.3923, + "theoretical_loss": 3.3854807425576885, + "tokens_seen": 2348393472 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014573721163490473, + "loss": 2.5571, + "theoretical_loss": 3.3854731117905326, + "tokens_seen": 2348459008 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001457271815446339, + "loss": 2.6432, + "theoretical_loss": 3.38546548129594, + "tokens_seen": 2348524544 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001457171514543631, + "loss": 2.7706, + "theoretical_loss": 3.385457851073893, + "tokens_seen": 2348590080 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014570712136409227, + "loss": 2.1881, + "theoretical_loss": 3.385450221124375, + "tokens_seen": 2348655616 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014569709127382148, + "loss": 2.2887, + "theoretical_loss": 3.3854425914473683, + "tokens_seen": 2348721152 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014568706118355066, + "loss": 2.503, + "theoretical_loss": 3.385434962042855, + "tokens_seen": 2348786688 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014567703109327984, + "loss": 2.5665, + "theoretical_loss": 3.385427332910819, + "tokens_seen": 2348852224 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014566700100300902, + "loss": 2.4613, + "theoretical_loss": 3.385419704051242, + "tokens_seen": 2348917760 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014565697091273823, + "loss": 2.3118, + "theoretical_loss": 3.3854120754641066, + "tokens_seen": 2348983296 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001456469408224674, + "loss": 2.4444, + "theoretical_loss": 3.3854044471493965, + "tokens_seen": 2349048832 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001456369107321966, + "loss": 2.2566, + "theoretical_loss": 3.385396819107093, + "tokens_seen": 2349114368 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014562688064192577, + "loss": 2.2578, + "theoretical_loss": 3.38538919133718, + "tokens_seen": 2349179904 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014561685055165496, + "loss": 2.4534, + "theoretical_loss": 3.385381563839639, + "tokens_seen": 2349245440 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014560682046138416, + "loss": 2.4088, + "theoretical_loss": 3.385373936614454, + "tokens_seen": 2349310976 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014559679037111334, + "loss": 2.1887, + "theoretical_loss": 3.3853663096616065, + "tokens_seen": 2349376512 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014558676028084253, + "loss": 2.3834, + "theoretical_loss": 3.38535868298108, + "tokens_seen": 2349442048 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2558531, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9176032543182373, + "objective/train/theoretical_loss": 3.385356776353496, + "objective/train/tokens_used": 2369918432, + "theoretical_loss": 3.385356776353496, + "tokens_seen": 2349458432 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001455767301905717, + "loss": 2.5544, + "theoretical_loss": 3.3853510565728566, + "tokens_seen": 2349507584 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014556670010030092, + "loss": 2.3057, + "theoretical_loss": 3.385343430436919, + "tokens_seen": 2349573120 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001455566700100301, + "loss": 2.5806, + "theoretical_loss": 3.3853358045732502, + "tokens_seen": 2349638656 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014554663991975928, + "loss": 2.2102, + "theoretical_loss": 3.3853281789818332, + "tokens_seen": 2349704192 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014553660982948846, + "loss": 2.418, + "theoretical_loss": 3.38532055366265, + "tokens_seen": 2349769728 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014552657973921764, + "loss": 2.3733, + "theoretical_loss": 3.385312928615684, + "tokens_seen": 2349835264 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014551654964894685, + "loss": 2.5079, + "theoretical_loss": 3.3853053038409167, + "tokens_seen": 2349900800 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014550651955867603, + "loss": 2.443, + "theoretical_loss": 3.385297679338332, + "tokens_seen": 2349966336 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001454964894684052, + "loss": 2.439, + "theoretical_loss": 3.385290055107912, + "tokens_seen": 2350031872 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001454864593781344, + "loss": 2.6226, + "theoretical_loss": 3.3852824311496392, + "tokens_seen": 2350097408 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001454764292878636, + "loss": 2.1795, + "theoretical_loss": 3.3852748074634973, + "tokens_seen": 2350162944 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014546639919759278, + "loss": 2.4174, + "theoretical_loss": 3.385267184049468, + "tokens_seen": 2350228480 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014545636910732196, + "loss": 2.4657, + "theoretical_loss": 3.385259560907534, + "tokens_seen": 2350294016 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014544633901705114, + "loss": 2.2965, + "theoretical_loss": 3.385251938037679, + "tokens_seen": 2350359552 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014543630892678032, + "loss": 2.4683, + "theoretical_loss": 3.385244315439885, + "tokens_seen": 2350425088 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014542627883650953, + "loss": 2.4073, + "theoretical_loss": 3.3852366931141344, + "tokens_seen": 2350490624 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001454162487462387, + "loss": 2.2495, + "theoretical_loss": 3.3852290710604103, + "tokens_seen": 2350556160 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001454062186559679, + "loss": 2.4413, + "theoretical_loss": 3.385221449278695, + "tokens_seen": 2350621696 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014539618856569708, + "loss": 2.4781, + "theoretical_loss": 3.3852138277689723, + "tokens_seen": 2350687232 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014538615847542628, + "loss": 2.4844, + "theoretical_loss": 3.385206206531224, + "tokens_seen": 2350752768 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014537612838515546, + "loss": 2.6601, + "theoretical_loss": 3.3851985855654325, + "tokens_seen": 2350818304 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014536609829488465, + "loss": 2.4229, + "theoretical_loss": 3.3851909648715814, + "tokens_seen": 2350883840 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014535606820461383, + "loss": 2.2444, + "theoretical_loss": 3.3851833444496533, + "tokens_seen": 2350949376 + }, + { + "epoch": 7.08, + "learning_rate": 0.000145346038114343, + "loss": 2.446, + "theoretical_loss": 3.3851757242996303, + "tokens_seen": 2351014912 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014533600802407222, + "loss": 2.3002, + "theoretical_loss": 3.385168104421495, + "tokens_seen": 2351080448 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2559853, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0985100269317627, + "objective/train/theoretical_loss": 3.385166199494442, + "objective/train/tokens_used": 2371556832, + "theoretical_loss": 3.385166199494442, + "tokens_seen": 2351096832 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001453259779338014, + "loss": 2.3364, + "theoretical_loss": 3.3851604848152315, + "tokens_seen": 2351145984 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014531594784353058, + "loss": 2.5333, + "theoretical_loss": 3.385152865480821, + "tokens_seen": 2351211520 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014530591775325976, + "loss": 2.1738, + "theoretical_loss": 3.3851452464182468, + "tokens_seen": 2351277056 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014529588766298897, + "loss": 2.3737, + "theoretical_loss": 3.3851376276274916, + "tokens_seen": 2351342592 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014528585757271815, + "loss": 2.5295, + "theoretical_loss": 3.3851300091085386, + "tokens_seen": 2351408128 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014527582748244733, + "loss": 2.5224, + "theoretical_loss": 3.38512239086137, + "tokens_seen": 2351473664 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014526579739217654, + "loss": 2.4263, + "theoretical_loss": 3.3851147728859683, + "tokens_seen": 2351539200 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014525576730190575, + "loss": 2.6158, + "theoretical_loss": 3.385107155182317, + "tokens_seen": 2351604736 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014524573721163493, + "loss": 2.3145, + "theoretical_loss": 3.385099537750398, + "tokens_seen": 2351670272 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001452357071213641, + "loss": 2.453, + "theoretical_loss": 3.3850919205901944, + "tokens_seen": 2351735808 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001452256770310933, + "loss": 2.1843, + "theoretical_loss": 3.3850843037016896, + "tokens_seen": 2351801344 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014521564694082247, + "loss": 2.3839, + "theoretical_loss": 3.385076687084865, + "tokens_seen": 2351866880 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014520561685055168, + "loss": 2.4656, + "theoretical_loss": 3.3850690707397044, + "tokens_seen": 2351932416 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014519558676028086, + "loss": 2.2877, + "theoretical_loss": 3.3850614546661904, + "tokens_seen": 2351997952 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014518555667001004, + "loss": 2.5786, + "theoretical_loss": 3.3850538388643048, + "tokens_seen": 2352063488 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014517552657973922, + "loss": 2.4282, + "theoretical_loss": 3.3850462233340317, + "tokens_seen": 2352129024 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014516549648946843, + "loss": 2.2506, + "theoretical_loss": 3.385038608075353, + "tokens_seen": 2352194560 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001451554663991976, + "loss": 2.2902, + "theoretical_loss": 3.385030993088252, + "tokens_seen": 2352260096 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001451454363089268, + "loss": 2.548, + "theoretical_loss": 3.3850233783727104, + "tokens_seen": 2352325632 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014513540621865597, + "loss": 2.1214, + "theoretical_loss": 3.3850157639287124, + "tokens_seen": 2352391168 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014512537612838516, + "loss": 2.3225, + "theoretical_loss": 3.3850081497562394, + "tokens_seen": 2352456704 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014511534603811436, + "loss": 2.0721, + "theoretical_loss": 3.3850005358552755, + "tokens_seen": 2352522240 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014510531594784354, + "loss": 2.5639, + "theoretical_loss": 3.384992922225802, + "tokens_seen": 2352587776 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014509528585757273, + "loss": 2.5819, + "theoretical_loss": 3.384985308867803, + "tokens_seen": 2352653312 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001450852557673019, + "loss": 2.6193, + "theoretical_loss": 3.3849776957812603, + "tokens_seen": 2352718848 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2562616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4599740505218506, + "objective/train/theoretical_loss": 3.3849757925520376, + "objective/train/tokens_used": 2373195232, + "theoretical_loss": 3.3849757925520376, + "tokens_seen": 2352735232 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014507522567703112, + "loss": 2.3136, + "theoretical_loss": 3.3849700829661566, + "tokens_seen": 2352784384 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001450651955867603, + "loss": 2.4143, + "theoretical_loss": 3.384962470422476, + "tokens_seen": 2352849920 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014505516549648948, + "loss": 2.3458, + "theoretical_loss": 3.3849548581502, + "tokens_seen": 2352915456 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014504513540621866, + "loss": 2.4936, + "theoretical_loss": 3.384947246149311, + "tokens_seen": 2352980992 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014503510531594784, + "loss": 2.1112, + "theoretical_loss": 3.3849396344197933, + "tokens_seen": 2353046528 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014502507522567705, + "loss": 2.5572, + "theoretical_loss": 3.384932022961628, + "tokens_seen": 2353112064 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014501504513540623, + "loss": 2.4787, + "theoretical_loss": 3.3849244117748, + "tokens_seen": 2353177600 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001450050150451354, + "loss": 2.5453, + "theoretical_loss": 3.3849168008592896, + "tokens_seen": 2353243136 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001449949849548646, + "loss": 2.5858, + "theoretical_loss": 3.3849091902150814, + "tokens_seen": 2353308672 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001449849548645938, + "loss": 2.4639, + "theoretical_loss": 3.384901579842157, + "tokens_seen": 2353374208 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014497492477432298, + "loss": 2.5014, + "theoretical_loss": 3.3848939697405003, + "tokens_seen": 2353439744 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014496489468405216, + "loss": 2.4915, + "theoretical_loss": 3.3848863599100927, + "tokens_seen": 2353505280 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014495486459378134, + "loss": 2.7909, + "theoretical_loss": 3.3848787503509183, + "tokens_seen": 2353570816 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014494483450351052, + "loss": 2.4591, + "theoretical_loss": 3.384871141062959, + "tokens_seen": 2353636352 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014493480441323973, + "loss": 2.1091, + "theoretical_loss": 3.3848635320461984, + "tokens_seen": 2353701888 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001449247743229689, + "loss": 2.5978, + "theoretical_loss": 3.384855923300618, + "tokens_seen": 2353767424 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001449147442326981, + "loss": 2.3508, + "theoretical_loss": 3.3848483148262023, + "tokens_seen": 2353832960 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014490471414242728, + "loss": 2.5857, + "theoretical_loss": 3.3848407066229327, + "tokens_seen": 2353898496 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014489468405215648, + "loss": 2.3177, + "theoretical_loss": 3.3848330986907924, + "tokens_seen": 2353964032 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014488465396188567, + "loss": 2.5487, + "theoretical_loss": 3.384825491029764, + "tokens_seen": 2354029568 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014487462387161485, + "loss": 2.4692, + "theoretical_loss": 3.384817883639831, + "tokens_seen": 2354095104 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014486459378134403, + "loss": 2.3894, + "theoretical_loss": 3.3848102765209758, + "tokens_seen": 2354160640 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001448545636910732, + "loss": 2.355, + "theoretical_loss": 3.384802669673181, + "tokens_seen": 2354226176 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014484453360080242, + "loss": 2.6068, + "theoretical_loss": 3.3847950630964294, + "tokens_seen": 2354291712 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001448345035105316, + "loss": 2.1843, + "theoretical_loss": 3.3847874567907037, + "tokens_seen": 2354357248 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.488801956176758, + "objective/train/theoretical_loss": 3.3847855552566184, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.3847855552566184, + "tokens_seen": 2354373632 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014482447342026078, + "loss": 2.6486, + "theoretical_loss": 3.3847798507559874, + "tokens_seen": 2354422784 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014481444332998996, + "loss": 2.4228, + "theoretical_loss": 3.3847722449922624, + "tokens_seen": 2354488320 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014480441323971917, + "loss": 2.4764, + "theoretical_loss": 3.384764639499512, + "tokens_seen": 2354553856 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014479438314944835, + "loss": 2.2356, + "theoretical_loss": 3.3847570342777193, + "tokens_seen": 2354619392 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014478435305917753, + "loss": 2.5091, + "theoretical_loss": 3.3847494293268663, + "tokens_seen": 2354684928 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001447743229689067, + "loss": 2.5221, + "theoretical_loss": 3.3847418246469365, + "tokens_seen": 2354750464 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001447642928786359, + "loss": 2.5452, + "theoretical_loss": 3.3847342202379123, + "tokens_seen": 2354816000 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001447542627883651, + "loss": 2.5316, + "theoretical_loss": 3.3847266160997767, + "tokens_seen": 2354881536 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014474423269809428, + "loss": 2.4454, + "theoretical_loss": 3.3847190122325124, + "tokens_seen": 2354947072 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014473420260782346, + "loss": 2.4978, + "theoretical_loss": 3.384711408636102, + "tokens_seen": 2355012608 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014472417251755264, + "loss": 2.4331, + "theoretical_loss": 3.3847038053105294, + "tokens_seen": 2355078144 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014471414242728185, + "loss": 2.5379, + "theoretical_loss": 3.384696202255776, + "tokens_seen": 2355143680 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014470411233701103, + "loss": 2.4783, + "theoretical_loss": 3.384688599471825, + "tokens_seen": 2355209216 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014469408224674021, + "loss": 2.5134, + "theoretical_loss": 3.38468099695866, + "tokens_seen": 2355274752 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001446840521564694, + "loss": 2.3384, + "theoretical_loss": 3.384673394716263, + "tokens_seen": 2355340288 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001446740220661986, + "loss": 2.3741, + "theoretical_loss": 3.3846657927446175, + "tokens_seen": 2355405824 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014466399197592779, + "loss": 2.4597, + "theoretical_loss": 3.3846581910437052, + "tokens_seen": 2355471360 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014465396188565697, + "loss": 2.4917, + "theoretical_loss": 3.38465058961351, + "tokens_seen": 2355536896 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014464393179538615, + "loss": 2.2207, + "theoretical_loss": 3.3846429884540146, + "tokens_seen": 2355602432 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014463390170511533, + "loss": 2.5431, + "theoretical_loss": 3.3846353875652015, + "tokens_seen": 2355667968 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014462387161484454, + "loss": 2.4424, + "theoretical_loss": 3.3846277869470534, + "tokens_seen": 2355733504 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014461384152457372, + "loss": 2.3128, + "theoretical_loss": 3.3846201865995535, + "tokens_seen": 2355799040 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001446038114343029, + "loss": 2.3052, + "theoretical_loss": 3.3846125865226844, + "tokens_seen": 2355864576 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014459378134403208, + "loss": 2.445, + "theoretical_loss": 3.3846049867164294, + "tokens_seen": 2355930112 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001445837512537613, + "loss": 2.3609, + "theoretical_loss": 3.3845973871807704, + "tokens_seen": 2355995648 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1623623371124268, + "objective/train/theoretical_loss": 3.3845954873391344, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.3845954873391344, + "tokens_seen": 2356012032 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014457372116349047, + "loss": 2.3103, + "theoretical_loss": 3.3845897879156914, + "tokens_seen": 2356061184 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014456369107321965, + "loss": 2.6258, + "theoretical_loss": 3.3845821889211742, + "tokens_seen": 2356126720 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014455366098294883, + "loss": 2.3515, + "theoretical_loss": 3.3845745901972024, + "tokens_seen": 2356192256 + }, + { + "epoch": 7.08, + "learning_rate": 0.000144543630892678, + "loss": 2.5641, + "theoretical_loss": 3.384566991743758, + "tokens_seen": 2356257792 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014453360080240722, + "loss": 2.4617, + "theoretical_loss": 3.384559393560825, + "tokens_seen": 2356323328 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001445235707121364, + "loss": 2.6776, + "theoretical_loss": 3.3845517956483855, + "tokens_seen": 2356388864 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001445135406218656, + "loss": 2.4877, + "theoretical_loss": 3.3845441980064224, + "tokens_seen": 2356454400 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001445035105315948, + "loss": 2.3741, + "theoretical_loss": 3.3845366006349185, + "tokens_seen": 2356519936 + }, + { + "epoch": 7.08, + "learning_rate": 0.000144493480441324, + "loss": 2.6604, + "theoretical_loss": 3.384529003533857, + "tokens_seen": 2356585472 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014448345035105318, + "loss": 2.3358, + "theoretical_loss": 3.38452140670322, + "tokens_seen": 2356651008 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014447342026078236, + "loss": 2.8054, + "theoretical_loss": 3.3845138101429915, + "tokens_seen": 2356716544 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014446339017051154, + "loss": 2.5538, + "theoretical_loss": 3.3845062138531534, + "tokens_seen": 2356782080 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014445336008024072, + "loss": 2.6153, + "theoretical_loss": 3.384498617833689, + "tokens_seen": 2356847616 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014444332998996993, + "loss": 2.4389, + "theoretical_loss": 3.384491022084581, + "tokens_seen": 2356913152 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014443329989969911, + "loss": 2.7364, + "theoretical_loss": 3.3844834266058124, + "tokens_seen": 2356978688 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001444232698094283, + "loss": 2.329, + "theoretical_loss": 3.3844758313973657, + "tokens_seen": 2357044224 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014441323971915748, + "loss": 2.5387, + "theoretical_loss": 3.3844682364592242, + "tokens_seen": 2357109760 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014440320962888668, + "loss": 2.5037, + "theoretical_loss": 3.384460641791371, + "tokens_seen": 2357175296 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014439317953861587, + "loss": 2.2891, + "theoretical_loss": 3.384453047393788, + "tokens_seen": 2357240832 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014438314944834505, + "loss": 2.4392, + "theoretical_loss": 3.384445453266459, + "tokens_seen": 2357306368 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014437311935807423, + "loss": 2.6206, + "theoretical_loss": 3.384437859409366, + "tokens_seen": 2357371904 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001443630892678034, + "loss": 2.3703, + "theoretical_loss": 3.384430265822493, + "tokens_seen": 2357437440 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014435305917753262, + "loss": 2.5676, + "theoretical_loss": 3.384422672505822, + "tokens_seen": 2357502976 + }, + { + "epoch": 7.08, + "learning_rate": 0.0001443430290872618, + "loss": 2.4799, + "theoretical_loss": 3.3844150794593357, + "tokens_seen": 2357568512 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014433299899699098, + "loss": 2.5969, + "theoretical_loss": 3.3844074866830183, + "tokens_seen": 2357634048 + }, + { + "epoch": 7.08, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4014029502868652, + "objective/train/theoretical_loss": 3.38440558853115, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.38440558853115, + "tokens_seen": 2357650432 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014432296890672016, + "loss": 2.3315, + "theoretical_loss": 3.384399894176851, + "tokens_seen": 2357699584 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014431293881644937, + "loss": 2.5823, + "theoretical_loss": 3.3843923019408177, + "tokens_seen": 2357765120 + }, + { + "epoch": 7.08, + "learning_rate": 0.00014430290872617855, + "loss": 2.3963, + "theoretical_loss": 3.3843847099749014, + "tokens_seen": 2357830656 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014429287863590773, + "loss": 2.3623, + "theoretical_loss": 3.384377118279084, + "tokens_seen": 2357896192 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001442828485456369, + "loss": 2.4904, + "theoretical_loss": 3.3843695268533494, + "tokens_seen": 2357961728 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001442728184553661, + "loss": 2.5022, + "theoretical_loss": 3.38436193569768, + "tokens_seen": 2358027264 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001442627883650953, + "loss": 2.2692, + "theoretical_loss": 3.384354344812059, + "tokens_seen": 2358092800 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014425275827482448, + "loss": 2.3866, + "theoretical_loss": 3.3843467541964687, + "tokens_seen": 2358158336 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014424272818455366, + "loss": 2.4576, + "theoretical_loss": 3.3843391638508926, + "tokens_seen": 2358223872 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014423269809428284, + "loss": 2.2776, + "theoretical_loss": 3.384331573775313, + "tokens_seen": 2358289408 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014422266800401205, + "loss": 2.4179, + "theoretical_loss": 3.3843239839697135, + "tokens_seen": 2358354944 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014421263791374123, + "loss": 2.296, + "theoretical_loss": 3.3843163944340766, + "tokens_seen": 2358420480 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014420260782347041, + "loss": 2.6926, + "theoretical_loss": 3.384308805168385, + "tokens_seen": 2358486016 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001441925777331996, + "loss": 2.3968, + "theoretical_loss": 3.3843012161726223, + "tokens_seen": 2358551552 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001441825476429288, + "loss": 2.3264, + "theoretical_loss": 3.3842936274467705, + "tokens_seen": 2358617088 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014417251755265799, + "loss": 2.3982, + "theoretical_loss": 3.384286038990813, + "tokens_seen": 2358682624 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014416248746238717, + "loss": 2.4979, + "theoretical_loss": 3.3842784508047328, + "tokens_seen": 2358748160 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014415245737211635, + "loss": 2.4862, + "theoretical_loss": 3.3842708628885125, + "tokens_seen": 2358813696 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014414242728184553, + "loss": 2.3421, + "theoretical_loss": 3.384263275242135, + "tokens_seen": 2358879232 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014413239719157474, + "loss": 2.3735, + "theoretical_loss": 3.3842556878655836, + "tokens_seen": 2358944768 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014412236710130392, + "loss": 2.483, + "theoretical_loss": 3.3842481007588407, + "tokens_seen": 2359010304 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001441123370110331, + "loss": 2.4013, + "theoretical_loss": 3.38424051392189, + "tokens_seen": 2359075840 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014410230692076228, + "loss": 2.5624, + "theoretical_loss": 3.3842329273547134, + "tokens_seen": 2359141376 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001440922768304915, + "loss": 2.5456, + "theoretical_loss": 3.3842253410572942, + "tokens_seen": 2359206912 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014408224674022067, + "loss": 2.5311, + "theoretical_loss": 3.3842177550296157, + "tokens_seen": 2359272448 + }, + { + "epoch": 7.09, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.11842679977417, + "objective/train/theoretical_loss": 3.384215858564841, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.384215858564841, + "tokens_seen": 2359288832 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014407221664994985, + "loss": 2.2973, + "theoretical_loss": 3.3842101692716606, + "tokens_seen": 2359337984 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014406218655967903, + "loss": 2.3519, + "theoretical_loss": 3.3842025837834115, + "tokens_seen": 2359403520 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001440521564694082, + "loss": 2.3563, + "theoretical_loss": 3.384194998564852, + "tokens_seen": 2359469056 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014404212637913742, + "loss": 2.1217, + "theoretical_loss": 3.384187413615964, + "tokens_seen": 2359534592 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001440320962888666, + "loss": 2.3242, + "theoretical_loss": 3.3841798289367313, + "tokens_seen": 2359600128 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014402206619859578, + "loss": 2.4586, + "theoretical_loss": 3.3841722445271367, + "tokens_seen": 2359665664 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014401203610832496, + "loss": 2.3193, + "theoretical_loss": 3.3841646603871625, + "tokens_seen": 2359731200 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014400200601805417, + "loss": 2.2926, + "theoretical_loss": 3.384157076516792, + "tokens_seen": 2359796736 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014399197592778335, + "loss": 2.5557, + "theoretical_loss": 3.3841494929160088, + "tokens_seen": 2359862272 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014398194583751254, + "loss": 2.4414, + "theoretical_loss": 3.384141909584795, + "tokens_seen": 2359927808 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014397191574724172, + "loss": 2.7824, + "theoretical_loss": 3.384134326523134, + "tokens_seen": 2359993344 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001439618856569709, + "loss": 2.451, + "theoretical_loss": 3.384126743731008, + "tokens_seen": 2360058880 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001439518555667001, + "loss": 2.4336, + "theoretical_loss": 3.3841191612084005, + "tokens_seen": 2360124416 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001439418254764293, + "loss": 2.359, + "theoretical_loss": 3.384111578955295, + "tokens_seen": 2360189952 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014393179538615847, + "loss": 2.5303, + "theoretical_loss": 3.3841039969716733, + "tokens_seen": 2360255488 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014392176529588765, + "loss": 2.1134, + "theoretical_loss": 3.384096415257519, + "tokens_seen": 2360321024 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014391173520561686, + "loss": 2.5346, + "theoretical_loss": 3.384088833812815, + "tokens_seen": 2360386560 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014390170511534604, + "loss": 2.4671, + "theoretical_loss": 3.384081252637544, + "tokens_seen": 2360452096 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014389167502507522, + "loss": 2.4262, + "theoretical_loss": 3.3840736717316893, + "tokens_seen": 2360517632 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001438816449348044, + "loss": 2.3891, + "theoretical_loss": 3.3840660910952334, + "tokens_seen": 2360583168 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014387161484453358, + "loss": 2.5559, + "theoretical_loss": 3.3840585107281593, + "tokens_seen": 2360648704 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001438615847542628, + "loss": 2.4287, + "theoretical_loss": 3.3840509306304507, + "tokens_seen": 2360714240 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014385155466399197, + "loss": 2.2124, + "theoretical_loss": 3.3840433508020897, + "tokens_seen": 2360779776 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014384152457372115, + "loss": 2.3001, + "theoretical_loss": 3.3840357712430595, + "tokens_seen": 2360845312 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014383149448345033, + "loss": 2.5738, + "theoretical_loss": 3.384028191953343, + "tokens_seen": 2360910848 + }, + { + "epoch": 7.09, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2777292728424072, + "objective/train/theoretical_loss": 3.3840262971729924, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.3840262971729924, + "tokens_seen": 2360927232 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014382146439317954, + "loss": 2.3141, + "theoretical_loss": 3.384020612932923, + "tokens_seen": 2360976384 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014381143430290872, + "loss": 2.8561, + "theoretical_loss": 3.384013034181783, + "tokens_seen": 2361041920 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001438014042126379, + "loss": 2.6427, + "theoretical_loss": 3.384005455699906, + "tokens_seen": 2361107456 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014379137412236708, + "loss": 2.3496, + "theoretical_loss": 3.3839978774872743, + "tokens_seen": 2361172992 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014378134403209627, + "loss": 2.6278, + "theoretical_loss": 3.3839902995438713, + "tokens_seen": 2361238528 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014377131394182547, + "loss": 2.4196, + "theoretical_loss": 3.3839827218696796, + "tokens_seen": 2361304064 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014376128385155468, + "loss": 2.5732, + "theoretical_loss": 3.383975144464683, + "tokens_seen": 2361369600 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014375125376128386, + "loss": 2.3683, + "theoretical_loss": 3.383967567328863, + "tokens_seen": 2361435136 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014374122367101304, + "loss": 2.4816, + "theoretical_loss": 3.383959990462204, + "tokens_seen": 2361500672 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014373119358074225, + "loss": 2.3109, + "theoretical_loss": 3.383952413864688, + "tokens_seen": 2361566208 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014372116349047143, + "loss": 2.5446, + "theoretical_loss": 3.383944837536299, + "tokens_seen": 2361631744 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014371113340020062, + "loss": 2.3256, + "theoretical_loss": 3.383937261477019, + "tokens_seen": 2361697280 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001437011033099298, + "loss": 2.4843, + "theoretical_loss": 3.3839296856868315, + "tokens_seen": 2361762816 + }, + { + "epoch": 7.09, + "learning_rate": 0.000143691073219659, + "loss": 2.5422, + "theoretical_loss": 3.3839221101657193, + "tokens_seen": 2361828352 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014368104312938819, + "loss": 2.6425, + "theoretical_loss": 3.3839145349136652, + "tokens_seen": 2361893888 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014367101303911737, + "loss": 2.3513, + "theoretical_loss": 3.3839069599306524, + "tokens_seen": 2361959424 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014366098294884655, + "loss": 2.379, + "theoretical_loss": 3.383899385216664, + "tokens_seen": 2362024960 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014365095285857573, + "loss": 2.4082, + "theoretical_loss": 3.3838918107716824, + "tokens_seen": 2362090496 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014364092276830494, + "loss": 2.5335, + "theoretical_loss": 3.3838842365956916, + "tokens_seen": 2362156032 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014363089267803412, + "loss": 2.502, + "theoretical_loss": 3.3838766626886736, + "tokens_seen": 2362221568 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001436208625877633, + "loss": 2.3968, + "theoretical_loss": 3.383869089050612, + "tokens_seen": 2362287104 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014361083249749248, + "loss": 2.4409, + "theoretical_loss": 3.383861515681489, + "tokens_seen": 2362352640 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001436008024072217, + "loss": 2.4261, + "theoretical_loss": 3.383853942581289, + "tokens_seen": 2362418176 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014359077231695087, + "loss": 2.47, + "theoretical_loss": 3.383846369749994, + "tokens_seen": 2362483712 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014358074222668005, + "loss": 2.4954, + "theoretical_loss": 3.3838387971875865, + "tokens_seen": 2362549248 + }, + { + "epoch": 7.09, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5691733360290527, + "objective/train/theoretical_loss": 3.3838369040889966, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.3838369040889966, + "tokens_seen": 2362565632 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014357071213640923, + "loss": 2.785, + "theoretical_loss": 3.3838312248940507, + "tokens_seen": 2362614784 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001435606820461384, + "loss": 2.3858, + "theoretical_loss": 3.383823652869369, + "tokens_seen": 2362680320 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014355065195586762, + "loss": 2.3972, + "theoretical_loss": 3.3838160811135243, + "tokens_seen": 2362745856 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001435406218655968, + "loss": 2.4823, + "theoretical_loss": 3.3838085096264994, + "tokens_seen": 2362811392 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014353059177532598, + "loss": 2.2848, + "theoretical_loss": 3.383800938408278, + "tokens_seen": 2362876928 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014352056168505516, + "loss": 2.6882, + "theoretical_loss": 3.3837933674588427, + "tokens_seen": 2362942464 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014351053159478437, + "loss": 2.3641, + "theoretical_loss": 3.3837857967781764, + "tokens_seen": 2363008000 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014350050150451355, + "loss": 2.512, + "theoretical_loss": 3.3837782263662626, + "tokens_seen": 2363073536 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014349047141424274, + "loss": 2.3148, + "theoretical_loss": 3.3837706562230836, + "tokens_seen": 2363139072 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014348044132397192, + "loss": 2.7684, + "theoretical_loss": 3.383763086348623, + "tokens_seen": 2363204608 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001434704112337011, + "loss": 2.3494, + "theoretical_loss": 3.3837555167428635, + "tokens_seen": 2363270144 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001434603811434303, + "loss": 2.1668, + "theoretical_loss": 3.383747947405788, + "tokens_seen": 2363335680 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001434503510531595, + "loss": 2.5332, + "theoretical_loss": 3.3837403783373796, + "tokens_seen": 2363401216 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014344032096288867, + "loss": 2.541, + "theoretical_loss": 3.3837328095376216, + "tokens_seen": 2363466752 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014343029087261785, + "loss": 2.1337, + "theoretical_loss": 3.3837252410064966, + "tokens_seen": 2363532288 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014342026078234706, + "loss": 2.3725, + "theoretical_loss": 3.3837176727439884, + "tokens_seen": 2363597824 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014341023069207624, + "loss": 2.4608, + "theoretical_loss": 3.3837101047500786, + "tokens_seen": 2363663360 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014340020060180542, + "loss": 2.3127, + "theoretical_loss": 3.3837025370247518, + "tokens_seen": 2363728896 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001433901705115346, + "loss": 2.4781, + "theoretical_loss": 3.38369496956799, + "tokens_seen": 2363794432 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014338014042126378, + "loss": 2.416, + "theoretical_loss": 3.3836874023797767, + "tokens_seen": 2363859968 + }, + { + "epoch": 7.09, + "learning_rate": 0.000143370110330993, + "loss": 2.2328, + "theoretical_loss": 3.3836798354600948, + "tokens_seen": 2363925504 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014336008024072217, + "loss": 2.5467, + "theoretical_loss": 3.383672268808927, + "tokens_seen": 2363991040 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014335005015045135, + "loss": 2.4174, + "theoretical_loss": 3.3836647024262563, + "tokens_seen": 2364056576 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014334002006018053, + "loss": 2.5976, + "theoretical_loss": 3.3836571363120664, + "tokens_seen": 2364122112 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014332998996990974, + "loss": 2.3718, + "theoretical_loss": 3.38364957046634, + "tokens_seen": 2364187648 + }, + { + "epoch": 7.09, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.046018362045288, + "objective/train/theoretical_loss": 3.3836476790468537, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.3836476790468537, + "tokens_seen": 2364204032 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014331995987963892, + "loss": 2.5956, + "theoretical_loss": 3.38364200488906, + "tokens_seen": 2364253184 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001433099297893681, + "loss": 2.498, + "theoretical_loss": 3.38363443958021, + "tokens_seen": 2364318720 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014329989969909728, + "loss": 2.5362, + "theoretical_loss": 3.3836268745397717, + "tokens_seen": 2364384256 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014328986960882647, + "loss": 2.4169, + "theoretical_loss": 3.3836193097677296, + "tokens_seen": 2364449792 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014327983951855567, + "loss": 2.2872, + "theoretical_loss": 3.3836117452640657, + "tokens_seen": 2364515328 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014326980942828486, + "loss": 2.3961, + "theoretical_loss": 3.383604181028764, + "tokens_seen": 2364580864 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014325977933801404, + "loss": 2.3635, + "theoretical_loss": 3.383596617061807, + "tokens_seen": 2364646400 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014324974924774322, + "loss": 2.395, + "theoretical_loss": 3.3835890533631776, + "tokens_seen": 2364711936 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014323971915747243, + "loss": 2.5451, + "theoretical_loss": 3.3835814899328587, + "tokens_seen": 2364777472 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001432296890672016, + "loss": 2.5197, + "theoretical_loss": 3.383573926770834, + "tokens_seen": 2364843008 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001432196589769308, + "loss": 2.4006, + "theoretical_loss": 3.383566363877086, + "tokens_seen": 2364908544 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014320962888665997, + "loss": 2.5925, + "theoretical_loss": 3.3835588012515982, + "tokens_seen": 2364974080 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014319959879638915, + "loss": 2.2661, + "theoretical_loss": 3.383551238894353, + "tokens_seen": 2365039616 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014318956870611836, + "loss": 2.4672, + "theoretical_loss": 3.3835436768053344, + "tokens_seen": 2365105152 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014317953861584754, + "loss": 2.4102, + "theoretical_loss": 3.3835361149845244, + "tokens_seen": 2365170688 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014316950852557672, + "loss": 2.6568, + "theoretical_loss": 3.383528553431907, + "tokens_seen": 2365236224 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001431594784353059, + "loss": 2.5223, + "theoretical_loss": 3.3835209921474645, + "tokens_seen": 2365301760 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001431494483450351, + "loss": 2.2289, + "theoretical_loss": 3.3835134311311803, + "tokens_seen": 2365367296 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001431394182547643, + "loss": 2.3503, + "theoretical_loss": 3.3835058703830376, + "tokens_seen": 2365432832 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014312938816449347, + "loss": 2.435, + "theoretical_loss": 3.3834983099030196, + "tokens_seen": 2365498368 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014311935807422265, + "loss": 2.5813, + "theoretical_loss": 3.3834907496911084, + "tokens_seen": 2365563904 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014310932798395183, + "loss": 2.3808, + "theoretical_loss": 3.383483189747288, + "tokens_seen": 2365629440 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014309929789368104, + "loss": 2.4484, + "theoretical_loss": 3.3834756300715414, + "tokens_seen": 2365694976 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014308926780341022, + "loss": 2.4883, + "theoretical_loss": 3.3834680706638514, + "tokens_seen": 2365760512 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001430792377131394, + "loss": 2.2465, + "theoretical_loss": 3.383460511524201, + "tokens_seen": 2365826048 + }, + { + "epoch": 7.09, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3637964725494385, + "objective/train/theoretical_loss": 3.383458621781168, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.383458621781168, + "tokens_seen": 2365842432 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014306920762286859, + "loss": 2.3286, + "theoretical_loss": 3.383452952652574, + "tokens_seen": 2365891584 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001430591775325978, + "loss": 2.4619, + "theoretical_loss": 3.383445394048952, + "tokens_seen": 2365957120 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014304914744232698, + "loss": 2.4294, + "theoretical_loss": 3.3834378357133197, + "tokens_seen": 2366022656 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014303911735205616, + "loss": 2.4587, + "theoretical_loss": 3.383430277645659, + "tokens_seen": 2366088192 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014302908726178534, + "loss": 2.3338, + "theoretical_loss": 3.3834227198459534, + "tokens_seen": 2366153728 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014301905717151455, + "loss": 2.4524, + "theoretical_loss": 3.383415162314186, + "tokens_seen": 2366219264 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014300902708124375, + "loss": 2.4262, + "theoretical_loss": 3.38340760505034, + "tokens_seen": 2366284800 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014299899699097294, + "loss": 2.4438, + "theoretical_loss": 3.3834000480543986, + "tokens_seen": 2366350336 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014298896690070212, + "loss": 2.3423, + "theoretical_loss": 3.3833924913263442, + "tokens_seen": 2366415872 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001429789368104313, + "loss": 2.4576, + "theoretical_loss": 3.3833849348661604, + "tokens_seen": 2366481408 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001429689067201605, + "loss": 2.3935, + "theoretical_loss": 3.3833773786738304, + "tokens_seen": 2366546944 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001429588766298897, + "loss": 2.2518, + "theoretical_loss": 3.383369822749337, + "tokens_seen": 2366612480 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014294884653961887, + "loss": 2.4052, + "theoretical_loss": 3.3833622670926635, + "tokens_seen": 2366678016 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014293881644934805, + "loss": 2.3474, + "theoretical_loss": 3.3833547117037925, + "tokens_seen": 2366743552 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014292878635907726, + "loss": 2.5982, + "theoretical_loss": 3.383347156582708, + "tokens_seen": 2366809088 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014291875626880644, + "loss": 2.3998, + "theoretical_loss": 3.383339601729392, + "tokens_seen": 2366874624 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014290872617853562, + "loss": 2.6129, + "theoretical_loss": 3.3833320471438286, + "tokens_seen": 2366940160 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001428986960882648, + "loss": 2.4532, + "theoretical_loss": 3.3833244928260005, + "tokens_seen": 2367005696 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014288866599799398, + "loss": 2.4912, + "theoretical_loss": 3.3833169387758906, + "tokens_seen": 2367071232 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001428786359077232, + "loss": 2.6395, + "theoretical_loss": 3.383309384993482, + "tokens_seen": 2367136768 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014286860581745237, + "loss": 2.4443, + "theoretical_loss": 3.383301831478758, + "tokens_seen": 2367202304 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014285857572718155, + "loss": 2.3009, + "theoretical_loss": 3.383294278231702, + "tokens_seen": 2367267840 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014284854563691073, + "loss": 2.4217, + "theoretical_loss": 3.3832867252522965, + "tokens_seen": 2367333376 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014283851554663994, + "loss": 2.2405, + "theoretical_loss": 3.383279172540525, + "tokens_seen": 2367398912 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014282848545636912, + "loss": 2.4317, + "theoretical_loss": 3.3832716200963704, + "tokens_seen": 2367464448 + }, + { + "epoch": 7.09, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.9446250200271606, + "objective/train/theoretical_loss": 3.383269732027145, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.383269732027145, + "tokens_seen": 2367480832 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001428184553660983, + "loss": 2.3474, + "theoretical_loss": 3.383264067919816, + "tokens_seen": 2367529984 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014280842527582749, + "loss": 2.5047, + "theoretical_loss": 3.3832565160108445, + "tokens_seen": 2367595520 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014279839518555667, + "loss": 2.441, + "theoretical_loss": 3.3832489643694394, + "tokens_seen": 2367661056 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014278836509528587, + "loss": 2.3817, + "theoretical_loss": 3.383241412995584, + "tokens_seen": 2367726592 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014277833500501506, + "loss": 2.5298, + "theoretical_loss": 3.383233861889261, + "tokens_seen": 2367792128 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014276830491474424, + "loss": 2.7486, + "theoretical_loss": 3.3832263110504535, + "tokens_seen": 2367857664 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014275827482447342, + "loss": 2.4127, + "theoretical_loss": 3.3832187604791453, + "tokens_seen": 2367923200 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014274824473420263, + "loss": 2.7197, + "theoretical_loss": 3.3832112101753187, + "tokens_seen": 2367988736 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001427382146439318, + "loss": 2.4403, + "theoretical_loss": 3.383203660138957, + "tokens_seen": 2368054272 + }, + { + "epoch": 7.09, + "learning_rate": 0.000142728184553661, + "loss": 2.6127, + "theoretical_loss": 3.3831961103700436, + "tokens_seen": 2368119808 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014271815446339017, + "loss": 2.2072, + "theoretical_loss": 3.383188560868561, + "tokens_seen": 2368185344 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014270812437311935, + "loss": 2.5667, + "theoretical_loss": 3.3831810116344934, + "tokens_seen": 2368250880 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014269809428284856, + "loss": 2.0819, + "theoretical_loss": 3.3831734626678234, + "tokens_seen": 2368316416 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014268806419257774, + "loss": 2.0837, + "theoretical_loss": 3.3831659139685337, + "tokens_seen": 2368381952 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014267803410230692, + "loss": 2.6641, + "theoretical_loss": 3.383158365536608, + "tokens_seen": 2368447488 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001426680040120361, + "loss": 2.3334, + "theoretical_loss": 3.3831508173720293, + "tokens_seen": 2368513024 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001426579739217653, + "loss": 2.2815, + "theoretical_loss": 3.3831432694747803, + "tokens_seen": 2368578560 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001426479438314945, + "loss": 2.4312, + "theoretical_loss": 3.3831357218448446, + "tokens_seen": 2368644096 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014263791374122367, + "loss": 2.2553, + "theoretical_loss": 3.3831281744822053, + "tokens_seen": 2368709632 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014262788365095285, + "loss": 2.5856, + "theoretical_loss": 3.3831206273868455, + "tokens_seen": 2368775168 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014261785356068203, + "loss": 2.5277, + "theoretical_loss": 3.3831130805587484, + "tokens_seen": 2368840704 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014260782347041124, + "loss": 2.4054, + "theoretical_loss": 3.383105533997897, + "tokens_seen": 2368906240 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014259779338014042, + "loss": 2.5678, + "theoretical_loss": 3.383097987704274, + "tokens_seen": 2368971776 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001425877632898696, + "loss": 2.2164, + "theoretical_loss": 3.3830904416778638, + "tokens_seen": 2369037312 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014257773319959879, + "loss": 2.4278, + "theoretical_loss": 3.3830828959186485, + "tokens_seen": 2369102848 + }, + { + "epoch": 7.09, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.61276912689209, + "objective/train/theoretical_loss": 3.383081009520592, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.383081009520592, + "tokens_seen": 2369119232 + }, + { + "epoch": 7.09, + "learning_rate": 0.000142567703109328, + "loss": 2.4485, + "theoretical_loss": 3.3830753504266116, + "tokens_seen": 2369168384 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014255767301905718, + "loss": 2.4781, + "theoretical_loss": 3.383067805201736, + "tokens_seen": 2369233920 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014254764292878636, + "loss": 2.4213, + "theoretical_loss": 3.383060260244005, + "tokens_seen": 2369299456 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014253761283851554, + "loss": 2.5139, + "theoretical_loss": 3.383052715553402, + "tokens_seen": 2369364992 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014252758274824475, + "loss": 2.0542, + "theoretical_loss": 3.38304517112991, + "tokens_seen": 2369430528 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014251755265797393, + "loss": 2.3047, + "theoretical_loss": 3.383037626973512, + "tokens_seen": 2369496064 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001425075225677031, + "loss": 2.5321, + "theoretical_loss": 3.383030083084191, + "tokens_seen": 2369561600 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001424974924774323, + "loss": 2.4881, + "theoretical_loss": 3.3830225394619307, + "tokens_seen": 2369627136 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014248746238716147, + "loss": 2.4157, + "theoretical_loss": 3.3830149961067137, + "tokens_seen": 2369692672 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014247743229689068, + "loss": 2.4336, + "theoretical_loss": 3.3830074530185237, + "tokens_seen": 2369758208 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014246740220661986, + "loss": 2.1947, + "theoretical_loss": 3.3829999101973436, + "tokens_seen": 2369823744 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014245737211634904, + "loss": 2.232, + "theoretical_loss": 3.3829923676431566, + "tokens_seen": 2369889280 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014244734202607822, + "loss": 2.2917, + "theoretical_loss": 3.382984825355946, + "tokens_seen": 2369954816 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014243731193580743, + "loss": 2.4158, + "theoretical_loss": 3.3829772833356944, + "tokens_seen": 2370020352 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001424272818455366, + "loss": 2.3232, + "theoretical_loss": 3.3829697415823854, + "tokens_seen": 2370085888 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001424172517552658, + "loss": 2.4926, + "theoretical_loss": 3.382962200096002, + "tokens_seen": 2370151424 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014240722166499497, + "loss": 2.3793, + "theoretical_loss": 3.382954658876528, + "tokens_seen": 2370216960 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014239719157472415, + "loss": 2.6033, + "theoretical_loss": 3.382947117923946, + "tokens_seen": 2370282496 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014238716148445336, + "loss": 2.4562, + "theoretical_loss": 3.382939577238239, + "tokens_seen": 2370348032 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014237713139418254, + "loss": 2.221, + "theoretical_loss": 3.3829320368193905, + "tokens_seen": 2370413568 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014236710130391173, + "loss": 2.4175, + "theoretical_loss": 3.3829244966673837, + "tokens_seen": 2370479104 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001423570712136409, + "loss": 2.2578, + "theoretical_loss": 3.3829169567822013, + "tokens_seen": 2370544640 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014234704112337011, + "loss": 2.2091, + "theoretical_loss": 3.3829094171638276, + "tokens_seen": 2370610176 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001423370110330993, + "loss": 2.3478, + "theoretical_loss": 3.3829018778122446, + "tokens_seen": 2370675712 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014232698094282848, + "loss": 2.2933, + "theoretical_loss": 3.3828943387274357, + "tokens_seen": 2370741248 + }, + { + "epoch": 7.09, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.512272596359253, + "objective/train/theoretical_loss": 3.3828924539979153, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.3828924539979153, + "tokens_seen": 2370757632 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014231695085255766, + "loss": 2.3232, + "theoretical_loss": 3.382886799909385, + "tokens_seen": 2370806784 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014230692076228684, + "loss": 2.4091, + "theoretical_loss": 3.3828792613580747, + "tokens_seen": 2370872320 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014229689067201605, + "loss": 2.5792, + "theoretical_loss": 3.382871723073488, + "tokens_seen": 2370937856 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014228686058174523, + "loss": 2.3852, + "theoretical_loss": 3.3828641850556087, + "tokens_seen": 2371003392 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001422768304914744, + "loss": 2.69, + "theoretical_loss": 3.3828566473044197, + "tokens_seen": 2371068928 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014226680040120362, + "loss": 2.5385, + "theoretical_loss": 3.382849109819904, + "tokens_seen": 2371134464 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014225677031093283, + "loss": 2.245, + "theoretical_loss": 3.3828415726020453, + "tokens_seen": 2371200000 + }, + { + "epoch": 7.09, + "learning_rate": 0.000142246740220662, + "loss": 2.4528, + "theoretical_loss": 3.382834035650826, + "tokens_seen": 2371265536 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001422367101303912, + "loss": 2.5743, + "theoretical_loss": 3.3828264989662307, + "tokens_seen": 2371331072 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014222668004012037, + "loss": 2.4506, + "theoretical_loss": 3.3828189625482405, + "tokens_seen": 2371396608 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014221664994984955, + "loss": 2.4617, + "theoretical_loss": 3.3828114263968407, + "tokens_seen": 2371462144 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014220661985957876, + "loss": 2.5983, + "theoretical_loss": 3.382803890512013, + "tokens_seen": 2371527680 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014219658976930794, + "loss": 2.5243, + "theoretical_loss": 3.3827963548937414, + "tokens_seen": 2371593216 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014218655967903712, + "loss": 2.1995, + "theoretical_loss": 3.382788819542009, + "tokens_seen": 2371658752 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001421765295887663, + "loss": 2.3979, + "theoretical_loss": 3.382781284456799, + "tokens_seen": 2371724288 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001421664994984955, + "loss": 2.3127, + "theoretical_loss": 3.382773749638094, + "tokens_seen": 2371789824 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001421564694082247, + "loss": 2.3782, + "theoretical_loss": 3.382766215085878, + "tokens_seen": 2371855360 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014214643931795387, + "loss": 2.4776, + "theoretical_loss": 3.382758680800134, + "tokens_seen": 2371920896 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014213640922768305, + "loss": 2.3952, + "theoretical_loss": 3.382751146780845, + "tokens_seen": 2371986432 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014212637913741223, + "loss": 2.4695, + "theoretical_loss": 3.3827436130279946, + "tokens_seen": 2372051968 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014211634904714144, + "loss": 2.2256, + "theoretical_loss": 3.3827360795415657, + "tokens_seen": 2372117504 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014210631895687062, + "loss": 2.5071, + "theoretical_loss": 3.3827285463215415, + "tokens_seen": 2372183040 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001420962888665998, + "loss": 2.4518, + "theoretical_loss": 3.382721013367905, + "tokens_seen": 2372248576 + }, + { + "epoch": 7.09, + "learning_rate": 0.000142086258776329, + "loss": 2.4923, + "theoretical_loss": 3.38271348068064, + "tokens_seen": 2372314112 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001420762286860582, + "loss": 2.454, + "theoretical_loss": 3.38270594825973, + "tokens_seen": 2372379648 + }, + { + "epoch": 7.09, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.453768491744995, + "objective/train/theoretical_loss": 3.382704065196118, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.382704065196118, + "tokens_seen": 2372396032 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014206619859578738, + "loss": 2.144, + "theoretical_loss": 3.382698416105157, + "tokens_seen": 2372445184 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014205616850551656, + "loss": 2.4025, + "theoretical_loss": 3.3826908842169052, + "tokens_seen": 2372510720 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014204613841524574, + "loss": 2.2811, + "theoretical_loss": 3.3826833525949573, + "tokens_seen": 2372576256 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014203610832497495, + "loss": 2.5197, + "theoretical_loss": 3.382675821239297, + "tokens_seen": 2372641792 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014202607823470413, + "loss": 2.453, + "theoretical_loss": 3.3826682901499074, + "tokens_seen": 2372707328 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001420160481444333, + "loss": 2.3454, + "theoretical_loss": 3.3826607593267712, + "tokens_seen": 2372772864 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001420060180541625, + "loss": 2.6128, + "theoretical_loss": 3.3826532287698723, + "tokens_seen": 2372838400 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014199598796389167, + "loss": 2.4003, + "theoretical_loss": 3.3826456984791937, + "tokens_seen": 2372903936 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014198595787362088, + "loss": 2.3001, + "theoretical_loss": 3.3826381684547187, + "tokens_seen": 2372969472 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014197592778335006, + "loss": 2.4677, + "theoretical_loss": 3.38263063869643, + "tokens_seen": 2373035008 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014196589769307924, + "loss": 2.3081, + "theoretical_loss": 3.382623109204312, + "tokens_seen": 2373100544 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014195586760280842, + "loss": 2.371, + "theoretical_loss": 3.3826155799783466, + "tokens_seen": 2373166080 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014194583751253763, + "loss": 2.2659, + "theoretical_loss": 3.382608051018518, + "tokens_seen": 2373231616 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001419358074222668, + "loss": 2.6892, + "theoretical_loss": 3.3826005223248092, + "tokens_seen": 2373297152 + }, + { + "epoch": 7.09, + "learning_rate": 0.000141925777331996, + "loss": 2.3934, + "theoretical_loss": 3.3825929938972035, + "tokens_seen": 2373362688 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014191574724172517, + "loss": 2.3064, + "theoretical_loss": 3.3825854657356835, + "tokens_seen": 2373428224 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014190571715145436, + "loss": 2.2947, + "theoretical_loss": 3.3825779378402334, + "tokens_seen": 2373493760 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014189568706118356, + "loss": 2.3661, + "theoretical_loss": 3.3825704102108363, + "tokens_seen": 2373559296 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014188565697091274, + "loss": 2.2197, + "theoretical_loss": 3.3825628828474743, + "tokens_seen": 2373624832 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014187562688064193, + "loss": 2.2569, + "theoretical_loss": 3.3825553557501324, + "tokens_seen": 2373690368 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001418655967903711, + "loss": 2.5032, + "theoretical_loss": 3.382547828918793, + "tokens_seen": 2373755904 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014185556670010031, + "loss": 2.4723, + "theoretical_loss": 3.382540302353439, + "tokens_seen": 2373821440 + }, + { + "epoch": 7.09, + "learning_rate": 0.0001418455366098295, + "loss": 2.603, + "theoretical_loss": 3.3825327760540533, + "tokens_seen": 2373886976 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014183550651955868, + "loss": 2.3099, + "theoretical_loss": 3.3825252500206204, + "tokens_seen": 2373952512 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014182547642928786, + "loss": 2.3401, + "theoretical_loss": 3.3825177242531232, + "tokens_seen": 2374018048 + }, + { + "epoch": 7.09, + "objective/train/docs_used": 2564848, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.7567811012268066, + "objective/train/theoretical_loss": 3.3825158428527997, + "objective/train/tokens_used": 2374078944, + "theoretical_loss": 3.3825158428527997, + "tokens_seen": 2374034432 + }, + { + "epoch": 7.09, + "learning_rate": 0.00014181544633901704, + "loss": 2.4097, + "theoretical_loss": 3.382510198751545, + "tokens_seen": 2374083584 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014180541624874625, + "loss": 3.3743, + "theoretical_loss": 3.3825009098272187, + "tokens_seen": 2374164480 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014179538615847543, + "loss": 2.4496, + "theoretical_loss": 3.382493384919742, + "tokens_seen": 2374230016 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001417853560682046, + "loss": 2.5041, + "theoretical_loss": 3.38248586027813, + "tokens_seen": 2374295552 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001417753259779338, + "loss": 2.6824, + "theoretical_loss": 3.382478335902366, + "tokens_seen": 2374361088 + }, + { + "epoch": 8.0, + "learning_rate": 0.000141765295887663, + "loss": 2.5185, + "theoretical_loss": 3.382470811792433, + "tokens_seen": 2374426624 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014175526579739218, + "loss": 2.6283, + "theoretical_loss": 3.3824632879483145, + "tokens_seen": 2374492160 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014174523570712136, + "loss": 2.6425, + "theoretical_loss": 3.3824557643699937, + "tokens_seen": 2374557696 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014173520561685054, + "loss": 2.3741, + "theoretical_loss": 3.382448241057454, + "tokens_seen": 2374623232 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014172517552657972, + "loss": 2.5458, + "theoretical_loss": 3.3824407180106784, + "tokens_seen": 2374688768 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014171514543630893, + "loss": 2.6721, + "theoretical_loss": 3.382433195229651, + "tokens_seen": 2374754304 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001417051153460381, + "loss": 2.6765, + "theoretical_loss": 3.382425672714354, + "tokens_seen": 2374819840 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001416950852557673, + "loss": 2.5521, + "theoretical_loss": 3.382418150464771, + "tokens_seen": 2374885376 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014168505516549648, + "loss": 2.5752, + "theoretical_loss": 3.3824106284808853, + "tokens_seen": 2374950912 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014167502507522568, + "loss": 2.6996, + "theoretical_loss": 3.3824031067626805, + "tokens_seen": 2375016448 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014166499498495486, + "loss": 2.5379, + "theoretical_loss": 3.3823955853101397, + "tokens_seen": 2375081984 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014165496489468405, + "loss": 2.5497, + "theoretical_loss": 3.3823880641232464, + "tokens_seen": 2375147520 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014164493480441323, + "loss": 2.5487, + "theoretical_loss": 3.3823805432019833, + "tokens_seen": 2375213056 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001416349047141424, + "loss": 2.4554, + "theoretical_loss": 3.382373022546334, + "tokens_seen": 2375278592 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014162487462387162, + "loss": 2.6048, + "theoretical_loss": 3.382365502156282, + "tokens_seen": 2375344128 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001416148445336008, + "loss": 2.5447, + "theoretical_loss": 3.3823579820318104, + "tokens_seen": 2375409664 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014160481444332998, + "loss": 2.5838, + "theoretical_loss": 3.3823504621729024, + "tokens_seen": 2375475200 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014159478435305916, + "loss": 2.6444, + "theoretical_loss": 3.382342942579542, + "tokens_seen": 2375540736 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014158475426278837, + "loss": 2.5965, + "theoretical_loss": 3.3823354232517113, + "tokens_seen": 2375606272 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 2632328, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7899668216705322, + "objective/train/theoretical_loss": 3.382327904189394, + "objective/train/tokens_used": 2396131808, + "theoretical_loss": 3.382327904189394, + "tokens_seen": 2375671808 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014157472417251755, + "loss": 2.5772, + "theoretical_loss": 3.382327904189394, + "tokens_seen": 2375671808 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014156469408224673, + "loss": 2.4948, + "theoretical_loss": 3.3823203853925743, + "tokens_seen": 2375737344 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001415546639919759, + "loss": 2.2611, + "theoretical_loss": 3.382312866861234, + "tokens_seen": 2375802880 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001415446339017051, + "loss": 2.6906, + "theoretical_loss": 3.382305348595358, + "tokens_seen": 2375868416 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001415346038114343, + "loss": 2.3669, + "theoretical_loss": 3.3822978305949287, + "tokens_seen": 2375933952 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014152457372116348, + "loss": 2.6303, + "theoretical_loss": 3.3822903128599293, + "tokens_seen": 2375999488 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001415145436308927, + "loss": 2.7891, + "theoretical_loss": 3.3822827953903434, + "tokens_seen": 2376065024 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014150451354062187, + "loss": 2.5567, + "theoretical_loss": 3.382275278186154, + "tokens_seen": 2376130560 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014149448345035108, + "loss": 2.5377, + "theoretical_loss": 3.382267761247345, + "tokens_seen": 2376196096 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014148445336008026, + "loss": 2.5938, + "theoretical_loss": 3.3822602445738994, + "tokens_seen": 2376261632 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014147442326980944, + "loss": 2.6081, + "theoretical_loss": 3.3822527281658, + "tokens_seen": 2376327168 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014146439317953862, + "loss": 2.6295, + "theoretical_loss": 3.3822452120230313, + "tokens_seen": 2376392704 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014145436308926783, + "loss": 2.4232, + "theoretical_loss": 3.3822376961455753, + "tokens_seen": 2376458240 + }, + { + "epoch": 8.0, + "learning_rate": 0.000141444332998997, + "loss": 2.5633, + "theoretical_loss": 3.382230180533416, + "tokens_seen": 2376523776 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001414343029087262, + "loss": 2.5058, + "theoretical_loss": 3.382222665186537, + "tokens_seen": 2376589312 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014142427281845537, + "loss": 2.519, + "theoretical_loss": 3.382215150104921, + "tokens_seen": 2376654848 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014141424272818456, + "loss": 2.5307, + "theoretical_loss": 3.3822076352885517, + "tokens_seen": 2376720384 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014140421263791376, + "loss": 2.6328, + "theoretical_loss": 3.3822001207374126, + "tokens_seen": 2376785920 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014139418254764294, + "loss": 2.6018, + "theoretical_loss": 3.3821926064514862, + "tokens_seen": 2376851456 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014138415245737213, + "loss": 2.4718, + "theoretical_loss": 3.382185092430757, + "tokens_seen": 2376916992 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001413741223671013, + "loss": 2.3811, + "theoretical_loss": 3.3821775786752073, + "tokens_seen": 2376982528 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014136409227683052, + "loss": 2.6625, + "theoretical_loss": 3.3821700651848206, + "tokens_seen": 2377048064 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001413540621865597, + "loss": 2.4763, + "theoretical_loss": 3.382162551959581, + "tokens_seen": 2377113600 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014134403209628888, + "loss": 2.5039, + "theoretical_loss": 3.382155038999471, + "tokens_seen": 2377179136 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014133400200601806, + "loss": 2.5523, + "theoretical_loss": 3.3821475263044745, + "tokens_seen": 2377244672 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 2637581, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8156797885894775, + "objective/train/theoretical_loss": 3.382140013874574, + "objective/train/tokens_used": 2397770208, + "theoretical_loss": 3.382140013874574, + "tokens_seen": 2377310208 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014132397191574724, + "loss": 2.7096, + "theoretical_loss": 3.382140013874574, + "tokens_seen": 2377310208 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014131394182547645, + "loss": 2.5364, + "theoretical_loss": 3.382132501709754, + "tokens_seen": 2377375744 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014130391173520563, + "loss": 2.5903, + "theoretical_loss": 3.3821249898099968, + "tokens_seen": 2377441280 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001412938816449348, + "loss": 2.5311, + "theoretical_loss": 3.3821174781752865, + "tokens_seen": 2377506816 + }, + { + "epoch": 8.0, + "learning_rate": 0.000141283851554664, + "loss": 2.5324, + "theoretical_loss": 3.382109966805606, + "tokens_seen": 2377572352 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001412738214643932, + "loss": 2.5074, + "theoretical_loss": 3.382102455700939, + "tokens_seen": 2377637888 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014126379137412238, + "loss": 2.6358, + "theoretical_loss": 3.3820949448612683, + "tokens_seen": 2377703424 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014125376128385156, + "loss": 2.6124, + "theoretical_loss": 3.3820874342865777, + "tokens_seen": 2377768960 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014124373119358074, + "loss": 2.4143, + "theoretical_loss": 3.3820799239768506, + "tokens_seen": 2377834496 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014123370110330992, + "loss": 2.6014, + "theoretical_loss": 3.38207241393207, + "tokens_seen": 2377900032 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014122367101303913, + "loss": 2.5049, + "theoretical_loss": 3.3820649041522195, + "tokens_seen": 2377965568 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001412136409227683, + "loss": 2.4792, + "theoretical_loss": 3.382057394637282, + "tokens_seen": 2378031104 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001412036108324975, + "loss": 2.5779, + "theoretical_loss": 3.3820498853872416, + "tokens_seen": 2378096640 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014119358074222668, + "loss": 2.3465, + "theoretical_loss": 3.3820423764020813, + "tokens_seen": 2378162176 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014118355065195588, + "loss": 2.6688, + "theoretical_loss": 3.382034867681784, + "tokens_seen": 2378227712 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014117352056168506, + "loss": 2.432, + "theoretical_loss": 3.3820273592263344, + "tokens_seen": 2378293248 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014116349047141425, + "loss": 2.4019, + "theoretical_loss": 3.3820198510357145, + "tokens_seen": 2378358784 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014115346038114343, + "loss": 2.418, + "theoretical_loss": 3.382012343109908, + "tokens_seen": 2378424320 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001411434302908726, + "loss": 2.5177, + "theoretical_loss": 3.3820048354488983, + "tokens_seen": 2378489856 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014113340020060182, + "loss": 2.5654, + "theoretical_loss": 3.381997328052669, + "tokens_seen": 2378555392 + }, + { + "epoch": 8.0, + "learning_rate": 0.000141123370110331, + "loss": 2.6419, + "theoretical_loss": 3.3819898209212034, + "tokens_seen": 2378620928 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014111334002006018, + "loss": 2.4191, + "theoretical_loss": 3.3819823140544845, + "tokens_seen": 2378686464 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014110330992978936, + "loss": 2.6311, + "theoretical_loss": 3.3819748074524965, + "tokens_seen": 2378752000 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014109327983951857, + "loss": 2.7553, + "theoretical_loss": 3.3819673011152216, + "tokens_seen": 2378817536 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014108324974924775, + "loss": 2.4877, + "theoretical_loss": 3.3819597950426443, + "tokens_seen": 2378883072 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 2642454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8715226650238037, + "objective/train/theoretical_loss": 3.3819522892347473, + "objective/train/tokens_used": 2399408608, + "theoretical_loss": 3.3819522892347473, + "tokens_seen": 2378948608 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014107321965897693, + "loss": 2.4183, + "theoretical_loss": 3.3819522892347473, + "tokens_seen": 2378948608 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001410631895687061, + "loss": 2.6556, + "theoretical_loss": 3.381944783691514, + "tokens_seen": 2379014144 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001410531594784353, + "loss": 2.4774, + "theoretical_loss": 3.381937278412928, + "tokens_seen": 2379079680 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001410431293881645, + "loss": 2.5207, + "theoretical_loss": 3.381929773398973, + "tokens_seen": 2379145216 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014103309929789368, + "loss": 2.7657, + "theoretical_loss": 3.3819222686496313, + "tokens_seen": 2379210752 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014102306920762286, + "loss": 2.6014, + "theoretical_loss": 3.3819147641648875, + "tokens_seen": 2379276288 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014101303911735204, + "loss": 2.4887, + "theoretical_loss": 3.381907259944724, + "tokens_seen": 2379341824 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014100300902708125, + "loss": 2.7147, + "theoretical_loss": 3.381899755989125, + "tokens_seen": 2379407360 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014099297893681043, + "loss": 2.7856, + "theoretical_loss": 3.381892252298073, + "tokens_seen": 2379472896 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014098294884653961, + "loss": 2.6033, + "theoretical_loss": 3.3818847488715527, + "tokens_seen": 2379538432 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001409729187562688, + "loss": 2.7675, + "theoretical_loss": 3.3818772457095463, + "tokens_seen": 2379603968 + }, + { + "epoch": 8.0, + "learning_rate": 0.000140962888665998, + "loss": 2.6383, + "theoretical_loss": 3.3818697428120377, + "tokens_seen": 2379669504 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014095285857572718, + "loss": 2.5872, + "theoretical_loss": 3.38186224017901, + "tokens_seen": 2379735040 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014094282848545637, + "loss": 2.5438, + "theoretical_loss": 3.381854737810447, + "tokens_seen": 2379800576 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014093279839518555, + "loss": 2.6121, + "theoretical_loss": 3.381847235706332, + "tokens_seen": 2379866112 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014092276830491473, + "loss": 2.4152, + "theoretical_loss": 3.3818397338666477, + "tokens_seen": 2379931648 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014091273821464394, + "loss": 2.6275, + "theoretical_loss": 3.3818322322913783, + "tokens_seen": 2379997184 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014090270812437312, + "loss": 2.439, + "theoretical_loss": 3.381824730980507, + "tokens_seen": 2380062720 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001408926780341023, + "loss": 2.5887, + "theoretical_loss": 3.3818172299340175, + "tokens_seen": 2380128256 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014088264794383148, + "loss": 2.5661, + "theoretical_loss": 3.3818097291518927, + "tokens_seen": 2380193792 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001408726178535607, + "loss": 2.6525, + "theoretical_loss": 3.381802228634116, + "tokens_seen": 2380259328 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014086258776328987, + "loss": 2.5309, + "theoretical_loss": 3.3817947283806706, + "tokens_seen": 2380324864 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014085255767301905, + "loss": 2.5144, + "theoretical_loss": 3.3817872283915413, + "tokens_seen": 2380390400 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014084252758274823, + "loss": 2.4217, + "theoretical_loss": 3.38177972866671, + "tokens_seen": 2380455936 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001408324974924774, + "loss": 2.4622, + "theoretical_loss": 3.3817722292061605, + "tokens_seen": 2380521472 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 2647361, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7506144046783447, + "objective/train/theoretical_loss": 3.381764730009876, + "objective/train/tokens_used": 2401047008, + "theoretical_loss": 3.381764730009876, + "tokens_seen": 2380587008 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014082246740220662, + "loss": 2.603, + "theoretical_loss": 3.381764730009876, + "tokens_seen": 2380587008 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001408124373119358, + "loss": 2.6142, + "theoretical_loss": 3.3817572310778408, + "tokens_seen": 2380652544 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014080240722166498, + "loss": 2.6834, + "theoretical_loss": 3.3817497324100376, + "tokens_seen": 2380718080 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014079237713139416, + "loss": 2.6388, + "theoretical_loss": 3.38174223400645, + "tokens_seen": 2380783616 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014078234704112337, + "loss": 2.7046, + "theoretical_loss": 3.381734735867061, + "tokens_seen": 2380849152 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014077231695085255, + "loss": 2.5966, + "theoretical_loss": 3.381727237991855, + "tokens_seen": 2380914688 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014076228686058176, + "loss": 2.3614, + "theoretical_loss": 3.381719740380814, + "tokens_seen": 2380980224 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014075225677031094, + "loss": 2.5386, + "theoretical_loss": 3.381712243033923, + "tokens_seen": 2381045760 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014074222668004012, + "loss": 2.7009, + "theoretical_loss": 3.3817047459511644, + "tokens_seen": 2381111296 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014073219658976933, + "loss": 2.6897, + "theoretical_loss": 3.3816972491325217, + "tokens_seen": 2381176832 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001407221664994985, + "loss": 2.4867, + "theoretical_loss": 3.3816897525779788, + "tokens_seen": 2381242368 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001407121364092277, + "loss": 2.6617, + "theoretical_loss": 3.381682256287519, + "tokens_seen": 2381307904 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014070210631895688, + "loss": 2.6976, + "theoretical_loss": 3.381674760261125, + "tokens_seen": 2381373440 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014069207622868608, + "loss": 2.7079, + "theoretical_loss": 3.3816672644987813, + "tokens_seen": 2381438976 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014068204613841526, + "loss": 2.565, + "theoretical_loss": 3.3816597690004704, + "tokens_seen": 2381504512 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014067201604814445, + "loss": 2.6246, + "theoretical_loss": 3.3816522737661767, + "tokens_seen": 2381570048 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014066198595787363, + "loss": 2.3864, + "theoretical_loss": 3.381644778795883, + "tokens_seen": 2381635584 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001406519558676028, + "loss": 2.5112, + "theoretical_loss": 3.3816372840895728, + "tokens_seen": 2381701120 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014064192577733202, + "loss": 2.4211, + "theoretical_loss": 3.381629789647229, + "tokens_seen": 2381766656 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001406318956870612, + "loss": 2.5204, + "theoretical_loss": 3.381622295468836, + "tokens_seen": 2381832192 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014062186559679038, + "loss": 2.6634, + "theoretical_loss": 3.3816148015543774, + "tokens_seen": 2381897728 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014061183550651956, + "loss": 2.3755, + "theoretical_loss": 3.3816073079038356, + "tokens_seen": 2381963264 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014060180541624877, + "loss": 2.4883, + "theoretical_loss": 3.3815998145171946, + "tokens_seen": 2382028800 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014059177532597795, + "loss": 2.7182, + "theoretical_loss": 3.3815923213944377, + "tokens_seen": 2382094336 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014058174523570713, + "loss": 2.5878, + "theoretical_loss": 3.3815848285355488, + "tokens_seen": 2382159872 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 2652415, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.415748119354248, + "objective/train/theoretical_loss": 3.3815773359405106, + "objective/train/tokens_used": 2402685408, + "theoretical_loss": 3.3815773359405106, + "tokens_seen": 2382225408 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001405717151454363, + "loss": 2.6193, + "theoretical_loss": 3.3815773359405106, + "tokens_seen": 2382225408 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001405616850551655, + "loss": 2.6218, + "theoretical_loss": 3.381569843609307, + "tokens_seen": 2382290944 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001405516549648947, + "loss": 2.6054, + "theoretical_loss": 3.3815623515419215, + "tokens_seen": 2382356480 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014054162487462388, + "loss": 2.5155, + "theoretical_loss": 3.381554859738338, + "tokens_seen": 2382422016 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014053159478435306, + "loss": 2.409, + "theoretical_loss": 3.3815473681985386, + "tokens_seen": 2382487552 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014052156469408224, + "loss": 2.4813, + "theoretical_loss": 3.381539876922508, + "tokens_seen": 2382553088 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014051153460381145, + "loss": 2.5653, + "theoretical_loss": 3.381532385910229, + "tokens_seen": 2382618624 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014050150451354063, + "loss": 2.4964, + "theoretical_loss": 3.3815248951616854, + "tokens_seen": 2382684160 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014049147442326981, + "loss": 2.5543, + "theoretical_loss": 3.3815174046768606, + "tokens_seen": 2382749696 + }, + { + "epoch": 8.0, + "learning_rate": 0.000140481444332999, + "loss": 2.6139, + "theoretical_loss": 3.3815099144557377, + "tokens_seen": 2382815232 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014047141424272818, + "loss": 2.5876, + "theoretical_loss": 3.3815024244983007, + "tokens_seen": 2382880768 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014046138415245739, + "loss": 2.5556, + "theoretical_loss": 3.381494934804533, + "tokens_seen": 2382946304 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014045135406218657, + "loss": 2.5407, + "theoretical_loss": 3.3814874453744173, + "tokens_seen": 2383011840 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014044132397191575, + "loss": 2.6575, + "theoretical_loss": 3.3814799562079383, + "tokens_seen": 2383077376 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014043129388164493, + "loss": 2.5298, + "theoretical_loss": 3.3814724673050787, + "tokens_seen": 2383142912 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014042126379137414, + "loss": 2.6726, + "theoretical_loss": 3.381464978665822, + "tokens_seen": 2383208448 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014041123370110332, + "loss": 2.5491, + "theoretical_loss": 3.3814574902901517, + "tokens_seen": 2383273984 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001404012036108325, + "loss": 2.4565, + "theoretical_loss": 3.3814500021780516, + "tokens_seen": 2383339520 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014039117352056168, + "loss": 2.5731, + "theoretical_loss": 3.3814425143295046, + "tokens_seen": 2383405056 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001403811434302909, + "loss": 2.4839, + "theoretical_loss": 3.381435026744495, + "tokens_seen": 2383470592 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014037111334002007, + "loss": 2.5272, + "theoretical_loss": 3.3814275394230053, + "tokens_seen": 2383536128 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014036108324974925, + "loss": 2.5986, + "theoretical_loss": 3.3814200523650197, + "tokens_seen": 2383601664 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014035105315947843, + "loss": 2.4925, + "theoretical_loss": 3.381412565570521, + "tokens_seen": 2383667200 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001403410230692076, + "loss": 2.6334, + "theoretical_loss": 3.381405079039494, + "tokens_seen": 2383732736 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014033099297893682, + "loss": 2.4768, + "theoretical_loss": 3.3813975927719206, + "tokens_seen": 2383798272 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 2657586, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.189558506011963, + "objective/train/theoretical_loss": 3.3813901067677854, + "objective/train/tokens_used": 2404323808, + "theoretical_loss": 3.3813901067677854, + "tokens_seen": 2383863808 + }, + { + "epoch": 8.0, + "learning_rate": 0.000140320962888666, + "loss": 2.625, + "theoretical_loss": 3.3813901067677854, + "tokens_seen": 2383863808 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014031093279839518, + "loss": 2.619, + "theoretical_loss": 3.3813826210270714, + "tokens_seen": 2383929344 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014030090270812436, + "loss": 2.529, + "theoretical_loss": 3.3813751355497623, + "tokens_seen": 2383994880 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014029087261785357, + "loss": 2.4645, + "theoretical_loss": 3.381367650335841, + "tokens_seen": 2384060416 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014028084252758275, + "loss": 2.604, + "theoretical_loss": 3.381360165385292, + "tokens_seen": 2384125952 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014027081243731193, + "loss": 2.4483, + "theoretical_loss": 3.3813526806980985, + "tokens_seen": 2384191488 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014026078234704112, + "loss": 2.5072, + "theoretical_loss": 3.381345196274243, + "tokens_seen": 2384257024 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001402507522567703, + "loss": 2.7053, + "theoretical_loss": 3.38133771211371, + "tokens_seen": 2384322560 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001402407221664995, + "loss": 2.3908, + "theoretical_loss": 3.381330228216483, + "tokens_seen": 2384388096 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014023069207622869, + "loss": 2.559, + "theoretical_loss": 3.3813227445825453, + "tokens_seen": 2384453632 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014022066198595787, + "loss": 2.4928, + "theoretical_loss": 3.38131526121188, + "tokens_seen": 2384519168 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014021063189568705, + "loss": 2.4828, + "theoretical_loss": 3.381307778104471, + "tokens_seen": 2384584704 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014020060180541626, + "loss": 2.4637, + "theoretical_loss": 3.381300295260302, + "tokens_seen": 2384650240 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014019057171514544, + "loss": 2.4265, + "theoretical_loss": 3.381292812679356, + "tokens_seen": 2384715776 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014018054162487462, + "loss": 2.5412, + "theoretical_loss": 3.381285330361617, + "tokens_seen": 2384781312 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001401705115346038, + "loss": 2.5774, + "theoretical_loss": 3.381277848307068, + "tokens_seen": 2384846848 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014016048144433298, + "loss": 2.5076, + "theoretical_loss": 3.381270366515693, + "tokens_seen": 2384912384 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001401504513540622, + "loss": 2.5484, + "theoretical_loss": 3.3812628849874757, + "tokens_seen": 2384977920 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014014042126379137, + "loss": 2.5646, + "theoretical_loss": 3.3812554037223985, + "tokens_seen": 2385043456 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014013039117352055, + "loss": 2.4398, + "theoretical_loss": 3.381247922720446, + "tokens_seen": 2385108992 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014012036108324973, + "loss": 2.5957, + "theoretical_loss": 3.3812404419816016, + "tokens_seen": 2385174528 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014011033099297894, + "loss": 2.5745, + "theoretical_loss": 3.381232961505848, + "tokens_seen": 2385240064 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014010030090270812, + "loss": 2.3082, + "theoretical_loss": 3.3812254812931695, + "tokens_seen": 2385305600 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001400902708124373, + "loss": 2.489, + "theoretical_loss": 3.3812180013435493, + "tokens_seen": 2385371136 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014008024072216648, + "loss": 2.6912, + "theoretical_loss": 3.3812105216569712, + "tokens_seen": 2385436672 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 2662634, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5370185375213623, + "objective/train/theoretical_loss": 3.3812030422334187, + "objective/train/tokens_used": 2405962208, + "theoretical_loss": 3.3812030422334187, + "tokens_seen": 2385502208 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014007021063189567, + "loss": 2.6791, + "theoretical_loss": 3.3812030422334187, + "tokens_seen": 2385502208 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014006018054162487, + "loss": 2.5091, + "theoretical_loss": 3.381195563072875, + "tokens_seen": 2385567744 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014005015045135406, + "loss": 2.7396, + "theoretical_loss": 3.3811880841753235, + "tokens_seen": 2385633280 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014004012036108324, + "loss": 2.5853, + "theoretical_loss": 3.3811806055407487, + "tokens_seen": 2385698816 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014003009027081242, + "loss": 2.6032, + "theoretical_loss": 3.381173127169133, + "tokens_seen": 2385764352 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014002006018054163, + "loss": 2.3863, + "theoretical_loss": 3.3811656490604602, + "tokens_seen": 2385829888 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014001003009027083, + "loss": 2.4664, + "theoretical_loss": 3.3811581712147145, + "tokens_seen": 2385895424 + }, + { + "epoch": 8.0, + "learning_rate": 0.00014000000000000001, + "loss": 2.3836, + "theoretical_loss": 3.381150693631879, + "tokens_seen": 2385960960 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001399899699097292, + "loss": 2.4761, + "theoretical_loss": 3.3811432163119366, + "tokens_seen": 2386026496 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013997993981945838, + "loss": 2.5991, + "theoretical_loss": 3.381135739254872, + "tokens_seen": 2386092032 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013996990972918759, + "loss": 2.3949, + "theoretical_loss": 3.3811282624606678, + "tokens_seen": 2386157568 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013995987963891677, + "loss": 2.5244, + "theoretical_loss": 3.381120785929308, + "tokens_seen": 2386223104 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013994984954864595, + "loss": 2.5959, + "theoretical_loss": 3.381113309660776, + "tokens_seen": 2386288640 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013993981945837513, + "loss": 2.5794, + "theoretical_loss": 3.3811058336550555, + "tokens_seen": 2386354176 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013992978936810434, + "loss": 2.5178, + "theoretical_loss": 3.38109835791213, + "tokens_seen": 2386419712 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013991975927783352, + "loss": 2.565, + "theoretical_loss": 3.381090882431983, + "tokens_seen": 2386485248 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001399097291875627, + "loss": 2.5249, + "theoretical_loss": 3.381083407214598, + "tokens_seen": 2386550784 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013989969909729188, + "loss": 2.5125, + "theoretical_loss": 3.381075932259958, + "tokens_seen": 2386616320 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001398896690070211, + "loss": 2.5105, + "theoretical_loss": 3.381068457568048, + "tokens_seen": 2386681856 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013987963891675027, + "loss": 2.5159, + "theoretical_loss": 3.38106098313885, + "tokens_seen": 2386747392 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013986960882647945, + "loss": 2.6142, + "theoretical_loss": 3.381053508972349, + "tokens_seen": 2386812928 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013985957873620863, + "loss": 2.5745, + "theoretical_loss": 3.381046035068527, + "tokens_seen": 2386878464 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001398495486459378, + "loss": 2.5935, + "theoretical_loss": 3.381038561427369, + "tokens_seen": 2386944000 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013983951855566702, + "loss": 2.4845, + "theoretical_loss": 3.381031088048857, + "tokens_seen": 2387009536 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001398294884653962, + "loss": 2.5736, + "theoretical_loss": 3.3810236149329764, + "tokens_seen": 2387075072 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 2667364, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5953311920166016, + "objective/train/theoretical_loss": 3.3810161420797096, + "objective/train/tokens_used": 2407600608, + "theoretical_loss": 3.3810161420797096, + "tokens_seen": 2387140608 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013981945837512538, + "loss": 2.5669, + "theoretical_loss": 3.3810161420797096, + "tokens_seen": 2387140608 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013980942828485456, + "loss": 2.6516, + "theoretical_loss": 3.38100866948904, + "tokens_seen": 2387206144 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013979939819458377, + "loss": 2.5264, + "theoretical_loss": 3.381001197160952, + "tokens_seen": 2387271680 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013978936810431295, + "loss": 2.4814, + "theoretical_loss": 3.3809937250954283, + "tokens_seen": 2387337216 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013977933801404214, + "loss": 2.4276, + "theoretical_loss": 3.3809862532924533, + "tokens_seen": 2387402752 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013976930792377132, + "loss": 2.425, + "theoretical_loss": 3.3809787817520096, + "tokens_seen": 2387468288 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001397592778335005, + "loss": 2.4104, + "theoretical_loss": 3.3809713104740817, + "tokens_seen": 2387533824 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001397492477432297, + "loss": 2.7266, + "theoretical_loss": 3.3809638394586528, + "tokens_seen": 2387599360 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001397392176529589, + "loss": 2.3613, + "theoretical_loss": 3.3809563687057063, + "tokens_seen": 2387664896 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013972918756268807, + "loss": 2.5909, + "theoretical_loss": 3.380948898215226, + "tokens_seen": 2387730432 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013971915747241725, + "loss": 2.6523, + "theoretical_loss": 3.3809414279871954, + "tokens_seen": 2387795968 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013970912738214646, + "loss": 2.4737, + "theoretical_loss": 3.3809339580215982, + "tokens_seen": 2387861504 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013969909729187564, + "loss": 2.5041, + "theoretical_loss": 3.380926488318418, + "tokens_seen": 2387927040 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013968906720160482, + "loss": 2.6138, + "theoretical_loss": 3.3809190188776377, + "tokens_seen": 2387992576 + }, + { + "epoch": 8.0, + "learning_rate": 0.000139679037111334, + "loss": 2.4932, + "theoretical_loss": 3.380911549699242, + "tokens_seen": 2388058112 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013966900702106318, + "loss": 2.4589, + "theoretical_loss": 3.3809040807832136, + "tokens_seen": 2388123648 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001396589769307924, + "loss": 2.4483, + "theoretical_loss": 3.3808966121295363, + "tokens_seen": 2388189184 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013964894684052157, + "loss": 2.4386, + "theoretical_loss": 3.380889143738194, + "tokens_seen": 2388254720 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013963891675025075, + "loss": 2.7039, + "theoretical_loss": 3.38088167560917, + "tokens_seen": 2388320256 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013962888665997993, + "loss": 2.5871, + "theoretical_loss": 3.3808742077424476, + "tokens_seen": 2388385792 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013961885656970914, + "loss": 2.52, + "theoretical_loss": 3.3808667401380115, + "tokens_seen": 2388451328 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013960882647943832, + "loss": 2.4778, + "theoretical_loss": 3.3808592727958438, + "tokens_seen": 2388516864 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001395987963891675, + "loss": 2.4041, + "theoretical_loss": 3.3808518057159294, + "tokens_seen": 2388582400 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013958876629889668, + "loss": 2.6389, + "theoretical_loss": 3.380844338898251, + "tokens_seen": 2388647936 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013957873620862587, + "loss": 2.5433, + "theoretical_loss": 3.380836872342792, + "tokens_seen": 2388713472 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 2672481, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3550407886505127, + "objective/train/theoretical_loss": 3.3808294060495374, + "objective/train/tokens_used": 2409239008, + "theoretical_loss": 3.3808294060495374, + "tokens_seen": 2388779008 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013956870611835507, + "loss": 2.6157, + "theoretical_loss": 3.3808294060495374, + "tokens_seen": 2388779008 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013955867602808426, + "loss": 2.5383, + "theoretical_loss": 3.3808219400184694, + "tokens_seen": 2388844544 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013954864593781344, + "loss": 2.5383, + "theoretical_loss": 3.3808144742495725, + "tokens_seen": 2388910080 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013953861584754262, + "loss": 2.4723, + "theoretical_loss": 3.38080700874283, + "tokens_seen": 2388975616 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013952858575727183, + "loss": 2.6065, + "theoretical_loss": 3.3807995434982248, + "tokens_seen": 2389041152 + }, + { + "epoch": 8.0, + "learning_rate": 0.000139518555667001, + "loss": 2.3919, + "theoretical_loss": 3.380792078515742, + "tokens_seen": 2389106688 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001395085255767302, + "loss": 2.491, + "theoretical_loss": 3.380784613795363, + "tokens_seen": 2389172224 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013949849548645937, + "loss": 2.7017, + "theoretical_loss": 3.380777149337074, + "tokens_seen": 2389237760 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013948846539618855, + "loss": 2.5877, + "theoretical_loss": 3.380769685140857, + "tokens_seen": 2389303296 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013947843530591776, + "loss": 2.6097, + "theoretical_loss": 3.380762221206696, + "tokens_seen": 2389368832 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013946840521564694, + "loss": 2.339, + "theoretical_loss": 3.380754757534574, + "tokens_seen": 2389434368 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013945837512537612, + "loss": 2.7318, + "theoretical_loss": 3.380747294124476, + "tokens_seen": 2389499904 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001394483450351053, + "loss": 2.7555, + "theoretical_loss": 3.3807398309763843, + "tokens_seen": 2389565440 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001394383149448345, + "loss": 2.4476, + "theoretical_loss": 3.380732368090283, + "tokens_seen": 2389630976 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001394282848545637, + "loss": 2.5712, + "theoretical_loss": 3.380724905466156, + "tokens_seen": 2389696512 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013941825476429287, + "loss": 2.5691, + "theoretical_loss": 3.3807174431039866, + "tokens_seen": 2389762048 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013940822467402205, + "loss": 2.5042, + "theoretical_loss": 3.3807099810037586, + "tokens_seen": 2389827584 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013939819458375123, + "loss": 2.4875, + "theoretical_loss": 3.380702519165455, + "tokens_seen": 2389893120 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013938816449348044, + "loss": 2.6217, + "theoretical_loss": 3.3806950575890604, + "tokens_seen": 2389958656 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013937813440320962, + "loss": 2.7203, + "theoretical_loss": 3.3806875962745577, + "tokens_seen": 2390024192 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001393681043129388, + "loss": 2.6851, + "theoretical_loss": 3.380680135221931, + "tokens_seen": 2390089728 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013935807422266799, + "loss": 2.4037, + "theoretical_loss": 3.3806726744311635, + "tokens_seen": 2390155264 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001393480441323972, + "loss": 2.4906, + "theoretical_loss": 3.380665213902239, + "tokens_seen": 2390220800 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013933801404212638, + "loss": 2.49, + "theoretical_loss": 3.380657753635141, + "tokens_seen": 2390286336 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013932798395185556, + "loss": 2.5908, + "theoretical_loss": 3.380650293629854, + "tokens_seen": 2390351872 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 2677340, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6043975353240967, + "objective/train/theoretical_loss": 3.3806428338863603, + "objective/train/tokens_used": 2410877408, + "theoretical_loss": 3.3806428338863603, + "tokens_seen": 2390417408 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013931795386158474, + "loss": 2.4856, + "theoretical_loss": 3.3806428338863603, + "tokens_seen": 2390417408 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013930792377131395, + "loss": 2.5734, + "theoretical_loss": 3.3806353744046445, + "tokens_seen": 2390482944 + }, + { + "epoch": 8.0, + "learning_rate": 0.00013929789368104313, + "loss": 2.6338, + "theoretical_loss": 3.38062791518469, + "tokens_seen": 2390548480 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001392878635907723, + "loss": 2.5499, + "theoretical_loss": 3.38062045622648, + "tokens_seen": 2390614016 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001392778335005015, + "loss": 2.4962, + "theoretical_loss": 3.3806129975299983, + "tokens_seen": 2390679552 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013926780341023067, + "loss": 2.6004, + "theoretical_loss": 3.380605539095229, + "tokens_seen": 2390745088 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001392577733199599, + "loss": 2.4341, + "theoretical_loss": 3.3805980809221556, + "tokens_seen": 2390810624 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001392477432296891, + "loss": 2.603, + "theoretical_loss": 3.380590623010762, + "tokens_seen": 2390876160 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013923771313941827, + "loss": 2.4787, + "theoretical_loss": 3.3805831653610308, + "tokens_seen": 2390941696 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013922768304914745, + "loss": 2.4887, + "theoretical_loss": 3.3805757079729464, + "tokens_seen": 2391007232 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013921765295887666, + "loss": 2.589, + "theoretical_loss": 3.3805682508464923, + "tokens_seen": 2391072768 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013920762286860584, + "loss": 2.6111, + "theoretical_loss": 3.3805607939816524, + "tokens_seen": 2391138304 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013919759277833502, + "loss": 2.5605, + "theoretical_loss": 3.38055333737841, + "tokens_seen": 2391203840 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001391875626880642, + "loss": 2.49, + "theoretical_loss": 3.380545881036749, + "tokens_seen": 2391269376 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013917753259779338, + "loss": 2.4639, + "theoretical_loss": 3.380538424956653, + "tokens_seen": 2391334912 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001391675025075226, + "loss": 2.5793, + "theoretical_loss": 3.380530969138106, + "tokens_seen": 2391400448 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013915747241725177, + "loss": 2.5733, + "theoretical_loss": 3.3805235135810907, + "tokens_seen": 2391465984 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013914744232698095, + "loss": 2.4748, + "theoretical_loss": 3.3805160582855915, + "tokens_seen": 2391531520 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013913741223671013, + "loss": 2.4816, + "theoretical_loss": 3.380508603251592, + "tokens_seen": 2391597056 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013912738214643934, + "loss": 2.6361, + "theoretical_loss": 3.380501148479076, + "tokens_seen": 2391662592 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013911735205616852, + "loss": 2.4779, + "theoretical_loss": 3.3804936939680266, + "tokens_seen": 2391728128 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001391073219658977, + "loss": 2.5733, + "theoretical_loss": 3.3804862397184277, + "tokens_seen": 2391793664 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013909729187562688, + "loss": 2.4766, + "theoretical_loss": 3.380478785730263, + "tokens_seen": 2391859200 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013908726178535607, + "loss": 2.6369, + "theoretical_loss": 3.3804713320035167, + "tokens_seen": 2391924736 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013907723169508527, + "loss": 2.5538, + "theoretical_loss": 3.3804638785381718, + "tokens_seen": 2391990272 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2682561, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.389082431793213, + "objective/train/theoretical_loss": 3.380456425334212, + "objective/train/tokens_used": 2412515808, + "theoretical_loss": 3.380456425334212, + "tokens_seen": 2392055808 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013906720160481446, + "loss": 2.5093, + "theoretical_loss": 3.380456425334212, + "tokens_seen": 2392055808 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013905717151454364, + "loss": 2.4409, + "theoretical_loss": 3.3804489723916213, + "tokens_seen": 2392121344 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013904714142427282, + "loss": 2.5578, + "theoretical_loss": 3.3804415197103834, + "tokens_seen": 2392186880 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013903711133400203, + "loss": 2.5945, + "theoretical_loss": 3.3804340672904813, + "tokens_seen": 2392252416 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001390270812437312, + "loss": 2.4988, + "theoretical_loss": 3.3804266151318996, + "tokens_seen": 2392317952 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001390170511534604, + "loss": 2.6494, + "theoretical_loss": 3.380419163234621, + "tokens_seen": 2392383488 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013900702106318957, + "loss": 2.6231, + "theoretical_loss": 3.3804117115986307, + "tokens_seen": 2392449024 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013899699097291875, + "loss": 2.3768, + "theoretical_loss": 3.380404260223911, + "tokens_seen": 2392514560 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013898696088264796, + "loss": 2.5236, + "theoretical_loss": 3.3803968091104455, + "tokens_seen": 2392580096 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013897693079237714, + "loss": 2.4579, + "theoretical_loss": 3.3803893582582187, + "tokens_seen": 2392645632 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013896690070210632, + "loss": 2.5305, + "theoretical_loss": 3.380381907667214, + "tokens_seen": 2392711168 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001389568706118355, + "loss": 2.4329, + "theoretical_loss": 3.380374457337415, + "tokens_seen": 2392776704 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001389468405215647, + "loss": 2.6278, + "theoretical_loss": 3.3803670072688057, + "tokens_seen": 2392842240 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001389368104312939, + "loss": 2.6849, + "theoretical_loss": 3.380359557461369, + "tokens_seen": 2392907776 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013892678034102307, + "loss": 2.5037, + "theoretical_loss": 3.3803521079150896, + "tokens_seen": 2392973312 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013891675025075225, + "loss": 2.4538, + "theoretical_loss": 3.3803446586299506, + "tokens_seen": 2393038848 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013890672016048143, + "loss": 2.5506, + "theoretical_loss": 3.3803372096059356, + "tokens_seen": 2393104384 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013889669007021064, + "loss": 2.6381, + "theoretical_loss": 3.3803297608430287, + "tokens_seen": 2393169920 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013888665997993982, + "loss": 2.4271, + "theoretical_loss": 3.3803223123412134, + "tokens_seen": 2393235456 + }, + { + "epoch": 8.01, + "learning_rate": 0.000138876629889669, + "loss": 2.4927, + "theoretical_loss": 3.380314864100473, + "tokens_seen": 2393300992 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013886659979939819, + "loss": 2.6989, + "theoretical_loss": 3.380307416120792, + "tokens_seen": 2393366528 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001388565697091274, + "loss": 2.5322, + "theoretical_loss": 3.380299968402154, + "tokens_seen": 2393432064 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013884653961885658, + "loss": 2.5591, + "theoretical_loss": 3.380292520944542, + "tokens_seen": 2393497600 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013883650952858576, + "loss": 2.5214, + "theoretical_loss": 3.38028507374794, + "tokens_seen": 2393563136 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013882647943831494, + "loss": 2.6545, + "theoretical_loss": 3.3802776268123322, + "tokens_seen": 2393628672 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2687565, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6399991512298584, + "objective/train/theoretical_loss": 3.3802701801377015, + "objective/train/tokens_used": 2414154208, + "theoretical_loss": 3.3802701801377015, + "tokens_seen": 2393694208 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013881644934804415, + "loss": 2.4449, + "theoretical_loss": 3.3802701801377015, + "tokens_seen": 2393694208 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013880641925777333, + "loss": 2.5221, + "theoretical_loss": 3.3802627337240327, + "tokens_seen": 2393759744 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001387963891675025, + "loss": 2.6211, + "theoretical_loss": 3.380255287571308, + "tokens_seen": 2393825280 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001387863590772317, + "loss": 2.5767, + "theoretical_loss": 3.3802478416795125, + "tokens_seen": 2393890816 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013877632898696087, + "loss": 2.5039, + "theoretical_loss": 3.380240396048629, + "tokens_seen": 2393956352 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013876629889669008, + "loss": 2.4306, + "theoretical_loss": 3.380232950678642, + "tokens_seen": 2394021888 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013875626880641926, + "loss": 2.5095, + "theoretical_loss": 3.380225505569535, + "tokens_seen": 2394087424 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013874623871614844, + "loss": 2.6563, + "theoretical_loss": 3.380218060721291, + "tokens_seen": 2394152960 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013873620862587762, + "loss": 2.5845, + "theoretical_loss": 3.380210616133894, + "tokens_seen": 2394218496 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013872617853560683, + "loss": 2.4707, + "theoretical_loss": 3.3802031718073287, + "tokens_seen": 2394284032 + }, + { + "epoch": 8.01, + "learning_rate": 0.000138716148445336, + "loss": 2.7325, + "theoretical_loss": 3.3801957277415777, + "tokens_seen": 2394349568 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001387061183550652, + "loss": 2.4209, + "theoretical_loss": 3.3801882839366253, + "tokens_seen": 2394415104 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013869608826479437, + "loss": 2.4212, + "theoretical_loss": 3.380180840392455, + "tokens_seen": 2394480640 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013868605817452355, + "loss": 2.5232, + "theoretical_loss": 3.38017339710905, + "tokens_seen": 2394546176 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013867602808425276, + "loss": 2.3203, + "theoretical_loss": 3.380165954086395, + "tokens_seen": 2394611712 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013866599799398194, + "loss": 2.5452, + "theoretical_loss": 3.3801585113244736, + "tokens_seen": 2394677248 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013865596790371113, + "loss": 2.4814, + "theoretical_loss": 3.380151068823269, + "tokens_seen": 2394742784 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001386459378134403, + "loss": 2.6464, + "theoretical_loss": 3.380143626582765, + "tokens_seen": 2394808320 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013863590772316951, + "loss": 2.6306, + "theoretical_loss": 3.3801361846029456, + "tokens_seen": 2394873856 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001386258776328987, + "loss": 2.6591, + "theoretical_loss": 3.380128742883795, + "tokens_seen": 2394939392 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013861584754262788, + "loss": 2.604, + "theoretical_loss": 3.3801213014252958, + "tokens_seen": 2395004928 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013860581745235706, + "loss": 2.5462, + "theoretical_loss": 3.3801138602274325, + "tokens_seen": 2395070464 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013859578736208624, + "loss": 2.6621, + "theoretical_loss": 3.3801064192901884, + "tokens_seen": 2395136000 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013858575727181545, + "loss": 2.218, + "theoretical_loss": 3.380098978613548, + "tokens_seen": 2395201536 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013857572718154463, + "loss": 2.4173, + "theoretical_loss": 3.3800915381974943, + "tokens_seen": 2395267072 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2692802, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5686795711517334, + "objective/train/theoretical_loss": 3.380084098042011, + "objective/train/tokens_used": 2415792608, + "theoretical_loss": 3.380084098042011, + "tokens_seen": 2395332608 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001385656970912738, + "loss": 2.5243, + "theoretical_loss": 3.380084098042011, + "tokens_seen": 2395332608 + }, + { + "epoch": 8.01, + "learning_rate": 0.000138555667001003, + "loss": 2.4104, + "theoretical_loss": 3.3800766581470825, + "tokens_seen": 2395398144 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001385456369107322, + "loss": 2.6393, + "theoretical_loss": 3.380069218512692, + "tokens_seen": 2395463680 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013853560682046138, + "loss": 2.438, + "theoretical_loss": 3.3800617791388237, + "tokens_seen": 2395529216 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013852557673019056, + "loss": 2.5228, + "theoretical_loss": 3.380054340025461, + "tokens_seen": 2395594752 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013851554663991974, + "loss": 2.6148, + "theoretical_loss": 3.3800469011725878, + "tokens_seen": 2395660288 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013850551654964895, + "loss": 2.5129, + "theoretical_loss": 3.3800394625801875, + "tokens_seen": 2395725824 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013849548645937816, + "loss": 2.414, + "theoretical_loss": 3.3800320242482442, + "tokens_seen": 2395791360 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013848545636910734, + "loss": 2.5809, + "theoretical_loss": 3.380024586176742, + "tokens_seen": 2395856896 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013847542627883652, + "loss": 2.5146, + "theoretical_loss": 3.380017148365664, + "tokens_seen": 2395922432 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001384653961885657, + "loss": 2.4586, + "theoretical_loss": 3.3800097108149942, + "tokens_seen": 2395987968 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001384553660982949, + "loss": 2.4142, + "theoretical_loss": 3.3800022735247164, + "tokens_seen": 2396053504 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001384453360080241, + "loss": 2.4814, + "theoretical_loss": 3.3799948364948142, + "tokens_seen": 2396119040 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013843530591775327, + "loss": 2.6664, + "theoretical_loss": 3.3799873997252714, + "tokens_seen": 2396184576 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013842527582748245, + "loss": 2.6109, + "theoretical_loss": 3.3799799632160723, + "tokens_seen": 2396250112 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013841524573721163, + "loss": 2.4662, + "theoretical_loss": 3.3799725269672, + "tokens_seen": 2396315648 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013840521564694084, + "loss": 2.7799, + "theoretical_loss": 3.3799650909786387, + "tokens_seen": 2396381184 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013839518555667002, + "loss": 2.6687, + "theoretical_loss": 3.379957655250372, + "tokens_seen": 2396446720 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001383851554663992, + "loss": 2.5416, + "theoretical_loss": 3.3799502197823834, + "tokens_seen": 2396512256 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013837512537612839, + "loss": 2.3484, + "theoretical_loss": 3.3799427845746566, + "tokens_seen": 2396577792 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001383650952858576, + "loss": 2.5979, + "theoretical_loss": 3.3799353496271762, + "tokens_seen": 2396643328 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013835506519558678, + "loss": 2.5719, + "theoretical_loss": 3.379927914939925, + "tokens_seen": 2396708864 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013834503510531596, + "loss": 2.4373, + "theoretical_loss": 3.379920480512888, + "tokens_seen": 2396774400 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013833500501504514, + "loss": 2.6572, + "theoretical_loss": 3.3799130463460476, + "tokens_seen": 2396839936 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013832497492477435, + "loss": 2.6022, + "theoretical_loss": 3.3799056124393885, + "tokens_seen": 2396905472 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2695586, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5519800186157227, + "objective/train/theoretical_loss": 3.379898178792894, + "objective/train/tokens_used": 2417431008, + "theoretical_loss": 3.379898178792894, + "tokens_seen": 2396971008 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013831494483450353, + "loss": 2.5774, + "theoretical_loss": 3.379898178792894, + "tokens_seen": 2396971008 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001383049147442327, + "loss": 2.5585, + "theoretical_loss": 3.379890745406548, + "tokens_seen": 2397036544 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001382948846539619, + "loss": 2.6587, + "theoretical_loss": 3.3798833122803345, + "tokens_seen": 2397102080 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013828485456369107, + "loss": 2.5177, + "theoretical_loss": 3.379875879414237, + "tokens_seen": 2397167616 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013827482447342028, + "loss": 2.4434, + "theoretical_loss": 3.3798684468082394, + "tokens_seen": 2397233152 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013826479438314946, + "loss": 2.6384, + "theoretical_loss": 3.3798610144623256, + "tokens_seen": 2397298688 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013825476429287864, + "loss": 2.447, + "theoretical_loss": 3.3798535823764793, + "tokens_seen": 2397364224 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013824473420260782, + "loss": 2.6638, + "theoretical_loss": 3.3798461505506845, + "tokens_seen": 2397429760 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013823470411233703, + "loss": 2.6658, + "theoretical_loss": 3.3798387189849244, + "tokens_seen": 2397495296 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001382246740220662, + "loss": 2.4434, + "theoretical_loss": 3.3798312876791834, + "tokens_seen": 2397560832 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001382146439317954, + "loss": 2.6222, + "theoretical_loss": 3.3798238566334446, + "tokens_seen": 2397626368 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013820461384152457, + "loss": 2.4659, + "theoretical_loss": 3.379816425847693, + "tokens_seen": 2397691904 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013819458375125375, + "loss": 2.2931, + "theoretical_loss": 3.3798089953219113, + "tokens_seen": 2397757440 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013818455366098296, + "loss": 2.6677, + "theoretical_loss": 3.3798015650560833, + "tokens_seen": 2397822976 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013817452357071214, + "loss": 2.517, + "theoretical_loss": 3.3797941350501937, + "tokens_seen": 2397888512 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013816449348044133, + "loss": 2.5053, + "theoretical_loss": 3.3797867053042254, + "tokens_seen": 2397954048 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001381544633901705, + "loss": 2.4629, + "theoretical_loss": 3.379779275818163, + "tokens_seen": 2398019584 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013814443329989971, + "loss": 2.5559, + "theoretical_loss": 3.3797718465919893, + "tokens_seen": 2398085120 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001381344032096289, + "loss": 2.4655, + "theoretical_loss": 3.3797644176256894, + "tokens_seen": 2398150656 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013812437311935808, + "loss": 2.5798, + "theoretical_loss": 3.3797569889192456, + "tokens_seen": 2398216192 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013811434302908726, + "loss": 2.541, + "theoretical_loss": 3.379749560472643, + "tokens_seen": 2398281728 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013810431293881644, + "loss": 2.3988, + "theoretical_loss": 3.3797421322858647, + "tokens_seen": 2398347264 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013809428284854565, + "loss": 2.5238, + "theoretical_loss": 3.379734704358895, + "tokens_seen": 2398412800 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013808425275827483, + "loss": 2.561, + "theoretical_loss": 3.379727276691717, + "tokens_seen": 2398478336 + }, + { + "epoch": 8.01, + "learning_rate": 0.000138074222668004, + "loss": 2.5329, + "theoretical_loss": 3.3797198492843155, + "tokens_seen": 2398543872 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2696370, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6388156414031982, + "objective/train/theoretical_loss": 3.3797124221366732, + "objective/train/tokens_used": 2419069408, + "theoretical_loss": 3.3797124221366732, + "tokens_seen": 2398609408 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001380641925777332, + "loss": 2.6156, + "theoretical_loss": 3.3797124221366732, + "tokens_seen": 2398609408 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001380541624874624, + "loss": 2.6073, + "theoretical_loss": 3.3797049952487748, + "tokens_seen": 2398674944 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013804413239719158, + "loss": 2.5608, + "theoretical_loss": 3.3796975686206037, + "tokens_seen": 2398740480 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013803410230692076, + "loss": 2.5534, + "theoretical_loss": 3.379690142252144, + "tokens_seen": 2398806016 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013802407221664994, + "loss": 2.4472, + "theoretical_loss": 3.379682716143379, + "tokens_seen": 2398871552 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013801404212637912, + "loss": 2.5868, + "theoretical_loss": 3.3796752902942933, + "tokens_seen": 2398937088 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013800401203610833, + "loss": 2.4699, + "theoretical_loss": 3.37966786470487, + "tokens_seen": 2399002624 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001379939819458375, + "loss": 2.4289, + "theoretical_loss": 3.379660439375093, + "tokens_seen": 2399068160 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001379839518555667, + "loss": 2.6272, + "theoretical_loss": 3.379653014304947, + "tokens_seen": 2399133696 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013797392176529588, + "loss": 2.5386, + "theoretical_loss": 3.3796455894944146, + "tokens_seen": 2399199232 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013796389167502508, + "loss": 2.6755, + "theoretical_loss": 3.3796381649434806, + "tokens_seen": 2399264768 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013795386158475426, + "loss": 2.4404, + "theoretical_loss": 3.379630740652128, + "tokens_seen": 2399330304 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013794383149448345, + "loss": 2.423, + "theoretical_loss": 3.3796233166203415, + "tokens_seen": 2399395840 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013793380140421263, + "loss": 2.6343, + "theoretical_loss": 3.3796158928481046, + "tokens_seen": 2399461376 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001379237713139418, + "loss": 2.4878, + "theoretical_loss": 3.379608469335401, + "tokens_seen": 2399526912 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013791374122367102, + "loss": 2.5346, + "theoretical_loss": 3.3796010460822146, + "tokens_seen": 2399592448 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001379037111334002, + "loss": 2.5483, + "theoretical_loss": 3.3795936230885286, + "tokens_seen": 2399657984 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013789368104312938, + "loss": 2.5188, + "theoretical_loss": 3.3795862003543284, + "tokens_seen": 2399723520 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013788365095285856, + "loss": 2.719, + "theoretical_loss": 3.3795787778795963, + "tokens_seen": 2399789056 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013787362086258777, + "loss": 2.4594, + "theoretical_loss": 3.379571355664317, + "tokens_seen": 2399854592 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013786359077231695, + "loss": 2.471, + "theoretical_loss": 3.379563933708474, + "tokens_seen": 2399920128 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013785356068204613, + "loss": 2.2963, + "theoretical_loss": 3.3795565120120514, + "tokens_seen": 2399985664 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001378435305917753, + "loss": 2.6028, + "theoretical_loss": 3.3795490905750327, + "tokens_seen": 2400051200 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001378335005015045, + "loss": 2.4123, + "theoretical_loss": 3.379541669397402, + "tokens_seen": 2400116736 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001378234704112337, + "loss": 2.5565, + "theoretical_loss": 3.3795342484791435, + "tokens_seen": 2400182272 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2697494, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.261685848236084, + "objective/train/theoretical_loss": 3.37952682782024, + "objective/train/tokens_used": 2420707808, + "theoretical_loss": 3.37952682782024, + "tokens_seen": 2400247808 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013781344032096288, + "loss": 2.3173, + "theoretical_loss": 3.37952682782024, + "tokens_seen": 2400247808 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013780341023069206, + "loss": 2.435, + "theoretical_loss": 3.379519407420677, + "tokens_seen": 2400313344 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013779338014042124, + "loss": 2.589, + "theoretical_loss": 3.3795119872804364, + "tokens_seen": 2400378880 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013778335005015045, + "loss": 2.4212, + "theoretical_loss": 3.3795045673995032, + "tokens_seen": 2400444416 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013777331995987963, + "loss": 2.7241, + "theoretical_loss": 3.3794971477778617, + "tokens_seen": 2400509952 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013776328986960881, + "loss": 2.5743, + "theoretical_loss": 3.3794897284154946, + "tokens_seen": 2400575488 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013775325977933802, + "loss": 2.4361, + "theoretical_loss": 3.3794823093123867, + "tokens_seen": 2400641024 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013774322968906723, + "loss": 2.4425, + "theoretical_loss": 3.3794748904685212, + "tokens_seen": 2400706560 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001377331995987964, + "loss": 2.5198, + "theoretical_loss": 3.379467471883882, + "tokens_seen": 2400772096 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001377231695085256, + "loss": 2.7074, + "theoretical_loss": 3.3794600535584536, + "tokens_seen": 2400837632 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013771313941825477, + "loss": 2.4954, + "theoretical_loss": 3.3794526354922194, + "tokens_seen": 2400903168 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013770310932798396, + "loss": 2.3926, + "theoretical_loss": 3.379445217685163, + "tokens_seen": 2400968704 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013769307923771316, + "loss": 2.6168, + "theoretical_loss": 3.379437800137269, + "tokens_seen": 2401034240 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013768304914744234, + "loss": 2.2578, + "theoretical_loss": 3.379430382848521, + "tokens_seen": 2401099776 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013767301905717153, + "loss": 2.6126, + "theoretical_loss": 3.379422965818902, + "tokens_seen": 2401165312 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001376629889669007, + "loss": 2.6113, + "theoretical_loss": 3.3794155490483977, + "tokens_seen": 2401230848 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013765295887662991, + "loss": 2.5382, + "theoretical_loss": 3.3794081325369905, + "tokens_seen": 2401296384 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001376429287863591, + "loss": 2.3979, + "theoretical_loss": 3.3794007162846644, + "tokens_seen": 2401361920 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013763289869608828, + "loss": 2.4645, + "theoretical_loss": 3.3793933002914036, + "tokens_seen": 2401427456 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013762286860581746, + "loss": 2.6636, + "theoretical_loss": 3.379385884557192, + "tokens_seen": 2401492992 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013761283851554664, + "loss": 2.6929, + "theoretical_loss": 3.3793784690820137, + "tokens_seen": 2401558528 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013760280842527585, + "loss": 2.5954, + "theoretical_loss": 3.379371053865852, + "tokens_seen": 2401624064 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013759277833500503, + "loss": 2.5996, + "theoretical_loss": 3.3793636389086914, + "tokens_seen": 2401689600 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001375827482447342, + "loss": 2.4992, + "theoretical_loss": 3.3793562242105155, + "tokens_seen": 2401755136 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001375727181544634, + "loss": 2.3935, + "theoretical_loss": 3.379348809771308, + "tokens_seen": 2401820672 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2698075, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0661022663116455, + "objective/train/theoretical_loss": 3.379341395591053, + "objective/train/tokens_used": 2422346208, + "theoretical_loss": 3.379341395591053, + "tokens_seen": 2401886208 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001375626880641926, + "loss": 2.4453, + "theoretical_loss": 3.379341395591053, + "tokens_seen": 2401886208 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013755265797392178, + "loss": 2.5746, + "theoretical_loss": 3.3793339816697343, + "tokens_seen": 2401951744 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013754262788365096, + "loss": 2.4683, + "theoretical_loss": 3.379326568007336, + "tokens_seen": 2402017280 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013753259779338014, + "loss": 2.5117, + "theoretical_loss": 3.3793191546038415, + "tokens_seen": 2402082816 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013752256770310932, + "loss": 2.4924, + "theoretical_loss": 3.3793117414592353, + "tokens_seen": 2402148352 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013751253761283853, + "loss": 2.5171, + "theoretical_loss": 3.379304328573501, + "tokens_seen": 2402213888 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001375025075225677, + "loss": 2.649, + "theoretical_loss": 3.3792969159466226, + "tokens_seen": 2402279424 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001374924774322969, + "loss": 2.5575, + "theoretical_loss": 3.3792895035785837, + "tokens_seen": 2402344960 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013748244734202608, + "loss": 2.6169, + "theoretical_loss": 3.379282091469369, + "tokens_seen": 2402410496 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013747241725175528, + "loss": 2.6319, + "theoretical_loss": 3.3792746796189608, + "tokens_seen": 2402476032 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013746238716148446, + "loss": 2.5649, + "theoretical_loss": 3.379267268027345, + "tokens_seen": 2402541568 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013745235707121365, + "loss": 2.5744, + "theoretical_loss": 3.379259856694504, + "tokens_seen": 2402607104 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013744232698094283, + "loss": 2.607, + "theoretical_loss": 3.3792524456204225, + "tokens_seen": 2402672640 + }, + { + "epoch": 8.01, + "learning_rate": 0.000137432296890672, + "loss": 2.4234, + "theoretical_loss": 3.379245034805084, + "tokens_seen": 2402738176 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013742226680040122, + "loss": 2.548, + "theoretical_loss": 3.3792376242484727, + "tokens_seen": 2402803712 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001374122367101304, + "loss": 2.4632, + "theoretical_loss": 3.3792302139505725, + "tokens_seen": 2402869248 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013740220661985958, + "loss": 2.5069, + "theoretical_loss": 3.3792228039113668, + "tokens_seen": 2402934784 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013739217652958876, + "loss": 2.4594, + "theoretical_loss": 3.3792153941308403, + "tokens_seen": 2403000320 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013738214643931797, + "loss": 2.5077, + "theoretical_loss": 3.379207984608976, + "tokens_seen": 2403065856 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013737211634904715, + "loss": 2.5662, + "theoretical_loss": 3.379200575345759, + "tokens_seen": 2403131392 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013736208625877633, + "loss": 2.6099, + "theoretical_loss": 3.379193166341172, + "tokens_seen": 2403196928 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001373520561685055, + "loss": 2.5758, + "theoretical_loss": 3.3791857575951996, + "tokens_seen": 2403262464 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001373420260782347, + "loss": 2.3921, + "theoretical_loss": 3.3791783491078258, + "tokens_seen": 2403328000 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001373319959879639, + "loss": 2.2954, + "theoretical_loss": 3.3791709408790345, + "tokens_seen": 2403393536 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013732196589769308, + "loss": 2.5, + "theoretical_loss": 3.3791635329088088, + "tokens_seen": 2403459072 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2699270, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.486924171447754, + "objective/train/theoretical_loss": 3.3791561251971336, + "objective/train/tokens_used": 2423984608, + "theoretical_loss": 3.3791561251971336, + "tokens_seen": 2403524608 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013731193580742226, + "loss": 2.4796, + "theoretical_loss": 3.3791561251971336, + "tokens_seen": 2403524608 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013730190571715144, + "loss": 2.5225, + "theoretical_loss": 3.3791487177439925, + "tokens_seen": 2403590144 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013729187562688065, + "loss": 2.6432, + "theoretical_loss": 3.3791413105493695, + "tokens_seen": 2403655680 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013728184553660983, + "loss": 2.5634, + "theoretical_loss": 3.3791339036132486, + "tokens_seen": 2403721216 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013727181544633901, + "loss": 2.5428, + "theoretical_loss": 3.379126496935613, + "tokens_seen": 2403786752 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001372617853560682, + "loss": 2.647, + "theoretical_loss": 3.379119090516448, + "tokens_seen": 2403852288 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013725175526579738, + "loss": 2.6465, + "theoretical_loss": 3.379111684355736, + "tokens_seen": 2403917824 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013724172517552658, + "loss": 2.4999, + "theoretical_loss": 3.379104278453462, + "tokens_seen": 2403983360 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013723169508525577, + "loss": 2.6192, + "theoretical_loss": 3.3790968728096096, + "tokens_seen": 2404048896 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013722166499498495, + "loss": 2.4949, + "theoretical_loss": 3.379089467424163, + "tokens_seen": 2404114432 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013721163490471413, + "loss": 2.5219, + "theoretical_loss": 3.3790820622971056, + "tokens_seen": 2404179968 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013720160481444334, + "loss": 2.6154, + "theoretical_loss": 3.3790746574284216, + "tokens_seen": 2404245504 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013719157472417252, + "loss": 2.4875, + "theoretical_loss": 3.379067252818095, + "tokens_seen": 2404311040 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001371815446339017, + "loss": 2.7937, + "theoretical_loss": 3.3790598484661096, + "tokens_seen": 2404376576 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013717151454363088, + "loss": 2.4697, + "theoretical_loss": 3.3790524443724497, + "tokens_seen": 2404442112 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001371614844533601, + "loss": 2.5322, + "theoretical_loss": 3.379045040537099, + "tokens_seen": 2404507648 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013715145436308927, + "loss": 2.5485, + "theoretical_loss": 3.3790376369600414, + "tokens_seen": 2404573184 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013714142427281845, + "loss": 2.4966, + "theoretical_loss": 3.379030233641261, + "tokens_seen": 2404638720 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013713139418254763, + "loss": 2.6173, + "theoretical_loss": 3.379022830580742, + "tokens_seen": 2404704256 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001371213640922768, + "loss": 2.5525, + "theoretical_loss": 3.3790154277784676, + "tokens_seen": 2404769792 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013711133400200602, + "loss": 2.7149, + "theoretical_loss": 3.379008025234422, + "tokens_seen": 2404835328 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001371013039117352, + "loss": 2.3866, + "theoretical_loss": 3.3790006229485896, + "tokens_seen": 2404900864 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013709127382146438, + "loss": 2.4163, + "theoretical_loss": 3.378993220920954, + "tokens_seen": 2404966400 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013708124373119356, + "loss": 2.5412, + "theoretical_loss": 3.3789858191514996, + "tokens_seen": 2405031936 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013707121364092277, + "loss": 2.5524, + "theoretical_loss": 3.3789784176402096, + "tokens_seen": 2405097472 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2700072, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.785057306289673, + "objective/train/theoretical_loss": 3.3789710163870685, + "objective/train/tokens_used": 2425623008, + "theoretical_loss": 3.3789710163870685, + "tokens_seen": 2405163008 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013706118355065195, + "loss": 2.6074, + "theoretical_loss": 3.3789710163870685, + "tokens_seen": 2405163008 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013705115346038113, + "loss": 2.5888, + "theoretical_loss": 3.3789636153920597, + "tokens_seen": 2405228544 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013704112337011032, + "loss": 2.421, + "theoretical_loss": 3.378956214655168, + "tokens_seen": 2405294080 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001370310932798395, + "loss": 2.5164, + "theoretical_loss": 3.3789488141763773, + "tokens_seen": 2405359616 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001370210631895687, + "loss": 2.7771, + "theoretical_loss": 3.3789414139556704, + "tokens_seen": 2405425152 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001370110330992979, + "loss": 2.6486, + "theoretical_loss": 3.3789340139930326, + "tokens_seen": 2405490688 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001370010030090271, + "loss": 2.4556, + "theoretical_loss": 3.3789266142884475, + "tokens_seen": 2405556224 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013699097291875628, + "loss": 2.6267, + "theoretical_loss": 3.3789192148418987, + "tokens_seen": 2405621760 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013698094282848548, + "loss": 2.3805, + "theoretical_loss": 3.37891181565337, + "tokens_seen": 2405687296 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013697091273821466, + "loss": 2.5741, + "theoretical_loss": 3.378904416722847, + "tokens_seen": 2405752832 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013696088264794385, + "loss": 2.4716, + "theoretical_loss": 3.3788970180503113, + "tokens_seen": 2405818368 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013695085255767303, + "loss": 2.4773, + "theoretical_loss": 3.3788896196357485, + "tokens_seen": 2405883904 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001369408224674022, + "loss": 2.4529, + "theoretical_loss": 3.378882221479142, + "tokens_seen": 2405949440 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013693079237713142, + "loss": 2.5282, + "theoretical_loss": 3.378874823580476, + "tokens_seen": 2406014976 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001369207622868606, + "loss": 2.2383, + "theoretical_loss": 3.378867425939734, + "tokens_seen": 2406080512 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013691073219658978, + "loss": 2.346, + "theoretical_loss": 3.378860028556901, + "tokens_seen": 2406146048 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013690070210631896, + "loss": 2.5003, + "theoretical_loss": 3.37885263143196, + "tokens_seen": 2406211584 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013689067201604817, + "loss": 2.5282, + "theoretical_loss": 3.378845234564895, + "tokens_seen": 2406277120 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013688064192577735, + "loss": 2.5358, + "theoretical_loss": 3.3788378379556905, + "tokens_seen": 2406342656 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013687061183550653, + "loss": 2.4935, + "theoretical_loss": 3.3788304416043307, + "tokens_seen": 2406408192 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001368605817452357, + "loss": 2.6482, + "theoretical_loss": 3.378823045510799, + "tokens_seen": 2406473728 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001368505516549649, + "loss": 2.4819, + "theoretical_loss": 3.3788156496750794, + "tokens_seen": 2406539264 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001368405215646941, + "loss": 2.4429, + "theoretical_loss": 3.378808254097156, + "tokens_seen": 2406604800 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013683049147442328, + "loss": 2.5793, + "theoretical_loss": 3.378800858777013, + "tokens_seen": 2406670336 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013682046138415246, + "loss": 2.5201, + "theoretical_loss": 3.378793463714634, + "tokens_seen": 2406735872 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2701669, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4732956886291504, + "objective/train/theoretical_loss": 3.3787860689100038, + "objective/train/tokens_used": 2427261408, + "theoretical_loss": 3.3787860689100038, + "tokens_seen": 2406801408 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013681043129388164, + "loss": 2.4762, + "theoretical_loss": 3.3787860689100038, + "tokens_seen": 2406801408 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013680040120361085, + "loss": 2.6298, + "theoretical_loss": 3.3787786743631054, + "tokens_seen": 2406866944 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013679037111334003, + "loss": 2.5087, + "theoretical_loss": 3.3787712800739236, + "tokens_seen": 2406932480 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013678034102306921, + "loss": 2.4145, + "theoretical_loss": 3.378763886042442, + "tokens_seen": 2406998016 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001367703109327984, + "loss": 2.6061, + "theoretical_loss": 3.3787564922686446, + "tokens_seen": 2407063552 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013676028084252758, + "loss": 2.5525, + "theoretical_loss": 3.3787490987525155, + "tokens_seen": 2407129088 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013675025075225678, + "loss": 2.5819, + "theoretical_loss": 3.3787417054940385, + "tokens_seen": 2407194624 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013674022066198597, + "loss": 2.3751, + "theoretical_loss": 3.378734312493198, + "tokens_seen": 2407260160 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013673019057171515, + "loss": 2.486, + "theoretical_loss": 3.378726919749978, + "tokens_seen": 2407325696 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013672016048144433, + "loss": 2.7132, + "theoretical_loss": 3.3787195272643618, + "tokens_seen": 2407391232 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013671013039117354, + "loss": 2.3089, + "theoretical_loss": 3.378712135036334, + "tokens_seen": 2407456768 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013670010030090272, + "loss": 2.4439, + "theoretical_loss": 3.3787047430658785, + "tokens_seen": 2407522304 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001366900702106319, + "loss": 2.4473, + "theoretical_loss": 3.3786973513529794, + "tokens_seen": 2407587840 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013668004012036108, + "loss": 2.7168, + "theoretical_loss": 3.378689959897621, + "tokens_seen": 2407653376 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001366700100300903, + "loss": 2.6654, + "theoretical_loss": 3.3786825686997863, + "tokens_seen": 2407718912 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013665997993981947, + "loss": 2.6062, + "theoretical_loss": 3.3786751777594604, + "tokens_seen": 2407784448 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013664994984954865, + "loss": 2.6071, + "theoretical_loss": 3.378667787076627, + "tokens_seen": 2407849984 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013663991975927783, + "loss": 2.5815, + "theoretical_loss": 3.37866039665127, + "tokens_seen": 2407915520 + }, + { + "epoch": 8.01, + "learning_rate": 0.000136629889669007, + "loss": 2.553, + "theoretical_loss": 3.3786530064833733, + "tokens_seen": 2407981056 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013661985957873622, + "loss": 2.4063, + "theoretical_loss": 3.3786456165729213, + "tokens_seen": 2408046592 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001366098294884654, + "loss": 2.4143, + "theoretical_loss": 3.3786382269198976, + "tokens_seen": 2408112128 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013659979939819458, + "loss": 2.4948, + "theoretical_loss": 3.378630837524286, + "tokens_seen": 2408177664 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013658976930792376, + "loss": 2.5471, + "theoretical_loss": 3.3786234483860715, + "tokens_seen": 2408243200 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013657973921765297, + "loss": 2.6475, + "theoretical_loss": 3.3786160595052377, + "tokens_seen": 2408308736 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013656970912738215, + "loss": 2.4483, + "theoretical_loss": 3.378608670881768, + "tokens_seen": 2408374272 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2702403, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.673258066177368, + "objective/train/theoretical_loss": 3.3786012825156475, + "objective/train/tokens_used": 2428899808, + "theoretical_loss": 3.3786012825156475, + "tokens_seen": 2408439808 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013655967903711133, + "loss": 2.4418, + "theoretical_loss": 3.3786012825156475, + "tokens_seen": 2408439808 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013654964894684052, + "loss": 2.4194, + "theoretical_loss": 3.378593894406859, + "tokens_seen": 2408505344 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001365396188565697, + "loss": 2.5625, + "theoretical_loss": 3.378586506555388, + "tokens_seen": 2408570880 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001365295887662989, + "loss": 2.6476, + "theoretical_loss": 3.3785791189612175, + "tokens_seen": 2408636416 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013651955867602809, + "loss": 2.3863, + "theoretical_loss": 3.3785717316243318, + "tokens_seen": 2408701952 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013650952858575727, + "loss": 2.2867, + "theoretical_loss": 3.3785643445447144, + "tokens_seen": 2408767488 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013649949849548645, + "loss": 2.5018, + "theoretical_loss": 3.3785569577223504, + "tokens_seen": 2408833024 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013648946840521566, + "loss": 2.6205, + "theoretical_loss": 3.3785495711572233, + "tokens_seen": 2408898560 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013647943831494484, + "loss": 2.4057, + "theoretical_loss": 3.378542184849317, + "tokens_seen": 2408964096 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013646940822467402, + "loss": 2.1565, + "theoretical_loss": 3.378534798798616, + "tokens_seen": 2409029632 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001364593781344032, + "loss": 2.4484, + "theoretical_loss": 3.378527413005104, + "tokens_seen": 2409095168 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013644934804413238, + "loss": 2.5824, + "theoretical_loss": 3.378520027468765, + "tokens_seen": 2409160704 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001364393179538616, + "loss": 2.4296, + "theoretical_loss": 3.378512642189583, + "tokens_seen": 2409226240 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013642928786359077, + "loss": 2.6816, + "theoretical_loss": 3.378505257167542, + "tokens_seen": 2409291776 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013641925777331995, + "loss": 2.5485, + "theoretical_loss": 3.378497872402627, + "tokens_seen": 2409357312 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013640922768304913, + "loss": 2.4114, + "theoretical_loss": 3.3784904878948208, + "tokens_seen": 2409422848 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013639919759277834, + "loss": 2.6529, + "theoretical_loss": 3.3784831036441076, + "tokens_seen": 2409488384 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013638916750250752, + "loss": 2.6071, + "theoretical_loss": 3.3784757196504724, + "tokens_seen": 2409553920 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001363791374122367, + "loss": 2.5875, + "theoretical_loss": 3.3784683359138987, + "tokens_seen": 2409619456 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013636910732196588, + "loss": 2.3932, + "theoretical_loss": 3.3784609524343705, + "tokens_seen": 2409684992 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013635907723169507, + "loss": 2.6148, + "theoretical_loss": 3.378453569211872, + "tokens_seen": 2409750528 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013634904714142427, + "loss": 2.6844, + "theoretical_loss": 3.3784461862463866, + "tokens_seen": 2409816064 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013633901705115345, + "loss": 2.468, + "theoretical_loss": 3.3784388035378994, + "tokens_seen": 2409881600 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013632898696088264, + "loss": 2.3036, + "theoretical_loss": 3.3784314210863937, + "tokens_seen": 2409947136 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013631895687061182, + "loss": 2.4805, + "theoretical_loss": 3.378424038891854, + "tokens_seen": 2410012672 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2703734, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.812814474105835, + "objective/train/theoretical_loss": 3.3784166569542644, + "objective/train/tokens_used": 2430538208, + "theoretical_loss": 3.3784166569542644, + "tokens_seen": 2410078208 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013630892678034103, + "loss": 2.4145, + "theoretical_loss": 3.3784166569542644, + "tokens_seen": 2410078208 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001362988966900702, + "loss": 2.3991, + "theoretical_loss": 3.378409275273609, + "tokens_seen": 2410143744 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001362888665997994, + "loss": 2.4081, + "theoretical_loss": 3.3784018938498708, + "tokens_seen": 2410209280 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013627883650952857, + "loss": 2.543, + "theoretical_loss": 3.3783945126830357, + "tokens_seen": 2410274816 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013626880641925775, + "loss": 2.3116, + "theoretical_loss": 3.3783871317730862, + "tokens_seen": 2410340352 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013625877632898699, + "loss": 2.4834, + "theoretical_loss": 3.378379751120007, + "tokens_seen": 2410405888 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013624874623871617, + "loss": 2.5506, + "theoretical_loss": 3.3783723707237825, + "tokens_seen": 2410471424 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013623871614844535, + "loss": 2.5007, + "theoretical_loss": 3.378364990584396, + "tokens_seen": 2410536960 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013622868605817453, + "loss": 2.4367, + "theoretical_loss": 3.3783576107018325, + "tokens_seen": 2410602496 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013621865596790374, + "loss": 2.5455, + "theoretical_loss": 3.3783502310760753, + "tokens_seen": 2410668032 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013620862587763292, + "loss": 2.5867, + "theoretical_loss": 3.3783428517071092, + "tokens_seen": 2410733568 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001361985957873621, + "loss": 2.5414, + "theoretical_loss": 3.3783354725949177, + "tokens_seen": 2410799104 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013618856569709128, + "loss": 2.4076, + "theoretical_loss": 3.378328093739485, + "tokens_seen": 2410864640 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001361785356068205, + "loss": 2.3729, + "theoretical_loss": 3.378320715140795, + "tokens_seen": 2410930176 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013616850551654967, + "loss": 2.3928, + "theoretical_loss": 3.3783133367988323, + "tokens_seen": 2410995712 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013615847542627885, + "loss": 2.6591, + "theoretical_loss": 3.3783059587135806, + "tokens_seen": 2411061248 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013614844533600803, + "loss": 2.4284, + "theoretical_loss": 3.378298580885024, + "tokens_seen": 2411126784 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001361384152457372, + "loss": 2.5725, + "theoretical_loss": 3.378291203313147, + "tokens_seen": 2411192320 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013612838515546642, + "loss": 2.5296, + "theoretical_loss": 3.3782838259979333, + "tokens_seen": 2411257856 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001361183550651956, + "loss": 2.4681, + "theoretical_loss": 3.378276448939367, + "tokens_seen": 2411323392 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013610832497492478, + "loss": 2.4928, + "theoretical_loss": 3.3782690721374324, + "tokens_seen": 2411388928 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013609829488465396, + "loss": 2.3952, + "theoretical_loss": 3.3782616955921134, + "tokens_seen": 2411454464 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013608826479438317, + "loss": 2.3485, + "theoretical_loss": 3.3782543193033945, + "tokens_seen": 2411520000 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013607823470411235, + "loss": 2.5832, + "theoretical_loss": 3.378246943271259, + "tokens_seen": 2411585536 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013606820461384153, + "loss": 2.396, + "theoretical_loss": 3.378239567495692, + "tokens_seen": 2411651072 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2704401, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2718825340270996, + "objective/train/theoretical_loss": 3.378232191976677, + "objective/train/tokens_used": 2432176608, + "theoretical_loss": 3.378232191976677, + "tokens_seen": 2411716608 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013605817452357072, + "loss": 2.5831, + "theoretical_loss": 3.378232191976677, + "tokens_seen": 2411716608 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001360481444332999, + "loss": 2.556, + "theoretical_loss": 3.3782248167141975, + "tokens_seen": 2411782144 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001360381143430291, + "loss": 2.4318, + "theoretical_loss": 3.378217441708239, + "tokens_seen": 2411847680 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013602808425275829, + "loss": 2.6405, + "theoretical_loss": 3.3782100669587845, + "tokens_seen": 2411913216 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013601805416248747, + "loss": 2.4923, + "theoretical_loss": 3.378202692465819, + "tokens_seen": 2411978752 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013600802407221665, + "loss": 2.6644, + "theoretical_loss": 3.378195318229326, + "tokens_seen": 2412044288 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013599799398194586, + "loss": 2.7242, + "theoretical_loss": 3.3781879442492895, + "tokens_seen": 2412109824 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013598796389167504, + "loss": 2.5881, + "theoretical_loss": 3.378180570525694, + "tokens_seen": 2412175360 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013597793380140422, + "loss": 2.6773, + "theoretical_loss": 3.3781731970585236, + "tokens_seen": 2412240896 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001359679037111334, + "loss": 2.4302, + "theoretical_loss": 3.378165823847762, + "tokens_seen": 2412306432 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013595787362086258, + "loss": 2.4467, + "theoretical_loss": 3.378158450893394, + "tokens_seen": 2412371968 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001359478435305918, + "loss": 2.5529, + "theoretical_loss": 3.378151078195403, + "tokens_seen": 2412437504 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013593781344032097, + "loss": 2.6194, + "theoretical_loss": 3.3781437057537738, + "tokens_seen": 2412503040 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013592778335005015, + "loss": 2.462, + "theoretical_loss": 3.3781363335684897, + "tokens_seen": 2412568576 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013591775325977933, + "loss": 2.5398, + "theoretical_loss": 3.378128961639536, + "tokens_seen": 2412634112 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013590772316950854, + "loss": 2.6418, + "theoretical_loss": 3.3781215899668955, + "tokens_seen": 2412699648 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013589769307923772, + "loss": 2.3422, + "theoretical_loss": 3.378114218550553, + "tokens_seen": 2412765184 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001358876629889669, + "loss": 2.2055, + "theoretical_loss": 3.3781068473904927, + "tokens_seen": 2412830720 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013587763289869608, + "loss": 2.5493, + "theoretical_loss": 3.3780994764866987, + "tokens_seen": 2412896256 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013586760280842527, + "loss": 2.4886, + "theoretical_loss": 3.378092105839155, + "tokens_seen": 2412961792 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013585757271815447, + "loss": 2.6148, + "theoretical_loss": 3.3780847354478456, + "tokens_seen": 2413027328 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013584754262788365, + "loss": 2.51, + "theoretical_loss": 3.378077365312755, + "tokens_seen": 2413092864 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013583751253761284, + "loss": 2.3583, + "theoretical_loss": 3.378069995433867, + "tokens_seen": 2413158400 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013582748244734202, + "loss": 2.4289, + "theoretical_loss": 3.378062625811166, + "tokens_seen": 2413223936 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013581745235707123, + "loss": 2.72, + "theoretical_loss": 3.378055256444636, + "tokens_seen": 2413289472 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2705933, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.810112953186035, + "objective/train/theoretical_loss": 3.3780478873342608, + "objective/train/tokens_used": 2433815008, + "theoretical_loss": 3.3780478873342608, + "tokens_seen": 2413355008 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001358074222668004, + "loss": 2.6274, + "theoretical_loss": 3.3780478873342608, + "tokens_seen": 2413355008 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001357973921765296, + "loss": 2.6344, + "theoretical_loss": 3.3780405184800255, + "tokens_seen": 2413420544 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013578736208625877, + "loss": 2.4081, + "theoretical_loss": 3.378033149881913, + "tokens_seen": 2413486080 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013577733199598795, + "loss": 2.4095, + "theoretical_loss": 3.3780257815399084, + "tokens_seen": 2413551616 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013576730190571716, + "loss": 2.4083, + "theoretical_loss": 3.3780184134539954, + "tokens_seen": 2413617152 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013575727181544634, + "loss": 2.5095, + "theoretical_loss": 3.3780110456241585, + "tokens_seen": 2413682688 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013574724172517552, + "loss": 2.6631, + "theoretical_loss": 3.3780036780503817, + "tokens_seen": 2413748224 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001357372116349047, + "loss": 2.5288, + "theoretical_loss": 3.377996310732649, + "tokens_seen": 2413813760 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001357271815446339, + "loss": 2.6128, + "theoretical_loss": 3.3779889436709443, + "tokens_seen": 2413879296 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001357171514543631, + "loss": 2.6436, + "theoretical_loss": 3.377981576865252, + "tokens_seen": 2413944832 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013570712136409227, + "loss": 2.5404, + "theoretical_loss": 3.3779742103155566, + "tokens_seen": 2414010368 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013569709127382145, + "loss": 2.4541, + "theoretical_loss": 3.377966844021842, + "tokens_seen": 2414075904 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013568706118355063, + "loss": 2.6417, + "theoretical_loss": 3.377959477984092, + "tokens_seen": 2414141440 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013567703109327984, + "loss": 2.3267, + "theoretical_loss": 3.3779521122022915, + "tokens_seen": 2414206976 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013566700100300902, + "loss": 2.3196, + "theoretical_loss": 3.377944746676424, + "tokens_seen": 2414272512 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001356569709127382, + "loss": 2.5432, + "theoretical_loss": 3.377937381406474, + "tokens_seen": 2414338048 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013564694082246739, + "loss": 2.5627, + "theoretical_loss": 3.3779300163924257, + "tokens_seen": 2414403584 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001356369107321966, + "loss": 2.5021, + "theoretical_loss": 3.377922651634263, + "tokens_seen": 2414469120 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013562688064192578, + "loss": 2.3469, + "theoretical_loss": 3.3779152871319704, + "tokens_seen": 2414534656 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013561685055165496, + "loss": 2.5979, + "theoretical_loss": 3.3779079228855315, + "tokens_seen": 2414600192 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013560682046138414, + "loss": 2.5629, + "theoretical_loss": 3.3779005588949307, + "tokens_seen": 2414665728 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013559679037111335, + "loss": 2.5371, + "theoretical_loss": 3.3778931951601527, + "tokens_seen": 2414731264 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013558676028084253, + "loss": 2.51, + "theoretical_loss": 3.3778858316811813, + "tokens_seen": 2414796800 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001355767301905717, + "loss": 2.3842, + "theoretical_loss": 3.3778784684580003, + "tokens_seen": 2414862336 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001355667001003009, + "loss": 2.5313, + "theoretical_loss": 3.3778711054905943, + "tokens_seen": 2414927872 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2706672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7994749546051025, + "objective/train/theoretical_loss": 3.3778637427789477, + "objective/train/tokens_used": 2435453408, + "theoretical_loss": 3.3778637427789477, + "tokens_seen": 2414993408 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013555667001003007, + "loss": 2.7276, + "theoretical_loss": 3.3778637427789477, + "tokens_seen": 2414993408 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013554663991975928, + "loss": 2.3452, + "theoretical_loss": 3.377856380323044, + "tokens_seen": 2415058944 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013553660982948846, + "loss": 2.2984, + "theoretical_loss": 3.377849018122868, + "tokens_seen": 2415124480 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013552657973921764, + "loss": 2.5824, + "theoretical_loss": 3.3778416561784033, + "tokens_seen": 2415190016 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013551654964894682, + "loss": 2.4544, + "theoretical_loss": 3.377834294489635, + "tokens_seen": 2415255552 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013550651955867606, + "loss": 2.6344, + "theoretical_loss": 3.377826933056546, + "tokens_seen": 2415321088 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013549648946840524, + "loss": 2.4683, + "theoretical_loss": 3.3778195718791215, + "tokens_seen": 2415386624 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013548645937813442, + "loss": 2.2731, + "theoretical_loss": 3.3778122109573454, + "tokens_seen": 2415452160 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001354764292878636, + "loss": 2.477, + "theoretical_loss": 3.3778048502912017, + "tokens_seen": 2415517696 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013546639919759278, + "loss": 2.5527, + "theoretical_loss": 3.377797489880675, + "tokens_seen": 2415583232 + }, + { + "epoch": 8.01, + "learning_rate": 0.000135456369107322, + "loss": 2.3298, + "theoretical_loss": 3.3777901297257484, + "tokens_seen": 2415648768 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013544633901705117, + "loss": 2.4347, + "theoretical_loss": 3.3777827698264073, + "tokens_seen": 2415714304 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013543630892678035, + "loss": 2.6306, + "theoretical_loss": 3.377775410182636, + "tokens_seen": 2415779840 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013542627883650953, + "loss": 2.4701, + "theoretical_loss": 3.3777680507944177, + "tokens_seen": 2415845376 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013541624874623874, + "loss": 2.5491, + "theoretical_loss": 3.377760691661737, + "tokens_seen": 2415910912 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013540621865596792, + "loss": 2.3589, + "theoretical_loss": 3.3777533327845783, + "tokens_seen": 2415976448 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001353961885656971, + "loss": 2.4348, + "theoretical_loss": 3.377745974162926, + "tokens_seen": 2416041984 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013538615847542628, + "loss": 2.6802, + "theoretical_loss": 3.3777386157967637, + "tokens_seen": 2416107520 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013537612838515547, + "loss": 2.4452, + "theoretical_loss": 3.377731257686076, + "tokens_seen": 2416173056 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013536609829488467, + "loss": 2.6189, + "theoretical_loss": 3.3777238998308468, + "tokens_seen": 2416238592 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013535606820461386, + "loss": 2.5319, + "theoretical_loss": 3.3777165422310604, + "tokens_seen": 2416304128 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013534603811434304, + "loss": 2.6077, + "theoretical_loss": 3.377709184886701, + "tokens_seen": 2416369664 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013533600802407222, + "loss": 2.3718, + "theoretical_loss": 3.377701827797753, + "tokens_seen": 2416435200 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013532597793380143, + "loss": 2.4822, + "theoretical_loss": 3.3776944709642005, + "tokens_seen": 2416500736 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001353159478435306, + "loss": 2.36, + "theoretical_loss": 3.3776871143860276, + "tokens_seen": 2416566272 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2707493, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7526471614837646, + "objective/train/theoretical_loss": 3.377679758063219, + "objective/train/tokens_used": 2437091808, + "theoretical_loss": 3.377679758063219, + "tokens_seen": 2416631808 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001353059177532598, + "loss": 2.4048, + "theoretical_loss": 3.377679758063219, + "tokens_seen": 2416631808 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013529588766298897, + "loss": 2.348, + "theoretical_loss": 3.377672401995758, + "tokens_seen": 2416697344 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013528585757271815, + "loss": 2.5145, + "theoretical_loss": 3.3776650461836297, + "tokens_seen": 2416762880 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013527582748244736, + "loss": 2.3804, + "theoretical_loss": 3.3776576906268176, + "tokens_seen": 2416828416 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013526579739217654, + "loss": 2.4357, + "theoretical_loss": 3.3776503353253067, + "tokens_seen": 2416893952 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013525576730190572, + "loss": 2.5194, + "theoretical_loss": 3.3776429802790804, + "tokens_seen": 2416959488 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001352457372116349, + "loss": 2.4854, + "theoretical_loss": 3.3776356254881232, + "tokens_seen": 2417025024 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001352357071213641, + "loss": 2.416, + "theoretical_loss": 3.3776282709524197, + "tokens_seen": 2417090560 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001352256770310933, + "loss": 2.2904, + "theoretical_loss": 3.3776209166719537, + "tokens_seen": 2417156096 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013521564694082247, + "loss": 2.4417, + "theoretical_loss": 3.37761356264671, + "tokens_seen": 2417221632 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013520561685055165, + "loss": 2.5324, + "theoretical_loss": 3.3776062088766716, + "tokens_seen": 2417287168 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013519558676028083, + "loss": 2.3392, + "theoretical_loss": 3.377598855361824, + "tokens_seen": 2417352704 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013518555667001004, + "loss": 2.4276, + "theoretical_loss": 3.3775915021021508, + "tokens_seen": 2417418240 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013517552657973922, + "loss": 2.599, + "theoretical_loss": 3.3775841490976366, + "tokens_seen": 2417483776 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001351654964894684, + "loss": 2.3749, + "theoretical_loss": 3.377576796348265, + "tokens_seen": 2417549312 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013515546639919759, + "loss": 2.4077, + "theoretical_loss": 3.3775694438540205, + "tokens_seen": 2417614848 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001351454363089268, + "loss": 2.573, + "theoretical_loss": 3.377562091614888, + "tokens_seen": 2417680384 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013513540621865598, + "loss": 2.3962, + "theoretical_loss": 3.3775547396308507, + "tokens_seen": 2417745920 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013512537612838516, + "loss": 2.5223, + "theoretical_loss": 3.3775473879018936, + "tokens_seen": 2417811456 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013511534603811434, + "loss": 2.3042, + "theoretical_loss": 3.3775400364280004, + "tokens_seen": 2417876992 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013510531594784355, + "loss": 2.5221, + "theoretical_loss": 3.3775326852091556, + "tokens_seen": 2417942528 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013509528585757273, + "loss": 2.427, + "theoretical_loss": 3.3775253342453437, + "tokens_seen": 2418008064 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001350852557673019, + "loss": 2.4301, + "theoretical_loss": 3.3775179835365488, + "tokens_seen": 2418073600 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001350752256770311, + "loss": 2.5028, + "theoretical_loss": 3.3775106330827547, + "tokens_seen": 2418139136 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013506519558676027, + "loss": 2.4122, + "theoretical_loss": 3.377503282883946, + "tokens_seen": 2418204672 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2708995, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3458614349365234, + "objective/train/theoretical_loss": 3.3774959329401066, + "objective/train/tokens_used": 2438730208, + "theoretical_loss": 3.3774959329401066, + "tokens_seen": 2418270208 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013505516549648948, + "loss": 2.5779, + "theoretical_loss": 3.3774959329401066, + "tokens_seen": 2418270208 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013504513540621866, + "loss": 2.5654, + "theoretical_loss": 3.3774885832512216, + "tokens_seen": 2418335744 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013503510531594784, + "loss": 2.4586, + "theoretical_loss": 3.3774812338172744, + "tokens_seen": 2418401280 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013502507522567702, + "loss": 2.2968, + "theoretical_loss": 3.3774738846382495, + "tokens_seen": 2418466816 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013501504513540623, + "loss": 2.4428, + "theoretical_loss": 3.3774665357141314, + "tokens_seen": 2418532352 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001350050150451354, + "loss": 2.5052, + "theoretical_loss": 3.377459187044904, + "tokens_seen": 2418597888 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001349949849548646, + "loss": 2.6399, + "theoretical_loss": 3.3774518386305514, + "tokens_seen": 2418663424 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013498495486459377, + "loss": 2.3179, + "theoretical_loss": 3.3774444904710585, + "tokens_seen": 2418728960 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013497492477432295, + "loss": 2.468, + "theoretical_loss": 3.377437142566409, + "tokens_seen": 2418794496 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013496489468405216, + "loss": 2.4373, + "theoretical_loss": 3.3774297949165875, + "tokens_seen": 2418860032 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013495486459378134, + "loss": 2.3929, + "theoretical_loss": 3.377422447521578, + "tokens_seen": 2418925568 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013494483450351052, + "loss": 2.5409, + "theoretical_loss": 3.3774151003813655, + "tokens_seen": 2418991104 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001349348044132397, + "loss": 2.2684, + "theoretical_loss": 3.377407753495933, + "tokens_seen": 2419056640 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013492477432296891, + "loss": 2.5292, + "theoretical_loss": 3.3774004068652657, + "tokens_seen": 2419122176 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001349147442326981, + "loss": 2.5757, + "theoretical_loss": 3.3773930604893474, + "tokens_seen": 2419187712 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013490471414242728, + "loss": 2.4953, + "theoretical_loss": 3.3773857143681623, + "tokens_seen": 2419253248 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013489468405215646, + "loss": 2.3963, + "theoretical_loss": 3.377378368501695, + "tokens_seen": 2419318784 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013488465396188564, + "loss": 2.5512, + "theoretical_loss": 3.37737102288993, + "tokens_seen": 2419384320 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013487462387161485, + "loss": 2.3212, + "theoretical_loss": 3.377363677532851, + "tokens_seen": 2419449856 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013486459378134403, + "loss": 2.5785, + "theoretical_loss": 3.3773563324304425, + "tokens_seen": 2419515392 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001348545636910732, + "loss": 2.501, + "theoretical_loss": 3.377348987582689, + "tokens_seen": 2419580928 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001348445336008024, + "loss": 2.5728, + "theoretical_loss": 3.3773416429895744, + "tokens_seen": 2419646464 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001348345035105316, + "loss": 2.6577, + "theoretical_loss": 3.3773342986510833, + "tokens_seen": 2419712000 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013482447342026078, + "loss": 2.4024, + "theoretical_loss": 3.3773269545671996, + "tokens_seen": 2419777536 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013481444332998996, + "loss": 2.5146, + "theoretical_loss": 3.3773196107379078, + "tokens_seen": 2419843072 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2709690, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4343132972717285, + "objective/train/theoretical_loss": 3.377312267163192, + "objective/train/tokens_used": 2440368608, + "theoretical_loss": 3.377312267163192, + "tokens_seen": 2419908608 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013480441323971914, + "loss": 2.5092, + "theoretical_loss": 3.377312267163192, + "tokens_seen": 2419908608 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013479438314944832, + "loss": 2.6473, + "theoretical_loss": 3.3773049238430373, + "tokens_seen": 2419974144 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013478435305917753, + "loss": 2.285, + "theoretical_loss": 3.3772975807774266, + "tokens_seen": 2420039680 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001347743229689067, + "loss": 2.5425, + "theoretical_loss": 3.377290237966345, + "tokens_seen": 2420105216 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001347642928786359, + "loss": 2.5349, + "theoretical_loss": 3.377282895409777, + "tokens_seen": 2420170752 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001347542627883651, + "loss": 2.5008, + "theoretical_loss": 3.3772755531077063, + "tokens_seen": 2420236288 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001347442326980943, + "loss": 2.5455, + "theoretical_loss": 3.3772682110601178, + "tokens_seen": 2420301824 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001347342026078235, + "loss": 2.641, + "theoretical_loss": 3.3772608692669954, + "tokens_seen": 2420367360 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013472417251755267, + "loss": 2.5018, + "theoretical_loss": 3.3772535277283233, + "tokens_seen": 2420432896 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013471414242728185, + "loss": 2.3321, + "theoretical_loss": 3.3772461864440864, + "tokens_seen": 2420498432 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013470411233701103, + "loss": 2.4514, + "theoretical_loss": 3.377238845414268, + "tokens_seen": 2420563968 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013469408224674024, + "loss": 2.436, + "theoretical_loss": 3.377231504638853, + "tokens_seen": 2420629504 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013468405215646942, + "loss": 2.5615, + "theoretical_loss": 3.377224164117826, + "tokens_seen": 2420695040 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001346740220661986, + "loss": 2.6759, + "theoretical_loss": 3.37721682385117, + "tokens_seen": 2420760576 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013466399197592779, + "loss": 2.4344, + "theoretical_loss": 3.3772094838388713, + "tokens_seen": 2420826112 + }, + { + "epoch": 8.01, + "learning_rate": 0.000134653961885657, + "loss": 2.3957, + "theoretical_loss": 3.3772021440809126, + "tokens_seen": 2420891648 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013464393179538618, + "loss": 2.5139, + "theoretical_loss": 3.377194804577279, + "tokens_seen": 2420957184 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013463390170511536, + "loss": 2.3895, + "theoretical_loss": 3.377187465327954, + "tokens_seen": 2421022720 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013462387161484454, + "loss": 2.73, + "theoretical_loss": 3.377180126332923, + "tokens_seen": 2421088256 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013461384152457375, + "loss": 2.5713, + "theoretical_loss": 3.3771727875921695, + "tokens_seen": 2421153792 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013460381143430293, + "loss": 2.4446, + "theoretical_loss": 3.377165449105678, + "tokens_seen": 2421219328 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001345937813440321, + "loss": 2.5692, + "theoretical_loss": 3.377158110873433, + "tokens_seen": 2421284864 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001345837512537613, + "loss": 2.7278, + "theoretical_loss": 3.3771507728954187, + "tokens_seen": 2421350400 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013457372116349047, + "loss": 2.6281, + "theoretical_loss": 3.3771434351716194, + "tokens_seen": 2421415936 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013456369107321968, + "loss": 2.453, + "theoretical_loss": 3.3771360977020195, + "tokens_seen": 2421481472 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2710987, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4513354301452637, + "objective/train/theoretical_loss": 3.377128760486603, + "objective/train/tokens_used": 2442007008, + "theoretical_loss": 3.377128760486603, + "tokens_seen": 2421547008 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013455366098294886, + "loss": 2.3717, + "theoretical_loss": 3.377128760486603, + "tokens_seen": 2421547008 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013454363089267804, + "loss": 2.5496, + "theoretical_loss": 3.377121423525354, + "tokens_seen": 2421612544 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013453360080240722, + "loss": 2.5095, + "theoretical_loss": 3.377114086818258, + "tokens_seen": 2421678080 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013452357071213643, + "loss": 2.379, + "theoretical_loss": 3.3771067503652983, + "tokens_seen": 2421743616 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001345135406218656, + "loss": 2.5677, + "theoretical_loss": 3.377099414166459, + "tokens_seen": 2421809152 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001345035105315948, + "loss": 2.3569, + "theoretical_loss": 3.3770920782217257, + "tokens_seen": 2421874688 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013449348044132397, + "loss": 2.3683, + "theoretical_loss": 3.3770847425310815, + "tokens_seen": 2421940224 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013448345035105315, + "loss": 2.2607, + "theoretical_loss": 3.3770774070945113, + "tokens_seen": 2422005760 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013447342026078236, + "loss": 2.3141, + "theoretical_loss": 3.3770700719119993, + "tokens_seen": 2422071296 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013446339017051154, + "loss": 2.5068, + "theoretical_loss": 3.3770627369835298, + "tokens_seen": 2422136832 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013445336008024073, + "loss": 2.4663, + "theoretical_loss": 3.377055402309087, + "tokens_seen": 2422202368 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001344433299899699, + "loss": 2.5302, + "theoretical_loss": 3.377048067888655, + "tokens_seen": 2422267904 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013443329989969911, + "loss": 2.4189, + "theoretical_loss": 3.377040733722219, + "tokens_seen": 2422333440 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001344232698094283, + "loss": 2.2955, + "theoretical_loss": 3.3770333998097626, + "tokens_seen": 2422398976 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013441323971915748, + "loss": 2.5032, + "theoretical_loss": 3.3770260661512705, + "tokens_seen": 2422464512 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013440320962888666, + "loss": 2.5334, + "theoretical_loss": 3.377018732746727, + "tokens_seen": 2422530048 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013439317953861584, + "loss": 2.4401, + "theoretical_loss": 3.377011399596116, + "tokens_seen": 2422595584 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013438314944834505, + "loss": 2.3216, + "theoretical_loss": 3.3770040666994223, + "tokens_seen": 2422661120 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013437311935807423, + "loss": 2.5323, + "theoretical_loss": 3.3769967340566303, + "tokens_seen": 2422726656 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001343630892678034, + "loss": 2.5114, + "theoretical_loss": 3.376989401667724, + "tokens_seen": 2422792192 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001343530591775326, + "loss": 2.4835, + "theoretical_loss": 3.376982069532688, + "tokens_seen": 2422857728 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001343430290872618, + "loss": 2.5818, + "theoretical_loss": 3.3769747376515062, + "tokens_seen": 2422923264 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013433299899699098, + "loss": 2.3597, + "theoretical_loss": 3.3769674060241637, + "tokens_seen": 2422988800 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013432296890672016, + "loss": 2.4903, + "theoretical_loss": 3.3769600746506443, + "tokens_seen": 2423054336 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013431293881644934, + "loss": 2.3393, + "theoretical_loss": 3.376952743530932, + "tokens_seen": 2423119872 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 2711679, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9998180866241455, + "objective/train/theoretical_loss": 3.376945412665012, + "objective/train/tokens_used": 2443645408, + "theoretical_loss": 3.376945412665012, + "tokens_seen": 2423185408 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013430290872617852, + "loss": 2.6533, + "theoretical_loss": 3.376945412665012, + "tokens_seen": 2423185408 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013429287863590773, + "loss": 2.446, + "theoretical_loss": 3.3769380820528685, + "tokens_seen": 2423250944 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001342828485456369, + "loss": 2.4938, + "theoretical_loss": 3.376930751694485, + "tokens_seen": 2423316480 + }, + { + "epoch": 8.01, + "learning_rate": 0.0001342728184553661, + "loss": 2.435, + "theoretical_loss": 3.376923421589847, + "tokens_seen": 2423382016 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013426278836509527, + "loss": 2.5113, + "theoretical_loss": 3.3769160917389383, + "tokens_seen": 2423447552 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013425275827482448, + "loss": 2.2935, + "theoretical_loss": 3.3769087621417433, + "tokens_seen": 2423513088 + }, + { + "epoch": 8.01, + "learning_rate": 0.00013424272818455366, + "loss": 2.2195, + "theoretical_loss": 3.3769014327982463, + "tokens_seen": 2423578624 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013423269809428285, + "loss": 2.3973, + "theoretical_loss": 3.3768941037084317, + "tokens_seen": 2423644160 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013422266800401203, + "loss": 2.5646, + "theoretical_loss": 3.376886774872284, + "tokens_seen": 2423709696 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001342126379137412, + "loss": 2.4411, + "theoretical_loss": 3.376879446289787, + "tokens_seen": 2423775232 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013420260782347042, + "loss": 2.6326, + "theoretical_loss": 3.376872117960926, + "tokens_seen": 2423840768 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001341925777331996, + "loss": 2.3168, + "theoretical_loss": 3.3768647898856843, + "tokens_seen": 2423906304 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013418254764292878, + "loss": 2.5529, + "theoretical_loss": 3.3768574620640472, + "tokens_seen": 2423971840 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013417251755265796, + "loss": 2.3732, + "theoretical_loss": 3.3768501344959985, + "tokens_seen": 2424037376 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013416248746238717, + "loss": 2.4033, + "theoretical_loss": 3.376842807181523, + "tokens_seen": 2424102912 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013415245737211635, + "loss": 2.333, + "theoretical_loss": 3.3768354801206044, + "tokens_seen": 2424168448 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013414242728184553, + "loss": 2.3528, + "theoretical_loss": 3.376828153313228, + "tokens_seen": 2424233984 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001341323971915747, + "loss": 2.4559, + "theoretical_loss": 3.3768208267593773, + "tokens_seen": 2424299520 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001341223671013039, + "loss": 2.5064, + "theoretical_loss": 3.376813500459037, + "tokens_seen": 2424365056 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001341123370110331, + "loss": 2.5945, + "theoretical_loss": 3.376806174412192, + "tokens_seen": 2424430592 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013410230692076228, + "loss": 2.4901, + "theoretical_loss": 3.376798848618826, + "tokens_seen": 2424496128 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013409227683049146, + "loss": 2.4935, + "theoretical_loss": 3.3767915230789236, + "tokens_seen": 2424561664 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013408224674022064, + "loss": 2.6849, + "theoretical_loss": 3.376784197792469, + "tokens_seen": 2424627200 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013407221664994985, + "loss": 2.4754, + "theoretical_loss": 3.3767768727594465, + "tokens_seen": 2424692736 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013406218655967903, + "loss": 2.7323, + "theoretical_loss": 3.3767695479798414, + "tokens_seen": 2424758272 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2713299, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2632291316986084, + "objective/train/theoretical_loss": 3.3767622234536367, + "objective/train/tokens_used": 2445283808, + "theoretical_loss": 3.3767622234536367, + "tokens_seen": 2424823808 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013405215646940821, + "loss": 2.5576, + "theoretical_loss": 3.3767622234536367, + "tokens_seen": 2424823808 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001340421263791374, + "loss": 2.5742, + "theoretical_loss": 3.376754899180818, + "tokens_seen": 2424889344 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013403209628886658, + "loss": 2.2809, + "theoretical_loss": 3.376747575161369, + "tokens_seen": 2424954880 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013402206619859578, + "loss": 2.3724, + "theoretical_loss": 3.3767402513952742, + "tokens_seen": 2425020416 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013401203610832497, + "loss": 2.5343, + "theoretical_loss": 3.376732927882518, + "tokens_seen": 2425085952 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013400200601805417, + "loss": 2.3855, + "theoretical_loss": 3.376725604623085, + "tokens_seen": 2425151488 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013399197592778335, + "loss": 2.5929, + "theoretical_loss": 3.3767182816169594, + "tokens_seen": 2425217024 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013398194583751256, + "loss": 2.6679, + "theoretical_loss": 3.3767109588641255, + "tokens_seen": 2425282560 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013397191574724174, + "loss": 2.4027, + "theoretical_loss": 3.376703636364568, + "tokens_seen": 2425348096 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013396188565697093, + "loss": 2.5412, + "theoretical_loss": 3.376696314118271, + "tokens_seen": 2425413632 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001339518555667001, + "loss": 2.55, + "theoretical_loss": 3.376688992125219, + "tokens_seen": 2425479168 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013394182547642931, + "loss": 2.5317, + "theoretical_loss": 3.3766816703853966, + "tokens_seen": 2425544704 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001339317953861585, + "loss": 2.4085, + "theoretical_loss": 3.3766743488987876, + "tokens_seen": 2425610240 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013392176529588768, + "loss": 2.529, + "theoretical_loss": 3.3766670276653774, + "tokens_seen": 2425675776 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013391173520561686, + "loss": 2.4148, + "theoretical_loss": 3.3766597066851496, + "tokens_seen": 2425741312 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013390170511534604, + "loss": 2.372, + "theoretical_loss": 3.3766523859580886, + "tokens_seen": 2425806848 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013389167502507525, + "loss": 2.4419, + "theoretical_loss": 3.376645065484179, + "tokens_seen": 2425872384 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013388164493480443, + "loss": 2.3598, + "theoretical_loss": 3.3766377452634053, + "tokens_seen": 2425937920 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001338716148445336, + "loss": 2.5624, + "theoretical_loss": 3.3766304252957524, + "tokens_seen": 2426003456 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001338615847542628, + "loss": 2.7137, + "theoretical_loss": 3.3766231055812037, + "tokens_seen": 2426068992 + }, + { + "epoch": 8.02, + "learning_rate": 0.000133851554663992, + "loss": 2.1888, + "theoretical_loss": 3.376615786119744, + "tokens_seen": 2426134528 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013384152457372118, + "loss": 2.3892, + "theoretical_loss": 3.3766084669113576, + "tokens_seen": 2426200064 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013383149448345036, + "loss": 2.3993, + "theoretical_loss": 3.3766011479560296, + "tokens_seen": 2426265600 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013382146439317954, + "loss": 2.4708, + "theoretical_loss": 3.3765938292537436, + "tokens_seen": 2426331136 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013381143430290872, + "loss": 2.4921, + "theoretical_loss": 3.3765865108044846, + "tokens_seen": 2426396672 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2714020, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3106296062469482, + "objective/train/theoretical_loss": 3.3765791926082365, + "objective/train/tokens_used": 2446922208, + "theoretical_loss": 3.3765791926082365, + "tokens_seen": 2426462208 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013380140421263793, + "loss": 2.4075, + "theoretical_loss": 3.3765791926082365, + "tokens_seen": 2426462208 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001337913741223671, + "loss": 2.4797, + "theoretical_loss": 3.376571874664984, + "tokens_seen": 2426527744 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001337813440320963, + "loss": 2.3436, + "theoretical_loss": 3.376564556974712, + "tokens_seen": 2426593280 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013377131394182548, + "loss": 2.6629, + "theoretical_loss": 3.3765572395374033, + "tokens_seen": 2426658816 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013376128385155468, + "loss": 2.5803, + "theoretical_loss": 3.3765499223530444, + "tokens_seen": 2426724352 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013375125376128386, + "loss": 2.6366, + "theoretical_loss": 3.3765426054216183, + "tokens_seen": 2426789888 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013374122367101305, + "loss": 2.6692, + "theoretical_loss": 3.37653528874311, + "tokens_seen": 2426855424 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013373119358074223, + "loss": 2.5296, + "theoretical_loss": 3.376527972317504, + "tokens_seen": 2426920960 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001337211634904714, + "loss": 2.5152, + "theoretical_loss": 3.3765206561447845, + "tokens_seen": 2426986496 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013371113340020062, + "loss": 2.2973, + "theoretical_loss": 3.3765133402249354, + "tokens_seen": 2427052032 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001337011033099298, + "loss": 2.5874, + "theoretical_loss": 3.3765060245579424, + "tokens_seen": 2427117568 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013369107321965898, + "loss": 2.6082, + "theoretical_loss": 3.376498709143789, + "tokens_seen": 2427183104 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013368104312938816, + "loss": 2.3789, + "theoretical_loss": 3.37649139398246, + "tokens_seen": 2427248640 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013367101303911737, + "loss": 2.3715, + "theoretical_loss": 3.3764840790739394, + "tokens_seen": 2427314176 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013366098294884655, + "loss": 2.3537, + "theoretical_loss": 3.376476764418212, + "tokens_seen": 2427379712 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013365095285857573, + "loss": 2.3386, + "theoretical_loss": 3.3764694500152626, + "tokens_seen": 2427445248 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001336409227683049, + "loss": 2.2806, + "theoretical_loss": 3.376462135865075, + "tokens_seen": 2427510784 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001336308926780341, + "loss": 2.6413, + "theoretical_loss": 3.376454821967634, + "tokens_seen": 2427576320 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001336208625877633, + "loss": 2.4286, + "theoretical_loss": 3.3764475083229235, + "tokens_seen": 2427641856 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013361083249749248, + "loss": 2.4691, + "theoretical_loss": 3.376440194930929, + "tokens_seen": 2427707392 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013360080240722166, + "loss": 2.5368, + "theoretical_loss": 3.376432881791634, + "tokens_seen": 2427772928 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013359077231695084, + "loss": 2.6962, + "theoretical_loss": 3.376425568905023, + "tokens_seen": 2427838464 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013358074222668005, + "loss": 2.5532, + "theoretical_loss": 3.376418256271081, + "tokens_seen": 2427904000 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013357071213640923, + "loss": 2.716, + "theoretical_loss": 3.376410943889792, + "tokens_seen": 2427969536 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013356068204613841, + "loss": 2.6986, + "theoretical_loss": 3.37640363176114, + "tokens_seen": 2428035072 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2715289, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.546915292739868, + "objective/train/theoretical_loss": 3.376396319885111, + "objective/train/tokens_used": 2448560608, + "theoretical_loss": 3.376396319885111, + "tokens_seen": 2428100608 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001335506519558676, + "loss": 2.5847, + "theoretical_loss": 3.376396319885111, + "tokens_seen": 2428100608 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013354062186559678, + "loss": 2.6136, + "theoretical_loss": 3.3763890082616883, + "tokens_seen": 2428166144 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013353059177532598, + "loss": 2.7196, + "theoretical_loss": 3.3763816968908564, + "tokens_seen": 2428231680 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013352056168505517, + "loss": 2.3075, + "theoretical_loss": 3.3763743857726, + "tokens_seen": 2428297216 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013351053159478435, + "loss": 2.3579, + "theoretical_loss": 3.3763670749069035, + "tokens_seen": 2428362752 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013350050150451353, + "loss": 2.4275, + "theoretical_loss": 3.3763597642937513, + "tokens_seen": 2428428288 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013349047141424274, + "loss": 2.4936, + "theoretical_loss": 3.376352453933128, + "tokens_seen": 2428493824 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013348044132397192, + "loss": 2.343, + "theoretical_loss": 3.376345143825018, + "tokens_seen": 2428559360 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001334704112337011, + "loss": 2.5671, + "theoretical_loss": 3.3763378339694055, + "tokens_seen": 2428624896 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013346038114343028, + "loss": 2.5242, + "theoretical_loss": 3.376330524366275, + "tokens_seen": 2428690432 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001334503510531595, + "loss": 2.3464, + "theoretical_loss": 3.3763232150156117, + "tokens_seen": 2428755968 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013344032096288867, + "loss": 2.5256, + "theoretical_loss": 3.3763159059173993, + "tokens_seen": 2428821504 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013343029087261785, + "loss": 2.4466, + "theoretical_loss": 3.3763085970716222, + "tokens_seen": 2428887040 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013342026078234703, + "loss": 2.3926, + "theoretical_loss": 3.376301288478266, + "tokens_seen": 2428952576 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001334102306920762, + "loss": 2.5186, + "theoretical_loss": 3.3762939801373135, + "tokens_seen": 2429018112 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013340020060180542, + "loss": 2.5079, + "theoretical_loss": 3.3762866720487503, + "tokens_seen": 2429083648 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001333901705115346, + "loss": 2.5462, + "theoretical_loss": 3.376279364212561, + "tokens_seen": 2429149184 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013338014042126378, + "loss": 2.5603, + "theoretical_loss": 3.376272056628729, + "tokens_seen": 2429214720 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013337011033099296, + "loss": 2.416, + "theoretical_loss": 3.3762647492972397, + "tokens_seen": 2429280256 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013336008024072217, + "loss": 2.4753, + "theoretical_loss": 3.376257442218077, + "tokens_seen": 2429345792 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013335005015045135, + "loss": 2.5255, + "theoretical_loss": 3.3762501353912264, + "tokens_seen": 2429411328 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013334002006018053, + "loss": 2.61, + "theoretical_loss": 3.3762428288166713, + "tokens_seen": 2429476864 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013332998996990972, + "loss": 2.6596, + "theoretical_loss": 3.3762355224943965, + "tokens_seen": 2429542400 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001333199598796389, + "loss": 2.5698, + "theoretical_loss": 3.3762282164243866, + "tokens_seen": 2429607936 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001333099297893681, + "loss": 2.5829, + "theoretical_loss": 3.376220910606626, + "tokens_seen": 2429673472 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2715900, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4849119186401367, + "objective/train/theoretical_loss": 3.3762136050411, + "objective/train/tokens_used": 2450199008, + "theoretical_loss": 3.3762136050411, + "tokens_seen": 2429739008 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013329989969909729, + "loss": 2.3834, + "theoretical_loss": 3.3762136050411, + "tokens_seen": 2429739008 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013328986960882647, + "loss": 2.4081, + "theoretical_loss": 3.3762062997277913, + "tokens_seen": 2429804544 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013327983951855565, + "loss": 2.4477, + "theoretical_loss": 3.3761989946666855, + "tokens_seen": 2429870080 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013326980942828486, + "loss": 2.1547, + "theoretical_loss": 3.3761916898577673, + "tokens_seen": 2429935616 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013325977933801404, + "loss": 2.5892, + "theoretical_loss": 3.3761843853010207, + "tokens_seen": 2430001152 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013324974924774325, + "loss": 2.6317, + "theoretical_loss": 3.3761770809964307, + "tokens_seen": 2430066688 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013323971915747243, + "loss": 2.7123, + "theoretical_loss": 3.376169776943981, + "tokens_seen": 2430132224 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001332296890672016, + "loss": 2.5602, + "theoretical_loss": 3.376162473143657, + "tokens_seen": 2430197760 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013321965897693082, + "loss": 2.3893, + "theoretical_loss": 3.376155169595443, + "tokens_seen": 2430263296 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013320962888666, + "loss": 2.6268, + "theoretical_loss": 3.3761478662993225, + "tokens_seen": 2430328832 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013319959879638918, + "loss": 2.5091, + "theoretical_loss": 3.376140563255281, + "tokens_seen": 2430394368 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013318956870611836, + "loss": 2.6066, + "theoretical_loss": 3.376133260463303, + "tokens_seen": 2430459904 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013317953861584757, + "loss": 2.7228, + "theoretical_loss": 3.3761259579233727, + "tokens_seen": 2430525440 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013316950852557675, + "loss": 2.5346, + "theoretical_loss": 3.3761186556354748, + "tokens_seen": 2430590976 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013315947843530593, + "loss": 2.4737, + "theoretical_loss": 3.376111353599594, + "tokens_seen": 2430656512 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001331494483450351, + "loss": 2.4752, + "theoretical_loss": 3.3761040518157137, + "tokens_seen": 2430722048 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001331394182547643, + "loss": 2.5874, + "theoretical_loss": 3.3760967502838195, + "tokens_seen": 2430787584 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001331293881644935, + "loss": 2.4568, + "theoretical_loss": 3.3760894490038957, + "tokens_seen": 2430853120 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013311935807422268, + "loss": 2.4988, + "theoretical_loss": 3.376082147975927, + "tokens_seen": 2430918656 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013310932798395186, + "loss": 2.5723, + "theoretical_loss": 3.3760748471998974, + "tokens_seen": 2430984192 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013309929789368104, + "loss": 2.4184, + "theoretical_loss": 3.3760675466757917, + "tokens_seen": 2431049728 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013308926780341025, + "loss": 2.8094, + "theoretical_loss": 3.3760602464035943, + "tokens_seen": 2431115264 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013307923771313943, + "loss": 2.341, + "theoretical_loss": 3.3760529463832896, + "tokens_seen": 2431180800 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013306920762286861, + "loss": 2.6032, + "theoretical_loss": 3.3760456466148625, + "tokens_seen": 2431246336 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001330591775325978, + "loss": 2.5878, + "theoretical_loss": 3.3760383470982975, + "tokens_seen": 2431311872 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2717463, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.627654552459717, + "objective/train/theoretical_loss": 3.3760310478335787, + "objective/train/tokens_used": 2451837408, + "theoretical_loss": 3.3760310478335787, + "tokens_seen": 2431377408 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013304914744232698, + "loss": 2.518, + "theoretical_loss": 3.3760310478335787, + "tokens_seen": 2431377408 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013303911735205618, + "loss": 2.6489, + "theoretical_loss": 3.3760237488206912, + "tokens_seen": 2431442944 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013302908726178537, + "loss": 2.5315, + "theoretical_loss": 3.376016450059619, + "tokens_seen": 2431508480 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013301905717151455, + "loss": 2.6117, + "theoretical_loss": 3.3760091515503468, + "tokens_seen": 2431574016 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013300902708124373, + "loss": 2.6827, + "theoretical_loss": 3.376001853292859, + "tokens_seen": 2431639552 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013299899699097294, + "loss": 2.4495, + "theoretical_loss": 3.37599455528714, + "tokens_seen": 2431705088 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013298896690070212, + "loss": 2.4065, + "theoretical_loss": 3.3759872575331755, + "tokens_seen": 2431770624 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001329789368104313, + "loss": 2.6171, + "theoretical_loss": 3.3759799600309486, + "tokens_seen": 2431836160 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013296890672016048, + "loss": 2.446, + "theoretical_loss": 3.3759726627804447, + "tokens_seen": 2431901696 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001329588766298897, + "loss": 2.5182, + "theoretical_loss": 3.3759653657816475, + "tokens_seen": 2431967232 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013294884653961887, + "loss": 2.4193, + "theoretical_loss": 3.3759580690345423, + "tokens_seen": 2432032768 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013293881644934805, + "loss": 2.5859, + "theoretical_loss": 3.375950772539113, + "tokens_seen": 2432098304 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013292878635907723, + "loss": 2.4318, + "theoretical_loss": 3.3759434762953453, + "tokens_seen": 2432163840 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001329187562688064, + "loss": 2.2597, + "theoretical_loss": 3.3759361803032224, + "tokens_seen": 2432229376 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013290872617853562, + "loss": 2.458, + "theoretical_loss": 3.3759288845627298, + "tokens_seen": 2432294912 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001328986960882648, + "loss": 2.52, + "theoretical_loss": 3.3759215890738514, + "tokens_seen": 2432360448 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013288866599799398, + "loss": 2.3527, + "theoretical_loss": 3.375914293836572, + "tokens_seen": 2432425984 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013287863590772316, + "loss": 2.3837, + "theoretical_loss": 3.375906998850876, + "tokens_seen": 2432491520 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013286860581745237, + "loss": 2.3915, + "theoretical_loss": 3.375899704116748, + "tokens_seen": 2432557056 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013285857572718155, + "loss": 2.3127, + "theoretical_loss": 3.375892409634173, + "tokens_seen": 2432622592 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013284854563691073, + "loss": 2.4967, + "theoretical_loss": 3.375885115403135, + "tokens_seen": 2432688128 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013283851554663992, + "loss": 2.6998, + "theoretical_loss": 3.3758778214236185, + "tokens_seen": 2432753664 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001328284854563691, + "loss": 2.5287, + "theoretical_loss": 3.3758705276956085, + "tokens_seen": 2432819200 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001328184553660983, + "loss": 2.6259, + "theoretical_loss": 3.3758632342190893, + "tokens_seen": 2432884736 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013280842527582749, + "loss": 2.7283, + "theoretical_loss": 3.3758559409940454, + "tokens_seen": 2432950272 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2718093, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7143726348876953, + "objective/train/theoretical_loss": 3.3758486480204617, + "objective/train/tokens_used": 2453475808, + "theoretical_loss": 3.3758486480204617, + "tokens_seen": 2433015808 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013279839518555667, + "loss": 2.5041, + "theoretical_loss": 3.3758486480204617, + "tokens_seen": 2433015808 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013278836509528585, + "loss": 2.4377, + "theoretical_loss": 3.3758413552983217, + "tokens_seen": 2433081344 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013277833500501506, + "loss": 2.5994, + "theoretical_loss": 3.375834062827612, + "tokens_seen": 2433146880 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013276830491474424, + "loss": 2.5074, + "theoretical_loss": 3.375826770608315, + "tokens_seen": 2433212416 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013275827482447342, + "loss": 2.4924, + "theoretical_loss": 3.375819478640416, + "tokens_seen": 2433277952 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001327482447342026, + "loss": 2.4735, + "theoretical_loss": 3.3758121869239, + "tokens_seen": 2433343488 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013273821464393178, + "loss": 2.5365, + "theoretical_loss": 3.3758048954587516, + "tokens_seen": 2433409024 + }, + { + "epoch": 8.02, + "learning_rate": 0.000132728184553661, + "loss": 2.3243, + "theoretical_loss": 3.375797604244955, + "tokens_seen": 2433474560 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013271815446339017, + "loss": 2.3055, + "theoretical_loss": 3.3757903132824945, + "tokens_seen": 2433540096 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013270812437311935, + "loss": 2.412, + "theoretical_loss": 3.3757830225713557, + "tokens_seen": 2433605632 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013269809428284853, + "loss": 2.6765, + "theoretical_loss": 3.3757757321115216, + "tokens_seen": 2433671168 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013268806419257774, + "loss": 2.3519, + "theoretical_loss": 3.375768441902978, + "tokens_seen": 2433736704 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013267803410230692, + "loss": 2.5988, + "theoretical_loss": 3.375761151945709, + "tokens_seen": 2433802240 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001326680040120361, + "loss": 2.4892, + "theoretical_loss": 3.3757538622396996, + "tokens_seen": 2433867776 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013265797392176528, + "loss": 2.6609, + "theoretical_loss": 3.3757465727849336, + "tokens_seen": 2433933312 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013264794383149447, + "loss": 2.7207, + "theoretical_loss": 3.3757392835813964, + "tokens_seen": 2433998848 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013263791374122367, + "loss": 2.433, + "theoretical_loss": 3.375731994629072, + "tokens_seen": 2434064384 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013262788365095285, + "loss": 2.6055, + "theoretical_loss": 3.3757247059279454, + "tokens_seen": 2434129920 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013261785356068204, + "loss": 2.528, + "theoretical_loss": 3.375717417478001, + "tokens_seen": 2434195456 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013260782347041122, + "loss": 2.3391, + "theoretical_loss": 3.375710129279223, + "tokens_seen": 2434260992 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013259779338014043, + "loss": 2.7582, + "theoretical_loss": 3.375702841331597, + "tokens_seen": 2434326528 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001325877632898696, + "loss": 2.4461, + "theoretical_loss": 3.375695553635106, + "tokens_seen": 2434392064 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001325777331995988, + "loss": 2.3941, + "theoretical_loss": 3.375688266189736, + "tokens_seen": 2434457600 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013256770310932797, + "loss": 2.5849, + "theoretical_loss": 3.375680978995471, + "tokens_seen": 2434523136 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013255767301905715, + "loss": 2.5619, + "theoretical_loss": 3.375673692052296, + "tokens_seen": 2434588672 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2719450, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.171168327331543, + "objective/train/theoretical_loss": 3.375666405360195, + "objective/train/tokens_used": 2455114208, + "theoretical_loss": 3.375666405360195, + "tokens_seen": 2434654208 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013254764292878636, + "loss": 2.3605, + "theoretical_loss": 3.375666405360195, + "tokens_seen": 2434654208 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013253761283851554, + "loss": 2.7228, + "theoretical_loss": 3.3756591189191525, + "tokens_seen": 2434719744 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013252758274824472, + "loss": 2.5389, + "theoretical_loss": 3.375651832729154, + "tokens_seen": 2434785280 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001325175526579739, + "loss": 2.5193, + "theoretical_loss": 3.375644546790183, + "tokens_seen": 2434850816 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001325075225677031, + "loss": 2.3662, + "theoretical_loss": 3.3756372611022254, + "tokens_seen": 2434916352 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013249749247743232, + "loss": 2.5053, + "theoretical_loss": 3.375629975665264, + "tokens_seen": 2434981888 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001324874623871615, + "loss": 2.6233, + "theoretical_loss": 3.375622690479285, + "tokens_seen": 2435047424 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013247743229689068, + "loss": 2.3436, + "theoretical_loss": 3.3756154055442726, + "tokens_seen": 2435112960 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001324674022066199, + "loss": 2.2244, + "theoretical_loss": 3.3756081208602113, + "tokens_seen": 2435178496 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013245737211634907, + "loss": 2.5123, + "theoretical_loss": 3.3756008364270853, + "tokens_seen": 2435244032 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013244734202607825, + "loss": 2.4818, + "theoretical_loss": 3.37559355224488, + "tokens_seen": 2435309568 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013243731193580743, + "loss": 2.4428, + "theoretical_loss": 3.375586268313579, + "tokens_seen": 2435375104 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001324272818455366, + "loss": 2.5931, + "theoretical_loss": 3.375578984633168, + "tokens_seen": 2435440640 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013241725175526582, + "loss": 2.4543, + "theoretical_loss": 3.3755717012036306, + "tokens_seen": 2435506176 + }, + { + "epoch": 8.02, + "learning_rate": 0.000132407221664995, + "loss": 2.5572, + "theoretical_loss": 3.375564418024952, + "tokens_seen": 2435571712 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013239719157472418, + "loss": 2.5066, + "theoretical_loss": 3.3755571350971167, + "tokens_seen": 2435637248 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013238716148445336, + "loss": 2.4153, + "theoretical_loss": 3.375549852420109, + "tokens_seen": 2435702784 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013237713139418257, + "loss": 2.7607, + "theoretical_loss": 3.375542569993914, + "tokens_seen": 2435768320 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013236710130391175, + "loss": 2.6185, + "theoretical_loss": 3.3755352878185163, + "tokens_seen": 2435833856 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013235707121364093, + "loss": 2.4663, + "theoretical_loss": 3.3755280058939006, + "tokens_seen": 2435899392 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013234704112337012, + "loss": 2.5667, + "theoretical_loss": 3.3755207242200505, + "tokens_seen": 2435964928 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001323370110330993, + "loss": 2.3647, + "theoretical_loss": 3.3755134427969518, + "tokens_seen": 2436030464 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001323269809428285, + "loss": 2.5148, + "theoretical_loss": 3.375506161624589, + "tokens_seen": 2436096000 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013231695085255769, + "loss": 2.4155, + "theoretical_loss": 3.3754988807029456, + "tokens_seen": 2436161536 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013230692076228687, + "loss": 2.2562, + "theoretical_loss": 3.3754916000320074, + "tokens_seen": 2436227072 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2720013, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.352172374725342, + "objective/train/theoretical_loss": 3.375484319611759, + "objective/train/tokens_used": 2456752608, + "theoretical_loss": 3.375484319611759, + "tokens_seen": 2436292608 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013229689067201605, + "loss": 2.5971, + "theoretical_loss": 3.375484319611759, + "tokens_seen": 2436292608 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013228686058174526, + "loss": 2.6318, + "theoretical_loss": 3.3754770394421842, + "tokens_seen": 2436358144 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013227683049147444, + "loss": 2.5289, + "theoretical_loss": 3.375469759523268, + "tokens_seen": 2436423680 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013226680040120362, + "loss": 2.4208, + "theoretical_loss": 3.3754624798549955, + "tokens_seen": 2436489216 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001322567703109328, + "loss": 2.5083, + "theoretical_loss": 3.375455200437351, + "tokens_seen": 2436554752 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013224674022066198, + "loss": 2.511, + "theoretical_loss": 3.375447921270319, + "tokens_seen": 2436620288 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001322367101303912, + "loss": 2.3915, + "theoretical_loss": 3.375440642353884, + "tokens_seen": 2436685824 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013222668004012037, + "loss": 2.3788, + "theoretical_loss": 3.3754333636880314, + "tokens_seen": 2436751360 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013221664994984955, + "loss": 2.4463, + "theoretical_loss": 3.375426085272745, + "tokens_seen": 2436816896 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013220661985957873, + "loss": 2.4601, + "theoretical_loss": 3.3754188071080096, + "tokens_seen": 2436882432 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013219658976930794, + "loss": 2.5367, + "theoretical_loss": 3.37541152919381, + "tokens_seen": 2436947968 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013218655967903712, + "loss": 2.6218, + "theoretical_loss": 3.3754042515301306, + "tokens_seen": 2437013504 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001321765295887663, + "loss": 2.5386, + "theoretical_loss": 3.3753969741169567, + "tokens_seen": 2437079040 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013216649949849548, + "loss": 2.0604, + "theoretical_loss": 3.3753896969542723, + "tokens_seen": 2437144576 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013215646940822467, + "loss": 2.5108, + "theoretical_loss": 3.3753824200420617, + "tokens_seen": 2437210112 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013214643931795387, + "loss": 2.5596, + "theoretical_loss": 3.3753751433803107, + "tokens_seen": 2437275648 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013213640922768305, + "loss": 2.6076, + "theoretical_loss": 3.375367866969003, + "tokens_seen": 2437341184 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013212637913741224, + "loss": 2.4698, + "theoretical_loss": 3.375360590808124, + "tokens_seen": 2437406720 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013211634904714142, + "loss": 2.4372, + "theoretical_loss": 3.3753533148976578, + "tokens_seen": 2437472256 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013210631895687063, + "loss": 2.7217, + "theoretical_loss": 3.3753460392375887, + "tokens_seen": 2437537792 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001320962888665998, + "loss": 2.5013, + "theoretical_loss": 3.3753387638279024, + "tokens_seen": 2437603328 + }, + { + "epoch": 8.02, + "learning_rate": 0.000132086258776329, + "loss": 2.817, + "theoretical_loss": 3.3753314886685826, + "tokens_seen": 2437668864 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013207622868605817, + "loss": 2.6448, + "theoretical_loss": 3.3753242137596144, + "tokens_seen": 2437734400 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013206619859578735, + "loss": 2.3459, + "theoretical_loss": 3.3753169391009825, + "tokens_seen": 2437799936 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013205616850551656, + "loss": 2.4713, + "theoretical_loss": 3.3753096646926712, + "tokens_seen": 2437865472 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2721495, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4229578971862793, + "objective/train/theoretical_loss": 3.3753023905346655, + "objective/train/tokens_used": 2458391008, + "theoretical_loss": 3.3753023905346655, + "tokens_seen": 2437931008 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013204613841524574, + "loss": 2.5588, + "theoretical_loss": 3.3753023905346655, + "tokens_seen": 2437931008 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013203610832497492, + "loss": 2.2699, + "theoretical_loss": 3.3752951166269494, + "tokens_seen": 2437996544 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001320260782347041, + "loss": 2.5274, + "theoretical_loss": 3.375287842969509, + "tokens_seen": 2438062080 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001320160481444333, + "loss": 2.3935, + "theoretical_loss": 3.3752805695623276, + "tokens_seen": 2438127616 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001320060180541625, + "loss": 2.6142, + "theoretical_loss": 3.3752732964053904, + "tokens_seen": 2438193152 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013199598796389167, + "loss": 2.4716, + "theoretical_loss": 3.375266023498682, + "tokens_seen": 2438258688 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013198595787362085, + "loss": 2.347, + "theoretical_loss": 3.375258750842187, + "tokens_seen": 2438324224 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013197592778335003, + "loss": 2.4548, + "theoretical_loss": 3.3752514784358905, + "tokens_seen": 2438389760 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013196589769307924, + "loss": 2.4301, + "theoretical_loss": 3.3752442062797763, + "tokens_seen": 2438455296 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013195586760280842, + "loss": 2.5766, + "theoretical_loss": 3.3752369343738295, + "tokens_seen": 2438520832 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001319458375125376, + "loss": 2.4883, + "theoretical_loss": 3.375229662718035, + "tokens_seen": 2438586368 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013193580742226679, + "loss": 2.4974, + "theoretical_loss": 3.3752223913123776, + "tokens_seen": 2438651904 + }, + { + "epoch": 8.02, + "learning_rate": 0.000131925777331996, + "loss": 2.6236, + "theoretical_loss": 3.3752151201568417, + "tokens_seen": 2438717440 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013191574724172517, + "loss": 2.6236, + "theoretical_loss": 3.3752078492514115, + "tokens_seen": 2438782976 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013190571715145436, + "loss": 2.5054, + "theoretical_loss": 3.3752005785960724, + "tokens_seen": 2438848512 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013189568706118354, + "loss": 2.4545, + "theoretical_loss": 3.3751933081908088, + "tokens_seen": 2438914048 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013188565697091275, + "loss": 2.482, + "theoretical_loss": 3.375186038035605, + "tokens_seen": 2438979584 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013187562688064193, + "loss": 2.6181, + "theoretical_loss": 3.375178768130447, + "tokens_seen": 2439045120 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001318655967903711, + "loss": 2.6728, + "theoretical_loss": 3.375171498475318, + "tokens_seen": 2439110656 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001318555667001003, + "loss": 2.6009, + "theoretical_loss": 3.375164229070203, + "tokens_seen": 2439176192 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013184553660982947, + "loss": 2.4256, + "theoretical_loss": 3.375156959915087, + "tokens_seen": 2439241728 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013183550651955868, + "loss": 2.5157, + "theoretical_loss": 3.375149691009955, + "tokens_seen": 2439307264 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013182547642928786, + "loss": 2.5426, + "theoretical_loss": 3.3751424223547906, + "tokens_seen": 2439372800 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013181544633901704, + "loss": 2.3428, + "theoretical_loss": 3.3751351539495795, + "tokens_seen": 2439438336 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013180541624874622, + "loss": 2.6458, + "theoretical_loss": 3.3751278857943063, + "tokens_seen": 2439503872 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2722337, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3477442264556885, + "objective/train/theoretical_loss": 3.3751206178889555, + "objective/train/tokens_used": 2460029408, + "theoretical_loss": 3.3751206178889555, + "tokens_seen": 2439569408 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013179538615847543, + "loss": 2.6937, + "theoretical_loss": 3.3751206178889555, + "tokens_seen": 2439569408 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001317853560682046, + "loss": 2.5825, + "theoretical_loss": 3.3751133502335118, + "tokens_seen": 2439634944 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001317753259779338, + "loss": 2.3937, + "theoretical_loss": 3.3751060828279593, + "tokens_seen": 2439700480 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013176529588766297, + "loss": 2.4438, + "theoretical_loss": 3.375098815672284, + "tokens_seen": 2439766016 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013175526579739218, + "loss": 2.4019, + "theoretical_loss": 3.3750915487664694, + "tokens_seen": 2439831552 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001317452357071214, + "loss": 2.531, + "theoretical_loss": 3.3750842821105005, + "tokens_seen": 2439897088 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013173520561685057, + "loss": 2.2702, + "theoretical_loss": 3.3750770157043624, + "tokens_seen": 2439962624 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013172517552657975, + "loss": 2.6572, + "theoretical_loss": 3.3750697495480395, + "tokens_seen": 2440028160 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013171514543630893, + "loss": 2.6066, + "theoretical_loss": 3.3750624836415164, + "tokens_seen": 2440093696 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013170511534603814, + "loss": 2.4967, + "theoretical_loss": 3.3750552179847784, + "tokens_seen": 2440159232 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013169508525576732, + "loss": 2.6682, + "theoretical_loss": 3.375047952577809, + "tokens_seen": 2440224768 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001316850551654965, + "loss": 2.4165, + "theoretical_loss": 3.375040687420594, + "tokens_seen": 2440290304 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013167502507522568, + "loss": 2.4931, + "theoretical_loss": 3.375033422513118, + "tokens_seen": 2440355840 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013166499498495487, + "loss": 2.4541, + "theoretical_loss": 3.3750261578553653, + "tokens_seen": 2440421376 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013165496489468407, + "loss": 2.4332, + "theoretical_loss": 3.3750188934473204, + "tokens_seen": 2440486912 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013164493480441325, + "loss": 2.6702, + "theoretical_loss": 3.375011629288969, + "tokens_seen": 2440552448 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013163490471414244, + "loss": 2.5005, + "theoretical_loss": 3.375004365380295, + "tokens_seen": 2440617984 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013162487462387162, + "loss": 2.5817, + "theoretical_loss": 3.3749971017212834, + "tokens_seen": 2440683520 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013161484453360083, + "loss": 2.3336, + "theoretical_loss": 3.374989838311919, + "tokens_seen": 2440749056 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013160481444333, + "loss": 2.4515, + "theoretical_loss": 3.3749825751521856, + "tokens_seen": 2440814592 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001315947843530592, + "loss": 2.5086, + "theoretical_loss": 3.3749753122420696, + "tokens_seen": 2440880128 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013158475426278837, + "loss": 2.4285, + "theoretical_loss": 3.374968049581554, + "tokens_seen": 2440945664 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013157472417251755, + "loss": 2.3677, + "theoretical_loss": 3.374960787170625, + "tokens_seen": 2441011200 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013156469408224676, + "loss": 2.3364, + "theoretical_loss": 3.374953525009266, + "tokens_seen": 2441076736 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013155466399197594, + "loss": 2.5689, + "theoretical_loss": 3.3749462630974625, + "tokens_seen": 2441142272 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2723693, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0714311599731445, + "objective/train/theoretical_loss": 3.3749390014351994, + "objective/train/tokens_used": 2461667808, + "theoretical_loss": 3.3749390014351994, + "tokens_seen": 2441207808 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013154463390170512, + "loss": 2.5067, + "theoretical_loss": 3.3749390014351994, + "tokens_seen": 2441207808 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001315346038114343, + "loss": 2.7355, + "theoretical_loss": 3.374931740022461, + "tokens_seen": 2441273344 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001315245737211635, + "loss": 2.6376, + "theoretical_loss": 3.3749244788592323, + "tokens_seen": 2441338880 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001315145436308927, + "loss": 2.2996, + "theoretical_loss": 3.3749172179454976, + "tokens_seen": 2441404416 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013150451354062187, + "loss": 2.5743, + "theoretical_loss": 3.374909957281242, + "tokens_seen": 2441469952 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013149448345035105, + "loss": 2.6643, + "theoretical_loss": 3.37490269686645, + "tokens_seen": 2441535488 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013148445336008023, + "loss": 2.507, + "theoretical_loss": 3.3748954367011064, + "tokens_seen": 2441601024 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013147442326980944, + "loss": 2.5891, + "theoretical_loss": 3.374888176785196, + "tokens_seen": 2441666560 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013146439317953862, + "loss": 2.4351, + "theoretical_loss": 3.3748809171187037, + "tokens_seen": 2441732096 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001314543630892678, + "loss": 2.3964, + "theoretical_loss": 3.374873657701614, + "tokens_seen": 2441797632 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013144433299899699, + "loss": 2.5026, + "theoretical_loss": 3.374866398533912, + "tokens_seen": 2441863168 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001314343029087262, + "loss": 2.5009, + "theoretical_loss": 3.374859139615582, + "tokens_seen": 2441928704 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013142427281845538, + "loss": 2.3863, + "theoretical_loss": 3.3748518809466086, + "tokens_seen": 2441994240 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013141424272818456, + "loss": 2.5286, + "theoretical_loss": 3.374844622526977, + "tokens_seen": 2442059776 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013140421263791374, + "loss": 2.3898, + "theoretical_loss": 3.3748373643566723, + "tokens_seen": 2442125312 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013139418254764295, + "loss": 2.587, + "theoretical_loss": 3.374830106435678, + "tokens_seen": 2442190848 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013138415245737213, + "loss": 2.4764, + "theoretical_loss": 3.3748228487639795, + "tokens_seen": 2442256384 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001313741223671013, + "loss": 2.6033, + "theoretical_loss": 3.3748155913415623, + "tokens_seen": 2442321920 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001313640922768305, + "loss": 2.4666, + "theoretical_loss": 3.37480833416841, + "tokens_seen": 2442387456 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013135406218655967, + "loss": 2.4067, + "theoretical_loss": 3.374801077244508, + "tokens_seen": 2442452992 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013134403209628888, + "loss": 2.5386, + "theoretical_loss": 3.3747938205698405, + "tokens_seen": 2442518528 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013133400200601806, + "loss": 2.3127, + "theoretical_loss": 3.374786564144393, + "tokens_seen": 2442584064 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013132397191574724, + "loss": 2.4718, + "theoretical_loss": 3.37477930796815, + "tokens_seen": 2442649600 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013131394182547642, + "loss": 2.4825, + "theoretical_loss": 3.3747720520410955, + "tokens_seen": 2442715136 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013130391173520563, + "loss": 2.5716, + "theoretical_loss": 3.3747647963632152, + "tokens_seen": 2442780672 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2724314, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2700133323669434, + "objective/train/theoretical_loss": 3.3747575409344934, + "objective/train/tokens_used": 2463306208, + "theoretical_loss": 3.3747575409344934, + "tokens_seen": 2442846208 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001312938816449348, + "loss": 2.445, + "theoretical_loss": 3.3747575409344934, + "tokens_seen": 2442846208 + }, + { + "epoch": 8.02, + "learning_rate": 0.000131283851554664, + "loss": 2.4843, + "theoretical_loss": 3.3747502857549154, + "tokens_seen": 2442911744 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013127382146439317, + "loss": 2.7482, + "theoretical_loss": 3.3747430308244653, + "tokens_seen": 2442977280 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013126379137412235, + "loss": 2.3005, + "theoretical_loss": 3.3747357761431283, + "tokens_seen": 2443042816 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013125376128385156, + "loss": 2.4581, + "theoretical_loss": 3.374728521710889, + "tokens_seen": 2443108352 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013124373119358074, + "loss": 2.6374, + "theoretical_loss": 3.374721267527732, + "tokens_seen": 2443173888 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013123370110330992, + "loss": 2.5219, + "theoretical_loss": 3.3747140135936426, + "tokens_seen": 2443239424 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001312236710130391, + "loss": 2.5422, + "theoretical_loss": 3.3747067599086047, + "tokens_seen": 2443304960 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013121364092276831, + "loss": 2.5492, + "theoretical_loss": 3.3746995064726035, + "tokens_seen": 2443370496 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001312036108324975, + "loss": 2.5514, + "theoretical_loss": 3.3746922532856245, + "tokens_seen": 2443436032 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013119358074222668, + "loss": 2.5204, + "theoretical_loss": 3.374685000347651, + "tokens_seen": 2443501568 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013118355065195586, + "loss": 2.4565, + "theoretical_loss": 3.3746777476586693, + "tokens_seen": 2443567104 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013117352056168504, + "loss": 2.3754, + "theoretical_loss": 3.374670495218663, + "tokens_seen": 2443632640 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013116349047141425, + "loss": 2.5046, + "theoretical_loss": 3.3746632430276176, + "tokens_seen": 2443698176 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013115346038114343, + "loss": 2.3285, + "theoretical_loss": 3.3746559910855174, + "tokens_seen": 2443763712 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001311434302908726, + "loss": 2.5113, + "theoretical_loss": 3.3746487393923474, + "tokens_seen": 2443829248 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001311334002006018, + "loss": 2.5243, + "theoretical_loss": 3.374641487948092, + "tokens_seen": 2443894784 + }, + { + "epoch": 8.02, + "learning_rate": 0.000131123370110331, + "loss": 2.4154, + "theoretical_loss": 3.374634236752737, + "tokens_seen": 2443960320 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013111334002006018, + "loss": 2.5076, + "theoretical_loss": 3.3746269858062665, + "tokens_seen": 2444025856 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013110330992978936, + "loss": 2.4658, + "theoretical_loss": 3.3746197351086646, + "tokens_seen": 2444091392 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013109327983951854, + "loss": 2.3974, + "theoretical_loss": 3.3746124846599175, + "tokens_seen": 2444156928 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013108324974924772, + "loss": 2.6173, + "theoretical_loss": 3.3746052344600095, + "tokens_seen": 2444222464 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013107321965897693, + "loss": 2.5733, + "theoretical_loss": 3.3745979845089247, + "tokens_seen": 2444288000 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001310631895687061, + "loss": 2.6422, + "theoretical_loss": 3.374590734806648, + "tokens_seen": 2444353536 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001310531594784353, + "loss": 2.5222, + "theoretical_loss": 3.374583485353165, + "tokens_seen": 2444419072 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2725693, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.627734899520874, + "objective/train/theoretical_loss": 3.37457623614846, + "objective/train/tokens_used": 2464944608, + "theoretical_loss": 3.37457623614846, + "tokens_seen": 2444484608 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013104312938816447, + "loss": 2.4978, + "theoretical_loss": 3.37457623614846, + "tokens_seen": 2444484608 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013103309929789368, + "loss": 2.509, + "theoretical_loss": 3.374568987192518, + "tokens_seen": 2444550144 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013102306920762286, + "loss": 2.3454, + "theoretical_loss": 3.3745617384853235, + "tokens_seen": 2444615680 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013101303911735204, + "loss": 2.4479, + "theoretical_loss": 3.3745544900268616, + "tokens_seen": 2444681216 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013100300902708125, + "loss": 2.4234, + "theoretical_loss": 3.3745472418171167, + "tokens_seen": 2444746752 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013099297893681043, + "loss": 2.3156, + "theoretical_loss": 3.374539993856074, + "tokens_seen": 2444812288 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013098294884653964, + "loss": 2.5014, + "theoretical_loss": 3.374532746143718, + "tokens_seen": 2444877824 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013097291875626882, + "loss": 2.4436, + "theoretical_loss": 3.3745254986800335, + "tokens_seen": 2444943360 + }, + { + "epoch": 8.02, + "learning_rate": 0.000130962888665998, + "loss": 2.4683, + "theoretical_loss": 3.3745182514650054, + "tokens_seen": 2445008896 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013095285857572719, + "loss": 2.5447, + "theoretical_loss": 3.374511004498619, + "tokens_seen": 2445074432 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001309428284854564, + "loss": 2.5205, + "theoretical_loss": 3.3745037577808583, + "tokens_seen": 2445139968 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013093279839518558, + "loss": 2.5962, + "theoretical_loss": 3.3744965113117082, + "tokens_seen": 2445205504 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013092276830491476, + "loss": 2.4791, + "theoretical_loss": 3.374489265091154, + "tokens_seen": 2445271040 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013091273821464394, + "loss": 2.625, + "theoretical_loss": 3.3744820191191804, + "tokens_seen": 2445336576 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013090270812437315, + "loss": 2.5811, + "theoretical_loss": 3.374474773395772, + "tokens_seen": 2445402112 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013089267803410233, + "loss": 2.446, + "theoretical_loss": 3.3744675279209138, + "tokens_seen": 2445467648 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001308826479438315, + "loss": 2.2935, + "theoretical_loss": 3.3744602826945904, + "tokens_seen": 2445533184 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001308726178535607, + "loss": 2.6332, + "theoretical_loss": 3.3744530377167865, + "tokens_seen": 2445598720 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013086258776328987, + "loss": 2.3846, + "theoretical_loss": 3.3744457929874874, + "tokens_seen": 2445664256 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013085255767301908, + "loss": 2.5213, + "theoretical_loss": 3.3744385485066775, + "tokens_seen": 2445729792 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013084252758274826, + "loss": 2.1615, + "theoretical_loss": 3.3744313042743417, + "tokens_seen": 2445795328 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013083249749247744, + "loss": 2.3823, + "theoretical_loss": 3.3744240602904654, + "tokens_seen": 2445860864 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013082246740220662, + "loss": 2.4378, + "theoretical_loss": 3.3744168165550326, + "tokens_seen": 2445926400 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013081243731193583, + "loss": 2.6596, + "theoretical_loss": 3.374409573068028, + "tokens_seen": 2445991936 + }, + { + "epoch": 8.02, + "learning_rate": 0.000130802407221665, + "loss": 2.4011, + "theoretical_loss": 3.3744023298294374, + "tokens_seen": 2446057472 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2726350, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2966136932373047, + "objective/train/theoretical_loss": 3.374395086839245, + "objective/train/tokens_used": 2466583008, + "theoretical_loss": 3.374395086839245, + "tokens_seen": 2446123008 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001307923771313942, + "loss": 2.5694, + "theoretical_loss": 3.374395086839245, + "tokens_seen": 2446123008 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013078234704112337, + "loss": 2.4076, + "theoretical_loss": 3.3743878440974355, + "tokens_seen": 2446188544 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013077231695085255, + "loss": 2.4969, + "theoretical_loss": 3.374380601603994, + "tokens_seen": 2446254080 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013076228686058176, + "loss": 2.4569, + "theoretical_loss": 3.374373359358905, + "tokens_seen": 2446319616 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013075225677031094, + "loss": 2.2778, + "theoretical_loss": 3.374366117362154, + "tokens_seen": 2446385152 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013074222668004012, + "loss": 2.5778, + "theoretical_loss": 3.3743588756137255, + "tokens_seen": 2446450688 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001307321965897693, + "loss": 2.593, + "theoretical_loss": 3.374351634113604, + "tokens_seen": 2446516224 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013072216649949851, + "loss": 2.5882, + "theoretical_loss": 3.374344392861775, + "tokens_seen": 2446581760 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001307121364092277, + "loss": 2.301, + "theoretical_loss": 3.3743371518582226, + "tokens_seen": 2446647296 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013070210631895688, + "loss": 2.3699, + "theoretical_loss": 3.374329911102932, + "tokens_seen": 2446712832 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013069207622868606, + "loss": 2.516, + "theoretical_loss": 3.374322670595888, + "tokens_seen": 2446778368 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013068204613841524, + "loss": 2.513, + "theoretical_loss": 3.3743154303370755, + "tokens_seen": 2446843904 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013067201604814445, + "loss": 2.777, + "theoretical_loss": 3.374308190326479, + "tokens_seen": 2446909440 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013066198595787363, + "loss": 2.5687, + "theoretical_loss": 3.374300950564084, + "tokens_seen": 2446974976 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001306519558676028, + "loss": 2.611, + "theoretical_loss": 3.3742937110498747, + "tokens_seen": 2447040512 + }, + { + "epoch": 8.02, + "learning_rate": 0.000130641925777332, + "loss": 2.4296, + "theoretical_loss": 3.3742864717838366, + "tokens_seen": 2447106048 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001306318956870612, + "loss": 2.3038, + "theoretical_loss": 3.3742792327659537, + "tokens_seen": 2447171584 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013062186559679038, + "loss": 2.3925, + "theoretical_loss": 3.374271993996212, + "tokens_seen": 2447237120 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013061183550651956, + "loss": 2.3905, + "theoretical_loss": 3.374264755474595, + "tokens_seen": 2447302656 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013060180541624874, + "loss": 2.5118, + "theoretical_loss": 3.3742575172010882, + "tokens_seen": 2447368192 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013059177532597792, + "loss": 2.7146, + "theoretical_loss": 3.374250279175677, + "tokens_seen": 2447433728 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013058174523570713, + "loss": 2.5174, + "theoretical_loss": 3.3742430413983455, + "tokens_seen": 2447499264 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001305717151454363, + "loss": 2.3369, + "theoretical_loss": 3.3742358038690785, + "tokens_seen": 2447564800 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001305616850551655, + "loss": 2.6324, + "theoretical_loss": 3.3742285665878615, + "tokens_seen": 2447630336 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013055165496489467, + "loss": 2.3775, + "theoretical_loss": 3.374221329554679, + "tokens_seen": 2447695872 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2727339, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.844595193862915, + "objective/train/theoretical_loss": 3.374214092769516, + "objective/train/tokens_used": 2468221408, + "theoretical_loss": 3.374214092769516, + "tokens_seen": 2447761408 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013054162487462388, + "loss": 2.7934, + "theoretical_loss": 3.374214092769516, + "tokens_seen": 2447761408 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013053159478435306, + "loss": 2.5109, + "theoretical_loss": 3.3742068562323566, + "tokens_seen": 2447826944 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013052156469408225, + "loss": 2.5252, + "theoretical_loss": 3.3741996199431865, + "tokens_seen": 2447892480 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013051153460381143, + "loss": 2.4163, + "theoretical_loss": 3.3741923839019905, + "tokens_seen": 2447958016 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001305015045135406, + "loss": 2.3509, + "theoretical_loss": 3.3741851481087535, + "tokens_seen": 2448023552 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013049147442326982, + "loss": 2.6576, + "theoretical_loss": 3.3741779125634594, + "tokens_seen": 2448089088 + }, + { + "epoch": 8.02, + "learning_rate": 0.000130481444332999, + "loss": 2.7343, + "theoretical_loss": 3.3741706772660947, + "tokens_seen": 2448154624 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013047141424272818, + "loss": 2.4684, + "theoretical_loss": 3.374163442216643, + "tokens_seen": 2448220160 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013046138415245736, + "loss": 2.5698, + "theoretical_loss": 3.3741562074150897, + "tokens_seen": 2448285696 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013045135406218657, + "loss": 2.5399, + "theoretical_loss": 3.3741489728614193, + "tokens_seen": 2448351232 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013044132397191575, + "loss": 2.7622, + "theoretical_loss": 3.374141738555617, + "tokens_seen": 2448416768 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013043129388164493, + "loss": 2.2413, + "theoretical_loss": 3.374134504497668, + "tokens_seen": 2448482304 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001304212637913741, + "loss": 2.5211, + "theoretical_loss": 3.374127270687556, + "tokens_seen": 2448547840 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001304112337011033, + "loss": 2.437, + "theoretical_loss": 3.374120037125267, + "tokens_seen": 2448613376 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001304012036108325, + "loss": 2.7271, + "theoretical_loss": 3.3741128038107857, + "tokens_seen": 2448678912 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013039117352056168, + "loss": 2.5782, + "theoretical_loss": 3.3741055707440966, + "tokens_seen": 2448744448 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013038114343029086, + "loss": 2.3955, + "theoretical_loss": 3.3740983379251848, + "tokens_seen": 2448809984 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013037111334002004, + "loss": 2.4755, + "theoretical_loss": 3.3740911053540352, + "tokens_seen": 2448875520 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013036108324974925, + "loss": 2.3525, + "theoretical_loss": 3.3740838730306324, + "tokens_seen": 2448941056 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013035105315947843, + "loss": 2.4941, + "theoretical_loss": 3.3740766409549616, + "tokens_seen": 2449006592 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013034102306920761, + "loss": 2.4926, + "theoretical_loss": 3.374069409127008, + "tokens_seen": 2449072128 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001303309929789368, + "loss": 2.5407, + "theoretical_loss": 3.374062177546756, + "tokens_seen": 2449137664 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013032096288866598, + "loss": 2.4609, + "theoretical_loss": 3.37405494621419, + "tokens_seen": 2449203200 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013031093279839518, + "loss": 2.5013, + "theoretical_loss": 3.3740477151292962, + "tokens_seen": 2449268736 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013030090270812437, + "loss": 2.645, + "theoretical_loss": 3.3740404842920584, + "tokens_seen": 2449334272 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2727923, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7825098037719727, + "objective/train/theoretical_loss": 3.3740332537024615, + "objective/train/tokens_used": 2469859808, + "theoretical_loss": 3.3740332537024615, + "tokens_seen": 2449399808 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013029087261785355, + "loss": 2.5964, + "theoretical_loss": 3.3740332537024615, + "tokens_seen": 2449399808 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013028084252758273, + "loss": 2.4802, + "theoretical_loss": 3.374026023360491, + "tokens_seen": 2449465344 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013027081243731194, + "loss": 2.5774, + "theoretical_loss": 3.374018793266132, + "tokens_seen": 2449530880 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013026078234704112, + "loss": 2.5442, + "theoretical_loss": 3.3740115634193684, + "tokens_seen": 2449596416 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013025075225677033, + "loss": 2.6027, + "theoretical_loss": 3.374004333820186, + "tokens_seen": 2449661952 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001302407221664995, + "loss": 2.4583, + "theoretical_loss": 3.373997104468569, + "tokens_seen": 2449727488 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013023069207622871, + "loss": 2.6898, + "theoretical_loss": 3.373989875364503, + "tokens_seen": 2449793024 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001302206619859579, + "loss": 2.4758, + "theoretical_loss": 3.3739826465079723, + "tokens_seen": 2449858560 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013021063189568708, + "loss": 2.5055, + "theoretical_loss": 3.373975417898962, + "tokens_seen": 2449924096 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013020060180541626, + "loss": 2.4341, + "theoretical_loss": 3.373968189537457, + "tokens_seen": 2449989632 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013019057171514544, + "loss": 2.3403, + "theoretical_loss": 3.3739609614234425, + "tokens_seen": 2450055168 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013018054162487465, + "loss": 2.3452, + "theoretical_loss": 3.373953733556903, + "tokens_seen": 2450120704 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013017051153460383, + "loss": 2.4832, + "theoretical_loss": 3.3739465059378233, + "tokens_seen": 2450186240 + }, + { + "epoch": 8.02, + "learning_rate": 0.000130160481444333, + "loss": 2.5291, + "theoretical_loss": 3.373939278566189, + "tokens_seen": 2450251776 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001301504513540622, + "loss": 2.6194, + "theoretical_loss": 3.3739320514419844, + "tokens_seen": 2450317312 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001301404212637914, + "loss": 2.3978, + "theoretical_loss": 3.3739248245651945, + "tokens_seen": 2450382848 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013013039117352058, + "loss": 2.5944, + "theoretical_loss": 3.3739175979358045, + "tokens_seen": 2450448384 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013012036108324976, + "loss": 2.4742, + "theoretical_loss": 3.373910371553799, + "tokens_seen": 2450513920 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013011033099297894, + "loss": 2.5416, + "theoretical_loss": 3.373903145419163, + "tokens_seen": 2450579456 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013010030090270812, + "loss": 2.4571, + "theoretical_loss": 3.3738959195318814, + "tokens_seen": 2450644992 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013009027081243733, + "loss": 2.3607, + "theoretical_loss": 3.3738886938919395, + "tokens_seen": 2450710528 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001300802407221665, + "loss": 2.8662, + "theoretical_loss": 3.3738814684993215, + "tokens_seen": 2450776064 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001300702106318957, + "loss": 2.6144, + "theoretical_loss": 3.3738742433540128, + "tokens_seen": 2450841600 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013006018054162487, + "loss": 2.6592, + "theoretical_loss": 3.3738670184559982, + "tokens_seen": 2450907136 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013005015045135408, + "loss": 2.386, + "theoretical_loss": 3.373859793805263, + "tokens_seen": 2450972672 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2729290, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5292763710021973, + "objective/train/theoretical_loss": 3.3738525694017913, + "objective/train/tokens_used": 2471498208, + "theoretical_loss": 3.3738525694017913, + "tokens_seen": 2451038208 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013004012036108326, + "loss": 2.4917, + "theoretical_loss": 3.3738525694017913, + "tokens_seen": 2451038208 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013003009027081245, + "loss": 2.3914, + "theoretical_loss": 3.3738453452455683, + "tokens_seen": 2451103744 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013002006018054163, + "loss": 2.6272, + "theoretical_loss": 3.37383812133658, + "tokens_seen": 2451169280 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001300100300902708, + "loss": 2.786, + "theoretical_loss": 3.37383089767481, + "tokens_seen": 2451234816 + }, + { + "epoch": 8.02, + "learning_rate": 0.00013000000000000002, + "loss": 2.5333, + "theoretical_loss": 3.373823674260244, + "tokens_seen": 2451300352 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001299899699097292, + "loss": 2.5313, + "theoretical_loss": 3.373816451092866, + "tokens_seen": 2451365888 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012997993981945838, + "loss": 2.4838, + "theoretical_loss": 3.3738092281726617, + "tokens_seen": 2451431424 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012996990972918756, + "loss": 2.6994, + "theoretical_loss": 3.3738020054996163, + "tokens_seen": 2451496960 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012995987963891677, + "loss": 2.411, + "theoretical_loss": 3.373794783073714, + "tokens_seen": 2451562496 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012994984954864595, + "loss": 2.4282, + "theoretical_loss": 3.3737875608949404, + "tokens_seen": 2451628032 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012993981945837513, + "loss": 2.6424, + "theoretical_loss": 3.3737803389632797, + "tokens_seen": 2451693568 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001299297893681043, + "loss": 2.2594, + "theoretical_loss": 3.3737731172787173, + "tokens_seen": 2451759104 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001299197592778335, + "loss": 2.7524, + "theoretical_loss": 3.373765895841238, + "tokens_seen": 2451824640 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001299097291875627, + "loss": 2.4405, + "theoretical_loss": 3.3737586746508272, + "tokens_seen": 2451890176 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012989969909729188, + "loss": 2.6107, + "theoretical_loss": 3.3737514537074693, + "tokens_seen": 2451955712 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012988966900702106, + "loss": 2.6334, + "theoretical_loss": 3.37374423301115, + "tokens_seen": 2452021248 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012987963891675024, + "loss": 2.7327, + "theoretical_loss": 3.3737370125618527, + "tokens_seen": 2452086784 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012986960882647945, + "loss": 2.4911, + "theoretical_loss": 3.3737297923595637, + "tokens_seen": 2452152320 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012985957873620863, + "loss": 2.5587, + "theoretical_loss": 3.3737225724042674, + "tokens_seen": 2452217856 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012984954864593781, + "loss": 2.524, + "theoretical_loss": 3.373715352695949, + "tokens_seen": 2452283392 + }, + { + "epoch": 8.02, + "learning_rate": 0.000129839518555667, + "loss": 2.5849, + "theoretical_loss": 3.3737081332345937, + "tokens_seen": 2452348928 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012982948846539618, + "loss": 2.5473, + "theoretical_loss": 3.3737009140201857, + "tokens_seen": 2452414464 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012981945837512538, + "loss": 2.4411, + "theoretical_loss": 3.3736936950527108, + "tokens_seen": 2452480000 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012980942828485457, + "loss": 2.8439, + "theoretical_loss": 3.373686476332153, + "tokens_seen": 2452545536 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012979939819458375, + "loss": 2.5897, + "theoretical_loss": 3.373679257858498, + "tokens_seen": 2452611072 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2730057, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5579910278320312, + "objective/train/theoretical_loss": 3.3736720396317303, + "objective/train/tokens_used": 2473136608, + "theoretical_loss": 3.3736720396317303, + "tokens_seen": 2452676608 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012978936810431293, + "loss": 2.4702, + "theoretical_loss": 3.3736720396317303, + "tokens_seen": 2452676608 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012977933801404214, + "loss": 2.6902, + "theoretical_loss": 3.3736648216518352, + "tokens_seen": 2452742144 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012976930792377132, + "loss": 2.3674, + "theoretical_loss": 3.3736576039187978, + "tokens_seen": 2452807680 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001297592778335005, + "loss": 2.7256, + "theoretical_loss": 3.373650386432603, + "tokens_seen": 2452873216 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012974924774322968, + "loss": 2.7963, + "theoretical_loss": 3.373643169193235, + "tokens_seen": 2452938752 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001297392176529589, + "loss": 2.559, + "theoretical_loss": 3.3736359522006794, + "tokens_seen": 2453004288 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012972918756268807, + "loss": 2.3993, + "theoretical_loss": 3.3736287354549215, + "tokens_seen": 2453069824 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012971915747241725, + "loss": 2.6921, + "theoretical_loss": 3.3736215189559458, + "tokens_seen": 2453135360 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012970912738214643, + "loss": 2.5118, + "theoretical_loss": 3.3736143027037375, + "tokens_seen": 2453200896 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001296990972918756, + "loss": 2.4308, + "theoretical_loss": 3.373607086698281, + "tokens_seen": 2453266432 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012968906720160482, + "loss": 2.3677, + "theoretical_loss": 3.373599870939562, + "tokens_seen": 2453331968 + }, + { + "epoch": 8.02, + "learning_rate": 0.000129679037111334, + "loss": 2.4872, + "theoretical_loss": 3.373592655427565, + "tokens_seen": 2453397504 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012966900702106318, + "loss": 2.6075, + "theoretical_loss": 3.373585440162275, + "tokens_seen": 2453463040 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012965897693079236, + "loss": 2.7648, + "theoretical_loss": 3.3735782251436772, + "tokens_seen": 2453528576 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012964894684052157, + "loss": 2.2214, + "theoretical_loss": 3.3735710103717564, + "tokens_seen": 2453594112 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012963891675025075, + "loss": 2.4653, + "theoretical_loss": 3.373563795846498, + "tokens_seen": 2453659648 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012962888665997993, + "loss": 2.5123, + "theoretical_loss": 3.3735565815678865, + "tokens_seen": 2453725184 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012961885656970912, + "loss": 2.5423, + "theoretical_loss": 3.373549367535907, + "tokens_seen": 2453790720 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001296088264794383, + "loss": 2.5, + "theoretical_loss": 3.3735421537505443, + "tokens_seen": 2453856256 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001295987963891675, + "loss": 2.5517, + "theoretical_loss": 3.373534940211784, + "tokens_seen": 2453921792 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012958876629889669, + "loss": 2.5009, + "theoretical_loss": 3.3735277269196104, + "tokens_seen": 2453987328 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012957873620862587, + "loss": 2.4563, + "theoretical_loss": 3.3735205138740088, + "tokens_seen": 2454052864 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012956870611835505, + "loss": 2.5014, + "theoretical_loss": 3.373513301074964, + "tokens_seen": 2454118400 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012955867602808426, + "loss": 2.5358, + "theoretical_loss": 3.3735060885224613, + "tokens_seen": 2454183936 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012954864593781344, + "loss": 2.5036, + "theoretical_loss": 3.3734988762164857, + "tokens_seen": 2454249472 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2730843, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.788271427154541, + "objective/train/theoretical_loss": 3.373491664157022, + "objective/train/tokens_used": 2474775008, + "theoretical_loss": 3.373491664157022, + "tokens_seen": 2454315008 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012953861584754262, + "loss": 2.5702, + "theoretical_loss": 3.373491664157022, + "tokens_seen": 2454315008 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001295285857572718, + "loss": 2.3841, + "theoretical_loss": 3.373484452344055, + "tokens_seen": 2454380544 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012951855566700098, + "loss": 2.4137, + "theoretical_loss": 3.3734772407775697, + "tokens_seen": 2454446080 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001295085255767302, + "loss": 2.5917, + "theoretical_loss": 3.3734700294575517, + "tokens_seen": 2454511616 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001294984954864594, + "loss": 2.8415, + "theoretical_loss": 3.3734628183839854, + "tokens_seen": 2454577152 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012948846539618858, + "loss": 2.555, + "theoretical_loss": 3.373455607556856, + "tokens_seen": 2454642688 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012947843530591776, + "loss": 2.2681, + "theoretical_loss": 3.3734483969761486, + "tokens_seen": 2454708224 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012946840521564697, + "loss": 2.6606, + "theoretical_loss": 3.373441186641848, + "tokens_seen": 2454773760 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012945837512537615, + "loss": 2.5251, + "theoretical_loss": 3.373433976553939, + "tokens_seen": 2454839296 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012944834503510533, + "loss": 2.7318, + "theoretical_loss": 3.373426766712407, + "tokens_seen": 2454904832 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001294383149448345, + "loss": 2.4468, + "theoretical_loss": 3.3734195571172374, + "tokens_seen": 2454970368 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001294282848545637, + "loss": 2.3427, + "theoretical_loss": 3.373412347768414, + "tokens_seen": 2455035904 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001294182547642929, + "loss": 2.3165, + "theoretical_loss": 3.373405138665923, + "tokens_seen": 2455101440 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012940822467402208, + "loss": 2.3872, + "theoretical_loss": 3.373397929809749, + "tokens_seen": 2455166976 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012939819458375126, + "loss": 2.4945, + "theoretical_loss": 3.373390721199877, + "tokens_seen": 2455232512 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012938816449348044, + "loss": 2.5391, + "theoretical_loss": 3.3733835128362912, + "tokens_seen": 2455298048 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012937813440320965, + "loss": 2.4247, + "theoretical_loss": 3.373376304718978, + "tokens_seen": 2455363584 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012936810431293883, + "loss": 2.5626, + "theoretical_loss": 3.3733690968479215, + "tokens_seen": 2455429120 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012935807422266801, + "loss": 2.5145, + "theoretical_loss": 3.373361889223107, + "tokens_seen": 2455494656 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001293480441323972, + "loss": 2.5565, + "theoretical_loss": 3.3733546818445195, + "tokens_seen": 2455560192 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012933801404212638, + "loss": 2.4394, + "theoretical_loss": 3.373347474712144, + "tokens_seen": 2455625728 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012932798395185558, + "loss": 2.6681, + "theoretical_loss": 3.3733402678259656, + "tokens_seen": 2455691264 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012931795386158477, + "loss": 2.4447, + "theoretical_loss": 3.3733330611859693, + "tokens_seen": 2455756800 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012930792377131395, + "loss": 2.492, + "theoretical_loss": 3.37332585479214, + "tokens_seen": 2455822336 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012929789368104313, + "loss": 2.5086, + "theoretical_loss": 3.373318648644463, + "tokens_seen": 2455887872 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 2732091, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.550044536590576, + "objective/train/theoretical_loss": 3.373311442742923, + "objective/train/tokens_used": 2476413408, + "theoretical_loss": 3.373311442742923, + "tokens_seen": 2455953408 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012928786359077234, + "loss": 2.476, + "theoretical_loss": 3.373311442742923, + "tokens_seen": 2455953408 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012927783350050152, + "loss": 2.4896, + "theoretical_loss": 3.373304237087505, + "tokens_seen": 2456018944 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001292678034102307, + "loss": 2.6941, + "theoretical_loss": 3.3732970316781943, + "tokens_seen": 2456084480 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012925777331995988, + "loss": 2.5086, + "theoretical_loss": 3.3732898265149753, + "tokens_seen": 2456150016 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001292477432296891, + "loss": 2.464, + "theoretical_loss": 3.373282621597834, + "tokens_seen": 2456215552 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012923771313941827, + "loss": 2.6463, + "theoretical_loss": 3.373275416926755, + "tokens_seen": 2456281088 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012922768304914745, + "loss": 2.7624, + "theoretical_loss": 3.3732682125017233, + "tokens_seen": 2456346624 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012921765295887663, + "loss": 2.5107, + "theoretical_loss": 3.3732610083227237, + "tokens_seen": 2456412160 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001292076228686058, + "loss": 2.4268, + "theoretical_loss": 3.3732538043897415, + "tokens_seen": 2456477696 + }, + { + "epoch": 8.02, + "learning_rate": 0.00012919759277833502, + "loss": 2.4943, + "theoretical_loss": 3.3732466007027617, + "tokens_seen": 2456543232 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001291875626880642, + "loss": 2.6387, + "theoretical_loss": 3.373239397261769, + "tokens_seen": 2456608768 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012917753259779338, + "loss": 2.4482, + "theoretical_loss": 3.3732321940667496, + "tokens_seen": 2456674304 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012916750250752256, + "loss": 2.5648, + "theoretical_loss": 3.373224991117687, + "tokens_seen": 2456739840 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012915747241725177, + "loss": 2.3723, + "theoretical_loss": 3.3732177884145673, + "tokens_seen": 2456805376 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012914744232698095, + "loss": 2.5015, + "theoretical_loss": 3.373210585957375, + "tokens_seen": 2456870912 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012913741223671013, + "loss": 2.683, + "theoretical_loss": 3.3732033837460955, + "tokens_seen": 2456936448 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012912738214643932, + "loss": 2.5795, + "theoretical_loss": 3.3731961817807132, + "tokens_seen": 2457001984 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001291173520561685, + "loss": 2.4537, + "theoretical_loss": 3.3731889800612143, + "tokens_seen": 2457067520 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001291073219658977, + "loss": 2.4498, + "theoretical_loss": 3.3731817785875826, + "tokens_seen": 2457133056 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012909729187562689, + "loss": 2.4709, + "theoretical_loss": 3.373174577359804, + "tokens_seen": 2457198592 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012908726178535607, + "loss": 2.4113, + "theoretical_loss": 3.3731673763778636, + "tokens_seen": 2457264128 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012907723169508525, + "loss": 2.5771, + "theoretical_loss": 3.3731601756417455, + "tokens_seen": 2457329664 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012906720160481446, + "loss": 2.3728, + "theoretical_loss": 3.3731529751514358, + "tokens_seen": 2457395200 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012905717151454364, + "loss": 2.5783, + "theoretical_loss": 3.3731457749069187, + "tokens_seen": 2457460736 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012904714142427282, + "loss": 2.5185, + "theoretical_loss": 3.37313857490818, + "tokens_seen": 2457526272 + }, + { + "debugging/Self-BLEU-5": 0.4301686691037956, + "debugging/distinct-1-grams": 0.8067009943610444, + "debugging/distinct-2-grams": 0.9620028029188404, + "debugging/entropy-1-grams": 5.941760970797609, + "debugging/entropy-2-grams": 6.649070230611844, + "debugging/length": 501.3636363636364, + "debugging/num_segments": 11, + "debugging/score": 0.004908168640344304, + "debugging/score_std": 0.004314650708674823, + "epoch": 8.03, + "objective/train/docs_used": 2732693, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.804779052734375, + "objective/train/theoretical_loss": 3.373131375155204, + "objective/train/tokens_used": 2478051808, + "theoretical_loss": 3.373131375155204, + "tokens_seen": 2457591808 + }, + { + "epoch": 8.03, + "learning_rate": 0.000129037111334002, + "loss": 2.4134, + "theoretical_loss": 3.373131375155204, + "tokens_seen": 2457591808 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012902708124373118, + "loss": 2.3052, + "theoretical_loss": 3.373124175647977, + "tokens_seen": 2457657344 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001290170511534604, + "loss": 2.4335, + "theoretical_loss": 3.373116976386483, + "tokens_seen": 2457722880 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012900702106318957, + "loss": 2.413, + "theoretical_loss": 3.373109777370707, + "tokens_seen": 2457788416 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012899699097291875, + "loss": 2.5729, + "theoretical_loss": 3.3731025786006343, + "tokens_seen": 2457853952 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012898696088264793, + "loss": 2.3927, + "theoretical_loss": 3.37309538007625, + "tokens_seen": 2457919488 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012897693079237714, + "loss": 2.3711, + "theoretical_loss": 3.3730881817975398, + "tokens_seen": 2457985024 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012896690070210632, + "loss": 2.3117, + "theoretical_loss": 3.3730809837644875, + "tokens_seen": 2458050560 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001289568706118355, + "loss": 2.5086, + "theoretical_loss": 3.373073785977079, + "tokens_seen": 2458116096 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012894684052156468, + "loss": 2.2867, + "theoretical_loss": 3.3730665884352993, + "tokens_seen": 2458181632 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012893681043129386, + "loss": 2.5117, + "theoretical_loss": 3.3730593911391336, + "tokens_seen": 2458247168 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012892678034102307, + "loss": 2.5221, + "theoretical_loss": 3.3730521940885665, + "tokens_seen": 2458312704 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012891675025075225, + "loss": 2.6497, + "theoretical_loss": 3.3730449972835834, + "tokens_seen": 2458378240 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012890672016048144, + "loss": 2.632, + "theoretical_loss": 3.3730378007241693, + "tokens_seen": 2458443776 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012889669007021062, + "loss": 2.4212, + "theoretical_loss": 3.3730306044103093, + "tokens_seen": 2458509312 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012888665997993982, + "loss": 2.4661, + "theoretical_loss": 3.3730234083419885, + "tokens_seen": 2458574848 + }, + { + "epoch": 8.03, + "learning_rate": 0.000128876629889669, + "loss": 2.5789, + "theoretical_loss": 3.3730162125191914, + "tokens_seen": 2458640384 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001288665997993982, + "loss": 2.5512, + "theoretical_loss": 3.373009016941904, + "tokens_seen": 2458705920 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012885656970912737, + "loss": 2.6247, + "theoretical_loss": 3.373001821610111, + "tokens_seen": 2458771456 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012884653961885655, + "loss": 2.8574, + "theoretical_loss": 3.3729946265237976, + "tokens_seen": 2458836992 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012883650952858576, + "loss": 2.4243, + "theoretical_loss": 3.3729874316829482, + "tokens_seen": 2458902528 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012882647943831494, + "loss": 2.5877, + "theoretical_loss": 3.372980237087549, + "tokens_seen": 2458968064 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012881644934804412, + "loss": 2.6095, + "theoretical_loss": 3.372973042737584, + "tokens_seen": 2459033600 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001288064192577733, + "loss": 2.5322, + "theoretical_loss": 3.372965848633039, + "tokens_seen": 2459099136 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001287963891675025, + "loss": 2.4657, + "theoretical_loss": 3.3729586547738992, + "tokens_seen": 2459164672 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2734079, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0726442337036133, + "objective/train/theoretical_loss": 3.372951461160149, + "objective/train/tokens_used": 2479690208, + "theoretical_loss": 3.372951461160149, + "tokens_seen": 2459230208 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001287863590772317, + "loss": 2.4529, + "theoretical_loss": 3.372951461160149, + "tokens_seen": 2459230208 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012877632898696087, + "loss": 2.5735, + "theoretical_loss": 3.372944267791774, + "tokens_seen": 2459295744 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012876629889669005, + "loss": 2.7121, + "theoretical_loss": 3.372937074668759, + "tokens_seen": 2459361280 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012875626880641923, + "loss": 2.6201, + "theoretical_loss": 3.3729298817910895, + "tokens_seen": 2459426816 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012874623871614847, + "loss": 2.553, + "theoretical_loss": 3.3729226891587505, + "tokens_seen": 2459492352 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012873620862587765, + "loss": 2.7504, + "theoretical_loss": 3.3729154967717268, + "tokens_seen": 2459557888 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012872617853560683, + "loss": 2.6871, + "theoretical_loss": 3.3729083046300032, + "tokens_seen": 2459623424 + }, + { + "epoch": 8.03, + "learning_rate": 0.000128716148445336, + "loss": 2.3722, + "theoretical_loss": 3.3729011127335653, + "tokens_seen": 2459688960 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012870611835506522, + "loss": 2.4091, + "theoretical_loss": 3.3728939210823987, + "tokens_seen": 2459754496 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001286960882647944, + "loss": 2.4033, + "theoretical_loss": 3.3728867296764875, + "tokens_seen": 2459820032 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012868605817452358, + "loss": 2.5821, + "theoretical_loss": 3.372879538515817, + "tokens_seen": 2459885568 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012867602808425276, + "loss": 2.7561, + "theoretical_loss": 3.372872347600373, + "tokens_seen": 2459951104 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012866599799398197, + "loss": 2.6299, + "theoretical_loss": 3.37286515693014, + "tokens_seen": 2460016640 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012865596790371115, + "loss": 2.1371, + "theoretical_loss": 3.3728579665051033, + "tokens_seen": 2460082176 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012864593781344033, + "loss": 2.7042, + "theoretical_loss": 3.372850776325248, + "tokens_seen": 2460147712 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012863590772316952, + "loss": 2.6214, + "theoretical_loss": 3.3728435863905593, + "tokens_seen": 2460213248 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001286258776328987, + "loss": 2.463, + "theoretical_loss": 3.3728363967010218, + "tokens_seen": 2460278784 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001286158475426279, + "loss": 2.3728, + "theoretical_loss": 3.372829207256621, + "tokens_seen": 2460344320 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012860581745235709, + "loss": 2.4982, + "theoretical_loss": 3.3728220180573425, + "tokens_seen": 2460409856 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012859578736208627, + "loss": 2.4934, + "theoretical_loss": 3.3728148291031705, + "tokens_seen": 2460475392 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012858575727181545, + "loss": 2.4757, + "theoretical_loss": 3.3728076403940905, + "tokens_seen": 2460540928 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012857572718154466, + "loss": 2.589, + "theoretical_loss": 3.372800451930088, + "tokens_seen": 2460606464 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012856569709127384, + "loss": 2.4335, + "theoretical_loss": 3.3727932637111473, + "tokens_seen": 2460672000 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012855566700100302, + "loss": 2.3657, + "theoretical_loss": 3.3727860757372543, + "tokens_seen": 2460737536 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001285456369107322, + "loss": 2.2448, + "theoretical_loss": 3.3727788880083933, + "tokens_seen": 2460803072 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2734747, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9201643466949463, + "objective/train/theoretical_loss": 3.3727717005245506, + "objective/train/tokens_used": 2481328608, + "theoretical_loss": 3.3727717005245506, + "tokens_seen": 2460868608 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012853560682046138, + "loss": 2.751, + "theoretical_loss": 3.3727717005245506, + "tokens_seen": 2460868608 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001285255767301906, + "loss": 2.5811, + "theoretical_loss": 3.37276451328571, + "tokens_seen": 2460934144 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012851554663991977, + "loss": 2.5991, + "theoretical_loss": 3.372757326291858, + "tokens_seen": 2460999680 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012850551654964895, + "loss": 2.6637, + "theoretical_loss": 3.3727501395429784, + "tokens_seen": 2461065216 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012849548645937813, + "loss": 2.5405, + "theoretical_loss": 3.372742953039057, + "tokens_seen": 2461130752 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012848545636910734, + "loss": 2.502, + "theoretical_loss": 3.372735766780079, + "tokens_seen": 2461196288 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012847542627883652, + "loss": 2.3045, + "theoretical_loss": 3.3727285807660294, + "tokens_seen": 2461261824 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001284653961885657, + "loss": 2.4765, + "theoretical_loss": 3.3727213949968933, + "tokens_seen": 2461327360 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012845536609829488, + "loss": 2.7149, + "theoretical_loss": 3.3727142094726563, + "tokens_seen": 2461392896 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012844533600802407, + "loss": 2.5588, + "theoretical_loss": 3.3727070241933026, + "tokens_seen": 2461458432 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012843530591775327, + "loss": 2.4579, + "theoretical_loss": 3.3726998391588174, + "tokens_seen": 2461523968 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012842527582748245, + "loss": 2.5626, + "theoretical_loss": 3.372692654369187, + "tokens_seen": 2461589504 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012841524573721164, + "loss": 2.7132, + "theoretical_loss": 3.372685469824395, + "tokens_seen": 2461655040 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012840521564694082, + "loss": 2.3971, + "theoretical_loss": 3.372678285524428, + "tokens_seen": 2461720576 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012839518555667002, + "loss": 2.7174, + "theoretical_loss": 3.3726711014692703, + "tokens_seen": 2461786112 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001283851554663992, + "loss": 2.4186, + "theoretical_loss": 3.3726639176589073, + "tokens_seen": 2461851648 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001283751253761284, + "loss": 2.6893, + "theoretical_loss": 3.3726567340933236, + "tokens_seen": 2461917184 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012836509528585757, + "loss": 2.6979, + "theoretical_loss": 3.3726495507725054, + "tokens_seen": 2461982720 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012835506519558675, + "loss": 2.4828, + "theoretical_loss": 3.3726423676964368, + "tokens_seen": 2462048256 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012834503510531596, + "loss": 2.3616, + "theoretical_loss": 3.3726351848651035, + "tokens_seen": 2462113792 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012833500501504514, + "loss": 2.6302, + "theoretical_loss": 3.3726280022784905, + "tokens_seen": 2462179328 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012832497492477432, + "loss": 2.4871, + "theoretical_loss": 3.372620819936583, + "tokens_seen": 2462244864 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001283149448345035, + "loss": 2.6865, + "theoretical_loss": 3.372613637839366, + "tokens_seen": 2462310400 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001283049147442327, + "loss": 2.6288, + "theoretical_loss": 3.372606455986825, + "tokens_seen": 2462375936 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001282948846539619, + "loss": 2.5883, + "theoretical_loss": 3.372599274378945, + "tokens_seen": 2462441472 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2735634, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6200332641601562, + "objective/train/theoretical_loss": 3.3725920930157107, + "objective/train/tokens_used": 2482967008, + "theoretical_loss": 3.3725920930157107, + "tokens_seen": 2462507008 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012828485456369107, + "loss": 2.6151, + "theoretical_loss": 3.3725920930157107, + "tokens_seen": 2462507008 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012827482447342025, + "loss": 2.6422, + "theoretical_loss": 3.372584911897108, + "tokens_seen": 2462572544 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012826479438314943, + "loss": 2.5901, + "theoretical_loss": 3.3725777310231217, + "tokens_seen": 2462638080 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012825476429287864, + "loss": 2.4297, + "theoretical_loss": 3.372570550393737, + "tokens_seen": 2462703616 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012824473420260782, + "loss": 2.6857, + "theoretical_loss": 3.3725633700089386, + "tokens_seen": 2462769152 + }, + { + "epoch": 8.03, + "learning_rate": 0.000128234704112337, + "loss": 2.5735, + "theoretical_loss": 3.372556189868712, + "tokens_seen": 2462834688 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012822467402206619, + "loss": 2.6954, + "theoretical_loss": 3.372549009973043, + "tokens_seen": 2462900224 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001282146439317954, + "loss": 2.6532, + "theoretical_loss": 3.372541830321916, + "tokens_seen": 2462965760 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012820461384152457, + "loss": 2.542, + "theoretical_loss": 3.3725346509153167, + "tokens_seen": 2463031296 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012819458375125376, + "loss": 2.52, + "theoretical_loss": 3.372527471753229, + "tokens_seen": 2463096832 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012818455366098294, + "loss": 2.5252, + "theoretical_loss": 3.37252029283564, + "tokens_seen": 2463162368 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012817452357071212, + "loss": 2.5755, + "theoretical_loss": 3.3725131141625333, + "tokens_seen": 2463227904 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012816449348044133, + "loss": 2.5526, + "theoretical_loss": 3.372505935733895, + "tokens_seen": 2463293440 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001281544633901705, + "loss": 2.8087, + "theoretical_loss": 3.3724987575497094, + "tokens_seen": 2463358976 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001281444332998997, + "loss": 2.6275, + "theoretical_loss": 3.372491579609963, + "tokens_seen": 2463424512 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012813440320962887, + "loss": 2.7618, + "theoretical_loss": 3.3724844019146394, + "tokens_seen": 2463490048 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012812437311935808, + "loss": 2.6049, + "theoretical_loss": 3.372477224463725, + "tokens_seen": 2463555584 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012811434302908726, + "loss": 2.5806, + "theoretical_loss": 3.3724700472572042, + "tokens_seen": 2463621120 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012810431293881644, + "loss": 2.4766, + "theoretical_loss": 3.3724628702950623, + "tokens_seen": 2463686656 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012809428284854562, + "loss": 2.6534, + "theoretical_loss": 3.372455693577285, + "tokens_seen": 2463752192 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012808425275827483, + "loss": 2.3981, + "theoretical_loss": 3.372448517103857, + "tokens_seen": 2463817728 + }, + { + "epoch": 8.03, + "learning_rate": 0.000128074222668004, + "loss": 2.6045, + "theoretical_loss": 3.3724413408747638, + "tokens_seen": 2463883264 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001280641925777332, + "loss": 2.4557, + "theoretical_loss": 3.3724341648899903, + "tokens_seen": 2463948800 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012805416248746237, + "loss": 2.6322, + "theoretical_loss": 3.3724269891495213, + "tokens_seen": 2464014336 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012804413239719155, + "loss": 2.6916, + "theoretical_loss": 3.372419813653343, + "tokens_seen": 2464079872 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2736283, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7047829627990723, + "objective/train/theoretical_loss": 3.3724126384014403, + "objective/train/tokens_used": 2484605408, + "theoretical_loss": 3.3724126384014403, + "tokens_seen": 2464145408 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012803410230692076, + "loss": 2.6299, + "theoretical_loss": 3.3724126384014403, + "tokens_seen": 2464145408 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012802407221664994, + "loss": 2.5446, + "theoretical_loss": 3.3724054633937977, + "tokens_seen": 2464210944 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012801404212637912, + "loss": 2.5095, + "theoretical_loss": 3.372398288630401, + "tokens_seen": 2464276480 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001280040120361083, + "loss": 2.8099, + "theoretical_loss": 3.3723911141112355, + "tokens_seen": 2464342016 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012799398194583754, + "loss": 2.6707, + "theoretical_loss": 3.372383939836286, + "tokens_seen": 2464407552 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012798395185556672, + "loss": 2.6371, + "theoretical_loss": 3.3723767658055372, + "tokens_seen": 2464473088 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001279739217652959, + "loss": 2.6372, + "theoretical_loss": 3.3723695920189756, + "tokens_seen": 2464538624 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012796389167502508, + "loss": 2.6606, + "theoretical_loss": 3.3723624184765852, + "tokens_seen": 2464604160 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012795386158475427, + "loss": 2.4766, + "theoretical_loss": 3.372355245178352, + "tokens_seen": 2464669696 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012794383149448347, + "loss": 2.6886, + "theoretical_loss": 3.372348072124261, + "tokens_seen": 2464735232 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012793380140421265, + "loss": 2.4837, + "theoretical_loss": 3.372340899314297, + "tokens_seen": 2464800768 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012792377131394184, + "loss": 2.4903, + "theoretical_loss": 3.3723337267484457, + "tokens_seen": 2464866304 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012791374122367102, + "loss": 2.3992, + "theoretical_loss": 3.3723265544266923, + "tokens_seen": 2464931840 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012790371113340023, + "loss": 2.5134, + "theoretical_loss": 3.3723193823490214, + "tokens_seen": 2464997376 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001278936810431294, + "loss": 2.5809, + "theoretical_loss": 3.372312210515419, + "tokens_seen": 2465062912 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001278836509528586, + "loss": 2.4458, + "theoretical_loss": 3.3723050389258695, + "tokens_seen": 2465128448 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012787362086258777, + "loss": 2.5574, + "theoretical_loss": 3.3722978675803588, + "tokens_seen": 2465193984 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012786359077231695, + "loss": 2.4483, + "theoretical_loss": 3.3722906964788715, + "tokens_seen": 2465259520 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012785356068204616, + "loss": 2.7507, + "theoretical_loss": 3.3722835256213934, + "tokens_seen": 2465325056 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012784353059177534, + "loss": 2.6491, + "theoretical_loss": 3.3722763550079096, + "tokens_seen": 2465390592 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012783350050150452, + "loss": 2.5628, + "theoretical_loss": 3.372269184638405, + "tokens_seen": 2465456128 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001278234704112337, + "loss": 2.8878, + "theoretical_loss": 3.372262014512865, + "tokens_seen": 2465521664 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001278134403209629, + "loss": 2.7365, + "theoretical_loss": 3.3722548446312746, + "tokens_seen": 2465587200 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001278034102306921, + "loss": 2.5307, + "theoretical_loss": 3.3722476749936194, + "tokens_seen": 2465652736 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012779338014042127, + "loss": 2.5768, + "theoretical_loss": 3.3722405055998843, + "tokens_seen": 2465718272 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2736588, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.757777690887451, + "objective/train/theoretical_loss": 3.372233336450055, + "objective/train/tokens_used": 2486243808, + "theoretical_loss": 3.372233336450055, + "tokens_seen": 2465783808 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012778335005015045, + "loss": 2.7036, + "theoretical_loss": 3.372233336450055, + "tokens_seen": 2465783808 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012777331995987963, + "loss": 2.8244, + "theoretical_loss": 3.372226167544116, + "tokens_seen": 2465849344 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012776328986960884, + "loss": 2.4467, + "theoretical_loss": 3.372218998882053, + "tokens_seen": 2465914880 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012775325977933802, + "loss": 2.7708, + "theoretical_loss": 3.372211830463851, + "tokens_seen": 2465980416 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001277432296890672, + "loss": 2.5481, + "theoretical_loss": 3.3722046622894952, + "tokens_seen": 2466045952 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012773319959879639, + "loss": 2.5518, + "theoretical_loss": 3.3721974943589714, + "tokens_seen": 2466111488 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001277231695085256, + "loss": 2.6974, + "theoretical_loss": 3.372190326672264, + "tokens_seen": 2466177024 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012771313941825477, + "loss": 2.645, + "theoretical_loss": 3.3721831592293587, + "tokens_seen": 2466242560 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012770310932798396, + "loss": 2.5059, + "theoretical_loss": 3.3721759920302405, + "tokens_seen": 2466308096 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012769307923771314, + "loss": 2.5632, + "theoretical_loss": 3.3721688250748953, + "tokens_seen": 2466373632 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012768304914744232, + "loss": 2.7232, + "theoretical_loss": 3.372161658363307, + "tokens_seen": 2466439168 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012767301905717153, + "loss": 2.8441, + "theoretical_loss": 3.3721544918954622, + "tokens_seen": 2466504704 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001276629889669007, + "loss": 2.6092, + "theoretical_loss": 3.372147325671346, + "tokens_seen": 2466570240 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001276529588766299, + "loss": 2.4637, + "theoretical_loss": 3.372140159690942, + "tokens_seen": 2466635776 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012764292878635907, + "loss": 2.5336, + "theoretical_loss": 3.3721329939542377, + "tokens_seen": 2466701312 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012763289869608828, + "loss": 2.6977, + "theoretical_loss": 3.372125828461217, + "tokens_seen": 2466766848 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012762286860581746, + "loss": 2.5441, + "theoretical_loss": 3.3721186632118654, + "tokens_seen": 2466832384 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012761283851554664, + "loss": 2.4707, + "theoretical_loss": 3.3721114982061677, + "tokens_seen": 2466897920 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012760280842527582, + "loss": 2.651, + "theoretical_loss": 3.37210433344411, + "tokens_seen": 2466963456 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012759277833500503, + "loss": 2.5738, + "theoretical_loss": 3.372097168925677, + "tokens_seen": 2467028992 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001275827482447342, + "loss": 2.5395, + "theoretical_loss": 3.3720900046508544, + "tokens_seen": 2467094528 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001275727181544634, + "loss": 2.6266, + "theoretical_loss": 3.372082840619627, + "tokens_seen": 2467160064 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012756268806419257, + "loss": 2.5247, + "theoretical_loss": 3.37207567683198, + "tokens_seen": 2467225600 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012755265797392175, + "loss": 2.5278, + "theoretical_loss": 3.372068513287899, + "tokens_seen": 2467291136 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012754262788365096, + "loss": 2.6404, + "theoretical_loss": 3.372061349987369, + "tokens_seen": 2467356672 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2736588, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5390255451202393, + "objective/train/theoretical_loss": 3.3720541869303755, + "objective/train/tokens_used": 2487882208, + "theoretical_loss": 3.3720541869303755, + "tokens_seen": 2467422208 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012753259779338014, + "loss": 2.576, + "theoretical_loss": 3.3720541869303755, + "tokens_seen": 2467422208 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012752256770310932, + "loss": 2.8648, + "theoretical_loss": 3.3720470241169034, + "tokens_seen": 2467487744 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001275125376128385, + "loss": 2.5207, + "theoretical_loss": 3.372039861546938, + "tokens_seen": 2467553280 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012750250752256771, + "loss": 2.7244, + "theoretical_loss": 3.3720326992204646, + "tokens_seen": 2467618816 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001274924774322969, + "loss": 2.7824, + "theoretical_loss": 3.372025537137469, + "tokens_seen": 2467684352 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012748244734202608, + "loss": 2.7334, + "theoretical_loss": 3.3720183752979356, + "tokens_seen": 2467749888 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012747241725175526, + "loss": 2.6861, + "theoretical_loss": 3.3720112137018505, + "tokens_seen": 2467815424 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012746238716148444, + "loss": 2.7995, + "theoretical_loss": 3.3720040523491983, + "tokens_seen": 2467880960 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012745235707121365, + "loss": 2.6429, + "theoretical_loss": 3.3719968912399643, + "tokens_seen": 2467946496 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012744232698094283, + "loss": 2.7178, + "theoretical_loss": 3.371989730374134, + "tokens_seen": 2468012032 + }, + { + "epoch": 8.03, + "learning_rate": 0.000127432296890672, + "loss": 2.7457, + "theoretical_loss": 3.371982569751693, + "tokens_seen": 2468077568 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001274222668004012, + "loss": 2.6482, + "theoretical_loss": 3.3719754093726255, + "tokens_seen": 2468143104 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001274122367101304, + "loss": 2.5627, + "theoretical_loss": 3.371968249236918, + "tokens_seen": 2468208640 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012740220661985958, + "loss": 2.8357, + "theoretical_loss": 3.371961089344555, + "tokens_seen": 2468274176 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012739217652958876, + "loss": 2.4391, + "theoretical_loss": 3.371953929695522, + "tokens_seen": 2468339712 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012738214643931794, + "loss": 2.729, + "theoretical_loss": 3.3719467702898043, + "tokens_seen": 2468405248 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012737211634904712, + "loss": 2.8813, + "theoretical_loss": 3.371939611127387, + "tokens_seen": 2468470784 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012736208625877633, + "loss": 2.663, + "theoretical_loss": 3.371932452208256, + "tokens_seen": 2468536320 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001273520561685055, + "loss": 2.6206, + "theoretical_loss": 3.3719252935323953, + "tokens_seen": 2468601856 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001273420260782347, + "loss": 2.5838, + "theoretical_loss": 3.3719181350997913, + "tokens_seen": 2468667392 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012733199598796387, + "loss": 2.6388, + "theoretical_loss": 3.3719109769104287, + "tokens_seen": 2468732928 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012732196589769308, + "loss": 2.8252, + "theoretical_loss": 3.3719038189642934, + "tokens_seen": 2468798464 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012731193580742226, + "loss": 2.6678, + "theoretical_loss": 3.37189666126137, + "tokens_seen": 2468864000 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012730190571715144, + "loss": 2.9337, + "theoretical_loss": 3.3718895038016443, + "tokens_seen": 2468929536 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012729187562688063, + "loss": 2.7719, + "theoretical_loss": 3.371882346585101, + "tokens_seen": 2468995072 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2738158, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6867573261260986, + "objective/train/theoretical_loss": 3.371875189611726, + "objective/train/tokens_used": 2489520608, + "theoretical_loss": 3.371875189611726, + "tokens_seen": 2469060608 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001272818455366098, + "loss": 2.6838, + "theoretical_loss": 3.371875189611726, + "tokens_seen": 2469060608 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012727181544633902, + "loss": 2.6904, + "theoretical_loss": 3.371868032881504, + "tokens_seen": 2469126144 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001272617853560682, + "loss": 2.7662, + "theoretical_loss": 3.371860876394421, + "tokens_seen": 2469191680 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001272517552657974, + "loss": 2.7791, + "theoretical_loss": 3.3718537201504617, + "tokens_seen": 2469257216 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012724172517552659, + "loss": 2.699, + "theoretical_loss": 3.371846564149611, + "tokens_seen": 2469322752 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001272316950852558, + "loss": 2.5824, + "theoretical_loss": 3.3718394083918555, + "tokens_seen": 2469388288 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012722166499498497, + "loss": 2.7032, + "theoretical_loss": 3.3718322528771796, + "tokens_seen": 2469453824 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012721163490471416, + "loss": 2.6131, + "theoretical_loss": 3.3718250976055684, + "tokens_seen": 2469519360 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012720160481444334, + "loss": 2.7322, + "theoretical_loss": 3.3718179425770076, + "tokens_seen": 2469584896 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012719157472417252, + "loss": 2.7097, + "theoretical_loss": 3.3718107877914827, + "tokens_seen": 2469650432 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012718154463390173, + "loss": 2.6531, + "theoretical_loss": 3.3718036332489785, + "tokens_seen": 2469715968 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001271715145436309, + "loss": 2.8055, + "theoretical_loss": 3.3717964789494808, + "tokens_seen": 2469781504 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001271614844533601, + "loss": 2.6116, + "theoretical_loss": 3.3717893248929744, + "tokens_seen": 2469847040 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012715145436308927, + "loss": 2.634, + "theoretical_loss": 3.371782171079445, + "tokens_seen": 2469912576 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012714142427281848, + "loss": 2.7761, + "theoretical_loss": 3.3717750175088774, + "tokens_seen": 2469978112 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012713139418254766, + "loss": 2.7395, + "theoretical_loss": 3.3717678641812574, + "tokens_seen": 2470043648 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012712136409227684, + "loss": 2.5573, + "theoretical_loss": 3.37176071109657, + "tokens_seen": 2470109184 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012711133400200602, + "loss": 2.6468, + "theoretical_loss": 3.371753558254801, + "tokens_seen": 2470174720 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012710130391173523, + "loss": 2.631, + "theoretical_loss": 3.371746405655935, + "tokens_seen": 2470240256 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001270912738214644, + "loss": 2.6151, + "theoretical_loss": 3.3717392532999577, + "tokens_seen": 2470305792 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001270812437311936, + "loss": 2.5084, + "theoretical_loss": 3.371732101186854, + "tokens_seen": 2470371328 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012707121364092277, + "loss": 2.7764, + "theoretical_loss": 3.37172494931661, + "tokens_seen": 2470436864 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012706118355065195, + "loss": 2.4759, + "theoretical_loss": 3.3717177976892105, + "tokens_seen": 2470502400 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012705115346038116, + "loss": 2.8141, + "theoretical_loss": 3.371710646304641, + "tokens_seen": 2470567936 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012704112337011034, + "loss": 2.4927, + "theoretical_loss": 3.371703495162887, + "tokens_seen": 2470633472 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2738784, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.897266149520874, + "objective/train/theoretical_loss": 3.3716963442639325, + "objective/train/tokens_used": 2491159008, + "theoretical_loss": 3.3716963442639325, + "tokens_seen": 2470699008 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012703109327983952, + "loss": 2.6541, + "theoretical_loss": 3.3716963442639325, + "tokens_seen": 2470699008 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001270210631895687, + "loss": 2.8512, + "theoretical_loss": 3.3716891936077644, + "tokens_seen": 2470764544 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012701103309929791, + "loss": 2.7454, + "theoretical_loss": 3.3716820431943675, + "tokens_seen": 2470830080 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001270010030090271, + "loss": 2.8079, + "theoretical_loss": 3.371674893023727, + "tokens_seen": 2470895616 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012699097291875628, + "loss": 2.7838, + "theoretical_loss": 3.3716677430958284, + "tokens_seen": 2470961152 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012698094282848546, + "loss": 2.5941, + "theoretical_loss": 3.3716605934106565, + "tokens_seen": 2471026688 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012697091273821464, + "loss": 2.8144, + "theoretical_loss": 3.3716534439681975, + "tokens_seen": 2471092224 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012696088264794385, + "loss": 2.8219, + "theoretical_loss": 3.371646294768436, + "tokens_seen": 2471157760 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012695085255767303, + "loss": 2.8187, + "theoretical_loss": 3.3716391458113577, + "tokens_seen": 2471223296 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001269408224674022, + "loss": 2.6418, + "theoretical_loss": 3.3716319970969475, + "tokens_seen": 2471288832 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001269307923771314, + "loss": 2.7121, + "theoretical_loss": 3.371624848625191, + "tokens_seen": 2471354368 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001269207622868606, + "loss": 2.5948, + "theoretical_loss": 3.371617700396074, + "tokens_seen": 2471419904 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012691073219658978, + "loss": 2.5379, + "theoretical_loss": 3.3716105524095816, + "tokens_seen": 2471485440 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012690070210631896, + "loss": 2.731, + "theoretical_loss": 3.371603404665698, + "tokens_seen": 2471550976 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012689067201604814, + "loss": 2.6782, + "theoretical_loss": 3.37159625716441, + "tokens_seen": 2471616512 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012688064192577732, + "loss": 2.9569, + "theoretical_loss": 3.3715891099057025, + "tokens_seen": 2471682048 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012687061183550653, + "loss": 2.568, + "theoretical_loss": 3.3715819628895605, + "tokens_seen": 2471747584 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001268605817452357, + "loss": 2.6159, + "theoretical_loss": 3.3715748161159698, + "tokens_seen": 2471813120 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001268505516549649, + "loss": 2.7729, + "theoretical_loss": 3.3715676695849153, + "tokens_seen": 2471878656 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012684052156469407, + "loss": 2.6015, + "theoretical_loss": 3.371560523296383, + "tokens_seen": 2471944192 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012683049147442328, + "loss": 2.8073, + "theoretical_loss": 3.3715533772503568, + "tokens_seen": 2472009728 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012682046138415246, + "loss": 2.6499, + "theoretical_loss": 3.371546231446824, + "tokens_seen": 2472075264 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012681043129388164, + "loss": 2.5989, + "theoretical_loss": 3.371539085885768, + "tokens_seen": 2472140800 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012680040120361083, + "loss": 2.3958, + "theoretical_loss": 3.3715319405671758, + "tokens_seen": 2472206336 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012679037111334, + "loss": 2.5683, + "theoretical_loss": 3.371524795491032, + "tokens_seen": 2472271872 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2739985, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.446648597717285, + "objective/train/theoretical_loss": 3.3715176506573217, + "objective/train/tokens_used": 2492797408, + "theoretical_loss": 3.3715176506573217, + "tokens_seen": 2472337408 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012678034102306922, + "loss": 2.8086, + "theoretical_loss": 3.3715176506573217, + "tokens_seen": 2472337408 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001267703109327984, + "loss": 2.692, + "theoretical_loss": 3.371510506066031, + "tokens_seen": 2472402944 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012676028084252758, + "loss": 2.7412, + "theoretical_loss": 3.3715033617171444, + "tokens_seen": 2472468480 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012675025075225676, + "loss": 2.6575, + "theoretical_loss": 3.3714962176106478, + "tokens_seen": 2472534016 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012674022066198597, + "loss": 2.452, + "theoretical_loss": 3.3714890737465266, + "tokens_seen": 2472599552 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012673019057171515, + "loss": 2.8258, + "theoretical_loss": 3.3714819301247654, + "tokens_seen": 2472665088 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012672016048144433, + "loss": 2.5999, + "theoretical_loss": 3.3714747867453507, + "tokens_seen": 2472730624 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001267101303911735, + "loss": 2.5614, + "theoretical_loss": 3.3714676436082667, + "tokens_seen": 2472796160 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001267001003009027, + "loss": 2.5814, + "theoretical_loss": 3.3714605007135, + "tokens_seen": 2472861696 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001266900702106319, + "loss": 2.5846, + "theoretical_loss": 3.371453358061035, + "tokens_seen": 2472927232 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012668004012036108, + "loss": 2.5731, + "theoretical_loss": 3.371446215650857, + "tokens_seen": 2472992768 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012667001003009026, + "loss": 2.8202, + "theoretical_loss": 3.371439073482952, + "tokens_seen": 2473058304 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012665997993981944, + "loss": 2.6458, + "theoretical_loss": 3.371431931557305, + "tokens_seen": 2473123840 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012664994984954865, + "loss": 2.6973, + "theoretical_loss": 3.371424789873901, + "tokens_seen": 2473189376 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012663991975927783, + "loss": 2.4185, + "theoretical_loss": 3.3714176484327263, + "tokens_seen": 2473254912 + }, + { + "epoch": 8.03, + "learning_rate": 0.000126629889669007, + "loss": 2.5139, + "theoretical_loss": 3.3714105072337657, + "tokens_seen": 2473320448 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001266198595787362, + "loss": 2.7077, + "theoretical_loss": 3.3714033662770047, + "tokens_seen": 2473385984 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012660982948846538, + "loss": 2.6405, + "theoretical_loss": 3.371396225562428, + "tokens_seen": 2473451520 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012659979939819458, + "loss": 2.6631, + "theoretical_loss": 3.3713890850900223, + "tokens_seen": 2473517056 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012658976930792377, + "loss": 2.5967, + "theoretical_loss": 3.371381944859772, + "tokens_seen": 2473582592 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012657973921765295, + "loss": 2.6175, + "theoretical_loss": 3.3713748048716625, + "tokens_seen": 2473648128 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012656970912738213, + "loss": 2.7838, + "theoretical_loss": 3.3713676651256796, + "tokens_seen": 2473713664 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012655967903711134, + "loss": 2.6565, + "theoretical_loss": 3.3713605256218084, + "tokens_seen": 2473779200 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012654964894684052, + "loss": 2.5419, + "theoretical_loss": 3.3713533863600342, + "tokens_seen": 2473844736 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001265396188565697, + "loss": 2.6335, + "theoretical_loss": 3.371346247340343, + "tokens_seen": 2473910272 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2740645, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.563281297683716, + "objective/train/theoretical_loss": 3.371339108562719, + "objective/train/tokens_used": 2494435808, + "theoretical_loss": 3.371339108562719, + "tokens_seen": 2473975808 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012652958876629888, + "loss": 2.716, + "theoretical_loss": 3.371339108562719, + "tokens_seen": 2473975808 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001265195586760281, + "loss": 2.7706, + "theoretical_loss": 3.371331970027149, + "tokens_seen": 2474041344 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012650952858575727, + "loss": 2.5266, + "theoretical_loss": 3.371324831733617, + "tokens_seen": 2474106880 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012649949849548648, + "loss": 2.6164, + "theoretical_loss": 3.3713176936821094, + "tokens_seen": 2474172416 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012648946840521566, + "loss": 2.4443, + "theoretical_loss": 3.371310555872611, + "tokens_seen": 2474237952 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012647943831494484, + "loss": 2.5453, + "theoretical_loss": 3.371303418305107, + "tokens_seen": 2474303488 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012646940822467405, + "loss": 2.641, + "theoretical_loss": 3.3712962809795837, + "tokens_seen": 2474369024 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012645937813440323, + "loss": 2.5204, + "theoretical_loss": 3.371289143896026, + "tokens_seen": 2474434560 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001264493480441324, + "loss": 2.5537, + "theoretical_loss": 3.371282007054419, + "tokens_seen": 2474500096 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001264393179538616, + "loss": 2.7262, + "theoretical_loss": 3.371274870454749, + "tokens_seen": 2474565632 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001264292878635908, + "loss": 2.738, + "theoretical_loss": 3.371267734097, + "tokens_seen": 2474631168 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012641925777331998, + "loss": 2.697, + "theoretical_loss": 3.3712605979811583, + "tokens_seen": 2474696704 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012640922768304916, + "loss": 2.8755, + "theoretical_loss": 3.371253462107209, + "tokens_seen": 2474762240 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012639919759277834, + "loss": 2.5993, + "theoretical_loss": 3.371246326475138, + "tokens_seen": 2474827776 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012638916750250752, + "loss": 2.7967, + "theoretical_loss": 3.37123919108493, + "tokens_seen": 2474893312 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012637913741223673, + "loss": 2.6524, + "theoretical_loss": 3.371232055936571, + "tokens_seen": 2474958848 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001263691073219659, + "loss": 2.6168, + "theoretical_loss": 3.371224921030046, + "tokens_seen": 2475024384 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001263590772316951, + "loss": 2.4592, + "theoretical_loss": 3.3712177863653405, + "tokens_seen": 2475089920 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012634904714142427, + "loss": 2.7239, + "theoretical_loss": 3.37121065194244, + "tokens_seen": 2475155456 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012633901705115348, + "loss": 2.7091, + "theoretical_loss": 3.3712035177613298, + "tokens_seen": 2475220992 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012632898696088266, + "loss": 2.7014, + "theoretical_loss": 3.371196383821995, + "tokens_seen": 2475286528 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012631895687061185, + "loss": 2.52, + "theoretical_loss": 3.371189250124422, + "tokens_seen": 2475352064 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012630892678034103, + "loss": 2.6042, + "theoretical_loss": 3.371182116668595, + "tokens_seen": 2475417600 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001262988966900702, + "loss": 2.4735, + "theoretical_loss": 3.3711749834545, + "tokens_seen": 2475483136 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012628886659979942, + "loss": 2.6313, + "theoretical_loss": 3.3711678504821223, + "tokens_seen": 2475548672 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2741587, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6049747467041016, + "objective/train/theoretical_loss": 3.3711607177514473, + "objective/train/tokens_used": 2496074208, + "theoretical_loss": 3.3711607177514473, + "tokens_seen": 2475614208 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001262788365095286, + "loss": 2.4958, + "theoretical_loss": 3.3711607177514473, + "tokens_seen": 2475614208 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012626880641925778, + "loss": 2.8572, + "theoretical_loss": 3.371153585262461, + "tokens_seen": 2475679744 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012625877632898696, + "loss": 2.4563, + "theoretical_loss": 3.3711464530151476, + "tokens_seen": 2475745280 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012624874623871617, + "loss": 3.0826, + "theoretical_loss": 3.3711393210094935, + "tokens_seen": 2475810816 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012623871614844535, + "loss": 2.6324, + "theoretical_loss": 3.371132189245484, + "tokens_seen": 2475876352 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012622868605817453, + "loss": 2.553, + "theoretical_loss": 3.3711250577231042, + "tokens_seen": 2475941888 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001262186559679037, + "loss": 2.7408, + "theoretical_loss": 3.3711179264423397, + "tokens_seen": 2476007424 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001262086258776329, + "loss": 2.6218, + "theoretical_loss": 3.371110795403176, + "tokens_seen": 2476072960 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001261985957873621, + "loss": 2.6825, + "theoretical_loss": 3.371103664605598, + "tokens_seen": 2476138496 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012618856569709128, + "loss": 2.7812, + "theoretical_loss": 3.3710965340495918, + "tokens_seen": 2476204032 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012617853560682046, + "loss": 2.7145, + "theoretical_loss": 3.371089403735142, + "tokens_seen": 2476269568 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012616850551654964, + "loss": 2.3521, + "theoretical_loss": 3.3710822736622355, + "tokens_seen": 2476335104 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012615847542627885, + "loss": 2.552, + "theoretical_loss": 3.371075143830856, + "tokens_seen": 2476400640 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012614844533600803, + "loss": 2.3785, + "theoretical_loss": 3.3710680142409903, + "tokens_seen": 2476466176 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012613841524573721, + "loss": 2.8058, + "theoretical_loss": 3.371060884892623, + "tokens_seen": 2476531712 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001261283851554664, + "loss": 2.8036, + "theoretical_loss": 3.3710537557857396, + "tokens_seen": 2476597248 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012611835506519558, + "loss": 2.6622, + "theoretical_loss": 3.371046626920326, + "tokens_seen": 2476662784 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012610832497492478, + "loss": 2.7259, + "theoretical_loss": 3.371039498296367, + "tokens_seen": 2476728320 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012609829488465397, + "loss": 2.7096, + "theoretical_loss": 3.3710323699138485, + "tokens_seen": 2476793856 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012608826479438315, + "loss": 2.8317, + "theoretical_loss": 3.371025241772756, + "tokens_seen": 2476859392 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012607823470411233, + "loss": 2.6336, + "theoretical_loss": 3.3710181138730744, + "tokens_seen": 2476924928 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012606820461384154, + "loss": 2.648, + "theoretical_loss": 3.3710109862147895, + "tokens_seen": 2476990464 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012605817452357072, + "loss": 2.4934, + "theoretical_loss": 3.371003858797887, + "tokens_seen": 2477056000 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001260481444332999, + "loss": 2.5992, + "theoretical_loss": 3.370996731622352, + "tokens_seen": 2477121536 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012603811434302908, + "loss": 2.6321, + "theoretical_loss": 3.3709896046881695, + "tokens_seen": 2477187072 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2746696, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.290653944015503, + "objective/train/theoretical_loss": 3.370982477995326, + "objective/train/tokens_used": 2497712608, + "theoretical_loss": 3.370982477995326, + "tokens_seen": 2477252608 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001260280842527583, + "loss": 2.4284, + "theoretical_loss": 3.370982477995326, + "tokens_seen": 2477252608 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012601805416248747, + "loss": 2.8295, + "theoretical_loss": 3.370975351543806, + "tokens_seen": 2477318144 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012600802407221665, + "loss": 2.5138, + "theoretical_loss": 3.3709682253335957, + "tokens_seen": 2477383680 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012599799398194583, + "loss": 2.6462, + "theoretical_loss": 3.3709610993646804, + "tokens_seen": 2477449216 + }, + { + "epoch": 8.03, + "learning_rate": 0.000125987963891675, + "loss": 2.7805, + "theoretical_loss": 3.370953973637045, + "tokens_seen": 2477514752 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012597793380140422, + "loss": 2.8241, + "theoretical_loss": 3.370946848150675, + "tokens_seen": 2477580288 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001259679037111334, + "loss": 2.7769, + "theoretical_loss": 3.370939722905556, + "tokens_seen": 2477645824 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012595787362086258, + "loss": 2.4429, + "theoretical_loss": 3.3709325979016738, + "tokens_seen": 2477711360 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012594784353059176, + "loss": 2.6573, + "theoretical_loss": 3.370925473139014, + "tokens_seen": 2477776896 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012593781344032097, + "loss": 2.5816, + "theoretical_loss": 3.370918348617561, + "tokens_seen": 2477842432 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012592778335005015, + "loss": 2.7456, + "theoretical_loss": 3.370911224337301, + "tokens_seen": 2477907968 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012591775325977933, + "loss": 2.6232, + "theoretical_loss": 3.37090410029822, + "tokens_seen": 2477973504 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012590772316950851, + "loss": 2.6675, + "theoretical_loss": 3.3708969765003025, + "tokens_seen": 2478039040 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001258976930792377, + "loss": 2.6336, + "theoretical_loss": 3.3708898529435345, + "tokens_seen": 2478104576 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001258876629889669, + "loss": 2.72, + "theoretical_loss": 3.3708827296279007, + "tokens_seen": 2478170112 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012587763289869609, + "loss": 2.7144, + "theoretical_loss": 3.3708756065533874, + "tokens_seen": 2478235648 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012586760280842527, + "loss": 2.6199, + "theoretical_loss": 3.37086848371998, + "tokens_seen": 2478301184 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012585757271815445, + "loss": 2.5576, + "theoretical_loss": 3.3708613611276634, + "tokens_seen": 2478366720 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012584754262788366, + "loss": 2.7196, + "theoretical_loss": 3.3708542387764235, + "tokens_seen": 2478432256 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012583751253761284, + "loss": 2.8036, + "theoretical_loss": 3.370847116666246, + "tokens_seen": 2478497792 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012582748244734202, + "loss": 2.4618, + "theoretical_loss": 3.3708399947971155, + "tokens_seen": 2478563328 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001258174523570712, + "loss": 2.5223, + "theoretical_loss": 3.3708328731690185, + "tokens_seen": 2478628864 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012580742226680038, + "loss": 2.6904, + "theoretical_loss": 3.3708257517819398, + "tokens_seen": 2478694400 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001257973921765296, + "loss": 2.8328, + "theoretical_loss": 3.3708186306358647, + "tokens_seen": 2478759936 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012578736208625877, + "loss": 2.6714, + "theoretical_loss": 3.3708115097307796, + "tokens_seen": 2478825472 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2751881, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.608482599258423, + "objective/train/theoretical_loss": 3.3708043890666692, + "objective/train/tokens_used": 2499351008, + "theoretical_loss": 3.3708043890666692, + "tokens_seen": 2478891008 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012577733199598795, + "loss": 2.6745, + "theoretical_loss": 3.3708043890666692, + "tokens_seen": 2478891008 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012576730190571713, + "loss": 2.5345, + "theoretical_loss": 3.370797268643519, + "tokens_seen": 2478956544 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012575727181544634, + "loss": 2.5103, + "theoretical_loss": 3.3707901484613148, + "tokens_seen": 2479022080 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012574724172517555, + "loss": 2.4838, + "theoretical_loss": 3.3707830285200417, + "tokens_seen": 2479087616 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012573721163490473, + "loss": 2.6984, + "theoretical_loss": 3.3707759088196854, + "tokens_seen": 2479153152 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001257271815446339, + "loss": 2.6185, + "theoretical_loss": 3.370768789360232, + "tokens_seen": 2479218688 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001257171514543631, + "loss": 2.7245, + "theoretical_loss": 3.3707616701416656, + "tokens_seen": 2479284224 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001257071213640923, + "loss": 2.4676, + "theoretical_loss": 3.370754551163973, + "tokens_seen": 2479349760 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012569709127382148, + "loss": 2.7043, + "theoretical_loss": 3.3707474324271387, + "tokens_seen": 2479415296 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012568706118355066, + "loss": 2.7631, + "theoretical_loss": 3.3707403139311487, + "tokens_seen": 2479480832 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012567703109327984, + "loss": 2.809, + "theoretical_loss": 3.3707331956759887, + "tokens_seen": 2479546368 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012566700100300905, + "loss": 2.6834, + "theoretical_loss": 3.3707260776616437, + "tokens_seen": 2479611904 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012565697091273823, + "loss": 2.5605, + "theoretical_loss": 3.3707189598880993, + "tokens_seen": 2479677440 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012564694082246741, + "loss": 2.7913, + "theoretical_loss": 3.370711842355341, + "tokens_seen": 2479742976 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001256369107321966, + "loss": 2.55, + "theoretical_loss": 3.3707047250633546, + "tokens_seen": 2479808512 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012562688064192578, + "loss": 2.4892, + "theoretical_loss": 3.370697608012125, + "tokens_seen": 2479874048 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012561685055165498, + "loss": 2.7969, + "theoretical_loss": 3.370690491201638, + "tokens_seen": 2479939584 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012560682046138417, + "loss": 2.3498, + "theoretical_loss": 3.37068337463188, + "tokens_seen": 2480005120 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012559679037111335, + "loss": 2.6765, + "theoretical_loss": 3.3706762583028347, + "tokens_seen": 2480070656 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012558676028084253, + "loss": 2.6361, + "theoretical_loss": 3.370669142214489, + "tokens_seen": 2480136192 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012557673019057174, + "loss": 2.5601, + "theoretical_loss": 3.3706620263668277, + "tokens_seen": 2480201728 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012556670010030092, + "loss": 2.6177, + "theoretical_loss": 3.3706549107598365, + "tokens_seen": 2480267264 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001255566700100301, + "loss": 2.768, + "theoretical_loss": 3.3706477953935012, + "tokens_seen": 2480332800 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012554663991975928, + "loss": 2.4566, + "theoretical_loss": 3.370640680267807, + "tokens_seen": 2480398336 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001255366098294885, + "loss": 2.5494, + "theoretical_loss": 3.370633565382739, + "tokens_seen": 2480463872 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2756904, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.510277271270752, + "objective/train/theoretical_loss": 3.3706264507382837, + "objective/train/tokens_used": 2500989408, + "theoretical_loss": 3.3706264507382837, + "tokens_seen": 2480529408 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012552657973921767, + "loss": 2.8182, + "theoretical_loss": 3.3706264507382837, + "tokens_seen": 2480529408 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012551654964894685, + "loss": 2.5487, + "theoretical_loss": 3.3706193363344257, + "tokens_seen": 2480594944 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012550651955867603, + "loss": 2.6011, + "theoretical_loss": 3.370612222171151, + "tokens_seen": 2480660480 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001254964894684052, + "loss": 2.4998, + "theoretical_loss": 3.370605108248445, + "tokens_seen": 2480726016 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012548645937813442, + "loss": 2.6969, + "theoretical_loss": 3.3705979945662934, + "tokens_seen": 2480791552 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001254764292878636, + "loss": 2.6208, + "theoretical_loss": 3.3705908811246807, + "tokens_seen": 2480857088 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012546639919759278, + "loss": 2.5585, + "theoretical_loss": 3.370583767923594, + "tokens_seen": 2480922624 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012545636910732196, + "loss": 2.5045, + "theoretical_loss": 3.370576654963018, + "tokens_seen": 2480988160 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012544633901705117, + "loss": 2.5108, + "theoretical_loss": 3.3705695422429374, + "tokens_seen": 2481053696 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012543630892678035, + "loss": 2.5258, + "theoretical_loss": 3.370562429763339, + "tokens_seen": 2481119232 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012542627883650953, + "loss": 2.6092, + "theoretical_loss": 3.3705553175242082, + "tokens_seen": 2481184768 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012541624874623872, + "loss": 2.7035, + "theoretical_loss": 3.37054820552553, + "tokens_seen": 2481250304 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001254062186559679, + "loss": 2.7065, + "theoretical_loss": 3.37054109376729, + "tokens_seen": 2481315840 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001253961885656971, + "loss": 2.4737, + "theoretical_loss": 3.3705339822494738, + "tokens_seen": 2481381376 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012538615847542629, + "loss": 2.5624, + "theoretical_loss": 3.370526870972067, + "tokens_seen": 2481446912 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012537612838515547, + "loss": 2.5133, + "theoretical_loss": 3.370519759935055, + "tokens_seen": 2481512448 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012536609829488465, + "loss": 2.7395, + "theoretical_loss": 3.3705126491384236, + "tokens_seen": 2481577984 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012535606820461386, + "loss": 2.6378, + "theoretical_loss": 3.370505538582158, + "tokens_seen": 2481643520 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012534603811434304, + "loss": 2.7092, + "theoretical_loss": 3.3704984282662434, + "tokens_seen": 2481709056 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012533600802407222, + "loss": 2.549, + "theoretical_loss": 3.3704913181906666, + "tokens_seen": 2481774592 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001253259779338014, + "loss": 2.5454, + "theoretical_loss": 3.370484208355412, + "tokens_seen": 2481840128 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012531594784353058, + "loss": 2.5465, + "theoretical_loss": 3.370477098760465, + "tokens_seen": 2481905664 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001253059177532598, + "loss": 2.4743, + "theoretical_loss": 3.370469989405812, + "tokens_seen": 2481971200 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012529588766298897, + "loss": 2.6449, + "theoretical_loss": 3.370462880291438, + "tokens_seen": 2482036736 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012528585757271815, + "loss": 2.7765, + "theoretical_loss": 3.370455771417329, + "tokens_seen": 2482102272 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2762079, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.937701463699341, + "objective/train/theoretical_loss": 3.3704486627834696, + "objective/train/tokens_used": 2502627808, + "theoretical_loss": 3.3704486627834696, + "tokens_seen": 2482167808 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012527582748244733, + "loss": 2.536, + "theoretical_loss": 3.3704486627834696, + "tokens_seen": 2482167808 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012526579739217654, + "loss": 2.6852, + "theoretical_loss": 3.370441554389846, + "tokens_seen": 2482233344 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012525576730190572, + "loss": 2.4077, + "theoretical_loss": 3.3704344462364437, + "tokens_seen": 2482298880 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001252457372116349, + "loss": 2.6924, + "theoretical_loss": 3.3704273383232484, + "tokens_seen": 2482364416 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012523570712136408, + "loss": 2.6775, + "theoretical_loss": 3.3704202306502453, + "tokens_seen": 2482429952 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012522567703109326, + "loss": 2.5295, + "theoretical_loss": 3.37041312321742, + "tokens_seen": 2482495488 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012521564694082247, + "loss": 2.6386, + "theoretical_loss": 3.3704060160247584, + "tokens_seen": 2482561024 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012520561685055165, + "loss": 2.4849, + "theoretical_loss": 3.3703989090722457, + "tokens_seen": 2482626560 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012519558676028084, + "loss": 2.4999, + "theoretical_loss": 3.370391802359867, + "tokens_seen": 2482692096 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012518555667001002, + "loss": 2.4917, + "theoretical_loss": 3.370384695887609, + "tokens_seen": 2482757632 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012517552657973922, + "loss": 2.6921, + "theoretical_loss": 3.3703775896554564, + "tokens_seen": 2482823168 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001251654964894684, + "loss": 2.6124, + "theoretical_loss": 3.3703704836633945, + "tokens_seen": 2482888704 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001251554663991976, + "loss": 2.6279, + "theoretical_loss": 3.37036337791141, + "tokens_seen": 2482954240 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012514543630892677, + "loss": 2.6969, + "theoretical_loss": 3.3703562723994875, + "tokens_seen": 2483019776 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012513540621865595, + "loss": 2.5483, + "theoretical_loss": 3.3703491671276127, + "tokens_seen": 2483085312 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012512537612838516, + "loss": 2.6872, + "theoretical_loss": 3.370342062095771, + "tokens_seen": 2483150848 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012511534603811434, + "loss": 2.586, + "theoretical_loss": 3.3703349573039487, + "tokens_seen": 2483216384 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012510531594784352, + "loss": 2.6744, + "theoretical_loss": 3.3703278527521308, + "tokens_seen": 2483281920 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001250952858575727, + "loss": 2.6521, + "theoretical_loss": 3.370320748440303, + "tokens_seen": 2483347456 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001250852557673019, + "loss": 2.6388, + "theoretical_loss": 3.370313644368451, + "tokens_seen": 2483412992 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001250752256770311, + "loss": 2.6675, + "theoretical_loss": 3.3703065405365598, + "tokens_seen": 2483478528 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012506519558676027, + "loss": 2.5654, + "theoretical_loss": 3.3702994369446153, + "tokens_seen": 2483544064 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012505516549648945, + "loss": 2.56, + "theoretical_loss": 3.3702923335926034, + "tokens_seen": 2483609600 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012504513540621863, + "loss": 2.6097, + "theoretical_loss": 3.370285230480509, + "tokens_seen": 2483675136 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012503510531594784, + "loss": 2.6997, + "theoretical_loss": 3.370278127608318, + "tokens_seen": 2483740672 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2763167, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4442803859710693, + "objective/train/theoretical_loss": 3.3702710249760166, + "objective/train/tokens_used": 2504266208, + "theoretical_loss": 3.3702710249760166, + "tokens_seen": 2483806208 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012502507522567702, + "loss": 2.4254, + "theoretical_loss": 3.3702710249760166, + "tokens_seen": 2483806208 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001250150451354062, + "loss": 2.593, + "theoretical_loss": 3.370263922583589, + "tokens_seen": 2483871744 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012500501504513538, + "loss": 2.4711, + "theoretical_loss": 3.370256820431022, + "tokens_seen": 2483937280 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001249949849548646, + "loss": 2.4258, + "theoretical_loss": 3.3702497185183007, + "tokens_seen": 2484002816 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012498495486459377, + "loss": 2.7625, + "theoretical_loss": 3.3702426168454105, + "tokens_seen": 2484068352 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012497492477432298, + "loss": 2.5243, + "theoretical_loss": 3.370235515412337, + "tokens_seen": 2484133888 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012496489468405216, + "loss": 2.8055, + "theoretical_loss": 3.370228414219066, + "tokens_seen": 2484199424 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012495486459378134, + "loss": 2.5453, + "theoretical_loss": 3.370221313265583, + "tokens_seen": 2484264960 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012494483450351053, + "loss": 2.4343, + "theoretical_loss": 3.370214212551874, + "tokens_seen": 2484330496 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001249348044132397, + "loss": 2.6049, + "theoretical_loss": 3.370207112077924, + "tokens_seen": 2484396032 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012492477432296892, + "loss": 2.8411, + "theoretical_loss": 3.3702000118437185, + "tokens_seen": 2484461568 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001249147442326981, + "loss": 2.4324, + "theoretical_loss": 3.3701929118492435, + "tokens_seen": 2484527104 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012490471414242728, + "loss": 2.6058, + "theoretical_loss": 3.3701858120944843, + "tokens_seen": 2484592640 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012489468405215646, + "loss": 2.5109, + "theoretical_loss": 3.3701787125794267, + "tokens_seen": 2484658176 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012488465396188567, + "loss": 2.8196, + "theoretical_loss": 3.3701716133040556, + "tokens_seen": 2484723712 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012487462387161485, + "loss": 2.634, + "theoretical_loss": 3.3701645142683576, + "tokens_seen": 2484789248 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012486459378134403, + "loss": 2.4753, + "theoretical_loss": 3.370157415472318, + "tokens_seen": 2484854784 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001248545636910732, + "loss": 2.618, + "theoretical_loss": 3.370150316915922, + "tokens_seen": 2484920320 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001248445336008024, + "loss": 2.5668, + "theoretical_loss": 3.3701432185991558, + "tokens_seen": 2484985856 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001248345035105316, + "loss": 2.4605, + "theoretical_loss": 3.370136120522004, + "tokens_seen": 2485051392 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012482447342026078, + "loss": 2.7253, + "theoretical_loss": 3.3701290226844534, + "tokens_seen": 2485116928 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012481444332998996, + "loss": 2.7015, + "theoretical_loss": 3.3701219250864884, + "tokens_seen": 2485182464 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012480441323971917, + "loss": 2.4172, + "theoretical_loss": 3.370114827728096, + "tokens_seen": 2485248000 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012479438314944835, + "loss": 2.5389, + "theoretical_loss": 3.37010773060926, + "tokens_seen": 2485313536 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012478435305917753, + "loss": 2.6066, + "theoretical_loss": 3.370100633729968, + "tokens_seen": 2485379072 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2763866, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.346468687057495, + "objective/train/theoretical_loss": 3.3700935370902037, + "objective/train/tokens_used": 2505904608, + "theoretical_loss": 3.3700935370902037, + "tokens_seen": 2485444608 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012477432296890674, + "loss": 2.5143, + "theoretical_loss": 3.3700935370902037, + "tokens_seen": 2485444608 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012476429287863592, + "loss": 2.4117, + "theoretical_loss": 3.370086440689954, + "tokens_seen": 2485510144 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001247542627883651, + "loss": 2.5297, + "theoretical_loss": 3.370079344529204, + "tokens_seen": 2485575680 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012474423269809428, + "loss": 2.6699, + "theoretical_loss": 3.3700722486079395, + "tokens_seen": 2485641216 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012473420260782346, + "loss": 2.6114, + "theoretical_loss": 3.370065152926146, + "tokens_seen": 2485706752 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012472417251755267, + "loss": 2.7175, + "theoretical_loss": 3.370058057483809, + "tokens_seen": 2485772288 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012471414242728185, + "loss": 2.5482, + "theoretical_loss": 3.3700509622809145, + "tokens_seen": 2485837824 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012470411233701104, + "loss": 2.6525, + "theoretical_loss": 3.370043867317447, + "tokens_seen": 2485903360 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012469408224674022, + "loss": 2.4205, + "theoretical_loss": 3.3700367725933935, + "tokens_seen": 2485968896 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012468405215646942, + "loss": 2.5029, + "theoretical_loss": 3.3700296781087395, + "tokens_seen": 2486034432 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001246740220661986, + "loss": 2.5576, + "theoretical_loss": 3.370022583863469, + "tokens_seen": 2486099968 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001246639919759278, + "loss": 2.3772, + "theoretical_loss": 3.37001548985757, + "tokens_seen": 2486165504 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012465396188565697, + "loss": 2.5831, + "theoretical_loss": 3.370008396091026, + "tokens_seen": 2486231040 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012464393179538615, + "loss": 2.6577, + "theoretical_loss": 3.370001302563824, + "tokens_seen": 2486296576 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012463390170511536, + "loss": 2.4827, + "theoretical_loss": 3.3699942092759487, + "tokens_seen": 2486362112 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012462387161484454, + "loss": 2.6229, + "theoretical_loss": 3.3699871162273864, + "tokens_seen": 2486427648 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012461384152457372, + "loss": 2.6919, + "theoretical_loss": 3.3699800234181225, + "tokens_seen": 2486493184 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001246038114343029, + "loss": 2.4665, + "theoretical_loss": 3.3699729308481423, + "tokens_seen": 2486558720 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001245937813440321, + "loss": 2.7529, + "theoretical_loss": 3.3699658385174316, + "tokens_seen": 2486624256 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001245837512537613, + "loss": 2.5062, + "theoretical_loss": 3.3699587464259766, + "tokens_seen": 2486689792 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012457372116349047, + "loss": 2.6065, + "theoretical_loss": 3.369951654573762, + "tokens_seen": 2486755328 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012456369107321965, + "loss": 2.4744, + "theoretical_loss": 3.369944562960774, + "tokens_seen": 2486820864 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012455366098294883, + "loss": 2.6444, + "theoretical_loss": 3.3699374715869976, + "tokens_seen": 2486886400 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012454363089267804, + "loss": 2.525, + "theoretical_loss": 3.3699303804524194, + "tokens_seen": 2486951936 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012453360080240722, + "loss": 2.3591, + "theoretical_loss": 3.3699232895570246, + "tokens_seen": 2487017472 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2765122, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.650627374649048, + "objective/train/theoretical_loss": 3.3699161989007984, + "objective/train/tokens_used": 2507543008, + "theoretical_loss": 3.3699161989007984, + "tokens_seen": 2487083008 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001245235707121364, + "loss": 2.5334, + "theoretical_loss": 3.3699161989007984, + "tokens_seen": 2487083008 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012451354062186559, + "loss": 2.7059, + "theoretical_loss": 3.369909108483727, + "tokens_seen": 2487148544 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001245035105315948, + "loss": 2.8164, + "theoretical_loss": 3.3699020183057957, + "tokens_seen": 2487214080 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012449348044132397, + "loss": 2.3227, + "theoretical_loss": 3.369894928366991, + "tokens_seen": 2487279616 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012448345035105316, + "loss": 2.8102, + "theoretical_loss": 3.369887838667297, + "tokens_seen": 2487345152 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012447342026078234, + "loss": 2.4173, + "theoretical_loss": 3.3698807492067004, + "tokens_seen": 2487410688 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012446339017051152, + "loss": 2.6106, + "theoretical_loss": 3.369873659985186, + "tokens_seen": 2487476224 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012445336008024073, + "loss": 2.8821, + "theoretical_loss": 3.3698665710027407, + "tokens_seen": 2487541760 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001244433299899699, + "loss": 2.6868, + "theoretical_loss": 3.3698594822593493, + "tokens_seen": 2487607296 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012443329989969912, + "loss": 2.5645, + "theoretical_loss": 3.3698523937549973, + "tokens_seen": 2487672832 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001244232698094283, + "loss": 2.4872, + "theoretical_loss": 3.369845305489671, + "tokens_seen": 2487738368 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012441323971915748, + "loss": 2.8046, + "theoretical_loss": 3.3698382174633554, + "tokens_seen": 2487803904 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012440320962888666, + "loss": 2.7014, + "theoretical_loss": 3.3698311296760366, + "tokens_seen": 2487869440 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012439317953861587, + "loss": 2.5315, + "theoretical_loss": 3.3698240421277, + "tokens_seen": 2487934976 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012438314944834505, + "loss": 2.548, + "theoretical_loss": 3.369816954818331, + "tokens_seen": 2488000512 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012437311935807423, + "loss": 2.6859, + "theoretical_loss": 3.3698098677479162, + "tokens_seen": 2488066048 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001243630892678034, + "loss": 2.4347, + "theoretical_loss": 3.3698027809164404, + "tokens_seen": 2488131584 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001243530591775326, + "loss": 2.6463, + "theoretical_loss": 3.3697956943238894, + "tokens_seen": 2488197120 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001243430290872618, + "loss": 2.6875, + "theoretical_loss": 3.369788607970249, + "tokens_seen": 2488262656 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012433299899699098, + "loss": 2.5427, + "theoretical_loss": 3.3697815218555043, + "tokens_seen": 2488328192 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012432296890672016, + "loss": 2.5477, + "theoretical_loss": 3.3697744359796418, + "tokens_seen": 2488393728 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012431293881644934, + "loss": 2.5335, + "theoretical_loss": 3.369767350342647, + "tokens_seen": 2488459264 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012430290872617855, + "loss": 2.8622, + "theoretical_loss": 3.369760264944505, + "tokens_seen": 2488524800 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012429287863590773, + "loss": 2.7712, + "theoretical_loss": 3.3697531797852016, + "tokens_seen": 2488590336 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001242828485456369, + "loss": 2.7177, + "theoretical_loss": 3.369746094864723, + "tokens_seen": 2488655872 + }, + { + "epoch": 8.03, + "objective/train/docs_used": 2765807, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4517862796783447, + "objective/train/theoretical_loss": 3.3697390101830544, + "objective/train/tokens_used": 2509181408, + "theoretical_loss": 3.3697390101830544, + "tokens_seen": 2488721408 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001242728184553661, + "loss": 2.5249, + "theoretical_loss": 3.3697390101830544, + "tokens_seen": 2488721408 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012426278836509528, + "loss": 2.5658, + "theoretical_loss": 3.3697319257401817, + "tokens_seen": 2488786944 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012425275827482448, + "loss": 2.5197, + "theoretical_loss": 3.3697248415360903, + "tokens_seen": 2488852480 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012424272818455367, + "loss": 2.6954, + "theoretical_loss": 3.3697177575707657, + "tokens_seen": 2488918016 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012423269809428285, + "loss": 2.7283, + "theoretical_loss": 3.369710673844194, + "tokens_seen": 2488983552 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012422266800401203, + "loss": 2.6731, + "theoretical_loss": 3.3697035903563615, + "tokens_seen": 2489049088 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012421263791374124, + "loss": 2.6212, + "theoretical_loss": 3.3696965071072524, + "tokens_seen": 2489114624 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012420260782347042, + "loss": 2.5762, + "theoretical_loss": 3.369689424096853, + "tokens_seen": 2489180160 + }, + { + "epoch": 8.03, + "learning_rate": 0.0001241925777331996, + "loss": 2.6452, + "theoretical_loss": 3.3696823413251495, + "tokens_seen": 2489245696 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012418254764292878, + "loss": 2.5417, + "theoretical_loss": 3.369675258792127, + "tokens_seen": 2489311232 + }, + { + "epoch": 8.03, + "learning_rate": 0.000124172517552658, + "loss": 2.2734, + "theoretical_loss": 3.369668176497771, + "tokens_seen": 2489376768 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012416248746238717, + "loss": 2.7265, + "theoretical_loss": 3.3696610944420677, + "tokens_seen": 2489442304 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012415245737211635, + "loss": 2.7165, + "theoretical_loss": 3.369654012625002, + "tokens_seen": 2489507840 + }, + { + "epoch": 8.03, + "learning_rate": 0.00012414242728184553, + "loss": 2.6668, + "theoretical_loss": 3.369646931046561, + "tokens_seen": 2489573376 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001241323971915747, + "loss": 2.5319, + "theoretical_loss": 3.369639849706729, + "tokens_seen": 2489638912 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012412236710130392, + "loss": 2.2521, + "theoretical_loss": 3.3696327686054923, + "tokens_seen": 2489704448 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001241123370110331, + "loss": 2.4891, + "theoretical_loss": 3.369625687742836, + "tokens_seen": 2489769984 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012410230692076228, + "loss": 2.6812, + "theoretical_loss": 3.3696186071187473, + "tokens_seen": 2489835520 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012409227683049146, + "loss": 2.6088, + "theoretical_loss": 3.36961152673321, + "tokens_seen": 2489901056 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012408224674022067, + "loss": 2.6342, + "theoretical_loss": 3.3696044465862105, + "tokens_seen": 2489966592 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012407221664994985, + "loss": 2.5194, + "theoretical_loss": 3.369597366677735, + "tokens_seen": 2490032128 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012406218655967903, + "loss": 2.5598, + "theoretical_loss": 3.3695902870077687, + "tokens_seen": 2490097664 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012405215646940824, + "loss": 2.8139, + "theoretical_loss": 3.369583207576297, + "tokens_seen": 2490163200 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012404212637913742, + "loss": 2.2464, + "theoretical_loss": 3.3695761283833066, + "tokens_seen": 2490228736 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001240320962888666, + "loss": 2.6248, + "theoretical_loss": 3.369569049428782, + "tokens_seen": 2490294272 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2767125, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7564380168914795, + "objective/train/theoretical_loss": 3.36956197071271, + "objective/train/tokens_used": 2510819808, + "theoretical_loss": 3.36956197071271, + "tokens_seen": 2490359808 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012402206619859579, + "loss": 2.6751, + "theoretical_loss": 3.36956197071271, + "tokens_seen": 2490359808 + }, + { + "epoch": 8.04, + "learning_rate": 0.000124012036108325, + "loss": 2.5456, + "theoretical_loss": 3.369554892235075, + "tokens_seen": 2490425344 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012400200601805417, + "loss": 2.4507, + "theoretical_loss": 3.369547813995864, + "tokens_seen": 2490490880 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012399197592778336, + "loss": 2.4233, + "theoretical_loss": 3.369540735995062, + "tokens_seen": 2490556416 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012398194583751254, + "loss": 2.4785, + "theoretical_loss": 3.3695336582326547, + "tokens_seen": 2490621952 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012397191574724172, + "loss": 2.6334, + "theoretical_loss": 3.369526580708628, + "tokens_seen": 2490687488 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012396188565697093, + "loss": 2.4977, + "theoretical_loss": 3.3695195034229677, + "tokens_seen": 2490753024 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001239518555667001, + "loss": 2.5771, + "theoretical_loss": 3.3695124263756586, + "tokens_seen": 2490818560 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001239418254764293, + "loss": 2.5834, + "theoretical_loss": 3.369505349566688, + "tokens_seen": 2490884096 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012393179538615847, + "loss": 2.5621, + "theoretical_loss": 3.36949827299604, + "tokens_seen": 2490949632 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012392176529588768, + "loss": 2.3913, + "theoretical_loss": 3.3694911966637013, + "tokens_seen": 2491015168 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012391173520561686, + "loss": 2.4609, + "theoretical_loss": 3.3694841205696577, + "tokens_seen": 2491080704 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012390170511534604, + "loss": 2.4978, + "theoretical_loss": 3.369477044713894, + "tokens_seen": 2491146240 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012389167502507522, + "loss": 2.4399, + "theoretical_loss": 3.3694699690963965, + "tokens_seen": 2491211776 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012388164493480443, + "loss": 2.7592, + "theoretical_loss": 3.3694628937171514, + "tokens_seen": 2491277312 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001238716148445336, + "loss": 2.594, + "theoretical_loss": 3.369455818576143, + "tokens_seen": 2491342848 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001238615847542628, + "loss": 2.5978, + "theoretical_loss": 3.3694487436733587, + "tokens_seen": 2491408384 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012385155466399197, + "loss": 2.8045, + "theoretical_loss": 3.369441669008783, + "tokens_seen": 2491473920 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012384152457372115, + "loss": 2.5283, + "theoretical_loss": 3.369434594582402, + "tokens_seen": 2491539456 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012383149448345036, + "loss": 2.6917, + "theoretical_loss": 3.3694275203942015, + "tokens_seen": 2491604992 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012382146439317954, + "loss": 2.4918, + "theoretical_loss": 3.369420446444167, + "tokens_seen": 2491670528 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012381143430290872, + "loss": 2.4506, + "theoretical_loss": 3.3694133727322844, + "tokens_seen": 2491736064 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001238014042126379, + "loss": 2.7209, + "theoretical_loss": 3.3694062992585394, + "tokens_seen": 2491801600 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012379137412236711, + "loss": 2.4842, + "theoretical_loss": 3.3693992260229177, + "tokens_seen": 2491867136 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001237813440320963, + "loss": 2.5936, + "theoretical_loss": 3.369392153025405, + "tokens_seen": 2491932672 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2767936, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6289756298065186, + "objective/train/theoretical_loss": 3.369385080265987, + "objective/train/tokens_used": 2512458208, + "theoretical_loss": 3.369385080265987, + "tokens_seen": 2491998208 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012377131394182548, + "loss": 2.4147, + "theoretical_loss": 3.369385080265987, + "tokens_seen": 2491998208 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012376128385155466, + "loss": 2.6152, + "theoretical_loss": 3.3693780077446496, + "tokens_seen": 2492063744 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012375125376128384, + "loss": 2.6174, + "theoretical_loss": 3.3693709354613786, + "tokens_seen": 2492129280 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012374122367101305, + "loss": 2.6825, + "theoretical_loss": 3.369363863416159, + "tokens_seen": 2492194816 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012373119358074223, + "loss": 2.5203, + "theoretical_loss": 3.3693567916089773, + "tokens_seen": 2492260352 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001237211634904714, + "loss": 2.5631, + "theoretical_loss": 3.369349720039819, + "tokens_seen": 2492325888 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001237111334002006, + "loss": 2.6153, + "theoretical_loss": 3.3693426487086695, + "tokens_seen": 2492391424 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001237011033099298, + "loss": 2.6638, + "theoretical_loss": 3.369335577615515, + "tokens_seen": 2492456960 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012369107321965898, + "loss": 2.6565, + "theoretical_loss": 3.369328506760341, + "tokens_seen": 2492522496 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001236810431293882, + "loss": 2.7057, + "theoretical_loss": 3.3693214361431334, + "tokens_seen": 2492588032 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012367101303911737, + "loss": 2.4299, + "theoretical_loss": 3.369314365763878, + "tokens_seen": 2492653568 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012366098294884655, + "loss": 2.5921, + "theoretical_loss": 3.3693072956225603, + "tokens_seen": 2492719104 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012365095285857573, + "loss": 2.4307, + "theoretical_loss": 3.369300225719166, + "tokens_seen": 2492784640 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001236409227683049, + "loss": 2.7084, + "theoretical_loss": 3.369293156053681, + "tokens_seen": 2492850176 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012363089267803412, + "loss": 2.5469, + "theoretical_loss": 3.369286086626091, + "tokens_seen": 2492915712 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001236208625877633, + "loss": 2.6326, + "theoretical_loss": 3.3692790174363814, + "tokens_seen": 2492981248 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012361083249749248, + "loss": 2.3849, + "theoretical_loss": 3.3692719484845384, + "tokens_seen": 2493046784 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012360080240722166, + "loss": 2.6935, + "theoretical_loss": 3.369264879770548, + "tokens_seen": 2493112320 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012359077231695087, + "loss": 2.4072, + "theoretical_loss": 3.3692578112943954, + "tokens_seen": 2493177856 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012358074222668005, + "loss": 2.5558, + "theoretical_loss": 3.369250743056066, + "tokens_seen": 2493243392 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012357071213640923, + "loss": 2.5567, + "theoretical_loss": 3.3692436750555466, + "tokens_seen": 2493308928 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012356068204613841, + "loss": 2.5009, + "theoretical_loss": 3.369236607292822, + "tokens_seen": 2493374464 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001235506519558676, + "loss": 2.6095, + "theoretical_loss": 3.3692295397678786, + "tokens_seen": 2493440000 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001235406218655968, + "loss": 2.5046, + "theoretical_loss": 3.369222472480702, + "tokens_seen": 2493505536 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012353059177532599, + "loss": 2.7973, + "theoretical_loss": 3.3692154054312775, + "tokens_seen": 2493571072 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2768968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.402454137802124, + "objective/train/theoretical_loss": 3.3692083386195915, + "objective/train/tokens_used": 2514096608, + "theoretical_loss": 3.3692083386195915, + "tokens_seen": 2493636608 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012352056168505517, + "loss": 2.3867, + "theoretical_loss": 3.3692083386195915, + "tokens_seen": 2493636608 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012351053159478435, + "loss": 2.3751, + "theoretical_loss": 3.369201272045629, + "tokens_seen": 2493702144 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012350050150451356, + "loss": 2.7073, + "theoretical_loss": 3.369194205709377, + "tokens_seen": 2493767680 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012349047141424274, + "loss": 2.8014, + "theoretical_loss": 3.36918713961082, + "tokens_seen": 2493833216 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012349047141424274, + "loss": 2.5158, + "theoretical_loss": 3.369180073749944, + "tokens_seen": 2493898752 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012348044132397192, + "loss": 2.5925, + "theoretical_loss": 3.3691730081267353, + "tokens_seen": 2493964288 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001234704112337011, + "loss": 2.5929, + "theoretical_loss": 3.369165942741179, + "tokens_seen": 2494029824 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012346038114343028, + "loss": 2.4998, + "theoretical_loss": 3.3691588775932617, + "tokens_seen": 2494095360 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001234503510531595, + "loss": 2.8621, + "theoretical_loss": 3.3691518126829685, + "tokens_seen": 2494160896 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012344032096288867, + "loss": 2.5321, + "theoretical_loss": 3.3691447480102856, + "tokens_seen": 2494226432 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012343029087261785, + "loss": 2.6369, + "theoretical_loss": 3.369137683575198, + "tokens_seen": 2494291968 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012342026078234703, + "loss": 2.4921, + "theoretical_loss": 3.3691306193776924, + "tokens_seen": 2494357504 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012341023069207624, + "loss": 2.7161, + "theoretical_loss": 3.3691235554177537, + "tokens_seen": 2494423040 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012340020060180542, + "loss": 2.6497, + "theoretical_loss": 3.3691164916953684, + "tokens_seen": 2494488576 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001233901705115346, + "loss": 2.4815, + "theoretical_loss": 3.369109428210522, + "tokens_seen": 2494554112 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012338014042126378, + "loss": 2.5812, + "theoretical_loss": 3.3691023649632, + "tokens_seen": 2494619648 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012337011033099296, + "loss": 2.3603, + "theoretical_loss": 3.3690953019533887, + "tokens_seen": 2494685184 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012336008024072217, + "loss": 2.6178, + "theoretical_loss": 3.3690882391810737, + "tokens_seen": 2494750720 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012335005015045135, + "loss": 2.567, + "theoretical_loss": 3.3690811766462403, + "tokens_seen": 2494816256 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012334002006018054, + "loss": 2.4952, + "theoretical_loss": 3.3690741143488747, + "tokens_seen": 2494881792 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012332998996990972, + "loss": 2.4881, + "theoretical_loss": 3.3690670522889627, + "tokens_seen": 2494947328 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012331995987963892, + "loss": 2.5816, + "theoretical_loss": 3.36905999046649, + "tokens_seen": 2495012864 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001233099297893681, + "loss": 2.6727, + "theoretical_loss": 3.3690529288814424, + "tokens_seen": 2495078400 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012329989969909731, + "loss": 2.4776, + "theoretical_loss": 3.3690458675338055, + "tokens_seen": 2495143936 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001232898696088265, + "loss": 2.2655, + "theoretical_loss": 3.369038806423566, + "tokens_seen": 2495209472 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2769750, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5581414699554443, + "objective/train/theoretical_loss": 3.369031745550708, + "objective/train/tokens_used": 2515735008, + "theoretical_loss": 3.369031745550708, + "tokens_seen": 2495275008 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012327983951855568, + "loss": 2.5469, + "theoretical_loss": 3.369031745550708, + "tokens_seen": 2495275008 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012326980942828486, + "loss": 2.5712, + "theoretical_loss": 3.369024684915219, + "tokens_seen": 2495340544 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012325977933801404, + "loss": 2.5424, + "theoretical_loss": 3.3690176245170833, + "tokens_seen": 2495406080 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012324974924774325, + "loss": 2.5555, + "theoretical_loss": 3.3690105643562878, + "tokens_seen": 2495471616 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012323971915747243, + "loss": 2.5675, + "theoretical_loss": 3.3690035044328175, + "tokens_seen": 2495537152 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001232296890672016, + "loss": 2.5505, + "theoretical_loss": 3.368996444746659, + "tokens_seen": 2495602688 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001232196589769308, + "loss": 2.6805, + "theoretical_loss": 3.3689893852977977, + "tokens_seen": 2495668224 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012320962888666, + "loss": 2.8068, + "theoretical_loss": 3.3689823260862193, + "tokens_seen": 2495733760 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012319959879638918, + "loss": 2.6454, + "theoretical_loss": 3.3689752671119093, + "tokens_seen": 2495799296 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012318956870611836, + "loss": 2.5255, + "theoretical_loss": 3.368968208374854, + "tokens_seen": 2495864832 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012317953861584754, + "loss": 2.4917, + "theoretical_loss": 3.368961149875039, + "tokens_seen": 2495930368 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012316950852557672, + "loss": 2.5423, + "theoretical_loss": 3.3689540916124505, + "tokens_seen": 2495995904 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012315947843530593, + "loss": 2.6087, + "theoretical_loss": 3.368947033587074, + "tokens_seen": 2496061440 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001231494483450351, + "loss": 2.5026, + "theoretical_loss": 3.3689399757988947, + "tokens_seen": 2496126976 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001231394182547643, + "loss": 2.4806, + "theoretical_loss": 3.368932918247899, + "tokens_seen": 2496192512 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012312938816449347, + "loss": 2.5719, + "theoretical_loss": 3.368925860934073, + "tokens_seen": 2496258048 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012311935807422268, + "loss": 2.4494, + "theoretical_loss": 3.368918803857402, + "tokens_seen": 2496323584 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012310932798395186, + "loss": 2.3868, + "theoretical_loss": 3.368911747017872, + "tokens_seen": 2496389120 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012309929789368104, + "loss": 2.7103, + "theoretical_loss": 3.3689046904154685, + "tokens_seen": 2496454656 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012308926780341023, + "loss": 2.7481, + "theoretical_loss": 3.3688976340501777, + "tokens_seen": 2496520192 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001230792377131394, + "loss": 2.6109, + "theoretical_loss": 3.3688905779219853, + "tokens_seen": 2496585728 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012306920762286862, + "loss": 2.582, + "theoretical_loss": 3.3688835220308775, + "tokens_seen": 2496651264 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001230591775325978, + "loss": 2.7172, + "theoretical_loss": 3.368876466376839, + "tokens_seen": 2496716800 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012304914744232698, + "loss": 2.2978, + "theoretical_loss": 3.368869410959857, + "tokens_seen": 2496782336 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012303911735205616, + "loss": 2.5246, + "theoretical_loss": 3.368862355779916, + "tokens_seen": 2496847872 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2771180, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4243853092193604, + "objective/train/theoretical_loss": 3.368855300837003, + "objective/train/tokens_used": 2517373408, + "theoretical_loss": 3.368855300837003, + "tokens_seen": 2496913408 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012302908726178537, + "loss": 2.6001, + "theoretical_loss": 3.368855300837003, + "tokens_seen": 2496913408 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012301905717151455, + "loss": 2.5724, + "theoretical_loss": 3.368848246131103, + "tokens_seen": 2496978944 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012300902708124373, + "loss": 2.5443, + "theoretical_loss": 3.3688411916622014, + "tokens_seen": 2497044480 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001229989969909729, + "loss": 2.4762, + "theoretical_loss": 3.3688341374302855, + "tokens_seen": 2497110016 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001229889669007021, + "loss": 2.7106, + "theoretical_loss": 3.36882708343534, + "tokens_seen": 2497175552 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001229789368104313, + "loss": 2.4745, + "theoretical_loss": 3.368820029677351, + "tokens_seen": 2497241088 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012296890672016048, + "loss": 2.636, + "theoretical_loss": 3.368812976156305, + "tokens_seen": 2497306624 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012295887662988966, + "loss": 2.6845, + "theoretical_loss": 3.3688059228721863, + "tokens_seen": 2497372160 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012294884653961884, + "loss": 2.6191, + "theoretical_loss": 3.368798869824982, + "tokens_seen": 2497437696 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012293881644934805, + "loss": 2.4132, + "theoretical_loss": 3.3687918170146776, + "tokens_seen": 2497503232 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012292878635907723, + "loss": 2.6856, + "theoretical_loss": 3.3687847644412585, + "tokens_seen": 2497568768 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012291875626880644, + "loss": 2.3017, + "theoretical_loss": 3.368777712104711, + "tokens_seen": 2497634304 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012290872617853562, + "loss": 2.5059, + "theoretical_loss": 3.368770660005021, + "tokens_seen": 2497699840 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001228986960882648, + "loss": 2.4301, + "theoretical_loss": 3.3687636081421743, + "tokens_seen": 2497765376 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012288866599799398, + "loss": 2.6381, + "theoretical_loss": 3.368756556516156, + "tokens_seen": 2497830912 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012287863590772316, + "loss": 2.5792, + "theoretical_loss": 3.368749505126953, + "tokens_seen": 2497896448 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012286860581745237, + "loss": 2.4099, + "theoretical_loss": 3.36874245397455, + "tokens_seen": 2497961984 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012285857572718155, + "loss": 2.3122, + "theoretical_loss": 3.3687354030589343, + "tokens_seen": 2498027520 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012284854563691074, + "loss": 2.3708, + "theoretical_loss": 3.3687283523800904, + "tokens_seen": 2498093056 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012283851554663992, + "loss": 2.5387, + "theoretical_loss": 3.3687213019380047, + "tokens_seen": 2498158592 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012282848545636912, + "loss": 2.5043, + "theoretical_loss": 3.368714251732663, + "tokens_seen": 2498224128 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001228184553660983, + "loss": 2.5234, + "theoretical_loss": 3.3687072017640514, + "tokens_seen": 2498289664 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001228084252758275, + "loss": 2.5409, + "theoretical_loss": 3.3687001520321553, + "tokens_seen": 2498355200 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012279839518555667, + "loss": 2.3472, + "theoretical_loss": 3.3686931025369606, + "tokens_seen": 2498420736 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012278836509528585, + "loss": 2.3348, + "theoretical_loss": 3.368686053278453, + "tokens_seen": 2498486272 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2772017, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5780913829803467, + "objective/train/theoretical_loss": 3.368679004256619, + "objective/train/tokens_used": 2519011808, + "theoretical_loss": 3.368679004256619, + "tokens_seen": 2498551808 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012277833500501506, + "loss": 2.6282, + "theoretical_loss": 3.368679004256619, + "tokens_seen": 2498551808 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012276830491474424, + "loss": 2.5755, + "theoretical_loss": 3.3686719554714437, + "tokens_seen": 2498617344 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012275827482447342, + "loss": 2.7748, + "theoretical_loss": 3.3686649069229135, + "tokens_seen": 2498682880 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001227482447342026, + "loss": 2.3707, + "theoretical_loss": 3.368657858611014, + "tokens_seen": 2498748416 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001227382146439318, + "loss": 2.6501, + "theoretical_loss": 3.3686508105357316, + "tokens_seen": 2498813952 + }, + { + "epoch": 8.04, + "learning_rate": 0.000122728184553661, + "loss": 2.4239, + "theoretical_loss": 3.368643762697051, + "tokens_seen": 2498879488 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012271815446339017, + "loss": 2.3116, + "theoretical_loss": 3.3686367150949588, + "tokens_seen": 2498945024 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012270812437311935, + "loss": 2.5013, + "theoretical_loss": 3.368629667729441, + "tokens_seen": 2499010560 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012269809428284853, + "loss": 2.562, + "theoretical_loss": 3.3686226206004832, + "tokens_seen": 2499076096 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012268806419257774, + "loss": 2.4684, + "theoretical_loss": 3.368615573708071, + "tokens_seen": 2499141632 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012267803410230692, + "loss": 2.4172, + "theoretical_loss": 3.3686085270521904, + "tokens_seen": 2499207168 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001226680040120361, + "loss": 2.7773, + "theoretical_loss": 3.3686014806328277, + "tokens_seen": 2499272704 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012265797392176528, + "loss": 2.8289, + "theoretical_loss": 3.3685944344499683, + "tokens_seen": 2499338240 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001226479438314945, + "loss": 2.4575, + "theoretical_loss": 3.3685873885035984, + "tokens_seen": 2499403776 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012263791374122367, + "loss": 2.5397, + "theoretical_loss": 3.3685803427937033, + "tokens_seen": 2499469312 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012262788365095286, + "loss": 2.8053, + "theoretical_loss": 3.3685732973202693, + "tokens_seen": 2499534848 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012261785356068204, + "loss": 2.5897, + "theoretical_loss": 3.3685662520832826, + "tokens_seen": 2499600384 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012260782347041122, + "loss": 2.6975, + "theoretical_loss": 3.368559207082728, + "tokens_seen": 2499665920 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012259779338014043, + "loss": 2.4157, + "theoretical_loss": 3.3685521623185926, + "tokens_seen": 2499731456 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001225877632898696, + "loss": 2.5631, + "theoretical_loss": 3.3685451177908616, + "tokens_seen": 2499796992 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001225777331995988, + "loss": 2.4467, + "theoretical_loss": 3.3685380734995207, + "tokens_seen": 2499862528 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012256770310932797, + "loss": 2.5137, + "theoretical_loss": 3.368531029444556, + "tokens_seen": 2499928064 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012255767301905718, + "loss": 2.5452, + "theoretical_loss": 3.368523985625954, + "tokens_seen": 2499993600 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012254764292878636, + "loss": 2.7721, + "theoretical_loss": 3.368516942043699, + "tokens_seen": 2500059136 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012253761283851557, + "loss": 2.3719, + "theoretical_loss": 3.3685098986977784, + "tokens_seen": 2500124672 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2773408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.359963893890381, + "objective/train/theoretical_loss": 3.3685028555881775, + "objective/train/tokens_used": 2520650208, + "theoretical_loss": 3.3685028555881775, + "tokens_seen": 2500190208 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012252758274824475, + "loss": 2.5714, + "theoretical_loss": 3.3685028555881775, + "tokens_seen": 2500190208 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012251755265797393, + "loss": 2.4174, + "theoretical_loss": 3.3684958127148823, + "tokens_seen": 2500255744 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001225075225677031, + "loss": 2.6667, + "theoretical_loss": 3.3684887700778785, + "tokens_seen": 2500321280 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001224974924774323, + "loss": 2.3413, + "theoretical_loss": 3.368481727677152, + "tokens_seen": 2500386816 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001224874623871615, + "loss": 2.4948, + "theoretical_loss": 3.368474685512689, + "tokens_seen": 2500452352 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012247743229689068, + "loss": 2.6107, + "theoretical_loss": 3.3684676435844745, + "tokens_seen": 2500517888 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012246740220661986, + "loss": 2.5354, + "theoretical_loss": 3.3684606018924956, + "tokens_seen": 2500583424 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012245737211634904, + "loss": 2.68, + "theoretical_loss": 3.368453560436737, + "tokens_seen": 2500648960 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012244734202607825, + "loss": 2.5428, + "theoretical_loss": 3.3684465192171857, + "tokens_seen": 2500714496 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012243731193580743, + "loss": 2.5138, + "theoretical_loss": 3.3684394782338267, + "tokens_seen": 2500780032 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001224272818455366, + "loss": 2.6256, + "theoretical_loss": 3.3684324374866463, + "tokens_seen": 2500845568 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001224172517552658, + "loss": 2.6049, + "theoretical_loss": 3.3684253969756304, + "tokens_seen": 2500911104 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012240722166499498, + "loss": 2.713, + "theoretical_loss": 3.368418356700765, + "tokens_seen": 2500976640 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012239719157472418, + "loss": 2.4962, + "theoretical_loss": 3.3684113166620357, + "tokens_seen": 2501042176 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012238716148445336, + "loss": 2.3277, + "theoretical_loss": 3.3684042768594287, + "tokens_seen": 2501107712 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012237713139418255, + "loss": 2.6448, + "theoretical_loss": 3.368397237292929, + "tokens_seen": 2501173248 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012236710130391173, + "loss": 2.558, + "theoretical_loss": 3.368390197962524, + "tokens_seen": 2501238784 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012235707121364094, + "loss": 2.4658, + "theoretical_loss": 3.3683831588681983, + "tokens_seen": 2501304320 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012234704112337012, + "loss": 2.4989, + "theoretical_loss": 3.3683761200099385, + "tokens_seen": 2501369856 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001223370110330993, + "loss": 2.3863, + "theoretical_loss": 3.3683690813877303, + "tokens_seen": 2501435392 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012232698094282848, + "loss": 2.4472, + "theoretical_loss": 3.36836204300156, + "tokens_seen": 2501500928 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001223169508525577, + "loss": 2.3422, + "theoretical_loss": 3.3683550048514124, + "tokens_seen": 2501566464 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012230692076228687, + "loss": 2.5729, + "theoretical_loss": 3.3683479669372742, + "tokens_seen": 2501632000 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012229689067201605, + "loss": 2.5876, + "theoretical_loss": 3.3683409292591313, + "tokens_seen": 2501697536 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012228686058174523, + "loss": 2.4765, + "theoretical_loss": 3.3683338918169694, + "tokens_seen": 2501763072 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2774294, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.394186496734619, + "objective/train/theoretical_loss": 3.3683268546107747, + "objective/train/tokens_used": 2522288608, + "theoretical_loss": 3.3683268546107747, + "tokens_seen": 2501828608 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001222768304914744, + "loss": 2.4527, + "theoretical_loss": 3.3683268546107747, + "tokens_seen": 2501828608 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012226680040120362, + "loss": 2.3234, + "theoretical_loss": 3.368319817640533, + "tokens_seen": 2501894144 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001222567703109328, + "loss": 2.6056, + "theoretical_loss": 3.36831278090623, + "tokens_seen": 2501959680 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012224674022066198, + "loss": 2.293, + "theoretical_loss": 3.3683057444078517, + "tokens_seen": 2502025216 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012223671013039116, + "loss": 2.3615, + "theoretical_loss": 3.368298708145384, + "tokens_seen": 2502090752 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012222668004012037, + "loss": 2.6796, + "theoretical_loss": 3.3682916721188128, + "tokens_seen": 2502156288 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012221664994984955, + "loss": 2.4963, + "theoretical_loss": 3.368284636328124, + "tokens_seen": 2502221824 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012220661985957873, + "loss": 2.6199, + "theoretical_loss": 3.368277600773304, + "tokens_seen": 2502287360 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012219658976930791, + "loss": 2.4625, + "theoretical_loss": 3.368270565454338, + "tokens_seen": 2502352896 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001221865596790371, + "loss": 2.6314, + "theoretical_loss": 3.368263530371212, + "tokens_seen": 2502418432 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001221765295887663, + "loss": 2.5321, + "theoretical_loss": 3.3682564955239123, + "tokens_seen": 2502483968 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012216649949849549, + "loss": 2.4142, + "theoretical_loss": 3.368249460912425, + "tokens_seen": 2502549504 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001221564694082247, + "loss": 2.4016, + "theoretical_loss": 3.3682424265367352, + "tokens_seen": 2502615040 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012214643931795387, + "loss": 2.5407, + "theoretical_loss": 3.3682353923968296, + "tokens_seen": 2502680576 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012213640922768306, + "loss": 2.5688, + "theoretical_loss": 3.3682283584926935, + "tokens_seen": 2502746112 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012212637913741224, + "loss": 2.4105, + "theoretical_loss": 3.3682213248243134, + "tokens_seen": 2502811648 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012211634904714142, + "loss": 2.5026, + "theoretical_loss": 3.368214291391675, + "tokens_seen": 2502877184 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012210631895687063, + "loss": 2.477, + "theoretical_loss": 3.368207258194764, + "tokens_seen": 2502942720 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001220962888665998, + "loss": 2.5865, + "theoretical_loss": 3.3682002252335668, + "tokens_seen": 2503008256 + }, + { + "epoch": 8.04, + "learning_rate": 0.000122086258776329, + "loss": 2.5382, + "theoretical_loss": 3.368193192508069, + "tokens_seen": 2503073792 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012207622868605817, + "loss": 2.5999, + "theoretical_loss": 3.368186160018256, + "tokens_seen": 2503139328 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012206619859578736, + "loss": 2.4539, + "theoretical_loss": 3.368179127764115, + "tokens_seen": 2503204864 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012205616850551656, + "loss": 2.6739, + "theoretical_loss": 3.368172095745631, + "tokens_seen": 2503270400 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012204613841524574, + "loss": 2.6503, + "theoretical_loss": 3.36816506396279, + "tokens_seen": 2503335936 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012203610832497493, + "loss": 2.6211, + "theoretical_loss": 3.368158032415578, + "tokens_seen": 2503401472 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2775001, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.617392063140869, + "objective/train/theoretical_loss": 3.3681510011039815, + "objective/train/tokens_used": 2523927008, + "theoretical_loss": 3.3681510011039815, + "tokens_seen": 2503467008 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012202607823470412, + "loss": 2.5324, + "theoretical_loss": 3.3681510011039815, + "tokens_seen": 2503467008 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001220160481444333, + "loss": 2.5588, + "theoretical_loss": 3.368143970027986, + "tokens_seen": 2503532544 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012200601805416249, + "loss": 2.3456, + "theoretical_loss": 3.368136939187577, + "tokens_seen": 2503598080 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012199598796389167, + "loss": 2.6758, + "theoretical_loss": 3.368129908582741, + "tokens_seen": 2503663616 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012198595787362087, + "loss": 2.5581, + "theoretical_loss": 3.368122878213464, + "tokens_seen": 2503729152 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012197592778335005, + "loss": 2.4993, + "theoretical_loss": 3.3681158480797313, + "tokens_seen": 2503794688 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012196589769307924, + "loss": 2.5713, + "theoretical_loss": 3.36810881818153, + "tokens_seen": 2503860224 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012195586760280842, + "loss": 2.4834, + "theoretical_loss": 3.3681017885188447, + "tokens_seen": 2503925760 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012194583751253762, + "loss": 2.5258, + "theoretical_loss": 3.368094759091662, + "tokens_seen": 2503991296 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001219358074222668, + "loss": 2.5565, + "theoretical_loss": 3.368087729899968, + "tokens_seen": 2504056832 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012192577733199598, + "loss": 2.5682, + "theoretical_loss": 3.3680807009437483, + "tokens_seen": 2504122368 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012191574724172518, + "loss": 2.4483, + "theoretical_loss": 3.3680736722229896, + "tokens_seen": 2504187904 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012190571715145436, + "loss": 2.5033, + "theoretical_loss": 3.3680666437376767, + "tokens_seen": 2504253440 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012189568706118355, + "loss": 2.5079, + "theoretical_loss": 3.3680596154877964, + "tokens_seen": 2504318976 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012188565697091273, + "loss": 2.4342, + "theoretical_loss": 3.3680525874733345, + "tokens_seen": 2504384512 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012187562688064193, + "loss": 2.5102, + "theoretical_loss": 3.368045559694276, + "tokens_seen": 2504450048 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012186559679037111, + "loss": 2.5523, + "theoretical_loss": 3.3680385321506088, + "tokens_seen": 2504515584 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001218555667001003, + "loss": 2.3728, + "theoretical_loss": 3.368031504842317, + "tokens_seen": 2504581120 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012184553660982948, + "loss": 2.5036, + "theoretical_loss": 3.3680244777693877, + "tokens_seen": 2504646656 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012183550651955868, + "loss": 2.3508, + "theoretical_loss": 3.368017450931806, + "tokens_seen": 2504712192 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012182547642928786, + "loss": 2.6125, + "theoretical_loss": 3.368010424329559, + "tokens_seen": 2504777728 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012181544633901704, + "loss": 2.7202, + "theoretical_loss": 3.3680033979626316, + "tokens_seen": 2504843264 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012180541624874625, + "loss": 2.4945, + "theoretical_loss": 3.36799637183101, + "tokens_seen": 2504908800 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012179538615847543, + "loss": 2.4006, + "theoretical_loss": 3.3679893459346806, + "tokens_seen": 2504974336 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012178535606820463, + "loss": 2.5328, + "theoretical_loss": 3.3679823202736285, + "tokens_seen": 2505039872 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2776495, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5442686080932617, + "objective/train/theoretical_loss": 3.3679752948478408, + "objective/train/tokens_used": 2525565408, + "theoretical_loss": 3.3679752948478408, + "tokens_seen": 2505105408 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001217753259779338, + "loss": 2.4794, + "theoretical_loss": 3.3679752948478408, + "tokens_seen": 2505105408 + }, + { + "epoch": 8.04, + "learning_rate": 0.000121765295887663, + "loss": 2.4816, + "theoretical_loss": 3.367968269657303, + "tokens_seen": 2505170944 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012175526579739218, + "loss": 2.575, + "theoretical_loss": 3.3679612447020006, + "tokens_seen": 2505236480 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012174523570712138, + "loss": 2.4954, + "theoretical_loss": 3.3679542199819203, + "tokens_seen": 2505302016 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012173520561685056, + "loss": 2.5495, + "theoretical_loss": 3.3679471954970475, + "tokens_seen": 2505367552 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012172517552657974, + "loss": 2.5464, + "theoretical_loss": 3.367940171247368, + "tokens_seen": 2505433088 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012171514543630893, + "loss": 2.3931, + "theoretical_loss": 3.367933147232869, + "tokens_seen": 2505498624 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012170511534603811, + "loss": 2.3556, + "theoretical_loss": 3.367926123453535, + "tokens_seen": 2505564160 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012169508525576731, + "loss": 2.5697, + "theoretical_loss": 3.367919099909353, + "tokens_seen": 2505629696 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012168505516549649, + "loss": 2.7169, + "theoretical_loss": 3.3679120766003083, + "tokens_seen": 2505695232 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012167502507522569, + "loss": 2.5677, + "theoretical_loss": 3.3679050535263872, + "tokens_seen": 2505760768 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012166499498495487, + "loss": 2.5521, + "theoretical_loss": 3.367898030687576, + "tokens_seen": 2505826304 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012165496489468406, + "loss": 2.4507, + "theoretical_loss": 3.36789100808386, + "tokens_seen": 2505891840 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012164493480441324, + "loss": 2.4191, + "theoretical_loss": 3.3678839857152254, + "tokens_seen": 2505957376 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012163490471414244, + "loss": 2.5894, + "theoretical_loss": 3.3678769635816583, + "tokens_seen": 2506022912 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012162487462387162, + "loss": 2.229, + "theoretical_loss": 3.367869941683145, + "tokens_seen": 2506088448 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001216148445336008, + "loss": 2.3141, + "theoretical_loss": 3.367862920019671, + "tokens_seen": 2506153984 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012160481444333, + "loss": 2.6187, + "theoretical_loss": 3.3678558985912224, + "tokens_seen": 2506219520 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012159478435305917, + "loss": 2.627, + "theoretical_loss": 3.367848877397785, + "tokens_seen": 2506285056 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012158475426278837, + "loss": 2.3617, + "theoretical_loss": 3.3678418564393455, + "tokens_seen": 2506350592 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012157472417251755, + "loss": 2.4654, + "theoretical_loss": 3.367834835715889, + "tokens_seen": 2506416128 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012156469408224675, + "loss": 2.5547, + "theoretical_loss": 3.367827815227402, + "tokens_seen": 2506481664 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012155466399197593, + "loss": 2.8724, + "theoretical_loss": 3.3678207949738708, + "tokens_seen": 2506547200 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012154463390170512, + "loss": 2.3279, + "theoretical_loss": 3.367813774955281, + "tokens_seen": 2506612736 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001215346038114343, + "loss": 2.4266, + "theoretical_loss": 3.367806755171618, + "tokens_seen": 2506678272 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2777249, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2320642471313477, + "objective/train/theoretical_loss": 3.367799735622869, + "objective/train/tokens_used": 2527203808, + "theoretical_loss": 3.367799735622869, + "tokens_seen": 2506743808 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012152457372116348, + "loss": 2.3741, + "theoretical_loss": 3.367799735622869, + "tokens_seen": 2506743808 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012151454363089268, + "loss": 2.5376, + "theoretical_loss": 3.367792716309019, + "tokens_seen": 2506809344 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012150451354062186, + "loss": 2.5842, + "theoretical_loss": 3.3677856972300546, + "tokens_seen": 2506874880 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012149448345035105, + "loss": 2.6786, + "theoretical_loss": 3.367778678385961, + "tokens_seen": 2506940416 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012148445336008024, + "loss": 2.447, + "theoretical_loss": 3.3677716597767255, + "tokens_seen": 2507005952 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012147442326980943, + "loss": 2.5107, + "theoretical_loss": 3.3677646414023332, + "tokens_seen": 2507071488 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012146439317953861, + "loss": 2.3272, + "theoretical_loss": 3.36775762326277, + "tokens_seen": 2507137024 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001214543630892678, + "loss": 2.4005, + "theoretical_loss": 3.3677506053580224, + "tokens_seen": 2507202560 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012144433299899699, + "loss": 2.2372, + "theoretical_loss": 3.3677435876880764, + "tokens_seen": 2507268096 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012143430290872617, + "loss": 2.7187, + "theoretical_loss": 3.3677365702529176, + "tokens_seen": 2507333632 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012142427281845538, + "loss": 2.5041, + "theoretical_loss": 3.367729553052532, + "tokens_seen": 2507399168 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012141424272818456, + "loss": 2.5945, + "theoretical_loss": 3.367722536086906, + "tokens_seen": 2507464704 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012140421263791375, + "loss": 2.4748, + "theoretical_loss": 3.3677155193560253, + "tokens_seen": 2507530240 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012139418254764293, + "loss": 2.7265, + "theoretical_loss": 3.3677085028598763, + "tokens_seen": 2507595776 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012138415245737213, + "loss": 2.3636, + "theoretical_loss": 3.367701486598445, + "tokens_seen": 2507661312 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012137412236710131, + "loss": 2.153, + "theoretical_loss": 3.367694470571717, + "tokens_seen": 2507726848 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001213640922768305, + "loss": 2.492, + "theoretical_loss": 3.367687454779678, + "tokens_seen": 2507792384 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012135406218655968, + "loss": 2.4645, + "theoretical_loss": 3.367680439222315, + "tokens_seen": 2507857920 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012134403209628888, + "loss": 2.8657, + "theoretical_loss": 3.367673423899613, + "tokens_seen": 2507923456 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012133400200601806, + "loss": 2.7077, + "theoretical_loss": 3.367666408811559, + "tokens_seen": 2507988992 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012132397191574724, + "loss": 2.2458, + "theoretical_loss": 3.3676593939581387, + "tokens_seen": 2508054528 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012131394182547644, + "loss": 2.506, + "theoretical_loss": 3.367652379339338, + "tokens_seen": 2508120064 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012130391173520562, + "loss": 2.3489, + "theoretical_loss": 3.3676453649551426, + "tokens_seen": 2508185600 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012129388164493481, + "loss": 2.383, + "theoretical_loss": 3.3676383508055387, + "tokens_seen": 2508251136 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012128385155466399, + "loss": 2.5217, + "theoretical_loss": 3.367631336890513, + "tokens_seen": 2508316672 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2778530, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4533112049102783, + "objective/train/theoretical_loss": 3.3676243232100505, + "objective/train/tokens_used": 2528842208, + "theoretical_loss": 3.3676243232100505, + "tokens_seen": 2508382208 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012127382146439319, + "loss": 2.5887, + "theoretical_loss": 3.3676243232100505, + "tokens_seen": 2508382208 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012126379137412237, + "loss": 2.8113, + "theoretical_loss": 3.367617309764138, + "tokens_seen": 2508447744 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012125376128385156, + "loss": 2.2994, + "theoretical_loss": 3.367610296552761, + "tokens_seen": 2508513280 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012124373119358074, + "loss": 2.3837, + "theoretical_loss": 3.3676032835759058, + "tokens_seen": 2508578816 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012123370110330993, + "loss": 2.6126, + "theoretical_loss": 3.3675962708335585, + "tokens_seen": 2508644352 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012122367101303912, + "loss": 2.6551, + "theoretical_loss": 3.367589258325705, + "tokens_seen": 2508709888 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001212136409227683, + "loss": 2.3271, + "theoretical_loss": 3.367582246052331, + "tokens_seen": 2508775424 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001212036108324975, + "loss": 2.3541, + "theoretical_loss": 3.3675752340134237, + "tokens_seen": 2508840960 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012119358074222668, + "loss": 2.4333, + "theoretical_loss": 3.367568222208968, + "tokens_seen": 2508906496 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012118355065195587, + "loss": 2.3483, + "theoretical_loss": 3.36756121063895, + "tokens_seen": 2508972032 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012117352056168505, + "loss": 2.7869, + "theoretical_loss": 3.367554199303356, + "tokens_seen": 2509037568 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012116349047141425, + "loss": 2.2978, + "theoretical_loss": 3.3675471882021726, + "tokens_seen": 2509103104 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012115346038114343, + "loss": 2.378, + "theoretical_loss": 3.3675401773353846, + "tokens_seen": 2509168640 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012114343029087261, + "loss": 2.1646, + "theoretical_loss": 3.3675331667029793, + "tokens_seen": 2509234176 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001211334002006018, + "loss": 2.5092, + "theoretical_loss": 3.367526156304942, + "tokens_seen": 2509299712 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012112337011033099, + "loss": 2.6178, + "theoretical_loss": 3.367519146141259, + "tokens_seen": 2509365248 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012111334002006018, + "loss": 2.4658, + "theoretical_loss": 3.367512136211916, + "tokens_seen": 2509430784 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012110330992978936, + "loss": 2.3137, + "theoretical_loss": 3.3675051265168996, + "tokens_seen": 2509496320 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012109327983951856, + "loss": 2.2604, + "theoretical_loss": 3.3674981170561953, + "tokens_seen": 2509561856 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012108324974924774, + "loss": 2.5159, + "theoretical_loss": 3.3674911078297898, + "tokens_seen": 2509627392 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012107321965897693, + "loss": 2.4862, + "theoretical_loss": 3.367484098837668, + "tokens_seen": 2509692928 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012106318956870611, + "loss": 2.5013, + "theoretical_loss": 3.3674770900798174, + "tokens_seen": 2509758464 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012105315947843532, + "loss": 2.628, + "theoretical_loss": 3.367470081556223, + "tokens_seen": 2509824000 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001210431293881645, + "loss": 2.6035, + "theoretical_loss": 3.3674630732668716, + "tokens_seen": 2509889536 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012103309929789368, + "loss": 2.5373, + "theoretical_loss": 3.3674560652117487, + "tokens_seen": 2509955072 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2779362, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.675309181213379, + "objective/train/theoretical_loss": 3.3674490573908407, + "objective/train/tokens_used": 2530480608, + "theoretical_loss": 3.3674490573908407, + "tokens_seen": 2510020608 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012102306920762288, + "loss": 2.4756, + "theoretical_loss": 3.3674490573908407, + "tokens_seen": 2510020608 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012101303911735206, + "loss": 2.6496, + "theoretical_loss": 3.367442049804133, + "tokens_seen": 2510086144 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012100300902708125, + "loss": 2.4696, + "theoretical_loss": 3.3674350424516124, + "tokens_seen": 2510151680 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012099297893681044, + "loss": 2.5308, + "theoretical_loss": 3.367428035333265, + "tokens_seen": 2510217216 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012098294884653963, + "loss": 2.6147, + "theoretical_loss": 3.367421028449076, + "tokens_seen": 2510282752 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012097291875626881, + "loss": 2.4331, + "theoretical_loss": 3.3674140217990325, + "tokens_seen": 2510348288 + }, + { + "epoch": 8.04, + "learning_rate": 0.000120962888665998, + "loss": 2.4515, + "theoretical_loss": 3.3674070153831197, + "tokens_seen": 2510413824 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012095285857572719, + "loss": 2.736, + "theoretical_loss": 3.3674000092013245, + "tokens_seen": 2510479360 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012094282848545637, + "loss": 2.6068, + "theoretical_loss": 3.3673930032536323, + "tokens_seen": 2510544896 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012093279839518556, + "loss": 2.6762, + "theoretical_loss": 3.3673859975400293, + "tokens_seen": 2510610432 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012092276830491474, + "loss": 2.4913, + "theoretical_loss": 3.3673789920605017, + "tokens_seen": 2510675968 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012091273821464394, + "loss": 2.4168, + "theoretical_loss": 3.367371986815036, + "tokens_seen": 2510741504 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012090270812437312, + "loss": 2.697, + "theoretical_loss": 3.3673649818036173, + "tokens_seen": 2510807040 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012089267803410231, + "loss": 2.4699, + "theoretical_loss": 3.367357977026232, + "tokens_seen": 2510872576 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001208826479438315, + "loss": 2.5697, + "theoretical_loss": 3.3673509724828667, + "tokens_seen": 2510938112 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012087261785356069, + "loss": 2.4718, + "theoretical_loss": 3.367343968173507, + "tokens_seen": 2511003648 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012086258776328987, + "loss": 2.6467, + "theoretical_loss": 3.367336964098139, + "tokens_seen": 2511069184 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012085255767301907, + "loss": 2.8004, + "theoretical_loss": 3.367329960256749, + "tokens_seen": 2511134720 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012084252758274825, + "loss": 2.5292, + "theoretical_loss": 3.367322956649323, + "tokens_seen": 2511200256 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012083249749247743, + "loss": 2.4841, + "theoretical_loss": 3.3673159532758468, + "tokens_seen": 2511265792 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012082246740220662, + "loss": 2.3733, + "theoretical_loss": 3.367308950136307, + "tokens_seen": 2511331328 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001208124373119358, + "loss": 2.4659, + "theoretical_loss": 3.367301947230689, + "tokens_seen": 2511396864 + }, + { + "epoch": 8.04, + "learning_rate": 0.000120802407221665, + "loss": 2.5118, + "theoretical_loss": 3.3672949445589797, + "tokens_seen": 2511462400 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012079237713139418, + "loss": 2.4519, + "theoretical_loss": 3.3672879421211643, + "tokens_seen": 2511527936 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012078234704112337, + "loss": 2.3955, + "theoretical_loss": 3.3672809399172294, + "tokens_seen": 2511593472 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2780593, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4508261680603027, + "objective/train/theoretical_loss": 3.3672739379471617, + "objective/train/tokens_used": 2532119008, + "theoretical_loss": 3.3672739379471617, + "tokens_seen": 2511659008 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012077231695085256, + "loss": 2.5624, + "theoretical_loss": 3.3672739379471617, + "tokens_seen": 2511659008 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012076228686058175, + "loss": 2.3601, + "theoretical_loss": 3.367266936210946, + "tokens_seen": 2511724544 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012075225677031093, + "loss": 2.434, + "theoretical_loss": 3.367259934708569, + "tokens_seen": 2511790080 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012074222668004011, + "loss": 2.618, + "theoretical_loss": 3.3672529334400165, + "tokens_seen": 2511855616 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012073219658976931, + "loss": 2.5338, + "theoretical_loss": 3.3672459324052757, + "tokens_seen": 2511921152 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012072216649949849, + "loss": 2.4676, + "theoretical_loss": 3.3672389316043314, + "tokens_seen": 2511986688 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012071213640922768, + "loss": 2.5064, + "theoretical_loss": 3.3672319310371703, + "tokens_seen": 2512052224 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012070210631895686, + "loss": 2.6196, + "theoretical_loss": 3.3672249307037783, + "tokens_seen": 2512117760 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012069207622868606, + "loss": 2.4417, + "theoretical_loss": 3.3672179306041414, + "tokens_seen": 2512183296 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012068204613841525, + "loss": 2.6023, + "theoretical_loss": 3.3672109307382456, + "tokens_seen": 2512248832 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012067201604814445, + "loss": 2.6613, + "theoretical_loss": 3.367203931106078, + "tokens_seen": 2512314368 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012066198595787363, + "loss": 2.6703, + "theoretical_loss": 3.3671969317076234, + "tokens_seen": 2512379904 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012065195586760281, + "loss": 2.4019, + "theoretical_loss": 3.3671899325428685, + "tokens_seen": 2512445440 + }, + { + "epoch": 8.04, + "learning_rate": 0.000120641925777332, + "loss": 2.6799, + "theoretical_loss": 3.3671829336117995, + "tokens_seen": 2512510976 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012063189568706119, + "loss": 2.4718, + "theoretical_loss": 3.3671759349144024, + "tokens_seen": 2512576512 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012062186559679038, + "loss": 2.3858, + "theoretical_loss": 3.367168936450663, + "tokens_seen": 2512642048 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012061183550651956, + "loss": 2.5551, + "theoretical_loss": 3.3671619382205678, + "tokens_seen": 2512707584 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012060180541624876, + "loss": 2.5654, + "theoretical_loss": 3.367154940224103, + "tokens_seen": 2512773120 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012059177532597794, + "loss": 2.5809, + "theoretical_loss": 3.367147942461254, + "tokens_seen": 2512838656 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012058174523570713, + "loss": 2.6627, + "theoretical_loss": 3.3671409449320078, + "tokens_seen": 2512904192 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012057171514543631, + "loss": 2.611, + "theoretical_loss": 3.3671339476363498, + "tokens_seen": 2512969728 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012056168505516551, + "loss": 2.5915, + "theoretical_loss": 3.3671269505742667, + "tokens_seen": 2513035264 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012055165496489469, + "loss": 2.6367, + "theoretical_loss": 3.3671199537457444, + "tokens_seen": 2513100800 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012054162487462387, + "loss": 2.5127, + "theoretical_loss": 3.3671129571507685, + "tokens_seen": 2513166336 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012053159478435306, + "loss": 2.6456, + "theoretical_loss": 3.3671059607893254, + "tokens_seen": 2513231872 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2781391, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5402140617370605, + "objective/train/theoretical_loss": 3.367098964661402, + "objective/train/tokens_used": 2533757408, + "theoretical_loss": 3.367098964661402, + "tokens_seen": 2513297408 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012052156469408225, + "loss": 2.5856, + "theoretical_loss": 3.367098964661402, + "tokens_seen": 2513297408 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012051153460381144, + "loss": 2.6205, + "theoretical_loss": 3.3670919687669834, + "tokens_seen": 2513362944 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012050150451354062, + "loss": 2.3645, + "theoretical_loss": 3.367084973106056, + "tokens_seen": 2513428480 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012049147442326982, + "loss": 2.505, + "theoretical_loss": 3.367077977678606, + "tokens_seen": 2513494016 + }, + { + "epoch": 8.04, + "learning_rate": 0.000120481444332999, + "loss": 2.6408, + "theoretical_loss": 3.36707098248462, + "tokens_seen": 2513559552 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012047141424272819, + "loss": 2.6165, + "theoretical_loss": 3.3670639875240833, + "tokens_seen": 2513625088 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012046138415245737, + "loss": 2.3287, + "theoretical_loss": 3.3670569927969822, + "tokens_seen": 2513690624 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012045135406218655, + "loss": 2.4676, + "theoretical_loss": 3.3670499983033033, + "tokens_seen": 2513756160 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012044132397191575, + "loss": 2.5876, + "theoretical_loss": 3.3670430040430324, + "tokens_seen": 2513821696 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012043129388164493, + "loss": 2.2269, + "theoretical_loss": 3.3670360100161556, + "tokens_seen": 2513887232 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012042126379137412, + "loss": 2.621, + "theoretical_loss": 3.3670290162226593, + "tokens_seen": 2513952768 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001204112337011033, + "loss": 2.2653, + "theoretical_loss": 3.367022022662529, + "tokens_seen": 2514018304 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001204012036108325, + "loss": 2.6281, + "theoretical_loss": 3.3670150293357515, + "tokens_seen": 2514083840 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012039117352056168, + "loss": 2.2177, + "theoretical_loss": 3.3670080362423125, + "tokens_seen": 2514149376 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012038114343029088, + "loss": 2.4926, + "theoretical_loss": 3.3670010433821984, + "tokens_seen": 2514214912 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012037111334002006, + "loss": 2.5253, + "theoretical_loss": 3.3669940507553955, + "tokens_seen": 2514280448 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012036108324974924, + "loss": 2.4635, + "theoretical_loss": 3.3669870583618895, + "tokens_seen": 2514345984 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012035105315947843, + "loss": 2.5205, + "theoretical_loss": 3.3669800662016667, + "tokens_seen": 2514411520 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012034102306920761, + "loss": 2.5302, + "theoretical_loss": 3.366973074274713, + "tokens_seen": 2514477056 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012033099297893681, + "loss": 2.2958, + "theoretical_loss": 3.366966082581015, + "tokens_seen": 2514542592 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012032096288866599, + "loss": 2.641, + "theoretical_loss": 3.3669590911205587, + "tokens_seen": 2514608128 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012031093279839519, + "loss": 2.5361, + "theoretical_loss": 3.36695209989333, + "tokens_seen": 2514673664 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012030090270812438, + "loss": 2.5059, + "theoretical_loss": 3.366945108899315, + "tokens_seen": 2514739200 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012029087261785357, + "loss": 2.3558, + "theoretical_loss": 3.3669381181385005, + "tokens_seen": 2514804736 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012028084252758276, + "loss": 2.6407, + "theoretical_loss": 3.3669311276108718, + "tokens_seen": 2514870272 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2782784, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.807842254638672, + "objective/train/theoretical_loss": 3.366924137316416, + "objective/train/tokens_used": 2535395808, + "theoretical_loss": 3.366924137316416, + "tokens_seen": 2514935808 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012027081243731195, + "loss": 2.6249, + "theoretical_loss": 3.366924137316416, + "tokens_seen": 2514935808 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012026078234704113, + "loss": 2.5499, + "theoretical_loss": 3.3669171472551183, + "tokens_seen": 2515001344 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012025075225677031, + "loss": 2.6524, + "theoretical_loss": 3.366910157426965, + "tokens_seen": 2515066880 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012024072216649951, + "loss": 2.4621, + "theoretical_loss": 3.3669031678319428, + "tokens_seen": 2515132416 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012023069207622869, + "loss": 2.5951, + "theoretical_loss": 3.3668961784700375, + "tokens_seen": 2515197952 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012022066198595788, + "loss": 2.7199, + "theoretical_loss": 3.366889189341235, + "tokens_seen": 2515263488 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012021063189568706, + "loss": 2.5621, + "theoretical_loss": 3.366882200445522, + "tokens_seen": 2515329024 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012020060180541626, + "loss": 2.3549, + "theoretical_loss": 3.3668752117828844, + "tokens_seen": 2515394560 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012019057171514544, + "loss": 2.4757, + "theoretical_loss": 3.3668682233533085, + "tokens_seen": 2515460096 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012018054162487463, + "loss": 2.6375, + "theoretical_loss": 3.36686123515678, + "tokens_seen": 2515525632 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012017051153460382, + "loss": 2.4726, + "theoretical_loss": 3.3668542471932854, + "tokens_seen": 2515591168 + }, + { + "epoch": 8.04, + "learning_rate": 0.000120160481444333, + "loss": 2.4112, + "theoretical_loss": 3.366847259462811, + "tokens_seen": 2515656704 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012015045135406219, + "loss": 2.5382, + "theoretical_loss": 3.3668402719653425, + "tokens_seen": 2515722240 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012014042126379137, + "loss": 2.4543, + "theoretical_loss": 3.3668332847008666, + "tokens_seen": 2515787776 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012013039117352057, + "loss": 2.5247, + "theoretical_loss": 3.366826297669369, + "tokens_seen": 2515853312 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012012036108324975, + "loss": 2.4129, + "theoretical_loss": 3.366819310870836, + "tokens_seen": 2515918848 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012011033099297894, + "loss": 2.3292, + "theoretical_loss": 3.366812324305254, + "tokens_seen": 2515984384 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012010030090270812, + "loss": 2.4984, + "theoretical_loss": 3.3668053379726093, + "tokens_seen": 2516049920 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012009027081243732, + "loss": 2.4743, + "theoretical_loss": 3.3667983518728875, + "tokens_seen": 2516115456 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001200802407221665, + "loss": 2.3808, + "theoretical_loss": 3.3667913660060744, + "tokens_seen": 2516180992 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012007021063189568, + "loss": 2.3325, + "theoretical_loss": 3.3667843803721578, + "tokens_seen": 2516246528 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012006018054162488, + "loss": 2.3515, + "theoretical_loss": 3.3667773949711224, + "tokens_seen": 2516312064 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012005015045135406, + "loss": 2.5555, + "theoretical_loss": 3.3667704098029545, + "tokens_seen": 2516377600 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012004012036108325, + "loss": 2.5109, + "theoretical_loss": 3.3667634248676412, + "tokens_seen": 2516443136 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012003009027081243, + "loss": 2.5207, + "theoretical_loss": 3.366756440165168, + "tokens_seen": 2516508672 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2783420, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2718324661254883, + "objective/train/theoretical_loss": 3.366749455695521, + "objective/train/tokens_used": 2537034208, + "theoretical_loss": 3.366749455695521, + "tokens_seen": 2516574208 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012002006018054163, + "loss": 2.5927, + "theoretical_loss": 3.366749455695521, + "tokens_seen": 2516574208 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012001003009027081, + "loss": 2.5556, + "theoretical_loss": 3.366742471458686, + "tokens_seen": 2516639744 + }, + { + "epoch": 8.04, + "learning_rate": 0.00012, + "loss": 2.6662, + "theoretical_loss": 3.3667354874546502, + "tokens_seen": 2516705280 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011998996990972918, + "loss": 2.5696, + "theoretical_loss": 3.3667285036833996, + "tokens_seen": 2516770816 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011997993981945838, + "loss": 2.4513, + "theoretical_loss": 3.3667215201449197, + "tokens_seen": 2516836352 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011996990972918756, + "loss": 2.616, + "theoretical_loss": 3.366714536839197, + "tokens_seen": 2516901888 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011995987963891674, + "loss": 2.5703, + "theoretical_loss": 3.3667075537662177, + "tokens_seen": 2516967424 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011994984954864594, + "loss": 2.459, + "theoretical_loss": 3.3667005709259685, + "tokens_seen": 2517032960 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011993981945837512, + "loss": 2.3183, + "theoretical_loss": 3.3666935883184346, + "tokens_seen": 2517098496 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011992978936810433, + "loss": 2.6678, + "theoretical_loss": 3.366686605943603, + "tokens_seen": 2517164032 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001199197592778335, + "loss": 2.3842, + "theoretical_loss": 3.3666796238014594, + "tokens_seen": 2517229568 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001199097291875627, + "loss": 2.443, + "theoretical_loss": 3.36667264189199, + "tokens_seen": 2517295104 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011989969909729188, + "loss": 2.4005, + "theoretical_loss": 3.3666656602151814, + "tokens_seen": 2517360640 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011988966900702108, + "loss": 2.6695, + "theoretical_loss": 3.3666586787710195, + "tokens_seen": 2517426176 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011987963891675026, + "loss": 2.2994, + "theoretical_loss": 3.3666516975594907, + "tokens_seen": 2517491712 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011986960882647944, + "loss": 2.5166, + "theoretical_loss": 3.3666447165805806, + "tokens_seen": 2517557248 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011985957873620863, + "loss": 2.6477, + "theoretical_loss": 3.366637735834276, + "tokens_seen": 2517622784 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011984954864593781, + "loss": 2.6657, + "theoretical_loss": 3.366630755320563, + "tokens_seen": 2517688320 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011983951855566701, + "loss": 2.3821, + "theoretical_loss": 3.366623775039428, + "tokens_seen": 2517753856 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011982948846539619, + "loss": 2.3947, + "theoretical_loss": 3.3666167949908563, + "tokens_seen": 2517819392 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011981945837512539, + "loss": 2.5734, + "theoretical_loss": 3.366609815174835, + "tokens_seen": 2517884928 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011980942828485457, + "loss": 2.6907, + "theoretical_loss": 3.36660283559135, + "tokens_seen": 2517950464 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011979939819458376, + "loss": 2.622, + "theoretical_loss": 3.3665958562403873, + "tokens_seen": 2518016000 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011978936810431294, + "loss": 2.4423, + "theoretical_loss": 3.3665888771219334, + "tokens_seen": 2518081536 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011977933801404214, + "loss": 2.4509, + "theoretical_loss": 3.3665818982359745, + "tokens_seen": 2518147072 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2784766, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.27060866355896, + "objective/train/theoretical_loss": 3.3665749195824968, + "objective/train/tokens_used": 2538672608, + "theoretical_loss": 3.3665749195824968, + "tokens_seen": 2518212608 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011976930792377132, + "loss": 2.3147, + "theoretical_loss": 3.3665749195824968, + "tokens_seen": 2518212608 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001197592778335005, + "loss": 2.3537, + "theoretical_loss": 3.3665679411614864, + "tokens_seen": 2518278144 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001197492477432297, + "loss": 2.5189, + "theoretical_loss": 3.3665609629729296, + "tokens_seen": 2518343680 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011973921765295887, + "loss": 2.6325, + "theoretical_loss": 3.3665539850168122, + "tokens_seen": 2518409216 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011972918756268807, + "loss": 2.4569, + "theoretical_loss": 3.366547007293121, + "tokens_seen": 2518474752 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011971915747241725, + "loss": 2.846, + "theoretical_loss": 3.3665400298018415, + "tokens_seen": 2518540288 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011970912738214645, + "loss": 2.6632, + "theoretical_loss": 3.366533052542961, + "tokens_seen": 2518605824 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011969909729187563, + "loss": 2.4433, + "theoretical_loss": 3.3665260755164645, + "tokens_seen": 2518671360 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011968906720160482, + "loss": 2.5261, + "theoretical_loss": 3.3665190987223395, + "tokens_seen": 2518736896 + }, + { + "epoch": 8.04, + "learning_rate": 0.000119679037111334, + "loss": 2.5282, + "theoretical_loss": 3.3665121221605707, + "tokens_seen": 2518802432 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011966900702106318, + "loss": 2.5622, + "theoretical_loss": 3.366505145831146, + "tokens_seen": 2518867968 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011965897693079238, + "loss": 2.3269, + "theoretical_loss": 3.36649816973405, + "tokens_seen": 2518933504 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011964894684052156, + "loss": 2.4932, + "theoretical_loss": 3.36649119386927, + "tokens_seen": 2518999040 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011963891675025075, + "loss": 2.579, + "theoretical_loss": 3.366484218236792, + "tokens_seen": 2519064576 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011962888665997993, + "loss": 2.4302, + "theoretical_loss": 3.3664772428366017, + "tokens_seen": 2519130112 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011961885656970913, + "loss": 2.4521, + "theoretical_loss": 3.366470267668686, + "tokens_seen": 2519195648 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011960882647943831, + "loss": 2.4058, + "theoretical_loss": 3.3664632927330307, + "tokens_seen": 2519261184 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001195987963891675, + "loss": 2.5959, + "theoretical_loss": 3.3664563180296225, + "tokens_seen": 2519326720 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011958876629889669, + "loss": 2.343, + "theoretical_loss": 3.366449343558447, + "tokens_seen": 2519392256 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011957873620862587, + "loss": 2.4965, + "theoretical_loss": 3.3664423693194907, + "tokens_seen": 2519457792 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011956870611835506, + "loss": 2.3205, + "theoretical_loss": 3.36643539531274, + "tokens_seen": 2519523328 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011955867602808424, + "loss": 2.223, + "theoretical_loss": 3.366428421538181, + "tokens_seen": 2519588864 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011954864593781345, + "loss": 2.6076, + "theoretical_loss": 3.3664214479957995, + "tokens_seen": 2519654400 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011953861584754263, + "loss": 2.5641, + "theoretical_loss": 3.366414474685582, + "tokens_seen": 2519719936 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011952858575727183, + "loss": 2.4293, + "theoretical_loss": 3.3664075016075157, + "tokens_seen": 2519785472 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2785557, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.717986583709717, + "objective/train/theoretical_loss": 3.3664005287615857, + "objective/train/tokens_used": 2540311008, + "theoretical_loss": 3.3664005287615857, + "tokens_seen": 2519851008 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011951855566700101, + "loss": 2.4051, + "theoretical_loss": 3.3664005287615857, + "tokens_seen": 2519851008 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001195085255767302, + "loss": 2.2878, + "theoretical_loss": 3.3663935561477785, + "tokens_seen": 2519916544 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011949849548645938, + "loss": 2.1773, + "theoretical_loss": 3.3663865837660802, + "tokens_seen": 2519982080 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011948846539618858, + "loss": 2.4268, + "theoretical_loss": 3.3663796116164773, + "tokens_seen": 2520047616 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011947843530591776, + "loss": 2.4173, + "theoretical_loss": 3.366372639698956, + "tokens_seen": 2520113152 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011946840521564694, + "loss": 2.6692, + "theoretical_loss": 3.3663656680135023, + "tokens_seen": 2520178688 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011945837512537614, + "loss": 2.5376, + "theoretical_loss": 3.366358696560103, + "tokens_seen": 2520244224 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011944834503510532, + "loss": 2.4628, + "theoretical_loss": 3.3663517253387436, + "tokens_seen": 2520309760 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011943831494483451, + "loss": 2.4819, + "theoretical_loss": 3.366344754349411, + "tokens_seen": 2520375296 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011942828485456369, + "loss": 2.3287, + "theoretical_loss": 3.3663377835920913, + "tokens_seen": 2520440832 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011941825476429289, + "loss": 2.5452, + "theoretical_loss": 3.3663308130667704, + "tokens_seen": 2520506368 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011940822467402207, + "loss": 2.57, + "theoretical_loss": 3.3663238427734345, + "tokens_seen": 2520571904 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011939819458375126, + "loss": 2.4722, + "theoretical_loss": 3.3663168727120705, + "tokens_seen": 2520637440 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011938816449348044, + "loss": 2.4385, + "theoretical_loss": 3.366309902882664, + "tokens_seen": 2520702976 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011937813440320963, + "loss": 2.6954, + "theoretical_loss": 3.3663029332852017, + "tokens_seen": 2520768512 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011936810431293882, + "loss": 2.5742, + "theoretical_loss": 3.3662959639196695, + "tokens_seen": 2520834048 + }, + { + "epoch": 8.04, + "learning_rate": 0.000119358074222668, + "loss": 2.4555, + "theoretical_loss": 3.366288994786054, + "tokens_seen": 2520899584 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001193480441323972, + "loss": 2.7197, + "theoretical_loss": 3.366282025884341, + "tokens_seen": 2520965120 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011933801404212638, + "loss": 2.5458, + "theoretical_loss": 3.3662750572145175, + "tokens_seen": 2521030656 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011932798395185557, + "loss": 2.4165, + "theoretical_loss": 3.3662680887765686, + "tokens_seen": 2521096192 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011931795386158475, + "loss": 2.6129, + "theoretical_loss": 3.3662611205704813, + "tokens_seen": 2521161728 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011930792377131395, + "loss": 2.5819, + "theoretical_loss": 3.3662541525962424, + "tokens_seen": 2521227264 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011929789368104313, + "loss": 2.4375, + "theoretical_loss": 3.366247184853837, + "tokens_seen": 2521292800 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011928786359077231, + "loss": 2.5743, + "theoretical_loss": 3.366240217343252, + "tokens_seen": 2521358336 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001192778335005015, + "loss": 2.5297, + "theoretical_loss": 3.366233250064474, + "tokens_seen": 2521423872 + }, + { + "epoch": 8.04, + "objective/train/docs_used": 2787181, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.431502342224121, + "objective/train/theoretical_loss": 3.3662262830174883, + "objective/train/tokens_used": 2541949408, + "theoretical_loss": 3.3662262830174883, + "tokens_seen": 2521489408 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011926780341023069, + "loss": 2.3688, + "theoretical_loss": 3.3662262830174883, + "tokens_seen": 2521489408 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011925777331995988, + "loss": 2.4454, + "theoretical_loss": 3.3662193162022818, + "tokens_seen": 2521554944 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011924774322968906, + "loss": 2.4825, + "theoretical_loss": 3.366212349618841, + "tokens_seen": 2521620480 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011923771313941826, + "loss": 2.435, + "theoretical_loss": 3.3662053832671512, + "tokens_seen": 2521686016 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011922768304914744, + "loss": 2.4609, + "theoretical_loss": 3.3661984171472, + "tokens_seen": 2521751552 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011921765295887663, + "loss": 2.6179, + "theoretical_loss": 3.3661914512589726, + "tokens_seen": 2521817088 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011920762286860581, + "loss": 2.5579, + "theoretical_loss": 3.3661844856024556, + "tokens_seen": 2521882624 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011919759277833501, + "loss": 2.63, + "theoretical_loss": 3.3661775201776356, + "tokens_seen": 2521948160 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011918756268806419, + "loss": 2.6031, + "theoretical_loss": 3.3661705549844982, + "tokens_seen": 2522013696 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011917753259779338, + "loss": 2.1912, + "theoretical_loss": 3.3661635900230302, + "tokens_seen": 2522079232 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011916750250752258, + "loss": 2.5767, + "theoretical_loss": 3.366156625293218, + "tokens_seen": 2522144768 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011915747241725176, + "loss": 2.3753, + "theoretical_loss": 3.366149660795047, + "tokens_seen": 2522210304 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011914744232698095, + "loss": 2.6897, + "theoretical_loss": 3.3661426965285046, + "tokens_seen": 2522275840 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011913741223671014, + "loss": 2.4488, + "theoretical_loss": 3.3661357324935763, + "tokens_seen": 2522341376 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011912738214643933, + "loss": 2.5896, + "theoretical_loss": 3.366128768690249, + "tokens_seen": 2522406912 + }, + { + "epoch": 8.04, + "learning_rate": 0.00011911735205616851, + "loss": 2.4833, + "theoretical_loss": 3.3661218051185084, + "tokens_seen": 2522472448 + }, + { + "epoch": 8.04, + "learning_rate": 0.0001191073219658977, + "loss": 2.4819, + "theoretical_loss": 3.366114841778341, + "tokens_seen": 2522537984 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011909729187562689, + "loss": 2.429, + "theoretical_loss": 3.366107878669733, + "tokens_seen": 2522603520 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011908726178535607, + "loss": 2.4761, + "theoretical_loss": 3.3661009157926705, + "tokens_seen": 2522669056 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011907723169508526, + "loss": 2.7415, + "theoretical_loss": 3.3660939531471405, + "tokens_seen": 2522734592 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011906720160481444, + "loss": 2.5377, + "theoretical_loss": 3.3660869907331286, + "tokens_seen": 2522800128 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011905717151454364, + "loss": 2.6089, + "theoretical_loss": 3.3660800285506216, + "tokens_seen": 2522865664 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011904714142427282, + "loss": 2.4326, + "theoretical_loss": 3.3660730665996055, + "tokens_seen": 2522931200 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011903711133400201, + "loss": 2.4364, + "theoretical_loss": 3.3660661048800664, + "tokens_seen": 2522996736 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001190270812437312, + "loss": 2.4025, + "theoretical_loss": 3.3660591433919906, + "tokens_seen": 2523062272 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2787821, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2500672340393066, + "objective/train/theoretical_loss": 3.3660521821353653, + "objective/train/tokens_used": 2543587808, + "theoretical_loss": 3.3660521821353653, + "tokens_seen": 2523127808 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011901705115346039, + "loss": 2.4838, + "theoretical_loss": 3.3660521821353653, + "tokens_seen": 2523127808 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011900702106318957, + "loss": 2.384, + "theoretical_loss": 3.3660452211101752, + "tokens_seen": 2523193344 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011899699097291877, + "loss": 2.4798, + "theoretical_loss": 3.366038260316408, + "tokens_seen": 2523258880 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011898696088264795, + "loss": 2.679, + "theoretical_loss": 3.3660312997540496, + "tokens_seen": 2523324416 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011897693079237713, + "loss": 2.4099, + "theoretical_loss": 3.366024339423086, + "tokens_seen": 2523389952 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011896690070210632, + "loss": 2.6456, + "theoretical_loss": 3.3660173793235035, + "tokens_seen": 2523455488 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001189568706118355, + "loss": 2.6857, + "theoretical_loss": 3.366010419455289, + "tokens_seen": 2523521024 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001189468405215647, + "loss": 2.3848, + "theoretical_loss": 3.366003459818428, + "tokens_seen": 2523586560 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011893681043129388, + "loss": 2.7668, + "theoretical_loss": 3.365996500412907, + "tokens_seen": 2523652096 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011892678034102307, + "loss": 2.5184, + "theoretical_loss": 3.365989541238713, + "tokens_seen": 2523717632 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011891675025075226, + "loss": 2.5068, + "theoretical_loss": 3.3659825822958314, + "tokens_seen": 2523783168 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011890672016048145, + "loss": 2.6037, + "theoretical_loss": 3.365975623584249, + "tokens_seen": 2523848704 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011889669007021063, + "loss": 2.5101, + "theoretical_loss": 3.365968665103952, + "tokens_seen": 2523914240 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011888665997993981, + "loss": 2.3517, + "theoretical_loss": 3.365961706854927, + "tokens_seen": 2523979776 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011887662988966901, + "loss": 2.4806, + "theoretical_loss": 3.3659547488371593, + "tokens_seen": 2524045312 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011886659979939819, + "loss": 2.3369, + "theoretical_loss": 3.3659477910506364, + "tokens_seen": 2524110848 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011885656970912738, + "loss": 2.4105, + "theoretical_loss": 3.365940833495344, + "tokens_seen": 2524176384 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011884653961885656, + "loss": 2.4237, + "theoretical_loss": 3.365933876171269, + "tokens_seen": 2524241920 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011883650952858576, + "loss": 2.4394, + "theoretical_loss": 3.3659269190783965, + "tokens_seen": 2524307456 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011882647943831494, + "loss": 2.5104, + "theoretical_loss": 3.365919962216714, + "tokens_seen": 2524372992 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011881644934804413, + "loss": 2.4306, + "theoretical_loss": 3.3659130055862074, + "tokens_seen": 2524438528 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011880641925777332, + "loss": 2.119, + "theoretical_loss": 3.365906049186863, + "tokens_seen": 2524504064 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011879638916750251, + "loss": 2.492, + "theoretical_loss": 3.3658990930186667, + "tokens_seen": 2524569600 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001187863590772317, + "loss": 2.4763, + "theoretical_loss": 3.3658921370816057, + "tokens_seen": 2524635136 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011877632898696089, + "loss": 2.4287, + "theoretical_loss": 3.3658851813756656, + "tokens_seen": 2524700672 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2789104, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.510348320007324, + "objective/train/theoretical_loss": 3.365878225900833, + "objective/train/tokens_used": 2545226208, + "theoretical_loss": 3.365878225900833, + "tokens_seen": 2524766208 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011876629889669008, + "loss": 2.4643, + "theoretical_loss": 3.365878225900833, + "tokens_seen": 2524766208 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011875626880641926, + "loss": 2.507, + "theoretical_loss": 3.3658712706570943, + "tokens_seen": 2524831744 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011874623871614846, + "loss": 2.5314, + "theoretical_loss": 3.365864315644436, + "tokens_seen": 2524897280 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011873620862587764, + "loss": 2.3282, + "theoretical_loss": 3.3658573608628437, + "tokens_seen": 2524962816 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011872617853560683, + "loss": 2.5222, + "theoretical_loss": 3.3658504063123047, + "tokens_seen": 2525028352 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011871614844533601, + "loss": 2.3035, + "theoretical_loss": 3.365843451992804, + "tokens_seen": 2525093888 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011870611835506521, + "loss": 2.4785, + "theoretical_loss": 3.3658364979043296, + "tokens_seen": 2525159424 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011869608826479439, + "loss": 2.4538, + "theoretical_loss": 3.3658295440468664, + "tokens_seen": 2525224960 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011868605817452357, + "loss": 2.7185, + "theoretical_loss": 3.3658225904204015, + "tokens_seen": 2525290496 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011867602808425276, + "loss": 2.2956, + "theoretical_loss": 3.3658156370249213, + "tokens_seen": 2525356032 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011866599799398195, + "loss": 2.4909, + "theoretical_loss": 3.3658086838604113, + "tokens_seen": 2525421568 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011865596790371114, + "loss": 2.3134, + "theoretical_loss": 3.3658017309268584, + "tokens_seen": 2525487104 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011864593781344032, + "loss": 2.397, + "theoretical_loss": 3.365794778224249, + "tokens_seen": 2525552640 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011863590772316952, + "loss": 2.4934, + "theoretical_loss": 3.36578782575257, + "tokens_seen": 2525618176 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001186258776328987, + "loss": 2.5126, + "theoretical_loss": 3.3657808735118064, + "tokens_seen": 2525683712 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011861584754262789, + "loss": 2.6153, + "theoretical_loss": 3.3657739215019458, + "tokens_seen": 2525749248 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011860581745235707, + "loss": 2.5665, + "theoretical_loss": 3.3657669697229737, + "tokens_seen": 2525814784 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011859578736208625, + "loss": 2.5085, + "theoretical_loss": 3.3657600181748766, + "tokens_seen": 2525880320 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011858575727181545, + "loss": 2.4828, + "theoretical_loss": 3.365753066857641, + "tokens_seen": 2525945856 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011857572718154463, + "loss": 2.2485, + "theoretical_loss": 3.365746115771253, + "tokens_seen": 2526011392 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011856569709127382, + "loss": 2.3677, + "theoretical_loss": 3.3657391649157, + "tokens_seen": 2526076928 + }, + { + "epoch": 8.05, + "learning_rate": 0.000118555667001003, + "loss": 2.4493, + "theoretical_loss": 3.3657322142909667, + "tokens_seen": 2526142464 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001185456369107322, + "loss": 2.3409, + "theoretical_loss": 3.3657252638970405, + "tokens_seen": 2526208000 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011853560682046138, + "loss": 2.395, + "theoretical_loss": 3.3657183137339075, + "tokens_seen": 2526273536 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011852557673019058, + "loss": 2.6361, + "theoretical_loss": 3.365711363801554, + "tokens_seen": 2526339072 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2789106, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5733888149261475, + "objective/train/theoretical_loss": 3.3657044140999663, + "objective/train/tokens_used": 2546864608, + "theoretical_loss": 3.3657044140999663, + "tokens_seen": 2526404608 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011851554663991976, + "loss": 2.5743, + "theoretical_loss": 3.3657044140999663, + "tokens_seen": 2526404608 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011850551654964894, + "loss": 2.3257, + "theoretical_loss": 3.3656974646291307, + "tokens_seen": 2526470144 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011849548645937813, + "loss": 2.5267, + "theoretical_loss": 3.3656905153890344, + "tokens_seen": 2526535680 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011848545636910731, + "loss": 2.5886, + "theoretical_loss": 3.3656835663796625, + "tokens_seen": 2526601216 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011847542627883651, + "loss": 2.1475, + "theoretical_loss": 3.365676617601002, + "tokens_seen": 2526666752 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011846539618856569, + "loss": 2.4034, + "theoretical_loss": 3.365669669053039, + "tokens_seen": 2526732288 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011845536609829488, + "loss": 2.6225, + "theoretical_loss": 3.3656627207357603, + "tokens_seen": 2526797824 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011844533600802407, + "loss": 2.5342, + "theoretical_loss": 3.365655772649152, + "tokens_seen": 2526863360 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011843530591775326, + "loss": 2.41, + "theoretical_loss": 3.3656488247932, + "tokens_seen": 2526928896 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011842527582748246, + "loss": 2.2422, + "theoretical_loss": 3.3656418771678918, + "tokens_seen": 2526994432 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011841524573721165, + "loss": 2.4911, + "theoretical_loss": 3.3656349297732127, + "tokens_seen": 2527059968 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011840521564694083, + "loss": 2.398, + "theoretical_loss": 3.3656279826091495, + "tokens_seen": 2527125504 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011839518555667001, + "loss": 2.7097, + "theoretical_loss": 3.3656210356756886, + "tokens_seen": 2527191040 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011838515546639921, + "loss": 2.5861, + "theoretical_loss": 3.365614088972816, + "tokens_seen": 2527256576 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011837512537612839, + "loss": 2.4668, + "theoretical_loss": 3.3656071425005183, + "tokens_seen": 2527322112 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011836509528585758, + "loss": 2.4293, + "theoretical_loss": 3.3656001962587823, + "tokens_seen": 2527387648 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011835506519558676, + "loss": 2.4793, + "theoretical_loss": 3.365593250247594, + "tokens_seen": 2527453184 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011834503510531596, + "loss": 2.4395, + "theoretical_loss": 3.365586304466939, + "tokens_seen": 2527518720 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011833500501504514, + "loss": 2.5291, + "theoretical_loss": 3.365579358916805, + "tokens_seen": 2527584256 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011832497492477433, + "loss": 2.5891, + "theoretical_loss": 3.3655724135971776, + "tokens_seen": 2527649792 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011831494483450352, + "loss": 2.3284, + "theoretical_loss": 3.365565468508043, + "tokens_seen": 2527715328 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001183049147442327, + "loss": 2.706, + "theoretical_loss": 3.365558523649389, + "tokens_seen": 2527780864 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011829488465396189, + "loss": 2.5425, + "theoretical_loss": 3.3655515790212, + "tokens_seen": 2527846400 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011828485456369107, + "loss": 2.7745, + "theoretical_loss": 3.3655446346234634, + "tokens_seen": 2527911936 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011827482447342027, + "loss": 2.5903, + "theoretical_loss": 3.3655376904561654, + "tokens_seen": 2527977472 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2789106, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5731163024902344, + "objective/train/theoretical_loss": 3.3655307465192923, + "objective/train/tokens_used": 2548503008, + "theoretical_loss": 3.3655307465192923, + "tokens_seen": 2528043008 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011826479438314945, + "loss": 2.3895, + "theoretical_loss": 3.3655307465192923, + "tokens_seen": 2528043008 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011825476429287864, + "loss": 2.7416, + "theoretical_loss": 3.3655238028128314, + "tokens_seen": 2528108544 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011824473420260782, + "loss": 2.6585, + "theoretical_loss": 3.3655168593367675, + "tokens_seen": 2528174080 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011823470411233702, + "loss": 2.3927, + "theoretical_loss": 3.365509916091088, + "tokens_seen": 2528239616 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001182246740220662, + "loss": 2.3346, + "theoretical_loss": 3.3655029730757793, + "tokens_seen": 2528305152 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011821464393179538, + "loss": 2.6722, + "theoretical_loss": 3.3654960302908274, + "tokens_seen": 2528370688 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011820461384152458, + "loss": 2.501, + "theoretical_loss": 3.365489087736219, + "tokens_seen": 2528436224 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011819458375125376, + "loss": 2.7031, + "theoretical_loss": 3.3654821454119404, + "tokens_seen": 2528501760 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011818455366098295, + "loss": 2.6136, + "theoretical_loss": 3.3654752033179776, + "tokens_seen": 2528567296 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011817452357071213, + "loss": 2.5426, + "theoretical_loss": 3.365468261454317, + "tokens_seen": 2528632832 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011816449348044133, + "loss": 2.6232, + "theoretical_loss": 3.365461319820946, + "tokens_seen": 2528698368 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011815446339017051, + "loss": 2.5672, + "theoretical_loss": 3.36545437841785, + "tokens_seen": 2528763904 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001181444332998997, + "loss": 2.4437, + "theoretical_loss": 3.365447437245016, + "tokens_seen": 2528829440 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011813440320962888, + "loss": 2.4478, + "theoretical_loss": 3.36544049630243, + "tokens_seen": 2528894976 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011812437311935808, + "loss": 2.6027, + "theoretical_loss": 3.365433555590078, + "tokens_seen": 2528960512 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011811434302908726, + "loss": 2.6688, + "theoretical_loss": 3.365426615107947, + "tokens_seen": 2529026048 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011810431293881644, + "loss": 2.477, + "theoretical_loss": 3.3654196748560232, + "tokens_seen": 2529091584 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011809428284854564, + "loss": 2.6564, + "theoretical_loss": 3.365412734834293, + "tokens_seen": 2529157120 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011808425275827482, + "loss": 2.5592, + "theoretical_loss": 3.3654057950427436, + "tokens_seen": 2529222656 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011807422266800401, + "loss": 2.5478, + "theoretical_loss": 3.3653988554813603, + "tokens_seen": 2529288192 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011806419257773319, + "loss": 2.3748, + "theoretical_loss": 3.36539191615013, + "tokens_seen": 2529353728 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001180541624874624, + "loss": 2.2701, + "theoretical_loss": 3.3653849770490383, + "tokens_seen": 2529419264 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011804413239719158, + "loss": 2.3389, + "theoretical_loss": 3.365378038178073, + "tokens_seen": 2529484800 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011803410230692078, + "loss": 2.4393, + "theoretical_loss": 3.3653710995372195, + "tokens_seen": 2529550336 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011802407221664996, + "loss": 2.4831, + "theoretical_loss": 3.365364161126464, + "tokens_seen": 2529615872 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2790611, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.25510835647583, + "objective/train/theoretical_loss": 3.365357222945794, + "objective/train/tokens_used": 2550141408, + "theoretical_loss": 3.365357222945794, + "tokens_seen": 2529681408 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011801404212637914, + "loss": 2.5659, + "theoretical_loss": 3.365357222945794, + "tokens_seen": 2529681408 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011800401203610833, + "loss": 2.4718, + "theoretical_loss": 3.3653502849951957, + "tokens_seen": 2529746944 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011799398194583751, + "loss": 2.5066, + "theoretical_loss": 3.3653433472746546, + "tokens_seen": 2529812480 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011798395185556671, + "loss": 2.4994, + "theoretical_loss": 3.365336409784158, + "tokens_seen": 2529878016 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011797392176529589, + "loss": 2.4954, + "theoretical_loss": 3.3653294725236913, + "tokens_seen": 2529943552 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011796389167502509, + "loss": 2.4473, + "theoretical_loss": 3.3653225354932417, + "tokens_seen": 2530009088 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011795386158475427, + "loss": 2.3971, + "theoretical_loss": 3.365315598692796, + "tokens_seen": 2530074624 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011794383149448346, + "loss": 2.4308, + "theoretical_loss": 3.36530866212234, + "tokens_seen": 2530140160 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011793380140421264, + "loss": 2.5474, + "theoretical_loss": 3.3653017257818596, + "tokens_seen": 2530205696 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011792377131394184, + "loss": 2.6408, + "theoretical_loss": 3.3652947896713425, + "tokens_seen": 2530271232 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011791374122367102, + "loss": 2.4639, + "theoretical_loss": 3.365287853790774, + "tokens_seen": 2530336768 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001179037111334002, + "loss": 2.4144, + "theoretical_loss": 3.365280918140141, + "tokens_seen": 2530402304 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001178936810431294, + "loss": 2.3976, + "theoretical_loss": 3.3652739827194305, + "tokens_seen": 2530467840 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011788365095285857, + "loss": 2.5413, + "theoretical_loss": 3.365267047528628, + "tokens_seen": 2530533376 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011787362086258777, + "loss": 2.4909, + "theoretical_loss": 3.3652601125677197, + "tokens_seen": 2530598912 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011786359077231695, + "loss": 2.5994, + "theoretical_loss": 3.3652531778366934, + "tokens_seen": 2530664448 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011785356068204615, + "loss": 2.4593, + "theoretical_loss": 3.365246243335534, + "tokens_seen": 2530729984 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011784353059177533, + "loss": 2.2786, + "theoretical_loss": 3.365239309064229, + "tokens_seen": 2530795520 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011783350050150452, + "loss": 2.9021, + "theoretical_loss": 3.3652323750227646, + "tokens_seen": 2530861056 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001178234704112337, + "loss": 2.2931, + "theoretical_loss": 3.365225441211127, + "tokens_seen": 2530926592 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011781344032096288, + "loss": 2.284, + "theoretical_loss": 3.3652185076293026, + "tokens_seen": 2530992128 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011780341023069208, + "loss": 2.3756, + "theoretical_loss": 3.3652115742772777, + "tokens_seen": 2531057664 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011779338014042126, + "loss": 2.6005, + "theoretical_loss": 3.3652046411550396, + "tokens_seen": 2531123200 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011778335005015045, + "loss": 2.457, + "theoretical_loss": 3.3651977082625733, + "tokens_seen": 2531188736 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011777331995987963, + "loss": 2.5391, + "theoretical_loss": 3.365190775599867, + "tokens_seen": 2531254272 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2791165, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.565734386444092, + "objective/train/theoretical_loss": 3.3651838431669057, + "objective/train/tokens_used": 2551779808, + "theoretical_loss": 3.3651838431669057, + "tokens_seen": 2531319808 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011776328986960883, + "loss": 2.6422, + "theoretical_loss": 3.3651838431669057, + "tokens_seen": 2531319808 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011775325977933801, + "loss": 2.455, + "theoretical_loss": 3.365176910963676, + "tokens_seen": 2531385344 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001177432296890672, + "loss": 2.5611, + "theoretical_loss": 3.3651699789901652, + "tokens_seen": 2531450880 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011773319959879639, + "loss": 2.5787, + "theoretical_loss": 3.365163047246359, + "tokens_seen": 2531516416 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011772316950852557, + "loss": 2.5234, + "theoretical_loss": 3.365156115732244, + "tokens_seen": 2531581952 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011771313941825476, + "loss": 2.7206, + "theoretical_loss": 3.365149184447807, + "tokens_seen": 2531647488 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011770310932798394, + "loss": 2.3477, + "theoretical_loss": 3.3651422533930337, + "tokens_seen": 2531713024 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011769307923771314, + "loss": 2.433, + "theoretical_loss": 3.3651353225679115, + "tokens_seen": 2531778560 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011768304914744232, + "loss": 2.5904, + "theoretical_loss": 3.365128391972426, + "tokens_seen": 2531844096 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011767301905717153, + "loss": 2.388, + "theoretical_loss": 3.365121461606564, + "tokens_seen": 2531909632 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011766298896690071, + "loss": 2.6921, + "theoretical_loss": 3.365114531470312, + "tokens_seen": 2531975168 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001176529588766299, + "loss": 2.5472, + "theoretical_loss": 3.3651076015636563, + "tokens_seen": 2532040704 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011764292878635908, + "loss": 2.5563, + "theoretical_loss": 3.3651006718865832, + "tokens_seen": 2532106240 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011763289869608828, + "loss": 2.7136, + "theoretical_loss": 3.36509374243908, + "tokens_seen": 2532171776 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011762286860581746, + "loss": 2.5971, + "theoretical_loss": 3.365086813221132, + "tokens_seen": 2532237312 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011761283851554664, + "loss": 2.5261, + "theoretical_loss": 3.3650798842327263, + "tokens_seen": 2532302848 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011760280842527584, + "loss": 2.5701, + "theoretical_loss": 3.3650729554738494, + "tokens_seen": 2532368384 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011759277833500502, + "loss": 2.4437, + "theoretical_loss": 3.3650660269444876, + "tokens_seen": 2532433920 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011758274824473421, + "loss": 2.4872, + "theoretical_loss": 3.365059098644627, + "tokens_seen": 2532499456 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011757271815446339, + "loss": 2.4757, + "theoretical_loss": 3.365052170574255, + "tokens_seen": 2532564992 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011756268806419259, + "loss": 2.6653, + "theoretical_loss": 3.3650452427333573, + "tokens_seen": 2532630528 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011755265797392177, + "loss": 2.6552, + "theoretical_loss": 3.3650383151219203, + "tokens_seen": 2532696064 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011754262788365096, + "loss": 2.5343, + "theoretical_loss": 3.365031387739931, + "tokens_seen": 2532761600 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011753259779338014, + "loss": 2.6056, + "theoretical_loss": 3.3650244605873754, + "tokens_seen": 2532827136 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011752256770310933, + "loss": 2.5372, + "theoretical_loss": 3.3650175336642403, + "tokens_seen": 2532892672 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2792437, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7672832012176514, + "objective/train/theoretical_loss": 3.365010606970512, + "objective/train/tokens_used": 2553418208, + "theoretical_loss": 3.365010606970512, + "tokens_seen": 2532958208 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011751253761283852, + "loss": 2.6094, + "theoretical_loss": 3.365010606970512, + "tokens_seen": 2532958208 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001175025075225677, + "loss": 2.3641, + "theoretical_loss": 3.3650036805061765, + "tokens_seen": 2533023744 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001174924774322969, + "loss": 2.3025, + "theoretical_loss": 3.364996754271221, + "tokens_seen": 2533089280 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011748244734202608, + "loss": 2.2529, + "theoretical_loss": 3.364989828265632, + "tokens_seen": 2533154816 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011747241725175527, + "loss": 2.4667, + "theoretical_loss": 3.3649829024893956, + "tokens_seen": 2533220352 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011746238716148445, + "loss": 2.3471, + "theoretical_loss": 3.3649759769424983, + "tokens_seen": 2533285888 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011745235707121365, + "loss": 2.5098, + "theoretical_loss": 3.364969051624926, + "tokens_seen": 2533351424 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011744232698094283, + "loss": 2.6408, + "theoretical_loss": 3.3649621265366667, + "tokens_seen": 2533416960 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011743229689067201, + "loss": 2.3772, + "theoretical_loss": 3.3649552016777053, + "tokens_seen": 2533482496 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001174222668004012, + "loss": 2.5212, + "theoretical_loss": 3.3649482770480295, + "tokens_seen": 2533548032 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011741223671013039, + "loss": 2.7018, + "theoretical_loss": 3.364941352647625, + "tokens_seen": 2533613568 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011740220661985958, + "loss": 2.656, + "theoretical_loss": 3.364934428476478, + "tokens_seen": 2533679104 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011739217652958876, + "loss": 2.5372, + "theoretical_loss": 3.3649275045345766, + "tokens_seen": 2533744640 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011738214643931796, + "loss": 2.4075, + "theoretical_loss": 3.3649205808219054, + "tokens_seen": 2533810176 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011737211634904714, + "loss": 2.6733, + "theoretical_loss": 3.3649136573384517, + "tokens_seen": 2533875712 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011736208625877633, + "loss": 2.5459, + "theoretical_loss": 3.364906734084202, + "tokens_seen": 2533941248 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011735205616850551, + "loss": 2.4581, + "theoretical_loss": 3.364899811059143, + "tokens_seen": 2534006784 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011734202607823471, + "loss": 2.458, + "theoretical_loss": 3.3648928882632605, + "tokens_seen": 2534072320 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011733199598796389, + "loss": 2.6283, + "theoretical_loss": 3.3648859656965415, + "tokens_seen": 2534137856 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011732196589769307, + "loss": 2.4646, + "theoretical_loss": 3.364879043358972, + "tokens_seen": 2534203392 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011731193580742226, + "loss": 2.5312, + "theoretical_loss": 3.3648721212505395, + "tokens_seen": 2534268928 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011730190571715146, + "loss": 2.7363, + "theoretical_loss": 3.3648651993712297, + "tokens_seen": 2534334464 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011729187562688065, + "loss": 2.693, + "theoretical_loss": 3.364858277721029, + "tokens_seen": 2534400000 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011728184553660983, + "loss": 2.51, + "theoretical_loss": 3.3648513562999245, + "tokens_seen": 2534465536 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011727181544633903, + "loss": 2.4902, + "theoretical_loss": 3.364844435107902, + "tokens_seen": 2534531072 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2793192, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.650895833969116, + "objective/train/theoretical_loss": 3.3648375141449485, + "objective/train/tokens_used": 2555056608, + "theoretical_loss": 3.3648375141449485, + "tokens_seen": 2534596608 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011726178535606821, + "loss": 2.6666, + "theoretical_loss": 3.3648375141449485, + "tokens_seen": 2534596608 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001172517552657974, + "loss": 2.2769, + "theoretical_loss": 3.36483059341105, + "tokens_seen": 2534662144 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011724172517552659, + "loss": 2.5783, + "theoretical_loss": 3.3648236729061938, + "tokens_seen": 2534727680 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011723169508525577, + "loss": 2.502, + "theoretical_loss": 3.3648167526303654, + "tokens_seen": 2534793216 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011722166499498496, + "loss": 2.7003, + "theoretical_loss": 3.3648098325835525, + "tokens_seen": 2534858752 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011721163490471414, + "loss": 2.5467, + "theoretical_loss": 3.3648029127657404, + "tokens_seen": 2534924288 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011720160481444334, + "loss": 2.4681, + "theoretical_loss": 3.3647959931769162, + "tokens_seen": 2534989824 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011719157472417252, + "loss": 2.3282, + "theoretical_loss": 3.3647890738170663, + "tokens_seen": 2535055360 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011718154463390171, + "loss": 2.3763, + "theoretical_loss": 3.3647821546861776, + "tokens_seen": 2535120896 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001171715145436309, + "loss": 2.627, + "theoretical_loss": 3.3647752357842355, + "tokens_seen": 2535186432 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011716148445336009, + "loss": 2.7775, + "theoretical_loss": 3.3647683171112277, + "tokens_seen": 2535251968 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011715145436308927, + "loss": 2.4198, + "theoretical_loss": 3.3647613986671407, + "tokens_seen": 2535317504 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011714142427281845, + "loss": 2.5335, + "theoretical_loss": 3.36475448045196, + "tokens_seen": 2535383040 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011713139418254765, + "loss": 2.5915, + "theoretical_loss": 3.3647475624656726, + "tokens_seen": 2535448576 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011712136409227683, + "loss": 2.6725, + "theoretical_loss": 3.3647406447082653, + "tokens_seen": 2535514112 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011711133400200602, + "loss": 2.4406, + "theoretical_loss": 3.364733727179724, + "tokens_seen": 2535579648 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001171013039117352, + "loss": 2.7464, + "theoretical_loss": 3.364726809880036, + "tokens_seen": 2535645184 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001170912738214644, + "loss": 2.625, + "theoretical_loss": 3.3647198928091875, + "tokens_seen": 2535710720 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011708124373119358, + "loss": 2.6692, + "theoretical_loss": 3.364712975967165, + "tokens_seen": 2535776256 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011707121364092277, + "loss": 2.7363, + "theoretical_loss": 3.3647060593539546, + "tokens_seen": 2535841792 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011706118355065196, + "loss": 2.6137, + "theoretical_loss": 3.364699142969543, + "tokens_seen": 2535907328 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011705115346038115, + "loss": 2.4747, + "theoretical_loss": 3.364692226813917, + "tokens_seen": 2535972864 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011704112337011033, + "loss": 2.6231, + "theoretical_loss": 3.3646853108870634, + "tokens_seen": 2536038400 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011703109327983951, + "loss": 2.6898, + "theoretical_loss": 3.3646783951889683, + "tokens_seen": 2536103936 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001170210631895687, + "loss": 2.5297, + "theoretical_loss": 3.3646714797196178, + "tokens_seen": 2536169472 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2794507, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3801376819610596, + "objective/train/theoretical_loss": 3.364664564478999, + "objective/train/tokens_used": 2556695008, + "theoretical_loss": 3.364664564478999, + "tokens_seen": 2536235008 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011701103309929789, + "loss": 2.5186, + "theoretical_loss": 3.364664564478999, + "tokens_seen": 2536235008 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011700100300902708, + "loss": 2.533, + "theoretical_loss": 3.364657649467098, + "tokens_seen": 2536300544 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011699097291875626, + "loss": 2.6412, + "theoretical_loss": 3.3646507346839023, + "tokens_seen": 2536366080 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011698094282848546, + "loss": 2.4213, + "theoretical_loss": 3.3646438201293973, + "tokens_seen": 2536431616 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011697091273821464, + "loss": 2.5568, + "theoretical_loss": 3.3646369058035703, + "tokens_seen": 2536497152 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011696088264794383, + "loss": 2.5026, + "theoretical_loss": 3.3646299917064075, + "tokens_seen": 2536562688 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011695085255767302, + "loss": 2.6727, + "theoretical_loss": 3.364623077837895, + "tokens_seen": 2536628224 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001169408224674022, + "loss": 2.4229, + "theoretical_loss": 3.3646161641980203, + "tokens_seen": 2536693760 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011693079237713139, + "loss": 2.5816, + "theoretical_loss": 3.364609250786769, + "tokens_seen": 2536759296 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011692076228686059, + "loss": 2.6059, + "theoretical_loss": 3.364602337604128, + "tokens_seen": 2536824832 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011691073219658978, + "loss": 2.5454, + "theoretical_loss": 3.364595424650084, + "tokens_seen": 2536890368 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011690070210631896, + "loss": 2.5479, + "theoretical_loss": 3.3645885119246235, + "tokens_seen": 2536955904 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011689067201604816, + "loss": 2.5293, + "theoretical_loss": 3.3645815994277326, + "tokens_seen": 2537021440 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011688064192577734, + "loss": 2.6765, + "theoretical_loss": 3.3645746871593984, + "tokens_seen": 2537086976 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011687061183550653, + "loss": 2.4569, + "theoretical_loss": 3.3645677751196077, + "tokens_seen": 2537152512 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011686058174523571, + "loss": 2.6306, + "theoretical_loss": 3.3645608633083457, + "tokens_seen": 2537218048 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011685055165496491, + "loss": 2.5259, + "theoretical_loss": 3.3645539517256005, + "tokens_seen": 2537283584 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011684052156469409, + "loss": 2.4077, + "theoretical_loss": 3.3645470403713573, + "tokens_seen": 2537349120 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011683049147442327, + "loss": 2.4528, + "theoretical_loss": 3.364540129245604, + "tokens_seen": 2537414656 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011682046138415246, + "loss": 2.6859, + "theoretical_loss": 3.364533218348326, + "tokens_seen": 2537480192 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011681043129388165, + "loss": 2.4322, + "theoretical_loss": 3.36452630767951, + "tokens_seen": 2537545728 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011680040120361084, + "loss": 2.4494, + "theoretical_loss": 3.3645193972391434, + "tokens_seen": 2537611264 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011679037111334002, + "loss": 2.7429, + "theoretical_loss": 3.364512487027212, + "tokens_seen": 2537676800 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011678034102306922, + "loss": 2.4609, + "theoretical_loss": 3.364505577043702, + "tokens_seen": 2537742336 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001167703109327984, + "loss": 2.4193, + "theoretical_loss": 3.364498667288601, + "tokens_seen": 2537807872 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2795152, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.380295753479004, + "objective/train/theoretical_loss": 3.364491757761895, + "objective/train/tokens_used": 2558333408, + "theoretical_loss": 3.364491757761895, + "tokens_seen": 2537873408 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011676028084252759, + "loss": 2.4639, + "theoretical_loss": 3.364491757761895, + "tokens_seen": 2537873408 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011675025075225677, + "loss": 2.5742, + "theoretical_loss": 3.3644848484635705, + "tokens_seen": 2537938944 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011674022066198595, + "loss": 2.5546, + "theoretical_loss": 3.3644779393936144, + "tokens_seen": 2538004480 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011673019057171515, + "loss": 2.4971, + "theoretical_loss": 3.3644710305520125, + "tokens_seen": 2538070016 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011672016048144433, + "loss": 2.5225, + "theoretical_loss": 3.3644641219387523, + "tokens_seen": 2538135552 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011671013039117352, + "loss": 2.2827, + "theoretical_loss": 3.3644572135538193, + "tokens_seen": 2538201088 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001167001003009027, + "loss": 2.5717, + "theoretical_loss": 3.364450305397201, + "tokens_seen": 2538266624 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001166900702106319, + "loss": 2.6194, + "theoretical_loss": 3.3644433974688837, + "tokens_seen": 2538332160 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011668004012036108, + "loss": 2.5791, + "theoretical_loss": 3.364436489768854, + "tokens_seen": 2538397696 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011667001003009028, + "loss": 2.6517, + "theoretical_loss": 3.364429582297098, + "tokens_seen": 2538463232 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011665997993981946, + "loss": 2.7105, + "theoretical_loss": 3.3644226750536026, + "tokens_seen": 2538528768 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011664994984954864, + "loss": 2.6549, + "theoretical_loss": 3.364415768038355, + "tokens_seen": 2538594304 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011663991975927783, + "loss": 2.5234, + "theoretical_loss": 3.3644088612513405, + "tokens_seen": 2538659840 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011662988966900701, + "loss": 2.5118, + "theoretical_loss": 3.3644019546925463, + "tokens_seen": 2538725376 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011661985957873621, + "loss": 2.6356, + "theoretical_loss": 3.3643950483619594, + "tokens_seen": 2538790912 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011660982948846539, + "loss": 2.5406, + "theoretical_loss": 3.3643881422595654, + "tokens_seen": 2538856448 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011659979939819458, + "loss": 2.7493, + "theoretical_loss": 3.364381236385352, + "tokens_seen": 2538921984 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011658976930792377, + "loss": 2.2681, + "theoretical_loss": 3.364374330739305, + "tokens_seen": 2538987520 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011657973921765296, + "loss": 2.5523, + "theoretical_loss": 3.3643674253214106, + "tokens_seen": 2539053056 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011656970912738214, + "loss": 2.2961, + "theoretical_loss": 3.3643605201316564, + "tokens_seen": 2539118592 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011655967903711134, + "loss": 2.363, + "theoretical_loss": 3.3643536151700286, + "tokens_seen": 2539184128 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011654964894684053, + "loss": 2.4569, + "theoretical_loss": 3.3643467104365135, + "tokens_seen": 2539249664 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011653961885656971, + "loss": 2.539, + "theoretical_loss": 3.364339805931098, + "tokens_seen": 2539315200 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011652958876629891, + "loss": 2.404, + "theoretical_loss": 3.364332901653768, + "tokens_seen": 2539380736 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011651955867602809, + "loss": 2.6464, + "theoretical_loss": 3.3643259976045115, + "tokens_seen": 2539446272 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2796497, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9452555179595947, + "objective/train/theoretical_loss": 3.3643190937833136, + "objective/train/tokens_used": 2559971808, + "theoretical_loss": 3.3643190937833136, + "tokens_seen": 2539511808 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011650952858575728, + "loss": 2.5454, + "theoretical_loss": 3.3643190937833136, + "tokens_seen": 2539511808 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011649949849548646, + "loss": 2.5705, + "theoretical_loss": 3.3643121901901614, + "tokens_seen": 2539577344 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011648946840521566, + "loss": 2.661, + "theoretical_loss": 3.364305286825042, + "tokens_seen": 2539642880 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011647943831494484, + "loss": 2.5017, + "theoretical_loss": 3.364298383687941, + "tokens_seen": 2539708416 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011646940822467403, + "loss": 2.6852, + "theoretical_loss": 3.364291480778846, + "tokens_seen": 2539773952 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011645937813440322, + "loss": 2.4078, + "theoretical_loss": 3.3642845780977426, + "tokens_seen": 2539839488 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001164493480441324, + "loss": 2.371, + "theoretical_loss": 3.3642776756446184, + "tokens_seen": 2539905024 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011643931795386159, + "loss": 2.8172, + "theoretical_loss": 3.3642707734194595, + "tokens_seen": 2539970560 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011642928786359077, + "loss": 2.6803, + "theoretical_loss": 3.3642638714222524, + "tokens_seen": 2540036096 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011641925777331997, + "loss": 2.3368, + "theoretical_loss": 3.3642569696529834, + "tokens_seen": 2540101632 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011640922768304915, + "loss": 2.3885, + "theoretical_loss": 3.3642500681116396, + "tokens_seen": 2540167168 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011639919759277834, + "loss": 2.7463, + "theoretical_loss": 3.364243166798208, + "tokens_seen": 2540232704 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011638916750250752, + "loss": 2.5617, + "theoretical_loss": 3.364236265712674, + "tokens_seen": 2540298240 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011637913741223672, + "loss": 2.5049, + "theoretical_loss": 3.3642293648550248, + "tokens_seen": 2540363776 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001163691073219659, + "loss": 2.4246, + "theoretical_loss": 3.3642224642252474, + "tokens_seen": 2540429312 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011635907723169508, + "loss": 2.4931, + "theoretical_loss": 3.3642155638233278, + "tokens_seen": 2540494848 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011634904714142428, + "loss": 2.5264, + "theoretical_loss": 3.364208663649253, + "tokens_seen": 2540560384 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011633901705115346, + "loss": 2.6366, + "theoretical_loss": 3.3642017637030093, + "tokens_seen": 2540625920 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011632898696088265, + "loss": 2.4714, + "theoretical_loss": 3.364194863984584, + "tokens_seen": 2540691456 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011631895687061183, + "loss": 2.584, + "theoretical_loss": 3.3641879644939623, + "tokens_seen": 2540756992 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011630892678034103, + "loss": 2.6051, + "theoretical_loss": 3.364181065231132, + "tokens_seen": 2540822528 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011629889669007021, + "loss": 2.7051, + "theoretical_loss": 3.3641741661960793, + "tokens_seen": 2540888064 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001162888665997994, + "loss": 2.63, + "theoretical_loss": 3.364167267388791, + "tokens_seen": 2540953600 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011627883650952858, + "loss": 2.4098, + "theoretical_loss": 3.3641603688092534, + "tokens_seen": 2541019136 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011626880641925778, + "loss": 2.5274, + "theoretical_loss": 3.364153470457453, + "tokens_seen": 2541084672 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2797087, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6307413578033447, + "objective/train/theoretical_loss": 3.364146572333377, + "objective/train/tokens_used": 2561610208, + "theoretical_loss": 3.364146572333377, + "tokens_seen": 2541150208 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011625877632898696, + "loss": 2.5879, + "theoretical_loss": 3.364146572333377, + "tokens_seen": 2541150208 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011624874623871614, + "loss": 2.5105, + "theoretical_loss": 3.364139674437012, + "tokens_seen": 2541215744 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011623871614844534, + "loss": 2.4667, + "theoretical_loss": 3.3641327767683435, + "tokens_seen": 2541281280 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011622868605817452, + "loss": 2.4228, + "theoretical_loss": 3.3641258793273594, + "tokens_seen": 2541346816 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011621865596790371, + "loss": 2.4965, + "theoretical_loss": 3.3641189821140456, + "tokens_seen": 2541412352 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011620862587763289, + "loss": 2.6526, + "theoretical_loss": 3.364112085128389, + "tokens_seen": 2541477888 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011619859578736209, + "loss": 2.5482, + "theoretical_loss": 3.364105188370376, + "tokens_seen": 2541543424 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011618856569709127, + "loss": 2.5109, + "theoretical_loss": 3.3640982918399938, + "tokens_seen": 2541608960 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011617853560682048, + "loss": 2.4968, + "theoretical_loss": 3.3640913955372285, + "tokens_seen": 2541674496 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011616850551654966, + "loss": 2.4632, + "theoretical_loss": 3.3640844994620664, + "tokens_seen": 2541740032 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011615847542627884, + "loss": 2.6721, + "theoretical_loss": 3.364077603614495, + "tokens_seen": 2541805568 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011614844533600803, + "loss": 2.6254, + "theoretical_loss": 3.3640707079945, + "tokens_seen": 2541871104 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011613841524573721, + "loss": 2.4431, + "theoretical_loss": 3.3640638126020685, + "tokens_seen": 2541936640 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011612838515546641, + "loss": 2.3426, + "theoretical_loss": 3.3640569174371873, + "tokens_seen": 2542002176 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011611835506519559, + "loss": 2.655, + "theoretical_loss": 3.3640500224998426, + "tokens_seen": 2542067712 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011610832497492478, + "loss": 2.4525, + "theoretical_loss": 3.3640431277900213, + "tokens_seen": 2542133248 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011609829488465397, + "loss": 2.5644, + "theoretical_loss": 3.3640362333077105, + "tokens_seen": 2542198784 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011608826479438316, + "loss": 2.4597, + "theoretical_loss": 3.3640293390528955, + "tokens_seen": 2542264320 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011607823470411234, + "loss": 2.4674, + "theoretical_loss": 3.3640224450255642, + "tokens_seen": 2542329856 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011606820461384154, + "loss": 2.5844, + "theoretical_loss": 3.3640155512257026, + "tokens_seen": 2542395392 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011605817452357072, + "loss": 2.7279, + "theoretical_loss": 3.3640086576532973, + "tokens_seen": 2542460928 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001160481444332999, + "loss": 2.4226, + "theoretical_loss": 3.3640017643083353, + "tokens_seen": 2542526464 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001160381143430291, + "loss": 2.6113, + "theoretical_loss": 3.363994871190803, + "tokens_seen": 2542592000 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011602808425275827, + "loss": 2.4462, + "theoretical_loss": 3.3639879783006874, + "tokens_seen": 2542657536 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011601805416248747, + "loss": 2.6131, + "theoretical_loss": 3.3639810856379744, + "tokens_seen": 2542723072 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2797755, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4461114406585693, + "objective/train/theoretical_loss": 3.363974193202651, + "objective/train/tokens_used": 2563248608, + "theoretical_loss": 3.363974193202651, + "tokens_seen": 2542788608 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011600802407221665, + "loss": 2.5577, + "theoretical_loss": 3.363974193202651, + "tokens_seen": 2542788608 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011599799398194585, + "loss": 2.6654, + "theoretical_loss": 3.363967300994704, + "tokens_seen": 2542854144 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011598796389167503, + "loss": 2.4834, + "theoretical_loss": 3.36396040901412, + "tokens_seen": 2542919680 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011597793380140422, + "loss": 2.6016, + "theoretical_loss": 3.3639535172608857, + "tokens_seen": 2542985216 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001159679037111334, + "loss": 2.6267, + "theoretical_loss": 3.3639466257349877, + "tokens_seen": 2543050752 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011595787362086258, + "loss": 2.6665, + "theoretical_loss": 3.3639397344364124, + "tokens_seen": 2543116288 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011594784353059178, + "loss": 2.5595, + "theoretical_loss": 3.3639328433651463, + "tokens_seen": 2543181824 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011593781344032096, + "loss": 2.5474, + "theoretical_loss": 3.3639259525211767, + "tokens_seen": 2543247360 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011592778335005015, + "loss": 2.4635, + "theoretical_loss": 3.3639190619044896, + "tokens_seen": 2543312896 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011591775325977933, + "loss": 2.6774, + "theoretical_loss": 3.3639121715150724, + "tokens_seen": 2543378432 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011590772316950853, + "loss": 2.4463, + "theoretical_loss": 3.3639052813529107, + "tokens_seen": 2543443968 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011589769307923771, + "loss": 2.7273, + "theoretical_loss": 3.363898391417992, + "tokens_seen": 2543509504 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001158876629889669, + "loss": 2.3848, + "theoretical_loss": 3.363891501710303, + "tokens_seen": 2543575040 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011587763289869609, + "loss": 2.448, + "theoretical_loss": 3.3638846122298296, + "tokens_seen": 2543640576 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011586760280842527, + "loss": 2.2572, + "theoretical_loss": 3.363877722976559, + "tokens_seen": 2543706112 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011585757271815446, + "loss": 2.7111, + "theoretical_loss": 3.363870833950478, + "tokens_seen": 2543771648 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011584754262788364, + "loss": 2.413, + "theoretical_loss": 3.3638639451515724, + "tokens_seen": 2543837184 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011583751253761284, + "loss": 2.572, + "theoretical_loss": 3.36385705657983, + "tokens_seen": 2543902720 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011582748244734202, + "loss": 2.4295, + "theoretical_loss": 3.3638501682352366, + "tokens_seen": 2543968256 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011581745235707121, + "loss": 2.7414, + "theoretical_loss": 3.3638432801177793, + "tokens_seen": 2544033792 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001158074222668004, + "loss": 2.6644, + "theoretical_loss": 3.363836392227445, + "tokens_seen": 2544099328 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001157973921765296, + "loss": 2.5174, + "theoretical_loss": 3.3638295045642197, + "tokens_seen": 2544164864 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011578736208625878, + "loss": 2.4665, + "theoretical_loss": 3.36382261712809, + "tokens_seen": 2544230400 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011577733199598798, + "loss": 2.4386, + "theoretical_loss": 3.363815729919043, + "tokens_seen": 2544295936 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011576730190571716, + "loss": 2.6926, + "theoretical_loss": 3.3638088429370656, + "tokens_seen": 2544361472 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2799256, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3045461177825928, + "objective/train/theoretical_loss": 3.363801956182144, + "objective/train/tokens_used": 2564887008, + "theoretical_loss": 3.363801956182144, + "tokens_seen": 2544427008 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011575727181544634, + "loss": 2.5196, + "theoretical_loss": 3.363801956182144, + "tokens_seen": 2544427008 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011574724172517554, + "loss": 2.4805, + "theoretical_loss": 3.363795069654265, + "tokens_seen": 2544492544 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011573721163490472, + "loss": 2.5559, + "theoretical_loss": 3.3637881833534156, + "tokens_seen": 2544558080 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011572718154463391, + "loss": 2.6252, + "theoretical_loss": 3.3637812972795818, + "tokens_seen": 2544623616 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011571715145436309, + "loss": 2.6883, + "theoretical_loss": 3.363774411432751, + "tokens_seen": 2544689152 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011570712136409229, + "loss": 2.6329, + "theoretical_loss": 3.363767525812909, + "tokens_seen": 2544754688 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011569709127382147, + "loss": 2.6888, + "theoretical_loss": 3.363760640420043, + "tokens_seen": 2544820224 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011568706118355066, + "loss": 2.3817, + "theoretical_loss": 3.3637537552541397, + "tokens_seen": 2544885760 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011567703109327984, + "loss": 2.5609, + "theoretical_loss": 3.363746870315186, + "tokens_seen": 2544951296 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011566700100300903, + "loss": 2.5044, + "theoretical_loss": 3.363739985603168, + "tokens_seen": 2545016832 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011565697091273822, + "loss": 2.5182, + "theoretical_loss": 3.3637331011180724, + "tokens_seen": 2545082368 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001156469408224674, + "loss": 2.6823, + "theoretical_loss": 3.3637262168598867, + "tokens_seen": 2545147904 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001156369107321966, + "loss": 2.7354, + "theoretical_loss": 3.363719332828597, + "tokens_seen": 2545213440 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011562688064192578, + "loss": 2.7223, + "theoretical_loss": 3.3637124490241894, + "tokens_seen": 2545278976 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011561685055165497, + "loss": 2.5799, + "theoretical_loss": 3.3637055654466517, + "tokens_seen": 2545344512 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011560682046138415, + "loss": 2.6354, + "theoretical_loss": 3.36369868209597, + "tokens_seen": 2545410048 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011559679037111335, + "loss": 2.4884, + "theoretical_loss": 3.3636917989721304, + "tokens_seen": 2545475584 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011558676028084253, + "loss": 2.4712, + "theoretical_loss": 3.363684916075121, + "tokens_seen": 2545541120 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011557673019057171, + "loss": 2.5157, + "theoretical_loss": 3.3636780334049274, + "tokens_seen": 2545606656 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001155667001003009, + "loss": 2.4254, + "theoretical_loss": 3.3636711509615367, + "tokens_seen": 2545672192 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011555667001003009, + "loss": 2.5737, + "theoretical_loss": 3.3636642687449356, + "tokens_seen": 2545737728 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011554663991975928, + "loss": 2.6094, + "theoretical_loss": 3.3636573867551105, + "tokens_seen": 2545803264 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011553660982948846, + "loss": 2.5354, + "theoretical_loss": 3.3636505049920484, + "tokens_seen": 2545868800 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011552657973921766, + "loss": 2.4508, + "theoretical_loss": 3.3636436234557356, + "tokens_seen": 2545934336 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011551654964894684, + "loss": 2.5328, + "theoretical_loss": 3.3636367421461593, + "tokens_seen": 2545999872 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2799937, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6040258407592773, + "objective/train/theoretical_loss": 3.3636298610633055, + "objective/train/tokens_used": 2566525408, + "theoretical_loss": 3.3636298610633055, + "tokens_seen": 2546065408 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011550651955867603, + "loss": 2.5143, + "theoretical_loss": 3.3636298610633055, + "tokens_seen": 2546065408 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011549648946840521, + "loss": 2.6605, + "theoretical_loss": 3.363622980207162, + "tokens_seen": 2546130944 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011548645937813441, + "loss": 2.6874, + "theoretical_loss": 3.3636160995777145, + "tokens_seen": 2546196480 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011547642928786359, + "loss": 2.3331, + "theoretical_loss": 3.3636092191749496, + "tokens_seen": 2546262016 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011546639919759277, + "loss": 2.5443, + "theoretical_loss": 3.363602338998855, + "tokens_seen": 2546327552 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011545636910732196, + "loss": 2.5857, + "theoretical_loss": 3.363595459049417, + "tokens_seen": 2546393088 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011544633901705115, + "loss": 2.6126, + "theoretical_loss": 3.3635885793266214, + "tokens_seen": 2546458624 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011543630892678034, + "loss": 2.4725, + "theoretical_loss": 3.363581699830456, + "tokens_seen": 2546524160 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011542627883650953, + "loss": 2.6558, + "theoretical_loss": 3.363574820560907, + "tokens_seen": 2546589696 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011541624874623873, + "loss": 2.6062, + "theoretical_loss": 3.363567941517962, + "tokens_seen": 2546655232 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011540621865596791, + "loss": 2.481, + "theoretical_loss": 3.363561062701606, + "tokens_seen": 2546720768 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001153961885656971, + "loss": 2.1832, + "theoretical_loss": 3.363554184111827, + "tokens_seen": 2546786304 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011538615847542629, + "loss": 2.2972, + "theoretical_loss": 3.3635473057486114, + "tokens_seen": 2546851840 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011537612838515547, + "loss": 2.5589, + "theoretical_loss": 3.3635404276119454, + "tokens_seen": 2546917376 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011536609829488466, + "loss": 2.6103, + "theoretical_loss": 3.363533549701817, + "tokens_seen": 2546982912 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011535606820461384, + "loss": 2.6072, + "theoretical_loss": 3.363526672018211, + "tokens_seen": 2547048448 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011534603811434304, + "loss": 2.7223, + "theoretical_loss": 3.363519794561116, + "tokens_seen": 2547113984 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011533600802407222, + "loss": 2.624, + "theoretical_loss": 3.363512917330518, + "tokens_seen": 2547179520 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011532597793380141, + "loss": 2.3401, + "theoretical_loss": 3.3635060403264028, + "tokens_seen": 2547245056 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001153159478435306, + "loss": 2.5895, + "theoretical_loss": 3.3634991635487586, + "tokens_seen": 2547310592 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011530591775325979, + "loss": 2.5696, + "theoretical_loss": 3.3634922869975714, + "tokens_seen": 2547376128 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011529588766298897, + "loss": 2.5606, + "theoretical_loss": 3.363485410672828, + "tokens_seen": 2547441664 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011528585757271815, + "loss": 2.5316, + "theoretical_loss": 3.3634785345745146, + "tokens_seen": 2547507200 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011527582748244735, + "loss": 2.729, + "theoretical_loss": 3.3634716587026188, + "tokens_seen": 2547572736 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011526579739217653, + "loss": 2.3427, + "theoretical_loss": 3.3634647830571267, + "tokens_seen": 2547638272 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2800988, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3870596885681152, + "objective/train/theoretical_loss": 3.3634579076380255, + "objective/train/tokens_used": 2568163808, + "theoretical_loss": 3.3634579076380255, + "tokens_seen": 2547703808 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011525576730190572, + "loss": 2.6577, + "theoretical_loss": 3.3634579076380255, + "tokens_seen": 2547703808 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001152457372116349, + "loss": 2.5731, + "theoretical_loss": 3.363451032445301, + "tokens_seen": 2547769344 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001152357071213641, + "loss": 2.547, + "theoretical_loss": 3.3634441574789413, + "tokens_seen": 2547834880 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011522567703109328, + "loss": 2.3258, + "theoretical_loss": 3.363437282738932, + "tokens_seen": 2547900416 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011521564694082247, + "loss": 2.432, + "theoretical_loss": 3.363430408225261, + "tokens_seen": 2547965952 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011520561685055165, + "loss": 2.4287, + "theoretical_loss": 3.3634235339379135, + "tokens_seen": 2548031488 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011519558676028085, + "loss": 2.5384, + "theoretical_loss": 3.363416659876877, + "tokens_seen": 2548097024 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011518555667001003, + "loss": 2.7089, + "theoretical_loss": 3.3634097860421384, + "tokens_seen": 2548162560 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011517552657973921, + "loss": 2.5863, + "theoretical_loss": 3.3634029124336844, + "tokens_seen": 2548228096 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001151654964894684, + "loss": 2.735, + "theoretical_loss": 3.3633960390515014, + "tokens_seen": 2548293632 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011515546639919759, + "loss": 2.477, + "theoretical_loss": 3.363389165895576, + "tokens_seen": 2548359168 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011514543630892678, + "loss": 2.5208, + "theoretical_loss": 3.3633822929658956, + "tokens_seen": 2548424704 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011513540621865596, + "loss": 2.4994, + "theoretical_loss": 3.3633754202624466, + "tokens_seen": 2548490240 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011512537612838516, + "loss": 2.6087, + "theoretical_loss": 3.3633685477852158, + "tokens_seen": 2548555776 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011511534603811434, + "loss": 2.2323, + "theoretical_loss": 3.3633616755341897, + "tokens_seen": 2548621312 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011510531594784353, + "loss": 2.696, + "theoretical_loss": 3.3633548035093552, + "tokens_seen": 2548686848 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011509528585757272, + "loss": 2.5837, + "theoretical_loss": 3.363347931710699, + "tokens_seen": 2548752384 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001150852557673019, + "loss": 2.6227, + "theoretical_loss": 3.363341060138208, + "tokens_seen": 2548817920 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011507522567703109, + "loss": 2.3942, + "theoretical_loss": 3.363334188791869, + "tokens_seen": 2548883456 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011506519558676027, + "loss": 2.5524, + "theoretical_loss": 3.363327317671668, + "tokens_seen": 2548948992 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011505516549648947, + "loss": 2.6294, + "theoretical_loss": 3.3633204467775926, + "tokens_seen": 2549014528 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011504513540621866, + "loss": 2.467, + "theoretical_loss": 3.363313576109629, + "tokens_seen": 2549080064 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011503510531594786, + "loss": 2.3593, + "theoretical_loss": 3.363306705667765, + "tokens_seen": 2549145600 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011502507522567704, + "loss": 2.5716, + "theoretical_loss": 3.363299835451986, + "tokens_seen": 2549211136 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011501504513540623, + "loss": 2.6932, + "theoretical_loss": 3.363292965462279, + "tokens_seen": 2549276672 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2801733, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6198291778564453, + "objective/train/theoretical_loss": 3.3632860956986312, + "objective/train/tokens_used": 2569802208, + "theoretical_loss": 3.3632860956986312, + "tokens_seen": 2549342208 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011500501504513541, + "loss": 2.6589, + "theoretical_loss": 3.3632860956986312, + "tokens_seen": 2549342208 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011499498495486461, + "loss": 2.4986, + "theoretical_loss": 3.3632792261610294, + "tokens_seen": 2549407744 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011498495486459379, + "loss": 2.3579, + "theoretical_loss": 3.36327235684946, + "tokens_seen": 2549473280 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011497492477432297, + "loss": 2.4176, + "theoretical_loss": 3.36326548776391, + "tokens_seen": 2549538816 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011496489468405216, + "loss": 2.5083, + "theoretical_loss": 3.363258618904366, + "tokens_seen": 2549604352 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011495486459378135, + "loss": 2.6059, + "theoretical_loss": 3.3632517502708144, + "tokens_seen": 2549669888 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011494483450351054, + "loss": 2.6162, + "theoretical_loss": 3.3632448818632428, + "tokens_seen": 2549735424 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011493480441323972, + "loss": 2.6451, + "theoretical_loss": 3.3632380136816376, + "tokens_seen": 2549800960 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011492477432296892, + "loss": 2.7049, + "theoretical_loss": 3.363231145725985, + "tokens_seen": 2549866496 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001149147442326981, + "loss": 2.4254, + "theoretical_loss": 3.3632242779962724, + "tokens_seen": 2549932032 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011490471414242729, + "loss": 2.4743, + "theoretical_loss": 3.363217410492487, + "tokens_seen": 2549997568 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011489468405215647, + "loss": 2.6699, + "theoretical_loss": 3.363210543214614, + "tokens_seen": 2550063104 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011488465396188565, + "loss": 2.7441, + "theoretical_loss": 3.363203676162642, + "tokens_seen": 2550128640 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011487462387161485, + "loss": 2.5962, + "theoretical_loss": 3.3631968093365563, + "tokens_seen": 2550194176 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011486459378134403, + "loss": 2.6101, + "theoretical_loss": 3.3631899427363443, + "tokens_seen": 2550259712 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011485456369107322, + "loss": 2.5832, + "theoretical_loss": 3.3631830763619925, + "tokens_seen": 2550325248 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001148445336008024, + "loss": 2.6185, + "theoretical_loss": 3.3631762102134886, + "tokens_seen": 2550390784 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001148345035105316, + "loss": 2.5034, + "theoretical_loss": 3.363169344290818, + "tokens_seen": 2550456320 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011482447342026078, + "loss": 2.752, + "theoretical_loss": 3.3631624785939684, + "tokens_seen": 2550521856 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011481444332998998, + "loss": 2.581, + "theoretical_loss": 3.363155613122926, + "tokens_seen": 2550587392 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011480441323971916, + "loss": 2.6622, + "theoretical_loss": 3.3631487478776783, + "tokens_seen": 2550652928 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011479438314944834, + "loss": 2.6886, + "theoretical_loss": 3.363141882858211, + "tokens_seen": 2550718464 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011478435305917753, + "loss": 2.5137, + "theoretical_loss": 3.3631350180645123, + "tokens_seen": 2550784000 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011477432296890671, + "loss": 2.5128, + "theoretical_loss": 3.3631281534965676, + "tokens_seen": 2550849536 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011476429287863591, + "loss": 2.5946, + "theoretical_loss": 3.3631212891543645, + "tokens_seen": 2550915072 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2802994, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.303171157836914, + "objective/train/theoretical_loss": 3.3631144250378897, + "objective/train/tokens_used": 2571440608, + "theoretical_loss": 3.3631144250378897, + "tokens_seen": 2550980608 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011475426278836509, + "loss": 2.3736, + "theoretical_loss": 3.3631144250378897, + "tokens_seen": 2550980608 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011474423269809428, + "loss": 2.5759, + "theoretical_loss": 3.363107561147129, + "tokens_seen": 2551046144 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011473420260782347, + "loss": 2.6846, + "theoretical_loss": 3.363100697482071, + "tokens_seen": 2551111680 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011472417251755266, + "loss": 2.522, + "theoretical_loss": 3.363093834042701, + "tokens_seen": 2551177216 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011471414242728184, + "loss": 2.6438, + "theoretical_loss": 3.363086970829006, + "tokens_seen": 2551242752 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011470411233701104, + "loss": 2.5727, + "theoretical_loss": 3.3630801078409736, + "tokens_seen": 2551308288 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011469408224674022, + "loss": 2.6494, + "theoretical_loss": 3.36307324507859, + "tokens_seen": 2551373824 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001146840521564694, + "loss": 2.5964, + "theoretical_loss": 3.3630663825418416, + "tokens_seen": 2551439360 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001146740220661986, + "loss": 2.5591, + "theoretical_loss": 3.3630595202307156, + "tokens_seen": 2551504896 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011466399197592779, + "loss": 2.3058, + "theoretical_loss": 3.3630526581451985, + "tokens_seen": 2551570432 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011465396188565698, + "loss": 2.7461, + "theoretical_loss": 3.363045796285278, + "tokens_seen": 2551635968 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011464393179538616, + "loss": 2.7201, + "theoretical_loss": 3.36303893465094, + "tokens_seen": 2551701504 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011463390170511536, + "loss": 2.4121, + "theoretical_loss": 3.3630320732421715, + "tokens_seen": 2551767040 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011462387161484454, + "loss": 2.4802, + "theoretical_loss": 3.3630252120589597, + "tokens_seen": 2551832576 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011461384152457373, + "loss": 2.8841, + "theoretical_loss": 3.3630183511012905, + "tokens_seen": 2551898112 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011460381143430292, + "loss": 2.5302, + "theoretical_loss": 3.3630114903691517, + "tokens_seen": 2551963648 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001145937813440321, + "loss": 2.7475, + "theoretical_loss": 3.3630046298625293, + "tokens_seen": 2552029184 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011458375125376129, + "loss": 2.7149, + "theoretical_loss": 3.36299776958141, + "tokens_seen": 2552094720 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011457372116349047, + "loss": 2.5282, + "theoretical_loss": 3.3629909095257817, + "tokens_seen": 2552160256 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011456369107321967, + "loss": 2.5634, + "theoretical_loss": 3.3629840496956307, + "tokens_seen": 2552225792 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011455366098294885, + "loss": 2.7803, + "theoretical_loss": 3.362977190090943, + "tokens_seen": 2552291328 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011454363089267804, + "loss": 2.5504, + "theoretical_loss": 3.3629703307117063, + "tokens_seen": 2552356864 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011453360080240722, + "loss": 2.494, + "theoretical_loss": 3.362963471557907, + "tokens_seen": 2552422400 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011452357071213642, + "loss": 2.6605, + "theoretical_loss": 3.362956612629532, + "tokens_seen": 2552487936 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001145135406218656, + "loss": 2.5506, + "theoretical_loss": 3.3629497539265683, + "tokens_seen": 2552553472 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2804301, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.707094430923462, + "objective/train/theoretical_loss": 3.3629428954490024, + "objective/train/tokens_used": 2573079008, + "theoretical_loss": 3.3629428954490024, + "tokens_seen": 2552619008 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011450351053159478, + "loss": 2.7087, + "theoretical_loss": 3.3629428954490024, + "tokens_seen": 2552619008 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011449348044132398, + "loss": 2.581, + "theoretical_loss": 3.362936037196821, + "tokens_seen": 2552684544 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011448345035105316, + "loss": 2.3685, + "theoretical_loss": 3.3629291791700116, + "tokens_seen": 2552750080 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011447342026078235, + "loss": 2.6635, + "theoretical_loss": 3.36292232136856, + "tokens_seen": 2552815616 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011446339017051153, + "loss": 2.6602, + "theoretical_loss": 3.362915463792454, + "tokens_seen": 2552881152 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011445336008024073, + "loss": 2.533, + "theoretical_loss": 3.36290860644168, + "tokens_seen": 2552946688 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011444332998996991, + "loss": 2.4405, + "theoretical_loss": 3.362901749316224, + "tokens_seen": 2553012224 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001144332998996991, + "loss": 2.2207, + "theoretical_loss": 3.3628948924160746, + "tokens_seen": 2553077760 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011442326980942828, + "loss": 2.7129, + "theoretical_loss": 3.362888035741217, + "tokens_seen": 2553143296 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011441323971915748, + "loss": 2.3864, + "theoretical_loss": 3.362881179291639, + "tokens_seen": 2553208832 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011440320962888666, + "loss": 2.6062, + "theoretical_loss": 3.362874323067327, + "tokens_seen": 2553274368 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011439317953861584, + "loss": 2.3361, + "theoretical_loss": 3.3628674670682672, + "tokens_seen": 2553339904 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011438314944834504, + "loss": 2.5439, + "theoretical_loss": 3.362860611294448, + "tokens_seen": 2553405440 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011437311935807422, + "loss": 2.5891, + "theoretical_loss": 3.3628537557458547, + "tokens_seen": 2553470976 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011436308926780341, + "loss": 2.663, + "theoretical_loss": 3.362846900422475, + "tokens_seen": 2553536512 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011435305917753259, + "loss": 2.3634, + "theoretical_loss": 3.3628400453242953, + "tokens_seen": 2553602048 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011434302908726179, + "loss": 2.4804, + "theoretical_loss": 3.3628331904513025, + "tokens_seen": 2553667584 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011433299899699097, + "loss": 2.5547, + "theoretical_loss": 3.3628263358034833, + "tokens_seen": 2553733120 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011432296890672016, + "loss": 2.5329, + "theoretical_loss": 3.362819481380825, + "tokens_seen": 2553798656 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011431293881644934, + "loss": 2.8371, + "theoretical_loss": 3.362812627183314, + "tokens_seen": 2553864192 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011430290872617853, + "loss": 2.6314, + "theoretical_loss": 3.3628057732109373, + "tokens_seen": 2553929728 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011429287863590773, + "loss": 2.6661, + "theoretical_loss": 3.362798919463682, + "tokens_seen": 2553995264 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011428284854563691, + "loss": 2.4408, + "theoretical_loss": 3.3627920659415342, + "tokens_seen": 2554060800 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011427281845536611, + "loss": 2.6086, + "theoretical_loss": 3.3627852126444813, + "tokens_seen": 2554126336 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011426278836509529, + "loss": 2.5754, + "theoretical_loss": 3.36277835957251, + "tokens_seen": 2554191872 + }, + { + "epoch": 8.05, + "objective/train/docs_used": 2804949, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7353971004486084, + "objective/train/theoretical_loss": 3.3627715067256068, + "objective/train/tokens_used": 2574717408, + "theoretical_loss": 3.3627715067256068, + "tokens_seen": 2554257408 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011425275827482448, + "loss": 2.6155, + "theoretical_loss": 3.3627715067256068, + "tokens_seen": 2554257408 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011424272818455367, + "loss": 2.501, + "theoretical_loss": 3.3627646541037595, + "tokens_seen": 2554322944 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011423269809428286, + "loss": 2.4034, + "theoretical_loss": 3.3627578017069535, + "tokens_seen": 2554388480 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011422266800401204, + "loss": 2.4, + "theoretical_loss": 3.362750949535177, + "tokens_seen": 2554454016 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011421263791374124, + "loss": 2.6543, + "theoretical_loss": 3.362744097588416, + "tokens_seen": 2554519552 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011420260782347042, + "loss": 2.3511, + "theoretical_loss": 3.362737245866658, + "tokens_seen": 2554585088 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001141925777331996, + "loss": 2.6511, + "theoretical_loss": 3.3627303943698887, + "tokens_seen": 2554650624 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001141825476429288, + "loss": 2.5368, + "theoretical_loss": 3.362723543098096, + "tokens_seen": 2554716160 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011417251755265797, + "loss": 2.3873, + "theoretical_loss": 3.3627166920512663, + "tokens_seen": 2554781696 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011416248746238717, + "loss": 2.6894, + "theoretical_loss": 3.3627098412293868, + "tokens_seen": 2554847232 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011415245737211635, + "loss": 2.7074, + "theoretical_loss": 3.362702990632444, + "tokens_seen": 2554912768 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011414242728184554, + "loss": 2.5477, + "theoretical_loss": 3.362696140260425, + "tokens_seen": 2554978304 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011413239719157473, + "loss": 2.3587, + "theoretical_loss": 3.362689290113316, + "tokens_seen": 2555043840 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011412236710130392, + "loss": 2.5561, + "theoretical_loss": 3.362682440191105, + "tokens_seen": 2555109376 + }, + { + "epoch": 8.05, + "learning_rate": 0.0001141123370110331, + "loss": 2.6673, + "theoretical_loss": 3.3626755904937777, + "tokens_seen": 2555174912 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011410230692076228, + "loss": 2.5823, + "theoretical_loss": 3.362668741021322, + "tokens_seen": 2555240448 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011409227683049148, + "loss": 2.3544, + "theoretical_loss": 3.3626618917737234, + "tokens_seen": 2555305984 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011408224674022066, + "loss": 2.6842, + "theoretical_loss": 3.36265504275097, + "tokens_seen": 2555371520 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011407221664994985, + "loss": 2.6255, + "theoretical_loss": 3.3626481939530484, + "tokens_seen": 2555437056 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011406218655967903, + "loss": 2.5142, + "theoretical_loss": 3.3626413453799446, + "tokens_seen": 2555502592 + }, + { + "epoch": 8.05, + "learning_rate": 0.00011405215646940823, + "loss": 2.5177, + "theoretical_loss": 3.3626344970316464, + "tokens_seen": 2555568128 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011404212637913741, + "loss": 2.6788, + "theoretical_loss": 3.3626276489081404, + "tokens_seen": 2555633664 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001140320962888666, + "loss": 2.6311, + "theoretical_loss": 3.3626208010094136, + "tokens_seen": 2555699200 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011402206619859579, + "loss": 2.6484, + "theoretical_loss": 3.3626139533354524, + "tokens_seen": 2555764736 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011401203610832497, + "loss": 2.5779, + "theoretical_loss": 3.362607105886244, + "tokens_seen": 2555830272 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2806260, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5311129093170166, + "objective/train/theoretical_loss": 3.362600258661775, + "objective/train/tokens_used": 2576355808, + "theoretical_loss": 3.362600258661775, + "tokens_seen": 2555895808 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011400200601805416, + "loss": 2.5406, + "theoretical_loss": 3.362600258661775, + "tokens_seen": 2555895808 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011399197592778334, + "loss": 2.6036, + "theoretical_loss": 3.362593411662033, + "tokens_seen": 2555961344 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011398194583751254, + "loss": 2.5124, + "theoretical_loss": 3.3625865648870037, + "tokens_seen": 2556026880 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011397191574724172, + "loss": 2.4401, + "theoretical_loss": 3.362579718336675, + "tokens_seen": 2556092416 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011396188565697091, + "loss": 2.686, + "theoretical_loss": 3.3625728720110333, + "tokens_seen": 2556157952 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001139518555667001, + "loss": 2.3083, + "theoretical_loss": 3.3625660259100654, + "tokens_seen": 2556223488 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011394182547642929, + "loss": 2.6136, + "theoretical_loss": 3.362559180033758, + "tokens_seen": 2556289024 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011393179538615847, + "loss": 2.503, + "theoretical_loss": 3.3625523343820984, + "tokens_seen": 2556354560 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011392176529588768, + "loss": 2.4119, + "theoretical_loss": 3.3625454889550737, + "tokens_seen": 2556420096 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011391173520561686, + "loss": 2.4963, + "theoretical_loss": 3.36253864375267, + "tokens_seen": 2556485632 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011390170511534604, + "loss": 2.5749, + "theoretical_loss": 3.362531798774875, + "tokens_seen": 2556551168 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011389167502507524, + "loss": 2.6237, + "theoretical_loss": 3.3625249540216746, + "tokens_seen": 2556616704 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011388164493480442, + "loss": 2.6486, + "theoretical_loss": 3.362518109493056, + "tokens_seen": 2556682240 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011387161484453361, + "loss": 2.5097, + "theoretical_loss": 3.3625112651890072, + "tokens_seen": 2556747776 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011386158475426279, + "loss": 2.687, + "theoretical_loss": 3.3625044211095134, + "tokens_seen": 2556813312 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011385155466399199, + "loss": 2.6076, + "theoretical_loss": 3.3624975772545627, + "tokens_seen": 2556878848 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011384152457372117, + "loss": 2.4914, + "theoretical_loss": 3.362490733624141, + "tokens_seen": 2556944384 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011383149448345036, + "loss": 2.678, + "theoretical_loss": 3.362483890218236, + "tokens_seen": 2557009920 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011382146439317954, + "loss": 2.2374, + "theoretical_loss": 3.362477047036834, + "tokens_seen": 2557075456 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011381143430290873, + "loss": 2.6866, + "theoretical_loss": 3.3624702040799224, + "tokens_seen": 2557140992 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011380140421263792, + "loss": 2.5363, + "theoretical_loss": 3.362463361347488, + "tokens_seen": 2557206528 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001137913741223671, + "loss": 2.4977, + "theoretical_loss": 3.362456518839517, + "tokens_seen": 2557272064 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001137813440320963, + "loss": 2.3876, + "theoretical_loss": 3.3624496765559972, + "tokens_seen": 2557337600 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011377131394182548, + "loss": 2.7388, + "theoretical_loss": 3.362442834496915, + "tokens_seen": 2557403136 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011376128385155467, + "loss": 2.5455, + "theoretical_loss": 3.3624359926622573, + "tokens_seen": 2557468672 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2806749, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.273900270462036, + "objective/train/theoretical_loss": 3.362429151052011, + "objective/train/tokens_used": 2577994208, + "theoretical_loss": 3.362429151052011, + "tokens_seen": 2557534208 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011375125376128385, + "loss": 2.5048, + "theoretical_loss": 3.362429151052011, + "tokens_seen": 2557534208 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011374122367101305, + "loss": 2.452, + "theoretical_loss": 3.3624223096661634, + "tokens_seen": 2557599744 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011373119358074223, + "loss": 2.5467, + "theoretical_loss": 3.3624154685047007, + "tokens_seen": 2557665280 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011372116349047141, + "loss": 2.5092, + "theoretical_loss": 3.36240862756761, + "tokens_seen": 2557730816 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001137111334002006, + "loss": 2.5869, + "theoretical_loss": 3.362401786854879, + "tokens_seen": 2557796352 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011370110330992979, + "loss": 2.2979, + "theoretical_loss": 3.3623949463664933, + "tokens_seen": 2557861888 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011369107321965898, + "loss": 2.7113, + "theoretical_loss": 3.3623881061024408, + "tokens_seen": 2557927424 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011368104312938816, + "loss": 2.452, + "theoretical_loss": 3.3623812660627075, + "tokens_seen": 2557992960 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011367101303911736, + "loss": 2.6728, + "theoretical_loss": 3.362374426247281, + "tokens_seen": 2558058496 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011366098294884654, + "loss": 2.6557, + "theoretical_loss": 3.3623675866561484, + "tokens_seen": 2558124032 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011365095285857573, + "loss": 2.7306, + "theoretical_loss": 3.362360747289296, + "tokens_seen": 2558189568 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011364092276830491, + "loss": 2.4954, + "theoretical_loss": 3.3623539081467104, + "tokens_seen": 2558255104 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011363089267803411, + "loss": 2.4815, + "theoretical_loss": 3.3623470692283797, + "tokens_seen": 2558320640 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011362086258776329, + "loss": 2.5425, + "theoretical_loss": 3.3623402305342895, + "tokens_seen": 2558386176 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011361083249749247, + "loss": 2.3376, + "theoretical_loss": 3.3623333920644276, + "tokens_seen": 2558451712 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011360080240722166, + "loss": 2.4108, + "theoretical_loss": 3.3623265538187805, + "tokens_seen": 2558517248 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011359077231695085, + "loss": 2.6575, + "theoretical_loss": 3.3623197157973355, + "tokens_seen": 2558582784 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011358074222668004, + "loss": 2.4938, + "theoretical_loss": 3.362312878000079, + "tokens_seen": 2558648320 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011357071213640922, + "loss": 2.6553, + "theoretical_loss": 3.3623060404269984, + "tokens_seen": 2558713856 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011356068204613842, + "loss": 2.6206, + "theoretical_loss": 3.36229920307808, + "tokens_seen": 2558779392 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011355065195586761, + "loss": 2.5572, + "theoretical_loss": 3.3622923659533113, + "tokens_seen": 2558844928 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001135406218655968, + "loss": 2.8755, + "theoretical_loss": 3.362285529052679, + "tokens_seen": 2558910464 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011353059177532599, + "loss": 2.4233, + "theoretical_loss": 3.3622786923761696, + "tokens_seen": 2558976000 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011352056168505517, + "loss": 2.4299, + "theoretical_loss": 3.3622718559237708, + "tokens_seen": 2559041536 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011351053159478436, + "loss": 2.6148, + "theoretical_loss": 3.3622650196954686, + "tokens_seen": 2559107072 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2808385, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.316877841949463, + "objective/train/theoretical_loss": 3.362258183691251, + "objective/train/tokens_used": 2579632608, + "theoretical_loss": 3.362258183691251, + "tokens_seen": 2559172608 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011350050150451354, + "loss": 2.4742, + "theoretical_loss": 3.362258183691251, + "tokens_seen": 2559172608 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011349047141424274, + "loss": 2.4899, + "theoretical_loss": 3.362251347911104, + "tokens_seen": 2559238144 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011348044132397192, + "loss": 2.5617, + "theoretical_loss": 3.362244512355015, + "tokens_seen": 2559303680 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011347041123370111, + "loss": 2.3563, + "theoretical_loss": 3.3622376770229705, + "tokens_seen": 2559369216 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001134603811434303, + "loss": 2.5335, + "theoretical_loss": 3.362230841914958, + "tokens_seen": 2559434752 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011345035105315949, + "loss": 2.3638, + "theoretical_loss": 3.3622240070309637, + "tokens_seen": 2559500288 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011344032096288867, + "loss": 2.4593, + "theoretical_loss": 3.3622171723709755, + "tokens_seen": 2559565824 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011343029087261785, + "loss": 2.5511, + "theoretical_loss": 3.3622103379349793, + "tokens_seen": 2559631360 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011342026078234705, + "loss": 2.6465, + "theoretical_loss": 3.362203503722963, + "tokens_seen": 2559696896 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011341023069207623, + "loss": 2.4969, + "theoretical_loss": 3.3621966697349124, + "tokens_seen": 2559762432 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011340020060180542, + "loss": 2.3665, + "theoretical_loss": 3.362189835970815, + "tokens_seen": 2559827968 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001133901705115346, + "loss": 2.5572, + "theoretical_loss": 3.3621830024306583, + "tokens_seen": 2559893504 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001133801404212638, + "loss": 2.6499, + "theoretical_loss": 3.3621761691144285, + "tokens_seen": 2559959040 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011337011033099298, + "loss": 2.7598, + "theoretical_loss": 3.362169336022113, + "tokens_seen": 2560024576 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011336008024072217, + "loss": 2.5327, + "theoretical_loss": 3.362162503153698, + "tokens_seen": 2560090112 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011335005015045135, + "loss": 2.4852, + "theoretical_loss": 3.362155670509171, + "tokens_seen": 2560155648 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011334002006018055, + "loss": 2.4979, + "theoretical_loss": 3.3621488380885185, + "tokens_seen": 2560221184 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011332998996990973, + "loss": 2.5552, + "theoretical_loss": 3.3621420058917284, + "tokens_seen": 2560286720 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011331995987963891, + "loss": 2.3228, + "theoretical_loss": 3.3621351739187864, + "tokens_seen": 2560352256 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001133099297893681, + "loss": 2.4859, + "theoretical_loss": 3.3621283421696804, + "tokens_seen": 2560417792 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011329989969909729, + "loss": 2.3347, + "theoretical_loss": 3.3621215106443967, + "tokens_seen": 2560483328 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011328986960882648, + "loss": 2.4795, + "theoretical_loss": 3.362114679342923, + "tokens_seen": 2560548864 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011327983951855566, + "loss": 2.3966, + "theoretical_loss": 3.362107848265245, + "tokens_seen": 2560614400 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011326980942828486, + "loss": 2.6723, + "theoretical_loss": 3.362101017411351, + "tokens_seen": 2560679936 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011325977933801404, + "loss": 2.7166, + "theoretical_loss": 3.362094186781227, + "tokens_seen": 2560745472 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2811551, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.239719867706299, + "objective/train/theoretical_loss": 3.3620873563748606, + "objective/train/tokens_used": 2581271008, + "theoretical_loss": 3.3620873563748606, + "tokens_seen": 2560811008 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011324974924774323, + "loss": 2.5558, + "theoretical_loss": 3.3620873563748606, + "tokens_seen": 2560811008 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011323971915747241, + "loss": 2.6255, + "theoretical_loss": 3.362080526192238, + "tokens_seen": 2560876544 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001132296890672016, + "loss": 2.5521, + "theoretical_loss": 3.3620736962333466, + "tokens_seen": 2560942080 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011321965897693079, + "loss": 2.6057, + "theoretical_loss": 3.362066866498173, + "tokens_seen": 2561007616 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011320962888665997, + "loss": 2.4583, + "theoretical_loss": 3.3620600369867053, + "tokens_seen": 2561073152 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011319959879638917, + "loss": 2.734, + "theoretical_loss": 3.362053207698929, + "tokens_seen": 2561138688 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011318956870611835, + "loss": 2.4375, + "theoretical_loss": 3.3620463786348322, + "tokens_seen": 2561204224 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011317953861584754, + "loss": 2.6344, + "theoretical_loss": 3.3620395497944005, + "tokens_seen": 2561269760 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011316950852557674, + "loss": 2.5828, + "theoretical_loss": 3.362032721177622, + "tokens_seen": 2561335296 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011315947843530593, + "loss": 2.6644, + "theoretical_loss": 3.3620258927844837, + "tokens_seen": 2561400832 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011314944834503511, + "loss": 2.7396, + "theoretical_loss": 3.3620190646149717, + "tokens_seen": 2561466368 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011313941825476431, + "loss": 2.5759, + "theoretical_loss": 3.3620122366690737, + "tokens_seen": 2561531904 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011312938816449349, + "loss": 2.4713, + "theoretical_loss": 3.362005408946776, + "tokens_seen": 2561597440 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011311935807422267, + "loss": 2.613, + "theoretical_loss": 3.361998581448066, + "tokens_seen": 2561662976 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011310932798395186, + "loss": 2.1864, + "theoretical_loss": 3.3619917541729305, + "tokens_seen": 2561728512 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011309929789368105, + "loss": 2.5013, + "theoretical_loss": 3.361984927121357, + "tokens_seen": 2561794048 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011308926780341024, + "loss": 2.4924, + "theoretical_loss": 3.3619781002933316, + "tokens_seen": 2561859584 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011307923771313942, + "loss": 2.5888, + "theoretical_loss": 3.361971273688842, + "tokens_seen": 2561925120 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011306920762286862, + "loss": 2.4931, + "theoretical_loss": 3.3619644473078742, + "tokens_seen": 2561990656 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001130591775325978, + "loss": 2.6232, + "theoretical_loss": 3.3619576211504163, + "tokens_seen": 2562056192 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011304914744232699, + "loss": 2.5632, + "theoretical_loss": 3.3619507952164547, + "tokens_seen": 2562121728 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011303911735205617, + "loss": 2.5787, + "theoretical_loss": 3.361943969505976, + "tokens_seen": 2562187264 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011302908726178535, + "loss": 2.5933, + "theoretical_loss": 3.3619371440189685, + "tokens_seen": 2562252800 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011301905717151455, + "loss": 2.436, + "theoretical_loss": 3.3619303187554177, + "tokens_seen": 2562318336 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011300902708124373, + "loss": 2.453, + "theoretical_loss": 3.361923493715311, + "tokens_seen": 2562383872 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2816507, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9152402877807617, + "objective/train/theoretical_loss": 3.3619166688986355, + "objective/train/tokens_used": 2582909408, + "theoretical_loss": 3.3619166688986355, + "tokens_seen": 2562449408 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011299899699097292, + "loss": 2.5588, + "theoretical_loss": 3.3619166688986355, + "tokens_seen": 2562449408 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001129889669007021, + "loss": 2.5948, + "theoretical_loss": 3.3619098443053783, + "tokens_seen": 2562514944 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001129789368104313, + "loss": 2.5438, + "theoretical_loss": 3.3619030199355264, + "tokens_seen": 2562580480 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011296890672016048, + "loss": 2.5332, + "theoretical_loss": 3.361896195789066, + "tokens_seen": 2562646016 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011295887662988968, + "loss": 2.5589, + "theoretical_loss": 3.3618893718659857, + "tokens_seen": 2562711552 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011294884653961886, + "loss": 2.3547, + "theoretical_loss": 3.3618825481662706, + "tokens_seen": 2562777088 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011293881644934804, + "loss": 2.4546, + "theoretical_loss": 3.3618757246899085, + "tokens_seen": 2562842624 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011292878635907723, + "loss": 2.537, + "theoretical_loss": 3.361868901436887, + "tokens_seen": 2562908160 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011291875626880641, + "loss": 2.6082, + "theoretical_loss": 3.361862078407192, + "tokens_seen": 2562973696 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011290872617853561, + "loss": 2.5459, + "theoretical_loss": 3.3618552556008114, + "tokens_seen": 2563039232 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011289869608826479, + "loss": 2.5107, + "theoretical_loss": 3.3618484330177316, + "tokens_seen": 2563104768 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011288866599799398, + "loss": 2.3747, + "theoretical_loss": 3.36184161065794, + "tokens_seen": 2563170304 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011287863590772317, + "loss": 2.6296, + "theoretical_loss": 3.361834788521423, + "tokens_seen": 2563235840 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011286860581745236, + "loss": 2.7174, + "theoretical_loss": 3.3618279666081676, + "tokens_seen": 2563301376 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011285857572718154, + "loss": 2.406, + "theoretical_loss": 3.361821144918162, + "tokens_seen": 2563366912 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011284854563691074, + "loss": 2.5889, + "theoretical_loss": 3.3618143234513913, + "tokens_seen": 2563432448 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011283851554663992, + "loss": 2.518, + "theoretical_loss": 3.361807502207844, + "tokens_seen": 2563497984 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001128284854563691, + "loss": 2.7038, + "theoretical_loss": 3.361800681187506, + "tokens_seen": 2563563520 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011281845536609829, + "loss": 2.7628, + "theoretical_loss": 3.3617938603903657, + "tokens_seen": 2563629056 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011280842527582747, + "loss": 2.4642, + "theoretical_loss": 3.361787039816409, + "tokens_seen": 2563694592 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011279839518555668, + "loss": 2.5134, + "theoretical_loss": 3.3617802194656226, + "tokens_seen": 2563760128 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011278836509528586, + "loss": 2.4299, + "theoretical_loss": 3.3617733993379946, + "tokens_seen": 2563825664 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011277833500501506, + "loss": 2.5536, + "theoretical_loss": 3.361766579433511, + "tokens_seen": 2563891200 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011276830491474424, + "loss": 2.4308, + "theoretical_loss": 3.3617597597521596, + "tokens_seen": 2563956736 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011275827482447343, + "loss": 2.5091, + "theoretical_loss": 3.361752940293927, + "tokens_seen": 2564022272 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2821399, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2054381370544434, + "objective/train/theoretical_loss": 3.3617461210587996, + "objective/train/tokens_used": 2584547808, + "theoretical_loss": 3.3617461210587996, + "tokens_seen": 2564087808 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011274824473420262, + "loss": 2.2556, + "theoretical_loss": 3.3617461210587996, + "tokens_seen": 2564087808 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001127382146439318, + "loss": 2.4584, + "theoretical_loss": 3.3617393020467654, + "tokens_seen": 2564153344 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011272818455366099, + "loss": 2.5388, + "theoretical_loss": 3.3617324832578115, + "tokens_seen": 2564218880 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011271815446339017, + "loss": 2.5454, + "theoretical_loss": 3.3617256646919236, + "tokens_seen": 2564284416 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011270812437311937, + "loss": 2.4538, + "theoretical_loss": 3.36171884634909, + "tokens_seen": 2564349952 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011269809428284855, + "loss": 2.3516, + "theoretical_loss": 3.361712028229297, + "tokens_seen": 2564415488 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011268806419257774, + "loss": 2.5826, + "theoretical_loss": 3.361705210332532, + "tokens_seen": 2564481024 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011267803410230692, + "loss": 2.4975, + "theoretical_loss": 3.3616983926587816, + "tokens_seen": 2564546560 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011266800401203612, + "loss": 2.7078, + "theoretical_loss": 3.361691575208033, + "tokens_seen": 2564612096 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001126579739217653, + "loss": 2.4363, + "theoretical_loss": 3.3616847579802736, + "tokens_seen": 2564677632 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011264794383149448, + "loss": 2.5299, + "theoretical_loss": 3.3616779409754898, + "tokens_seen": 2564743168 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011263791374122368, + "loss": 2.7004, + "theoretical_loss": 3.361671124193669, + "tokens_seen": 2564808704 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011262788365095286, + "loss": 2.4403, + "theoretical_loss": 3.361664307634798, + "tokens_seen": 2564874240 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011261785356068205, + "loss": 2.7355, + "theoretical_loss": 3.361657491298864, + "tokens_seen": 2564939776 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011260782347041123, + "loss": 2.7038, + "theoretical_loss": 3.3616506751858535, + "tokens_seen": 2565005312 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011259779338014043, + "loss": 2.3284, + "theoretical_loss": 3.361643859295754, + "tokens_seen": 2565070848 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011258776328986961, + "loss": 2.47, + "theoretical_loss": 3.3616370436285523, + "tokens_seen": 2565136384 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001125777331995988, + "loss": 2.5717, + "theoretical_loss": 3.361630228184236, + "tokens_seen": 2565201920 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011256770310932798, + "loss": 2.6967, + "theoretical_loss": 3.3616234129627913, + "tokens_seen": 2565267456 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011255767301905718, + "loss": 2.6463, + "theoretical_loss": 3.361616597964206, + "tokens_seen": 2565332992 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011254764292878636, + "loss": 2.7396, + "theoretical_loss": 3.361609783188466, + "tokens_seen": 2565398528 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011253761283851554, + "loss": 2.6046, + "theoretical_loss": 3.3616029686355597, + "tokens_seen": 2565464064 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011252758274824474, + "loss": 2.5281, + "theoretical_loss": 3.361596154305473, + "tokens_seen": 2565529600 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011251755265797392, + "loss": 2.5815, + "theoretical_loss": 3.3615893401981936, + "tokens_seen": 2565595136 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011250752256770311, + "loss": 2.6001, + "theoretical_loss": 3.361582526313708, + "tokens_seen": 2565660672 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2826454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4855923652648926, + "objective/train/theoretical_loss": 3.361575712652004, + "objective/train/tokens_used": 2586186208, + "theoretical_loss": 3.361575712652004, + "tokens_seen": 2565726208 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011249749247743229, + "loss": 2.6082, + "theoretical_loss": 3.361575712652004, + "tokens_seen": 2565726208 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011248746238716149, + "loss": 2.5596, + "theoretical_loss": 3.3615688992130677, + "tokens_seen": 2565791744 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011247743229689067, + "loss": 2.4259, + "theoretical_loss": 3.3615620859968867, + "tokens_seen": 2565857280 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011246740220661986, + "loss": 2.7191, + "theoretical_loss": 3.3615552730034475, + "tokens_seen": 2565922816 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011245737211634904, + "loss": 2.4009, + "theoretical_loss": 3.361548460232738, + "tokens_seen": 2565988352 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011244734202607822, + "loss": 2.7177, + "theoretical_loss": 3.361541647684745, + "tokens_seen": 2566053888 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011243731193580742, + "loss": 2.5187, + "theoretical_loss": 3.3615348353594543, + "tokens_seen": 2566119424 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001124272818455366, + "loss": 2.5114, + "theoretical_loss": 3.361528023256855, + "tokens_seen": 2566184960 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011241725175526581, + "loss": 2.5309, + "theoretical_loss": 3.361521211376932, + "tokens_seen": 2566250496 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011240722166499499, + "loss": 2.5817, + "theoretical_loss": 3.361514399719674, + "tokens_seen": 2566316032 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011239719157472418, + "loss": 2.5499, + "theoretical_loss": 3.3615075882850673, + "tokens_seen": 2566381568 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011238716148445337, + "loss": 2.5775, + "theoretical_loss": 3.361500777073099, + "tokens_seen": 2566447104 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011237713139418256, + "loss": 2.5749, + "theoretical_loss": 3.3614939660837564, + "tokens_seen": 2566512640 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011236710130391174, + "loss": 2.5006, + "theoretical_loss": 3.3614871553170262, + "tokens_seen": 2566578176 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011235707121364094, + "loss": 2.4846, + "theoretical_loss": 3.3614803447728954, + "tokens_seen": 2566643712 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011234704112337012, + "loss": 2.7172, + "theoretical_loss": 3.3614735344513513, + "tokens_seen": 2566709248 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001123370110330993, + "loss": 2.647, + "theoretical_loss": 3.361466724352381, + "tokens_seen": 2566774784 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011232698094282849, + "loss": 2.6782, + "theoretical_loss": 3.3614599144759714, + "tokens_seen": 2566840320 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011231695085255767, + "loss": 2.4884, + "theoretical_loss": 3.3614531048221092, + "tokens_seen": 2566905856 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011230692076228687, + "loss": 2.5593, + "theoretical_loss": 3.361446295390782, + "tokens_seen": 2566971392 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011229689067201605, + "loss": 2.6059, + "theoretical_loss": 3.3614394861819767, + "tokens_seen": 2567036928 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011228686058174524, + "loss": 2.4943, + "theoretical_loss": 3.36143267719568, + "tokens_seen": 2567102464 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011227683049147443, + "loss": 2.7441, + "theoretical_loss": 3.3614258684318794, + "tokens_seen": 2567168000 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011226680040120362, + "loss": 2.668, + "theoretical_loss": 3.361419059890562, + "tokens_seen": 2567233536 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001122567703109328, + "loss": 2.5255, + "theoretical_loss": 3.361412251571714, + "tokens_seen": 2567299072 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2831537, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.114457130432129, + "objective/train/theoretical_loss": 3.361405443475324, + "objective/train/tokens_used": 2587824608, + "theoretical_loss": 3.361405443475324, + "tokens_seen": 2567364608 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011224674022066198, + "loss": 2.4566, + "theoretical_loss": 3.361405443475324, + "tokens_seen": 2567364608 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011223671013039118, + "loss": 2.6105, + "theoretical_loss": 3.3613986356013776, + "tokens_seen": 2567430144 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011222668004012036, + "loss": 2.505, + "theoretical_loss": 3.3613918279498622, + "tokens_seen": 2567495680 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011221664994984955, + "loss": 2.4771, + "theoretical_loss": 3.3613850205207654, + "tokens_seen": 2567561216 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011220661985957873, + "loss": 2.5814, + "theoretical_loss": 3.3613782133140737, + "tokens_seen": 2567626752 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011219658976930793, + "loss": 2.4767, + "theoretical_loss": 3.3613714063297744, + "tokens_seen": 2567692288 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011218655967903711, + "loss": 2.5089, + "theoretical_loss": 3.3613645995678545, + "tokens_seen": 2567757824 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001121765295887663, + "loss": 2.6957, + "theoretical_loss": 3.361357793028301, + "tokens_seen": 2567823360 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011216649949849549, + "loss": 2.7401, + "theoretical_loss": 3.361350986711101, + "tokens_seen": 2567888896 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011215646940822467, + "loss": 2.5512, + "theoretical_loss": 3.361344180616242, + "tokens_seen": 2567954432 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011214643931795386, + "loss": 2.4724, + "theoretical_loss": 3.3613373747437105, + "tokens_seen": 2568019968 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011213640922768304, + "loss": 2.6622, + "theoretical_loss": 3.3613305690934934, + "tokens_seen": 2568085504 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011212637913741224, + "loss": 2.665, + "theoretical_loss": 3.3613237636655784, + "tokens_seen": 2568151040 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011211634904714142, + "loss": 2.4625, + "theoretical_loss": 3.361316958459952, + "tokens_seen": 2568216576 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011210631895687061, + "loss": 2.724, + "theoretical_loss": 3.3613101534766017, + "tokens_seen": 2568282112 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001120962888665998, + "loss": 2.5968, + "theoretical_loss": 3.361303348715514, + "tokens_seen": 2568347648 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011208625877632899, + "loss": 2.5045, + "theoretical_loss": 3.361296544176677, + "tokens_seen": 2568413184 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011207622868605817, + "loss": 2.4884, + "theoretical_loss": 3.361289739860077, + "tokens_seen": 2568478720 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011206619859578735, + "loss": 2.5462, + "theoretical_loss": 3.361282935765701, + "tokens_seen": 2568544256 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011205616850551655, + "loss": 2.5896, + "theoretical_loss": 3.3612761318935362, + "tokens_seen": 2568609792 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011204613841524574, + "loss": 2.795, + "theoretical_loss": 3.3612693282435697, + "tokens_seen": 2568675328 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011203610832497494, + "loss": 2.604, + "theoretical_loss": 3.361262524815789, + "tokens_seen": 2568740864 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011202607823470412, + "loss": 2.6218, + "theoretical_loss": 3.3612557216101804, + "tokens_seen": 2568806400 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011201604814443331, + "loss": 2.5533, + "theoretical_loss": 3.361248918626732, + "tokens_seen": 2568871936 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011200601805416249, + "loss": 2.4815, + "theoretical_loss": 3.361242115865429, + "tokens_seen": 2568937472 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2834478, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.814790964126587, + "objective/train/theoretical_loss": 3.3612353133262607, + "objective/train/tokens_used": 2589463008, + "theoretical_loss": 3.3612353133262607, + "tokens_seen": 2569003008 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011199598796389169, + "loss": 2.7027, + "theoretical_loss": 3.3612353133262607, + "tokens_seen": 2569003008 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011198595787362087, + "loss": 2.5283, + "theoretical_loss": 3.3612285110092133, + "tokens_seen": 2569068544 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011197592778335006, + "loss": 2.646, + "theoretical_loss": 3.3612217089142735, + "tokens_seen": 2569134080 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011196589769307924, + "loss": 2.5572, + "theoretical_loss": 3.3612149070414286, + "tokens_seen": 2569199616 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011195586760280843, + "loss": 2.5438, + "theoretical_loss": 3.361208105390666, + "tokens_seen": 2569265152 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011194583751253762, + "loss": 2.6028, + "theoretical_loss": 3.3612013039619724, + "tokens_seen": 2569330688 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001119358074222668, + "loss": 2.4003, + "theoretical_loss": 3.361194502755335, + "tokens_seen": 2569396224 + }, + { + "epoch": 8.06, + "learning_rate": 0.000111925777331996, + "loss": 2.5604, + "theoretical_loss": 3.3611877017707408, + "tokens_seen": 2569461760 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011191574724172518, + "loss": 2.5814, + "theoretical_loss": 3.361180901008177, + "tokens_seen": 2569527296 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011190571715145437, + "loss": 2.3991, + "theoretical_loss": 3.3611741004676308, + "tokens_seen": 2569592832 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011189568706118355, + "loss": 2.7249, + "theoretical_loss": 3.361167300149089, + "tokens_seen": 2569658368 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011188565697091275, + "loss": 2.485, + "theoretical_loss": 3.3611605000525393, + "tokens_seen": 2569723904 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011187562688064193, + "loss": 2.5529, + "theoretical_loss": 3.3611537001779683, + "tokens_seen": 2569789440 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011186559679037111, + "loss": 2.7568, + "theoretical_loss": 3.361146900525363, + "tokens_seen": 2569854976 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001118555667001003, + "loss": 2.5828, + "theoretical_loss": 3.3611401010947106, + "tokens_seen": 2569920512 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011184553660982949, + "loss": 2.5781, + "theoretical_loss": 3.361133301885998, + "tokens_seen": 2569986048 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011183550651955868, + "loss": 2.6768, + "theoretical_loss": 3.361126502899213, + "tokens_seen": 2570051584 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011182547642928786, + "loss": 2.607, + "theoretical_loss": 3.361119704134342, + "tokens_seen": 2570117120 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011181544633901706, + "loss": 2.5213, + "theoretical_loss": 3.3611129055913724, + "tokens_seen": 2570182656 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011180541624874624, + "loss": 2.7024, + "theoretical_loss": 3.3611061072702912, + "tokens_seen": 2570248192 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011179538615847543, + "loss": 2.5288, + "theoretical_loss": 3.3610993091710855, + "tokens_seen": 2570313728 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011178535606820461, + "loss": 2.4934, + "theoretical_loss": 3.3610925112937426, + "tokens_seen": 2570379264 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011177532597793381, + "loss": 2.6791, + "theoretical_loss": 3.3610857136382495, + "tokens_seen": 2570444800 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011176529588766299, + "loss": 2.4478, + "theoretical_loss": 3.361078916204593, + "tokens_seen": 2570510336 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011175526579739217, + "loss": 2.5317, + "theoretical_loss": 3.3610721189927606, + "tokens_seen": 2570575872 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2835159, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.692937135696411, + "objective/train/theoretical_loss": 3.361065322002739, + "objective/train/tokens_used": 2591101408, + "theoretical_loss": 3.361065322002739, + "tokens_seen": 2570641408 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011174523570712136, + "loss": 2.6119, + "theoretical_loss": 3.361065322002739, + "tokens_seen": 2570641408 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011173520561685055, + "loss": 2.5497, + "theoretical_loss": 3.361058525234516, + "tokens_seen": 2570706944 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011172517552657974, + "loss": 2.5, + "theoretical_loss": 3.361051728688078, + "tokens_seen": 2570772480 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011171514543630892, + "loss": 2.4514, + "theoretical_loss": 3.3610449323634124, + "tokens_seen": 2570838016 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011170511534603812, + "loss": 2.3477, + "theoretical_loss": 3.3610381362605066, + "tokens_seen": 2570903552 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001116950852557673, + "loss": 2.5268, + "theoretical_loss": 3.361031340379347, + "tokens_seen": 2570969088 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011168505516549649, + "loss": 2.6587, + "theoretical_loss": 3.361024544719921, + "tokens_seen": 2571034624 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011167502507522567, + "loss": 2.6771, + "theoretical_loss": 3.361017749282216, + "tokens_seen": 2571100160 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011166499498495487, + "loss": 2.4119, + "theoretical_loss": 3.361010954066219, + "tokens_seen": 2571165696 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011165496489468406, + "loss": 2.678, + "theoretical_loss": 3.3610041590719173, + "tokens_seen": 2571231232 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011164493480441324, + "loss": 2.6698, + "theoretical_loss": 3.3609973642992976, + "tokens_seen": 2571296768 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011163490471414244, + "loss": 2.4074, + "theoretical_loss": 3.3609905697483473, + "tokens_seen": 2571362304 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011162487462387162, + "loss": 2.5722, + "theoretical_loss": 3.360983775419053, + "tokens_seen": 2571427840 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011161484453360081, + "loss": 2.5113, + "theoretical_loss": 3.3609769813114023, + "tokens_seen": 2571493376 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011160481444333, + "loss": 2.5556, + "theoretical_loss": 3.360970187425383, + "tokens_seen": 2571558912 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011159478435305919, + "loss": 2.635, + "theoretical_loss": 3.3609633937609806, + "tokens_seen": 2571624448 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011158475426278837, + "loss": 2.4361, + "theoretical_loss": 3.3609566003181834, + "tokens_seen": 2571689984 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011157472417251755, + "loss": 2.7234, + "theoretical_loss": 3.3609498070969783, + "tokens_seen": 2571755520 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011156469408224675, + "loss": 2.2743, + "theoretical_loss": 3.3609430140973524, + "tokens_seen": 2571821056 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011155466399197593, + "loss": 2.5482, + "theoretical_loss": 3.3609362213192924, + "tokens_seen": 2571886592 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011154463390170512, + "loss": 2.4566, + "theoretical_loss": 3.360929428762786, + "tokens_seen": 2571952128 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001115346038114343, + "loss": 2.5316, + "theoretical_loss": 3.36092263642782, + "tokens_seen": 2572017664 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001115245737211635, + "loss": 2.5859, + "theoretical_loss": 3.3609158443143823, + "tokens_seen": 2572083200 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011151454363089268, + "loss": 2.476, + "theoretical_loss": 3.360909052422459, + "tokens_seen": 2572148736 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011150451354062187, + "loss": 2.5424, + "theoretical_loss": 3.3609022607520376, + "tokens_seen": 2572214272 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2836608, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.769841432571411, + "objective/train/theoretical_loss": 3.3608954693031055, + "objective/train/tokens_used": 2592739808, + "theoretical_loss": 3.3608954693031055, + "tokens_seen": 2572279808 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011149448345035105, + "loss": 2.5883, + "theoretical_loss": 3.3608954693031055, + "tokens_seen": 2572279808 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011148445336008025, + "loss": 2.4731, + "theoretical_loss": 3.360888678075649, + "tokens_seen": 2572345344 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011147442326980943, + "loss": 2.8416, + "theoretical_loss": 3.3608818870696564, + "tokens_seen": 2572410880 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011146439317953861, + "loss": 2.5043, + "theoretical_loss": 3.360875096285114, + "tokens_seen": 2572476416 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001114543630892678, + "loss": 2.4598, + "theoretical_loss": 3.3608683057220095, + "tokens_seen": 2572541952 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011144433299899699, + "loss": 2.4495, + "theoretical_loss": 3.3608615153803294, + "tokens_seen": 2572607488 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011143430290872618, + "loss": 2.4264, + "theoretical_loss": 3.360854725260061, + "tokens_seen": 2572673024 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011142427281845536, + "loss": 2.5334, + "theoretical_loss": 3.3608479353611918, + "tokens_seen": 2572738560 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011141424272818456, + "loss": 2.4616, + "theoretical_loss": 3.3608411456837093, + "tokens_seen": 2572804096 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011140421263791374, + "loss": 2.453, + "theoretical_loss": 3.3608343562275995, + "tokens_seen": 2572869632 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011139418254764293, + "loss": 2.6599, + "theoretical_loss": 3.36082756699285, + "tokens_seen": 2572935168 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011138415245737211, + "loss": 2.5881, + "theoretical_loss": 3.3608207779794483, + "tokens_seen": 2573000704 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001113741223671013, + "loss": 2.4947, + "theoretical_loss": 3.3608139891873816, + "tokens_seen": 2573066240 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011136409227683049, + "loss": 2.4409, + "theoretical_loss": 3.3608072006166365, + "tokens_seen": 2573131776 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011135406218655967, + "loss": 2.6998, + "theoretical_loss": 3.360800412267201, + "tokens_seen": 2573197312 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011134403209628887, + "loss": 2.5458, + "theoretical_loss": 3.360793624139061, + "tokens_seen": 2573262848 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011133400200601805, + "loss": 2.5091, + "theoretical_loss": 3.3607868362322044, + "tokens_seen": 2573328384 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011132397191574724, + "loss": 2.5207, + "theoretical_loss": 3.3607800485466184, + "tokens_seen": 2573393920 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011131394182547642, + "loss": 2.5067, + "theoretical_loss": 3.3607732610822905, + "tokens_seen": 2573459456 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011130391173520562, + "loss": 2.531, + "theoretical_loss": 3.360766473839207, + "tokens_seen": 2573524992 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011129388164493481, + "loss": 2.6089, + "theoretical_loss": 3.3607596868173553, + "tokens_seen": 2573590528 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011128385155466401, + "loss": 2.559, + "theoretical_loss": 3.3607529000167227, + "tokens_seen": 2573656064 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011127382146439319, + "loss": 2.3606, + "theoretical_loss": 3.3607461134372967, + "tokens_seen": 2573721600 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011126379137412237, + "loss": 2.5145, + "theoretical_loss": 3.3607393270790635, + "tokens_seen": 2573787136 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011125376128385156, + "loss": 2.5524, + "theoretical_loss": 3.3607325409420117, + "tokens_seen": 2573852672 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2837402, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3895390033721924, + "objective/train/theoretical_loss": 3.360725755026127, + "objective/train/tokens_used": 2594378208, + "theoretical_loss": 3.360725755026127, + "tokens_seen": 2573918208 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011124373119358075, + "loss": 2.3812, + "theoretical_loss": 3.360725755026127, + "tokens_seen": 2573918208 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011123370110330994, + "loss": 2.5945, + "theoretical_loss": 3.3607189693313972, + "tokens_seen": 2573983744 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011122367101303912, + "loss": 2.7659, + "theoretical_loss": 3.36071218385781, + "tokens_seen": 2574049280 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011121364092276832, + "loss": 2.3655, + "theoretical_loss": 3.3607053986053517, + "tokens_seen": 2574114816 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001112036108324975, + "loss": 2.5613, + "theoretical_loss": 3.3606986135740096, + "tokens_seen": 2574180352 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011119358074222669, + "loss": 2.7456, + "theoretical_loss": 3.360691828763771, + "tokens_seen": 2574245888 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011118355065195587, + "loss": 2.4929, + "theoretical_loss": 3.360685044174623, + "tokens_seen": 2574311424 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011117352056168505, + "loss": 2.3807, + "theoretical_loss": 3.3606782598065537, + "tokens_seen": 2574376960 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011116349047141425, + "loss": 2.3535, + "theoretical_loss": 3.360671475659549, + "tokens_seen": 2574442496 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011115346038114343, + "loss": 2.2725, + "theoretical_loss": 3.3606646917335965, + "tokens_seen": 2574508032 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011114343029087262, + "loss": 2.4903, + "theoretical_loss": 3.360657908028683, + "tokens_seen": 2574573568 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001111334002006018, + "loss": 2.508, + "theoretical_loss": 3.3606511245447965, + "tokens_seen": 2574639104 + }, + { + "epoch": 8.06, + "learning_rate": 0.000111123370110331, + "loss": 2.4721, + "theoretical_loss": 3.360644341281924, + "tokens_seen": 2574704640 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011111334002006018, + "loss": 2.5106, + "theoretical_loss": 3.3606375582400516, + "tokens_seen": 2574770176 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011110330992978938, + "loss": 2.5435, + "theoretical_loss": 3.360630775419168, + "tokens_seen": 2574835712 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011109327983951856, + "loss": 2.5309, + "theoretical_loss": 3.360623992819259, + "tokens_seen": 2574901248 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011108324974924774, + "loss": 2.3801, + "theoretical_loss": 3.360617210440313, + "tokens_seen": 2574966784 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011107321965897693, + "loss": 2.5117, + "theoretical_loss": 3.3606104282823166, + "tokens_seen": 2575032320 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011106318956870611, + "loss": 2.4035, + "theoretical_loss": 3.3606036463452567, + "tokens_seen": 2575097856 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011105315947843531, + "loss": 2.3716, + "theoretical_loss": 3.360596864629121, + "tokens_seen": 2575163392 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011104312938816449, + "loss": 2.3325, + "theoretical_loss": 3.3605900831338964, + "tokens_seen": 2575228928 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011103309929789368, + "loss": 2.6732, + "theoretical_loss": 3.36058330185957, + "tokens_seen": 2575294464 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011102306920762287, + "loss": 2.6214, + "theoretical_loss": 3.360576520806129, + "tokens_seen": 2575360000 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011101303911735206, + "loss": 2.5154, + "theoretical_loss": 3.360569739973561, + "tokens_seen": 2575425536 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011100300902708124, + "loss": 2.634, + "theoretical_loss": 3.360562959361853, + "tokens_seen": 2575491072 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2838894, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.51961088180542, + "objective/train/theoretical_loss": 3.3605561789709917, + "objective/train/tokens_used": 2596016608, + "theoretical_loss": 3.3605561789709917, + "tokens_seen": 2575556608 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011099297893681042, + "loss": 2.4342, + "theoretical_loss": 3.3605561789709917, + "tokens_seen": 2575556608 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011098294884653962, + "loss": 2.4186, + "theoretical_loss": 3.360549398800965, + "tokens_seen": 2575622144 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001109729187562688, + "loss": 2.5537, + "theoretical_loss": 3.3605426188517598, + "tokens_seen": 2575687680 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011096288866599799, + "loss": 2.5092, + "theoretical_loss": 3.360535839123363, + "tokens_seen": 2575753216 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011095285857572717, + "loss": 2.5628, + "theoretical_loss": 3.3605290596157618, + "tokens_seen": 2575818752 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011094282848545637, + "loss": 2.4165, + "theoretical_loss": 3.360522280328944, + "tokens_seen": 2575884288 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011093279839518555, + "loss": 2.3777, + "theoretical_loss": 3.3605155012628964, + "tokens_seen": 2575949824 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011092276830491476, + "loss": 2.5391, + "theoretical_loss": 3.360508722417606, + "tokens_seen": 2576015360 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011091273821464394, + "loss": 2.6484, + "theoretical_loss": 3.3605019437930603, + "tokens_seen": 2576080896 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011090270812437313, + "loss": 2.4101, + "theoretical_loss": 3.3604951653892465, + "tokens_seen": 2576146432 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011089267803410231, + "loss": 2.5804, + "theoretical_loss": 3.3604883872061517, + "tokens_seen": 2576211968 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001108826479438315, + "loss": 2.5801, + "theoretical_loss": 3.360481609243763, + "tokens_seen": 2576277504 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011087261785356069, + "loss": 2.5126, + "theoretical_loss": 3.3604748315020676, + "tokens_seen": 2576343040 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011086258776328987, + "loss": 2.4946, + "theoretical_loss": 3.360468053981053, + "tokens_seen": 2576408576 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011085255767301907, + "loss": 2.5431, + "theoretical_loss": 3.3604612766807063, + "tokens_seen": 2576474112 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011084252758274825, + "loss": 2.5346, + "theoretical_loss": 3.3604544996010146, + "tokens_seen": 2576539648 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011083249749247744, + "loss": 2.6167, + "theoretical_loss": 3.3604477227419647, + "tokens_seen": 2576605184 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011082246740220662, + "loss": 2.63, + "theoretical_loss": 3.3604409461035445, + "tokens_seen": 2576670720 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011081243731193582, + "loss": 2.4262, + "theoretical_loss": 3.3604341696857407, + "tokens_seen": 2576736256 + }, + { + "epoch": 8.06, + "learning_rate": 0.000110802407221665, + "loss": 2.7355, + "theoretical_loss": 3.360427393488541, + "tokens_seen": 2576801792 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011079237713139418, + "loss": 2.6038, + "theoretical_loss": 3.360420617511932, + "tokens_seen": 2576867328 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011078234704112338, + "loss": 2.5617, + "theoretical_loss": 3.3604138417559017, + "tokens_seen": 2576932864 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011077231695085256, + "loss": 2.437, + "theoretical_loss": 3.3604070662204366, + "tokens_seen": 2576998400 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011076228686058175, + "loss": 2.4455, + "theoretical_loss": 3.3604002909055244, + "tokens_seen": 2577063936 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011075225677031093, + "loss": 2.624, + "theoretical_loss": 3.3603935158111518, + "tokens_seen": 2577129472 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2839532, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.29068660736084, + "objective/train/theoretical_loss": 3.3603867409373063, + "objective/train/tokens_used": 2597655008, + "theoretical_loss": 3.3603867409373063, + "tokens_seen": 2577195008 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011074222668004013, + "loss": 2.4394, + "theoretical_loss": 3.3603867409373063, + "tokens_seen": 2577195008 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011073219658976931, + "loss": 2.5898, + "theoretical_loss": 3.3603799662839755, + "tokens_seen": 2577260544 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001107221664994985, + "loss": 2.7225, + "theoretical_loss": 3.3603731918511457, + "tokens_seen": 2577326080 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011071213640922768, + "loss": 2.5534, + "theoretical_loss": 3.360366417638805, + "tokens_seen": 2577391616 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011070210631895688, + "loss": 2.5059, + "theoretical_loss": 3.36035964364694, + "tokens_seen": 2577457152 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011069207622868606, + "loss": 2.5973, + "theoretical_loss": 3.360352869875538, + "tokens_seen": 2577522688 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011068204613841524, + "loss": 2.5443, + "theoretical_loss": 3.360346096324587, + "tokens_seen": 2577588224 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011067201604814444, + "loss": 2.3675, + "theoretical_loss": 3.360339322994073, + "tokens_seen": 2577653760 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011066198595787362, + "loss": 2.6713, + "theoretical_loss": 3.360332549883984, + "tokens_seen": 2577719296 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011065195586760281, + "loss": 2.4741, + "theoretical_loss": 3.360325776994307, + "tokens_seen": 2577784832 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011064192577733199, + "loss": 2.3571, + "theoretical_loss": 3.3603190043250297, + "tokens_seen": 2577850368 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011063189568706119, + "loss": 2.6844, + "theoretical_loss": 3.3603122318761383, + "tokens_seen": 2577915904 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011062186559679037, + "loss": 2.4563, + "theoretical_loss": 3.360305459647621, + "tokens_seen": 2577981440 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011061183550651956, + "loss": 2.4759, + "theoretical_loss": 3.3602986876394647, + "tokens_seen": 2578046976 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011060180541624874, + "loss": 2.425, + "theoretical_loss": 3.3602919158516564, + "tokens_seen": 2578112512 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011059177532597792, + "loss": 2.5372, + "theoretical_loss": 3.360285144284184, + "tokens_seen": 2578178048 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011058174523570712, + "loss": 2.596, + "theoretical_loss": 3.3602783729370334, + "tokens_seen": 2578243584 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001105717151454363, + "loss": 2.5753, + "theoretical_loss": 3.3602716018101932, + "tokens_seen": 2578309120 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001105616850551655, + "loss": 2.7172, + "theoretical_loss": 3.3602648309036502, + "tokens_seen": 2578374656 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011055165496489468, + "loss": 2.6544, + "theoretical_loss": 3.3602580602173915, + "tokens_seen": 2578440192 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011054162487462388, + "loss": 2.443, + "theoretical_loss": 3.360251289751404, + "tokens_seen": 2578505728 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011053159478435307, + "loss": 2.4435, + "theoretical_loss": 3.3602445195056756, + "tokens_seen": 2578571264 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011052156469408226, + "loss": 2.6509, + "theoretical_loss": 3.360237749480193, + "tokens_seen": 2578636800 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011051153460381144, + "loss": 2.3933, + "theoretical_loss": 3.3602309796749443, + "tokens_seen": 2578702336 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011050150451354062, + "loss": 2.513, + "theoretical_loss": 3.360224210089916, + "tokens_seen": 2578767872 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2840054, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6991326808929443, + "objective/train/theoretical_loss": 3.360217440725095, + "objective/train/tokens_used": 2599293408, + "theoretical_loss": 3.360217440725095, + "tokens_seen": 2578833408 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011049147442326982, + "loss": 2.4997, + "theoretical_loss": 3.360217440725095, + "tokens_seen": 2578833408 + }, + { + "epoch": 8.06, + "learning_rate": 0.000110481444332999, + "loss": 2.3539, + "theoretical_loss": 3.360210671580469, + "tokens_seen": 2578898944 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011047141424272819, + "loss": 2.7285, + "theoretical_loss": 3.3602039026560258, + "tokens_seen": 2578964480 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011046138415245737, + "loss": 2.4659, + "theoretical_loss": 3.3601971339517513, + "tokens_seen": 2579030016 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011045135406218657, + "loss": 2.5637, + "theoretical_loss": 3.360190365467634, + "tokens_seen": 2579095552 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011044132397191575, + "loss": 2.7168, + "theoretical_loss": 3.360183597203661, + "tokens_seen": 2579161088 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011043129388164494, + "loss": 2.4959, + "theoretical_loss": 3.360176829159819, + "tokens_seen": 2579226624 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011042126379137413, + "loss": 2.3784, + "theoretical_loss": 3.3601700613360954, + "tokens_seen": 2579292160 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011041123370110332, + "loss": 2.5028, + "theoretical_loss": 3.3601632937324775, + "tokens_seen": 2579357696 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001104012036108325, + "loss": 2.6833, + "theoretical_loss": 3.3601565263489523, + "tokens_seen": 2579423232 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011039117352056168, + "loss": 2.542, + "theoretical_loss": 3.360149759185508, + "tokens_seen": 2579488768 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011038114343029088, + "loss": 2.5238, + "theoretical_loss": 3.3601429922421304, + "tokens_seen": 2579554304 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011037111334002006, + "loss": 2.3685, + "theoretical_loss": 3.360136225518808, + "tokens_seen": 2579619840 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011036108324974925, + "loss": 2.5863, + "theoretical_loss": 3.3601294590155275, + "tokens_seen": 2579685376 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011035105315947843, + "loss": 2.5668, + "theoretical_loss": 3.3601226927322765, + "tokens_seen": 2579750912 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011034102306920763, + "loss": 2.427, + "theoretical_loss": 3.3601159266690415, + "tokens_seen": 2579816448 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011033099297893681, + "loss": 2.5513, + "theoretical_loss": 3.3601091608258105, + "tokens_seen": 2579881984 + }, + { + "epoch": 8.06, + "learning_rate": 0.000110320962888666, + "loss": 2.5238, + "theoretical_loss": 3.3601023952025706, + "tokens_seen": 2579947520 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011031093279839519, + "loss": 2.5916, + "theoretical_loss": 3.360095629799309, + "tokens_seen": 2580013056 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011030090270812437, + "loss": 2.5717, + "theoretical_loss": 3.3600888646160127, + "tokens_seen": 2580078592 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011029087261785356, + "loss": 2.6143, + "theoretical_loss": 3.3600820996526695, + "tokens_seen": 2580144128 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011028084252758274, + "loss": 2.7328, + "theoretical_loss": 3.360075334909266, + "tokens_seen": 2580209664 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011027081243731194, + "loss": 2.4371, + "theoretical_loss": 3.3600685703857898, + "tokens_seen": 2580275200 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011026078234704112, + "loss": 2.4202, + "theoretical_loss": 3.3600618060822285, + "tokens_seen": 2580340736 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011025075225677031, + "loss": 2.4914, + "theoretical_loss": 3.3600550419985686, + "tokens_seen": 2580406272 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2841314, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7262954711914062, + "objective/train/theoretical_loss": 3.3600482781347982, + "objective/train/tokens_used": 2600931808, + "theoretical_loss": 3.3600482781347982, + "tokens_seen": 2580471808 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001102407221664995, + "loss": 2.5845, + "theoretical_loss": 3.3600482781347982, + "tokens_seen": 2580471808 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011023069207622869, + "loss": 2.47, + "theoretical_loss": 3.360041514490904, + "tokens_seen": 2580537344 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011022066198595787, + "loss": 2.4908, + "theoretical_loss": 3.3600347510668733, + "tokens_seen": 2580602880 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011021063189568705, + "loss": 2.5616, + "theoretical_loss": 3.360027987862694, + "tokens_seen": 2580668416 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011020060180541625, + "loss": 2.5578, + "theoretical_loss": 3.3600212248783525, + "tokens_seen": 2580733952 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011019057171514543, + "loss": 2.3891, + "theoretical_loss": 3.3600144621138366, + "tokens_seen": 2580799488 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011018054162487462, + "loss": 2.3433, + "theoretical_loss": 3.3600076995691333, + "tokens_seen": 2580865024 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011017051153460382, + "loss": 2.2626, + "theoretical_loss": 3.36000093724423, + "tokens_seen": 2580930560 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011016048144433301, + "loss": 2.5484, + "theoretical_loss": 3.3599941751391142, + "tokens_seen": 2580996096 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011015045135406219, + "loss": 2.3642, + "theoretical_loss": 3.359987413253773, + "tokens_seen": 2581061632 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011014042126379139, + "loss": 2.3185, + "theoretical_loss": 3.3599806515881934, + "tokens_seen": 2581127168 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011013039117352057, + "loss": 2.5011, + "theoretical_loss": 3.359973890142363, + "tokens_seen": 2581192704 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011012036108324976, + "loss": 2.2925, + "theoretical_loss": 3.359967128916269, + "tokens_seen": 2581258240 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011011033099297894, + "loss": 2.511, + "theoretical_loss": 3.3599603679098986, + "tokens_seen": 2581323776 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011010030090270812, + "loss": 2.4528, + "theoretical_loss": 3.359953607123239, + "tokens_seen": 2581389312 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011009027081243732, + "loss": 2.4027, + "theoretical_loss": 3.359946846556278, + "tokens_seen": 2581454848 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001100802407221665, + "loss": 2.3819, + "theoretical_loss": 3.3599400862090025, + "tokens_seen": 2581520384 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001100702106318957, + "loss": 2.4656, + "theoretical_loss": 3.3599333260813995, + "tokens_seen": 2581585920 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011006018054162488, + "loss": 2.3924, + "theoretical_loss": 3.3599265661734568, + "tokens_seen": 2581651456 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011005015045135407, + "loss": 2.5218, + "theoretical_loss": 3.3599198064851614, + "tokens_seen": 2581716992 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011004012036108325, + "loss": 2.4229, + "theoretical_loss": 3.3599130470165006, + "tokens_seen": 2581782528 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011003009027081245, + "loss": 2.5468, + "theoretical_loss": 3.359906287767462, + "tokens_seen": 2581848064 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011002006018054163, + "loss": 2.4471, + "theoretical_loss": 3.3598995287380324, + "tokens_seen": 2581913600 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011001003009027081, + "loss": 2.5406, + "theoretical_loss": 3.3598927699281993, + "tokens_seen": 2581979136 + }, + { + "epoch": 8.06, + "learning_rate": 0.00011, + "loss": 2.6608, + "theoretical_loss": 3.35988601133795, + "tokens_seen": 2582044672 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2842026, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.518416404724121, + "objective/train/theoretical_loss": 3.3598792529672723, + "objective/train/tokens_used": 2602570208, + "theoretical_loss": 3.3598792529672723, + "tokens_seen": 2582110208 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010998996990972919, + "loss": 2.4866, + "theoretical_loss": 3.3598792529672723, + "tokens_seen": 2582110208 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010997993981945838, + "loss": 2.2552, + "theoretical_loss": 3.3598724948161527, + "tokens_seen": 2582175744 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010996990972918756, + "loss": 2.5443, + "theoretical_loss": 3.3598657368845783, + "tokens_seen": 2582241280 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010995987963891676, + "loss": 2.5302, + "theoretical_loss": 3.3598589791725377, + "tokens_seen": 2582306816 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010994984954864594, + "loss": 2.4313, + "theoretical_loss": 3.359852221680017, + "tokens_seen": 2582372352 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010993981945837513, + "loss": 2.643, + "theoretical_loss": 3.359845464407004, + "tokens_seen": 2582437888 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010992978936810431, + "loss": 2.8376, + "theoretical_loss": 3.359838707353486, + "tokens_seen": 2582503424 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010991975927783351, + "loss": 2.4004, + "theoretical_loss": 3.3598319505194496, + "tokens_seen": 2582568960 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010990972918756269, + "loss": 2.4708, + "theoretical_loss": 3.3598251939048835, + "tokens_seen": 2582634496 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010989969909729187, + "loss": 2.5154, + "theoretical_loss": 3.359818437509774, + "tokens_seen": 2582700032 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010988966900702106, + "loss": 2.4687, + "theoretical_loss": 3.3598116813341083, + "tokens_seen": 2582765568 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010987963891675025, + "loss": 2.4672, + "theoretical_loss": 3.3598049253778743, + "tokens_seen": 2582831104 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010986960882647944, + "loss": 2.4056, + "theoretical_loss": 3.3597981696410586, + "tokens_seen": 2582896640 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010985957873620862, + "loss": 2.442, + "theoretical_loss": 3.3597914141236496, + "tokens_seen": 2582962176 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010984954864593782, + "loss": 2.4657, + "theoretical_loss": 3.3597846588256335, + "tokens_seen": 2583027712 + }, + { + "epoch": 8.06, + "learning_rate": 0.000109839518555667, + "loss": 2.3582, + "theoretical_loss": 3.359777903746998, + "tokens_seen": 2583093248 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010982948846539619, + "loss": 2.5855, + "theoretical_loss": 3.3597711488877304, + "tokens_seen": 2583158784 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010981945837512537, + "loss": 2.4948, + "theoretical_loss": 3.359764394247818, + "tokens_seen": 2583224320 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010980942828485455, + "loss": 2.5031, + "theoretical_loss": 3.3597576398272486, + "tokens_seen": 2583289856 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010979939819458375, + "loss": 2.4348, + "theoretical_loss": 3.3597508856260085, + "tokens_seen": 2583355392 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010978936810431294, + "loss": 2.4724, + "theoretical_loss": 3.359744131644086, + "tokens_seen": 2583420928 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010977933801404214, + "loss": 2.7582, + "theoretical_loss": 3.3597373778814683, + "tokens_seen": 2583486464 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010976930792377132, + "loss": 2.543, + "theoretical_loss": 3.359730624338142, + "tokens_seen": 2583552000 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010975927783350051, + "loss": 2.5363, + "theoretical_loss": 3.359723871014095, + "tokens_seen": 2583617536 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001097492477432297, + "loss": 2.4943, + "theoretical_loss": 3.359717117909314, + "tokens_seen": 2583683072 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2843440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.495652198791504, + "objective/train/theoretical_loss": 3.3597103650237874, + "objective/train/tokens_used": 2604208608, + "theoretical_loss": 3.3597103650237874, + "tokens_seen": 2583748608 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010973921765295889, + "loss": 2.3516, + "theoretical_loss": 3.3597103650237874, + "tokens_seen": 2583748608 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010972918756268807, + "loss": 2.5769, + "theoretical_loss": 3.3597036123575017, + "tokens_seen": 2583814144 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010971915747241725, + "loss": 2.3151, + "theoretical_loss": 3.3596968599104446, + "tokens_seen": 2583879680 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010970912738214645, + "loss": 2.5953, + "theoretical_loss": 3.359690107682603, + "tokens_seen": 2583945216 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010969909729187563, + "loss": 2.3274, + "theoretical_loss": 3.3596833556739645, + "tokens_seen": 2584010752 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010968906720160482, + "loss": 2.4172, + "theoretical_loss": 3.359676603884516, + "tokens_seen": 2584076288 + }, + { + "epoch": 8.06, + "learning_rate": 0.000109679037111334, + "loss": 2.3659, + "theoretical_loss": 3.359669852314246, + "tokens_seen": 2584141824 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001096690070210632, + "loss": 2.5107, + "theoretical_loss": 3.359663100963141, + "tokens_seen": 2584207360 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010965897693079238, + "loss": 2.6079, + "theoretical_loss": 3.3596563498311878, + "tokens_seen": 2584272896 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010964894684052157, + "loss": 2.5157, + "theoretical_loss": 3.3596495989183746, + "tokens_seen": 2584338432 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010963891675025075, + "loss": 2.5822, + "theoretical_loss": 3.3596428482246883, + "tokens_seen": 2584403968 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010962888665997995, + "loss": 2.5639, + "theoretical_loss": 3.3596360977501165, + "tokens_seen": 2584469504 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010961885656970913, + "loss": 2.3837, + "theoretical_loss": 3.3596293474946464, + "tokens_seen": 2584535040 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010960882647943831, + "loss": 2.3128, + "theoretical_loss": 3.3596225974582654, + "tokens_seen": 2584600576 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001095987963891675, + "loss": 2.6028, + "theoretical_loss": 3.3596158476409608, + "tokens_seen": 2584666112 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010958876629889669, + "loss": 2.5397, + "theoretical_loss": 3.3596090980427196, + "tokens_seen": 2584731648 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010957873620862588, + "loss": 2.4839, + "theoretical_loss": 3.35960234866353, + "tokens_seen": 2584797184 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010956870611835506, + "loss": 2.5551, + "theoretical_loss": 3.359595599503378, + "tokens_seen": 2584862720 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010955867602808426, + "loss": 2.4089, + "theoretical_loss": 3.359588850562252, + "tokens_seen": 2584928256 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010954864593781344, + "loss": 2.5744, + "theoretical_loss": 3.3595821018401395, + "tokens_seen": 2584993792 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010953861584754263, + "loss": 2.5455, + "theoretical_loss": 3.3595753533370267, + "tokens_seen": 2585059328 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010952858575727181, + "loss": 2.4633, + "theoretical_loss": 3.3595686050529023, + "tokens_seen": 2585124864 + }, + { + "epoch": 8.06, + "learning_rate": 0.000109518555667001, + "loss": 2.5453, + "theoretical_loss": 3.3595618569877526, + "tokens_seen": 2585190400 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010950852557673019, + "loss": 2.3163, + "theoretical_loss": 3.3595551091415654, + "tokens_seen": 2585255936 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010949849548645937, + "loss": 2.505, + "theoretical_loss": 3.359548361514328, + "tokens_seen": 2585321472 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2844167, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1208465099334717, + "objective/train/theoretical_loss": 3.3595416141060275, + "objective/train/tokens_used": 2605847008, + "theoretical_loss": 3.3595416141060275, + "tokens_seen": 2585387008 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010948846539618857, + "loss": 2.3421, + "theoretical_loss": 3.3595416141060275, + "tokens_seen": 2585387008 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010947843530591775, + "loss": 2.3683, + "theoretical_loss": 3.359534866916652, + "tokens_seen": 2585452544 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010946840521564694, + "loss": 2.6865, + "theoretical_loss": 3.3595281199461877, + "tokens_seen": 2585518080 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010945837512537612, + "loss": 2.6727, + "theoretical_loss": 3.3595213731946227, + "tokens_seen": 2585583616 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010944834503510532, + "loss": 2.5345, + "theoretical_loss": 3.3595146266619444, + "tokens_seen": 2585649152 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001094383149448345, + "loss": 2.6338, + "theoretical_loss": 3.35950788034814, + "tokens_seen": 2585714688 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010942828485456368, + "loss": 2.1202, + "theoretical_loss": 3.3595011342531964, + "tokens_seen": 2585780224 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010941825476429289, + "loss": 2.4135, + "theoretical_loss": 3.359494388377102, + "tokens_seen": 2585845760 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010940822467402207, + "loss": 2.3058, + "theoretical_loss": 3.359487642719843, + "tokens_seen": 2585911296 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010939819458375126, + "loss": 2.6286, + "theoretical_loss": 3.359480897281407, + "tokens_seen": 2585976832 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010938816449348045, + "loss": 2.2725, + "theoretical_loss": 3.3594741520617823, + "tokens_seen": 2586042368 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010937813440320964, + "loss": 2.4487, + "theoretical_loss": 3.359467407060955, + "tokens_seen": 2586107904 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010936810431293882, + "loss": 2.4185, + "theoretical_loss": 3.359460662278914, + "tokens_seen": 2586173440 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010935807422266802, + "loss": 2.6062, + "theoretical_loss": 3.359453917715645, + "tokens_seen": 2586238976 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001093480441323972, + "loss": 2.341, + "theoretical_loss": 3.359447173371136, + "tokens_seen": 2586304512 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010933801404212639, + "loss": 2.5771, + "theoretical_loss": 3.3594404292453746, + "tokens_seen": 2586370048 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010932798395185557, + "loss": 2.5017, + "theoretical_loss": 3.359433685338348, + "tokens_seen": 2586435584 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010931795386158475, + "loss": 2.4984, + "theoretical_loss": 3.3594269416500433, + "tokens_seen": 2586501120 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010930792377131395, + "loss": 2.3833, + "theoretical_loss": 3.359420198180448, + "tokens_seen": 2586566656 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010929789368104313, + "loss": 2.6228, + "theoretical_loss": 3.35941345492955, + "tokens_seen": 2586632192 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010928786359077232, + "loss": 2.4185, + "theoretical_loss": 3.3594067118973365, + "tokens_seen": 2586697728 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001092778335005015, + "loss": 2.3806, + "theoretical_loss": 3.359399969083794, + "tokens_seen": 2586763264 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001092678034102307, + "loss": 2.4854, + "theoretical_loss": 3.3593932264889106, + "tokens_seen": 2586828800 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010925777331995988, + "loss": 2.3276, + "theoretical_loss": 3.3593864841126737, + "tokens_seen": 2586894336 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010924774322968908, + "loss": 2.3432, + "theoretical_loss": 3.3593797419550704, + "tokens_seen": 2586959872 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2845447, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.797365427017212, + "objective/train/theoretical_loss": 3.3593730000160886, + "objective/train/tokens_used": 2607485408, + "theoretical_loss": 3.3593730000160886, + "tokens_seen": 2587025408 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010923771313941826, + "loss": 2.5484, + "theoretical_loss": 3.3593730000160886, + "tokens_seen": 2587025408 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010922768304914744, + "loss": 2.3366, + "theoretical_loss": 3.3593662582957147, + "tokens_seen": 2587090944 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010921765295887663, + "loss": 2.5833, + "theoretical_loss": 3.359359516793937, + "tokens_seen": 2587156480 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010920762286860581, + "loss": 2.5235, + "theoretical_loss": 3.3593527755107426, + "tokens_seen": 2587222016 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010919759277833501, + "loss": 2.505, + "theoretical_loss": 3.3593460344461183, + "tokens_seen": 2587287552 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010918756268806419, + "loss": 2.5886, + "theoretical_loss": 3.3593392936000526, + "tokens_seen": 2587353088 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010917753259779338, + "loss": 2.5844, + "theoretical_loss": 3.359332552972532, + "tokens_seen": 2587418624 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010916750250752257, + "loss": 2.3847, + "theoretical_loss": 3.3593258125635437, + "tokens_seen": 2587484160 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010915747241725176, + "loss": 2.6025, + "theoretical_loss": 3.359319072373076, + "tokens_seen": 2587549696 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010914744232698094, + "loss": 2.5457, + "theoretical_loss": 3.3593123324011156, + "tokens_seen": 2587615232 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010913741223671012, + "loss": 2.2865, + "theoretical_loss": 3.35930559264765, + "tokens_seen": 2587680768 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010912738214643932, + "loss": 2.3471, + "theoretical_loss": 3.359298853112667, + "tokens_seen": 2587746304 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001091173520561685, + "loss": 2.4065, + "theoretical_loss": 3.3592921137961533, + "tokens_seen": 2587811840 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010910732196589769, + "loss": 2.3466, + "theoretical_loss": 3.359285374698097, + "tokens_seen": 2587877376 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010909729187562687, + "loss": 2.4823, + "theoretical_loss": 3.3592786358184847, + "tokens_seen": 2587942912 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010908726178535607, + "loss": 2.4975, + "theoretical_loss": 3.3592718971573046, + "tokens_seen": 2588008448 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010907723169508525, + "loss": 2.3929, + "theoretical_loss": 3.3592651587145435, + "tokens_seen": 2588073984 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010906720160481444, + "loss": 2.3408, + "theoretical_loss": 3.359258420490189, + "tokens_seen": 2588139520 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010905717151454363, + "loss": 2.4616, + "theoretical_loss": 3.3592516824842282, + "tokens_seen": 2588205056 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010904714142427282, + "loss": 2.4185, + "theoretical_loss": 3.3592449446966492, + "tokens_seen": 2588270592 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010903711133400201, + "loss": 2.6566, + "theoretical_loss": 3.3592382071274387, + "tokens_seen": 2588336128 + }, + { + "epoch": 8.06, + "learning_rate": 0.0001090270812437312, + "loss": 2.5976, + "theoretical_loss": 3.359231469776584, + "tokens_seen": 2588401664 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010901705115346039, + "loss": 2.3756, + "theoretical_loss": 3.3592247326440736, + "tokens_seen": 2588467200 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010900702106318957, + "loss": 2.4692, + "theoretical_loss": 3.3592179957298938, + "tokens_seen": 2588532736 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010899699097291877, + "loss": 2.467, + "theoretical_loss": 3.359211259034032, + "tokens_seen": 2588598272 + }, + { + "epoch": 8.06, + "objective/train/docs_used": 2845929, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.346712350845337, + "objective/train/theoretical_loss": 3.3592045225564764, + "objective/train/tokens_used": 2609123808, + "theoretical_loss": 3.3592045225564764, + "tokens_seen": 2588663808 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010898696088264795, + "loss": 2.4988, + "theoretical_loss": 3.3592045225564764, + "tokens_seen": 2588663808 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010897693079237714, + "loss": 2.5577, + "theoretical_loss": 3.359197786297214, + "tokens_seen": 2588729344 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010896690070210632, + "loss": 2.2904, + "theoretical_loss": 3.359191050256232, + "tokens_seen": 2588794880 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010895687061183552, + "loss": 2.3462, + "theoretical_loss": 3.359184314433518, + "tokens_seen": 2588860416 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001089468405215647, + "loss": 2.5562, + "theoretical_loss": 3.359177578829059, + "tokens_seen": 2588925952 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010893681043129388, + "loss": 2.4376, + "theoretical_loss": 3.359170843442843, + "tokens_seen": 2588991488 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010892678034102307, + "loss": 2.6558, + "theoretical_loss": 3.359164108274857, + "tokens_seen": 2589057024 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010891675025075226, + "loss": 2.495, + "theoretical_loss": 3.359157373325089, + "tokens_seen": 2589122560 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010890672016048145, + "loss": 2.579, + "theoretical_loss": 3.3591506385935253, + "tokens_seen": 2589188096 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010889669007021063, + "loss": 2.3675, + "theoretical_loss": 3.3591439040801543, + "tokens_seen": 2589253632 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010888665997993983, + "loss": 2.4214, + "theoretical_loss": 3.359137169784963, + "tokens_seen": 2589319168 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010887662988966901, + "loss": 2.5925, + "theoretical_loss": 3.359130435707939, + "tokens_seen": 2589384704 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001088665997993982, + "loss": 2.5626, + "theoretical_loss": 3.3591237018490694, + "tokens_seen": 2589450240 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010885656970912738, + "loss": 2.3905, + "theoretical_loss": 3.359116968208342, + "tokens_seen": 2589515776 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010884653961885658, + "loss": 2.4368, + "theoretical_loss": 3.359110234785744, + "tokens_seen": 2589581312 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010883650952858576, + "loss": 2.4862, + "theoretical_loss": 3.359103501581263, + "tokens_seen": 2589646848 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010882647943831494, + "loss": 2.4741, + "theoretical_loss": 3.3590967685948856, + "tokens_seen": 2589712384 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010881644934804414, + "loss": 2.5794, + "theoretical_loss": 3.3590900358266005, + "tokens_seen": 2589777920 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010880641925777332, + "loss": 2.4134, + "theoretical_loss": 3.3590833032763943, + "tokens_seen": 2589843456 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010879638916750251, + "loss": 2.4195, + "theoretical_loss": 3.3590765709442545, + "tokens_seen": 2589908992 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010878635907723169, + "loss": 2.5628, + "theoretical_loss": 3.3590698388301687, + "tokens_seen": 2589974528 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010877632898696089, + "loss": 2.4873, + "theoretical_loss": 3.359063106934124, + "tokens_seen": 2590040064 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010876629889669007, + "loss": 2.5951, + "theoretical_loss": 3.3590563752561087, + "tokens_seen": 2590105600 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010875626880641926, + "loss": 2.2546, + "theoretical_loss": 3.359049643796109, + "tokens_seen": 2590171136 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010874623871614844, + "loss": 2.3179, + "theoretical_loss": 3.3590429125541132, + "tokens_seen": 2590236672 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2847227, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.59712553024292, + "objective/train/theoretical_loss": 3.3590361815301084, + "objective/train/tokens_used": 2610762208, + "theoretical_loss": 3.3590361815301084, + "tokens_seen": 2590302208 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010873620862587762, + "loss": 2.4325, + "theoretical_loss": 3.3590361815301084, + "tokens_seen": 2590302208 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010872617853560682, + "loss": 2.5237, + "theoretical_loss": 3.3590294507240817, + "tokens_seen": 2590367744 + }, + { + "epoch": 8.07, + "learning_rate": 0.000108716148445336, + "loss": 2.5065, + "theoretical_loss": 3.3590227201360214, + "tokens_seen": 2590433280 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001087061183550652, + "loss": 2.517, + "theoretical_loss": 3.359015989765914, + "tokens_seen": 2590498816 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010869608826479438, + "loss": 2.6724, + "theoretical_loss": 3.3590092596137477, + "tokens_seen": 2590564352 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010868605817452357, + "loss": 2.4231, + "theoretical_loss": 3.3590025296795094, + "tokens_seen": 2590629888 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010867602808425275, + "loss": 2.4008, + "theoretical_loss": 3.3589957999631865, + "tokens_seen": 2590695424 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010866599799398196, + "loss": 2.5569, + "theoretical_loss": 3.3589890704647667, + "tokens_seen": 2590760960 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010865596790371114, + "loss": 2.5474, + "theoretical_loss": 3.358982341184238, + "tokens_seen": 2590826496 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010864593781344032, + "loss": 2.6393, + "theoretical_loss": 3.3589756121215864, + "tokens_seen": 2590892032 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010863590772316952, + "loss": 2.341, + "theoretical_loss": 3.3589688832768005, + "tokens_seen": 2590957568 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001086258776328987, + "loss": 2.4672, + "theoretical_loss": 3.358962154649867, + "tokens_seen": 2591023104 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010861584754262789, + "loss": 2.5192, + "theoretical_loss": 3.358955426240774, + "tokens_seen": 2591088640 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010860581745235707, + "loss": 2.4128, + "theoretical_loss": 3.3589486980495087, + "tokens_seen": 2591154176 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010859578736208627, + "loss": 2.4739, + "theoretical_loss": 3.3589419700760583, + "tokens_seen": 2591219712 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010858575727181545, + "loss": 2.5409, + "theoretical_loss": 3.3589352423204106, + "tokens_seen": 2591285248 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010857572718154464, + "loss": 2.4726, + "theoretical_loss": 3.3589285147825527, + "tokens_seen": 2591350784 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010856569709127383, + "loss": 2.544, + "theoretical_loss": 3.3589217874624726, + "tokens_seen": 2591416320 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010855566700100302, + "loss": 2.244, + "theoretical_loss": 3.358915060360157, + "tokens_seen": 2591481856 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001085456369107322, + "loss": 2.6583, + "theoretical_loss": 3.3589083334755934, + "tokens_seen": 2591547392 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010853560682046138, + "loss": 2.3952, + "theoretical_loss": 3.35890160680877, + "tokens_seen": 2591612928 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010852557673019058, + "loss": 2.4469, + "theoretical_loss": 3.3588948803596734, + "tokens_seen": 2591678464 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010851554663991976, + "loss": 2.5217, + "theoretical_loss": 3.358888154128292, + "tokens_seen": 2591744000 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010850551654964895, + "loss": 2.4904, + "theoretical_loss": 3.3588814281146124, + "tokens_seen": 2591809536 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010849548645937813, + "loss": 2.4772, + "theoretical_loss": 3.358874702318622, + "tokens_seen": 2591875072 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2847893, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.433539867401123, + "objective/train/theoretical_loss": 3.358867976740309, + "objective/train/tokens_used": 2612400608, + "theoretical_loss": 3.358867976740309, + "tokens_seen": 2591940608 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010848545636910733, + "loss": 2.3799, + "theoretical_loss": 3.358867976740309, + "tokens_seen": 2591940608 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010847542627883651, + "loss": 2.648, + "theoretical_loss": 3.3588612513796603, + "tokens_seen": 2592006144 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001084653961885657, + "loss": 2.3764, + "theoretical_loss": 3.358854526236663, + "tokens_seen": 2592071680 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010845536609829489, + "loss": 2.5295, + "theoretical_loss": 3.358847801311306, + "tokens_seen": 2592137216 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010844533600802407, + "loss": 2.5967, + "theoretical_loss": 3.358841076603575, + "tokens_seen": 2592202752 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010843530591775326, + "loss": 2.4851, + "theoretical_loss": 3.3588343521134587, + "tokens_seen": 2592268288 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010842527582748244, + "loss": 2.4933, + "theoretical_loss": 3.358827627840944, + "tokens_seen": 2592333824 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010841524573721164, + "loss": 2.4, + "theoretical_loss": 3.358820903786018, + "tokens_seen": 2592399360 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010840521564694082, + "loss": 2.3865, + "theoretical_loss": 3.358814179948669, + "tokens_seen": 2592464896 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010839518555667001, + "loss": 2.4044, + "theoretical_loss": 3.3588074563288846, + "tokens_seen": 2592530432 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001083851554663992, + "loss": 2.2833, + "theoretical_loss": 3.358800732926651, + "tokens_seen": 2592595968 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010837512537612839, + "loss": 2.5041, + "theoretical_loss": 3.3587940097419566, + "tokens_seen": 2592661504 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010836509528585757, + "loss": 2.4725, + "theoretical_loss": 3.3587872867747883, + "tokens_seen": 2592727040 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010835506519558675, + "loss": 2.5754, + "theoretical_loss": 3.3587805640251345, + "tokens_seen": 2592792576 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010834503510531595, + "loss": 2.741, + "theoretical_loss": 3.358773841492982, + "tokens_seen": 2592858112 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010833500501504513, + "loss": 2.3302, + "theoretical_loss": 3.358767119178318, + "tokens_seen": 2592923648 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010832497492477432, + "loss": 2.3765, + "theoretical_loss": 3.358760397081131, + "tokens_seen": 2592989184 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001083149448345035, + "loss": 2.5074, + "theoretical_loss": 3.358753675201407, + "tokens_seen": 2593054720 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001083049147442327, + "loss": 2.4934, + "theoretical_loss": 3.3587469535391348, + "tokens_seen": 2593120256 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010829488465396189, + "loss": 2.3059, + "theoretical_loss": 3.358740232094301, + "tokens_seen": 2593185792 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010828485456369109, + "loss": 2.4268, + "theoretical_loss": 3.3587335108668936, + "tokens_seen": 2593251328 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010827482447342027, + "loss": 2.4165, + "theoretical_loss": 3.3587267898568998, + "tokens_seen": 2593316864 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010826479438314946, + "loss": 2.3786, + "theoretical_loss": 3.358720069064307, + "tokens_seen": 2593382400 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010825476429287864, + "loss": 2.6027, + "theoretical_loss": 3.358713348489103, + "tokens_seen": 2593447936 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010824473420260782, + "loss": 2.4528, + "theoretical_loss": 3.3587066281312756, + "tokens_seen": 2593513472 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2848972, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3333914279937744, + "objective/train/theoretical_loss": 3.358699907990811, + "objective/train/tokens_used": 2614039008, + "theoretical_loss": 3.358699907990811, + "tokens_seen": 2593579008 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010823470411233702, + "loss": 2.4748, + "theoretical_loss": 3.358699907990811, + "tokens_seen": 2593579008 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001082246740220662, + "loss": 2.4567, + "theoretical_loss": 3.358693188067698, + "tokens_seen": 2593644544 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001082146439317954, + "loss": 2.2818, + "theoretical_loss": 3.358686468361923, + "tokens_seen": 2593710080 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010820461384152458, + "loss": 2.0654, + "theoretical_loss": 3.3586797488734743, + "tokens_seen": 2593775616 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010819458375125377, + "loss": 2.4594, + "theoretical_loss": 3.358673029602339, + "tokens_seen": 2593841152 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010818455366098295, + "loss": 2.4055, + "theoretical_loss": 3.3586663105485046, + "tokens_seen": 2593906688 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010817452357071215, + "loss": 2.6426, + "theoretical_loss": 3.358659591711959, + "tokens_seen": 2593972224 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010816449348044133, + "loss": 2.6069, + "theoretical_loss": 3.358652873092689, + "tokens_seen": 2594037760 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010815446339017051, + "loss": 2.3901, + "theoretical_loss": 3.3586461546906827, + "tokens_seen": 2594103296 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001081444332998997, + "loss": 2.144, + "theoretical_loss": 3.358639436505927, + "tokens_seen": 2594168832 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010813440320962888, + "loss": 2.2869, + "theoretical_loss": 3.35863271853841, + "tokens_seen": 2594234368 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010812437311935808, + "loss": 2.5024, + "theoretical_loss": 3.3586260007881186, + "tokens_seen": 2594299904 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010811434302908726, + "loss": 2.5352, + "theoretical_loss": 3.3586192832550408, + "tokens_seen": 2594365440 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010810431293881646, + "loss": 2.5805, + "theoretical_loss": 3.358612565939164, + "tokens_seen": 2594430976 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010809428284854564, + "loss": 2.5116, + "theoretical_loss": 3.358605848840475, + "tokens_seen": 2594496512 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010808425275827483, + "loss": 2.4744, + "theoretical_loss": 3.3585991319589623, + "tokens_seen": 2594562048 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010807422266800401, + "loss": 2.4402, + "theoretical_loss": 3.3585924152946127, + "tokens_seen": 2594627584 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010806419257773321, + "loss": 2.4862, + "theoretical_loss": 3.3585856988474143, + "tokens_seen": 2594693120 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010805416248746239, + "loss": 2.4008, + "theoretical_loss": 3.3585789826173538, + "tokens_seen": 2594758656 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010804413239719157, + "loss": 2.5104, + "theoretical_loss": 3.358572266604419, + "tokens_seen": 2594824192 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010803410230692076, + "loss": 2.5669, + "theoretical_loss": 3.358565550808598, + "tokens_seen": 2594889728 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010802407221664995, + "loss": 2.5357, + "theoretical_loss": 3.3585588352298776, + "tokens_seen": 2594955264 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010801404212637914, + "loss": 2.2981, + "theoretical_loss": 3.3585521198682455, + "tokens_seen": 2595020800 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010800401203610832, + "loss": 2.3511, + "theoretical_loss": 3.358545404723689, + "tokens_seen": 2595086336 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010799398194583752, + "loss": 2.5575, + "theoretical_loss": 3.3585386897961964, + "tokens_seen": 2595151872 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2849522, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3405961990356445, + "objective/train/theoretical_loss": 3.358531975085754, + "objective/train/tokens_used": 2615677408, + "theoretical_loss": 3.358531975085754, + "tokens_seen": 2595217408 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001079839518555667, + "loss": 2.3409, + "theoretical_loss": 3.358531975085754, + "tokens_seen": 2595217408 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010797392176529589, + "loss": 2.5019, + "theoretical_loss": 3.35852526059235, + "tokens_seen": 2595282944 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010796389167502507, + "loss": 2.5063, + "theoretical_loss": 3.358518546315972, + "tokens_seen": 2595348480 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010795386158475425, + "loss": 2.6058, + "theoretical_loss": 3.3585118322566077, + "tokens_seen": 2595414016 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010794383149448345, + "loss": 2.4002, + "theoretical_loss": 3.358505118414244, + "tokens_seen": 2595479552 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010793380140421263, + "loss": 2.2386, + "theoretical_loss": 3.358498404788868, + "tokens_seen": 2595545088 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010792377131394182, + "loss": 2.559, + "theoretical_loss": 3.3584916913804688, + "tokens_seen": 2595610624 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010791374122367102, + "loss": 2.3919, + "theoretical_loss": 3.3584849781890327, + "tokens_seen": 2595676160 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010790371113340021, + "loss": 2.3776, + "theoretical_loss": 3.358478265214547, + "tokens_seen": 2595741696 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001078936810431294, + "loss": 2.5164, + "theoretical_loss": 3.3584715524570004, + "tokens_seen": 2595807232 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010788365095285859, + "loss": 2.3263, + "theoretical_loss": 3.3584648399163792, + "tokens_seen": 2595872768 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010787362086258777, + "loss": 2.5455, + "theoretical_loss": 3.3584581275926713, + "tokens_seen": 2595938304 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010786359077231695, + "loss": 2.3286, + "theoretical_loss": 3.358451415485865, + "tokens_seen": 2596003840 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010785356068204615, + "loss": 2.4007, + "theoretical_loss": 3.3584447035959464, + "tokens_seen": 2596069376 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010784353059177533, + "loss": 2.4968, + "theoretical_loss": 3.358437991922904, + "tokens_seen": 2596134912 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010783350050150452, + "loss": 2.3705, + "theoretical_loss": 3.3584312804667253, + "tokens_seen": 2596200448 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001078234704112337, + "loss": 2.307, + "theoretical_loss": 3.3584245692273975, + "tokens_seen": 2596265984 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001078134403209629, + "loss": 2.3268, + "theoretical_loss": 3.3584178582049082, + "tokens_seen": 2596331520 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010780341023069208, + "loss": 2.342, + "theoretical_loss": 3.358411147399245, + "tokens_seen": 2596397056 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010779338014042127, + "loss": 2.3888, + "theoretical_loss": 3.358404436810395, + "tokens_seen": 2596462592 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010778335005015045, + "loss": 2.5575, + "theoretical_loss": 3.3583977264383464, + "tokens_seen": 2596528128 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010777331995987965, + "loss": 2.5354, + "theoretical_loss": 3.358391016283086, + "tokens_seen": 2596593664 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010776328986960883, + "loss": 2.362, + "theoretical_loss": 3.358384306344602, + "tokens_seen": 2596659200 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010775325977933801, + "loss": 2.5239, + "theoretical_loss": 3.358377596622882, + "tokens_seen": 2596724736 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001077432296890672, + "loss": 2.5342, + "theoretical_loss": 3.358370887117913, + "tokens_seen": 2596790272 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2850974, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.025660514831543, + "objective/train/theoretical_loss": 3.3583641778296824, + "objective/train/tokens_used": 2617315808, + "theoretical_loss": 3.3583641778296824, + "tokens_seen": 2596855808 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010773319959879639, + "loss": 2.3497, + "theoretical_loss": 3.3583641778296824, + "tokens_seen": 2596855808 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010772316950852558, + "loss": 2.3508, + "theoretical_loss": 3.3583574687581788, + "tokens_seen": 2596921344 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010771313941825476, + "loss": 2.4632, + "theoretical_loss": 3.3583507599033884, + "tokens_seen": 2596986880 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010770310932798396, + "loss": 2.3638, + "theoretical_loss": 3.3583440512652993, + "tokens_seen": 2597052416 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010769307923771314, + "loss": 2.5416, + "theoretical_loss": 3.3583373428438987, + "tokens_seen": 2597117952 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010768304914744233, + "loss": 2.3748, + "theoretical_loss": 3.358330634639175, + "tokens_seen": 2597183488 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010767301905717151, + "loss": 2.4661, + "theoretical_loss": 3.3583239266511153, + "tokens_seen": 2597249024 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001076629889669007, + "loss": 2.558, + "theoretical_loss": 3.358317218879707, + "tokens_seen": 2597314560 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010765295887662989, + "loss": 2.5499, + "theoretical_loss": 3.358310511324937, + "tokens_seen": 2597380096 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010764292878635907, + "loss": 2.5779, + "theoretical_loss": 3.358303803986794, + "tokens_seen": 2597445632 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010763289869608827, + "loss": 2.4792, + "theoretical_loss": 3.358297096865265, + "tokens_seen": 2597511168 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010762286860581745, + "loss": 2.3244, + "theoretical_loss": 3.3582903899603376, + "tokens_seen": 2597576704 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010761283851554664, + "loss": 2.4661, + "theoretical_loss": 3.358283683271999, + "tokens_seen": 2597642240 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010760280842527582, + "loss": 2.1949, + "theoretical_loss": 3.3582769768002376, + "tokens_seen": 2597707776 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010759277833500502, + "loss": 2.4947, + "theoretical_loss": 3.3582702705450402, + "tokens_seen": 2597773312 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001075827482447342, + "loss": 2.5679, + "theoretical_loss": 3.358263564506395, + "tokens_seen": 2597838848 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010757271815446338, + "loss": 2.4342, + "theoretical_loss": 3.3582568586842885, + "tokens_seen": 2597904384 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010756268806419257, + "loss": 2.385, + "theoretical_loss": 3.358250153078709, + "tokens_seen": 2597969920 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010755265797392176, + "loss": 2.4687, + "theoretical_loss": 3.358243447689644, + "tokens_seen": 2598035456 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010754262788365096, + "loss": 2.2, + "theoretical_loss": 3.358236742517081, + "tokens_seen": 2598100992 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010753259779338015, + "loss": 2.4996, + "theoretical_loss": 3.3582300375610075, + "tokens_seen": 2598166528 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010752256770310934, + "loss": 2.3886, + "theoretical_loss": 3.3582233328214106, + "tokens_seen": 2598232064 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010751253761283852, + "loss": 2.3749, + "theoretical_loss": 3.358216628298279, + "tokens_seen": 2598297600 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010750250752256772, + "loss": 2.4193, + "theoretical_loss": 3.358209923991599, + "tokens_seen": 2598363136 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001074924774322969, + "loss": 2.4556, + "theoretical_loss": 3.3582032199013585, + "tokens_seen": 2598428672 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2851684, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.355797290802002, + "objective/train/theoretical_loss": 3.3581965160275455, + "objective/train/tokens_used": 2618954208, + "theoretical_loss": 3.3581965160275455, + "tokens_seen": 2598494208 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010748244734202609, + "loss": 2.3777, + "theoretical_loss": 3.3581965160275455, + "tokens_seen": 2598494208 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010747241725175527, + "loss": 2.1659, + "theoretical_loss": 3.3581898123701475, + "tokens_seen": 2598559744 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010746238716148445, + "loss": 2.7358, + "theoretical_loss": 3.358183108929152, + "tokens_seen": 2598625280 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010745235707121365, + "loss": 2.5151, + "theoretical_loss": 3.358176405704546, + "tokens_seen": 2598690816 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010744232698094283, + "loss": 2.2498, + "theoretical_loss": 3.3581697026963178, + "tokens_seen": 2598756352 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010743229689067202, + "loss": 2.4382, + "theoretical_loss": 3.3581629999044544, + "tokens_seen": 2598821888 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001074222668004012, + "loss": 2.6066, + "theoretical_loss": 3.358156297328944, + "tokens_seen": 2598887424 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001074122367101304, + "loss": 2.4805, + "theoretical_loss": 3.358149594969773, + "tokens_seen": 2598952960 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010740220661985958, + "loss": 2.6404, + "theoretical_loss": 3.35814289282693, + "tokens_seen": 2599018496 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010739217652958878, + "loss": 2.5636, + "theoretical_loss": 3.3581361909004026, + "tokens_seen": 2599084032 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010738214643931796, + "loss": 2.5316, + "theoretical_loss": 3.358129489190178, + "tokens_seen": 2599149568 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010737211634904714, + "loss": 2.3375, + "theoretical_loss": 3.3581227876962436, + "tokens_seen": 2599215104 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010736208625877633, + "loss": 2.4935, + "theoretical_loss": 3.3581160864185873, + "tokens_seen": 2599280640 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010735205616850551, + "loss": 2.2654, + "theoretical_loss": 3.3581093853571966, + "tokens_seen": 2599346176 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010734202607823471, + "loss": 2.2119, + "theoretical_loss": 3.358102684512059, + "tokens_seen": 2599411712 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010733199598796389, + "loss": 2.5056, + "theoretical_loss": 3.358095983883162, + "tokens_seen": 2599477248 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010732196589769308, + "loss": 2.6246, + "theoretical_loss": 3.358089283470493, + "tokens_seen": 2599542784 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010731193580742227, + "loss": 2.5908, + "theoretical_loss": 3.35808258327404, + "tokens_seen": 2599608320 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010730190571715146, + "loss": 2.528, + "theoretical_loss": 3.3580758832937905, + "tokens_seen": 2599673856 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010729187562688064, + "loss": 2.4959, + "theoretical_loss": 3.3580691835297323, + "tokens_seen": 2599739392 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010728184553660982, + "loss": 2.5313, + "theoretical_loss": 3.3580624839818523, + "tokens_seen": 2599804928 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010727181544633902, + "loss": 2.343, + "theoretical_loss": 3.3580557846501384, + "tokens_seen": 2599870464 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001072617853560682, + "loss": 2.6551, + "theoretical_loss": 3.358049085534578, + "tokens_seen": 2599936000 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010725175526579739, + "loss": 2.4251, + "theoretical_loss": 3.358042386635159, + "tokens_seen": 2600001536 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010724172517552657, + "loss": 2.4537, + "theoretical_loss": 3.358035687951869, + "tokens_seen": 2600067072 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2852991, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.296438455581665, + "objective/train/theoretical_loss": 3.3580289894846955, + "objective/train/tokens_used": 2620592608, + "theoretical_loss": 3.3580289894846955, + "tokens_seen": 2600132608 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010723169508525577, + "loss": 2.3104, + "theoretical_loss": 3.3580289894846955, + "tokens_seen": 2600132608 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010722166499498495, + "loss": 2.4532, + "theoretical_loss": 3.3580222912336257, + "tokens_seen": 2600198144 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010721163490471414, + "loss": 2.4713, + "theoretical_loss": 3.358015593198648, + "tokens_seen": 2600263680 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010720160481444333, + "loss": 2.4931, + "theoretical_loss": 3.358008895379749, + "tokens_seen": 2600329216 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010719157472417252, + "loss": 2.7373, + "theoretical_loss": 3.358002197776917, + "tokens_seen": 2600394752 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001071815446339017, + "loss": 2.7324, + "theoretical_loss": 3.357995500390139, + "tokens_seen": 2600460288 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010717151454363088, + "loss": 2.4035, + "theoretical_loss": 3.3579888032194036, + "tokens_seen": 2600525824 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010716148445336009, + "loss": 2.5138, + "theoretical_loss": 3.3579821062646973, + "tokens_seen": 2600591360 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010715145436308927, + "loss": 2.5689, + "theoretical_loss": 3.357975409526008, + "tokens_seen": 2600656896 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010714142427281847, + "loss": 2.4636, + "theoretical_loss": 3.3579687130033236, + "tokens_seen": 2600722432 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010713139418254765, + "loss": 2.491, + "theoretical_loss": 3.3579620166966313, + "tokens_seen": 2600787968 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010712136409227684, + "loss": 2.4503, + "theoretical_loss": 3.357955320605919, + "tokens_seen": 2600853504 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010711133400200602, + "loss": 2.68, + "theoretical_loss": 3.357948624731174, + "tokens_seen": 2600919040 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010710130391173522, + "loss": 2.4701, + "theoretical_loss": 3.357941929072384, + "tokens_seen": 2600984576 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001070912738214644, + "loss": 2.275, + "theoretical_loss": 3.3579352336295374, + "tokens_seen": 2601050112 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010708124373119358, + "loss": 2.5504, + "theoretical_loss": 3.35792853840262, + "tokens_seen": 2601115648 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010707121364092277, + "loss": 2.4255, + "theoretical_loss": 3.357921843391621, + "tokens_seen": 2601181184 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010706118355065196, + "loss": 2.5215, + "theoretical_loss": 3.3579151485965273, + "tokens_seen": 2601246720 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010705115346038115, + "loss": 2.4509, + "theoretical_loss": 3.357908454017327, + "tokens_seen": 2601312256 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010704112337011033, + "loss": 2.5392, + "theoretical_loss": 3.357901759654007, + "tokens_seen": 2601377792 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010703109327983953, + "loss": 2.338, + "theoretical_loss": 3.3578950655065554, + "tokens_seen": 2601443328 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010702106318956871, + "loss": 2.1733, + "theoretical_loss": 3.3578883715749592, + "tokens_seen": 2601508864 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001070110330992979, + "loss": 2.5425, + "theoretical_loss": 3.357881677859207, + "tokens_seen": 2601574400 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010700100300902708, + "loss": 2.4206, + "theoretical_loss": 3.3578749843592854, + "tokens_seen": 2601639936 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010699097291875628, + "loss": 2.3674, + "theoretical_loss": 3.357868291075183, + "tokens_seen": 2601705472 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2853385, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3864526748657227, + "objective/train/theoretical_loss": 3.3578615980068864, + "objective/train/tokens_used": 2622231008, + "theoretical_loss": 3.3578615980068864, + "tokens_seen": 2601771008 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010698094282848546, + "loss": 2.6018, + "theoretical_loss": 3.3578615980068864, + "tokens_seen": 2601771008 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010697091273821464, + "loss": 2.4474, + "theoretical_loss": 3.357854905154384, + "tokens_seen": 2601836544 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010696088264794383, + "loss": 2.368, + "theoretical_loss": 3.3578482125176627, + "tokens_seen": 2601902080 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010695085255767302, + "loss": 2.4136, + "theoretical_loss": 3.3578415200967107, + "tokens_seen": 2601967616 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010694082246740221, + "loss": 2.2701, + "theoretical_loss": 3.3578348278915153, + "tokens_seen": 2602033152 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010693079237713139, + "loss": 2.3977, + "theoretical_loss": 3.3578281359020643, + "tokens_seen": 2602098688 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010692076228686059, + "loss": 2.5909, + "theoretical_loss": 3.357821444128345, + "tokens_seen": 2602164224 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010691073219658977, + "loss": 2.4751, + "theoretical_loss": 3.3578147525703455, + "tokens_seen": 2602229760 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010690070210631896, + "loss": 2.4195, + "theoretical_loss": 3.3578080612280528, + "tokens_seen": 2602295296 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010689067201604814, + "loss": 2.5962, + "theoretical_loss": 3.357801370101455, + "tokens_seen": 2602360832 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010688064192577732, + "loss": 2.2734, + "theoretical_loss": 3.3577946791905395, + "tokens_seen": 2602426368 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010687061183550652, + "loss": 2.4134, + "theoretical_loss": 3.357787988495294, + "tokens_seen": 2602491904 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001068605817452357, + "loss": 2.2957, + "theoretical_loss": 3.3577812980157065, + "tokens_seen": 2602557440 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001068505516549649, + "loss": 2.7159, + "theoretical_loss": 3.3577746077517636, + "tokens_seen": 2602622976 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010684052156469408, + "loss": 2.4684, + "theoretical_loss": 3.357767917703454, + "tokens_seen": 2602688512 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010683049147442327, + "loss": 2.5677, + "theoretical_loss": 3.3577612278707645, + "tokens_seen": 2602754048 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010682046138415245, + "loss": 2.351, + "theoretical_loss": 3.3577545382536833, + "tokens_seen": 2602819584 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010681043129388165, + "loss": 2.4789, + "theoretical_loss": 3.357747848852198, + "tokens_seen": 2602885120 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010680040120361083, + "loss": 2.4066, + "theoretical_loss": 3.3577411596662956, + "tokens_seen": 2602950656 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010679037111334002, + "loss": 2.5231, + "theoretical_loss": 3.3577344706959646, + "tokens_seen": 2603016192 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010678034102306922, + "loss": 2.5558, + "theoretical_loss": 3.3577277819411915, + "tokens_seen": 2603081728 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001067703109327984, + "loss": 2.4159, + "theoretical_loss": 3.357721093401965, + "tokens_seen": 2603147264 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010676028084252759, + "loss": 2.2783, + "theoretical_loss": 3.357714405078273, + "tokens_seen": 2603212800 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010675025075225677, + "loss": 2.5866, + "theoretical_loss": 3.3577077169701015, + "tokens_seen": 2603278336 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010674022066198597, + "loss": 2.3001, + "theoretical_loss": 3.357701029077439, + "tokens_seen": 2603343872 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2854678, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2366559505462646, + "objective/train/theoretical_loss": 3.357694341400274, + "objective/train/tokens_used": 2623869408, + "theoretical_loss": 3.357694341400274, + "tokens_seen": 2603409408 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010673019057171515, + "loss": 2.4834, + "theoretical_loss": 3.357694341400274, + "tokens_seen": 2603409408 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010672016048144434, + "loss": 2.5915, + "theoretical_loss": 3.357687653938593, + "tokens_seen": 2603474944 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010671013039117353, + "loss": 2.3176, + "theoretical_loss": 3.3576809666923837, + "tokens_seen": 2603540480 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010670010030090272, + "loss": 2.628, + "theoretical_loss": 3.357674279661634, + "tokens_seen": 2603606016 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001066900702106319, + "loss": 2.3374, + "theoretical_loss": 3.357667592846332, + "tokens_seen": 2603671552 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010668004012036108, + "loss": 2.6668, + "theoretical_loss": 3.3576609062464646, + "tokens_seen": 2603737088 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010667001003009028, + "loss": 2.3095, + "theoretical_loss": 3.35765421986202, + "tokens_seen": 2603802624 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010665997993981946, + "loss": 2.5512, + "theoretical_loss": 3.3576475336929854, + "tokens_seen": 2603868160 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010664994984954865, + "loss": 2.5419, + "theoretical_loss": 3.3576408477393485, + "tokens_seen": 2603933696 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010663991975927783, + "loss": 2.6463, + "theoretical_loss": 3.357634162001097, + "tokens_seen": 2603999232 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010662988966900703, + "loss": 2.4168, + "theoretical_loss": 3.3576274764782186, + "tokens_seen": 2604064768 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010661985957873621, + "loss": 2.6454, + "theoretical_loss": 3.3576207911707012, + "tokens_seen": 2604130304 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001066098294884654, + "loss": 2.3632, + "theoretical_loss": 3.357614106078532, + "tokens_seen": 2604195840 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010659979939819459, + "loss": 2.4599, + "theoretical_loss": 3.3576074212016986, + "tokens_seen": 2604261376 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010658976930792377, + "loss": 2.348, + "theoretical_loss": 3.3576007365401894, + "tokens_seen": 2604326912 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010657973921765296, + "loss": 2.4468, + "theoretical_loss": 3.357594052093991, + "tokens_seen": 2604392448 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010656970912738214, + "loss": 2.3837, + "theoretical_loss": 3.3575873678630916, + "tokens_seen": 2604457984 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010655967903711134, + "loss": 2.3372, + "theoretical_loss": 3.357580683847479, + "tokens_seen": 2604523520 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010654964894684052, + "loss": 2.4563, + "theoretical_loss": 3.3575740000471406, + "tokens_seen": 2604589056 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010653961885656971, + "loss": 2.1905, + "theoretical_loss": 3.357567316462064, + "tokens_seen": 2604654592 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001065295887662989, + "loss": 2.5912, + "theoretical_loss": 3.3575606330922367, + "tokens_seen": 2604720128 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010651955867602809, + "loss": 2.4947, + "theoretical_loss": 3.357553949937647, + "tokens_seen": 2604785664 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010650952858575727, + "loss": 2.6684, + "theoretical_loss": 3.3575472669982824, + "tokens_seen": 2604851200 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010649949849548645, + "loss": 2.5585, + "theoretical_loss": 3.3575405842741297, + "tokens_seen": 2604916736 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010648946840521565, + "loss": 2.3072, + "theoretical_loss": 3.3575339017651773, + "tokens_seen": 2604982272 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2855252, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1690635681152344, + "objective/train/theoretical_loss": 3.3575272194714127, + "objective/train/tokens_used": 2625507808, + "theoretical_loss": 3.3575272194714127, + "tokens_seen": 2605047808 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010647943831494483, + "loss": 2.5751, + "theoretical_loss": 3.3575272194714127, + "tokens_seen": 2605047808 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010646940822467402, + "loss": 2.2512, + "theoretical_loss": 3.357520537392824, + "tokens_seen": 2605113344 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001064593781344032, + "loss": 2.6462, + "theoretical_loss": 3.3575138555293984, + "tokens_seen": 2605178880 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001064493480441324, + "loss": 2.258, + "theoretical_loss": 3.357507173881123, + "tokens_seen": 2605244416 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010643931795386158, + "loss": 2.3197, + "theoretical_loss": 3.3575004924479863, + "tokens_seen": 2605309952 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010642928786359077, + "loss": 2.6933, + "theoretical_loss": 3.357493811229976, + "tokens_seen": 2605375488 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010641925777331995, + "loss": 2.5066, + "theoretical_loss": 3.3574871302270792, + "tokens_seen": 2605441024 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010640922768304916, + "loss": 2.2516, + "theoretical_loss": 3.357480449439284, + "tokens_seen": 2605506560 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010639919759277834, + "loss": 2.3892, + "theoretical_loss": 3.357473768866578, + "tokens_seen": 2605572096 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010638916750250752, + "loss": 2.4719, + "theoretical_loss": 3.3574670885089484, + "tokens_seen": 2605637632 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010637913741223672, + "loss": 2.7088, + "theoretical_loss": 3.3574604083663835, + "tokens_seen": 2605703168 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001063691073219659, + "loss": 2.5024, + "theoretical_loss": 3.3574537284388706, + "tokens_seen": 2605768704 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001063590772316951, + "loss": 2.5619, + "theoretical_loss": 3.3574470487263977, + "tokens_seen": 2605834240 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010634904714142428, + "loss": 2.4758, + "theoretical_loss": 3.357440369228952, + "tokens_seen": 2605899776 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010633901705115347, + "loss": 2.5732, + "theoretical_loss": 3.3574336899465216, + "tokens_seen": 2605965312 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010632898696088265, + "loss": 2.4848, + "theoretical_loss": 3.3574270108790936, + "tokens_seen": 2606030848 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010631895687061185, + "loss": 2.502, + "theoretical_loss": 3.3574203320266567, + "tokens_seen": 2606096384 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010630892678034103, + "loss": 2.6095, + "theoretical_loss": 3.3574136533891976, + "tokens_seen": 2606161920 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010629889669007021, + "loss": 2.5066, + "theoretical_loss": 3.357406974966704, + "tokens_seen": 2606227456 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001062888665997994, + "loss": 2.6733, + "theoretical_loss": 3.357400296759164, + "tokens_seen": 2606292992 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010627883650952858, + "loss": 2.3846, + "theoretical_loss": 3.3573936187665656, + "tokens_seen": 2606358528 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010626880641925778, + "loss": 2.545, + "theoretical_loss": 3.3573869409888957, + "tokens_seen": 2606424064 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010625877632898696, + "loss": 2.6155, + "theoretical_loss": 3.3573802634261423, + "tokens_seen": 2606489600 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010624874623871616, + "loss": 2.7045, + "theoretical_loss": 3.3573735860782934, + "tokens_seen": 2606555136 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010623871614844534, + "loss": 2.5166, + "theoretical_loss": 3.357366908945336, + "tokens_seen": 2606620672 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2856339, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.907118797302246, + "objective/train/theoretical_loss": 3.3573602320272586, + "objective/train/tokens_used": 2627146208, + "theoretical_loss": 3.3573602320272586, + "tokens_seen": 2606686208 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010622868605817453, + "loss": 2.6593, + "theoretical_loss": 3.3573602320272586, + "tokens_seen": 2606686208 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010621865596790371, + "loss": 2.5021, + "theoretical_loss": 3.357353555324048, + "tokens_seen": 2606751744 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001062086258776329, + "loss": 2.5112, + "theoretical_loss": 3.3573468788356924, + "tokens_seen": 2606817280 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010619859578736209, + "loss": 2.5341, + "theoretical_loss": 3.3573402025621792, + "tokens_seen": 2606882816 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010618856569709127, + "loss": 2.5973, + "theoretical_loss": 3.3573335265034965, + "tokens_seen": 2606948352 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010617853560682046, + "loss": 2.6751, + "theoretical_loss": 3.357326850659632, + "tokens_seen": 2607013888 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010616850551654964, + "loss": 2.4128, + "theoretical_loss": 3.3573201750305732, + "tokens_seen": 2607079424 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010615847542627884, + "loss": 2.522, + "theoretical_loss": 3.3573134996163074, + "tokens_seen": 2607144960 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010614844533600802, + "loss": 2.6066, + "theoretical_loss": 3.357306824416823, + "tokens_seen": 2607210496 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010613841524573722, + "loss": 2.5126, + "theoretical_loss": 3.357300149432107, + "tokens_seen": 2607276032 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001061283851554664, + "loss": 2.4803, + "theoretical_loss": 3.3572934746621477, + "tokens_seen": 2607341568 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010611835506519559, + "loss": 2.5555, + "theoretical_loss": 3.3572868001069325, + "tokens_seen": 2607407104 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010610832497492477, + "loss": 2.4311, + "theoretical_loss": 3.357280125766449, + "tokens_seen": 2607472640 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010609829488465395, + "loss": 2.3693, + "theoretical_loss": 3.357273451640685, + "tokens_seen": 2607538176 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010608826479438315, + "loss": 2.6117, + "theoretical_loss": 3.3572667777296283, + "tokens_seen": 2607603712 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010607823470411233, + "loss": 2.3847, + "theoretical_loss": 3.3572601040332666, + "tokens_seen": 2607669248 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010606820461384152, + "loss": 2.6041, + "theoretical_loss": 3.3572534305515873, + "tokens_seen": 2607734784 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001060581745235707, + "loss": 2.4933, + "theoretical_loss": 3.357246757284578, + "tokens_seen": 2607800320 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001060481444332999, + "loss": 2.5779, + "theoretical_loss": 3.3572400842322274, + "tokens_seen": 2607865856 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001060381143430291, + "loss": 2.5091, + "theoretical_loss": 3.357233411394522, + "tokens_seen": 2607931392 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010602808425275829, + "loss": 2.4188, + "theoretical_loss": 3.35722673877145, + "tokens_seen": 2607996928 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010601805416248747, + "loss": 2.5331, + "theoretical_loss": 3.3572200663629994, + "tokens_seen": 2608062464 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010600802407221665, + "loss": 2.5847, + "theoretical_loss": 3.3572133941691575, + "tokens_seen": 2608128000 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010599799398194585, + "loss": 2.3958, + "theoretical_loss": 3.357206722189912, + "tokens_seen": 2608193536 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010598796389167503, + "loss": 2.4465, + "theoretical_loss": 3.3572000504252513, + "tokens_seen": 2608259072 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2857058, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.876957416534424, + "objective/train/theoretical_loss": 3.357193378875162, + "objective/train/tokens_used": 2628784608, + "theoretical_loss": 3.357193378875162, + "tokens_seen": 2608324608 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010597793380140422, + "loss": 2.4708, + "theoretical_loss": 3.357193378875162, + "tokens_seen": 2608324608 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001059679037111334, + "loss": 2.5732, + "theoretical_loss": 3.357186707539632, + "tokens_seen": 2608390144 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001059578736208626, + "loss": 2.7007, + "theoretical_loss": 3.35718003641865, + "tokens_seen": 2608455680 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010594784353059178, + "loss": 2.5422, + "theoretical_loss": 3.357173365512203, + "tokens_seen": 2608521216 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010593781344032097, + "loss": 2.6024, + "theoretical_loss": 3.3571666948202785, + "tokens_seen": 2608586752 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010592778335005015, + "loss": 2.3566, + "theoretical_loss": 3.3571600243428645, + "tokens_seen": 2608652288 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010591775325977935, + "loss": 2.5887, + "theoretical_loss": 3.3571533540799487, + "tokens_seen": 2608717824 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010590772316950853, + "loss": 2.4658, + "theoretical_loss": 3.3571466840315187, + "tokens_seen": 2608783360 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010589769307923771, + "loss": 2.5822, + "theoretical_loss": 3.3571400141975625, + "tokens_seen": 2608848896 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001058876629889669, + "loss": 2.67, + "theoretical_loss": 3.3571333445780676, + "tokens_seen": 2608914432 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010587763289869609, + "loss": 2.5821, + "theoretical_loss": 3.357126675173022, + "tokens_seen": 2608979968 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010586760280842528, + "loss": 2.4543, + "theoretical_loss": 3.357120005982413, + "tokens_seen": 2609045504 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010585757271815446, + "loss": 2.5036, + "theoretical_loss": 3.3571133370062283, + "tokens_seen": 2609111040 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010584754262788366, + "loss": 2.408, + "theoretical_loss": 3.357106668244456, + "tokens_seen": 2609176576 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010583751253761284, + "loss": 2.3125, + "theoretical_loss": 3.3570999996970836, + "tokens_seen": 2609242112 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010582748244734203, + "loss": 2.4892, + "theoretical_loss": 3.3570933313640987, + "tokens_seen": 2609307648 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010581745235707121, + "loss": 2.4257, + "theoretical_loss": 3.3570866632454894, + "tokens_seen": 2609373184 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001058074222668004, + "loss": 2.4423, + "theoretical_loss": 3.3570799953412434, + "tokens_seen": 2609438720 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010579739217652959, + "loss": 2.4851, + "theoretical_loss": 3.357073327651348, + "tokens_seen": 2609504256 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010578736208625877, + "loss": 2.5917, + "theoretical_loss": 3.357066660175791, + "tokens_seen": 2609569792 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010577733199598797, + "loss": 2.5992, + "theoretical_loss": 3.3570599929145604, + "tokens_seen": 2609635328 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010576730190571715, + "loss": 2.5864, + "theoretical_loss": 3.357053325867644, + "tokens_seen": 2609700864 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010575727181544634, + "loss": 2.4519, + "theoretical_loss": 3.357046659035029, + "tokens_seen": 2609766400 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010574724172517552, + "loss": 2.5212, + "theoretical_loss": 3.357039992416704, + "tokens_seen": 2609831936 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010573721163490472, + "loss": 2.4136, + "theoretical_loss": 3.3570333260126555, + "tokens_seen": 2609897472 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2858452, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4414010047912598, + "objective/train/theoretical_loss": 3.357026659822872, + "objective/train/tokens_used": 2630423008, + "theoretical_loss": 3.357026659822872, + "tokens_seen": 2609963008 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001057271815446339, + "loss": 2.5734, + "theoretical_loss": 3.357026659822872, + "tokens_seen": 2609963008 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010571715145436308, + "loss": 2.5315, + "theoretical_loss": 3.357019993847342, + "tokens_seen": 2610028544 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010570712136409227, + "loss": 2.5274, + "theoretical_loss": 3.357013328086052, + "tokens_seen": 2610094080 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010569709127382146, + "loss": 2.6868, + "theoretical_loss": 3.35700666253899, + "tokens_seen": 2610159616 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010568706118355065, + "loss": 2.4695, + "theoretical_loss": 3.3569999972061435, + "tokens_seen": 2610225152 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010567703109327983, + "loss": 2.3873, + "theoretical_loss": 3.356993332087501, + "tokens_seen": 2610290688 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010566700100300904, + "loss": 2.2495, + "theoretical_loss": 3.35698666718305, + "tokens_seen": 2610356224 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010565697091273822, + "loss": 2.5308, + "theoretical_loss": 3.356980002492778, + "tokens_seen": 2610421760 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010564694082246742, + "loss": 2.502, + "theoretical_loss": 3.356973338016673, + "tokens_seen": 2610487296 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001056369107321966, + "loss": 2.657, + "theoretical_loss": 3.356966673754722, + "tokens_seen": 2610552832 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010562688064192579, + "loss": 2.6247, + "theoretical_loss": 3.356960009706914, + "tokens_seen": 2610618368 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010561685055165497, + "loss": 2.6322, + "theoretical_loss": 3.3569533458732357, + "tokens_seen": 2610683904 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010560682046138415, + "loss": 2.3573, + "theoretical_loss": 3.356946682253675, + "tokens_seen": 2610749440 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010559679037111335, + "loss": 2.4007, + "theoretical_loss": 3.35694001884822, + "tokens_seen": 2610814976 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010558676028084253, + "loss": 2.6589, + "theoretical_loss": 3.3569333556568584, + "tokens_seen": 2610880512 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010557673019057172, + "loss": 2.6309, + "theoretical_loss": 3.3569266926795778, + "tokens_seen": 2610946048 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001055667001003009, + "loss": 2.4985, + "theoretical_loss": 3.356920029916366, + "tokens_seen": 2611011584 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001055566700100301, + "loss": 2.5526, + "theoretical_loss": 3.356913367367211, + "tokens_seen": 2611077120 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010554663991975928, + "loss": 2.5746, + "theoretical_loss": 3.3569067050320998, + "tokens_seen": 2611142656 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010553660982948848, + "loss": 2.6478, + "theoretical_loss": 3.356900042911021, + "tokens_seen": 2611208192 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010552657973921766, + "loss": 2.437, + "theoretical_loss": 3.356893381003962, + "tokens_seen": 2611273728 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010551654964894684, + "loss": 2.5114, + "theoretical_loss": 3.35688671931091, + "tokens_seen": 2611339264 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010550651955867603, + "loss": 2.432, + "theoretical_loss": 3.3568800578318543, + "tokens_seen": 2611404800 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010549648946840521, + "loss": 2.4939, + "theoretical_loss": 3.356873396566781, + "tokens_seen": 2611470336 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010548645937813441, + "loss": 2.4111, + "theoretical_loss": 3.3568667355156787, + "tokens_seen": 2611535872 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2859162, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.160414457321167, + "objective/train/theoretical_loss": 3.3568600746785346, + "objective/train/tokens_used": 2632061408, + "theoretical_loss": 3.3568600746785346, + "tokens_seen": 2611601408 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010547642928786359, + "loss": 2.5618, + "theoretical_loss": 3.3568600746785346, + "tokens_seen": 2611601408 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010546639919759278, + "loss": 2.4985, + "theoretical_loss": 3.356853414055337, + "tokens_seen": 2611666944 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010545636910732197, + "loss": 2.5176, + "theoretical_loss": 3.3568467536460735, + "tokens_seen": 2611732480 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010544633901705116, + "loss": 2.7021, + "theoretical_loss": 3.356840093450732, + "tokens_seen": 2611798016 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010543630892678034, + "loss": 2.5328, + "theoretical_loss": 3.3568334334693004, + "tokens_seen": 2611863552 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010542627883650952, + "loss": 2.1973, + "theoretical_loss": 3.3568267737017656, + "tokens_seen": 2611929088 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010541624874623872, + "loss": 2.5905, + "theoretical_loss": 3.3568201141481167, + "tokens_seen": 2611994624 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001054062186559679, + "loss": 2.5308, + "theoretical_loss": 3.35681345480834, + "tokens_seen": 2612060160 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010539618856569709, + "loss": 2.6033, + "theoretical_loss": 3.356806795682424, + "tokens_seen": 2612125696 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010538615847542627, + "loss": 2.4609, + "theoretical_loss": 3.3568001367703566, + "tokens_seen": 2612191232 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010537612838515547, + "loss": 2.4879, + "theoretical_loss": 3.3567934780721256, + "tokens_seen": 2612256768 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010536609829488465, + "loss": 2.5613, + "theoretical_loss": 3.356786819587718, + "tokens_seen": 2612322304 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010535606820461384, + "loss": 2.6006, + "theoretical_loss": 3.356780161317123, + "tokens_seen": 2612387840 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010534603811434303, + "loss": 2.6134, + "theoretical_loss": 3.3567735032603268, + "tokens_seen": 2612453376 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010533600802407222, + "loss": 2.3114, + "theoretical_loss": 3.3567668454173183, + "tokens_seen": 2612518912 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001053259779338014, + "loss": 2.5076, + "theoretical_loss": 3.3567601877880846, + "tokens_seen": 2612584448 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010531594784353058, + "loss": 2.5072, + "theoretical_loss": 3.3567535303726137, + "tokens_seen": 2612649984 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010530591775325978, + "loss": 2.4612, + "theoretical_loss": 3.3567468731708936, + "tokens_seen": 2612715520 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010529588766298896, + "loss": 2.5672, + "theoretical_loss": 3.356740216182912, + "tokens_seen": 2612781056 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010528585757271817, + "loss": 2.7538, + "theoretical_loss": 3.3567335594086565, + "tokens_seen": 2612846592 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010527582748244735, + "loss": 2.4656, + "theoretical_loss": 3.3567269028481146, + "tokens_seen": 2612912128 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010526579739217654, + "loss": 2.5629, + "theoretical_loss": 3.3567202465012747, + "tokens_seen": 2612977664 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010525576730190572, + "loss": 2.5853, + "theoretical_loss": 3.3567135903681242, + "tokens_seen": 2613043200 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010524573721163492, + "loss": 2.5334, + "theoretical_loss": 3.356706934448651, + "tokens_seen": 2613108736 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001052357071213641, + "loss": 2.3746, + "theoretical_loss": 3.356700278742843, + "tokens_seen": 2613174272 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2860611, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5662970542907715, + "objective/train/theoretical_loss": 3.3566936232506874, + "objective/train/tokens_used": 2633699808, + "theoretical_loss": 3.3566936232506874, + "tokens_seen": 2613239808 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010522567703109328, + "loss": 2.6107, + "theoretical_loss": 3.3566936232506874, + "tokens_seen": 2613239808 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010521564694082247, + "loss": 2.4853, + "theoretical_loss": 3.356686967972173, + "tokens_seen": 2613305344 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010520561685055166, + "loss": 2.636, + "theoretical_loss": 3.3566803129072866, + "tokens_seen": 2613370880 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010519558676028085, + "loss": 2.5207, + "theoretical_loss": 3.3566736580560166, + "tokens_seen": 2613436416 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010518555667001003, + "loss": 2.3633, + "theoretical_loss": 3.3566670034183503, + "tokens_seen": 2613501952 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010517552657973923, + "loss": 2.4953, + "theoretical_loss": 3.3566603489942763, + "tokens_seen": 2613567488 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010516549648946841, + "loss": 2.5193, + "theoretical_loss": 3.3566536947837813, + "tokens_seen": 2613633024 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001051554663991976, + "loss": 2.5158, + "theoretical_loss": 3.356647040786854, + "tokens_seen": 2613698560 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010514543630892678, + "loss": 2.5023, + "theoretical_loss": 3.3566403870034818, + "tokens_seen": 2613764096 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010513540621865598, + "loss": 2.4076, + "theoretical_loss": 3.3566337334336525, + "tokens_seen": 2613829632 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010512537612838516, + "loss": 2.6243, + "theoretical_loss": 3.356627080077354, + "tokens_seen": 2613895168 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010511534603811434, + "loss": 2.3963, + "theoretical_loss": 3.356620426934574, + "tokens_seen": 2613960704 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010510531594784353, + "loss": 2.3786, + "theoretical_loss": 3.3566137740053, + "tokens_seen": 2614026240 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010509528585757272, + "loss": 2.7334, + "theoretical_loss": 3.3566071212895205, + "tokens_seen": 2614091776 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010508525576730191, + "loss": 2.4032, + "theoretical_loss": 3.3566004687872226, + "tokens_seen": 2614157312 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010507522567703109, + "loss": 2.4824, + "theoretical_loss": 3.3565938164983944, + "tokens_seen": 2614222848 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010506519558676029, + "loss": 2.5721, + "theoretical_loss": 3.356587164423024, + "tokens_seen": 2614288384 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010505516549648947, + "loss": 2.4086, + "theoretical_loss": 3.3565805125610986, + "tokens_seen": 2614353920 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010504513540621866, + "loss": 2.2564, + "theoretical_loss": 3.3565738609126066, + "tokens_seen": 2614419456 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010503510531594784, + "loss": 2.5796, + "theoretical_loss": 3.3565672094775354, + "tokens_seen": 2614484992 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010502507522567702, + "loss": 2.8758, + "theoretical_loss": 3.3565605582558726, + "tokens_seen": 2614550528 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010501504513540622, + "loss": 2.5657, + "theoretical_loss": 3.3565539072476067, + "tokens_seen": 2614616064 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001050050150451354, + "loss": 2.5186, + "theoretical_loss": 3.3565472564527252, + "tokens_seen": 2614681600 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001049949849548646, + "loss": 2.6982, + "theoretical_loss": 3.3565406058712157, + "tokens_seen": 2614747136 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010498495486459378, + "loss": 2.5271, + "theoretical_loss": 3.3565339555030658, + "tokens_seen": 2614812672 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2861364, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.9855258464813232, + "objective/train/theoretical_loss": 3.356527305348264, + "objective/train/tokens_used": 2635338208, + "theoretical_loss": 3.356527305348264, + "tokens_seen": 2614878208 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010497492477432297, + "loss": 2.1523, + "theoretical_loss": 3.356527305348264, + "tokens_seen": 2614878208 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010496489468405215, + "loss": 2.4304, + "theoretical_loss": 3.3565206554067974, + "tokens_seen": 2614943744 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010495486459378135, + "loss": 2.6098, + "theoretical_loss": 3.3565140056786547, + "tokens_seen": 2615009280 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010494483450351053, + "loss": 2.3304, + "theoretical_loss": 3.3565073561638226, + "tokens_seen": 2615074816 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010493480441323971, + "loss": 2.5941, + "theoretical_loss": 3.35650070686229, + "tokens_seen": 2615140352 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001049247743229689, + "loss": 2.5435, + "theoretical_loss": 3.356494057774044, + "tokens_seen": 2615205888 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001049147442326981, + "loss": 2.4405, + "theoretical_loss": 3.3564874088990724, + "tokens_seen": 2615271424 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010490471414242729, + "loss": 2.3538, + "theoretical_loss": 3.356480760237363, + "tokens_seen": 2615336960 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010489468405215647, + "loss": 2.3361, + "theoretical_loss": 3.3564741117889043, + "tokens_seen": 2615402496 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010488465396188567, + "loss": 2.6487, + "theoretical_loss": 3.3564674635536837, + "tokens_seen": 2615468032 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010487462387161485, + "loss": 2.4429, + "theoretical_loss": 3.3564608155316886, + "tokens_seen": 2615533568 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010486459378134404, + "loss": 2.4642, + "theoretical_loss": 3.3564541677229074, + "tokens_seen": 2615599104 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010485456369107323, + "loss": 2.4998, + "theoretical_loss": 3.356447520127328, + "tokens_seen": 2615664640 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010484453360080242, + "loss": 2.4757, + "theoretical_loss": 3.3564408727449377, + "tokens_seen": 2615730176 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001048345035105316, + "loss": 2.312, + "theoretical_loss": 3.3564342255757245, + "tokens_seen": 2615795712 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010482447342026078, + "loss": 2.4641, + "theoretical_loss": 3.356427578619676, + "tokens_seen": 2615861248 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010481444332998998, + "loss": 2.5158, + "theoretical_loss": 3.3564209318767806, + "tokens_seen": 2615926784 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010480441323971916, + "loss": 2.4801, + "theoretical_loss": 3.356414285347026, + "tokens_seen": 2615992320 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010479438314944835, + "loss": 2.6165, + "theoretical_loss": 3.3564076390303996, + "tokens_seen": 2616057856 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010478435305917753, + "loss": 2.3312, + "theoretical_loss": 3.3564009929268894, + "tokens_seen": 2616123392 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010477432296890673, + "loss": 2.6188, + "theoretical_loss": 3.3563943470364834, + "tokens_seen": 2616188928 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010476429287863591, + "loss": 2.4514, + "theoretical_loss": 3.3563877013591696, + "tokens_seen": 2616254464 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001047542627883651, + "loss": 2.4736, + "theoretical_loss": 3.356381055894935, + "tokens_seen": 2616320000 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010474423269809429, + "loss": 2.5267, + "theoretical_loss": 3.3563744106437685, + "tokens_seen": 2616385536 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010473420260782347, + "loss": 2.5698, + "theoretical_loss": 3.356367765605657, + "tokens_seen": 2616451072 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2862831, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3601505756378174, + "objective/train/theoretical_loss": 3.3563611207805892, + "objective/train/tokens_used": 2636976608, + "theoretical_loss": 3.3563611207805892, + "tokens_seen": 2616516608 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010472417251755266, + "loss": 2.4609, + "theoretical_loss": 3.3563611207805892, + "tokens_seen": 2616516608 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010471414242728184, + "loss": 2.4261, + "theoretical_loss": 3.356354476168552, + "tokens_seen": 2616582144 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010470411233701104, + "loss": 2.3718, + "theoretical_loss": 3.356347831769534, + "tokens_seen": 2616647680 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010469408224674022, + "loss": 2.4938, + "theoretical_loss": 3.356341187583523, + "tokens_seen": 2616713216 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010468405215646941, + "loss": 2.4491, + "theoretical_loss": 3.3563345436105063, + "tokens_seen": 2616778752 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001046740220661986, + "loss": 2.3242, + "theoretical_loss": 3.356327899850472, + "tokens_seen": 2616844288 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010466399197592779, + "loss": 2.7436, + "theoretical_loss": 3.356321256303408, + "tokens_seen": 2616909824 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010465396188565697, + "loss": 2.56, + "theoretical_loss": 3.3563146129693022, + "tokens_seen": 2616975360 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010464393179538615, + "loss": 2.2305, + "theoretical_loss": 3.3563079698481424, + "tokens_seen": 2617040896 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010463390170511535, + "loss": 2.2504, + "theoretical_loss": 3.3563013269399167, + "tokens_seen": 2617106432 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010462387161484453, + "loss": 2.7156, + "theoretical_loss": 3.356294684244612, + "tokens_seen": 2617171968 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010461384152457372, + "loss": 2.6094, + "theoretical_loss": 3.3562880417622174, + "tokens_seen": 2617237504 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001046038114343029, + "loss": 2.2766, + "theoretical_loss": 3.35628139949272, + "tokens_seen": 2617303040 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001045937813440321, + "loss": 2.3903, + "theoretical_loss": 3.356274757436107, + "tokens_seen": 2617368576 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010458375125376128, + "loss": 2.3635, + "theoretical_loss": 3.3562681155923677, + "tokens_seen": 2617434112 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010457372116349047, + "loss": 2.4763, + "theoretical_loss": 3.3562614739614895, + "tokens_seen": 2617499648 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010456369107321965, + "loss": 2.4091, + "theoretical_loss": 3.35625483254346, + "tokens_seen": 2617565184 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010455366098294885, + "loss": 2.7105, + "theoretical_loss": 3.3562481913382665, + "tokens_seen": 2617630720 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010454363089267803, + "loss": 2.4998, + "theoretical_loss": 3.3562415503458976, + "tokens_seen": 2617696256 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010453360080240722, + "loss": 2.2749, + "theoretical_loss": 3.356234909566341, + "tokens_seen": 2617761792 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010452357071213642, + "loss": 2.5243, + "theoretical_loss": 3.356228268999585, + "tokens_seen": 2617827328 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001045135406218656, + "loss": 2.4966, + "theoretical_loss": 3.356221628645616, + "tokens_seen": 2617892864 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001045035105315948, + "loss": 2.5814, + "theoretical_loss": 3.3562149885044237, + "tokens_seen": 2617958400 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010449348044132398, + "loss": 2.6862, + "theoretical_loss": 3.356208348575995, + "tokens_seen": 2618023936 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010448345035105317, + "loss": 2.4265, + "theoretical_loss": 3.3562017088603175, + "tokens_seen": 2618089472 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2863400, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.020458459854126, + "objective/train/theoretical_loss": 3.35619506935738, + "objective/train/tokens_used": 2638615008, + "theoretical_loss": 3.35619506935738, + "tokens_seen": 2618155008 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010447342026078235, + "loss": 2.2628, + "theoretical_loss": 3.35619506935738, + "tokens_seen": 2618155008 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010446339017051155, + "loss": 2.4761, + "theoretical_loss": 3.356188430067169, + "tokens_seen": 2618220544 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010445336008024073, + "loss": 2.3963, + "theoretical_loss": 3.356181790989673, + "tokens_seen": 2618286080 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010444332998996991, + "loss": 2.5621, + "theoretical_loss": 3.356175152124881, + "tokens_seen": 2618351616 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001044332998996991, + "loss": 2.4966, + "theoretical_loss": 3.3561685134727792, + "tokens_seen": 2618417152 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010442326980942828, + "loss": 2.5212, + "theoretical_loss": 3.356161875033356, + "tokens_seen": 2618482688 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010441323971915748, + "loss": 2.6671, + "theoretical_loss": 3.3561552368066, + "tokens_seen": 2618548224 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010440320962888666, + "loss": 2.6735, + "theoretical_loss": 3.356148598792498, + "tokens_seen": 2618613760 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010439317953861586, + "loss": 2.4453, + "theoretical_loss": 3.356141960991038, + "tokens_seen": 2618679296 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010438314944834504, + "loss": 2.4796, + "theoretical_loss": 3.3561353234022087, + "tokens_seen": 2618744832 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010437311935807423, + "loss": 2.4454, + "theoretical_loss": 3.356128686025997, + "tokens_seen": 2618810368 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010436308926780341, + "loss": 2.4571, + "theoretical_loss": 3.3561220488623915, + "tokens_seen": 2618875904 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010435305917753259, + "loss": 2.3555, + "theoretical_loss": 3.3561154119113796, + "tokens_seen": 2618941440 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010434302908726179, + "loss": 2.5349, + "theoretical_loss": 3.3561087751729497, + "tokens_seen": 2619006976 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010433299899699097, + "loss": 2.6064, + "theoretical_loss": 3.356102138647089, + "tokens_seen": 2619072512 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010432296890672016, + "loss": 2.486, + "theoretical_loss": 3.356095502333786, + "tokens_seen": 2619138048 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010431293881644934, + "loss": 2.6127, + "theoretical_loss": 3.3560888662330277, + "tokens_seen": 2619203584 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010430290872617854, + "loss": 2.3975, + "theoretical_loss": 3.356082230344803, + "tokens_seen": 2619269120 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010429287863590772, + "loss": 2.5629, + "theoretical_loss": 3.3560755946690994, + "tokens_seen": 2619334656 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010428284854563692, + "loss": 2.5646, + "theoretical_loss": 3.3560689592059045, + "tokens_seen": 2619400192 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001042728184553661, + "loss": 2.2947, + "theoretical_loss": 3.3560623239552063, + "tokens_seen": 2619465728 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010426278836509529, + "loss": 2.258, + "theoretical_loss": 3.356055688916993, + "tokens_seen": 2619531264 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010425275827482447, + "loss": 2.462, + "theoretical_loss": 3.356049054091252, + "tokens_seen": 2619596800 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010424272818455365, + "loss": 2.535, + "theoretical_loss": 3.356042419477971, + "tokens_seen": 2619662336 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010423269809428285, + "loss": 2.6252, + "theoretical_loss": 3.3560357850771387, + "tokens_seen": 2619727872 + }, + { + "epoch": 8.07, + "objective/train/docs_used": 2864603, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2457985877990723, + "objective/train/theoretical_loss": 3.3560291508887428, + "objective/train/tokens_used": 2640253408, + "theoretical_loss": 3.3560291508887428, + "tokens_seen": 2619793408 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010422266800401203, + "loss": 2.48, + "theoretical_loss": 3.3560291508887428, + "tokens_seen": 2619793408 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010421263791374122, + "loss": 2.3693, + "theoretical_loss": 3.3560225169127706, + "tokens_seen": 2619858944 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001042026078234704, + "loss": 2.5796, + "theoretical_loss": 3.356015883149211, + "tokens_seen": 2619924480 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001041925777331996, + "loss": 2.5324, + "theoretical_loss": 3.3560092495980505, + "tokens_seen": 2619990016 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010418254764292878, + "loss": 2.4476, + "theoretical_loss": 3.356002616259278, + "tokens_seen": 2620055552 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010417251755265798, + "loss": 2.5268, + "theoretical_loss": 3.355995983132881, + "tokens_seen": 2620121088 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010416248746238717, + "loss": 2.5652, + "theoretical_loss": 3.355989350218848, + "tokens_seen": 2620186624 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010415245737211635, + "loss": 2.4462, + "theoretical_loss": 3.3559827175171657, + "tokens_seen": 2620252160 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010414242728184555, + "loss": 2.6768, + "theoretical_loss": 3.3559760850278226, + "tokens_seen": 2620317696 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010413239719157473, + "loss": 2.4228, + "theoretical_loss": 3.355969452750807, + "tokens_seen": 2620383232 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010412236710130392, + "loss": 2.3364, + "theoretical_loss": 3.3559628206861065, + "tokens_seen": 2620448768 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001041123370110331, + "loss": 2.4061, + "theoretical_loss": 3.3559561888337086, + "tokens_seen": 2620514304 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001041023069207623, + "loss": 2.3211, + "theoretical_loss": 3.355949557193602, + "tokens_seen": 2620579840 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010409227683049148, + "loss": 2.5925, + "theoretical_loss": 3.355942925765774, + "tokens_seen": 2620645376 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010408224674022067, + "loss": 2.6172, + "theoretical_loss": 3.3559362945502125, + "tokens_seen": 2620710912 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010407221664994985, + "loss": 2.4424, + "theoretical_loss": 3.355929663546906, + "tokens_seen": 2620776448 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010406218655967905, + "loss": 2.6023, + "theoretical_loss": 3.3559230327558414, + "tokens_seen": 2620841984 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010405215646940823, + "loss": 2.6165, + "theoretical_loss": 3.3559164021770074, + "tokens_seen": 2620907520 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010404212637913741, + "loss": 2.1263, + "theoretical_loss": 3.3559097718103916, + "tokens_seen": 2620973056 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001040320962888666, + "loss": 2.5027, + "theoretical_loss": 3.3559031416559817, + "tokens_seen": 2621038592 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010402206619859579, + "loss": 2.6314, + "theoretical_loss": 3.355896511713766, + "tokens_seen": 2621104128 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010401203610832498, + "loss": 2.4284, + "theoretical_loss": 3.355889881983732, + "tokens_seen": 2621169664 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010400200601805416, + "loss": 2.4262, + "theoretical_loss": 3.3558832524658686, + "tokens_seen": 2621235200 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010399197592778336, + "loss": 2.4386, + "theoretical_loss": 3.3558766231601624, + "tokens_seen": 2621300736 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010398194583751254, + "loss": 2.5237, + "theoretical_loss": 3.3558699940666017, + "tokens_seen": 2621366272 + }, + { + "debugging/Self-BLEU-5": 0.41912251462409106, + "debugging/distinct-1-grams": 0.7598625226928022, + "debugging/distinct-2-grams": 0.9527855590866102, + "debugging/entropy-1-grams": 5.750894952498057, + "debugging/entropy-2-grams": 6.629707901616409, + "debugging/length": 496.27272727272725, + "debugging/num_segments": 11, + "debugging/score": 0.007313295962424692, + "debugging/score_std": 0.006134720457915707, + "epoch": 8.07, + "objective/train/docs_used": 2865047, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3852787017822266, + "objective/train/theoretical_loss": 3.355863365185175, + "objective/train/tokens_used": 2641891808, + "theoretical_loss": 3.355863365185175, + "tokens_seen": 2621431808 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010397191574724173, + "loss": 2.6025, + "theoretical_loss": 3.355863365185175, + "tokens_seen": 2621431808 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010396188565697091, + "loss": 2.3579, + "theoretical_loss": 3.35585673651587, + "tokens_seen": 2621497344 + }, + { + "epoch": 8.07, + "learning_rate": 0.0001039518555667001, + "loss": 2.5956, + "theoretical_loss": 3.3558501080586733, + "tokens_seen": 2621562880 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010394182547642929, + "loss": 2.3452, + "theoretical_loss": 3.355843479813575, + "tokens_seen": 2621628416 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010393179538615847, + "loss": 2.4896, + "theoretical_loss": 3.3558368517805612, + "tokens_seen": 2621693952 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010392176529588767, + "loss": 2.583, + "theoretical_loss": 3.3558302239596207, + "tokens_seen": 2621759488 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010391173520561685, + "loss": 2.756, + "theoretical_loss": 3.3558235963507412, + "tokens_seen": 2621825024 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010390170511534604, + "loss": 2.5296, + "theoretical_loss": 3.355816968953911, + "tokens_seen": 2621890560 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010389167502507522, + "loss": 2.6496, + "theoretical_loss": 3.355810341769117, + "tokens_seen": 2621956096 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010388164493480442, + "loss": 2.5529, + "theoretical_loss": 3.355803714796348, + "tokens_seen": 2622021632 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001038716148445336, + "loss": 2.3649, + "theoretical_loss": 3.355797088035592, + "tokens_seen": 2622087168 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010386158475426278, + "loss": 2.5661, + "theoretical_loss": 3.355790461486837, + "tokens_seen": 2622152704 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010385155466399197, + "loss": 2.4115, + "theoretical_loss": 3.3557838351500697, + "tokens_seen": 2622218240 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010384152457372116, + "loss": 2.492, + "theoretical_loss": 3.3557772090252795, + "tokens_seen": 2622283776 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010383149448345035, + "loss": 2.6166, + "theoretical_loss": 3.355770583112453, + "tokens_seen": 2622349312 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010382146439317953, + "loss": 2.3584, + "theoretical_loss": 3.3557639574115794, + "tokens_seen": 2622414848 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010381143430290873, + "loss": 2.4967, + "theoretical_loss": 3.3557573319226455, + "tokens_seen": 2622480384 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010380140421263791, + "loss": 2.4828, + "theoretical_loss": 3.3557507066456402, + "tokens_seen": 2622545920 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001037913741223671, + "loss": 2.5273, + "theoretical_loss": 3.355744081580551, + "tokens_seen": 2622611456 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001037813440320963, + "loss": 2.3024, + "theoretical_loss": 3.3557374567273657, + "tokens_seen": 2622676992 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010377131394182549, + "loss": 2.4608, + "theoretical_loss": 3.355730832086072, + "tokens_seen": 2622742528 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010376128385155467, + "loss": 2.5571, + "theoretical_loss": 3.3557242076566585, + "tokens_seen": 2622808064 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010375125376128385, + "loss": 2.6516, + "theoretical_loss": 3.3557175834391124, + "tokens_seen": 2622873600 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010374122367101305, + "loss": 2.4139, + "theoretical_loss": 3.3557109594334222, + "tokens_seen": 2622939136 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010373119358074223, + "loss": 2.4891, + "theoretical_loss": 3.355704335639576, + "tokens_seen": 2623004672 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2866334, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9571285247802734, + "objective/train/theoretical_loss": 3.355697712057561, + "objective/train/tokens_used": 2643530208, + "theoretical_loss": 3.355697712057561, + "tokens_seen": 2623070208 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010372116349047142, + "loss": 2.6516, + "theoretical_loss": 3.355697712057561, + "tokens_seen": 2623070208 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001037111334002006, + "loss": 2.5278, + "theoretical_loss": 3.3556910886873657, + "tokens_seen": 2623135744 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001037011033099298, + "loss": 2.6428, + "theoretical_loss": 3.3556844655289777, + "tokens_seen": 2623201280 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010369107321965898, + "loss": 2.4554, + "theoretical_loss": 3.355677842582385, + "tokens_seen": 2623266816 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010368104312938818, + "loss": 2.624, + "theoretical_loss": 3.3556712198475758, + "tokens_seen": 2623332352 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010367101303911736, + "loss": 2.5503, + "theoretical_loss": 3.355664597324538, + "tokens_seen": 2623397888 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010366098294884654, + "loss": 2.5908, + "theoretical_loss": 3.355657975013259, + "tokens_seen": 2623463424 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010365095285857573, + "loss": 2.3957, + "theoretical_loss": 3.3556513529137275, + "tokens_seen": 2623528960 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010364092276830491, + "loss": 2.5678, + "theoretical_loss": 3.3556447310259307, + "tokens_seen": 2623594496 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010363089267803411, + "loss": 2.4612, + "theoretical_loss": 3.355638109349857, + "tokens_seen": 2623660032 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010362086258776329, + "loss": 2.6653, + "theoretical_loss": 3.3556314878854945, + "tokens_seen": 2623725568 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010361083249749248, + "loss": 2.319, + "theoretical_loss": 3.3556248666328305, + "tokens_seen": 2623791104 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010360080240722167, + "loss": 2.5872, + "theoretical_loss": 3.3556182455918537, + "tokens_seen": 2623856640 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010359077231695086, + "loss": 2.5885, + "theoretical_loss": 3.3556116247625516, + "tokens_seen": 2623922176 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010358074222668004, + "loss": 2.625, + "theoretical_loss": 3.355605004144912, + "tokens_seen": 2623987712 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010357071213640922, + "loss": 2.4847, + "theoretical_loss": 3.355598383738924, + "tokens_seen": 2624053248 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010356068204613842, + "loss": 2.4101, + "theoretical_loss": 3.3555917635445733, + "tokens_seen": 2624118784 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001035506519558676, + "loss": 2.4593, + "theoretical_loss": 3.35558514356185, + "tokens_seen": 2624184320 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010354062186559679, + "loss": 2.2338, + "theoretical_loss": 3.355578523790741, + "tokens_seen": 2624249856 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010353059177532597, + "loss": 2.6009, + "theoretical_loss": 3.3555719042312346, + "tokens_seen": 2624315392 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010352056168505517, + "loss": 2.6752, + "theoretical_loss": 3.355565284883318, + "tokens_seen": 2624380928 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010351053159478435, + "loss": 2.5545, + "theoretical_loss": 3.3555586657469805, + "tokens_seen": 2624446464 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010350050150451354, + "loss": 2.3657, + "theoretical_loss": 3.3555520468222095, + "tokens_seen": 2624512000 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010349047141424273, + "loss": 2.345, + "theoretical_loss": 3.355545428108992, + "tokens_seen": 2624577536 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010348044132397192, + "loss": 2.37, + "theoretical_loss": 3.3555388096073173, + "tokens_seen": 2624643072 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2867146, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7143969535827637, + "objective/train/theoretical_loss": 3.355532191317173, + "objective/train/tokens_used": 2645168608, + "theoretical_loss": 3.355532191317173, + "tokens_seen": 2624708608 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001034704112337011, + "loss": 2.7332, + "theoretical_loss": 3.355532191317173, + "tokens_seen": 2624708608 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010346038114343028, + "loss": 2.4954, + "theoretical_loss": 3.3555255732385465, + "tokens_seen": 2624774144 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010345035105315948, + "loss": 2.5819, + "theoretical_loss": 3.3555189553714264, + "tokens_seen": 2624839680 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010344032096288866, + "loss": 2.5987, + "theoretical_loss": 3.3555123377158003, + "tokens_seen": 2624905216 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010343029087261785, + "loss": 2.6112, + "theoretical_loss": 3.3555057202716556, + "tokens_seen": 2624970752 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010342026078234703, + "loss": 2.4176, + "theoretical_loss": 3.355499103038982, + "tokens_seen": 2625036288 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010341023069207624, + "loss": 2.6402, + "theoretical_loss": 3.3554924860177655, + "tokens_seen": 2625101824 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010340020060180542, + "loss": 2.4872, + "theoretical_loss": 3.355485869207995, + "tokens_seen": 2625167360 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010339017051153462, + "loss": 2.4833, + "theoretical_loss": 3.3554792526096584, + "tokens_seen": 2625232896 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001033801404212638, + "loss": 2.3111, + "theoretical_loss": 3.355472636222744, + "tokens_seen": 2625298432 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010337011033099298, + "loss": 2.2897, + "theoretical_loss": 3.3554660200472393, + "tokens_seen": 2625363968 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010336008024072217, + "loss": 2.6041, + "theoretical_loss": 3.355459404083133, + "tokens_seen": 2625429504 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010335005015045136, + "loss": 2.5849, + "theoretical_loss": 3.3554527883304113, + "tokens_seen": 2625495040 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010334002006018055, + "loss": 2.3154, + "theoretical_loss": 3.355446172789064, + "tokens_seen": 2625560576 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010332998996990973, + "loss": 2.6806, + "theoretical_loss": 3.3554395574590785, + "tokens_seen": 2625626112 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010331995987963893, + "loss": 2.6225, + "theoretical_loss": 3.3554329423404425, + "tokens_seen": 2625691648 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010330992978936811, + "loss": 2.5403, + "theoretical_loss": 3.355426327433144, + "tokens_seen": 2625757184 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001032998996990973, + "loss": 2.4863, + "theoretical_loss": 3.3554197127371714, + "tokens_seen": 2625822720 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010328986960882648, + "loss": 2.5318, + "theoretical_loss": 3.3554130982525123, + "tokens_seen": 2625888256 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010327983951855568, + "loss": 2.6304, + "theoretical_loss": 3.355406483979155, + "tokens_seen": 2625953792 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010326980942828486, + "loss": 2.4804, + "theoretical_loss": 3.3553998699170866, + "tokens_seen": 2626019328 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010325977933801404, + "loss": 2.4787, + "theoretical_loss": 3.355393256066296, + "tokens_seen": 2626084864 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010324974924774323, + "loss": 2.3844, + "theoretical_loss": 3.355386642426771, + "tokens_seen": 2626150400 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010323971915747242, + "loss": 2.4507, + "theoretical_loss": 3.3553800289985, + "tokens_seen": 2626215936 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010322968906720161, + "loss": 2.3519, + "theoretical_loss": 3.35537341578147, + "tokens_seen": 2626281472 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2868415, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3982455730438232, + "objective/train/theoretical_loss": 3.355366802775669, + "objective/train/tokens_used": 2646807008, + "theoretical_loss": 3.355366802775669, + "tokens_seen": 2626347008 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010321965897693079, + "loss": 2.4247, + "theoretical_loss": 3.355366802775669, + "tokens_seen": 2626347008 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010320962888665999, + "loss": 2.5116, + "theoretical_loss": 3.355360189981086, + "tokens_seen": 2626412544 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010319959879638917, + "loss": 2.5666, + "theoretical_loss": 3.3553535773977083, + "tokens_seen": 2626478080 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010318956870611836, + "loss": 2.5726, + "theoretical_loss": 3.3553469650255243, + "tokens_seen": 2626543616 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010318956870611836, + "loss": 2.5783, + "theoretical_loss": 3.355340352864521, + "tokens_seen": 2626609152 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010317953861584754, + "loss": 2.7134, + "theoretical_loss": 3.3553337409146877, + "tokens_seen": 2626674688 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010316950852557672, + "loss": 2.3759, + "theoretical_loss": 3.3553271291760116, + "tokens_seen": 2626740224 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010315947843530592, + "loss": 2.4866, + "theoretical_loss": 3.355320517648481, + "tokens_seen": 2626805760 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001031494483450351, + "loss": 2.5222, + "theoretical_loss": 3.355313906332083, + "tokens_seen": 2626871296 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001031394182547643, + "loss": 2.7102, + "theoretical_loss": 3.3553072952268073, + "tokens_seen": 2626936832 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010312938816449348, + "loss": 2.5271, + "theoretical_loss": 3.35530068433264, + "tokens_seen": 2627002368 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010311935807422267, + "loss": 2.432, + "theoretical_loss": 3.3552940736495707, + "tokens_seen": 2627067904 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010310932798395185, + "loss": 2.3849, + "theoretical_loss": 3.3552874631775866, + "tokens_seen": 2627133440 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010309929789368105, + "loss": 2.6969, + "theoretical_loss": 3.3552808529166755, + "tokens_seen": 2627198976 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010308926780341023, + "loss": 2.6085, + "theoretical_loss": 3.355274242866826, + "tokens_seen": 2627264512 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010307923771313941, + "loss": 2.6181, + "theoretical_loss": 3.3552676330280256, + "tokens_seen": 2627330048 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001030692076228686, + "loss": 2.4088, + "theoretical_loss": 3.3552610234002627, + "tokens_seen": 2627395584 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010305917753259778, + "loss": 2.4824, + "theoretical_loss": 3.3552544139835248, + "tokens_seen": 2627461120 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010304914744232698, + "loss": 2.5952, + "theoretical_loss": 3.3552478047778003, + "tokens_seen": 2627526656 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010303911735205617, + "loss": 2.5264, + "theoretical_loss": 3.3552411957830772, + "tokens_seen": 2627592192 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010302908726178537, + "loss": 2.7107, + "theoretical_loss": 3.3552345869993436, + "tokens_seen": 2627657728 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010301905717151455, + "loss": 2.6801, + "theoretical_loss": 3.355227978426587, + "tokens_seen": 2627723264 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010300902708124374, + "loss": 2.376, + "theoretical_loss": 3.3552213700647955, + "tokens_seen": 2627788800 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010299899699097293, + "loss": 2.4559, + "theoretical_loss": 3.3552147619139574, + "tokens_seen": 2627854336 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010298896690070212, + "loss": 2.4934, + "theoretical_loss": 3.355208153974061, + "tokens_seen": 2627919872 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2869117, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7138559818267822, + "objective/train/theoretical_loss": 3.3552015462450937, + "objective/train/tokens_used": 2648445408, + "theoretical_loss": 3.3552015462450937, + "tokens_seen": 2627985408 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001029789368104313, + "loss": 2.7149, + "theoretical_loss": 3.3552015462450937, + "tokens_seen": 2627985408 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010296890672016048, + "loss": 2.5831, + "theoretical_loss": 3.3551949387270437, + "tokens_seen": 2628050944 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010295887662988968, + "loss": 2.4475, + "theoretical_loss": 3.3551883314198987, + "tokens_seen": 2628116480 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010294884653961886, + "loss": 2.3222, + "theoretical_loss": 3.3551817243236473, + "tokens_seen": 2628182016 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010293881644934805, + "loss": 2.2739, + "theoretical_loss": 3.355175117438277, + "tokens_seen": 2628247552 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010292878635907723, + "loss": 2.3664, + "theoretical_loss": 3.3551685107637765, + "tokens_seen": 2628313088 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010291875626880643, + "loss": 2.3274, + "theoretical_loss": 3.3551619043001333, + "tokens_seen": 2628378624 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010290872617853561, + "loss": 2.4815, + "theoretical_loss": 3.355155298047335, + "tokens_seen": 2628444160 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001028986960882648, + "loss": 2.5664, + "theoretical_loss": 3.3551486920053706, + "tokens_seen": 2628509696 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010288866599799399, + "loss": 2.3648, + "theoretical_loss": 3.3551420861742276, + "tokens_seen": 2628575232 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010287863590772317, + "loss": 2.5718, + "theoretical_loss": 3.3551354805538938, + "tokens_seen": 2628640768 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010286860581745236, + "loss": 2.6466, + "theoretical_loss": 3.3551288751443575, + "tokens_seen": 2628706304 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010285857572718154, + "loss": 2.8665, + "theoretical_loss": 3.355122269945607, + "tokens_seen": 2628771840 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010284854563691074, + "loss": 2.3069, + "theoretical_loss": 3.3551156649576295, + "tokens_seen": 2628837376 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010283851554663992, + "loss": 2.4278, + "theoretical_loss": 3.3551090601804137, + "tokens_seen": 2628902912 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010282848545636911, + "loss": 2.2347, + "theoretical_loss": 3.3551024556139475, + "tokens_seen": 2628968448 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001028184553660983, + "loss": 2.3919, + "theoretical_loss": 3.3550958512582185, + "tokens_seen": 2629033984 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010280842527582749, + "loss": 2.4534, + "theoretical_loss": 3.3550892471132157, + "tokens_seen": 2629099520 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010279839518555667, + "loss": 2.5549, + "theoretical_loss": 3.3550826431789256, + "tokens_seen": 2629165056 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010278836509528585, + "loss": 2.4797, + "theoretical_loss": 3.355076039455338, + "tokens_seen": 2629230592 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010277833500501505, + "loss": 2.6413, + "theoretical_loss": 3.35506943594244, + "tokens_seen": 2629296128 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010276830491474423, + "loss": 2.2321, + "theoretical_loss": 3.3550628326402188, + "tokens_seen": 2629361664 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010275827482447342, + "loss": 2.4521, + "theoretical_loss": 3.355056229548664, + "tokens_seen": 2629427200 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001027482447342026, + "loss": 2.4451, + "theoretical_loss": 3.3550496266677627, + "tokens_seen": 2629492736 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001027382146439318, + "loss": 2.5392, + "theoretical_loss": 3.355043023997503, + "tokens_seen": 2629558272 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2869882, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3628032207489014, + "objective/train/theoretical_loss": 3.355036421537873, + "objective/train/tokens_used": 2650083808, + "theoretical_loss": 3.355036421537873, + "tokens_seen": 2629623808 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010272818455366098, + "loss": 2.4966, + "theoretical_loss": 3.355036421537873, + "tokens_seen": 2629623808 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010271815446339017, + "loss": 2.2767, + "theoretical_loss": 3.3550298192888617, + "tokens_seen": 2629689344 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010270812437311935, + "loss": 2.3036, + "theoretical_loss": 3.3550232172504555, + "tokens_seen": 2629754880 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010269809428284855, + "loss": 2.4559, + "theoretical_loss": 3.3550166154226435, + "tokens_seen": 2629820416 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010268806419257773, + "loss": 2.686, + "theoretical_loss": 3.355010013805413, + "tokens_seen": 2629885952 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010267803410230691, + "loss": 2.5077, + "theoretical_loss": 3.3550034123987524, + "tokens_seen": 2629951488 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001026680040120361, + "loss": 2.7575, + "theoretical_loss": 3.3549968112026503, + "tokens_seen": 2630017024 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001026579739217653, + "loss": 2.464, + "theoretical_loss": 3.3549902102170943, + "tokens_seen": 2630082560 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001026479438314945, + "loss": 2.3912, + "theoretical_loss": 3.3549836094420717, + "tokens_seen": 2630148096 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010263791374122368, + "loss": 2.5475, + "theoretical_loss": 3.3549770088775714, + "tokens_seen": 2630213632 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010262788365095287, + "loss": 2.3629, + "theoretical_loss": 3.354970408523582, + "tokens_seen": 2630279168 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010261785356068205, + "loss": 2.3227, + "theoretical_loss": 3.35496380838009, + "tokens_seen": 2630344704 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010260782347041125, + "loss": 2.4052, + "theoretical_loss": 3.3549572084470842, + "tokens_seen": 2630410240 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010259779338014043, + "loss": 2.6368, + "theoretical_loss": 3.354950608724553, + "tokens_seen": 2630475776 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010258776328986961, + "loss": 2.5665, + "theoretical_loss": 3.3549440092124843, + "tokens_seen": 2630541312 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001025777331995988, + "loss": 2.5321, + "theoretical_loss": 3.3549374099108658, + "tokens_seen": 2630606848 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010256770310932798, + "loss": 2.4942, + "theoretical_loss": 3.3549308108196856, + "tokens_seen": 2630672384 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010255767301905718, + "loss": 2.4742, + "theoretical_loss": 3.3549242119389318, + "tokens_seen": 2630737920 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010254764292878636, + "loss": 2.4345, + "theoretical_loss": 3.3549176132685923, + "tokens_seen": 2630803456 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010253761283851556, + "loss": 2.6951, + "theoretical_loss": 3.3549110148086556, + "tokens_seen": 2630868992 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010252758274824474, + "loss": 2.4125, + "theoretical_loss": 3.3549044165591093, + "tokens_seen": 2630934528 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010251755265797393, + "loss": 2.5526, + "theoretical_loss": 3.354897818519942, + "tokens_seen": 2631000064 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010250752256770311, + "loss": 2.3951, + "theoretical_loss": 3.354891220691141, + "tokens_seen": 2631065600 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010249749247743229, + "loss": 2.3778, + "theoretical_loss": 3.3548846230726954, + "tokens_seen": 2631131136 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010248746238716149, + "loss": 2.4198, + "theoretical_loss": 3.3548780256645925, + "tokens_seen": 2631196672 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2871212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5602478981018066, + "objective/train/theoretical_loss": 3.35487142846682, + "objective/train/tokens_used": 2651722208, + "theoretical_loss": 3.35487142846682, + "tokens_seen": 2631262208 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010247743229689067, + "loss": 2.5218, + "theoretical_loss": 3.35487142846682, + "tokens_seen": 2631262208 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010246740220661986, + "loss": 2.5156, + "theoretical_loss": 3.3548648314793668, + "tokens_seen": 2631327744 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010245737211634904, + "loss": 2.4171, + "theoretical_loss": 3.3548582347022204, + "tokens_seen": 2631393280 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010244734202607824, + "loss": 2.3855, + "theoretical_loss": 3.354851638135369, + "tokens_seen": 2631458816 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010243731193580742, + "loss": 2.4284, + "theoretical_loss": 3.354845041778801, + "tokens_seen": 2631524352 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010242728184553662, + "loss": 2.4002, + "theoretical_loss": 3.3548384456325033, + "tokens_seen": 2631589888 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001024172517552658, + "loss": 2.6544, + "theoretical_loss": 3.354831849696466, + "tokens_seen": 2631655424 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010240722166499499, + "loss": 2.5859, + "theoretical_loss": 3.354825253970675, + "tokens_seen": 2631720960 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010239719157472417, + "loss": 2.6931, + "theoretical_loss": 3.3548186584551196, + "tokens_seen": 2631786496 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010238716148445335, + "loss": 2.5598, + "theoretical_loss": 3.354812063149788, + "tokens_seen": 2631852032 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010237713139418255, + "loss": 2.6115, + "theoretical_loss": 3.3548054680546677, + "tokens_seen": 2631917568 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010236710130391173, + "loss": 2.6074, + "theoretical_loss": 3.354798873169747, + "tokens_seen": 2631983104 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010235707121364092, + "loss": 2.4591, + "theoretical_loss": 3.3547922784950135, + "tokens_seen": 2632048640 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001023470411233701, + "loss": 2.5139, + "theoretical_loss": 3.354785684030456, + "tokens_seen": 2632114176 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001023370110330993, + "loss": 2.5049, + "theoretical_loss": 3.3547790897760623, + "tokens_seen": 2632179712 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010232698094282848, + "loss": 2.6039, + "theoretical_loss": 3.3547724957318206, + "tokens_seen": 2632245248 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010231695085255768, + "loss": 2.4248, + "theoretical_loss": 3.3547659018977183, + "tokens_seen": 2632310784 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010230692076228686, + "loss": 2.3785, + "theoretical_loss": 3.354759308273744, + "tokens_seen": 2632376320 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010229689067201604, + "loss": 2.5984, + "theoretical_loss": 3.354752714859886, + "tokens_seen": 2632441856 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010228686058174525, + "loss": 2.4046, + "theoretical_loss": 3.354746121656132, + "tokens_seen": 2632507392 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010227683049147443, + "loss": 2.3079, + "theoretical_loss": 3.35473952866247, + "tokens_seen": 2632572928 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010226680040120362, + "loss": 2.2676, + "theoretical_loss": 3.3547329358788884, + "tokens_seen": 2632638464 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001022567703109328, + "loss": 2.4853, + "theoretical_loss": 3.3547263433053756, + "tokens_seen": 2632704000 + }, + { + "epoch": 8.08, + "learning_rate": 0.000102246740220662, + "loss": 2.3748, + "theoretical_loss": 3.3547197509419187, + "tokens_seen": 2632769536 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010223671013039118, + "loss": 2.349, + "theoretical_loss": 3.354713158788506, + "tokens_seen": 2632835072 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2871940, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4001126289367676, + "objective/train/theoretical_loss": 3.3547065668451266, + "objective/train/tokens_used": 2653360608, + "theoretical_loss": 3.3547065668451266, + "tokens_seen": 2632900608 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010222668004012037, + "loss": 2.4963, + "theoretical_loss": 3.3547065668451266, + "tokens_seen": 2632900608 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010221664994984955, + "loss": 2.3504, + "theoretical_loss": 3.3546999751117674, + "tokens_seen": 2632966144 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010220661985957875, + "loss": 2.3958, + "theoretical_loss": 3.3546933835884167, + "tokens_seen": 2633031680 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010219658976930793, + "loss": 2.4155, + "theoretical_loss": 3.354686792275063, + "tokens_seen": 2633097216 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010218655967903711, + "loss": 2.5539, + "theoretical_loss": 3.3546802011716945, + "tokens_seen": 2633162752 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001021765295887663, + "loss": 2.5739, + "theoretical_loss": 3.3546736102782986, + "tokens_seen": 2633228288 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010216649949849549, + "loss": 2.5768, + "theoretical_loss": 3.354667019594864, + "tokens_seen": 2633293824 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010215646940822468, + "loss": 2.4629, + "theoretical_loss": 3.3546604291213784, + "tokens_seen": 2633359360 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010214643931795386, + "loss": 2.3831, + "theoretical_loss": 3.35465383885783, + "tokens_seen": 2633424896 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010213640922768306, + "loss": 2.4951, + "theoretical_loss": 3.354647248804207, + "tokens_seen": 2633490432 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010212637913741224, + "loss": 2.2379, + "theoretical_loss": 3.3546406589604976, + "tokens_seen": 2633555968 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010211634904714143, + "loss": 2.6389, + "theoretical_loss": 3.354634069326689, + "tokens_seen": 2633621504 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010210631895687061, + "loss": 2.4453, + "theoretical_loss": 3.3546274799027707, + "tokens_seen": 2633687040 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001020962888665998, + "loss": 2.5346, + "theoretical_loss": 3.35462089068873, + "tokens_seen": 2633752576 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010208625877632899, + "loss": 2.3857, + "theoretical_loss": 3.3546143016845553, + "tokens_seen": 2633818112 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010207622868605817, + "loss": 2.366, + "theoretical_loss": 3.354607712890234, + "tokens_seen": 2633883648 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010206619859578737, + "loss": 2.5147, + "theoretical_loss": 3.3546011243057547, + "tokens_seen": 2633949184 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010205616850551655, + "loss": 2.4541, + "theoretical_loss": 3.3545945359311053, + "tokens_seen": 2634014720 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010204613841524574, + "loss": 2.3639, + "theoretical_loss": 3.354587947766275, + "tokens_seen": 2634080256 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010203610832497492, + "loss": 2.3047, + "theoretical_loss": 3.3545813598112497, + "tokens_seen": 2634145792 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010202607823470412, + "loss": 2.4845, + "theoretical_loss": 3.3545747720660195, + "tokens_seen": 2634211328 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001020160481444333, + "loss": 2.5298, + "theoretical_loss": 3.3545681845305717, + "tokens_seen": 2634276864 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010200601805416248, + "loss": 2.504, + "theoretical_loss": 3.3545615972048943, + "tokens_seen": 2634342400 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010199598796389167, + "loss": 2.5522, + "theoretical_loss": 3.354555010088976, + "tokens_seen": 2634407936 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010198595787362086, + "loss": 2.6786, + "theoretical_loss": 3.3545484231828038, + "tokens_seen": 2634473472 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2873059, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5236940383911133, + "objective/train/theoretical_loss": 3.3545418364863666, + "objective/train/tokens_used": 2654999008, + "theoretical_loss": 3.3545418364863666, + "tokens_seen": 2634539008 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010197592778335005, + "loss": 2.4665, + "theoretical_loss": 3.3545418364863666, + "tokens_seen": 2634539008 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010196589769307923, + "loss": 2.4358, + "theoretical_loss": 3.354535249999653, + "tokens_seen": 2634604544 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010195586760280843, + "loss": 2.5588, + "theoretical_loss": 3.3545286637226495, + "tokens_seen": 2634670080 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010194583751253761, + "loss": 2.4568, + "theoretical_loss": 3.354522077655346, + "tokens_seen": 2634735616 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001019358074222668, + "loss": 2.5431, + "theoretical_loss": 3.3545154917977293, + "tokens_seen": 2634801152 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010192577733199598, + "loss": 2.2605, + "theoretical_loss": 3.354508906149788, + "tokens_seen": 2634866688 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010191574724172518, + "loss": 2.6001, + "theoretical_loss": 3.3545023207115108, + "tokens_seen": 2634932224 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010190571715145437, + "loss": 2.456, + "theoretical_loss": 3.3544957354828844, + "tokens_seen": 2634997760 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010189568706118355, + "loss": 2.3819, + "theoretical_loss": 3.3544891504638983, + "tokens_seen": 2635063296 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010188565697091275, + "loss": 2.4672, + "theoretical_loss": 3.35448256565454, + "tokens_seen": 2635128832 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010187562688064193, + "loss": 2.3652, + "theoretical_loss": 3.3544759810547973, + "tokens_seen": 2635194368 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010186559679037112, + "loss": 2.4067, + "theoretical_loss": 3.3544693966646593, + "tokens_seen": 2635259904 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001018555667001003, + "loss": 2.556, + "theoretical_loss": 3.3544628124841127, + "tokens_seen": 2635325440 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001018455366098295, + "loss": 2.5081, + "theoretical_loss": 3.354456228513147, + "tokens_seen": 2635390976 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010183550651955868, + "loss": 2.3829, + "theoretical_loss": 3.3544496447517496, + "tokens_seen": 2635456512 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010182547642928788, + "loss": 2.5699, + "theoretical_loss": 3.3544430611999085, + "tokens_seen": 2635522048 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010181544633901706, + "loss": 2.2603, + "theoretical_loss": 3.3544364778576123, + "tokens_seen": 2635587584 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010180541624874624, + "loss": 2.5142, + "theoretical_loss": 3.3544298947248485, + "tokens_seen": 2635653120 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010179538615847543, + "loss": 2.5861, + "theoretical_loss": 3.354423311801606, + "tokens_seen": 2635718656 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010178535606820461, + "loss": 2.5319, + "theoretical_loss": 3.3544167290878724, + "tokens_seen": 2635784192 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010177532597793381, + "loss": 2.4392, + "theoretical_loss": 3.3544101465836365, + "tokens_seen": 2635849728 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010176529588766299, + "loss": 2.4855, + "theoretical_loss": 3.354403564288885, + "tokens_seen": 2635915264 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010175526579739218, + "loss": 2.4102, + "theoretical_loss": 3.3543969822036073, + "tokens_seen": 2635980800 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010174523570712136, + "loss": 2.3037, + "theoretical_loss": 3.354390400327791, + "tokens_seen": 2636046336 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010173520561685056, + "loss": 2.5486, + "theoretical_loss": 3.3543838186614243, + "tokens_seen": 2636111872 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2873488, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.826429843902588, + "objective/train/theoretical_loss": 3.3543772372044955, + "objective/train/tokens_used": 2656637408, + "theoretical_loss": 3.3543772372044955, + "tokens_seen": 2636177408 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010172517552657974, + "loss": 2.5237, + "theoretical_loss": 3.3543772372044955, + "tokens_seen": 2636177408 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010171514543630892, + "loss": 2.3804, + "theoretical_loss": 3.3543706559569926, + "tokens_seen": 2636242944 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010170511534603812, + "loss": 2.6487, + "theoretical_loss": 3.354364074918904, + "tokens_seen": 2636308480 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001016950852557673, + "loss": 2.6617, + "theoretical_loss": 3.354357494090217, + "tokens_seen": 2636374016 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010168505516549649, + "loss": 2.4023, + "theoretical_loss": 3.3543509134709204, + "tokens_seen": 2636439552 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010167502507522567, + "loss": 2.2899, + "theoretical_loss": 3.3543443330610025, + "tokens_seen": 2636505088 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010166499498495487, + "loss": 2.3783, + "theoretical_loss": 3.354337752860451, + "tokens_seen": 2636570624 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010165496489468405, + "loss": 2.482, + "theoretical_loss": 3.354331172869254, + "tokens_seen": 2636636160 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010164493480441324, + "loss": 2.3283, + "theoretical_loss": 3.3543245930874006, + "tokens_seen": 2636701696 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010163490471414243, + "loss": 2.4356, + "theoretical_loss": 3.3543180135148774, + "tokens_seen": 2636767232 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010162487462387162, + "loss": 2.5446, + "theoretical_loss": 3.3543114341516738, + "tokens_seen": 2636832768 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001016148445336008, + "loss": 2.6327, + "theoretical_loss": 3.3543048549977774, + "tokens_seen": 2636898304 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010160481444332998, + "loss": 2.5034, + "theoretical_loss": 3.354298276053176, + "tokens_seen": 2636963840 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010159478435305918, + "loss": 2.4815, + "theoretical_loss": 3.3542916973178585, + "tokens_seen": 2637029376 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010158475426278836, + "loss": 2.3009, + "theoretical_loss": 3.3542851187918123, + "tokens_seen": 2637094912 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010157472417251755, + "loss": 2.4337, + "theoretical_loss": 3.3542785404750264, + "tokens_seen": 2637160448 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010156469408224673, + "loss": 2.4791, + "theoretical_loss": 3.354271962367488, + "tokens_seen": 2637225984 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010155466399197593, + "loss": 2.4229, + "theoretical_loss": 3.354265384469186, + "tokens_seen": 2637291520 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010154463390170511, + "loss": 2.5361, + "theoretical_loss": 3.3542588067801082, + "tokens_seen": 2637357056 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010153460381143432, + "loss": 2.2096, + "theoretical_loss": 3.3542522293002426, + "tokens_seen": 2637422592 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001015245737211635, + "loss": 2.557, + "theoretical_loss": 3.3542456520295776, + "tokens_seen": 2637488128 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010151454363089268, + "loss": 2.2795, + "theoretical_loss": 3.3542390749681013, + "tokens_seen": 2637553664 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010150451354062187, + "loss": 2.5905, + "theoretical_loss": 3.3542324981158016, + "tokens_seen": 2637619200 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010149448345035106, + "loss": 2.5823, + "theoretical_loss": 3.3542259214726675, + "tokens_seen": 2637684736 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010148445336008025, + "loss": 2.4798, + "theoretical_loss": 3.354219345038686, + "tokens_seen": 2637750272 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2874723, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6192829608917236, + "objective/train/theoretical_loss": 3.354212768813846, + "objective/train/tokens_used": 2658275808, + "theoretical_loss": 3.354212768813846, + "tokens_seen": 2637815808 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010147442326980943, + "loss": 2.6094, + "theoretical_loss": 3.354212768813846, + "tokens_seen": 2637815808 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010146439317953863, + "loss": 2.4324, + "theoretical_loss": 3.3542061927981357, + "tokens_seen": 2637881344 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010145436308926781, + "loss": 2.5902, + "theoretical_loss": 3.3541996169915427, + "tokens_seen": 2637946880 + }, + { + "epoch": 8.08, + "learning_rate": 0.000101444332998997, + "loss": 2.4674, + "theoretical_loss": 3.3541930413940557, + "tokens_seen": 2638012416 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010143430290872618, + "loss": 2.511, + "theoretical_loss": 3.354186466005662, + "tokens_seen": 2638077952 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010142427281845536, + "loss": 2.5355, + "theoretical_loss": 3.3541798908263507, + "tokens_seen": 2638143488 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010141424272818456, + "loss": 2.5591, + "theoretical_loss": 3.35417331585611, + "tokens_seen": 2638209024 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010140421263791374, + "loss": 2.3525, + "theoretical_loss": 3.354166741094927, + "tokens_seen": 2638274560 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010139418254764293, + "loss": 2.2619, + "theoretical_loss": 3.3541601665427914, + "tokens_seen": 2638340096 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010138415245737212, + "loss": 2.3542, + "theoretical_loss": 3.35415359219969, + "tokens_seen": 2638405632 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010137412236710131, + "loss": 2.375, + "theoretical_loss": 3.3541470180656114, + "tokens_seen": 2638471168 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010136409227683049, + "loss": 2.4454, + "theoretical_loss": 3.354140444140544, + "tokens_seen": 2638536704 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010135406218655969, + "loss": 2.3205, + "theoretical_loss": 3.3541338704244756, + "tokens_seen": 2638602240 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010134403209628887, + "loss": 2.6918, + "theoretical_loss": 3.3541272969173948, + "tokens_seen": 2638667776 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010133400200601806, + "loss": 2.3829, + "theoretical_loss": 3.354120723619289, + "tokens_seen": 2638733312 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010132397191574724, + "loss": 2.3067, + "theoretical_loss": 3.3541141505301475, + "tokens_seen": 2638798848 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010131394182547642, + "loss": 2.3537, + "theoretical_loss": 3.3541075776499576, + "tokens_seen": 2638864384 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010130391173520562, + "loss": 2.7285, + "theoretical_loss": 3.3541010049787077, + "tokens_seen": 2638929920 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001012938816449348, + "loss": 2.2567, + "theoretical_loss": 3.354094432516386, + "tokens_seen": 2638995456 + }, + { + "epoch": 8.08, + "learning_rate": 0.000101283851554664, + "loss": 2.5775, + "theoretical_loss": 3.3540878602629807, + "tokens_seen": 2639060992 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010127382146439318, + "loss": 2.4932, + "theoretical_loss": 3.35408128821848, + "tokens_seen": 2639126528 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010126379137412237, + "loss": 2.5853, + "theoretical_loss": 3.354074716382872, + "tokens_seen": 2639192064 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010125376128385155, + "loss": 2.5081, + "theoretical_loss": 3.3540681447561447, + "tokens_seen": 2639257600 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010124373119358075, + "loss": 2.416, + "theoretical_loss": 3.354061573338287, + "tokens_seen": 2639323136 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010123370110330993, + "loss": 2.432, + "theoretical_loss": 3.354055002129286, + "tokens_seen": 2639388672 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2875159, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7799160480499268, + "objective/train/theoretical_loss": 3.3540484311291303, + "objective/train/tokens_used": 2659914208, + "theoretical_loss": 3.3540484311291303, + "tokens_seen": 2639454208 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010122367101303911, + "loss": 2.4659, + "theoretical_loss": 3.3540484311291303, + "tokens_seen": 2639454208 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001012136409227683, + "loss": 2.6492, + "theoretical_loss": 3.3540418603378086, + "tokens_seen": 2639519744 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010120361083249748, + "loss": 2.3731, + "theoretical_loss": 3.3540352897553083, + "tokens_seen": 2639585280 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010119358074222668, + "loss": 2.5825, + "theoretical_loss": 3.354028719381618, + "tokens_seen": 2639650816 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010118355065195586, + "loss": 2.5545, + "theoretical_loss": 3.354022149216726, + "tokens_seen": 2639716352 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010117352056168505, + "loss": 2.472, + "theoretical_loss": 3.3540155792606203, + "tokens_seen": 2639781888 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010116349047141425, + "loss": 2.1071, + "theoretical_loss": 3.3540090095132893, + "tokens_seen": 2639847424 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010115346038114344, + "loss": 2.5821, + "theoretical_loss": 3.35400243997472, + "tokens_seen": 2639912960 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010114343029087263, + "loss": 2.5537, + "theoretical_loss": 3.3539958706449027, + "tokens_seen": 2639978496 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010113340020060182, + "loss": 2.5122, + "theoretical_loss": 3.353989301523824, + "tokens_seen": 2640044032 + }, + { + "epoch": 8.08, + "learning_rate": 0.000101123370110331, + "loss": 2.5735, + "theoretical_loss": 3.3539827326114726, + "tokens_seen": 2640109568 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010111334002006018, + "loss": 2.6045, + "theoretical_loss": 3.3539761639078365, + "tokens_seen": 2640175104 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010110330992978938, + "loss": 2.5769, + "theoretical_loss": 3.353969595412904, + "tokens_seen": 2640240640 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010109327983951856, + "loss": 2.3155, + "theoretical_loss": 3.3539630271266634, + "tokens_seen": 2640306176 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010108324974924775, + "loss": 2.3103, + "theoretical_loss": 3.353956459049103, + "tokens_seen": 2640371712 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010107321965897693, + "loss": 2.4025, + "theoretical_loss": 3.35394989118021, + "tokens_seen": 2640437248 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010106318956870613, + "loss": 2.6364, + "theoretical_loss": 3.3539433235199736, + "tokens_seen": 2640502784 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010105315947843531, + "loss": 2.4411, + "theoretical_loss": 3.353936756068382, + "tokens_seen": 2640568320 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001010431293881645, + "loss": 2.3674, + "theoretical_loss": 3.3539301888254234, + "tokens_seen": 2640633856 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010103309929789369, + "loss": 2.4713, + "theoretical_loss": 3.3539236217910853, + "tokens_seen": 2640699392 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010102306920762287, + "loss": 2.3755, + "theoretical_loss": 3.3539170549653567, + "tokens_seen": 2640764928 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010101303911735206, + "loss": 2.4251, + "theoretical_loss": 3.3539104883482254, + "tokens_seen": 2640830464 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010100300902708124, + "loss": 2.4485, + "theoretical_loss": 3.353903921939679, + "tokens_seen": 2640896000 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010099297893681044, + "loss": 2.346, + "theoretical_loss": 3.353897355739707, + "tokens_seen": 2640961536 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010098294884653962, + "loss": 2.3963, + "theoretical_loss": 3.353890789748297, + "tokens_seen": 2641027072 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2876580, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6085259914398193, + "objective/train/theoretical_loss": 3.3538842239654367, + "objective/train/tokens_used": 2661552608, + "theoretical_loss": 3.3538842239654367, + "tokens_seen": 2641092608 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010097291875626881, + "loss": 2.533, + "theoretical_loss": 3.3538842239654367, + "tokens_seen": 2641092608 + }, + { + "epoch": 8.08, + "learning_rate": 0.000100962888665998, + "loss": 2.617, + "theoretical_loss": 3.3538776583911147, + "tokens_seen": 2641158144 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010095285857572719, + "loss": 2.5656, + "theoretical_loss": 3.3538710930253197, + "tokens_seen": 2641223680 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010094282848545637, + "loss": 2.5039, + "theoretical_loss": 3.353864527868039, + "tokens_seen": 2641289216 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010093279839518555, + "loss": 2.3565, + "theoretical_loss": 3.3538579629192613, + "tokens_seen": 2641354752 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010092276830491475, + "loss": 2.5183, + "theoretical_loss": 3.353851398178975, + "tokens_seen": 2641420288 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010091273821464393, + "loss": 2.2921, + "theoretical_loss": 3.353844833647168, + "tokens_seen": 2641485824 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010090270812437312, + "loss": 2.3834, + "theoretical_loss": 3.353838269323828, + "tokens_seen": 2641551360 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001008926780341023, + "loss": 2.5241, + "theoretical_loss": 3.353831705208944, + "tokens_seen": 2641616896 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001008826479438315, + "loss": 2.3855, + "theoretical_loss": 3.3538251413025044, + "tokens_seen": 2641682432 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010087261785356068, + "loss": 2.4532, + "theoretical_loss": 3.353818577604497, + "tokens_seen": 2641747968 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010086258776328987, + "loss": 2.2466, + "theoretical_loss": 3.3538120141149097, + "tokens_seen": 2641813504 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010085255767301905, + "loss": 2.4304, + "theoretical_loss": 3.3538054508337307, + "tokens_seen": 2641879040 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010084252758274825, + "loss": 2.5468, + "theoretical_loss": 3.353798887760949, + "tokens_seen": 2641944576 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010083249749247743, + "loss": 2.3551, + "theoretical_loss": 3.353792324896552, + "tokens_seen": 2642010112 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010082246740220661, + "loss": 2.2726, + "theoretical_loss": 3.3537857622405287, + "tokens_seen": 2642075648 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001008124373119358, + "loss": 2.4071, + "theoretical_loss": 3.3537791997928665, + "tokens_seen": 2642141184 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010080240722166499, + "loss": 2.2947, + "theoretical_loss": 3.353772637553554, + "tokens_seen": 2642206720 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010079237713139418, + "loss": 2.4523, + "theoretical_loss": 3.3537660755225795, + "tokens_seen": 2642272256 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010078234704112338, + "loss": 2.4459, + "theoretical_loss": 3.353759513699931, + "tokens_seen": 2642337792 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010077231695085257, + "loss": 2.4508, + "theoretical_loss": 3.353752952085597, + "tokens_seen": 2642403328 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010076228686058175, + "loss": 2.473, + "theoretical_loss": 3.3537463906795657, + "tokens_seen": 2642468864 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010075225677031095, + "loss": 2.4413, + "theoretical_loss": 3.353739829481825, + "tokens_seen": 2642534400 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010074222668004013, + "loss": 2.3868, + "theoretical_loss": 3.3537332684923635, + "tokens_seen": 2642599936 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010073219658976931, + "loss": 2.6874, + "theoretical_loss": 3.3537267077111688, + "tokens_seen": 2642665472 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2877359, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.563947916030884, + "objective/train/theoretical_loss": 3.3537201471382296, + "objective/train/tokens_used": 2663191008, + "theoretical_loss": 3.3537201471382296, + "tokens_seen": 2642731008 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001007221664994985, + "loss": 2.5104, + "theoretical_loss": 3.3537201471382296, + "tokens_seen": 2642731008 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010071213640922768, + "loss": 2.4543, + "theoretical_loss": 3.3537135867735346, + "tokens_seen": 2642796544 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010070210631895688, + "loss": 2.6311, + "theoretical_loss": 3.3537070266170708, + "tokens_seen": 2642862080 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010069207622868606, + "loss": 2.3732, + "theoretical_loss": 3.3537004666688275, + "tokens_seen": 2642927616 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010068204613841525, + "loss": 2.3227, + "theoretical_loss": 3.3536939069287928, + "tokens_seen": 2642993152 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010067201604814444, + "loss": 2.5329, + "theoretical_loss": 3.353687347396954, + "tokens_seen": 2643058688 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010066198595787363, + "loss": 2.5271, + "theoretical_loss": 3.3536807880733006, + "tokens_seen": 2643124224 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010065195586760281, + "loss": 2.5147, + "theoretical_loss": 3.35367422895782, + "tokens_seen": 2643189760 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010064192577733199, + "loss": 2.7197, + "theoretical_loss": 3.353667670050501, + "tokens_seen": 2643255296 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010063189568706119, + "loss": 2.3874, + "theoretical_loss": 3.353661111351331, + "tokens_seen": 2643320832 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010062186559679037, + "loss": 2.4664, + "theoretical_loss": 3.353654552860299, + "tokens_seen": 2643386368 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010061183550651956, + "loss": 2.463, + "theoretical_loss": 3.353647994577393, + "tokens_seen": 2643451904 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010060180541624874, + "loss": 2.6102, + "theoretical_loss": 3.3536414365026013, + "tokens_seen": 2643517440 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010059177532597794, + "loss": 2.4565, + "theoretical_loss": 3.3536348786359116, + "tokens_seen": 2643582976 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010058174523570712, + "loss": 2.2842, + "theoretical_loss": 3.353628320977313, + "tokens_seen": 2643648512 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010057171514543632, + "loss": 2.5043, + "theoretical_loss": 3.3536217635267933, + "tokens_seen": 2643714048 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001005616850551655, + "loss": 2.4271, + "theoretical_loss": 3.3536152062843407, + "tokens_seen": 2643779584 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010055165496489469, + "loss": 2.3525, + "theoretical_loss": 3.3536086492499435, + "tokens_seen": 2643845120 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010054162487462387, + "loss": 2.4769, + "theoretical_loss": 3.35360209242359, + "tokens_seen": 2643910656 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010053159478435305, + "loss": 2.6649, + "theoretical_loss": 3.353595535805268, + "tokens_seen": 2643976192 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010052156469408225, + "loss": 2.3906, + "theoretical_loss": 3.3535889793949667, + "tokens_seen": 2644041728 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010051153460381143, + "loss": 2.4234, + "theoretical_loss": 3.3535824231926736, + "tokens_seen": 2644107264 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010050150451354062, + "loss": 2.5835, + "theoretical_loss": 3.3535758671983773, + "tokens_seen": 2644172800 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001004914744232698, + "loss": 2.4589, + "theoretical_loss": 3.3535693114120653, + "tokens_seen": 2644238336 + }, + { + "epoch": 8.08, + "learning_rate": 0.000100481444332999, + "loss": 2.2861, + "theoretical_loss": 3.353562755833727, + "tokens_seen": 2644303872 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2878436, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.459373950958252, + "objective/train/theoretical_loss": 3.3535562004633497, + "objective/train/tokens_used": 2664829408, + "theoretical_loss": 3.3535562004633497, + "tokens_seen": 2644369408 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010047141424272818, + "loss": 2.4503, + "theoretical_loss": 3.3535562004633497, + "tokens_seen": 2644369408 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010046138415245738, + "loss": 2.5221, + "theoretical_loss": 3.353549645300922, + "tokens_seen": 2644434944 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010045135406218656, + "loss": 2.5956, + "theoretical_loss": 3.3535430903464327, + "tokens_seen": 2644500480 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010044132397191574, + "loss": 2.4219, + "theoretical_loss": 3.353536535599869, + "tokens_seen": 2644566016 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010043129388164493, + "loss": 2.6358, + "theoretical_loss": 3.35352998106122, + "tokens_seen": 2644631552 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010042126379137411, + "loss": 2.507, + "theoretical_loss": 3.353523426730473, + "tokens_seen": 2644697088 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010041123370110332, + "loss": 2.3315, + "theoretical_loss": 3.3535168726076177, + "tokens_seen": 2644762624 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001004012036108325, + "loss": 2.567, + "theoretical_loss": 3.353510318692641, + "tokens_seen": 2644828160 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001003911735205617, + "loss": 2.1989, + "theoretical_loss": 3.3535037649855317, + "tokens_seen": 2644893696 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010038114343029088, + "loss": 2.4373, + "theoretical_loss": 3.353497211486278, + "tokens_seen": 2644959232 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010037111334002007, + "loss": 2.5338, + "theoretical_loss": 3.3534906581948687, + "tokens_seen": 2645024768 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010036108324974925, + "loss": 2.4657, + "theoretical_loss": 3.353484105111291, + "tokens_seen": 2645090304 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010035105315947845, + "loss": 2.3083, + "theoretical_loss": 3.353477552235534, + "tokens_seen": 2645155840 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010034102306920763, + "loss": 2.5101, + "theoretical_loss": 3.3534709995675858, + "tokens_seen": 2645221376 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010033099297893681, + "loss": 2.3582, + "theoretical_loss": 3.3534644471074344, + "tokens_seen": 2645286912 + }, + { + "epoch": 8.08, + "learning_rate": 0.000100320962888666, + "loss": 2.5491, + "theoretical_loss": 3.353457894855068, + "tokens_seen": 2645352448 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010031093279839519, + "loss": 2.4317, + "theoretical_loss": 3.3534513428104753, + "tokens_seen": 2645417984 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010030090270812438, + "loss": 2.3588, + "theoretical_loss": 3.3534447909736445, + "tokens_seen": 2645483520 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010029087261785356, + "loss": 2.309, + "theoretical_loss": 3.3534382393445634, + "tokens_seen": 2645549056 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010028084252758276, + "loss": 2.6158, + "theoretical_loss": 3.3534316879232207, + "tokens_seen": 2645614592 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010027081243731194, + "loss": 2.3207, + "theoretical_loss": 3.353425136709604, + "tokens_seen": 2645680128 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010026078234704113, + "loss": 2.5315, + "theoretical_loss": 3.353418585703703, + "tokens_seen": 2645745664 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010025075225677031, + "loss": 2.3806, + "theoretical_loss": 3.3534120349055048, + "tokens_seen": 2645811200 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001002407221664995, + "loss": 2.4605, + "theoretical_loss": 3.3534054843149974, + "tokens_seen": 2645876736 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010023069207622869, + "loss": 2.5865, + "theoretical_loss": 3.35339893393217, + "tokens_seen": 2645942272 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2879137, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.373716115951538, + "objective/train/theoretical_loss": 3.3533923837570105, + "objective/train/tokens_used": 2666467808, + "theoretical_loss": 3.3533923837570105, + "tokens_seen": 2646007808 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010022066198595787, + "loss": 2.567, + "theoretical_loss": 3.3533923837570105, + "tokens_seen": 2646007808 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010021063189568707, + "loss": 2.201, + "theoretical_loss": 3.353385833789507, + "tokens_seen": 2646073344 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010020060180541625, + "loss": 2.2608, + "theoretical_loss": 3.353379284029648, + "tokens_seen": 2646138880 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010019057171514544, + "loss": 2.2279, + "theoretical_loss": 3.3533727344774222, + "tokens_seen": 2646204416 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010018054162487462, + "loss": 2.2154, + "theoretical_loss": 3.353366185132817, + "tokens_seen": 2646269952 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010017051153460382, + "loss": 2.5103, + "theoretical_loss": 3.3533596359958207, + "tokens_seen": 2646335488 + }, + { + "epoch": 8.08, + "learning_rate": 0.000100160481444333, + "loss": 2.4073, + "theoretical_loss": 3.3533530870664223, + "tokens_seen": 2646401024 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010015045135406218, + "loss": 2.5223, + "theoretical_loss": 3.3533465383446095, + "tokens_seen": 2646466560 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010014042126379137, + "loss": 2.4429, + "theoretical_loss": 3.353339989830371, + "tokens_seen": 2646532096 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010013039117352056, + "loss": 2.3965, + "theoretical_loss": 3.353333441523695, + "tokens_seen": 2646597632 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010012036108324975, + "loss": 2.2035, + "theoretical_loss": 3.353326893424569, + "tokens_seen": 2646663168 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010011033099297893, + "loss": 2.2713, + "theoretical_loss": 3.3533203455329827, + "tokens_seen": 2646728704 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010010030090270813, + "loss": 2.5106, + "theoretical_loss": 3.3533137978489234, + "tokens_seen": 2646794240 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010009027081243731, + "loss": 2.5419, + "theoretical_loss": 3.3533072503723793, + "tokens_seen": 2646859776 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001000802407221665, + "loss": 2.4808, + "theoretical_loss": 3.3533007031033395, + "tokens_seen": 2646925312 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010007021063189568, + "loss": 2.4754, + "theoretical_loss": 3.3532941560417915, + "tokens_seen": 2646990848 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010006018054162486, + "loss": 2.3346, + "theoretical_loss": 3.3532876091877237, + "tokens_seen": 2647056384 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010005015045135406, + "loss": 2.2796, + "theoretical_loss": 3.3532810625411247, + "tokens_seen": 2647121920 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010004012036108324, + "loss": 2.2115, + "theoretical_loss": 3.3532745161019823, + "tokens_seen": 2647187456 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010003009027081245, + "loss": 2.3147, + "theoretical_loss": 3.353267969870286, + "tokens_seen": 2647252992 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010002006018054163, + "loss": 2.2091, + "theoretical_loss": 3.3532614238460225, + "tokens_seen": 2647318528 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010001003009027082, + "loss": 2.3779, + "theoretical_loss": 3.353254878029181, + "tokens_seen": 2647384064 + }, + { + "epoch": 8.08, + "learning_rate": 0.0001, + "loss": 2.4709, + "theoretical_loss": 3.3532483324197497, + "tokens_seen": 2647449600 + }, + { + "epoch": 8.08, + "learning_rate": 9.99899699097292e-05, + "loss": 2.3543, + "theoretical_loss": 3.353241787017717, + "tokens_seen": 2647515136 + }, + { + "epoch": 8.08, + "learning_rate": 9.997993981945838e-05, + "loss": 2.2247, + "theoretical_loss": 3.3532352418230706, + "tokens_seen": 2647580672 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2879670, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.432544231414795, + "objective/train/theoretical_loss": 3.3532286968357994, + "objective/train/tokens_used": 2668106208, + "theoretical_loss": 3.3532286968357994, + "tokens_seen": 2647646208 + }, + { + "epoch": 8.08, + "learning_rate": 9.996990972918758e-05, + "loss": 2.2071, + "theoretical_loss": 3.3532286968357994, + "tokens_seen": 2647646208 + }, + { + "epoch": 8.08, + "learning_rate": 9.995987963891676e-05, + "loss": 2.4059, + "theoretical_loss": 3.353222152055891, + "tokens_seen": 2647711744 + }, + { + "epoch": 8.08, + "learning_rate": 9.994984954864594e-05, + "loss": 2.285, + "theoretical_loss": 3.353215607483335, + "tokens_seen": 2647777280 + }, + { + "epoch": 8.08, + "learning_rate": 9.993981945837513e-05, + "loss": 2.3773, + "theoretical_loss": 3.3532090631181184, + "tokens_seen": 2647842816 + }, + { + "epoch": 8.08, + "learning_rate": 9.992978936810431e-05, + "loss": 2.2377, + "theoretical_loss": 3.35320251896023, + "tokens_seen": 2647908352 + }, + { + "epoch": 8.08, + "learning_rate": 9.991975927783351e-05, + "loss": 2.4605, + "theoretical_loss": 3.3531959750096583, + "tokens_seen": 2647973888 + }, + { + "epoch": 8.08, + "learning_rate": 9.990972918756269e-05, + "loss": 2.4106, + "theoretical_loss": 3.353189431266391, + "tokens_seen": 2648039424 + }, + { + "epoch": 8.08, + "learning_rate": 9.989969909729188e-05, + "loss": 2.3293, + "theoretical_loss": 3.353182887730417, + "tokens_seen": 2648104960 + }, + { + "epoch": 8.08, + "learning_rate": 9.988966900702106e-05, + "loss": 2.3622, + "theoretical_loss": 3.3531763444017244, + "tokens_seen": 2648170496 + }, + { + "epoch": 8.08, + "learning_rate": 9.987963891675026e-05, + "loss": 2.3284, + "theoretical_loss": 3.353169801280302, + "tokens_seen": 2648236032 + }, + { + "epoch": 8.08, + "learning_rate": 9.986960882647944e-05, + "loss": 2.4701, + "theoretical_loss": 3.353163258366137, + "tokens_seen": 2648301568 + }, + { + "epoch": 8.08, + "learning_rate": 9.985957873620862e-05, + "loss": 2.28, + "theoretical_loss": 3.3531567156592184, + "tokens_seen": 2648367104 + }, + { + "epoch": 8.08, + "learning_rate": 9.984954864593782e-05, + "loss": 2.4343, + "theoretical_loss": 3.353150173159534, + "tokens_seen": 2648432640 + }, + { + "epoch": 8.08, + "learning_rate": 9.9839518555667e-05, + "loss": 2.2214, + "theoretical_loss": 3.3531436308670735, + "tokens_seen": 2648498176 + }, + { + "epoch": 8.08, + "learning_rate": 9.982948846539619e-05, + "loss": 2.4308, + "theoretical_loss": 3.3531370887818235, + "tokens_seen": 2648563712 + }, + { + "epoch": 8.08, + "learning_rate": 9.981945837512537e-05, + "loss": 2.4547, + "theoretical_loss": 3.3531305469037735, + "tokens_seen": 2648629248 + }, + { + "epoch": 8.08, + "learning_rate": 9.980942828485457e-05, + "loss": 2.646, + "theoretical_loss": 3.353124005232911, + "tokens_seen": 2648694784 + }, + { + "epoch": 8.08, + "learning_rate": 9.979939819458375e-05, + "loss": 2.2311, + "theoretical_loss": 3.353117463769225, + "tokens_seen": 2648760320 + }, + { + "epoch": 8.08, + "learning_rate": 9.978936810431294e-05, + "loss": 2.4932, + "theoretical_loss": 3.353110922512703, + "tokens_seen": 2648825856 + }, + { + "epoch": 8.08, + "learning_rate": 9.977933801404212e-05, + "loss": 2.3972, + "theoretical_loss": 3.3531043814633343, + "tokens_seen": 2648891392 + }, + { + "epoch": 8.08, + "learning_rate": 9.976930792377132e-05, + "loss": 2.3797, + "theoretical_loss": 3.353097840621106, + "tokens_seen": 2648956928 + }, + { + "epoch": 8.08, + "learning_rate": 9.97592778335005e-05, + "loss": 2.3813, + "theoretical_loss": 3.353091299986008, + "tokens_seen": 2649022464 + }, + { + "epoch": 8.08, + "learning_rate": 9.974924774322968e-05, + "loss": 2.6804, + "theoretical_loss": 3.353084759558027, + "tokens_seen": 2649088000 + }, + { + "epoch": 8.08, + "learning_rate": 9.973921765295888e-05, + "loss": 2.3513, + "theoretical_loss": 3.353078219337153, + "tokens_seen": 2649153536 + }, + { + "epoch": 8.08, + "learning_rate": 9.972918756268806e-05, + "loss": 2.3308, + "theoretical_loss": 3.3530716793233726, + "tokens_seen": 2649219072 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2883222, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.465657949447632, + "objective/train/theoretical_loss": 3.353065139516675, + "objective/train/tokens_used": 2669744608, + "theoretical_loss": 3.353065139516675, + "tokens_seen": 2649284608 + }, + { + "epoch": 8.08, + "learning_rate": 9.971915747241725e-05, + "loss": 2.2465, + "theoretical_loss": 3.353065139516675, + "tokens_seen": 2649284608 + }, + { + "epoch": 8.08, + "learning_rate": 9.970912738214643e-05, + "loss": 2.5328, + "theoretical_loss": 3.3530585999170484, + "tokens_seen": 2649350144 + }, + { + "epoch": 8.08, + "learning_rate": 9.969909729187563e-05, + "loss": 2.3334, + "theoretical_loss": 3.3530520605244813, + "tokens_seen": 2649415680 + }, + { + "epoch": 8.08, + "learning_rate": 9.968906720160481e-05, + "loss": 2.5236, + "theoretical_loss": 3.353045521338962, + "tokens_seen": 2649481216 + }, + { + "epoch": 8.08, + "learning_rate": 9.9679037111334e-05, + "loss": 2.4125, + "theoretical_loss": 3.3530389823604785, + "tokens_seen": 2649546752 + }, + { + "epoch": 8.08, + "learning_rate": 9.966900702106319e-05, + "loss": 2.4194, + "theoretical_loss": 3.353032443589019, + "tokens_seen": 2649612288 + }, + { + "epoch": 8.08, + "learning_rate": 9.965897693079238e-05, + "loss": 2.3953, + "theoretical_loss": 3.3530259050245728, + "tokens_seen": 2649677824 + }, + { + "epoch": 8.08, + "learning_rate": 9.964894684052157e-05, + "loss": 2.3471, + "theoretical_loss": 3.353019366667127, + "tokens_seen": 2649743360 + }, + { + "epoch": 8.08, + "learning_rate": 9.963891675025076e-05, + "loss": 2.454, + "theoretical_loss": 3.353012828516671, + "tokens_seen": 2649808896 + }, + { + "epoch": 8.08, + "learning_rate": 9.962888665997995e-05, + "loss": 2.4406, + "theoretical_loss": 3.353006290573192, + "tokens_seen": 2649874432 + }, + { + "epoch": 8.08, + "learning_rate": 9.961885656970913e-05, + "loss": 2.4848, + "theoretical_loss": 3.3529997528366793, + "tokens_seen": 2649939968 + }, + { + "epoch": 8.08, + "learning_rate": 9.960882647943833e-05, + "loss": 2.5023, + "theoretical_loss": 3.352993215307121, + "tokens_seen": 2650005504 + }, + { + "epoch": 8.08, + "learning_rate": 9.959879638916751e-05, + "loss": 2.3611, + "theoretical_loss": 3.352986677984505, + "tokens_seen": 2650071040 + }, + { + "epoch": 8.08, + "learning_rate": 9.95887662988967e-05, + "loss": 2.363, + "theoretical_loss": 3.3529801408688202, + "tokens_seen": 2650136576 + }, + { + "epoch": 8.08, + "learning_rate": 9.957873620862588e-05, + "loss": 2.3842, + "theoretical_loss": 3.3529736039600544, + "tokens_seen": 2650202112 + }, + { + "epoch": 8.08, + "learning_rate": 9.956870611835506e-05, + "loss": 2.5214, + "theoretical_loss": 3.3529670672581964, + "tokens_seen": 2650267648 + }, + { + "epoch": 8.08, + "learning_rate": 9.955867602808426e-05, + "loss": 2.2837, + "theoretical_loss": 3.352960530763234, + "tokens_seen": 2650333184 + }, + { + "epoch": 8.08, + "learning_rate": 9.954864593781344e-05, + "loss": 2.4913, + "theoretical_loss": 3.3529539944751563, + "tokens_seen": 2650398720 + }, + { + "epoch": 8.08, + "learning_rate": 9.953861584754263e-05, + "loss": 2.2546, + "theoretical_loss": 3.352947458393951, + "tokens_seen": 2650464256 + }, + { + "epoch": 8.08, + "learning_rate": 9.952858575727182e-05, + "loss": 2.4769, + "theoretical_loss": 3.3529409225196067, + "tokens_seen": 2650529792 + }, + { + "epoch": 8.08, + "learning_rate": 9.951855566700101e-05, + "loss": 2.4501, + "theoretical_loss": 3.352934386852112, + "tokens_seen": 2650595328 + }, + { + "epoch": 8.08, + "learning_rate": 9.950852557673019e-05, + "loss": 2.1402, + "theoretical_loss": 3.3529278513914544, + "tokens_seen": 2650660864 + }, + { + "epoch": 8.08, + "learning_rate": 9.949849548645939e-05, + "loss": 2.4428, + "theoretical_loss": 3.352921316137623, + "tokens_seen": 2650726400 + }, + { + "epoch": 8.08, + "learning_rate": 9.948846539618857e-05, + "loss": 2.4435, + "theoretical_loss": 3.352914781090606, + "tokens_seen": 2650791936 + }, + { + "epoch": 8.08, + "learning_rate": 9.947843530591776e-05, + "loss": 2.3957, + "theoretical_loss": 3.352908246250392, + "tokens_seen": 2650857472 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0571491718292236, + "objective/train/theoretical_loss": 3.352901711616968, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.352901711616968, + "tokens_seen": 2650923008 + }, + { + "epoch": 8.08, + "learning_rate": 9.946840521564694e-05, + "loss": 2.4714, + "theoretical_loss": 3.352901711616968, + "tokens_seen": 2650923008 + }, + { + "epoch": 8.08, + "learning_rate": 9.945837512537612e-05, + "loss": 2.434, + "theoretical_loss": 3.3528951771903244, + "tokens_seen": 2650988544 + }, + { + "epoch": 8.08, + "learning_rate": 9.944834503510532e-05, + "loss": 2.5303, + "theoretical_loss": 3.3528886429704476, + "tokens_seen": 2651054080 + }, + { + "epoch": 8.08, + "learning_rate": 9.94383149448345e-05, + "loss": 2.4312, + "theoretical_loss": 3.3528821089573277, + "tokens_seen": 2651119616 + }, + { + "epoch": 8.08, + "learning_rate": 9.94282848545637e-05, + "loss": 2.4783, + "theoretical_loss": 3.3528755751509514, + "tokens_seen": 2651185152 + }, + { + "epoch": 8.08, + "learning_rate": 9.941825476429288e-05, + "loss": 2.4348, + "theoretical_loss": 3.3528690415513083, + "tokens_seen": 2651250688 + }, + { + "epoch": 8.08, + "learning_rate": 9.940822467402207e-05, + "loss": 2.4827, + "theoretical_loss": 3.352862508158386, + "tokens_seen": 2651316224 + }, + { + "epoch": 8.08, + "learning_rate": 9.939819458375125e-05, + "loss": 2.3249, + "theoretical_loss": 3.352855974972173, + "tokens_seen": 2651381760 + }, + { + "epoch": 8.08, + "learning_rate": 9.938816449348045e-05, + "loss": 2.3328, + "theoretical_loss": 3.3528494419926584, + "tokens_seen": 2651447296 + }, + { + "epoch": 8.08, + "learning_rate": 9.937813440320963e-05, + "loss": 2.6019, + "theoretical_loss": 3.3528429092198295, + "tokens_seen": 2651512832 + }, + { + "epoch": 8.08, + "learning_rate": 9.936810431293881e-05, + "loss": 2.2784, + "theoretical_loss": 3.352836376653675, + "tokens_seen": 2651578368 + }, + { + "epoch": 8.08, + "learning_rate": 9.9358074222668e-05, + "loss": 2.3134, + "theoretical_loss": 3.3528298442941837, + "tokens_seen": 2651643904 + }, + { + "epoch": 8.08, + "learning_rate": 9.934804413239718e-05, + "loss": 2.5618, + "theoretical_loss": 3.3528233121413433, + "tokens_seen": 2651709440 + }, + { + "epoch": 8.08, + "learning_rate": 9.933801404212638e-05, + "loss": 2.3243, + "theoretical_loss": 3.3528167801951425, + "tokens_seen": 2651774976 + }, + { + "epoch": 8.08, + "learning_rate": 9.932798395185556e-05, + "loss": 2.4235, + "theoretical_loss": 3.3528102484555693, + "tokens_seen": 2651840512 + }, + { + "epoch": 8.08, + "learning_rate": 9.931795386158475e-05, + "loss": 2.3301, + "theoretical_loss": 3.3528037169226126, + "tokens_seen": 2651906048 + }, + { + "epoch": 8.08, + "learning_rate": 9.930792377131394e-05, + "loss": 2.7289, + "theoretical_loss": 3.352797185596261, + "tokens_seen": 2651971584 + }, + { + "epoch": 8.08, + "learning_rate": 9.929789368104313e-05, + "loss": 2.3407, + "theoretical_loss": 3.3527906544765016, + "tokens_seen": 2652037120 + }, + { + "epoch": 8.08, + "learning_rate": 9.928786359077231e-05, + "loss": 2.5802, + "theoretical_loss": 3.352784123563324, + "tokens_seen": 2652102656 + }, + { + "epoch": 8.08, + "learning_rate": 9.927783350050152e-05, + "loss": 2.0788, + "theoretical_loss": 3.352777592856716, + "tokens_seen": 2652168192 + }, + { + "epoch": 8.08, + "learning_rate": 9.92678034102307e-05, + "loss": 2.5275, + "theoretical_loss": 3.3527710623566658, + "tokens_seen": 2652233728 + }, + { + "epoch": 8.08, + "learning_rate": 9.925777331995988e-05, + "loss": 2.1455, + "theoretical_loss": 3.3527645320631625, + "tokens_seen": 2652299264 + }, + { + "epoch": 8.08, + "learning_rate": 9.924774322968908e-05, + "loss": 2.2197, + "theoretical_loss": 3.3527580019761936, + "tokens_seen": 2652364800 + }, + { + "epoch": 8.08, + "learning_rate": 9.923771313941826e-05, + "loss": 2.4475, + "theoretical_loss": 3.352751472095748, + "tokens_seen": 2652430336 + }, + { + "epoch": 8.08, + "learning_rate": 9.922768304914745e-05, + "loss": 2.2249, + "theoretical_loss": 3.352744942421814, + "tokens_seen": 2652495872 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.695746898651123, + "objective/train/theoretical_loss": 3.3527384129543796, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.3527384129543796, + "tokens_seen": 2652561408 + }, + { + "epoch": 8.08, + "learning_rate": 9.921765295887663e-05, + "loss": 2.5125, + "theoretical_loss": 3.3527384129543796, + "tokens_seen": 2652561408 + }, + { + "epoch": 8.08, + "learning_rate": 9.920762286860583e-05, + "loss": 2.4085, + "theoretical_loss": 3.3527318836934334, + "tokens_seen": 2652626944 + }, + { + "epoch": 8.08, + "learning_rate": 9.919759277833501e-05, + "loss": 2.3821, + "theoretical_loss": 3.3527253546389644, + "tokens_seen": 2652692480 + }, + { + "epoch": 8.08, + "learning_rate": 9.91875626880642e-05, + "loss": 2.4354, + "theoretical_loss": 3.35271882579096, + "tokens_seen": 2652758016 + }, + { + "epoch": 8.08, + "learning_rate": 9.917753259779339e-05, + "loss": 2.5723, + "theoretical_loss": 3.352712297149409, + "tokens_seen": 2652823552 + }, + { + "epoch": 8.08, + "learning_rate": 9.916750250752257e-05, + "loss": 2.5366, + "theoretical_loss": 3.3527057687142996, + "tokens_seen": 2652889088 + }, + { + "epoch": 8.08, + "learning_rate": 9.915747241725176e-05, + "loss": 2.2866, + "theoretical_loss": 3.3526992404856206, + "tokens_seen": 2652954624 + }, + { + "epoch": 8.08, + "learning_rate": 9.914744232698094e-05, + "loss": 2.4923, + "theoretical_loss": 3.3526927124633596, + "tokens_seen": 2653020160 + }, + { + "epoch": 8.08, + "learning_rate": 9.913741223671014e-05, + "loss": 2.3235, + "theoretical_loss": 3.352686184647506, + "tokens_seen": 2653085696 + }, + { + "epoch": 8.08, + "learning_rate": 9.912738214643932e-05, + "loss": 2.6408, + "theoretical_loss": 3.3526796570380473, + "tokens_seen": 2653151232 + }, + { + "epoch": 8.08, + "learning_rate": 9.911735205616851e-05, + "loss": 2.3813, + "theoretical_loss": 3.3526731296349723, + "tokens_seen": 2653216768 + }, + { + "epoch": 8.08, + "learning_rate": 9.91073219658977e-05, + "loss": 2.3944, + "theoretical_loss": 3.352666602438269, + "tokens_seen": 2653282304 + }, + { + "epoch": 8.08, + "learning_rate": 9.909729187562689e-05, + "loss": 2.4556, + "theoretical_loss": 3.3526600754479268, + "tokens_seen": 2653347840 + }, + { + "epoch": 8.08, + "learning_rate": 9.908726178535607e-05, + "loss": 2.4419, + "theoretical_loss": 3.352653548663933, + "tokens_seen": 2653413376 + }, + { + "epoch": 8.08, + "learning_rate": 9.907723169508525e-05, + "loss": 2.122, + "theoretical_loss": 3.352647022086276, + "tokens_seen": 2653478912 + }, + { + "epoch": 8.08, + "learning_rate": 9.906720160481445e-05, + "loss": 2.2171, + "theoretical_loss": 3.3526404957149447, + "tokens_seen": 2653544448 + }, + { + "epoch": 8.08, + "learning_rate": 9.905717151454363e-05, + "loss": 2.6446, + "theoretical_loss": 3.3526339695499274, + "tokens_seen": 2653609984 + }, + { + "epoch": 8.08, + "learning_rate": 9.904714142427282e-05, + "loss": 2.4346, + "theoretical_loss": 3.3526274435912122, + "tokens_seen": 2653675520 + }, + { + "epoch": 8.08, + "learning_rate": 9.9037111334002e-05, + "loss": 2.2741, + "theoretical_loss": 3.352620917838788, + "tokens_seen": 2653741056 + }, + { + "epoch": 8.08, + "learning_rate": 9.90270812437312e-05, + "loss": 2.4854, + "theoretical_loss": 3.3526143922926424, + "tokens_seen": 2653806592 + }, + { + "epoch": 8.08, + "learning_rate": 9.901705115346038e-05, + "loss": 2.595, + "theoretical_loss": 3.352607866952764, + "tokens_seen": 2653872128 + }, + { + "epoch": 8.08, + "learning_rate": 9.900702106318957e-05, + "loss": 2.5697, + "theoretical_loss": 3.352601341819142, + "tokens_seen": 2653937664 + }, + { + "epoch": 8.08, + "learning_rate": 9.899699097291875e-05, + "loss": 2.2878, + "theoretical_loss": 3.352594816891764, + "tokens_seen": 2654003200 + }, + { + "epoch": 8.08, + "learning_rate": 9.898696088264795e-05, + "loss": 2.2737, + "theoretical_loss": 3.3525882921706183, + "tokens_seen": 2654068736 + }, + { + "epoch": 8.08, + "learning_rate": 9.897693079237713e-05, + "loss": 2.5145, + "theoretical_loss": 3.352581767655694, + "tokens_seen": 2654134272 + }, + { + "epoch": 8.08, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4736855030059814, + "objective/train/theoretical_loss": 3.352575243346979, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.352575243346979, + "tokens_seen": 2654199808 + }, + { + "epoch": 8.08, + "learning_rate": 9.896690070210631e-05, + "loss": 2.1602, + "theoretical_loss": 3.352575243346979, + "tokens_seen": 2654199808 + }, + { + "epoch": 8.08, + "learning_rate": 9.89568706118355e-05, + "loss": 2.2267, + "theoretical_loss": 3.3525687192444615, + "tokens_seen": 2654265344 + }, + { + "epoch": 8.08, + "learning_rate": 9.894684052156469e-05, + "loss": 2.6149, + "theoretical_loss": 3.3525621953481304, + "tokens_seen": 2654330880 + }, + { + "epoch": 8.08, + "learning_rate": 9.893681043129388e-05, + "loss": 2.3472, + "theoretical_loss": 3.3525556716579734, + "tokens_seen": 2654396416 + }, + { + "epoch": 8.08, + "learning_rate": 9.892678034102306e-05, + "loss": 2.3916, + "theoretical_loss": 3.3525491481739795, + "tokens_seen": 2654461952 + }, + { + "epoch": 8.08, + "learning_rate": 9.891675025075226e-05, + "loss": 2.4587, + "theoretical_loss": 3.352542624896137, + "tokens_seen": 2654527488 + }, + { + "epoch": 8.08, + "learning_rate": 9.890672016048145e-05, + "loss": 2.4687, + "theoretical_loss": 3.3525361018244344, + "tokens_seen": 2654593024 + }, + { + "epoch": 8.09, + "learning_rate": 9.889669007021065e-05, + "loss": 2.4371, + "theoretical_loss": 3.35252957895886, + "tokens_seen": 2654658560 + }, + { + "epoch": 8.09, + "learning_rate": 9.888665997993983e-05, + "loss": 2.2648, + "theoretical_loss": 3.3525230562994017, + "tokens_seen": 2654724096 + }, + { + "epoch": 8.09, + "learning_rate": 9.887662988966901e-05, + "loss": 2.2943, + "theoretical_loss": 3.3525165338460488, + "tokens_seen": 2654789632 + }, + { + "epoch": 8.09, + "learning_rate": 9.88665997993982e-05, + "loss": 2.3462, + "theoretical_loss": 3.3525100115987887, + "tokens_seen": 2654855168 + }, + { + "epoch": 8.09, + "learning_rate": 9.885656970912738e-05, + "loss": 2.5674, + "theoretical_loss": 3.352503489557611, + "tokens_seen": 2654920704 + }, + { + "epoch": 8.09, + "learning_rate": 9.884653961885658e-05, + "loss": 2.2358, + "theoretical_loss": 3.3524969677225025, + "tokens_seen": 2654986240 + }, + { + "epoch": 8.09, + "learning_rate": 9.883650952858576e-05, + "loss": 2.4454, + "theoretical_loss": 3.3524904460934533, + "tokens_seen": 2655051776 + }, + { + "epoch": 8.09, + "learning_rate": 9.882647943831495e-05, + "loss": 2.5319, + "theoretical_loss": 3.3524839246704508, + "tokens_seen": 2655117312 + }, + { + "epoch": 8.09, + "learning_rate": 9.881644934804414e-05, + "loss": 2.4607, + "theoretical_loss": 3.3524774034534834, + "tokens_seen": 2655182848 + }, + { + "epoch": 8.09, + "learning_rate": 9.880641925777333e-05, + "loss": 2.4119, + "theoretical_loss": 3.3524708824425398, + "tokens_seen": 2655248384 + }, + { + "epoch": 8.09, + "learning_rate": 9.879638916750251e-05, + "loss": 2.4423, + "theoretical_loss": 3.352464361637608, + "tokens_seen": 2655313920 + }, + { + "epoch": 8.09, + "learning_rate": 9.878635907723169e-05, + "loss": 2.4607, + "theoretical_loss": 3.3524578410386776, + "tokens_seen": 2655379456 + }, + { + "epoch": 8.09, + "learning_rate": 9.877632898696089e-05, + "loss": 2.4256, + "theoretical_loss": 3.3524513206457356, + "tokens_seen": 2655444992 + }, + { + "epoch": 8.09, + "learning_rate": 9.876629889669007e-05, + "loss": 2.3845, + "theoretical_loss": 3.3524448004587715, + "tokens_seen": 2655510528 + }, + { + "epoch": 8.09, + "learning_rate": 9.875626880641926e-05, + "loss": 2.4885, + "theoretical_loss": 3.3524382804777724, + "tokens_seen": 2655576064 + }, + { + "epoch": 8.09, + "learning_rate": 9.874623871614844e-05, + "loss": 2.1658, + "theoretical_loss": 3.352431760702728, + "tokens_seen": 2655641600 + }, + { + "epoch": 8.09, + "learning_rate": 9.873620862587764e-05, + "loss": 2.4863, + "theoretical_loss": 3.352425241133626, + "tokens_seen": 2655707136 + }, + { + "epoch": 8.09, + "learning_rate": 9.872617853560682e-05, + "loss": 2.627, + "theoretical_loss": 3.352418721770455, + "tokens_seen": 2655772672 + }, + { + "epoch": 8.09, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9499683380126953, + "objective/train/theoretical_loss": 3.3524122026132037, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.3524122026132037, + "tokens_seen": 2655838208 + }, + { + "epoch": 8.09, + "learning_rate": 9.871614844533601e-05, + "loss": 2.433, + "theoretical_loss": 3.3524122026132037, + "tokens_seen": 2655838208 + }, + { + "epoch": 8.09, + "learning_rate": 9.87061183550652e-05, + "loss": 2.5009, + "theoretical_loss": 3.35240568366186, + "tokens_seen": 2655903744 + }, + { + "epoch": 8.09, + "learning_rate": 9.869608826479439e-05, + "loss": 2.2754, + "theoretical_loss": 3.3523991649164127, + "tokens_seen": 2655969280 + }, + { + "epoch": 8.09, + "learning_rate": 9.868605817452357e-05, + "loss": 2.0501, + "theoretical_loss": 3.35239264637685, + "tokens_seen": 2656034816 + }, + { + "epoch": 8.09, + "learning_rate": 9.867602808425275e-05, + "loss": 2.3548, + "theoretical_loss": 3.35238612804316, + "tokens_seen": 2656100352 + }, + { + "epoch": 8.09, + "learning_rate": 9.866599799398195e-05, + "loss": 2.3218, + "theoretical_loss": 3.352379609915332, + "tokens_seen": 2656165888 + }, + { + "epoch": 8.09, + "learning_rate": 9.865596790371113e-05, + "loss": 2.3625, + "theoretical_loss": 3.352373091993354, + "tokens_seen": 2656231424 + }, + { + "epoch": 8.09, + "learning_rate": 9.864593781344032e-05, + "loss": 2.3086, + "theoretical_loss": 3.352366574277214, + "tokens_seen": 2656296960 + }, + { + "epoch": 8.09, + "learning_rate": 9.86359077231695e-05, + "loss": 2.3214, + "theoretical_loss": 3.352360056766901, + "tokens_seen": 2656362496 + }, + { + "epoch": 8.09, + "learning_rate": 9.86258776328987e-05, + "loss": 2.3367, + "theoretical_loss": 3.3523535394624027, + "tokens_seen": 2656428032 + }, + { + "epoch": 8.09, + "learning_rate": 9.861584754262788e-05, + "loss": 2.6537, + "theoretical_loss": 3.3523470223637086, + "tokens_seen": 2656493568 + }, + { + "epoch": 8.09, + "learning_rate": 9.860581745235707e-05, + "loss": 2.508, + "theoretical_loss": 3.3523405054708064, + "tokens_seen": 2656559104 + }, + { + "epoch": 8.09, + "learning_rate": 9.859578736208626e-05, + "loss": 2.378, + "theoretical_loss": 3.3523339887836845, + "tokens_seen": 2656624640 + }, + { + "epoch": 8.09, + "learning_rate": 9.858575727181544e-05, + "loss": 2.592, + "theoretical_loss": 3.3523274723023317, + "tokens_seen": 2656690176 + }, + { + "epoch": 8.09, + "learning_rate": 9.857572718154463e-05, + "loss": 2.3522, + "theoretical_loss": 3.352320956026736, + "tokens_seen": 2656755712 + }, + { + "epoch": 8.09, + "learning_rate": 9.856569709127381e-05, + "loss": 2.5973, + "theoretical_loss": 3.3523144399568867, + "tokens_seen": 2656821248 + }, + { + "epoch": 8.09, + "learning_rate": 9.855566700100301e-05, + "loss": 2.5221, + "theoretical_loss": 3.352307924092771, + "tokens_seen": 2656886784 + }, + { + "epoch": 8.09, + "learning_rate": 9.854563691073219e-05, + "loss": 2.2894, + "theoretical_loss": 3.3523014084343776, + "tokens_seen": 2656952320 + }, + { + "epoch": 8.09, + "learning_rate": 9.85356068204614e-05, + "loss": 2.4418, + "theoretical_loss": 3.352294892981696, + "tokens_seen": 2657017856 + }, + { + "epoch": 8.09, + "learning_rate": 9.852557673019058e-05, + "loss": 2.5827, + "theoretical_loss": 3.3522883777347134, + "tokens_seen": 2657083392 + }, + { + "epoch": 8.09, + "learning_rate": 9.851554663991977e-05, + "loss": 2.4182, + "theoretical_loss": 3.3522818626934185, + "tokens_seen": 2657148928 + }, + { + "epoch": 8.09, + "learning_rate": 9.850551654964895e-05, + "loss": 2.6335, + "theoretical_loss": 3.3522753478578, + "tokens_seen": 2657214464 + }, + { + "epoch": 8.09, + "learning_rate": 9.849548645937815e-05, + "loss": 2.309, + "theoretical_loss": 3.3522688332278467, + "tokens_seen": 2657280000 + }, + { + "epoch": 8.09, + "learning_rate": 9.848545636910733e-05, + "loss": 2.3921, + "theoretical_loss": 3.3522623188035463, + "tokens_seen": 2657345536 + }, + { + "epoch": 8.09, + "learning_rate": 9.847542627883651e-05, + "loss": 2.6484, + "theoretical_loss": 3.352255804584888, + "tokens_seen": 2657411072 + }, + { + "epoch": 8.09, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3828349113464355, + "objective/train/theoretical_loss": 3.352249290571859, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.352249290571859, + "tokens_seen": 2657476608 + }, + { + "epoch": 8.09, + "learning_rate": 9.84653961885657e-05, + "loss": 2.3647, + "theoretical_loss": 3.352249290571859, + "tokens_seen": 2657476608 + }, + { + "epoch": 8.09, + "learning_rate": 9.845536609829489e-05, + "loss": 2.5018, + "theoretical_loss": 3.352242776764449, + "tokens_seen": 2657542144 + }, + { + "epoch": 8.09, + "learning_rate": 9.844533600802408e-05, + "loss": 2.2724, + "theoretical_loss": 3.352236263162646, + "tokens_seen": 2657607680 + }, + { + "epoch": 8.09, + "learning_rate": 9.843530591775326e-05, + "loss": 2.3942, + "theoretical_loss": 3.352229749766438, + "tokens_seen": 2657673216 + }, + { + "epoch": 8.09, + "learning_rate": 9.842527582748246e-05, + "loss": 2.5353, + "theoretical_loss": 3.3522232365758144, + "tokens_seen": 2657738752 + }, + { + "epoch": 8.09, + "learning_rate": 9.841524573721164e-05, + "loss": 2.6252, + "theoretical_loss": 3.3522167235907627, + "tokens_seen": 2657804288 + }, + { + "epoch": 8.09, + "learning_rate": 9.840521564694083e-05, + "loss": 2.3634, + "theoretical_loss": 3.352210210811272, + "tokens_seen": 2657869824 + }, + { + "epoch": 8.09, + "learning_rate": 9.839518555667001e-05, + "loss": 2.5471, + "theoretical_loss": 3.35220369823733, + "tokens_seen": 2657935360 + }, + { + "epoch": 8.09, + "learning_rate": 9.83851554663992e-05, + "loss": 2.2746, + "theoretical_loss": 3.352197185868926, + "tokens_seen": 2658000896 + }, + { + "epoch": 8.09, + "learning_rate": 9.837512537612839e-05, + "loss": 2.3826, + "theoretical_loss": 3.3521906737060476, + "tokens_seen": 2658066432 + }, + { + "epoch": 8.09, + "learning_rate": 9.836509528585757e-05, + "loss": 2.4589, + "theoretical_loss": 3.352184161748684, + "tokens_seen": 2658131968 + }, + { + "epoch": 8.09, + "learning_rate": 9.835506519558677e-05, + "loss": 2.2217, + "theoretical_loss": 3.3521776499968237, + "tokens_seen": 2658197504 + }, + { + "epoch": 8.09, + "learning_rate": 9.834503510531595e-05, + "loss": 2.4168, + "theoretical_loss": 3.352171138450454, + "tokens_seen": 2658263040 + }, + { + "epoch": 8.09, + "learning_rate": 9.833500501504514e-05, + "loss": 2.3965, + "theoretical_loss": 3.352164627109565, + "tokens_seen": 2658328576 + }, + { + "epoch": 8.09, + "learning_rate": 9.832497492477432e-05, + "loss": 2.3551, + "theoretical_loss": 3.352158115974144, + "tokens_seen": 2658394112 + }, + { + "epoch": 8.09, + "learning_rate": 9.831494483450352e-05, + "loss": 2.4726, + "theoretical_loss": 3.3521516050441793, + "tokens_seen": 2658459648 + }, + { + "epoch": 8.09, + "learning_rate": 9.83049147442327e-05, + "loss": 2.1113, + "theoretical_loss": 3.3521450943196602, + "tokens_seen": 2658525184 + }, + { + "epoch": 8.09, + "learning_rate": 9.829488465396188e-05, + "loss": 2.6128, + "theoretical_loss": 3.3521385838005746, + "tokens_seen": 2658590720 + }, + { + "epoch": 8.09, + "learning_rate": 9.828485456369107e-05, + "loss": 2.3964, + "theoretical_loss": 3.3521320734869113, + "tokens_seen": 2658656256 + }, + { + "epoch": 8.09, + "learning_rate": 9.827482447342026e-05, + "loss": 2.659, + "theoretical_loss": 3.3521255633786584, + "tokens_seen": 2658721792 + }, + { + "epoch": 8.09, + "learning_rate": 9.826479438314945e-05, + "loss": 2.4581, + "theoretical_loss": 3.3521190534758047, + "tokens_seen": 2658787328 + }, + { + "epoch": 8.09, + "learning_rate": 9.825476429287863e-05, + "loss": 2.3733, + "theoretical_loss": 3.352112543778339, + "tokens_seen": 2658852864 + }, + { + "epoch": 8.09, + "learning_rate": 9.824473420260783e-05, + "loss": 2.4485, + "theoretical_loss": 3.352106034286248, + "tokens_seen": 2658918400 + }, + { + "epoch": 8.09, + "learning_rate": 9.823470411233701e-05, + "loss": 2.3729, + "theoretical_loss": 3.352099524999522, + "tokens_seen": 2658983936 + }, + { + "epoch": 8.09, + "learning_rate": 9.82246740220662e-05, + "loss": 2.3504, + "theoretical_loss": 3.352093015918149, + "tokens_seen": 2659049472 + }, + { + "epoch": 8.09, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.209988594055176, + "objective/train/theoretical_loss": 3.352086507042117, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.352086507042117, + "tokens_seen": 2659115008 + }, + { + "epoch": 8.09, + "learning_rate": 9.821464393179538e-05, + "loss": 2.3413, + "theoretical_loss": 3.352086507042117, + "tokens_seen": 2659115008 + }, + { + "epoch": 8.09, + "learning_rate": 9.820461384152456e-05, + "loss": 2.3224, + "theoretical_loss": 3.352079998371415, + "tokens_seen": 2659180544 + }, + { + "epoch": 8.09, + "learning_rate": 9.819458375125376e-05, + "loss": 2.3084, + "theoretical_loss": 3.352073489906031, + "tokens_seen": 2659246080 + }, + { + "epoch": 8.09, + "learning_rate": 9.818455366098294e-05, + "loss": 2.3849, + "theoretical_loss": 3.352066981645954, + "tokens_seen": 2659311616 + }, + { + "epoch": 8.09, + "learning_rate": 9.817452357071213e-05, + "loss": 2.3627, + "theoretical_loss": 3.3520604735911723, + "tokens_seen": 2659377152 + }, + { + "epoch": 8.09, + "learning_rate": 9.816449348044132e-05, + "loss": 2.3745, + "theoretical_loss": 3.352053965741674, + "tokens_seen": 2659442688 + }, + { + "epoch": 8.09, + "learning_rate": 9.815446339017052e-05, + "loss": 2.5156, + "theoretical_loss": 3.3520474580974473, + "tokens_seen": 2659508224 + }, + { + "epoch": 8.09, + "learning_rate": 9.81444332998997e-05, + "loss": 2.2566, + "theoretical_loss": 3.352040950658482, + "tokens_seen": 2659573760 + }, + { + "epoch": 8.09, + "learning_rate": 9.81344032096289e-05, + "loss": 2.3684, + "theoretical_loss": 3.352034443424765, + "tokens_seen": 2659639296 + }, + { + "epoch": 8.09, + "learning_rate": 9.812437311935808e-05, + "loss": 2.3222, + "theoretical_loss": 3.352027936396286, + "tokens_seen": 2659704832 + }, + { + "epoch": 8.09, + "learning_rate": 9.811434302908728e-05, + "loss": 2.5202, + "theoretical_loss": 3.352021429573033, + "tokens_seen": 2659770368 + }, + { + "epoch": 8.09, + "learning_rate": 9.810431293881646e-05, + "loss": 2.3103, + "theoretical_loss": 3.3520149229549943, + "tokens_seen": 2659835904 + }, + { + "epoch": 8.09, + "learning_rate": 9.809428284854564e-05, + "loss": 2.2854, + "theoretical_loss": 3.352008416542158, + "tokens_seen": 2659901440 + }, + { + "epoch": 8.09, + "learning_rate": 9.808425275827483e-05, + "loss": 2.4245, + "theoretical_loss": 3.352001910334514, + "tokens_seen": 2659966976 + }, + { + "epoch": 8.09, + "learning_rate": 9.807422266800401e-05, + "loss": 2.3846, + "theoretical_loss": 3.3519954043320492, + "tokens_seen": 2660032512 + }, + { + "epoch": 8.09, + "learning_rate": 9.806419257773321e-05, + "loss": 2.5909, + "theoretical_loss": 3.351988898534753, + "tokens_seen": 2660098048 + }, + { + "epoch": 8.09, + "learning_rate": 9.805416248746239e-05, + "loss": 2.169, + "theoretical_loss": 3.3519823929426136, + "tokens_seen": 2660163584 + }, + { + "epoch": 8.09, + "learning_rate": 9.804413239719158e-05, + "loss": 2.4592, + "theoretical_loss": 3.3519758875556196, + "tokens_seen": 2660229120 + }, + { + "epoch": 8.09, + "learning_rate": 9.803410230692076e-05, + "loss": 2.5565, + "theoretical_loss": 3.351969382373759, + "tokens_seen": 2660294656 + }, + { + "epoch": 8.09, + "learning_rate": 9.802407221664996e-05, + "loss": 2.5544, + "theoretical_loss": 3.351962877397021, + "tokens_seen": 2660360192 + }, + { + "epoch": 8.09, + "learning_rate": 9.801404212637914e-05, + "loss": 2.5978, + "theoretical_loss": 3.3519563726253936, + "tokens_seen": 2660425728 + }, + { + "epoch": 8.09, + "learning_rate": 9.800401203610832e-05, + "loss": 2.1862, + "theoretical_loss": 3.351949868058865, + "tokens_seen": 2660491264 + }, + { + "epoch": 8.09, + "learning_rate": 9.799398194583752e-05, + "loss": 2.494, + "theoretical_loss": 3.3519433636974245, + "tokens_seen": 2660556800 + }, + { + "epoch": 8.09, + "learning_rate": 9.79839518555667e-05, + "loss": 2.3322, + "theoretical_loss": 3.3519368595410604, + "tokens_seen": 2660622336 + }, + { + "epoch": 8.09, + "learning_rate": 9.797392176529589e-05, + "loss": 2.2079, + "theoretical_loss": 3.3519303555897606, + "tokens_seen": 2660687872 + }, + { + "epoch": 8.09, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2256221771240234, + "objective/train/theoretical_loss": 3.351923851843514, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.351923851843514, + "tokens_seen": 2660753408 + }, + { + "epoch": 8.09, + "learning_rate": 9.796389167502507e-05, + "loss": 2.3452, + "theoretical_loss": 3.351923851843514, + "tokens_seen": 2660753408 + }, + { + "epoch": 8.09, + "learning_rate": 9.795386158475427e-05, + "loss": 2.7556, + "theoretical_loss": 3.351917348302309, + "tokens_seen": 2660818944 + }, + { + "epoch": 8.09, + "learning_rate": 9.794383149448345e-05, + "loss": 2.5739, + "theoretical_loss": 3.351910844966134, + "tokens_seen": 2660884480 + }, + { + "epoch": 8.09, + "learning_rate": 9.793380140421264e-05, + "loss": 2.4661, + "theoretical_loss": 3.351904341834978, + "tokens_seen": 2660950016 + }, + { + "epoch": 8.09, + "learning_rate": 9.792377131394182e-05, + "loss": 2.3381, + "theoretical_loss": 3.3518978389088288, + "tokens_seen": 2661015552 + }, + { + "epoch": 8.09, + "learning_rate": 9.791374122367102e-05, + "loss": 2.4953, + "theoretical_loss": 3.351891336187675, + "tokens_seen": 2661081088 + }, + { + "epoch": 8.09, + "learning_rate": 9.79037111334002e-05, + "loss": 2.1553, + "theoretical_loss": 3.3518848336715052, + "tokens_seen": 2661146624 + }, + { + "epoch": 8.09, + "learning_rate": 9.789368104312938e-05, + "loss": 2.3697, + "theoretical_loss": 3.351878331360308, + "tokens_seen": 2661212160 + }, + { + "epoch": 8.09, + "learning_rate": 9.788365095285858e-05, + "loss": 2.3356, + "theoretical_loss": 3.3518718292540726, + "tokens_seen": 2661277696 + }, + { + "epoch": 8.09, + "learning_rate": 9.787362086258776e-05, + "loss": 2.4298, + "theoretical_loss": 3.351865327352786, + "tokens_seen": 2661343232 + }, + { + "epoch": 8.09, + "learning_rate": 9.786359077231695e-05, + "loss": 2.498, + "theoretical_loss": 3.3518588256564374, + "tokens_seen": 2661408768 + }, + { + "epoch": 8.09, + "learning_rate": 9.785356068204613e-05, + "loss": 2.2207, + "theoretical_loss": 3.3518523241650153, + "tokens_seen": 2661474304 + }, + { + "epoch": 8.09, + "learning_rate": 9.784353059177533e-05, + "loss": 2.302, + "theoretical_loss": 3.3518458228785084, + "tokens_seen": 2661539840 + }, + { + "epoch": 8.09, + "learning_rate": 9.783350050150451e-05, + "loss": 2.3958, + "theoretical_loss": 3.3518393217969047, + "tokens_seen": 2661605376 + }, + { + "epoch": 8.09, + "learning_rate": 9.78234704112337e-05, + "loss": 2.5243, + "theoretical_loss": 3.3518328209201935, + "tokens_seen": 2661670912 + }, + { + "epoch": 8.09, + "learning_rate": 9.781344032096288e-05, + "loss": 2.2266, + "theoretical_loss": 3.3518263202483625, + "tokens_seen": 2661736448 + }, + { + "epoch": 8.09, + "learning_rate": 9.780341023069207e-05, + "loss": 2.2506, + "theoretical_loss": 3.351819819781401, + "tokens_seen": 2661801984 + }, + { + "epoch": 8.09, + "learning_rate": 9.779338014042126e-05, + "loss": 2.5152, + "theoretical_loss": 3.3518133195192963, + "tokens_seen": 2661867520 + }, + { + "epoch": 8.09, + "learning_rate": 9.778335005015046e-05, + "loss": 2.194, + "theoretical_loss": 3.3518068194620376, + "tokens_seen": 2661933056 + }, + { + "epoch": 8.09, + "learning_rate": 9.777331995987965e-05, + "loss": 2.6585, + "theoretical_loss": 3.3518003196096138, + "tokens_seen": 2661998592 + }, + { + "epoch": 8.09, + "learning_rate": 9.776328986960883e-05, + "loss": 2.3255, + "theoretical_loss": 3.3517938199620128, + "tokens_seen": 2662064128 + }, + { + "epoch": 8.09, + "learning_rate": 9.775325977933803e-05, + "loss": 2.3142, + "theoretical_loss": 3.3517873205192235, + "tokens_seen": 2662129664 + }, + { + "epoch": 8.09, + "learning_rate": 9.774322968906721e-05, + "loss": 2.4566, + "theoretical_loss": 3.351780821281234, + "tokens_seen": 2662195200 + }, + { + "epoch": 8.09, + "learning_rate": 9.77331995987964e-05, + "loss": 2.4713, + "theoretical_loss": 3.3517743222480334, + "tokens_seen": 2662260736 + }, + { + "epoch": 8.09, + "learning_rate": 9.772316950852558e-05, + "loss": 2.496, + "theoretical_loss": 3.3517678234196095, + "tokens_seen": 2662326272 + }, + { + "epoch": 8.09, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.8895364999771118, + "objective/train/theoretical_loss": 3.351761324795951, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.351761324795951, + "tokens_seen": 2662391808 + }, + { + "epoch": 8.09, + "learning_rate": 9.771313941825476e-05, + "loss": 2.3032, + "theoretical_loss": 3.351761324795951, + "tokens_seen": 2662391808 + }, + { + "epoch": 8.09, + "learning_rate": 9.770310932798396e-05, + "loss": 2.6111, + "theoretical_loss": 3.3517548263770465, + "tokens_seen": 2662457344 + }, + { + "epoch": 8.09, + "learning_rate": 9.769307923771314e-05, + "loss": 2.4439, + "theoretical_loss": 3.351748328162885, + "tokens_seen": 2662522880 + }, + { + "epoch": 8.09, + "learning_rate": 9.768304914744233e-05, + "loss": 2.4404, + "theoretical_loss": 3.351741830153454, + "tokens_seen": 2662588416 + }, + { + "epoch": 8.09, + "learning_rate": 9.767301905717152e-05, + "loss": 2.2972, + "theoretical_loss": 3.351735332348743, + "tokens_seen": 2662653952 + }, + { + "epoch": 8.09, + "learning_rate": 9.766298896690071e-05, + "loss": 2.4376, + "theoretical_loss": 3.35172883474874, + "tokens_seen": 2662719488 + }, + { + "epoch": 8.09, + "learning_rate": 9.765295887662989e-05, + "loss": 2.1485, + "theoretical_loss": 3.3517223373534337, + "tokens_seen": 2662785024 + }, + { + "epoch": 8.09, + "learning_rate": 9.764292878635909e-05, + "loss": 2.2882, + "theoretical_loss": 3.351715840162812, + "tokens_seen": 2662850560 + }, + { + "epoch": 8.09, + "learning_rate": 9.763289869608827e-05, + "loss": 2.5629, + "theoretical_loss": 3.3517093431768643, + "tokens_seen": 2662916096 + }, + { + "epoch": 8.09, + "learning_rate": 9.762286860581746e-05, + "loss": 2.3683, + "theoretical_loss": 3.351702846395579, + "tokens_seen": 2662981632 + }, + { + "epoch": 8.09, + "learning_rate": 9.761283851554664e-05, + "loss": 2.3244, + "theoretical_loss": 3.351696349818944, + "tokens_seen": 2663047168 + }, + { + "epoch": 8.09, + "learning_rate": 9.760280842527582e-05, + "loss": 2.3638, + "theoretical_loss": 3.351689853446948, + "tokens_seen": 2663112704 + }, + { + "epoch": 8.09, + "learning_rate": 9.759277833500502e-05, + "loss": 2.5794, + "theoretical_loss": 3.35168335727958, + "tokens_seen": 2663178240 + }, + { + "epoch": 8.09, + "learning_rate": 9.75827482447342e-05, + "loss": 2.3999, + "theoretical_loss": 3.3516768613168284, + "tokens_seen": 2663243776 + }, + { + "epoch": 8.09, + "learning_rate": 9.75727181544634e-05, + "loss": 2.4636, + "theoretical_loss": 3.3516703655586815, + "tokens_seen": 2663309312 + }, + { + "epoch": 8.09, + "learning_rate": 9.756268806419258e-05, + "loss": 2.5883, + "theoretical_loss": 3.3516638700051273, + "tokens_seen": 2663374848 + }, + { + "epoch": 8.09, + "learning_rate": 9.755265797392177e-05, + "loss": 2.2297, + "theoretical_loss": 3.3516573746561553, + "tokens_seen": 2663440384 + }, + { + "epoch": 8.09, + "learning_rate": 9.754262788365095e-05, + "loss": 2.4569, + "theoretical_loss": 3.3516508795117534, + "tokens_seen": 2663505920 + }, + { + "epoch": 8.09, + "learning_rate": 9.753259779338015e-05, + "loss": 2.5567, + "theoretical_loss": 3.3516443845719106, + "tokens_seen": 2663571456 + }, + { + "epoch": 8.09, + "learning_rate": 9.752256770310933e-05, + "loss": 2.4858, + "theoretical_loss": 3.351637889836615, + "tokens_seen": 2663636992 + }, + { + "epoch": 8.09, + "learning_rate": 9.751253761283851e-05, + "loss": 2.4444, + "theoretical_loss": 3.351631395305855, + "tokens_seen": 2663702528 + }, + { + "epoch": 8.09, + "learning_rate": 9.75025075225677e-05, + "loss": 2.462, + "theoretical_loss": 3.3516249009796195, + "tokens_seen": 2663768064 + }, + { + "epoch": 8.09, + "learning_rate": 9.749247743229688e-05, + "loss": 2.6772, + "theoretical_loss": 3.351618406857897, + "tokens_seen": 2663833600 + }, + { + "epoch": 8.09, + "learning_rate": 9.748244734202608e-05, + "loss": 2.334, + "theoretical_loss": 3.351611912940676, + "tokens_seen": 2663899136 + }, + { + "epoch": 8.09, + "learning_rate": 9.747241725175526e-05, + "loss": 2.5156, + "theoretical_loss": 3.351605419227945, + "tokens_seen": 2663964672 + }, + { + "epoch": 8.09, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5675952434539795, + "objective/train/theoretical_loss": 3.3515989257196925, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.3515989257196925, + "tokens_seen": 2664030208 + }, + { + "epoch": 8.09, + "learning_rate": 9.746238716148445e-05, + "loss": 2.2697, + "theoretical_loss": 3.3515989257196925, + "tokens_seen": 2664030208 + }, + { + "epoch": 8.09, + "learning_rate": 9.745235707121364e-05, + "loss": 2.4266, + "theoretical_loss": 3.351592432415907, + "tokens_seen": 2664095744 + }, + { + "epoch": 8.09, + "learning_rate": 9.744232698094283e-05, + "loss": 2.3073, + "theoretical_loss": 3.351585939316577, + "tokens_seen": 2664161280 + }, + { + "epoch": 8.09, + "learning_rate": 9.743229689067201e-05, + "loss": 2.6191, + "theoretical_loss": 3.3515794464216913, + "tokens_seen": 2664226816 + }, + { + "epoch": 8.09, + "learning_rate": 9.742226680040119e-05, + "loss": 2.4523, + "theoretical_loss": 3.351572953731238, + "tokens_seen": 2664292352 + }, + { + "epoch": 8.09, + "learning_rate": 9.741223671013039e-05, + "loss": 2.184, + "theoretical_loss": 3.351566461245206, + "tokens_seen": 2664357888 + }, + { + "epoch": 8.09, + "learning_rate": 9.740220661985958e-05, + "loss": 2.4642, + "theoretical_loss": 3.351559968963584, + "tokens_seen": 2664423424 + }, + { + "epoch": 8.09, + "learning_rate": 9.739217652958878e-05, + "loss": 2.4866, + "theoretical_loss": 3.3515534768863597, + "tokens_seen": 2664488960 + }, + { + "epoch": 8.09, + "learning_rate": 9.738214643931796e-05, + "loss": 2.0862, + "theoretical_loss": 3.3515469850135227, + "tokens_seen": 2664554496 + }, + { + "epoch": 8.09, + "learning_rate": 9.737211634904715e-05, + "loss": 2.3294, + "theoretical_loss": 3.3515404933450608, + "tokens_seen": 2664620032 + }, + { + "epoch": 8.09, + "learning_rate": 9.736208625877633e-05, + "loss": 2.4782, + "theoretical_loss": 3.3515340018809625, + "tokens_seen": 2664685568 + }, + { + "epoch": 8.09, + "learning_rate": 9.735205616850553e-05, + "loss": 2.5374, + "theoretical_loss": 3.3515275106212172, + "tokens_seen": 2664751104 + }, + { + "epoch": 8.09, + "learning_rate": 9.734202607823471e-05, + "loss": 2.5233, + "theoretical_loss": 3.3515210195658125, + "tokens_seen": 2664816640 + }, + { + "epoch": 8.09, + "learning_rate": 9.73319959879639e-05, + "loss": 2.4755, + "theoretical_loss": 3.351514528714737, + "tokens_seen": 2664882176 + }, + { + "epoch": 8.09, + "learning_rate": 9.732196589769309e-05, + "loss": 2.488, + "theoretical_loss": 3.35150803806798, + "tokens_seen": 2664947712 + }, + { + "epoch": 8.09, + "learning_rate": 9.731193580742227e-05, + "loss": 2.4022, + "theoretical_loss": 3.351501547625529, + "tokens_seen": 2665013248 + }, + { + "epoch": 8.09, + "learning_rate": 9.730190571715146e-05, + "loss": 2.2748, + "theoretical_loss": 3.351495057387374, + "tokens_seen": 2665078784 + }, + { + "epoch": 8.09, + "learning_rate": 9.729187562688064e-05, + "loss": 2.3577, + "theoretical_loss": 3.351488567353502, + "tokens_seen": 2665144320 + }, + { + "epoch": 8.09, + "learning_rate": 9.728184553660984e-05, + "loss": 2.5897, + "theoretical_loss": 3.3514820775239027, + "tokens_seen": 2665209856 + }, + { + "epoch": 8.09, + "learning_rate": 9.727181544633902e-05, + "loss": 2.2628, + "theoretical_loss": 3.351475587898564, + "tokens_seen": 2665275392 + }, + { + "epoch": 8.09, + "learning_rate": 9.726178535606821e-05, + "loss": 2.4156, + "theoretical_loss": 3.351469098477474, + "tokens_seen": 2665340928 + }, + { + "epoch": 8.09, + "learning_rate": 9.72517552657974e-05, + "loss": 2.3808, + "theoretical_loss": 3.3514626092606226, + "tokens_seen": 2665406464 + }, + { + "epoch": 8.09, + "learning_rate": 9.724172517552659e-05, + "loss": 2.6636, + "theoretical_loss": 3.351456120247997, + "tokens_seen": 2665472000 + }, + { + "epoch": 8.09, + "learning_rate": 9.723169508525577e-05, + "loss": 2.4191, + "theoretical_loss": 3.351449631439587, + "tokens_seen": 2665537536 + }, + { + "epoch": 8.09, + "learning_rate": 9.722166499498495e-05, + "loss": 2.4095, + "theoretical_loss": 3.35144314283538, + "tokens_seen": 2665603072 + }, + { + "epoch": 8.09, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.549375057220459, + "objective/train/theoretical_loss": 3.3514366544353655, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.3514366544353655, + "tokens_seen": 2665668608 + }, + { + "epoch": 8.09, + "learning_rate": 9.721163490471415e-05, + "loss": 2.2659, + "theoretical_loss": 3.3514366544353655, + "tokens_seen": 2665668608 + }, + { + "epoch": 8.09, + "learning_rate": 9.720160481444333e-05, + "loss": 2.6145, + "theoretical_loss": 3.351430166239531, + "tokens_seen": 2665734144 + }, + { + "epoch": 8.09, + "learning_rate": 9.719157472417252e-05, + "loss": 2.2964, + "theoretical_loss": 3.3514236782478664, + "tokens_seen": 2665799680 + }, + { + "epoch": 8.09, + "learning_rate": 9.71815446339017e-05, + "loss": 2.4269, + "theoretical_loss": 3.351417190460359, + "tokens_seen": 2665865216 + }, + { + "epoch": 8.09, + "learning_rate": 9.71715145436309e-05, + "loss": 2.2241, + "theoretical_loss": 3.3514107028769984, + "tokens_seen": 2665930752 + }, + { + "epoch": 8.09, + "learning_rate": 9.716148445336008e-05, + "loss": 2.3144, + "theoretical_loss": 3.3514042154977726, + "tokens_seen": 2665996288 + }, + { + "epoch": 8.09, + "learning_rate": 9.715145436308927e-05, + "loss": 2.4275, + "theoretical_loss": 3.3513977283226697, + "tokens_seen": 2666061824 + }, + { + "epoch": 8.09, + "learning_rate": 9.714142427281845e-05, + "loss": 2.3132, + "theoretical_loss": 3.351391241351679, + "tokens_seen": 2666127360 + }, + { + "epoch": 8.09, + "learning_rate": 9.713139418254765e-05, + "loss": 2.2899, + "theoretical_loss": 3.3513847545847892, + "tokens_seen": 2666192896 + }, + { + "epoch": 8.09, + "learning_rate": 9.712136409227683e-05, + "loss": 2.4154, + "theoretical_loss": 3.351378268021988, + "tokens_seen": 2666258432 + }, + { + "epoch": 8.09, + "learning_rate": 9.711133400200601e-05, + "loss": 2.5411, + "theoretical_loss": 3.3513717816632647, + "tokens_seen": 2666323968 + }, + { + "epoch": 8.09, + "learning_rate": 9.71013039117352e-05, + "loss": 2.28, + "theoretical_loss": 3.3513652955086077, + "tokens_seen": 2666389504 + }, + { + "epoch": 8.09, + "learning_rate": 9.709127382146439e-05, + "loss": 2.3379, + "theoretical_loss": 3.3513588095580054, + "tokens_seen": 2666455040 + }, + { + "epoch": 8.09, + "learning_rate": 9.708124373119358e-05, + "loss": 2.4759, + "theoretical_loss": 3.3513523238114464, + "tokens_seen": 2666520576 + }, + { + "epoch": 8.09, + "learning_rate": 9.707121364092276e-05, + "loss": 2.4326, + "theoretical_loss": 3.351345838268919, + "tokens_seen": 2666586112 + }, + { + "epoch": 8.09, + "learning_rate": 9.706118355065196e-05, + "loss": 2.4407, + "theoretical_loss": 3.3513393529304123, + "tokens_seen": 2666651648 + }, + { + "epoch": 8.09, + "learning_rate": 9.705115346038114e-05, + "loss": 2.4473, + "theoretical_loss": 3.351332867795915, + "tokens_seen": 2666717184 + }, + { + "epoch": 8.09, + "learning_rate": 9.704112337011033e-05, + "loss": 2.5521, + "theoretical_loss": 3.3513263828654147, + "tokens_seen": 2666782720 + }, + { + "epoch": 8.09, + "learning_rate": 9.703109327983953e-05, + "loss": 2.1874, + "theoretical_loss": 3.351319898138901, + "tokens_seen": 2666848256 + }, + { + "epoch": 8.09, + "learning_rate": 9.702106318956871e-05, + "loss": 2.3824, + "theoretical_loss": 3.3513134136163623, + "tokens_seen": 2666913792 + }, + { + "epoch": 8.09, + "learning_rate": 9.70110330992979e-05, + "loss": 2.33, + "theoretical_loss": 3.3513069292977864, + "tokens_seen": 2666979328 + }, + { + "epoch": 8.09, + "learning_rate": 9.700100300902708e-05, + "loss": 2.3961, + "theoretical_loss": 3.3513004451831625, + "tokens_seen": 2667044864 + }, + { + "epoch": 8.09, + "learning_rate": 9.699097291875628e-05, + "loss": 2.4551, + "theoretical_loss": 3.3512939612724795, + "tokens_seen": 2667110400 + }, + { + "epoch": 8.09, + "learning_rate": 9.698094282848546e-05, + "loss": 2.4346, + "theoretical_loss": 3.3512874775657253, + "tokens_seen": 2667175936 + }, + { + "epoch": 8.09, + "learning_rate": 9.697091273821465e-05, + "loss": 2.467, + "theoretical_loss": 3.3512809940628885, + "tokens_seen": 2667241472 + }, + { + "epoch": 8.09, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6839194297790527, + "objective/train/theoretical_loss": 3.3512745107639583, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.3512745107639583, + "tokens_seen": 2667307008 + }, + { + "epoch": 8.09, + "learning_rate": 9.696088264794384e-05, + "loss": 2.4461, + "theoretical_loss": 3.3512745107639583, + "tokens_seen": 2667307008 + }, + { + "epoch": 8.09, + "learning_rate": 9.695085255767303e-05, + "loss": 2.2623, + "theoretical_loss": 3.3512680276689224, + "tokens_seen": 2667372544 + }, + { + "epoch": 8.09, + "learning_rate": 9.694082246740221e-05, + "loss": 2.3063, + "theoretical_loss": 3.3512615447777705, + "tokens_seen": 2667438080 + }, + { + "epoch": 8.09, + "learning_rate": 9.693079237713139e-05, + "loss": 2.4168, + "theoretical_loss": 3.35125506209049, + "tokens_seen": 2667503616 + }, + { + "epoch": 8.09, + "learning_rate": 9.692076228686059e-05, + "loss": 2.1842, + "theoretical_loss": 3.3512485796070703, + "tokens_seen": 2667569152 + }, + { + "epoch": 8.09, + "learning_rate": 9.691073219658977e-05, + "loss": 2.4046, + "theoretical_loss": 3.3512420973274994, + "tokens_seen": 2667634688 + }, + { + "epoch": 8.09, + "learning_rate": 9.690070210631896e-05, + "loss": 2.2998, + "theoretical_loss": 3.3512356152517664, + "tokens_seen": 2667700224 + }, + { + "epoch": 8.09, + "learning_rate": 9.689067201604814e-05, + "loss": 2.3036, + "theoretical_loss": 3.3512291333798596, + "tokens_seen": 2667765760 + }, + { + "epoch": 8.09, + "learning_rate": 9.688064192577734e-05, + "loss": 2.6196, + "theoretical_loss": 3.3512226517117676, + "tokens_seen": 2667831296 + }, + { + "epoch": 8.09, + "learning_rate": 9.687061183550652e-05, + "loss": 2.4659, + "theoretical_loss": 3.351216170247479, + "tokens_seen": 2667896832 + }, + { + "epoch": 8.09, + "learning_rate": 9.686058174523571e-05, + "loss": 2.386, + "theoretical_loss": 3.351209688986983, + "tokens_seen": 2667962368 + }, + { + "epoch": 8.09, + "learning_rate": 9.68505516549649e-05, + "loss": 2.437, + "theoretical_loss": 3.351203207930267, + "tokens_seen": 2668027904 + }, + { + "epoch": 8.09, + "learning_rate": 9.684052156469409e-05, + "loss": 2.1631, + "theoretical_loss": 3.3511967270773204, + "tokens_seen": 2668093440 + }, + { + "epoch": 8.09, + "learning_rate": 9.683049147442327e-05, + "loss": 2.4713, + "theoretical_loss": 3.3511902464281316, + "tokens_seen": 2668158976 + }, + { + "epoch": 8.09, + "learning_rate": 9.682046138415245e-05, + "loss": 2.4376, + "theoretical_loss": 3.351183765982689, + "tokens_seen": 2668224512 + }, + { + "epoch": 8.09, + "learning_rate": 9.681043129388165e-05, + "loss": 2.4437, + "theoretical_loss": 3.3511772857409814, + "tokens_seen": 2668290048 + }, + { + "epoch": 8.09, + "learning_rate": 9.680040120361083e-05, + "loss": 2.7309, + "theoretical_loss": 3.3511708057029974, + "tokens_seen": 2668355584 + }, + { + "epoch": 8.09, + "learning_rate": 9.679037111334002e-05, + "loss": 2.2065, + "theoretical_loss": 3.3511643258687256, + "tokens_seen": 2668421120 + }, + { + "epoch": 8.09, + "learning_rate": 9.67803410230692e-05, + "loss": 2.4624, + "theoretical_loss": 3.3511578462381544, + "tokens_seen": 2668486656 + }, + { + "epoch": 8.09, + "learning_rate": 9.67703109327984e-05, + "loss": 2.3146, + "theoretical_loss": 3.3511513668112727, + "tokens_seen": 2668552192 + }, + { + "epoch": 8.09, + "learning_rate": 9.676028084252758e-05, + "loss": 2.3356, + "theoretical_loss": 3.351144887588069, + "tokens_seen": 2668617728 + }, + { + "epoch": 8.09, + "learning_rate": 9.675025075225677e-05, + "loss": 2.1376, + "theoretical_loss": 3.3511384085685316, + "tokens_seen": 2668683264 + }, + { + "epoch": 8.09, + "learning_rate": 9.674022066198596e-05, + "loss": 2.4704, + "theoretical_loss": 3.3511319297526496, + "tokens_seen": 2668748800 + }, + { + "epoch": 8.09, + "learning_rate": 9.673019057171514e-05, + "loss": 2.2042, + "theoretical_loss": 3.351125451140411, + "tokens_seen": 2668814336 + }, + { + "epoch": 8.09, + "learning_rate": 9.672016048144433e-05, + "loss": 2.3932, + "theoretical_loss": 3.351118972731805, + "tokens_seen": 2668879872 + }, + { + "epoch": 8.09, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7907612323760986, + "objective/train/theoretical_loss": 3.3511124945268196, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.3511124945268196, + "tokens_seen": 2668945408 + }, + { + "epoch": 8.09, + "learning_rate": 9.671013039117351e-05, + "loss": 2.5061, + "theoretical_loss": 3.3511124945268196, + "tokens_seen": 2668945408 + }, + { + "epoch": 8.09, + "learning_rate": 9.670010030090271e-05, + "loss": 2.2742, + "theoretical_loss": 3.351106016525444, + "tokens_seen": 2669010944 + }, + { + "epoch": 8.09, + "learning_rate": 9.669007021063189e-05, + "loss": 2.353, + "theoretical_loss": 3.351099538727666, + "tokens_seen": 2669076480 + }, + { + "epoch": 8.09, + "learning_rate": 9.668004012036108e-05, + "loss": 2.337, + "theoretical_loss": 3.3510930611334753, + "tokens_seen": 2669142016 + }, + { + "epoch": 8.09, + "learning_rate": 9.667001003009026e-05, + "loss": 2.265, + "theoretical_loss": 3.35108658374286, + "tokens_seen": 2669207552 + }, + { + "epoch": 8.09, + "learning_rate": 9.665997993981946e-05, + "loss": 2.3523, + "theoretical_loss": 3.3510801065558082, + "tokens_seen": 2669273088 + }, + { + "epoch": 8.09, + "learning_rate": 9.664994984954865e-05, + "loss": 2.7541, + "theoretical_loss": 3.351073629572309, + "tokens_seen": 2669338624 + }, + { + "epoch": 8.09, + "learning_rate": 9.663991975927785e-05, + "loss": 2.3792, + "theoretical_loss": 3.351067152792351, + "tokens_seen": 2669404160 + }, + { + "epoch": 8.09, + "learning_rate": 9.662988966900703e-05, + "loss": 2.4741, + "theoretical_loss": 3.3510606762159227, + "tokens_seen": 2669469696 + }, + { + "epoch": 8.09, + "learning_rate": 9.661985957873621e-05, + "loss": 2.4393, + "theoretical_loss": 3.351054199843013, + "tokens_seen": 2669535232 + }, + { + "epoch": 8.09, + "learning_rate": 9.66098294884654e-05, + "loss": 2.3786, + "theoretical_loss": 3.35104772367361, + "tokens_seen": 2669600768 + }, + { + "epoch": 8.09, + "learning_rate": 9.659979939819459e-05, + "loss": 2.3106, + "theoretical_loss": 3.3510412477077027, + "tokens_seen": 2669666304 + }, + { + "epoch": 8.09, + "learning_rate": 9.658976930792378e-05, + "loss": 2.1833, + "theoretical_loss": 3.3510347719452795, + "tokens_seen": 2669731840 + }, + { + "epoch": 8.09, + "learning_rate": 9.657973921765296e-05, + "loss": 2.5107, + "theoretical_loss": 3.351028296386329, + "tokens_seen": 2669797376 + }, + { + "epoch": 8.09, + "learning_rate": 9.656970912738216e-05, + "loss": 2.4172, + "theoretical_loss": 3.35102182103084, + "tokens_seen": 2669862912 + }, + { + "epoch": 8.09, + "learning_rate": 9.655967903711134e-05, + "loss": 2.4429, + "theoretical_loss": 3.351015345878801, + "tokens_seen": 2669928448 + }, + { + "epoch": 8.09, + "learning_rate": 9.654964894684053e-05, + "loss": 2.2468, + "theoretical_loss": 3.3510088709302006, + "tokens_seen": 2669993984 + }, + { + "epoch": 8.09, + "learning_rate": 9.653961885656971e-05, + "loss": 2.5413, + "theoretical_loss": 3.3510023961850277, + "tokens_seen": 2670059520 + }, + { + "epoch": 8.09, + "learning_rate": 9.65295887662989e-05, + "loss": 2.4373, + "theoretical_loss": 3.3509959216432703, + "tokens_seen": 2670125056 + }, + { + "epoch": 8.09, + "learning_rate": 9.651955867602809e-05, + "loss": 2.5644, + "theoretical_loss": 3.3509894473049178, + "tokens_seen": 2670190592 + }, + { + "epoch": 8.09, + "learning_rate": 9.650952858575727e-05, + "loss": 2.4285, + "theoretical_loss": 3.350982973169958, + "tokens_seen": 2670256128 + }, + { + "epoch": 8.09, + "learning_rate": 9.649949849548647e-05, + "loss": 2.6861, + "theoretical_loss": 3.35097649923838, + "tokens_seen": 2670321664 + }, + { + "epoch": 8.09, + "learning_rate": 9.648946840521565e-05, + "loss": 2.3492, + "theoretical_loss": 3.3509700255101724, + "tokens_seen": 2670387200 + }, + { + "epoch": 8.09, + "learning_rate": 9.647943831494484e-05, + "loss": 2.4444, + "theoretical_loss": 3.350963551985324, + "tokens_seen": 2670452736 + }, + { + "epoch": 8.09, + "learning_rate": 9.646940822467402e-05, + "loss": 2.3374, + "theoretical_loss": 3.3509570786638228, + "tokens_seen": 2670518272 + }, + { + "epoch": 8.09, + "objective/train/docs_used": 2885454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.51086163520813, + "objective/train/theoretical_loss": 3.350950605545658, + "objective/train/tokens_used": 2670841312, + "theoretical_loss": 3.350950605545658, + "tokens_seen": 2670583808 + }, + { + "epoch": 8.09, + "learning_rate": 9.645937813440322e-05, + "loss": 2.402, + "theoretical_loss": 3.350950605545658, + "tokens_seen": 2670583808 + }, + { + "epoch": 8.09, + "learning_rate": 9.64493480441324e-05, + "loss": 2.5058, + "theoretical_loss": 3.350944132630818, + "tokens_seen": 2670649344 + }, + { + "epoch": 8.09, + "learning_rate": 9.643931795386158e-05, + "loss": 2.5359, + "theoretical_loss": 3.3509376599192913, + "tokens_seen": 2670714880 + }, + { + "epoch": 8.09, + "learning_rate": 9.642928786359077e-05, + "loss": 2.2122, + "theoretical_loss": 3.3509311874110668, + "tokens_seen": 2670780416 + }, + { + "epoch": 8.09, + "learning_rate": 9.641925777331996e-05, + "loss": 2.6046, + "theoretical_loss": 3.350924715106133, + "tokens_seen": 2670845952 + }, + { + "epoch": 9.0, + "learning_rate": 9.640922768304915e-05, + "loss": 3.2697, + "theoretical_loss": 3.350916726135057, + "tokens_seen": 2670926848 + }, + { + "epoch": 9.0, + "learning_rate": 9.639919759277833e-05, + "loss": 2.5274, + "theoretical_loss": 3.35091025428431, + "tokens_seen": 2670992384 + }, + { + "epoch": 9.0, + "learning_rate": 9.638916750250753e-05, + "loss": 2.5511, + "theoretical_loss": 3.3509037826368164, + "tokens_seen": 2671057920 + }, + { + "epoch": 9.0, + "learning_rate": 9.63791374122367e-05, + "loss": 2.3948, + "theoretical_loss": 3.3508973111925657, + "tokens_seen": 2671123456 + }, + { + "epoch": 9.0, + "learning_rate": 9.63691073219659e-05, + "loss": 2.6237, + "theoretical_loss": 3.3508908399515462, + "tokens_seen": 2671188992 + }, + { + "epoch": 9.0, + "learning_rate": 9.635907723169508e-05, + "loss": 2.6033, + "theoretical_loss": 3.3508843689137464, + "tokens_seen": 2671254528 + }, + { + "epoch": 9.0, + "learning_rate": 9.634904714142426e-05, + "loss": 2.5948, + "theoretical_loss": 3.3508778980791547, + "tokens_seen": 2671320064 + }, + { + "epoch": 9.0, + "learning_rate": 9.633901705115346e-05, + "loss": 2.4325, + "theoretical_loss": 3.3508714274477605, + "tokens_seen": 2671385600 + }, + { + "epoch": 9.0, + "learning_rate": 9.632898696088264e-05, + "loss": 2.5262, + "theoretical_loss": 3.350864957019552, + "tokens_seen": 2671451136 + }, + { + "epoch": 9.0, + "learning_rate": 9.631895687061183e-05, + "loss": 2.4693, + "theoretical_loss": 3.3508584867945177, + "tokens_seen": 2671516672 + }, + { + "epoch": 9.0, + "learning_rate": 9.630892678034102e-05, + "loss": 2.5475, + "theoretical_loss": 3.3508520167726465, + "tokens_seen": 2671582208 + }, + { + "epoch": 9.0, + "learning_rate": 9.629889669007021e-05, + "loss": 2.3843, + "theoretical_loss": 3.350845546953927, + "tokens_seen": 2671647744 + }, + { + "epoch": 9.0, + "learning_rate": 9.628886659979939e-05, + "loss": 2.3817, + "theoretical_loss": 3.350839077338348, + "tokens_seen": 2671713280 + }, + { + "epoch": 9.0, + "learning_rate": 9.62788365095286e-05, + "loss": 2.6182, + "theoretical_loss": 3.350832607925897, + "tokens_seen": 2671778816 + }, + { + "epoch": 9.0, + "learning_rate": 9.626880641925778e-05, + "loss": 2.5975, + "theoretical_loss": 3.350826138716564, + "tokens_seen": 2671844352 + }, + { + "epoch": 9.0, + "learning_rate": 9.625877632898698e-05, + "loss": 2.3864, + "theoretical_loss": 3.3508196697103374, + "tokens_seen": 2671909888 + }, + { + "epoch": 9.0, + "learning_rate": 9.624874623871616e-05, + "loss": 2.4696, + "theoretical_loss": 3.3508132009072056, + "tokens_seen": 2671975424 + }, + { + "epoch": 9.0, + "learning_rate": 9.623871614844534e-05, + "loss": 2.4954, + "theoretical_loss": 3.350806732307157, + "tokens_seen": 2672040960 + }, + { + "epoch": 9.0, + "learning_rate": 9.622868605817453e-05, + "loss": 2.6436, + "theoretical_loss": 3.3508002639101804, + "tokens_seen": 2672106496 + }, + { + "epoch": 9.0, + "learning_rate": 9.621865596790371e-05, + "loss": 2.5274, + "theoretical_loss": 3.350793795716265, + "tokens_seen": 2672172032 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 2952934, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1810548305511475, + "objective/train/theoretical_loss": 3.35078894470408, + "objective/train/tokens_used": 2692681184, + "theoretical_loss": 3.35078894470408, + "tokens_seen": 2672221184 + }, + { + "epoch": 9.0, + "learning_rate": 9.620862587763291e-05, + "loss": 2.4473, + "theoretical_loss": 3.350787327725399, + "tokens_seen": 2672237568 + }, + { + "epoch": 9.0, + "learning_rate": 9.619859578736209e-05, + "loss": 2.6232, + "theoretical_loss": 3.3507808599375704, + "tokens_seen": 2672303104 + }, + { + "epoch": 9.0, + "learning_rate": 9.618856569709128e-05, + "loss": 2.5224, + "theoretical_loss": 3.350774392352769, + "tokens_seen": 2672368640 + }, + { + "epoch": 9.0, + "learning_rate": 9.617853560682046e-05, + "loss": 2.5281, + "theoretical_loss": 3.3507679249709827, + "tokens_seen": 2672434176 + }, + { + "epoch": 9.0, + "learning_rate": 9.616850551654966e-05, + "loss": 2.3702, + "theoretical_loss": 3.350761457792201, + "tokens_seen": 2672499712 + }, + { + "epoch": 9.0, + "learning_rate": 9.615847542627884e-05, + "loss": 2.6086, + "theoretical_loss": 3.3507549908164114, + "tokens_seen": 2672565248 + }, + { + "epoch": 9.0, + "learning_rate": 9.614844533600802e-05, + "loss": 2.4655, + "theoretical_loss": 3.3507485240436026, + "tokens_seen": 2672630784 + }, + { + "epoch": 9.0, + "learning_rate": 9.613841524573722e-05, + "loss": 2.573, + "theoretical_loss": 3.3507420574737647, + "tokens_seen": 2672696320 + }, + { + "epoch": 9.0, + "learning_rate": 9.61283851554664e-05, + "loss": 2.5002, + "theoretical_loss": 3.3507355911068846, + "tokens_seen": 2672761856 + }, + { + "epoch": 9.0, + "learning_rate": 9.611835506519559e-05, + "loss": 2.4437, + "theoretical_loss": 3.3507291249429523, + "tokens_seen": 2672827392 + }, + { + "epoch": 9.0, + "learning_rate": 9.610832497492477e-05, + "loss": 2.6846, + "theoretical_loss": 3.350722658981956, + "tokens_seen": 2672892928 + }, + { + "epoch": 9.0, + "learning_rate": 9.609829488465397e-05, + "loss": 2.5539, + "theoretical_loss": 3.350716193223884, + "tokens_seen": 2672958464 + }, + { + "epoch": 9.0, + "learning_rate": 9.608826479438315e-05, + "loss": 2.5364, + "theoretical_loss": 3.350709727668725, + "tokens_seen": 2673024000 + }, + { + "epoch": 9.0, + "learning_rate": 9.607823470411234e-05, + "loss": 2.3411, + "theoretical_loss": 3.350703262316468, + "tokens_seen": 2673089536 + }, + { + "epoch": 9.0, + "learning_rate": 9.606820461384152e-05, + "loss": 2.5008, + "theoretical_loss": 3.350696797167102, + "tokens_seen": 2673155072 + }, + { + "epoch": 9.0, + "learning_rate": 9.605817452357072e-05, + "loss": 2.5384, + "theoretical_loss": 3.3506903322206143, + "tokens_seen": 2673220608 + }, + { + "epoch": 9.0, + "learning_rate": 9.60481444332999e-05, + "loss": 2.4822, + "theoretical_loss": 3.350683867476995, + "tokens_seen": 2673286144 + }, + { + "epoch": 9.0, + "learning_rate": 9.603811434302908e-05, + "loss": 2.5934, + "theoretical_loss": 3.350677402936232, + "tokens_seen": 2673351680 + }, + { + "epoch": 9.0, + "learning_rate": 9.602808425275828e-05, + "loss": 2.6287, + "theoretical_loss": 3.3506709385983147, + "tokens_seen": 2673417216 + }, + { + "epoch": 9.0, + "learning_rate": 9.601805416248746e-05, + "loss": 2.3684, + "theoretical_loss": 3.3506644744632306, + "tokens_seen": 2673482752 + }, + { + "epoch": 9.0, + "learning_rate": 9.600802407221665e-05, + "loss": 2.4646, + "theoretical_loss": 3.350658010530969, + "tokens_seen": 2673548288 + }, + { + "epoch": 9.0, + "learning_rate": 9.599799398194583e-05, + "loss": 2.6212, + "theoretical_loss": 3.3506515468015188, + "tokens_seen": 2673613824 + }, + { + "epoch": 9.0, + "learning_rate": 9.598796389167503e-05, + "loss": 2.5735, + "theoretical_loss": 3.350645083274869, + "tokens_seen": 2673679360 + }, + { + "epoch": 9.0, + "learning_rate": 9.597793380140421e-05, + "loss": 2.3819, + "theoretical_loss": 3.3506386199510065, + "tokens_seen": 2673744896 + }, + { + "epoch": 9.0, + "learning_rate": 9.59679037111334e-05, + "loss": 2.528, + "theoretical_loss": 3.350632156829922, + "tokens_seen": 2673810432 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 2958187, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3570384979248047, + "objective/train/theoretical_loss": 3.3506273096221744, + "objective/train/tokens_used": 2694319584, + "theoretical_loss": 3.3506273096221744, + "tokens_seen": 2673859584 + }, + { + "epoch": 9.0, + "learning_rate": 9.595787362086258e-05, + "loss": 2.5021, + "theoretical_loss": 3.350625693911603, + "tokens_seen": 2673875968 + }, + { + "epoch": 9.0, + "learning_rate": 9.594784353059177e-05, + "loss": 2.4415, + "theoretical_loss": 3.3506192311960388, + "tokens_seen": 2673941504 + }, + { + "epoch": 9.0, + "learning_rate": 9.593781344032096e-05, + "loss": 2.2603, + "theoretical_loss": 3.3506127686832174, + "tokens_seen": 2674007040 + }, + { + "epoch": 9.0, + "learning_rate": 9.592778335005014e-05, + "loss": 2.5301, + "theoretical_loss": 3.3506063063731277, + "tokens_seen": 2674072576 + }, + { + "epoch": 9.0, + "learning_rate": 9.591775325977934e-05, + "loss": 2.509, + "theoretical_loss": 3.350599844265759, + "tokens_seen": 2674138112 + }, + { + "epoch": 9.0, + "learning_rate": 9.590772316950853e-05, + "loss": 2.4128, + "theoretical_loss": 3.3505933823610996, + "tokens_seen": 2674203648 + }, + { + "epoch": 9.0, + "learning_rate": 9.589769307923773e-05, + "loss": 2.5745, + "theoretical_loss": 3.3505869206591377, + "tokens_seen": 2674269184 + }, + { + "epoch": 9.0, + "learning_rate": 9.588766298896691e-05, + "loss": 2.5313, + "theoretical_loss": 3.3505804591598625, + "tokens_seen": 2674334720 + }, + { + "epoch": 9.0, + "learning_rate": 9.58776328986961e-05, + "loss": 2.7122, + "theoretical_loss": 3.350573997863262, + "tokens_seen": 2674400256 + }, + { + "epoch": 9.0, + "learning_rate": 9.586760280842528e-05, + "loss": 2.4939, + "theoretical_loss": 3.3505675367693257, + "tokens_seen": 2674465792 + }, + { + "epoch": 9.0, + "learning_rate": 9.585757271815446e-05, + "loss": 2.4745, + "theoretical_loss": 3.3505610758780424, + "tokens_seen": 2674531328 + }, + { + "epoch": 9.0, + "learning_rate": 9.584754262788366e-05, + "loss": 2.4968, + "theoretical_loss": 3.3505546151893997, + "tokens_seen": 2674596864 + }, + { + "epoch": 9.0, + "learning_rate": 9.583751253761284e-05, + "loss": 2.6016, + "theoretical_loss": 3.3505481547033873, + "tokens_seen": 2674662400 + }, + { + "epoch": 9.0, + "learning_rate": 9.582748244734203e-05, + "loss": 2.6346, + "theoretical_loss": 3.3505416944199933, + "tokens_seen": 2674727936 + }, + { + "epoch": 9.0, + "learning_rate": 9.581745235707122e-05, + "loss": 2.6288, + "theoretical_loss": 3.350535234339207, + "tokens_seen": 2674793472 + }, + { + "epoch": 9.0, + "learning_rate": 9.580742226680041e-05, + "loss": 2.4451, + "theoretical_loss": 3.350528774461016, + "tokens_seen": 2674859008 + }, + { + "epoch": 9.0, + "learning_rate": 9.579739217652959e-05, + "loss": 2.5009, + "theoretical_loss": 3.35052231478541, + "tokens_seen": 2674924544 + }, + { + "epoch": 9.0, + "learning_rate": 9.578736208625879e-05, + "loss": 2.4951, + "theoretical_loss": 3.3505158553123775, + "tokens_seen": 2674990080 + }, + { + "epoch": 9.0, + "learning_rate": 9.577733199598797e-05, + "loss": 2.6419, + "theoretical_loss": 3.3505093960419066, + "tokens_seen": 2675055616 + }, + { + "epoch": 9.0, + "learning_rate": 9.576730190571716e-05, + "loss": 2.3852, + "theoretical_loss": 3.350502936973987, + "tokens_seen": 2675121152 + }, + { + "epoch": 9.0, + "learning_rate": 9.575727181544634e-05, + "loss": 2.4446, + "theoretical_loss": 3.350496478108606, + "tokens_seen": 2675186688 + }, + { + "epoch": 9.0, + "learning_rate": 9.574724172517552e-05, + "loss": 2.3525, + "theoretical_loss": 3.3504900194457536, + "tokens_seen": 2675252224 + }, + { + "epoch": 9.0, + "learning_rate": 9.573721163490472e-05, + "loss": 2.6546, + "theoretical_loss": 3.3504835609854178, + "tokens_seen": 2675317760 + }, + { + "epoch": 9.0, + "learning_rate": 9.57271815446339e-05, + "loss": 2.3509, + "theoretical_loss": 3.350477102727588, + "tokens_seen": 2675383296 + }, + { + "epoch": 9.0, + "learning_rate": 9.57171514543631e-05, + "loss": 2.5375, + "theoretical_loss": 3.3504706446722516, + "tokens_seen": 2675448832 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 2963060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6222784519195557, + "objective/train/theoretical_loss": 3.3504658012636295, + "objective/train/tokens_used": 2695957984, + "theoretical_loss": 3.3504658012636295, + "tokens_seen": 2675497984 + }, + { + "epoch": 9.0, + "learning_rate": 9.570712136409228e-05, + "loss": 2.4068, + "theoretical_loss": 3.3504641868193983, + "tokens_seen": 2675514368 + }, + { + "epoch": 9.0, + "learning_rate": 9.569709127382147e-05, + "loss": 2.3907, + "theoretical_loss": 3.3504577291690163, + "tokens_seen": 2675579904 + }, + { + "epoch": 9.0, + "learning_rate": 9.568706118355065e-05, + "loss": 2.5954, + "theoretical_loss": 3.350451271721095, + "tokens_seen": 2675645440 + }, + { + "epoch": 9.0, + "learning_rate": 9.567703109327985e-05, + "loss": 2.679, + "theoretical_loss": 3.350444814475622, + "tokens_seen": 2675710976 + }, + { + "epoch": 9.0, + "learning_rate": 9.566700100300903e-05, + "loss": 2.4443, + "theoretical_loss": 3.350438357432587, + "tokens_seen": 2675776512 + }, + { + "epoch": 9.0, + "learning_rate": 9.565697091273821e-05, + "loss": 2.5671, + "theoretical_loss": 3.3504319005919783, + "tokens_seen": 2675842048 + }, + { + "epoch": 9.0, + "learning_rate": 9.56469408224674e-05, + "loss": 2.6328, + "theoretical_loss": 3.3504254439537844, + "tokens_seen": 2675907584 + }, + { + "epoch": 9.0, + "learning_rate": 9.563691073219658e-05, + "loss": 2.4434, + "theoretical_loss": 3.3504189875179944, + "tokens_seen": 2675973120 + }, + { + "epoch": 9.0, + "learning_rate": 9.562688064192578e-05, + "loss": 2.7407, + "theoretical_loss": 3.3504125312845967, + "tokens_seen": 2676038656 + }, + { + "epoch": 9.0, + "learning_rate": 9.561685055165496e-05, + "loss": 2.6483, + "theoretical_loss": 3.35040607525358, + "tokens_seen": 2676104192 + }, + { + "epoch": 9.0, + "learning_rate": 9.560682046138415e-05, + "loss": 2.4679, + "theoretical_loss": 3.350399619424933, + "tokens_seen": 2676169728 + }, + { + "epoch": 9.0, + "learning_rate": 9.559679037111334e-05, + "loss": 2.4558, + "theoretical_loss": 3.350393163798645, + "tokens_seen": 2676235264 + }, + { + "epoch": 9.0, + "learning_rate": 9.558676028084253e-05, + "loss": 2.4955, + "theoretical_loss": 3.3503867083747036, + "tokens_seen": 2676300800 + }, + { + "epoch": 9.0, + "learning_rate": 9.557673019057171e-05, + "loss": 2.5075, + "theoretical_loss": 3.3503802531530984, + "tokens_seen": 2676366336 + }, + { + "epoch": 9.0, + "learning_rate": 9.556670010030089e-05, + "loss": 2.4722, + "theoretical_loss": 3.350373798133818, + "tokens_seen": 2676431872 + }, + { + "epoch": 9.0, + "learning_rate": 9.555667001003009e-05, + "loss": 2.6795, + "theoretical_loss": 3.350367343316851, + "tokens_seen": 2676497408 + }, + { + "epoch": 9.0, + "learning_rate": 9.554663991975927e-05, + "loss": 2.5542, + "theoretical_loss": 3.350360888702185, + "tokens_seen": 2676562944 + }, + { + "epoch": 9.0, + "learning_rate": 9.553660982948846e-05, + "loss": 2.7328, + "theoretical_loss": 3.3503544342898106, + "tokens_seen": 2676628480 + }, + { + "epoch": 9.0, + "learning_rate": 9.552657973921766e-05, + "loss": 2.5333, + "theoretical_loss": 3.3503479800797153, + "tokens_seen": 2676694016 + }, + { + "epoch": 9.0, + "learning_rate": 9.551654964894685e-05, + "loss": 2.4642, + "theoretical_loss": 3.3503415260718885, + "tokens_seen": 2676759552 + }, + { + "epoch": 9.0, + "learning_rate": 9.550651955867603e-05, + "loss": 2.5862, + "theoretical_loss": 3.350335072266318, + "tokens_seen": 2676825088 + }, + { + "epoch": 9.0, + "learning_rate": 9.549648946840523e-05, + "loss": 2.4841, + "theoretical_loss": 3.3503286186629935, + "tokens_seen": 2676890624 + }, + { + "epoch": 9.0, + "learning_rate": 9.548645937813441e-05, + "loss": 2.5033, + "theoretical_loss": 3.350322165261903, + "tokens_seen": 2676956160 + }, + { + "epoch": 9.0, + "learning_rate": 9.54764292878636e-05, + "loss": 2.4745, + "theoretical_loss": 3.350315712063036, + "tokens_seen": 2677021696 + }, + { + "epoch": 9.0, + "learning_rate": 9.546639919759278e-05, + "loss": 2.5057, + "theoretical_loss": 3.35030925906638, + "tokens_seen": 2677087232 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 2967967, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6446802616119385, + "objective/train/theoretical_loss": 3.3503044194515827, + "objective/train/tokens_used": 2697596384, + "theoretical_loss": 3.3503044194515827, + "tokens_seen": 2677136384 + }, + { + "epoch": 9.0, + "learning_rate": 9.545636910732197e-05, + "loss": 2.6568, + "theoretical_loss": 3.350302806271925, + "tokens_seen": 2677152768 + }, + { + "epoch": 9.0, + "learning_rate": 9.544633901705116e-05, + "loss": 2.5134, + "theoretical_loss": 3.3502963536796586, + "tokens_seen": 2677218304 + }, + { + "epoch": 9.0, + "learning_rate": 9.543630892678034e-05, + "loss": 2.4511, + "theoretical_loss": 3.35028990128957, + "tokens_seen": 2677283840 + }, + { + "epoch": 9.0, + "learning_rate": 9.542627883650954e-05, + "loss": 2.4135, + "theoretical_loss": 3.3502834491016484, + "tokens_seen": 2677349376 + }, + { + "epoch": 9.0, + "learning_rate": 9.541624874623872e-05, + "loss": 2.4939, + "theoretical_loss": 3.350276997115882, + "tokens_seen": 2677414912 + }, + { + "epoch": 9.0, + "learning_rate": 9.540621865596791e-05, + "loss": 2.5675, + "theoretical_loss": 3.3502705453322594, + "tokens_seen": 2677480448 + }, + { + "epoch": 9.0, + "learning_rate": 9.540621865596791e-05, + "loss": 2.5052, + "theoretical_loss": 3.350264093750769, + "tokens_seen": 2677545984 + }, + { + "epoch": 9.0, + "learning_rate": 9.53961885656971e-05, + "loss": 2.4828, + "theoretical_loss": 3.3502576423714006, + "tokens_seen": 2677611520 + }, + { + "epoch": 9.0, + "learning_rate": 9.538615847542629e-05, + "loss": 2.5791, + "theoretical_loss": 3.3502511911941424, + "tokens_seen": 2677677056 + }, + { + "epoch": 9.0, + "learning_rate": 9.537612838515547e-05, + "loss": 2.6151, + "theoretical_loss": 3.350244740218983, + "tokens_seen": 2677742592 + }, + { + "epoch": 9.0, + "learning_rate": 9.536609829488465e-05, + "loss": 2.548, + "theoretical_loss": 3.3502382894459113, + "tokens_seen": 2677808128 + }, + { + "epoch": 9.0, + "learning_rate": 9.535606820461385e-05, + "loss": 2.468, + "theoretical_loss": 3.3502318388749157, + "tokens_seen": 2677873664 + }, + { + "epoch": 9.0, + "learning_rate": 9.534603811434303e-05, + "loss": 2.5817, + "theoretical_loss": 3.350225388505985, + "tokens_seen": 2677939200 + }, + { + "epoch": 9.0, + "learning_rate": 9.533600802407222e-05, + "loss": 2.6462, + "theoretical_loss": 3.3502189383391086, + "tokens_seen": 2678004736 + }, + { + "epoch": 9.0, + "learning_rate": 9.53259779338014e-05, + "loss": 2.623, + "theoretical_loss": 3.3502124883742743, + "tokens_seen": 2678070272 + }, + { + "epoch": 9.0, + "learning_rate": 9.53159478435306e-05, + "loss": 2.594, + "theoretical_loss": 3.3502060386114714, + "tokens_seen": 2678135808 + }, + { + "epoch": 9.0, + "learning_rate": 9.530591775325978e-05, + "loss": 2.6522, + "theoretical_loss": 3.350199589050688, + "tokens_seen": 2678201344 + }, + { + "epoch": 9.0, + "learning_rate": 9.529588766298897e-05, + "loss": 2.4872, + "theoretical_loss": 3.3501931396919136, + "tokens_seen": 2678266880 + }, + { + "epoch": 9.0, + "learning_rate": 9.528585757271815e-05, + "loss": 2.6397, + "theoretical_loss": 3.3501866905351365, + "tokens_seen": 2678332416 + }, + { + "epoch": 9.0, + "learning_rate": 9.527582748244733e-05, + "loss": 2.4135, + "theoretical_loss": 3.3501802415803454, + "tokens_seen": 2678397952 + }, + { + "epoch": 9.0, + "learning_rate": 9.526579739217653e-05, + "loss": 2.3672, + "theoretical_loss": 3.3501737928275297, + "tokens_seen": 2678463488 + }, + { + "epoch": 9.0, + "learning_rate": 9.525576730190571e-05, + "loss": 2.5778, + "theoretical_loss": 3.3501673442766773, + "tokens_seen": 2678529024 + }, + { + "epoch": 9.0, + "learning_rate": 9.52457372116349e-05, + "loss": 2.5197, + "theoretical_loss": 3.350160895927777, + "tokens_seen": 2678594560 + }, + { + "epoch": 9.0, + "learning_rate": 9.523570712136409e-05, + "loss": 2.3305, + "theoretical_loss": 3.350154447780818, + "tokens_seen": 2678660096 + }, + { + "epoch": 9.0, + "learning_rate": 9.522567703109328e-05, + "loss": 2.3792, + "theoretical_loss": 3.350147999835789, + "tokens_seen": 2678725632 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 2973021, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.41925048828125, + "objective/train/theoretical_loss": 3.3501431640095265, + "objective/train/tokens_used": 2699234784, + "theoretical_loss": 3.3501431640095265, + "tokens_seen": 2678774784 + }, + { + "epoch": 9.0, + "learning_rate": 9.521564694082246e-05, + "loss": 2.3582, + "theoretical_loss": 3.350141552092678, + "tokens_seen": 2678791168 + }, + { + "epoch": 9.0, + "learning_rate": 9.520561685055166e-05, + "loss": 2.4846, + "theoretical_loss": 3.350135104551475, + "tokens_seen": 2678856704 + }, + { + "epoch": 9.0, + "learning_rate": 9.519558676028084e-05, + "loss": 2.7031, + "theoretical_loss": 3.350128657212167, + "tokens_seen": 2678922240 + }, + { + "epoch": 9.0, + "learning_rate": 9.518555667001003e-05, + "loss": 2.4809, + "theoretical_loss": 3.350122210074745, + "tokens_seen": 2678987776 + }, + { + "epoch": 9.0, + "learning_rate": 9.517552657973921e-05, + "loss": 2.462, + "theoretical_loss": 3.3501157631391956, + "tokens_seen": 2679053312 + }, + { + "epoch": 9.0, + "learning_rate": 9.51654964894684e-05, + "loss": 2.4773, + "theoretical_loss": 3.3501093164055087, + "tokens_seen": 2679118848 + }, + { + "epoch": 9.0, + "learning_rate": 9.51554663991976e-05, + "loss": 2.3851, + "theoretical_loss": 3.350102869873673, + "tokens_seen": 2679184384 + }, + { + "epoch": 9.0, + "learning_rate": 9.514543630892678e-05, + "loss": 2.6136, + "theoretical_loss": 3.3500964235436763, + "tokens_seen": 2679249920 + }, + { + "epoch": 9.0, + "learning_rate": 9.513540621865598e-05, + "loss": 2.5972, + "theoretical_loss": 3.3500899774155086, + "tokens_seen": 2679315456 + }, + { + "epoch": 9.0, + "learning_rate": 9.512537612838516e-05, + "loss": 2.4074, + "theoretical_loss": 3.350083531489158, + "tokens_seen": 2679380992 + }, + { + "epoch": 9.0, + "learning_rate": 9.511534603811435e-05, + "loss": 2.5501, + "theoretical_loss": 3.3500770857646134, + "tokens_seen": 2679446528 + }, + { + "epoch": 9.0, + "learning_rate": 9.510531594784354e-05, + "loss": 2.5593, + "theoretical_loss": 3.3500706402418636, + "tokens_seen": 2679512064 + }, + { + "epoch": 9.0, + "learning_rate": 9.509528585757273e-05, + "loss": 2.7475, + "theoretical_loss": 3.350064194920897, + "tokens_seen": 2679577600 + }, + { + "epoch": 9.0, + "learning_rate": 9.508525576730191e-05, + "loss": 2.4053, + "theoretical_loss": 3.350057749801703, + "tokens_seen": 2679643136 + }, + { + "epoch": 9.0, + "learning_rate": 9.507522567703109e-05, + "loss": 2.4608, + "theoretical_loss": 3.3500513048842695, + "tokens_seen": 2679708672 + }, + { + "epoch": 9.0, + "learning_rate": 9.506519558676029e-05, + "loss": 2.5006, + "theoretical_loss": 3.3500448601685857, + "tokens_seen": 2679774208 + }, + { + "epoch": 9.0, + "learning_rate": 9.505516549648947e-05, + "loss": 2.6238, + "theoretical_loss": 3.3500384156546406, + "tokens_seen": 2679839744 + }, + { + "epoch": 9.0, + "learning_rate": 9.504513540621866e-05, + "loss": 2.5365, + "theoretical_loss": 3.350031971342423, + "tokens_seen": 2679905280 + }, + { + "epoch": 9.0, + "learning_rate": 9.503510531594784e-05, + "loss": 2.4442, + "theoretical_loss": 3.350025527231921, + "tokens_seen": 2679970816 + }, + { + "epoch": 9.0, + "learning_rate": 9.502507522567704e-05, + "loss": 2.5511, + "theoretical_loss": 3.350019083323123, + "tokens_seen": 2680036352 + }, + { + "epoch": 9.0, + "learning_rate": 9.501504513540622e-05, + "loss": 2.3879, + "theoretical_loss": 3.3500126396160192, + "tokens_seen": 2680101888 + }, + { + "epoch": 9.0, + "learning_rate": 9.500501504513541e-05, + "loss": 2.4095, + "theoretical_loss": 3.3500061961105976, + "tokens_seen": 2680167424 + }, + { + "epoch": 9.0, + "learning_rate": 9.49949849548646e-05, + "loss": 2.4231, + "theoretical_loss": 3.349999752806847, + "tokens_seen": 2680232960 + }, + { + "epoch": 9.0, + "learning_rate": 9.498495486459379e-05, + "loss": 2.4259, + "theoretical_loss": 3.3499933097047556, + "tokens_seen": 2680298496 + }, + { + "epoch": 9.0, + "learning_rate": 9.497492477432297e-05, + "loss": 2.5132, + "theoretical_loss": 3.349986866804313, + "tokens_seen": 2680364032 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 2978192, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8650176525115967, + "objective/train/theoretical_loss": 3.3499820347613065, + "objective/train/tokens_used": 2700873184, + "theoretical_loss": 3.3499820347613065, + "tokens_seen": 2680413184 + }, + { + "epoch": 9.0, + "learning_rate": 9.496489468405215e-05, + "loss": 2.65, + "theoretical_loss": 3.349980424105508, + "tokens_seen": 2680429568 + }, + { + "epoch": 9.0, + "learning_rate": 9.495486459378135e-05, + "loss": 2.3555, + "theoretical_loss": 3.3499739816083287, + "tokens_seen": 2680495104 + }, + { + "epoch": 9.0, + "learning_rate": 9.494483450351053e-05, + "loss": 2.5278, + "theoretical_loss": 3.349967539312764, + "tokens_seen": 2680560640 + }, + { + "epoch": 9.0, + "learning_rate": 9.493480441323972e-05, + "loss": 2.3119, + "theoretical_loss": 3.349961097218803, + "tokens_seen": 2680626176 + }, + { + "epoch": 9.0, + "learning_rate": 9.49247743229689e-05, + "loss": 2.3004, + "theoretical_loss": 3.3499546553264343, + "tokens_seen": 2680691712 + }, + { + "epoch": 9.0, + "learning_rate": 9.49147442326981e-05, + "loss": 2.4723, + "theoretical_loss": 3.349948213635647, + "tokens_seen": 2680757248 + }, + { + "epoch": 9.0, + "learning_rate": 9.490471414242728e-05, + "loss": 2.403, + "theoretical_loss": 3.349941772146429, + "tokens_seen": 2680822784 + }, + { + "epoch": 9.0, + "learning_rate": 9.489468405215647e-05, + "loss": 2.5018, + "theoretical_loss": 3.3499353308587696, + "tokens_seen": 2680888320 + }, + { + "epoch": 9.0, + "learning_rate": 9.488465396188566e-05, + "loss": 2.6169, + "theoretical_loss": 3.3499288897726576, + "tokens_seen": 2680953856 + }, + { + "epoch": 9.0, + "learning_rate": 9.487462387161484e-05, + "loss": 2.5015, + "theoretical_loss": 3.3499224488880817, + "tokens_seen": 2681019392 + }, + { + "epoch": 9.0, + "learning_rate": 9.486459378134403e-05, + "loss": 2.4857, + "theoretical_loss": 3.3499160082050308, + "tokens_seen": 2681084928 + }, + { + "epoch": 9.0, + "learning_rate": 9.485456369107321e-05, + "loss": 2.476, + "theoretical_loss": 3.349909567723494, + "tokens_seen": 2681150464 + }, + { + "epoch": 9.0, + "learning_rate": 9.484453360080241e-05, + "loss": 2.4542, + "theoretical_loss": 3.3499031274434588, + "tokens_seen": 2681216000 + }, + { + "epoch": 9.0, + "learning_rate": 9.483450351053159e-05, + "loss": 2.6962, + "theoretical_loss": 3.349896687364915, + "tokens_seen": 2681281536 + }, + { + "epoch": 9.0, + "learning_rate": 9.482447342026078e-05, + "loss": 2.4949, + "theoretical_loss": 3.3498902474878514, + "tokens_seen": 2681347072 + }, + { + "epoch": 9.0, + "learning_rate": 9.481444332998996e-05, + "loss": 2.652, + "theoretical_loss": 3.3498838078122564, + "tokens_seen": 2681412608 + }, + { + "epoch": 9.0, + "learning_rate": 9.480441323971916e-05, + "loss": 2.5891, + "theoretical_loss": 3.349877368338119, + "tokens_seen": 2681478144 + }, + { + "epoch": 9.0, + "learning_rate": 9.479438314944834e-05, + "loss": 2.4648, + "theoretical_loss": 3.3498709290654274, + "tokens_seen": 2681543680 + }, + { + "epoch": 9.0, + "learning_rate": 9.478435305917752e-05, + "loss": 2.4604, + "theoretical_loss": 3.349864489994171, + "tokens_seen": 2681609216 + }, + { + "epoch": 9.0, + "learning_rate": 9.477432296890673e-05, + "loss": 2.6368, + "theoretical_loss": 3.3498580511243388, + "tokens_seen": 2681674752 + }, + { + "epoch": 9.0, + "learning_rate": 9.476429287863591e-05, + "loss": 2.5602, + "theoretical_loss": 3.349851612455919, + "tokens_seen": 2681740288 + }, + { + "epoch": 9.0, + "learning_rate": 9.47542627883651e-05, + "loss": 2.446, + "theoretical_loss": 3.349845173988901, + "tokens_seen": 2681805824 + }, + { + "epoch": 9.0, + "learning_rate": 9.474423269809429e-05, + "loss": 2.4019, + "theoretical_loss": 3.3498387357232726, + "tokens_seen": 2681871360 + }, + { + "epoch": 9.0, + "learning_rate": 9.473420260782348e-05, + "loss": 2.4706, + "theoretical_loss": 3.3498322976590234, + "tokens_seen": 2681936896 + }, + { + "epoch": 9.0, + "learning_rate": 9.472417251755266e-05, + "loss": 2.3777, + "theoretical_loss": 3.349825859796142, + "tokens_seen": 2682002432 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 2983240, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.269711971282959, + "objective/train/theoretical_loss": 3.3498210315311217, + "objective/train/tokens_used": 2702511584, + "theoretical_loss": 3.3498210315311217, + "tokens_seen": 2682051584 + }, + { + "epoch": 9.0, + "learning_rate": 9.471414242728186e-05, + "loss": 2.3991, + "theoretical_loss": 3.349819422134617, + "tokens_seen": 2682067968 + }, + { + "epoch": 9.0, + "learning_rate": 9.470411233701104e-05, + "loss": 2.5579, + "theoretical_loss": 3.349812984674437, + "tokens_seen": 2682133504 + }, + { + "epoch": 9.0, + "learning_rate": 9.469408224674023e-05, + "loss": 2.2484, + "theoretical_loss": 3.3498065474155916, + "tokens_seen": 2682199040 + }, + { + "epoch": 9.0, + "learning_rate": 9.468405215646941e-05, + "loss": 2.4358, + "theoretical_loss": 3.3498001103580686, + "tokens_seen": 2682264576 + }, + { + "epoch": 9.0, + "learning_rate": 9.46740220661986e-05, + "loss": 2.4445, + "theoretical_loss": 3.3497936735018574, + "tokens_seen": 2682330112 + }, + { + "epoch": 9.0, + "learning_rate": 9.466399197592779e-05, + "loss": 2.3598, + "theoretical_loss": 3.3497872368469466, + "tokens_seen": 2682395648 + }, + { + "epoch": 9.0, + "learning_rate": 9.465396188565697e-05, + "loss": 2.5353, + "theoretical_loss": 3.349780800393325, + "tokens_seen": 2682461184 + }, + { + "epoch": 9.0, + "learning_rate": 9.464393179538617e-05, + "loss": 2.4787, + "theoretical_loss": 3.349774364140982, + "tokens_seen": 2682526720 + }, + { + "epoch": 9.0, + "learning_rate": 9.463390170511535e-05, + "loss": 2.5249, + "theoretical_loss": 3.349767928089905, + "tokens_seen": 2682592256 + }, + { + "epoch": 9.0, + "learning_rate": 9.462387161484454e-05, + "loss": 2.4804, + "theoretical_loss": 3.349761492240084, + "tokens_seen": 2682657792 + }, + { + "epoch": 9.0, + "learning_rate": 9.461384152457372e-05, + "loss": 2.4502, + "theoretical_loss": 3.349755056591507, + "tokens_seen": 2682723328 + }, + { + "epoch": 9.0, + "learning_rate": 9.460381143430292e-05, + "loss": 2.5087, + "theoretical_loss": 3.3497486211441636, + "tokens_seen": 2682788864 + }, + { + "epoch": 9.0, + "learning_rate": 9.45937813440321e-05, + "loss": 2.4761, + "theoretical_loss": 3.349742185898042, + "tokens_seen": 2682854400 + }, + { + "epoch": 9.0, + "learning_rate": 9.458375125376128e-05, + "loss": 2.5645, + "theoretical_loss": 3.3497357508531307, + "tokens_seen": 2682919936 + }, + { + "epoch": 9.0, + "learning_rate": 9.457372116349047e-05, + "loss": 2.6554, + "theoretical_loss": 3.3497293160094195, + "tokens_seen": 2682985472 + }, + { + "epoch": 9.0, + "learning_rate": 9.456369107321966e-05, + "loss": 2.4141, + "theoretical_loss": 3.349722881366896, + "tokens_seen": 2683051008 + }, + { + "epoch": 9.0, + "learning_rate": 9.455366098294885e-05, + "loss": 2.4854, + "theoretical_loss": 3.3497164469255503, + "tokens_seen": 2683116544 + }, + { + "epoch": 9.0, + "learning_rate": 9.454363089267803e-05, + "loss": 2.4657, + "theoretical_loss": 3.34971001268537, + "tokens_seen": 2683182080 + }, + { + "epoch": 9.0, + "learning_rate": 9.453360080240723e-05, + "loss": 2.5172, + "theoretical_loss": 3.349703578646345, + "tokens_seen": 2683247616 + }, + { + "epoch": 9.0, + "learning_rate": 9.45235707121364e-05, + "loss": 2.3922, + "theoretical_loss": 3.349697144808463, + "tokens_seen": 2683313152 + }, + { + "epoch": 9.0, + "learning_rate": 9.45135406218656e-05, + "loss": 2.616, + "theoretical_loss": 3.349690711171714, + "tokens_seen": 2683378688 + }, + { + "epoch": 9.0, + "learning_rate": 9.450351053159478e-05, + "loss": 2.5826, + "theoretical_loss": 3.3496842777360856, + "tokens_seen": 2683444224 + }, + { + "epoch": 9.0, + "learning_rate": 9.449348044132396e-05, + "loss": 2.4373, + "theoretical_loss": 3.349677844501567, + "tokens_seen": 2683509760 + }, + { + "epoch": 9.0, + "learning_rate": 9.448345035105316e-05, + "loss": 2.3913, + "theoretical_loss": 3.3496714114681474, + "tokens_seen": 2683575296 + }, + { + "epoch": 9.0, + "learning_rate": 9.447342026078234e-05, + "loss": 2.5772, + "theoretical_loss": 3.349664978635815, + "tokens_seen": 2683640832 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 2987970, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.242199659347534, + "objective/train/theoretical_loss": 3.349660154143523, + "objective/train/tokens_used": 2704149984, + "theoretical_loss": 3.349660154143523, + "tokens_seen": 2683689984 + }, + { + "epoch": 9.0, + "learning_rate": 9.446339017051153e-05, + "loss": 2.6329, + "theoretical_loss": 3.349658546004559, + "tokens_seen": 2683706368 + }, + { + "epoch": 9.0, + "learning_rate": 9.445336008024072e-05, + "loss": 2.6459, + "theoretical_loss": 3.3496521135743684, + "tokens_seen": 2683771904 + }, + { + "epoch": 9.0, + "learning_rate": 9.444332998996991e-05, + "loss": 2.4619, + "theoretical_loss": 3.349645681345232, + "tokens_seen": 2683837440 + }, + { + "epoch": 9.0, + "learning_rate": 9.443329989969909e-05, + "loss": 2.6722, + "theoretical_loss": 3.349639249317138, + "tokens_seen": 2683902976 + }, + { + "epoch": 9.0, + "learning_rate": 9.442326980942829e-05, + "loss": 2.2967, + "theoretical_loss": 3.3496328174900754, + "tokens_seen": 2683968512 + }, + { + "epoch": 9.0, + "learning_rate": 9.441323971915747e-05, + "loss": 2.5442, + "theoretical_loss": 3.3496263858640334, + "tokens_seen": 2684034048 + }, + { + "epoch": 9.0, + "learning_rate": 9.440320962888667e-05, + "loss": 2.5758, + "theoretical_loss": 3.3496199544390004, + "tokens_seen": 2684099584 + }, + { + "epoch": 9.0, + "learning_rate": 9.439317953861586e-05, + "loss": 2.6928, + "theoretical_loss": 3.3496135232149653, + "tokens_seen": 2684165120 + }, + { + "epoch": 9.0, + "learning_rate": 9.438314944834504e-05, + "loss": 2.4551, + "theoretical_loss": 3.3496070921919174, + "tokens_seen": 2684230656 + }, + { + "epoch": 9.0, + "learning_rate": 9.437311935807423e-05, + "loss": 2.4743, + "theoretical_loss": 3.349600661369845, + "tokens_seen": 2684296192 + }, + { + "epoch": 9.0, + "learning_rate": 9.436308926780341e-05, + "loss": 2.5219, + "theoretical_loss": 3.3495942307487367, + "tokens_seen": 2684361728 + }, + { + "epoch": 9.0, + "learning_rate": 9.435305917753261e-05, + "loss": 2.4942, + "theoretical_loss": 3.3495878003285817, + "tokens_seen": 2684427264 + }, + { + "epoch": 9.0, + "learning_rate": 9.434302908726179e-05, + "loss": 2.6053, + "theoretical_loss": 3.349581370109369, + "tokens_seen": 2684492800 + }, + { + "epoch": 9.0, + "learning_rate": 9.433299899699098e-05, + "loss": 2.6264, + "theoretical_loss": 3.3495749400910872, + "tokens_seen": 2684558336 + }, + { + "epoch": 9.0, + "learning_rate": 9.432296890672016e-05, + "loss": 2.4257, + "theoretical_loss": 3.349568510273725, + "tokens_seen": 2684623872 + }, + { + "epoch": 9.0, + "learning_rate": 9.431293881644936e-05, + "loss": 2.6415, + "theoretical_loss": 3.3495620806572712, + "tokens_seen": 2684689408 + }, + { + "epoch": 9.0, + "learning_rate": 9.430290872617854e-05, + "loss": 2.4957, + "theoretical_loss": 3.3495556512417144, + "tokens_seen": 2684754944 + }, + { + "epoch": 9.0, + "learning_rate": 9.429287863590772e-05, + "loss": 2.5744, + "theoretical_loss": 3.3495492220270444, + "tokens_seen": 2684820480 + }, + { + "epoch": 9.0, + "learning_rate": 9.428284854563692e-05, + "loss": 2.5631, + "theoretical_loss": 3.349542793013249, + "tokens_seen": 2684886016 + }, + { + "epoch": 9.0, + "learning_rate": 9.42728184553661e-05, + "loss": 2.4396, + "theoretical_loss": 3.3495363642003175, + "tokens_seen": 2684951552 + }, + { + "epoch": 9.0, + "learning_rate": 9.426278836509529e-05, + "loss": 2.7066, + "theoretical_loss": 3.3495299355882384, + "tokens_seen": 2685017088 + }, + { + "epoch": 9.0, + "learning_rate": 9.425275827482447e-05, + "loss": 2.5129, + "theoretical_loss": 3.349523507177001, + "tokens_seen": 2685082624 + }, + { + "epoch": 9.0, + "learning_rate": 9.424272818455367e-05, + "loss": 2.3729, + "theoretical_loss": 3.3495170789665933, + "tokens_seen": 2685148160 + }, + { + "epoch": 9.0, + "learning_rate": 9.423269809428285e-05, + "loss": 2.6764, + "theoretical_loss": 3.349510650957005, + "tokens_seen": 2685213696 + }, + { + "epoch": 9.0, + "learning_rate": 9.422266800401204e-05, + "loss": 2.5039, + "theoretical_loss": 3.349504223148225, + "tokens_seen": 2685279232 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 2993087, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.266566038131714, + "objective/train/theoretical_loss": 3.349499402423413, + "objective/train/tokens_used": 2705788384, + "theoretical_loss": 3.349499402423413, + "tokens_seen": 2685328384 + }, + { + "epoch": 9.0, + "learning_rate": 9.421263791374122e-05, + "loss": 2.3123, + "theoretical_loss": 3.349497795540241, + "tokens_seen": 2685344768 + }, + { + "epoch": 9.0, + "learning_rate": 9.420260782347042e-05, + "loss": 2.4644, + "theoretical_loss": 3.3494913681330427, + "tokens_seen": 2685410304 + }, + { + "epoch": 9.0, + "learning_rate": 9.41925777331996e-05, + "loss": 2.6636, + "theoretical_loss": 3.349484940926619, + "tokens_seen": 2685475840 + }, + { + "epoch": 9.0, + "learning_rate": 9.418254764292878e-05, + "loss": 2.4041, + "theoretical_loss": 3.3494785139209586, + "tokens_seen": 2685541376 + }, + { + "epoch": 9.0, + "learning_rate": 9.417251755265798e-05, + "loss": 2.5985, + "theoretical_loss": 3.3494720871160504, + "tokens_seen": 2685606912 + }, + { + "epoch": 9.0, + "learning_rate": 9.416248746238716e-05, + "loss": 2.6808, + "theoretical_loss": 3.3494656605118824, + "tokens_seen": 2685672448 + }, + { + "epoch": 9.0, + "learning_rate": 9.415245737211635e-05, + "loss": 2.4907, + "theoretical_loss": 3.3494592341084446, + "tokens_seen": 2685737984 + }, + { + "epoch": 9.0, + "learning_rate": 9.414242728184553e-05, + "loss": 2.5427, + "theoretical_loss": 3.3494528079057253, + "tokens_seen": 2685803520 + }, + { + "epoch": 9.0, + "learning_rate": 9.413239719157473e-05, + "loss": 2.6424, + "theoretical_loss": 3.3494463819037135, + "tokens_seen": 2685869056 + }, + { + "epoch": 9.0, + "learning_rate": 9.412236710130391e-05, + "loss": 2.4909, + "theoretical_loss": 3.3494399561023975, + "tokens_seen": 2685934592 + }, + { + "epoch": 9.0, + "learning_rate": 9.41123370110331e-05, + "loss": 2.5427, + "theoretical_loss": 3.3494335305017664, + "tokens_seen": 2686000128 + }, + { + "epoch": 9.0, + "learning_rate": 9.410230692076228e-05, + "loss": 2.5379, + "theoretical_loss": 3.3494271051018094, + "tokens_seen": 2686065664 + }, + { + "epoch": 9.0, + "learning_rate": 9.409227683049147e-05, + "loss": 2.5564, + "theoretical_loss": 3.349420679902515, + "tokens_seen": 2686131200 + }, + { + "epoch": 9.0, + "learning_rate": 9.408224674022066e-05, + "loss": 2.4453, + "theoretical_loss": 3.3494142549038726, + "tokens_seen": 2686196736 + }, + { + "epoch": 9.0, + "learning_rate": 9.407221664994984e-05, + "loss": 2.416, + "theoretical_loss": 3.34940783010587, + "tokens_seen": 2686262272 + }, + { + "epoch": 9.0, + "learning_rate": 9.406218655967904e-05, + "loss": 2.5312, + "theoretical_loss": 3.3494014055084973, + "tokens_seen": 2686327808 + }, + { + "epoch": 9.0, + "learning_rate": 9.405215646940822e-05, + "loss": 2.5955, + "theoretical_loss": 3.349394981111742, + "tokens_seen": 2686393344 + }, + { + "epoch": 9.0, + "learning_rate": 9.404212637913741e-05, + "loss": 2.3404, + "theoretical_loss": 3.349388556915594, + "tokens_seen": 2686458880 + }, + { + "epoch": 9.0, + "learning_rate": 9.403209628886659e-05, + "loss": 2.5961, + "theoretical_loss": 3.3493821329200415, + "tokens_seen": 2686524416 + }, + { + "epoch": 9.0, + "learning_rate": 9.40220661985958e-05, + "loss": 2.4571, + "theoretical_loss": 3.3493757091250735, + "tokens_seen": 2686589952 + }, + { + "epoch": 9.0, + "learning_rate": 9.401203610832498e-05, + "loss": 2.4288, + "theoretical_loss": 3.3493692855306794, + "tokens_seen": 2686655488 + }, + { + "epoch": 9.0, + "learning_rate": 9.400200601805416e-05, + "loss": 2.4724, + "theoretical_loss": 3.349362862136847, + "tokens_seen": 2686721024 + }, + { + "epoch": 9.0, + "learning_rate": 9.399197592778336e-05, + "loss": 2.5115, + "theoretical_loss": 3.349356438943566, + "tokens_seen": 2686786560 + }, + { + "epoch": 9.0, + "learning_rate": 9.398194583751254e-05, + "loss": 2.6144, + "theoretical_loss": 3.349350015950825, + "tokens_seen": 2686852096 + }, + { + "epoch": 9.0, + "learning_rate": 9.397191574724173e-05, + "loss": 2.5702, + "theoretical_loss": 3.349343593158613, + "tokens_seen": 2686917632 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 2997946, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7653236389160156, + "objective/train/theoretical_loss": 3.349338776196044, + "objective/train/tokens_used": 2707426784, + "theoretical_loss": 3.349338776196044, + "tokens_seen": 2686966784 + }, + { + "epoch": 9.0, + "learning_rate": 9.396188565697092e-05, + "loss": 2.5965, + "theoretical_loss": 3.349337170566918, + "tokens_seen": 2686983168 + }, + { + "epoch": 9.0, + "learning_rate": 9.395185556670011e-05, + "loss": 2.3846, + "theoretical_loss": 3.3493307481757304, + "tokens_seen": 2687048704 + }, + { + "epoch": 9.0, + "learning_rate": 9.394182547642929e-05, + "loss": 2.4552, + "theoretical_loss": 3.3493243259850374, + "tokens_seen": 2687114240 + }, + { + "epoch": 9.0, + "learning_rate": 9.393179538615849e-05, + "loss": 2.4558, + "theoretical_loss": 3.3493179039948293, + "tokens_seen": 2687179776 + }, + { + "epoch": 9.0, + "learning_rate": 9.392176529588767e-05, + "loss": 2.5193, + "theoretical_loss": 3.349311482205094, + "tokens_seen": 2687245312 + }, + { + "epoch": 9.0, + "learning_rate": 9.391173520561686e-05, + "loss": 2.3317, + "theoretical_loss": 3.34930506061582, + "tokens_seen": 2687310848 + }, + { + "epoch": 9.01, + "learning_rate": 9.390170511534604e-05, + "loss": 2.4767, + "theoretical_loss": 3.3492986392269977, + "tokens_seen": 2687376384 + }, + { + "epoch": 9.01, + "learning_rate": 9.389167502507522e-05, + "loss": 2.4391, + "theoretical_loss": 3.349292218038615, + "tokens_seen": 2687441920 + }, + { + "epoch": 9.01, + "learning_rate": 9.388164493480442e-05, + "loss": 2.5151, + "theoretical_loss": 3.34928579705066, + "tokens_seen": 2687507456 + }, + { + "epoch": 9.01, + "learning_rate": 9.38716148445336e-05, + "loss": 2.639, + "theoretical_loss": 3.349279376263123, + "tokens_seen": 2687572992 + }, + { + "epoch": 9.01, + "learning_rate": 9.38615847542628e-05, + "loss": 2.4654, + "theoretical_loss": 3.3492729556759917, + "tokens_seen": 2687638528 + }, + { + "epoch": 9.01, + "learning_rate": 9.385155466399198e-05, + "loss": 2.5405, + "theoretical_loss": 3.349266535289256, + "tokens_seen": 2687704064 + }, + { + "epoch": 9.01, + "learning_rate": 9.384152457372117e-05, + "loss": 2.4626, + "theoretical_loss": 3.349260115102904, + "tokens_seen": 2687769600 + }, + { + "epoch": 9.01, + "learning_rate": 9.383149448345035e-05, + "loss": 2.622, + "theoretical_loss": 3.3492536951169245, + "tokens_seen": 2687835136 + }, + { + "epoch": 9.01, + "learning_rate": 9.382146439317955e-05, + "loss": 2.5797, + "theoretical_loss": 3.349247275331307, + "tokens_seen": 2687900672 + }, + { + "epoch": 9.01, + "learning_rate": 9.381143430290873e-05, + "loss": 2.4543, + "theoretical_loss": 3.34924085574604, + "tokens_seen": 2687966208 + }, + { + "epoch": 9.01, + "learning_rate": 9.380140421263791e-05, + "loss": 2.6285, + "theoretical_loss": 3.349234436361112, + "tokens_seen": 2688031744 + }, + { + "epoch": 9.01, + "learning_rate": 9.37913741223671e-05, + "loss": 2.7167, + "theoretical_loss": 3.3492280171765128, + "tokens_seen": 2688097280 + }, + { + "epoch": 9.01, + "learning_rate": 9.378134403209628e-05, + "loss": 2.514, + "theoretical_loss": 3.34922159819223, + "tokens_seen": 2688162816 + }, + { + "epoch": 9.01, + "learning_rate": 9.377131394182548e-05, + "loss": 2.6402, + "theoretical_loss": 3.3492151794082536, + "tokens_seen": 2688228352 + }, + { + "epoch": 9.01, + "learning_rate": 9.376128385155466e-05, + "loss": 2.4815, + "theoretical_loss": 3.349208760824572, + "tokens_seen": 2688293888 + }, + { + "epoch": 9.01, + "learning_rate": 9.375125376128385e-05, + "loss": 2.6332, + "theoretical_loss": 3.349202342441174, + "tokens_seen": 2688359424 + }, + { + "epoch": 9.01, + "learning_rate": 9.374122367101304e-05, + "loss": 2.5042, + "theoretical_loss": 3.3491959242580487, + "tokens_seen": 2688424960 + }, + { + "epoch": 9.01, + "learning_rate": 9.373119358074223e-05, + "loss": 2.4076, + "theoretical_loss": 3.3491895062751844, + "tokens_seen": 2688490496 + }, + { + "epoch": 9.01, + "learning_rate": 9.372116349047141e-05, + "loss": 2.5747, + "theoretical_loss": 3.349183088492571, + "tokens_seen": 2688556032 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3000391, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.376398801803589, + "objective/train/theoretical_loss": 3.3491782752870183, + "objective/train/tokens_used": 2709065184, + "theoretical_loss": 3.3491782752870183, + "tokens_seen": 2688605184 + }, + { + "epoch": 9.01, + "learning_rate": 9.371113340020059e-05, + "loss": 2.5579, + "theoretical_loss": 3.3491766709101967, + "tokens_seen": 2688621568 + }, + { + "epoch": 9.01, + "learning_rate": 9.370110330992979e-05, + "loss": 2.5595, + "theoretical_loss": 3.34917025352805, + "tokens_seen": 2688687104 + }, + { + "epoch": 9.01, + "learning_rate": 9.369107321965897e-05, + "loss": 2.5054, + "theoretical_loss": 3.3491638363461202, + "tokens_seen": 2688752640 + }, + { + "epoch": 9.01, + "learning_rate": 9.368104312938816e-05, + "loss": 2.5672, + "theoretical_loss": 3.3491574193643965, + "tokens_seen": 2688818176 + }, + { + "epoch": 9.01, + "learning_rate": 9.367101303911734e-05, + "loss": 2.5482, + "theoretical_loss": 3.3491510025828677, + "tokens_seen": 2688883712 + }, + { + "epoch": 9.01, + "learning_rate": 9.366098294884654e-05, + "loss": 2.4766, + "theoretical_loss": 3.349144586001522, + "tokens_seen": 2688949248 + }, + { + "epoch": 9.01, + "learning_rate": 9.365095285857573e-05, + "loss": 2.4576, + "theoretical_loss": 3.349138169620349, + "tokens_seen": 2689014784 + }, + { + "epoch": 9.01, + "learning_rate": 9.364092276830493e-05, + "loss": 2.451, + "theoretical_loss": 3.349131753439337, + "tokens_seen": 2689080320 + }, + { + "epoch": 9.01, + "learning_rate": 9.363089267803411e-05, + "loss": 2.5126, + "theoretical_loss": 3.3491253374584753, + "tokens_seen": 2689145856 + }, + { + "epoch": 9.01, + "learning_rate": 9.36208625877633e-05, + "loss": 2.5602, + "theoretical_loss": 3.3491189216777526, + "tokens_seen": 2689211392 + }, + { + "epoch": 9.01, + "learning_rate": 9.361083249749248e-05, + "loss": 2.5492, + "theoretical_loss": 3.349112506097158, + "tokens_seen": 2689276928 + }, + { + "epoch": 9.01, + "learning_rate": 9.360080240722167e-05, + "loss": 2.4423, + "theoretical_loss": 3.3491060907166803, + "tokens_seen": 2689342464 + }, + { + "epoch": 9.01, + "learning_rate": 9.359077231695086e-05, + "loss": 2.1619, + "theoretical_loss": 3.3490996755363076, + "tokens_seen": 2689408000 + }, + { + "epoch": 9.01, + "learning_rate": 9.358074222668004e-05, + "loss": 2.4402, + "theoretical_loss": 3.34909326055603, + "tokens_seen": 2689473536 + }, + { + "epoch": 9.01, + "learning_rate": 9.357071213640924e-05, + "loss": 2.4854, + "theoretical_loss": 3.349086845775836, + "tokens_seen": 2689539072 + }, + { + "epoch": 9.01, + "learning_rate": 9.356068204613842e-05, + "loss": 2.5798, + "theoretical_loss": 3.349080431195714, + "tokens_seen": 2689604608 + }, + { + "epoch": 9.01, + "learning_rate": 9.355065195586761e-05, + "loss": 2.6045, + "theoretical_loss": 3.3490740168156536, + "tokens_seen": 2689670144 + }, + { + "epoch": 9.01, + "learning_rate": 9.35406218655968e-05, + "loss": 2.6162, + "theoretical_loss": 3.349067602635643, + "tokens_seen": 2689735680 + }, + { + "epoch": 9.01, + "learning_rate": 9.353059177532599e-05, + "loss": 2.4579, + "theoretical_loss": 3.3490611886556714, + "tokens_seen": 2689801216 + }, + { + "epoch": 9.01, + "learning_rate": 9.352056168505517e-05, + "loss": 2.4601, + "theoretical_loss": 3.349054774875728, + "tokens_seen": 2689866752 + }, + { + "epoch": 9.01, + "learning_rate": 9.351053159478435e-05, + "loss": 2.3506, + "theoretical_loss": 3.349048361295801, + "tokens_seen": 2689932288 + }, + { + "epoch": 9.01, + "learning_rate": 9.350050150451354e-05, + "loss": 2.4983, + "theoretical_loss": 3.3490419479158797, + "tokens_seen": 2689997824 + }, + { + "epoch": 9.01, + "learning_rate": 9.349047141424273e-05, + "loss": 2.5247, + "theoretical_loss": 3.349035534735953, + "tokens_seen": 2690063360 + }, + { + "epoch": 9.01, + "learning_rate": 9.348044132397192e-05, + "loss": 2.5782, + "theoretical_loss": 3.34902912175601, + "tokens_seen": 2690128896 + }, + { + "epoch": 9.01, + "learning_rate": 9.34704112337011e-05, + "loss": 2.5854, + "theoretical_loss": 3.349022708976039, + "tokens_seen": 2690194432 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3005512, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.808990716934204, + "objective/train/theoretical_loss": 3.349017899522286, + "objective/train/tokens_used": 2710703584, + "theoretical_loss": 3.349017899522286, + "tokens_seen": 2690243584 + }, + { + "epoch": 9.01, + "learning_rate": 9.34603811434303e-05, + "loss": 2.5282, + "theoretical_loss": 3.3490162963960293, + "tokens_seen": 2690259968 + }, + { + "epoch": 9.01, + "learning_rate": 9.345035105315948e-05, + "loss": 2.4439, + "theoretical_loss": 3.34900988401597, + "tokens_seen": 2690325504 + }, + { + "epoch": 9.01, + "learning_rate": 9.344032096288867e-05, + "loss": 2.4288, + "theoretical_loss": 3.3490034718358492, + "tokens_seen": 2690391040 + }, + { + "epoch": 9.01, + "learning_rate": 9.343029087261785e-05, + "loss": 2.4129, + "theoretical_loss": 3.3489970598556567, + "tokens_seen": 2690456576 + }, + { + "epoch": 9.01, + "learning_rate": 9.342026078234703e-05, + "loss": 2.4899, + "theoretical_loss": 3.348990648075381, + "tokens_seen": 2690522112 + }, + { + "epoch": 9.01, + "learning_rate": 9.341023069207623e-05, + "loss": 2.5882, + "theoretical_loss": 3.348984236495011, + "tokens_seen": 2690587648 + }, + { + "epoch": 9.01, + "learning_rate": 9.340020060180541e-05, + "loss": 2.4972, + "theoretical_loss": 3.3489778251145355, + "tokens_seen": 2690653184 + }, + { + "epoch": 9.01, + "learning_rate": 9.33901705115346e-05, + "loss": 2.6017, + "theoretical_loss": 3.3489714139339437, + "tokens_seen": 2690718720 + }, + { + "epoch": 9.01, + "learning_rate": 9.338014042126379e-05, + "loss": 2.398, + "theoretical_loss": 3.348965002953224, + "tokens_seen": 2690784256 + }, + { + "epoch": 9.01, + "learning_rate": 9.337011033099298e-05, + "loss": 2.3578, + "theoretical_loss": 3.348958592172366, + "tokens_seen": 2690849792 + }, + { + "epoch": 9.01, + "learning_rate": 9.336008024072216e-05, + "loss": 2.5765, + "theoretical_loss": 3.3489521815913577, + "tokens_seen": 2690915328 + }, + { + "epoch": 9.01, + "learning_rate": 9.335005015045136e-05, + "loss": 2.5514, + "theoretical_loss": 3.3489457712101887, + "tokens_seen": 2690980864 + }, + { + "epoch": 9.01, + "learning_rate": 9.334002006018054e-05, + "loss": 2.5192, + "theoretical_loss": 3.348939361028848, + "tokens_seen": 2691046400 + }, + { + "epoch": 9.01, + "learning_rate": 9.332998996990973e-05, + "loss": 2.601, + "theoretical_loss": 3.348932951047324, + "tokens_seen": 2691111936 + }, + { + "epoch": 9.01, + "learning_rate": 9.331995987963891e-05, + "loss": 2.5893, + "theoretical_loss": 3.3489265412656057, + "tokens_seen": 2691177472 + }, + { + "epoch": 9.01, + "learning_rate": 9.33099297893681e-05, + "loss": 2.7237, + "theoretical_loss": 3.3489201316836823, + "tokens_seen": 2691243008 + }, + { + "epoch": 9.01, + "learning_rate": 9.329989969909729e-05, + "loss": 2.5312, + "theoretical_loss": 3.3489137223015426, + "tokens_seen": 2691308544 + }, + { + "epoch": 9.01, + "learning_rate": 9.328986960882647e-05, + "loss": 2.3496, + "theoretical_loss": 3.348907313119175, + "tokens_seen": 2691374080 + }, + { + "epoch": 9.01, + "learning_rate": 9.327983951855568e-05, + "loss": 2.3597, + "theoretical_loss": 3.3489009041365696, + "tokens_seen": 2691439616 + }, + { + "epoch": 9.01, + "learning_rate": 9.326980942828486e-05, + "loss": 2.501, + "theoretical_loss": 3.348894495353714, + "tokens_seen": 2691505152 + }, + { + "epoch": 9.01, + "learning_rate": 9.325977933801405e-05, + "loss": 2.5607, + "theoretical_loss": 3.348888086770598, + "tokens_seen": 2691570688 + }, + { + "epoch": 9.01, + "learning_rate": 9.324974924774324e-05, + "loss": 2.5633, + "theoretical_loss": 3.34888167838721, + "tokens_seen": 2691636224 + }, + { + "epoch": 9.01, + "learning_rate": 9.323971915747243e-05, + "loss": 2.5191, + "theoretical_loss": 3.348875270203539, + "tokens_seen": 2691701760 + }, + { + "epoch": 9.01, + "learning_rate": 9.322968906720161e-05, + "loss": 2.5145, + "theoretical_loss": 3.3488688622195744, + "tokens_seen": 2691767296 + }, + { + "epoch": 9.01, + "learning_rate": 9.321965897693079e-05, + "loss": 2.6217, + "theoretical_loss": 3.348862454435304, + "tokens_seen": 2691832832 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3010524, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6775169372558594, + "objective/train/theoretical_loss": 3.348857648728145, + "objective/train/tokens_used": 2712341984, + "theoretical_loss": 3.348857648728145, + "tokens_seen": 2691881984 + }, + { + "epoch": 9.01, + "learning_rate": 9.320962888665999e-05, + "loss": 2.4644, + "theoretical_loss": 3.3488560468507185, + "tokens_seen": 2691898368 + }, + { + "epoch": 9.01, + "learning_rate": 9.319959879638917e-05, + "loss": 2.4241, + "theoretical_loss": 3.348849639465805, + "tokens_seen": 2691963904 + }, + { + "epoch": 9.01, + "learning_rate": 9.318956870611836e-05, + "loss": 2.4872, + "theoretical_loss": 3.3488432322805535, + "tokens_seen": 2692029440 + }, + { + "epoch": 9.01, + "learning_rate": 9.317953861584754e-05, + "loss": 2.4577, + "theoretical_loss": 3.3488368252949523, + "tokens_seen": 2692094976 + }, + { + "epoch": 9.01, + "learning_rate": 9.316950852557674e-05, + "loss": 2.4758, + "theoretical_loss": 3.348830418508991, + "tokens_seen": 2692160512 + }, + { + "epoch": 9.01, + "learning_rate": 9.315947843530592e-05, + "loss": 2.5567, + "theoretical_loss": 3.348824011922658, + "tokens_seen": 2692226048 + }, + { + "epoch": 9.01, + "learning_rate": 9.314944834503511e-05, + "loss": 2.509, + "theoretical_loss": 3.3488176055359418, + "tokens_seen": 2692291584 + }, + { + "epoch": 9.01, + "learning_rate": 9.31394182547643e-05, + "loss": 2.4332, + "theoretical_loss": 3.3488111993488325, + "tokens_seen": 2692357120 + }, + { + "epoch": 9.01, + "learning_rate": 9.312938816449349e-05, + "loss": 2.5211, + "theoretical_loss": 3.3488047933613183, + "tokens_seen": 2692422656 + }, + { + "epoch": 9.01, + "learning_rate": 9.311935807422267e-05, + "loss": 2.4526, + "theoretical_loss": 3.348798387573388, + "tokens_seen": 2692488192 + }, + { + "epoch": 9.01, + "learning_rate": 9.310932798395185e-05, + "loss": 2.3283, + "theoretical_loss": 3.3487919819850314, + "tokens_seen": 2692553728 + }, + { + "epoch": 9.01, + "learning_rate": 9.309929789368105e-05, + "loss": 2.5089, + "theoretical_loss": 3.348785576596236, + "tokens_seen": 2692619264 + }, + { + "epoch": 9.01, + "learning_rate": 9.308926780341023e-05, + "loss": 2.514, + "theoretical_loss": 3.3487791714069917, + "tokens_seen": 2692684800 + }, + { + "epoch": 9.01, + "learning_rate": 9.307923771313942e-05, + "loss": 2.4641, + "theoretical_loss": 3.3487727664172873, + "tokens_seen": 2692750336 + }, + { + "epoch": 9.01, + "learning_rate": 9.30692076228686e-05, + "loss": 2.4143, + "theoretical_loss": 3.3487663616271117, + "tokens_seen": 2692815872 + }, + { + "epoch": 9.01, + "learning_rate": 9.30591775325978e-05, + "loss": 2.3244, + "theoretical_loss": 3.348759957036454, + "tokens_seen": 2692881408 + }, + { + "epoch": 9.01, + "learning_rate": 9.304914744232698e-05, + "loss": 2.3964, + "theoretical_loss": 3.3487535526453023, + "tokens_seen": 2692946944 + }, + { + "epoch": 9.01, + "learning_rate": 9.303911735205617e-05, + "loss": 2.5666, + "theoretical_loss": 3.3487471484536466, + "tokens_seen": 2693012480 + }, + { + "epoch": 9.01, + "learning_rate": 9.302908726178536e-05, + "loss": 2.5591, + "theoretical_loss": 3.3487407444614754, + "tokens_seen": 2693078016 + }, + { + "epoch": 9.01, + "learning_rate": 9.301905717151454e-05, + "loss": 2.6173, + "theoretical_loss": 3.348734340668777, + "tokens_seen": 2693143552 + }, + { + "epoch": 9.01, + "learning_rate": 9.300902708124373e-05, + "loss": 2.3965, + "theoretical_loss": 3.348727937075542, + "tokens_seen": 2693209088 + }, + { + "epoch": 9.01, + "learning_rate": 9.299899699097291e-05, + "loss": 2.3013, + "theoretical_loss": 3.3487215336817573, + "tokens_seen": 2693274624 + }, + { + "epoch": 9.01, + "learning_rate": 9.298896690070211e-05, + "loss": 2.54, + "theoretical_loss": 3.348715130487413, + "tokens_seen": 2693340160 + }, + { + "epoch": 9.01, + "learning_rate": 9.297893681043129e-05, + "loss": 2.5662, + "theoretical_loss": 3.3487087274924985, + "tokens_seen": 2693405696 + }, + { + "epoch": 9.01, + "learning_rate": 9.296890672016048e-05, + "loss": 2.5496, + "theoretical_loss": 3.348702324697001, + "tokens_seen": 2693471232 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3015519, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.418639898300171, + "objective/train/theoretical_loss": 3.34869752273124, + "objective/train/tokens_used": 2713980384, + "theoretical_loss": 3.34869752273124, + "tokens_seen": 2693520384 + }, + { + "epoch": 9.01, + "learning_rate": 9.295887662988966e-05, + "loss": 2.4898, + "theoretical_loss": 3.3486959221009114, + "tokens_seen": 2693536768 + }, + { + "epoch": 9.01, + "learning_rate": 9.294884653961886e-05, + "loss": 2.4351, + "theoretical_loss": 3.3486895197042177, + "tokens_seen": 2693602304 + }, + { + "epoch": 9.01, + "learning_rate": 9.293881644934804e-05, + "loss": 2.5905, + "theoretical_loss": 3.3486831175069085, + "tokens_seen": 2693667840 + }, + { + "epoch": 9.01, + "learning_rate": 9.292878635907722e-05, + "loss": 2.3609, + "theoretical_loss": 3.348676715508973, + "tokens_seen": 2693733376 + }, + { + "epoch": 9.01, + "learning_rate": 9.291875626880642e-05, + "loss": 2.5254, + "theoretical_loss": 3.3486703137104006, + "tokens_seen": 2693798912 + }, + { + "epoch": 9.01, + "learning_rate": 9.29087261785356e-05, + "loss": 2.4032, + "theoretical_loss": 3.3486639121111796, + "tokens_seen": 2693864448 + }, + { + "epoch": 9.01, + "learning_rate": 9.28986960882648e-05, + "loss": 2.5244, + "theoretical_loss": 3.3486575107112997, + "tokens_seen": 2693929984 + }, + { + "epoch": 9.01, + "learning_rate": 9.288866599799399e-05, + "loss": 2.493, + "theoretical_loss": 3.348651109510749, + "tokens_seen": 2693995520 + }, + { + "epoch": 9.01, + "learning_rate": 9.287863590772318e-05, + "loss": 2.5402, + "theoretical_loss": 3.348644708509517, + "tokens_seen": 2694061056 + }, + { + "epoch": 9.01, + "learning_rate": 9.286860581745236e-05, + "loss": 2.4648, + "theoretical_loss": 3.348638307707593, + "tokens_seen": 2694126592 + }, + { + "epoch": 9.01, + "learning_rate": 9.285857572718156e-05, + "loss": 2.3642, + "theoretical_loss": 3.3486319071049646, + "tokens_seen": 2694192128 + }, + { + "epoch": 9.01, + "learning_rate": 9.284854563691074e-05, + "loss": 2.6369, + "theoretical_loss": 3.348625506701622, + "tokens_seen": 2694257664 + }, + { + "epoch": 9.01, + "learning_rate": 9.283851554663993e-05, + "loss": 2.6531, + "theoretical_loss": 3.3486191064975537, + "tokens_seen": 2694323200 + }, + { + "epoch": 9.01, + "learning_rate": 9.282848545636911e-05, + "loss": 2.3008, + "theoretical_loss": 3.348612706492749, + "tokens_seen": 2694388736 + }, + { + "epoch": 9.01, + "learning_rate": 9.28184553660983e-05, + "loss": 2.5617, + "theoretical_loss": 3.348606306687196, + "tokens_seen": 2694454272 + }, + { + "epoch": 9.01, + "learning_rate": 9.280842527582749e-05, + "loss": 2.5236, + "theoretical_loss": 3.3485999070808843, + "tokens_seen": 2694519808 + }, + { + "epoch": 9.01, + "learning_rate": 9.279839518555667e-05, + "loss": 2.4931, + "theoretical_loss": 3.3485935076738027, + "tokens_seen": 2694585344 + }, + { + "epoch": 9.01, + "learning_rate": 9.278836509528587e-05, + "loss": 2.36, + "theoretical_loss": 3.3485871084659404, + "tokens_seen": 2694650880 + }, + { + "epoch": 9.01, + "learning_rate": 9.277833500501505e-05, + "loss": 2.4905, + "theoretical_loss": 3.348580709457286, + "tokens_seen": 2694716416 + }, + { + "epoch": 9.01, + "learning_rate": 9.276830491474424e-05, + "loss": 2.2632, + "theoretical_loss": 3.3485743106478285, + "tokens_seen": 2694781952 + }, + { + "epoch": 9.01, + "learning_rate": 9.275827482447342e-05, + "loss": 2.7224, + "theoretical_loss": 3.348567912037557, + "tokens_seen": 2694847488 + }, + { + "epoch": 9.01, + "learning_rate": 9.274824473420262e-05, + "loss": 2.4383, + "theoretical_loss": 3.3485615136264606, + "tokens_seen": 2694913024 + }, + { + "epoch": 9.01, + "learning_rate": 9.27382146439318e-05, + "loss": 2.5493, + "theoretical_loss": 3.348555115414528, + "tokens_seen": 2694978560 + }, + { + "epoch": 9.01, + "learning_rate": 9.272818455366098e-05, + "loss": 2.5306, + "theoretical_loss": 3.348548717401748, + "tokens_seen": 2695044096 + }, + { + "epoch": 9.01, + "learning_rate": 9.271815446339017e-05, + "loss": 2.4288, + "theoretical_loss": 3.34854231958811, + "tokens_seen": 2695109632 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3016976, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.76336407661438, + "objective/train/theoretical_loss": 3.348537521358561, + "objective/train/tokens_used": 2715618784, + "theoretical_loss": 3.348537521358561, + "tokens_seen": 2695158784 + }, + { + "epoch": 9.01, + "learning_rate": 9.270812437311935e-05, + "loss": 2.7353, + "theoretical_loss": 3.3485359219736024, + "tokens_seen": 2695175168 + }, + { + "epoch": 9.01, + "learning_rate": 9.269809428284855e-05, + "loss": 2.3336, + "theoretical_loss": 3.3485295245582147, + "tokens_seen": 2695240704 + }, + { + "epoch": 9.01, + "learning_rate": 9.268806419257773e-05, + "loss": 2.5885, + "theoretical_loss": 3.3485231273419354, + "tokens_seen": 2695306240 + }, + { + "epoch": 9.01, + "learning_rate": 9.267803410230693e-05, + "loss": 2.6705, + "theoretical_loss": 3.348516730324754, + "tokens_seen": 2695371776 + }, + { + "epoch": 9.01, + "learning_rate": 9.26680040120361e-05, + "loss": 2.4702, + "theoretical_loss": 3.3485103335066593, + "tokens_seen": 2695437312 + }, + { + "epoch": 9.01, + "learning_rate": 9.26579739217653e-05, + "loss": 2.6055, + "theoretical_loss": 3.34850393688764, + "tokens_seen": 2695502848 + }, + { + "epoch": 9.01, + "learning_rate": 9.264794383149448e-05, + "loss": 2.3551, + "theoretical_loss": 3.3484975404676853, + "tokens_seen": 2695568384 + }, + { + "epoch": 9.01, + "learning_rate": 9.263791374122366e-05, + "loss": 2.6332, + "theoretical_loss": 3.348491144246784, + "tokens_seen": 2695633920 + }, + { + "epoch": 9.01, + "learning_rate": 9.262788365095286e-05, + "loss": 2.4266, + "theoretical_loss": 3.348484748224925, + "tokens_seen": 2695699456 + }, + { + "epoch": 9.01, + "learning_rate": 9.261785356068204e-05, + "loss": 2.4453, + "theoretical_loss": 3.348478352402098, + "tokens_seen": 2695764992 + }, + { + "epoch": 9.01, + "learning_rate": 9.260782347041123e-05, + "loss": 2.5401, + "theoretical_loss": 3.3484719567782903, + "tokens_seen": 2695830528 + }, + { + "epoch": 9.01, + "learning_rate": 9.259779338014041e-05, + "loss": 2.5475, + "theoretical_loss": 3.3484655613534926, + "tokens_seen": 2695896064 + }, + { + "epoch": 9.01, + "learning_rate": 9.258776328986961e-05, + "loss": 2.4486, + "theoretical_loss": 3.348459166127693, + "tokens_seen": 2695961600 + }, + { + "epoch": 9.01, + "learning_rate": 9.257773319959879e-05, + "loss": 2.3702, + "theoretical_loss": 3.348452771100881, + "tokens_seen": 2696027136 + }, + { + "epoch": 9.01, + "learning_rate": 9.256770310932799e-05, + "loss": 2.503, + "theoretical_loss": 3.348446376273045, + "tokens_seen": 2696092672 + }, + { + "epoch": 9.01, + "learning_rate": 9.255767301905717e-05, + "loss": 2.4661, + "theoretical_loss": 3.3484399816441743, + "tokens_seen": 2696158208 + }, + { + "epoch": 9.01, + "learning_rate": 9.254764292878636e-05, + "loss": 2.6792, + "theoretical_loss": 3.3484335872142577, + "tokens_seen": 2696223744 + }, + { + "epoch": 9.01, + "learning_rate": 9.253761283851554e-05, + "loss": 2.5309, + "theoretical_loss": 3.3484271929832845, + "tokens_seen": 2696289280 + }, + { + "epoch": 9.01, + "learning_rate": 9.252758274824474e-05, + "loss": 2.49, + "theoretical_loss": 3.3484207989512433, + "tokens_seen": 2696354816 + }, + { + "epoch": 9.01, + "learning_rate": 9.251755265797393e-05, + "loss": 2.4841, + "theoretical_loss": 3.3484144051181235, + "tokens_seen": 2696420352 + }, + { + "epoch": 9.01, + "learning_rate": 9.250752256770311e-05, + "loss": 2.6396, + "theoretical_loss": 3.3484080114839134, + "tokens_seen": 2696485888 + }, + { + "epoch": 9.01, + "learning_rate": 9.249749247743231e-05, + "loss": 2.3702, + "theoretical_loss": 3.3484016180486025, + "tokens_seen": 2696551424 + }, + { + "epoch": 9.01, + "learning_rate": 9.248746238716149e-05, + "loss": 2.6454, + "theoretical_loss": 3.34839522481218, + "tokens_seen": 2696616960 + }, + { + "epoch": 9.01, + "learning_rate": 9.247743229689068e-05, + "loss": 2.4776, + "theoretical_loss": 3.348388831774634, + "tokens_seen": 2696682496 + }, + { + "epoch": 9.01, + "learning_rate": 9.246740220661986e-05, + "loss": 2.4782, + "theoretical_loss": 3.348382438935954, + "tokens_seen": 2696748032 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3017312, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6346187591552734, + "objective/train/theoretical_loss": 3.348377644437444, + "objective/train/tokens_used": 2717257184, + "theoretical_loss": 3.348377644437444, + "tokens_seen": 2696797184 + }, + { + "epoch": 9.01, + "learning_rate": 9.245737211634906e-05, + "loss": 2.5932, + "theoretical_loss": 3.3483760462961296, + "tokens_seen": 2696813568 + }, + { + "epoch": 9.01, + "learning_rate": 9.244734202607824e-05, + "loss": 2.4838, + "theoretical_loss": 3.348369653855149, + "tokens_seen": 2696879104 + }, + { + "epoch": 9.01, + "learning_rate": 9.243731193580742e-05, + "loss": 2.544, + "theoretical_loss": 3.3483632616130015, + "tokens_seen": 2696944640 + }, + { + "epoch": 9.01, + "learning_rate": 9.242728184553662e-05, + "loss": 2.566, + "theoretical_loss": 3.3483568695696753, + "tokens_seen": 2697010176 + }, + { + "epoch": 9.01, + "learning_rate": 9.24172517552658e-05, + "loss": 2.5967, + "theoretical_loss": 3.3483504777251607, + "tokens_seen": 2697075712 + }, + { + "epoch": 9.01, + "learning_rate": 9.240722166499499e-05, + "loss": 2.3969, + "theoretical_loss": 3.348344086079446, + "tokens_seen": 2697141248 + }, + { + "epoch": 9.01, + "learning_rate": 9.239719157472417e-05, + "loss": 2.5665, + "theoretical_loss": 3.3483376946325194, + "tokens_seen": 2697206784 + }, + { + "epoch": 9.01, + "learning_rate": 9.238716148445337e-05, + "loss": 2.5226, + "theoretical_loss": 3.3483313033843713, + "tokens_seen": 2697272320 + }, + { + "epoch": 9.01, + "learning_rate": 9.237713139418255e-05, + "loss": 2.6004, + "theoretical_loss": 3.34832491233499, + "tokens_seen": 2697337856 + }, + { + "epoch": 9.01, + "learning_rate": 9.236710130391174e-05, + "loss": 2.554, + "theoretical_loss": 3.348318521484365, + "tokens_seen": 2697403392 + }, + { + "epoch": 9.01, + "learning_rate": 9.235707121364092e-05, + "loss": 2.5436, + "theoretical_loss": 3.3483121308324844, + "tokens_seen": 2697468928 + }, + { + "epoch": 9.01, + "learning_rate": 9.234704112337012e-05, + "loss": 2.4526, + "theoretical_loss": 3.3483057403793373, + "tokens_seen": 2697534464 + }, + { + "epoch": 9.01, + "learning_rate": 9.23370110330993e-05, + "loss": 2.6175, + "theoretical_loss": 3.348299350124914, + "tokens_seen": 2697600000 + }, + { + "epoch": 9.01, + "learning_rate": 9.232698094282848e-05, + "loss": 2.5596, + "theoretical_loss": 3.3482929600692017, + "tokens_seen": 2697665536 + }, + { + "epoch": 9.01, + "learning_rate": 9.231695085255768e-05, + "loss": 2.4795, + "theoretical_loss": 3.3482865702121907, + "tokens_seen": 2697731072 + }, + { + "epoch": 9.01, + "learning_rate": 9.230692076228686e-05, + "loss": 2.5909, + "theoretical_loss": 3.3482801805538696, + "tokens_seen": 2697796608 + }, + { + "epoch": 9.01, + "learning_rate": 9.229689067201605e-05, + "loss": 2.3352, + "theoretical_loss": 3.348273791094227, + "tokens_seen": 2697862144 + }, + { + "epoch": 9.01, + "learning_rate": 9.228686058174523e-05, + "loss": 2.5857, + "theoretical_loss": 3.348267401833252, + "tokens_seen": 2697927680 + }, + { + "epoch": 9.01, + "learning_rate": 9.227683049147443e-05, + "loss": 2.5642, + "theoretical_loss": 3.3482610127709345, + "tokens_seen": 2697993216 + }, + { + "epoch": 9.01, + "learning_rate": 9.226680040120361e-05, + "loss": 2.5775, + "theoretical_loss": 3.348254623907262, + "tokens_seen": 2698058752 + }, + { + "epoch": 9.01, + "learning_rate": 9.22567703109328e-05, + "loss": 2.3682, + "theoretical_loss": 3.348248235242225, + "tokens_seen": 2698124288 + }, + { + "epoch": 9.01, + "learning_rate": 9.224674022066198e-05, + "loss": 2.5553, + "theoretical_loss": 3.3482418467758115, + "tokens_seen": 2698189824 + }, + { + "epoch": 9.01, + "learning_rate": 9.223671013039117e-05, + "loss": 2.576, + "theoretical_loss": 3.348235458508011, + "tokens_seen": 2698255360 + }, + { + "epoch": 9.01, + "learning_rate": 9.222668004012036e-05, + "loss": 2.5993, + "theoretical_loss": 3.348229070438812, + "tokens_seen": 2698320896 + }, + { + "epoch": 9.01, + "learning_rate": 9.221664994984954e-05, + "loss": 2.6773, + "theoretical_loss": 3.3482226825682044, + "tokens_seen": 2698386432 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3018681, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7777905464172363, + "objective/train/theoretical_loss": 3.3482178917955667, + "objective/train/tokens_used": 2718895584, + "theoretical_loss": 3.3482178917955667, + "tokens_seen": 2698435584 + }, + { + "epoch": 9.01, + "learning_rate": 9.220661985957874e-05, + "loss": 2.5537, + "theoretical_loss": 3.348216294896176, + "tokens_seen": 2698451968 + }, + { + "epoch": 9.01, + "learning_rate": 9.219658976930792e-05, + "loss": 2.4422, + "theoretical_loss": 3.3482099074227167, + "tokens_seen": 2698517504 + }, + { + "epoch": 9.01, + "learning_rate": 9.218655967903711e-05, + "loss": 2.4932, + "theoretical_loss": 3.3482035201478153, + "tokens_seen": 2698583040 + }, + { + "epoch": 9.01, + "learning_rate": 9.217652958876629e-05, + "loss": 2.5341, + "theoretical_loss": 3.3481971330714604, + "tokens_seen": 2698648576 + }, + { + "epoch": 9.01, + "learning_rate": 9.216649949849549e-05, + "loss": 2.4564, + "theoretical_loss": 3.3481907461936418, + "tokens_seen": 2698714112 + }, + { + "epoch": 9.01, + "learning_rate": 9.215646940822467e-05, + "loss": 2.4454, + "theoretical_loss": 3.348184359514348, + "tokens_seen": 2698779648 + }, + { + "epoch": 9.01, + "learning_rate": 9.214643931795386e-05, + "loss": 2.4778, + "theoretical_loss": 3.3481779730335677, + "tokens_seen": 2698845184 + }, + { + "epoch": 9.01, + "learning_rate": 9.213640922768306e-05, + "loss": 2.5173, + "theoretical_loss": 3.3481715867512905, + "tokens_seen": 2698910720 + }, + { + "epoch": 9.01, + "learning_rate": 9.212637913741224e-05, + "loss": 2.3446, + "theoretical_loss": 3.348165200667505, + "tokens_seen": 2698976256 + }, + { + "epoch": 9.01, + "learning_rate": 9.211634904714143e-05, + "loss": 2.4609, + "theoretical_loss": 3.3481588147822006, + "tokens_seen": 2699041792 + }, + { + "epoch": 9.01, + "learning_rate": 9.210631895687062e-05, + "loss": 2.5181, + "theoretical_loss": 3.348152429095366, + "tokens_seen": 2699107328 + }, + { + "epoch": 9.01, + "learning_rate": 9.209628886659981e-05, + "loss": 2.7191, + "theoretical_loss": 3.3481460436069903, + "tokens_seen": 2699172864 + }, + { + "epoch": 9.01, + "learning_rate": 9.208625877632899e-05, + "loss": 2.4949, + "theoretical_loss": 3.348139658317063, + "tokens_seen": 2699238400 + }, + { + "epoch": 9.01, + "learning_rate": 9.207622868605819e-05, + "loss": 2.5996, + "theoretical_loss": 3.3481332732255717, + "tokens_seen": 2699303936 + }, + { + "epoch": 9.01, + "learning_rate": 9.206619859578737e-05, + "loss": 2.5915, + "theoretical_loss": 3.3481268883325073, + "tokens_seen": 2699369472 + }, + { + "epoch": 9.01, + "learning_rate": 9.205616850551656e-05, + "loss": 2.4666, + "theoretical_loss": 3.3481205036378574, + "tokens_seen": 2699435008 + }, + { + "epoch": 9.01, + "learning_rate": 9.204613841524574e-05, + "loss": 2.4573, + "theoretical_loss": 3.3481141191416115, + "tokens_seen": 2699500544 + }, + { + "epoch": 9.01, + "learning_rate": 9.203610832497492e-05, + "loss": 2.5823, + "theoretical_loss": 3.3481077348437585, + "tokens_seen": 2699566080 + }, + { + "epoch": 9.01, + "learning_rate": 9.202607823470412e-05, + "loss": 2.4488, + "theoretical_loss": 3.3481013507442876, + "tokens_seen": 2699631616 + }, + { + "epoch": 9.01, + "learning_rate": 9.20160481444333e-05, + "loss": 2.504, + "theoretical_loss": 3.3480949668431883, + "tokens_seen": 2699697152 + }, + { + "epoch": 9.01, + "learning_rate": 9.20060180541625e-05, + "loss": 2.5147, + "theoretical_loss": 3.3480885831404485, + "tokens_seen": 2699762688 + }, + { + "epoch": 9.01, + "learning_rate": 9.199598796389168e-05, + "loss": 2.4094, + "theoretical_loss": 3.3480821996360577, + "tokens_seen": 2699828224 + }, + { + "epoch": 9.01, + "learning_rate": 9.198595787362087e-05, + "loss": 2.6802, + "theoretical_loss": 3.348075816330005, + "tokens_seen": 2699893760 + }, + { + "epoch": 9.01, + "learning_rate": 9.197592778335005e-05, + "loss": 2.5812, + "theoretical_loss": 3.34806943322228, + "tokens_seen": 2699959296 + }, + { + "epoch": 9.01, + "learning_rate": 9.196589769307925e-05, + "loss": 2.6553, + "theoretical_loss": 3.348063050312871, + "tokens_seen": 2700024832 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3019876, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6788101196289062, + "objective/train/theoretical_loss": 3.3480582632609526, + "objective/train/tokens_used": 2720533984, + "theoretical_loss": 3.3480582632609526, + "tokens_seen": 2700073984 + }, + { + "epoch": 9.01, + "learning_rate": 9.195586760280843e-05, + "loss": 2.4986, + "theoretical_loss": 3.348056667601767, + "tokens_seen": 2700090368 + }, + { + "epoch": 9.01, + "learning_rate": 9.194583751253761e-05, + "loss": 2.3832, + "theoretical_loss": 3.348050285088957, + "tokens_seen": 2700155904 + }, + { + "epoch": 9.01, + "learning_rate": 9.19358074222668e-05, + "loss": 2.6269, + "theoretical_loss": 3.348043902774431, + "tokens_seen": 2700221440 + }, + { + "epoch": 9.01, + "learning_rate": 9.192577733199598e-05, + "loss": 2.5879, + "theoretical_loss": 3.3480375206581763, + "tokens_seen": 2700286976 + }, + { + "epoch": 9.01, + "learning_rate": 9.191574724172518e-05, + "loss": 2.6136, + "theoretical_loss": 3.3480311387401835, + "tokens_seen": 2700352512 + }, + { + "epoch": 9.01, + "learning_rate": 9.190571715145436e-05, + "loss": 2.3263, + "theoretical_loss": 3.3480247570204407, + "tokens_seen": 2700418048 + }, + { + "epoch": 9.01, + "learning_rate": 9.189568706118355e-05, + "loss": 2.6165, + "theoretical_loss": 3.3480183754989374, + "tokens_seen": 2700483584 + }, + { + "epoch": 9.01, + "learning_rate": 9.188565697091274e-05, + "loss": 2.5384, + "theoretical_loss": 3.3480119941756623, + "tokens_seen": 2700549120 + }, + { + "epoch": 9.01, + "learning_rate": 9.187562688064193e-05, + "loss": 2.4483, + "theoretical_loss": 3.348005613050605, + "tokens_seen": 2700614656 + }, + { + "epoch": 9.01, + "learning_rate": 9.186559679037111e-05, + "loss": 2.4207, + "theoretical_loss": 3.347999232123754, + "tokens_seen": 2700680192 + }, + { + "epoch": 9.01, + "learning_rate": 9.185556670010029e-05, + "loss": 2.5038, + "theoretical_loss": 3.3479928513950985, + "tokens_seen": 2700745728 + }, + { + "epoch": 9.01, + "learning_rate": 9.184553660982949e-05, + "loss": 2.4628, + "theoretical_loss": 3.3479864708646274, + "tokens_seen": 2700811264 + }, + { + "epoch": 9.01, + "learning_rate": 9.183550651955867e-05, + "loss": 2.5027, + "theoretical_loss": 3.34798009053233, + "tokens_seen": 2700876800 + }, + { + "epoch": 9.01, + "learning_rate": 9.182547642928786e-05, + "loss": 2.4503, + "theoretical_loss": 3.347973710398195, + "tokens_seen": 2700942336 + }, + { + "epoch": 9.01, + "learning_rate": 9.181544633901704e-05, + "loss": 2.5715, + "theoretical_loss": 3.347967330462212, + "tokens_seen": 2701007872 + }, + { + "epoch": 9.01, + "learning_rate": 9.180541624874624e-05, + "loss": 2.5443, + "theoretical_loss": 3.3479609507243695, + "tokens_seen": 2701073408 + }, + { + "epoch": 9.01, + "learning_rate": 9.179538615847542e-05, + "loss": 2.5882, + "theoretical_loss": 3.3479545711846566, + "tokens_seen": 2701138944 + }, + { + "epoch": 9.01, + "learning_rate": 9.178535606820461e-05, + "loss": 2.4002, + "theoretical_loss": 3.3479481918430625, + "tokens_seen": 2701204480 + }, + { + "epoch": 9.01, + "learning_rate": 9.177532597793381e-05, + "loss": 2.3953, + "theoretical_loss": 3.347941812699576, + "tokens_seen": 2701270016 + }, + { + "epoch": 9.01, + "learning_rate": 9.1765295887663e-05, + "loss": 2.6288, + "theoretical_loss": 3.347935433754187, + "tokens_seen": 2701335552 + }, + { + "epoch": 9.01, + "learning_rate": 9.175526579739218e-05, + "loss": 2.5281, + "theoretical_loss": 3.347929055006883, + "tokens_seen": 2701401088 + }, + { + "epoch": 9.01, + "learning_rate": 9.174523570712137e-05, + "loss": 2.4435, + "theoretical_loss": 3.3479226764576544, + "tokens_seen": 2701466624 + }, + { + "epoch": 9.01, + "learning_rate": 9.173520561685056e-05, + "loss": 2.5667, + "theoretical_loss": 3.34791629810649, + "tokens_seen": 2701532160 + }, + { + "epoch": 9.01, + "learning_rate": 9.172517552657974e-05, + "loss": 2.6293, + "theoretical_loss": 3.347909919953378, + "tokens_seen": 2701597696 + }, + { + "epoch": 9.01, + "learning_rate": 9.171514543630894e-05, + "loss": 2.3747, + "theoretical_loss": 3.347903541998308, + "tokens_seen": 2701663232 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3020678, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3766283988952637, + "objective/train/theoretical_loss": 3.3478987586619646, + "objective/train/tokens_used": 2722172384, + "theoretical_loss": 3.3478987586619646, + "tokens_seen": 2701712384 + }, + { + "epoch": 9.01, + "learning_rate": 9.170511534603812e-05, + "loss": 2.3823, + "theoretical_loss": 3.34789716424127, + "tokens_seen": 2701728768 + }, + { + "epoch": 9.01, + "learning_rate": 9.169508525576731e-05, + "loss": 2.5432, + "theoretical_loss": 3.3478907866822514, + "tokens_seen": 2701794304 + }, + { + "epoch": 9.01, + "learning_rate": 9.16850551654965e-05, + "loss": 2.5201, + "theoretical_loss": 3.3478844093212423, + "tokens_seen": 2701859840 + }, + { + "epoch": 9.01, + "learning_rate": 9.167502507522569e-05, + "loss": 2.4455, + "theoretical_loss": 3.3478780321582313, + "tokens_seen": 2701925376 + }, + { + "epoch": 9.01, + "learning_rate": 9.166499498495487e-05, + "loss": 2.3336, + "theoretical_loss": 3.3478716551932077, + "tokens_seen": 2701990912 + }, + { + "epoch": 9.01, + "learning_rate": 9.165496489468405e-05, + "loss": 2.3845, + "theoretical_loss": 3.34786527842616, + "tokens_seen": 2702056448 + }, + { + "epoch": 9.01, + "learning_rate": 9.164493480441324e-05, + "loss": 2.3596, + "theoretical_loss": 3.3478589018570784, + "tokens_seen": 2702121984 + }, + { + "epoch": 9.01, + "learning_rate": 9.163490471414243e-05, + "loss": 2.2297, + "theoretical_loss": 3.347852525485951, + "tokens_seen": 2702187520 + }, + { + "epoch": 9.01, + "learning_rate": 9.162487462387162e-05, + "loss": 2.3141, + "theoretical_loss": 3.347846149312767, + "tokens_seen": 2702253056 + }, + { + "epoch": 9.01, + "learning_rate": 9.16148445336008e-05, + "loss": 2.6271, + "theoretical_loss": 3.3478397733375154, + "tokens_seen": 2702318592 + }, + { + "epoch": 9.01, + "learning_rate": 9.160481444333e-05, + "loss": 2.3707, + "theoretical_loss": 3.3478333975601857, + "tokens_seen": 2702384128 + }, + { + "epoch": 9.01, + "learning_rate": 9.159478435305918e-05, + "loss": 2.3758, + "theoretical_loss": 3.3478270219807666, + "tokens_seen": 2702449664 + }, + { + "epoch": 9.01, + "learning_rate": 9.158475426278837e-05, + "loss": 2.3734, + "theoretical_loss": 3.3478206465992475, + "tokens_seen": 2702515200 + }, + { + "epoch": 9.01, + "learning_rate": 9.157472417251755e-05, + "loss": 2.4188, + "theoretical_loss": 3.3478142714156167, + "tokens_seen": 2702580736 + }, + { + "epoch": 9.01, + "learning_rate": 9.156469408224673e-05, + "loss": 2.438, + "theoretical_loss": 3.347807896429864, + "tokens_seen": 2702646272 + }, + { + "epoch": 9.01, + "learning_rate": 9.155466399197593e-05, + "loss": 2.3053, + "theoretical_loss": 3.347801521641978, + "tokens_seen": 2702711808 + }, + { + "epoch": 9.01, + "learning_rate": 9.154463390170511e-05, + "loss": 2.4268, + "theoretical_loss": 3.347795147051948, + "tokens_seen": 2702777344 + }, + { + "epoch": 9.01, + "learning_rate": 9.15346038114343e-05, + "loss": 2.5718, + "theoretical_loss": 3.3477887726597633, + "tokens_seen": 2702842880 + }, + { + "epoch": 9.01, + "learning_rate": 9.152457372116349e-05, + "loss": 2.5326, + "theoretical_loss": 3.3477823984654127, + "tokens_seen": 2702908416 + }, + { + "epoch": 9.01, + "learning_rate": 9.151454363089268e-05, + "loss": 2.465, + "theoretical_loss": 3.3477760244688852, + "tokens_seen": 2702973952 + }, + { + "epoch": 9.01, + "learning_rate": 9.150451354062186e-05, + "loss": 2.3186, + "theoretical_loss": 3.3477696506701697, + "tokens_seen": 2703039488 + }, + { + "epoch": 9.01, + "learning_rate": 9.149448345035106e-05, + "loss": 2.5388, + "theoretical_loss": 3.347763277069256, + "tokens_seen": 2703105024 + }, + { + "epoch": 9.01, + "learning_rate": 9.148445336008024e-05, + "loss": 2.4514, + "theoretical_loss": 3.347756903666132, + "tokens_seen": 2703170560 + }, + { + "epoch": 9.01, + "learning_rate": 9.147442326980943e-05, + "loss": 2.4898, + "theoretical_loss": 3.3477505304607877, + "tokens_seen": 2703236096 + }, + { + "epoch": 9.01, + "learning_rate": 9.146439317953861e-05, + "loss": 2.3741, + "theoretical_loss": 3.347744157453212, + "tokens_seen": 2703301632 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3021502, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.515824556350708, + "objective/train/theoretical_loss": 3.3477393778273092, + "objective/train/tokens_used": 2723810784, + "theoretical_loss": 3.3477393778273092, + "tokens_seen": 2703350784 + }, + { + "epoch": 9.01, + "learning_rate": 9.14543630892678e-05, + "loss": 2.5026, + "theoretical_loss": 3.3477377846433938, + "tokens_seen": 2703367168 + }, + { + "epoch": 9.01, + "learning_rate": 9.144433299899699e-05, + "loss": 2.2697, + "theoretical_loss": 3.3477314120313224, + "tokens_seen": 2703432704 + }, + { + "epoch": 9.01, + "learning_rate": 9.143430290872617e-05, + "loss": 2.6567, + "theoretical_loss": 3.3477250396169866, + "tokens_seen": 2703498240 + }, + { + "epoch": 9.01, + "learning_rate": 9.142427281845537e-05, + "loss": 2.3629, + "theoretical_loss": 3.3477186674003754, + "tokens_seen": 2703563776 + }, + { + "epoch": 9.01, + "learning_rate": 9.141424272818455e-05, + "loss": 2.4785, + "theoretical_loss": 3.347712295381478, + "tokens_seen": 2703629312 + }, + { + "epoch": 9.01, + "learning_rate": 9.140421263791374e-05, + "loss": 2.3719, + "theoretical_loss": 3.3477059235602837, + "tokens_seen": 2703694848 + }, + { + "epoch": 9.01, + "learning_rate": 9.139418254764294e-05, + "loss": 2.3826, + "theoretical_loss": 3.347699551936781, + "tokens_seen": 2703760384 + }, + { + "epoch": 9.01, + "learning_rate": 9.138415245737213e-05, + "loss": 2.4814, + "theoretical_loss": 3.3476931805109595, + "tokens_seen": 2703825920 + }, + { + "epoch": 9.01, + "learning_rate": 9.137412236710131e-05, + "loss": 2.3936, + "theoretical_loss": 3.3476868092828087, + "tokens_seen": 2703891456 + }, + { + "epoch": 9.01, + "learning_rate": 9.136409227683049e-05, + "loss": 2.6418, + "theoretical_loss": 3.3476804382523166, + "tokens_seen": 2703956992 + }, + { + "epoch": 9.01, + "learning_rate": 9.135406218655969e-05, + "loss": 2.1826, + "theoretical_loss": 3.347674067419473, + "tokens_seen": 2704022528 + }, + { + "epoch": 9.01, + "learning_rate": 9.134403209628887e-05, + "loss": 2.5841, + "theoretical_loss": 3.3476676967842667, + "tokens_seen": 2704088064 + }, + { + "epoch": 9.01, + "learning_rate": 9.133400200601806e-05, + "loss": 2.57, + "theoretical_loss": 3.347661326346687, + "tokens_seen": 2704153600 + }, + { + "epoch": 9.01, + "learning_rate": 9.132397191574724e-05, + "loss": 2.5573, + "theoretical_loss": 3.3476549561067226, + "tokens_seen": 2704219136 + }, + { + "epoch": 9.01, + "learning_rate": 9.131394182547644e-05, + "loss": 2.2342, + "theoretical_loss": 3.3476485860643628, + "tokens_seen": 2704284672 + }, + { + "epoch": 9.01, + "learning_rate": 9.130391173520562e-05, + "loss": 2.4137, + "theoretical_loss": 3.347642216219597, + "tokens_seen": 2704350208 + }, + { + "epoch": 9.01, + "learning_rate": 9.129388164493481e-05, + "loss": 2.5639, + "theoretical_loss": 3.3476358465724134, + "tokens_seen": 2704415744 + }, + { + "epoch": 9.01, + "learning_rate": 9.1283851554664e-05, + "loss": 2.5908, + "theoretical_loss": 3.347629477122802, + "tokens_seen": 2704481280 + }, + { + "epoch": 9.01, + "learning_rate": 9.127382146439319e-05, + "loss": 2.6909, + "theoretical_loss": 3.347623107870752, + "tokens_seen": 2704546816 + }, + { + "epoch": 9.01, + "learning_rate": 9.126379137412237e-05, + "loss": 2.6386, + "theoretical_loss": 3.347616738816251, + "tokens_seen": 2704612352 + }, + { + "epoch": 9.01, + "learning_rate": 9.125376128385155e-05, + "loss": 2.4273, + "theoretical_loss": 3.34761036995929, + "tokens_seen": 2704677888 + }, + { + "epoch": 9.01, + "learning_rate": 9.124373119358075e-05, + "loss": 2.3554, + "theoretical_loss": 3.3476040012998567, + "tokens_seen": 2704743424 + }, + { + "epoch": 9.01, + "learning_rate": 9.123370110330993e-05, + "loss": 2.4917, + "theoretical_loss": 3.347597632837941, + "tokens_seen": 2704808960 + }, + { + "epoch": 9.01, + "learning_rate": 9.122367101303912e-05, + "loss": 2.518, + "theoretical_loss": 3.3475912645735315, + "tokens_seen": 2704874496 + }, + { + "epoch": 9.01, + "learning_rate": 9.12136409227683e-05, + "loss": 2.4522, + "theoretical_loss": 3.3475848965066177, + "tokens_seen": 2704940032 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3023009, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6622426509857178, + "objective/train/theoretical_loss": 3.347580120586032, + "objective/train/tokens_used": 2725449184, + "theoretical_loss": 3.347580120586032, + "tokens_seen": 2704989184 + }, + { + "epoch": 9.01, + "learning_rate": 9.12036108324975e-05, + "loss": 2.6084, + "theoretical_loss": 3.347578528637188, + "tokens_seen": 2705005568 + }, + { + "epoch": 9.01, + "learning_rate": 9.119358074222668e-05, + "loss": 2.3855, + "theoretical_loss": 3.3475721609652327, + "tokens_seen": 2705071104 + }, + { + "epoch": 9.01, + "learning_rate": 9.118355065195587e-05, + "loss": 2.3553, + "theoretical_loss": 3.3475657934907397, + "tokens_seen": 2705136640 + }, + { + "epoch": 9.01, + "learning_rate": 9.117352056168506e-05, + "loss": 2.4591, + "theoretical_loss": 3.3475594262136985, + "tokens_seen": 2705202176 + }, + { + "epoch": 9.01, + "learning_rate": 9.116349047141424e-05, + "loss": 2.353, + "theoretical_loss": 3.347553059134098, + "tokens_seen": 2705267712 + }, + { + "epoch": 9.01, + "learning_rate": 9.115346038114343e-05, + "loss": 2.4662, + "theoretical_loss": 3.3475466922519277, + "tokens_seen": 2705333248 + }, + { + "epoch": 9.01, + "learning_rate": 9.114343029087261e-05, + "loss": 2.6766, + "theoretical_loss": 3.3475403255671767, + "tokens_seen": 2705398784 + }, + { + "epoch": 9.01, + "learning_rate": 9.113340020060181e-05, + "loss": 2.2726, + "theoretical_loss": 3.3475339590798336, + "tokens_seen": 2705464320 + }, + { + "epoch": 9.01, + "learning_rate": 9.112337011033099e-05, + "loss": 2.3949, + "theoretical_loss": 3.347527592789888, + "tokens_seen": 2705529856 + }, + { + "epoch": 9.01, + "learning_rate": 9.111334002006018e-05, + "loss": 2.5436, + "theoretical_loss": 3.347521226697329, + "tokens_seen": 2705595392 + }, + { + "epoch": 9.01, + "learning_rate": 9.110330992978936e-05, + "loss": 2.2764, + "theoretical_loss": 3.3475148608021454, + "tokens_seen": 2705660928 + }, + { + "epoch": 9.01, + "learning_rate": 9.109327983951856e-05, + "loss": 2.4247, + "theoretical_loss": 3.3475084951043264, + "tokens_seen": 2705726464 + }, + { + "epoch": 9.01, + "learning_rate": 9.108324974924774e-05, + "loss": 2.5978, + "theoretical_loss": 3.347502129603861, + "tokens_seen": 2705792000 + }, + { + "epoch": 9.01, + "learning_rate": 9.107321965897692e-05, + "loss": 2.5125, + "theoretical_loss": 3.3474957643007386, + "tokens_seen": 2705857536 + }, + { + "epoch": 9.01, + "learning_rate": 9.106318956870612e-05, + "loss": 2.4281, + "theoretical_loss": 3.3474893991949477, + "tokens_seen": 2705923072 + }, + { + "epoch": 9.01, + "learning_rate": 9.10531594784353e-05, + "loss": 2.529, + "theoretical_loss": 3.3474830342864785, + "tokens_seen": 2705988608 + }, + { + "epoch": 9.01, + "learning_rate": 9.104312938816449e-05, + "loss": 2.4302, + "theoretical_loss": 3.347476669575319, + "tokens_seen": 2706054144 + }, + { + "epoch": 9.01, + "learning_rate": 9.103309929789367e-05, + "loss": 2.6831, + "theoretical_loss": 3.3474703050614587, + "tokens_seen": 2706119680 + }, + { + "epoch": 9.01, + "learning_rate": 9.102306920762288e-05, + "loss": 2.4948, + "theoretical_loss": 3.3474639407448867, + "tokens_seen": 2706185216 + }, + { + "epoch": 9.01, + "learning_rate": 9.101303911735206e-05, + "loss": 2.5393, + "theoretical_loss": 3.347457576625592, + "tokens_seen": 2706250752 + }, + { + "epoch": 9.01, + "learning_rate": 9.100300902708126e-05, + "loss": 2.4722, + "theoretical_loss": 3.3474512127035645, + "tokens_seen": 2706316288 + }, + { + "epoch": 9.01, + "learning_rate": 9.099297893681044e-05, + "loss": 2.4593, + "theoretical_loss": 3.347444848978792, + "tokens_seen": 2706381824 + }, + { + "epoch": 9.01, + "learning_rate": 9.098294884653963e-05, + "loss": 2.3986, + "theoretical_loss": 3.3474384854512644, + "tokens_seen": 2706447360 + }, + { + "epoch": 9.01, + "learning_rate": 9.097291875626881e-05, + "loss": 2.4909, + "theoretical_loss": 3.347432122120971, + "tokens_seen": 2706512896 + }, + { + "epoch": 9.01, + "learning_rate": 9.0962888665998e-05, + "loss": 2.5909, + "theoretical_loss": 3.3474257589879004, + "tokens_seen": 2706578432 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3023836, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.43231463432312, + "objective/train/theoretical_loss": 3.3474209867675184, + "objective/train/tokens_used": 2727087584, + "theoretical_loss": 3.3474209867675184, + "tokens_seen": 2706627584 + }, + { + "epoch": 9.01, + "learning_rate": 9.095285857572719e-05, + "loss": 2.4824, + "theoretical_loss": 3.347419396052042, + "tokens_seen": 2706643968 + }, + { + "epoch": 9.01, + "learning_rate": 9.094282848545637e-05, + "loss": 2.6895, + "theoretical_loss": 3.3474130333133845, + "tokens_seen": 2706709504 + }, + { + "epoch": 9.01, + "learning_rate": 9.093279839518557e-05, + "loss": 2.4476, + "theoretical_loss": 3.3474066707719174, + "tokens_seen": 2706775040 + }, + { + "epoch": 9.01, + "learning_rate": 9.092276830491475e-05, + "loss": 2.305, + "theoretical_loss": 3.34740030842763, + "tokens_seen": 2706840576 + }, + { + "epoch": 9.01, + "learning_rate": 9.091273821464394e-05, + "loss": 2.574, + "theoretical_loss": 3.347393946280511, + "tokens_seen": 2706906112 + }, + { + "epoch": 9.01, + "learning_rate": 9.090270812437312e-05, + "loss": 2.4853, + "theoretical_loss": 3.3473875843305496, + "tokens_seen": 2706971648 + }, + { + "epoch": 9.01, + "learning_rate": 9.089267803410232e-05, + "loss": 2.5339, + "theoretical_loss": 3.3473812225777353, + "tokens_seen": 2707037184 + }, + { + "epoch": 9.01, + "learning_rate": 9.08826479438315e-05, + "loss": 2.4883, + "theoretical_loss": 3.3473748610220566, + "tokens_seen": 2707102720 + }, + { + "epoch": 9.01, + "learning_rate": 9.087261785356068e-05, + "loss": 2.5642, + "theoretical_loss": 3.347368499663503, + "tokens_seen": 2707168256 + }, + { + "epoch": 9.01, + "learning_rate": 9.086258776328987e-05, + "loss": 2.247, + "theoretical_loss": 3.3473621385020635, + "tokens_seen": 2707233792 + }, + { + "epoch": 9.01, + "learning_rate": 9.085255767301905e-05, + "loss": 2.4489, + "theoretical_loss": 3.3473557775377274, + "tokens_seen": 2707299328 + }, + { + "epoch": 9.01, + "learning_rate": 9.084252758274825e-05, + "loss": 2.4409, + "theoretical_loss": 3.347349416770484, + "tokens_seen": 2707364864 + }, + { + "epoch": 9.01, + "learning_rate": 9.083249749247743e-05, + "loss": 2.4257, + "theoretical_loss": 3.3473430562003212, + "tokens_seen": 2707430400 + }, + { + "epoch": 9.01, + "learning_rate": 9.082246740220663e-05, + "loss": 2.4335, + "theoretical_loss": 3.34733669582723, + "tokens_seen": 2707495936 + }, + { + "epoch": 9.01, + "learning_rate": 9.08124373119358e-05, + "loss": 2.4553, + "theoretical_loss": 3.347330335651198, + "tokens_seen": 2707561472 + }, + { + "epoch": 9.01, + "learning_rate": 9.0802407221665e-05, + "loss": 2.4265, + "theoretical_loss": 3.3473239756722153, + "tokens_seen": 2707627008 + }, + { + "epoch": 9.01, + "learning_rate": 9.079237713139418e-05, + "loss": 2.1892, + "theoretical_loss": 3.3473176158902698, + "tokens_seen": 2707692544 + }, + { + "epoch": 9.01, + "learning_rate": 9.078234704112336e-05, + "loss": 2.4363, + "theoretical_loss": 3.3473112563053524, + "tokens_seen": 2707758080 + }, + { + "epoch": 9.01, + "learning_rate": 9.077231695085256e-05, + "loss": 2.4412, + "theoretical_loss": 3.347304896917451, + "tokens_seen": 2707823616 + }, + { + "epoch": 9.01, + "learning_rate": 9.076228686058174e-05, + "loss": 2.5726, + "theoretical_loss": 3.3472985377265543, + "tokens_seen": 2707889152 + }, + { + "epoch": 9.01, + "learning_rate": 9.075225677031093e-05, + "loss": 2.5597, + "theoretical_loss": 3.347292178732653, + "tokens_seen": 2707954688 + }, + { + "epoch": 9.01, + "learning_rate": 9.074222668004011e-05, + "loss": 2.4856, + "theoretical_loss": 3.3472858199357347, + "tokens_seen": 2708020224 + }, + { + "epoch": 9.01, + "learning_rate": 9.073219658976931e-05, + "loss": 2.3227, + "theoretical_loss": 3.3472794613357895, + "tokens_seen": 2708085760 + }, + { + "epoch": 9.01, + "learning_rate": 9.072216649949849e-05, + "loss": 2.5618, + "theoretical_loss": 3.3472731029328067, + "tokens_seen": 2708151296 + }, + { + "epoch": 9.01, + "learning_rate": 9.071213640922769e-05, + "loss": 2.4099, + "theoretical_loss": 3.3472667447267743, + "tokens_seen": 2708216832 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3025007, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.44903826713562, + "objective/train/theoretical_loss": 3.3472619762014926, + "objective/train/tokens_used": 2728725984, + "theoretical_loss": 3.3472619762014926, + "tokens_seen": 2708265984 + }, + { + "epoch": 9.01, + "learning_rate": 9.070210631895687e-05, + "loss": 2.4373, + "theoretical_loss": 3.3472603867176822, + "tokens_seen": 2708282368 + }, + { + "epoch": 9.01, + "learning_rate": 9.069207622868606e-05, + "loss": 2.4842, + "theoretical_loss": 3.3472540289055193, + "tokens_seen": 2708347904 + }, + { + "epoch": 9.01, + "learning_rate": 9.068204613841524e-05, + "loss": 2.5435, + "theoretical_loss": 3.3472476712902752, + "tokens_seen": 2708413440 + }, + { + "epoch": 9.01, + "learning_rate": 9.067201604814442e-05, + "loss": 2.2348, + "theoretical_loss": 3.3472413138719386, + "tokens_seen": 2708478976 + }, + { + "epoch": 9.01, + "learning_rate": 9.066198595787362e-05, + "loss": 2.4506, + "theoretical_loss": 3.3472349566504986, + "tokens_seen": 2708544512 + }, + { + "epoch": 9.01, + "learning_rate": 9.065195586760281e-05, + "loss": 2.4943, + "theoretical_loss": 3.3472285996259443, + "tokens_seen": 2708610048 + }, + { + "epoch": 9.01, + "learning_rate": 9.064192577733201e-05, + "loss": 2.361, + "theoretical_loss": 3.3472222427982654, + "tokens_seen": 2708675584 + }, + { + "epoch": 9.01, + "learning_rate": 9.063189568706119e-05, + "loss": 2.529, + "theoretical_loss": 3.3472158861674504, + "tokens_seen": 2708741120 + }, + { + "epoch": 9.01, + "learning_rate": 9.062186559679038e-05, + "loss": 2.3365, + "theoretical_loss": 3.3472095297334885, + "tokens_seen": 2708806656 + }, + { + "epoch": 9.01, + "learning_rate": 9.061183550651956e-05, + "loss": 2.6391, + "theoretical_loss": 3.3472031734963696, + "tokens_seen": 2708872192 + }, + { + "epoch": 9.01, + "learning_rate": 9.060180541624876e-05, + "loss": 2.3859, + "theoretical_loss": 3.3471968174560818, + "tokens_seen": 2708937728 + }, + { + "epoch": 9.01, + "learning_rate": 9.059177532597794e-05, + "loss": 2.4009, + "theoretical_loss": 3.3471904616126147, + "tokens_seen": 2709003264 + }, + { + "epoch": 9.01, + "learning_rate": 9.058174523570712e-05, + "loss": 2.5432, + "theoretical_loss": 3.3471841059659577, + "tokens_seen": 2709068800 + }, + { + "epoch": 9.01, + "learning_rate": 9.057171514543632e-05, + "loss": 2.5378, + "theoretical_loss": 3.3471777505160993, + "tokens_seen": 2709134336 + }, + { + "epoch": 9.01, + "learning_rate": 9.05616850551655e-05, + "loss": 2.53, + "theoretical_loss": 3.3471713952630298, + "tokens_seen": 2709199872 + }, + { + "epoch": 9.01, + "learning_rate": 9.055165496489469e-05, + "loss": 2.3908, + "theoretical_loss": 3.347165040206737, + "tokens_seen": 2709265408 + }, + { + "epoch": 9.01, + "learning_rate": 9.054162487462387e-05, + "loss": 2.3766, + "theoretical_loss": 3.347158685347211, + "tokens_seen": 2709330944 + }, + { + "epoch": 9.01, + "learning_rate": 9.053159478435307e-05, + "loss": 2.5214, + "theoretical_loss": 3.34715233068444, + "tokens_seen": 2709396480 + }, + { + "epoch": 9.01, + "learning_rate": 9.052156469408225e-05, + "loss": 2.5639, + "theoretical_loss": 3.347145976218414, + "tokens_seen": 2709462016 + }, + { + "epoch": 9.01, + "learning_rate": 9.051153460381144e-05, + "loss": 2.3193, + "theoretical_loss": 3.347139621949122, + "tokens_seen": 2709527552 + }, + { + "epoch": 9.01, + "learning_rate": 9.050150451354062e-05, + "loss": 2.334, + "theoretical_loss": 3.3471332678765533, + "tokens_seen": 2709593088 + }, + { + "epoch": 9.01, + "learning_rate": 9.04914744232698e-05, + "loss": 2.5568, + "theoretical_loss": 3.3471269140006963, + "tokens_seen": 2709658624 + }, + { + "epoch": 9.01, + "learning_rate": 9.0481444332999e-05, + "loss": 2.5297, + "theoretical_loss": 3.3471205603215406, + "tokens_seen": 2709724160 + }, + { + "epoch": 9.01, + "learning_rate": 9.047141424272818e-05, + "loss": 2.4876, + "theoretical_loss": 3.347114206839076, + "tokens_seen": 2709789696 + }, + { + "epoch": 9.01, + "learning_rate": 9.046138415245738e-05, + "loss": 2.5896, + "theoretical_loss": 3.3471078535532905, + "tokens_seen": 2709855232 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3025793, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2607204914093018, + "objective/train/theoretical_loss": 3.3471030887180158, + "objective/train/tokens_used": 2730364384, + "theoretical_loss": 3.3471030887180158, + "tokens_seen": 2709904384 + }, + { + "epoch": 9.01, + "learning_rate": 9.045135406218656e-05, + "loss": 2.2939, + "theoretical_loss": 3.347101500464174, + "tokens_seen": 2709920768 + }, + { + "epoch": 9.01, + "learning_rate": 9.044132397191575e-05, + "loss": 2.3964, + "theoretical_loss": 3.3470951475717152, + "tokens_seen": 2709986304 + }, + { + "epoch": 9.01, + "learning_rate": 9.043129388164493e-05, + "loss": 2.4926, + "theoretical_loss": 3.347088794875904, + "tokens_seen": 2710051840 + }, + { + "epoch": 9.01, + "learning_rate": 9.042126379137413e-05, + "loss": 2.2263, + "theoretical_loss": 3.347082442376729, + "tokens_seen": 2710117376 + }, + { + "epoch": 9.01, + "learning_rate": 9.041123370110331e-05, + "loss": 2.4316, + "theoretical_loss": 3.347076090074179, + "tokens_seen": 2710182912 + }, + { + "epoch": 9.01, + "learning_rate": 9.04012036108325e-05, + "loss": 2.3903, + "theoretical_loss": 3.347069737968244, + "tokens_seen": 2710248448 + }, + { + "epoch": 9.01, + "learning_rate": 9.039117352056168e-05, + "loss": 2.5439, + "theoretical_loss": 3.3470633860589127, + "tokens_seen": 2710313984 + }, + { + "epoch": 9.01, + "learning_rate": 9.038114343029087e-05, + "loss": 2.4574, + "theoretical_loss": 3.3470570343461743, + "tokens_seen": 2710379520 + }, + { + "epoch": 9.01, + "learning_rate": 9.037111334002006e-05, + "loss": 2.4656, + "theoretical_loss": 3.3470506828300177, + "tokens_seen": 2710445056 + }, + { + "epoch": 9.01, + "learning_rate": 9.036108324974924e-05, + "loss": 2.4175, + "theoretical_loss": 3.347044331510433, + "tokens_seen": 2710510592 + }, + { + "epoch": 9.01, + "learning_rate": 9.035105315947844e-05, + "loss": 2.2381, + "theoretical_loss": 3.347037980387408, + "tokens_seen": 2710576128 + }, + { + "epoch": 9.01, + "learning_rate": 9.034102306920762e-05, + "loss": 2.3981, + "theoretical_loss": 3.347031629460933, + "tokens_seen": 2710641664 + }, + { + "epoch": 9.01, + "learning_rate": 9.033099297893681e-05, + "loss": 2.5554, + "theoretical_loss": 3.3470252787309964, + "tokens_seen": 2710707200 + }, + { + "epoch": 9.01, + "learning_rate": 9.032096288866599e-05, + "loss": 2.6523, + "theoretical_loss": 3.347018928197588, + "tokens_seen": 2710772736 + }, + { + "epoch": 9.01, + "learning_rate": 9.031093279839519e-05, + "loss": 2.4765, + "theoretical_loss": 3.3470125778606965, + "tokens_seen": 2710838272 + }, + { + "epoch": 9.01, + "learning_rate": 9.030090270812437e-05, + "loss": 2.3973, + "theoretical_loss": 3.3470062277203114, + "tokens_seen": 2710903808 + }, + { + "epoch": 9.01, + "learning_rate": 9.029087261785355e-05, + "loss": 2.4471, + "theoretical_loss": 3.3469998777764216, + "tokens_seen": 2710969344 + }, + { + "epoch": 9.01, + "learning_rate": 9.028084252758274e-05, + "loss": 2.4366, + "theoretical_loss": 3.3469935280290164, + "tokens_seen": 2711034880 + }, + { + "epoch": 9.01, + "learning_rate": 9.027081243731194e-05, + "loss": 2.4684, + "theoretical_loss": 3.3469871784780847, + "tokens_seen": 2711100416 + }, + { + "epoch": 9.01, + "learning_rate": 9.026078234704113e-05, + "loss": 2.2563, + "theoretical_loss": 3.3469808291236163, + "tokens_seen": 2711165952 + }, + { + "epoch": 9.01, + "learning_rate": 9.025075225677032e-05, + "loss": 2.523, + "theoretical_loss": 3.3469744799656, + "tokens_seen": 2711231488 + }, + { + "epoch": 9.01, + "learning_rate": 9.024072216649951e-05, + "loss": 2.4834, + "theoretical_loss": 3.3469681310040245, + "tokens_seen": 2711297024 + }, + { + "epoch": 9.01, + "learning_rate": 9.023069207622869e-05, + "loss": 2.6754, + "theoretical_loss": 3.3469617822388797, + "tokens_seen": 2711362560 + }, + { + "epoch": 9.01, + "learning_rate": 9.022066198595789e-05, + "loss": 2.2346, + "theoretical_loss": 3.3469554336701544, + "tokens_seen": 2711428096 + }, + { + "epoch": 9.01, + "learning_rate": 9.021063189568707e-05, + "loss": 2.4802, + "theoretical_loss": 3.346949085297838, + "tokens_seen": 2711493632 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3027278, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.664949655532837, + "objective/train/theoretical_loss": 3.3469443241474877, + "objective/train/tokens_used": 2732002784, + "theoretical_loss": 3.3469443241474877, + "tokens_seen": 2711542784 + }, + { + "epoch": 9.01, + "learning_rate": 9.020060180541626e-05, + "loss": 2.5917, + "theoretical_loss": 3.34694273712192, + "tokens_seen": 2711559168 + }, + { + "epoch": 9.01, + "learning_rate": 9.019057171514544e-05, + "loss": 2.5, + "theoretical_loss": 3.3469363891423884, + "tokens_seen": 2711624704 + }, + { + "epoch": 9.01, + "learning_rate": 9.018054162487462e-05, + "loss": 2.5995, + "theoretical_loss": 3.3469300413592338, + "tokens_seen": 2711690240 + }, + { + "epoch": 9.01, + "learning_rate": 9.017051153460382e-05, + "loss": 2.681, + "theoretical_loss": 3.3469236937724443, + "tokens_seen": 2711755776 + }, + { + "epoch": 9.01, + "learning_rate": 9.0160481444333e-05, + "loss": 2.594, + "theoretical_loss": 3.346917346382009, + "tokens_seen": 2711821312 + }, + { + "epoch": 9.01, + "learning_rate": 9.01504513540622e-05, + "loss": 2.4501, + "theoretical_loss": 3.3469109991879185, + "tokens_seen": 2711886848 + }, + { + "epoch": 9.01, + "learning_rate": 9.014042126379138e-05, + "loss": 2.4998, + "theoretical_loss": 3.3469046521901604, + "tokens_seen": 2711952384 + }, + { + "epoch": 9.01, + "learning_rate": 9.013039117352057e-05, + "loss": 2.222, + "theoretical_loss": 3.346898305388725, + "tokens_seen": 2712017920 + }, + { + "epoch": 9.01, + "learning_rate": 9.012036108324975e-05, + "loss": 2.7102, + "theoretical_loss": 3.346891958783601, + "tokens_seen": 2712083456 + }, + { + "epoch": 9.01, + "learning_rate": 9.011033099297895e-05, + "loss": 2.5579, + "theoretical_loss": 3.346885612374777, + "tokens_seen": 2712148992 + }, + { + "epoch": 9.01, + "learning_rate": 9.010030090270813e-05, + "loss": 2.4186, + "theoretical_loss": 3.3468792661622433, + "tokens_seen": 2712214528 + }, + { + "epoch": 9.01, + "learning_rate": 9.009027081243731e-05, + "loss": 2.3501, + "theoretical_loss": 3.346872920145988, + "tokens_seen": 2712280064 + }, + { + "epoch": 9.01, + "learning_rate": 9.00802407221665e-05, + "loss": 2.3832, + "theoretical_loss": 3.3468665743260013, + "tokens_seen": 2712345600 + }, + { + "epoch": 9.01, + "learning_rate": 9.007021063189568e-05, + "loss": 2.4072, + "theoretical_loss": 3.346860228702272, + "tokens_seen": 2712411136 + }, + { + "epoch": 9.01, + "learning_rate": 9.006018054162488e-05, + "loss": 2.5392, + "theoretical_loss": 3.346853883274789, + "tokens_seen": 2712476672 + }, + { + "epoch": 9.01, + "learning_rate": 9.005015045135406e-05, + "loss": 2.3604, + "theoretical_loss": 3.346847538043542, + "tokens_seen": 2712542208 + }, + { + "epoch": 9.01, + "learning_rate": 9.004012036108325e-05, + "loss": 2.3436, + "theoretical_loss": 3.34684119300852, + "tokens_seen": 2712607744 + }, + { + "epoch": 9.01, + "learning_rate": 9.003009027081244e-05, + "loss": 2.4144, + "theoretical_loss": 3.3468348481697117, + "tokens_seen": 2712673280 + }, + { + "epoch": 9.01, + "learning_rate": 9.002006018054163e-05, + "loss": 2.3782, + "theoretical_loss": 3.346828503527107, + "tokens_seen": 2712738816 + }, + { + "epoch": 9.01, + "learning_rate": 9.001003009027081e-05, + "loss": 2.3594, + "theoretical_loss": 3.3468221590806944, + "tokens_seen": 2712804352 + }, + { + "epoch": 9.01, + "learning_rate": 8.999999999999999e-05, + "loss": 2.2032, + "theoretical_loss": 3.346815814830464, + "tokens_seen": 2712869888 + }, + { + "epoch": 9.01, + "learning_rate": 8.998996990972919e-05, + "loss": 2.4264, + "theoretical_loss": 3.346809470776404, + "tokens_seen": 2712935424 + }, + { + "epoch": 9.01, + "learning_rate": 8.997993981945837e-05, + "loss": 2.6598, + "theoretical_loss": 3.346803126918504, + "tokens_seen": 2713000960 + }, + { + "epoch": 9.01, + "learning_rate": 8.996990972918756e-05, + "loss": 2.3309, + "theoretical_loss": 3.3467967832567536, + "tokens_seen": 2713066496 + }, + { + "epoch": 9.01, + "learning_rate": 8.995987963891674e-05, + "loss": 2.5602, + "theoretical_loss": 3.3467904397911417, + "tokens_seen": 2713132032 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3028099, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.017669439315796, + "objective/train/theoretical_loss": 3.346785682320642, + "objective/train/tokens_used": 2733641184, + "theoretical_loss": 3.346785682320642, + "tokens_seen": 2713181184 + }, + { + "epoch": 9.01, + "learning_rate": 8.994984954864594e-05, + "loss": 2.231, + "theoretical_loss": 3.3467840965216573, + "tokens_seen": 2713197568 + }, + { + "epoch": 9.01, + "learning_rate": 8.993981945837512e-05, + "loss": 2.2284, + "theoretical_loss": 3.34677775344829, + "tokens_seen": 2713263104 + }, + { + "epoch": 9.01, + "learning_rate": 8.992978936810431e-05, + "loss": 2.3132, + "theoretical_loss": 3.3467714105710282, + "tokens_seen": 2713328640 + }, + { + "epoch": 9.01, + "learning_rate": 8.99197592778335e-05, + "loss": 2.4309, + "theoretical_loss": 3.346765067889862, + "tokens_seen": 2713394176 + }, + { + "epoch": 9.01, + "learning_rate": 8.990972918756269e-05, + "loss": 2.4914, + "theoretical_loss": 3.34675872540478, + "tokens_seen": 2713459712 + }, + { + "epoch": 9.01, + "learning_rate": 8.989969909729188e-05, + "loss": 2.4282, + "theoretical_loss": 3.3467523831157724, + "tokens_seen": 2713525248 + }, + { + "epoch": 9.01, + "learning_rate": 8.988966900702107e-05, + "loss": 2.4858, + "theoretical_loss": 3.3467460410228274, + "tokens_seen": 2713590784 + }, + { + "epoch": 9.01, + "learning_rate": 8.987963891675026e-05, + "loss": 2.407, + "theoretical_loss": 3.346739699125934, + "tokens_seen": 2713656320 + }, + { + "epoch": 9.01, + "learning_rate": 8.986960882647944e-05, + "loss": 2.5352, + "theoretical_loss": 3.346733357425082, + "tokens_seen": 2713721856 + }, + { + "epoch": 9.01, + "learning_rate": 8.985957873620864e-05, + "loss": 2.4956, + "theoretical_loss": 3.3467270159202607, + "tokens_seen": 2713787392 + }, + { + "epoch": 9.01, + "learning_rate": 8.984954864593782e-05, + "loss": 2.5924, + "theoretical_loss": 3.346720674611459, + "tokens_seen": 2713852928 + }, + { + "epoch": 9.01, + "learning_rate": 8.983951855566701e-05, + "loss": 2.5222, + "theoretical_loss": 3.3467143334986664, + "tokens_seen": 2713918464 + }, + { + "epoch": 9.01, + "learning_rate": 8.982948846539619e-05, + "loss": 2.5752, + "theoretical_loss": 3.3467079925818717, + "tokens_seen": 2713984000 + }, + { + "epoch": 9.01, + "learning_rate": 8.981945837512539e-05, + "loss": 2.6224, + "theoretical_loss": 3.346701651861064, + "tokens_seen": 2714049536 + }, + { + "epoch": 9.01, + "learning_rate": 8.980942828485457e-05, + "loss": 2.4379, + "theoretical_loss": 3.3466953113362337, + "tokens_seen": 2714115072 + }, + { + "epoch": 9.01, + "learning_rate": 8.979939819458375e-05, + "loss": 2.5069, + "theoretical_loss": 3.346688971007368, + "tokens_seen": 2714180608 + }, + { + "epoch": 9.01, + "learning_rate": 8.978936810431294e-05, + "loss": 2.4782, + "theoretical_loss": 3.346682630874458, + "tokens_seen": 2714246144 + }, + { + "epoch": 9.01, + "learning_rate": 8.977933801404213e-05, + "loss": 2.3177, + "theoretical_loss": 3.3466762909374923, + "tokens_seen": 2714311680 + }, + { + "epoch": 9.01, + "learning_rate": 8.976930792377132e-05, + "loss": 2.6135, + "theoretical_loss": 3.3466699511964597, + "tokens_seen": 2714377216 + }, + { + "epoch": 9.01, + "learning_rate": 8.97592778335005e-05, + "loss": 2.6173, + "theoretical_loss": 3.3466636116513495, + "tokens_seen": 2714442752 + }, + { + "epoch": 9.01, + "learning_rate": 8.97492477432297e-05, + "loss": 2.4825, + "theoretical_loss": 3.346657272302151, + "tokens_seen": 2714508288 + }, + { + "epoch": 9.01, + "learning_rate": 8.973921765295888e-05, + "loss": 2.2932, + "theoretical_loss": 3.346650933148854, + "tokens_seen": 2714573824 + }, + { + "epoch": 9.01, + "learning_rate": 8.972918756268807e-05, + "loss": 2.5247, + "theoretical_loss": 3.346644594191447, + "tokens_seen": 2714639360 + }, + { + "epoch": 9.01, + "learning_rate": 8.971915747241725e-05, + "loss": 2.6085, + "theoretical_loss": 3.3466382554299194, + "tokens_seen": 2714704896 + }, + { + "epoch": 9.01, + "learning_rate": 8.970912738214643e-05, + "loss": 2.3835, + "theoretical_loss": 3.3466319168642604, + "tokens_seen": 2714770432 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3029601, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3096182346343994, + "objective/train/theoretical_loss": 3.3466271630685487, + "objective/train/tokens_used": 2735279584, + "theoretical_loss": 3.3466271630685487, + "tokens_seen": 2714819584 + }, + { + "epoch": 9.01, + "learning_rate": 8.969909729187563e-05, + "loss": 2.5157, + "theoretical_loss": 3.34662557849446, + "tokens_seen": 2714835968 + }, + { + "epoch": 9.01, + "learning_rate": 8.968906720160481e-05, + "loss": 2.4051, + "theoretical_loss": 3.3466192403205057, + "tokens_seen": 2714901504 + }, + { + "epoch": 9.01, + "learning_rate": 8.9679037111334e-05, + "loss": 2.4477, + "theoretical_loss": 3.3466129023423883, + "tokens_seen": 2714967040 + }, + { + "epoch": 9.01, + "learning_rate": 8.966900702106319e-05, + "loss": 2.6251, + "theoretical_loss": 3.3466065645600964, + "tokens_seen": 2715032576 + }, + { + "epoch": 9.01, + "learning_rate": 8.965897693079238e-05, + "loss": 2.6258, + "theoretical_loss": 3.346600226973619, + "tokens_seen": 2715098112 + }, + { + "epoch": 9.01, + "learning_rate": 8.964894684052156e-05, + "loss": 2.3633, + "theoretical_loss": 3.346593889582946, + "tokens_seen": 2715163648 + }, + { + "epoch": 9.01, + "learning_rate": 8.963891675025076e-05, + "loss": 2.529, + "theoretical_loss": 3.346587552388066, + "tokens_seen": 2715229184 + }, + { + "epoch": 9.01, + "learning_rate": 8.962888665997994e-05, + "loss": 2.6397, + "theoretical_loss": 3.3465812153889685, + "tokens_seen": 2715294720 + }, + { + "epoch": 9.01, + "learning_rate": 8.961885656970913e-05, + "loss": 2.5257, + "theoretical_loss": 3.3465748785856424, + "tokens_seen": 2715360256 + }, + { + "epoch": 9.01, + "learning_rate": 8.960882647943831e-05, + "loss": 2.4284, + "theoretical_loss": 3.3465685419780775, + "tokens_seen": 2715425792 + }, + { + "epoch": 9.01, + "learning_rate": 8.95987963891675e-05, + "loss": 2.4656, + "theoretical_loss": 3.3465622055662627, + "tokens_seen": 2715491328 + }, + { + "epoch": 9.01, + "learning_rate": 8.958876629889669e-05, + "loss": 2.5655, + "theoretical_loss": 3.3465558693501873, + "tokens_seen": 2715556864 + }, + { + "epoch": 9.01, + "learning_rate": 8.957873620862587e-05, + "loss": 2.4171, + "theoretical_loss": 3.3465495333298403, + "tokens_seen": 2715622400 + }, + { + "epoch": 9.01, + "learning_rate": 8.956870611835506e-05, + "loss": 2.5261, + "theoretical_loss": 3.3465431975052113, + "tokens_seen": 2715687936 + }, + { + "epoch": 9.01, + "learning_rate": 8.955867602808425e-05, + "loss": 2.4534, + "theoretical_loss": 3.3465368618762894, + "tokens_seen": 2715753472 + }, + { + "epoch": 9.01, + "learning_rate": 8.954864593781344e-05, + "loss": 2.49, + "theoretical_loss": 3.346530526443064, + "tokens_seen": 2715819008 + }, + { + "epoch": 9.01, + "learning_rate": 8.953861584754262e-05, + "loss": 2.4273, + "theoretical_loss": 3.3465241912055235, + "tokens_seen": 2715884544 + }, + { + "epoch": 9.01, + "learning_rate": 8.952858575727182e-05, + "loss": 2.4863, + "theoretical_loss": 3.346517856163658, + "tokens_seen": 2715950080 + }, + { + "epoch": 9.01, + "learning_rate": 8.951855566700101e-05, + "loss": 2.5458, + "theoretical_loss": 3.3465115213174568, + "tokens_seen": 2716015616 + }, + { + "epoch": 9.01, + "learning_rate": 8.950852557673019e-05, + "loss": 2.5614, + "theoretical_loss": 3.3465051866669087, + "tokens_seen": 2716081152 + }, + { + "epoch": 9.01, + "learning_rate": 8.949849548645939e-05, + "loss": 2.4884, + "theoretical_loss": 3.3464988522120027, + "tokens_seen": 2716146688 + }, + { + "epoch": 9.01, + "learning_rate": 8.948846539618857e-05, + "loss": 2.5859, + "theoretical_loss": 3.3464925179527287, + "tokens_seen": 2716212224 + }, + { + "epoch": 9.01, + "learning_rate": 8.947843530591776e-05, + "loss": 2.3992, + "theoretical_loss": 3.3464861838890756, + "tokens_seen": 2716277760 + }, + { + "epoch": 9.01, + "learning_rate": 8.946840521564694e-05, + "loss": 2.543, + "theoretical_loss": 3.3464798500210327, + "tokens_seen": 2716343296 + }, + { + "epoch": 9.01, + "learning_rate": 8.945837512537614e-05, + "loss": 2.293, + "theoretical_loss": 3.3464735163485892, + "tokens_seen": 2716408832 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3030296, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.225619316101074, + "objective/train/theoretical_loss": 3.3464687662226122, + "objective/train/tokens_used": 2736917984, + "theoretical_loss": 3.3464687662226122, + "tokens_seen": 2716457984 + }, + { + "epoch": 9.01, + "learning_rate": 8.944834503510532e-05, + "loss": 2.4554, + "theoretical_loss": 3.3464671828717343, + "tokens_seen": 2716474368 + }, + { + "epoch": 9.01, + "learning_rate": 8.943831494483451e-05, + "loss": 2.4269, + "theoretical_loss": 3.346460849590457, + "tokens_seen": 2716539904 + }, + { + "epoch": 9.01, + "learning_rate": 8.94282848545637e-05, + "loss": 2.2695, + "theoretical_loss": 3.3464545165047475, + "tokens_seen": 2716605440 + }, + { + "epoch": 9.01, + "learning_rate": 8.941825476429289e-05, + "loss": 2.4795, + "theoretical_loss": 3.3464481836145943, + "tokens_seen": 2716670976 + }, + { + "epoch": 9.01, + "learning_rate": 8.940822467402207e-05, + "loss": 2.5648, + "theoretical_loss": 3.346441850919986, + "tokens_seen": 2716736512 + }, + { + "epoch": 9.01, + "learning_rate": 8.939819458375125e-05, + "loss": 2.356, + "theoretical_loss": 3.3464355184209134, + "tokens_seen": 2716802048 + }, + { + "epoch": 9.01, + "learning_rate": 8.938816449348045e-05, + "loss": 2.3733, + "theoretical_loss": 3.3464291861173643, + "tokens_seen": 2716867584 + }, + { + "epoch": 9.01, + "learning_rate": 8.937813440320963e-05, + "loss": 2.3347, + "theoretical_loss": 3.346422854009329, + "tokens_seen": 2716933120 + }, + { + "epoch": 9.01, + "learning_rate": 8.936810431293882e-05, + "loss": 2.5608, + "theoretical_loss": 3.346416522096796, + "tokens_seen": 2716998656 + }, + { + "epoch": 9.01, + "learning_rate": 8.9358074222668e-05, + "loss": 2.531, + "theoretical_loss": 3.346410190379755, + "tokens_seen": 2717064192 + }, + { + "epoch": 9.01, + "learning_rate": 8.93480441323972e-05, + "loss": 2.3142, + "theoretical_loss": 3.3464038588581952, + "tokens_seen": 2717129728 + }, + { + "epoch": 9.01, + "learning_rate": 8.933801404212638e-05, + "loss": 2.5781, + "theoretical_loss": 3.3463975275321056, + "tokens_seen": 2717195264 + }, + { + "epoch": 9.01, + "learning_rate": 8.932798395185557e-05, + "loss": 2.4848, + "theoretical_loss": 3.3463911964014756, + "tokens_seen": 2717260800 + }, + { + "epoch": 9.01, + "learning_rate": 8.931795386158476e-05, + "loss": 2.3417, + "theoretical_loss": 3.3463848654662947, + "tokens_seen": 2717326336 + }, + { + "epoch": 9.01, + "learning_rate": 8.930792377131394e-05, + "loss": 2.3035, + "theoretical_loss": 3.3463785347265516, + "tokens_seen": 2717391872 + }, + { + "epoch": 9.01, + "learning_rate": 8.929789368104313e-05, + "loss": 2.4269, + "theoretical_loss": 3.346372204182236, + "tokens_seen": 2717457408 + }, + { + "epoch": 9.01, + "learning_rate": 8.928786359077231e-05, + "loss": 2.4688, + "theoretical_loss": 3.346365873833337, + "tokens_seen": 2717522944 + }, + { + "epoch": 9.01, + "learning_rate": 8.927783350050151e-05, + "loss": 2.3332, + "theoretical_loss": 3.346359543679844, + "tokens_seen": 2717588480 + }, + { + "epoch": 9.01, + "learning_rate": 8.926780341023069e-05, + "loss": 2.5477, + "theoretical_loss": 3.3463532137217458, + "tokens_seen": 2717654016 + }, + { + "epoch": 9.01, + "learning_rate": 8.925777331995988e-05, + "loss": 2.4139, + "theoretical_loss": 3.3463468839590327, + "tokens_seen": 2717719552 + }, + { + "epoch": 9.01, + "learning_rate": 8.924774322968906e-05, + "loss": 2.521, + "theoretical_loss": 3.3463405543916926, + "tokens_seen": 2717785088 + }, + { + "epoch": 9.01, + "learning_rate": 8.923771313941826e-05, + "loss": 2.4618, + "theoretical_loss": 3.3463342250197154, + "tokens_seen": 2717850624 + }, + { + "epoch": 9.01, + "learning_rate": 8.922768304914744e-05, + "loss": 2.5078, + "theoretical_loss": 3.346327895843091, + "tokens_seen": 2717916160 + }, + { + "epoch": 9.01, + "learning_rate": 8.921765295887662e-05, + "loss": 2.5378, + "theoretical_loss": 3.3463215668618074, + "tokens_seen": 2717981696 + }, + { + "epoch": 9.01, + "learning_rate": 8.920762286860582e-05, + "loss": 2.4536, + "theoretical_loss": 3.3463152380758547, + "tokens_seen": 2718047232 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3031593, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5399951934814453, + "objective/train/theoretical_loss": 3.3463104916145694, + "objective/train/tokens_used": 2738556384, + "theoretical_loss": 3.3463104916145694, + "tokens_seen": 2718096384 + }, + { + "epoch": 9.01, + "learning_rate": 8.9197592778335e-05, + "loss": 2.2788, + "theoretical_loss": 3.346308909485222, + "tokens_seen": 2718112768 + }, + { + "epoch": 9.01, + "learning_rate": 8.918756268806419e-05, + "loss": 2.3932, + "theoretical_loss": 3.3463025810898985, + "tokens_seen": 2718178304 + }, + { + "epoch": 9.01, + "learning_rate": 8.917753259779337e-05, + "loss": 2.3201, + "theoretical_loss": 3.346296252889873, + "tokens_seen": 2718243840 + }, + { + "epoch": 9.01, + "learning_rate": 8.916750250752257e-05, + "loss": 2.5007, + "theoretical_loss": 3.346289924885136, + "tokens_seen": 2718309376 + }, + { + "epoch": 9.01, + "learning_rate": 8.915747241725175e-05, + "loss": 2.4995, + "theoretical_loss": 3.3462835970756757, + "tokens_seen": 2718374912 + }, + { + "epoch": 9.01, + "learning_rate": 8.914744232698096e-05, + "loss": 2.1284, + "theoretical_loss": 3.3462772694614817, + "tokens_seen": 2718440448 + }, + { + "epoch": 9.01, + "learning_rate": 8.913741223671014e-05, + "loss": 2.3525, + "theoretical_loss": 3.346270942042543, + "tokens_seen": 2718505984 + }, + { + "epoch": 9.01, + "learning_rate": 8.912738214643933e-05, + "loss": 2.3159, + "theoretical_loss": 3.3462646148188493, + "tokens_seen": 2718571520 + }, + { + "epoch": 9.01, + "learning_rate": 8.911735205616851e-05, + "loss": 2.4572, + "theoretical_loss": 3.34625828779039, + "tokens_seen": 2718637056 + }, + { + "epoch": 9.01, + "learning_rate": 8.91073219658977e-05, + "loss": 2.4231, + "theoretical_loss": 3.3462519609571535, + "tokens_seen": 2718702592 + }, + { + "epoch": 9.01, + "learning_rate": 8.909729187562689e-05, + "loss": 2.566, + "theoretical_loss": 3.34624563431913, + "tokens_seen": 2718768128 + }, + { + "epoch": 9.01, + "learning_rate": 8.908726178535607e-05, + "loss": 2.3595, + "theoretical_loss": 3.346239307876308, + "tokens_seen": 2718833664 + }, + { + "epoch": 9.01, + "learning_rate": 8.907723169508527e-05, + "loss": 2.2927, + "theoretical_loss": 3.3462329816286775, + "tokens_seen": 2718899200 + }, + { + "epoch": 9.01, + "learning_rate": 8.906720160481445e-05, + "loss": 2.3046, + "theoretical_loss": 3.3462266555762277, + "tokens_seen": 2718964736 + }, + { + "epoch": 9.01, + "learning_rate": 8.905717151454364e-05, + "loss": 2.3478, + "theoretical_loss": 3.346220329718947, + "tokens_seen": 2719030272 + }, + { + "epoch": 9.01, + "learning_rate": 8.904714142427282e-05, + "loss": 2.3898, + "theoretical_loss": 3.3462140040568253, + "tokens_seen": 2719095808 + }, + { + "epoch": 9.01, + "learning_rate": 8.903711133400202e-05, + "loss": 2.4567, + "theoretical_loss": 3.3462076785898525, + "tokens_seen": 2719161344 + }, + { + "epoch": 9.01, + "learning_rate": 8.90270812437312e-05, + "loss": 2.3752, + "theoretical_loss": 3.3462013533180164, + "tokens_seen": 2719226880 + }, + { + "epoch": 9.01, + "learning_rate": 8.901705115346038e-05, + "loss": 2.5012, + "theoretical_loss": 3.3461950282413078, + "tokens_seen": 2719292416 + }, + { + "epoch": 9.01, + "learning_rate": 8.900702106318957e-05, + "loss": 2.5094, + "theoretical_loss": 3.346188703359715, + "tokens_seen": 2719357952 + }, + { + "epoch": 9.01, + "learning_rate": 8.899699097291875e-05, + "loss": 2.4932, + "theoretical_loss": 3.3461823786732277, + "tokens_seen": 2719423488 + }, + { + "epoch": 9.01, + "learning_rate": 8.898696088264795e-05, + "loss": 2.3301, + "theoretical_loss": 3.346176054181835, + "tokens_seen": 2719489024 + }, + { + "epoch": 9.01, + "learning_rate": 8.897693079237713e-05, + "loss": 2.6792, + "theoretical_loss": 3.346169729885526, + "tokens_seen": 2719554560 + }, + { + "epoch": 9.01, + "learning_rate": 8.896690070210633e-05, + "loss": 2.2836, + "theoretical_loss": 3.3461634057842904, + "tokens_seen": 2719620096 + }, + { + "epoch": 9.01, + "learning_rate": 8.89568706118355e-05, + "loss": 2.4883, + "theoretical_loss": 3.3461570818781174, + "tokens_seen": 2719685632 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 3032285, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6351773738861084, + "objective/train/theoretical_loss": 3.3461523390764905, + "objective/train/tokens_used": 2740194784, + "theoretical_loss": 3.3461523390764905, + "tokens_seen": 2719734784 + }, + { + "epoch": 9.01, + "learning_rate": 8.89468405215647e-05, + "loss": 2.4932, + "theoretical_loss": 3.346150758166996, + "tokens_seen": 2719751168 + }, + { + "epoch": 9.01, + "learning_rate": 8.893681043129388e-05, + "loss": 2.4323, + "theoretical_loss": 3.3461444346509155, + "tokens_seen": 2719816704 + }, + { + "epoch": 9.01, + "learning_rate": 8.892678034102306e-05, + "loss": 2.6189, + "theoretical_loss": 3.346138111329866, + "tokens_seen": 2719882240 + }, + { + "epoch": 9.01, + "learning_rate": 8.891675025075226e-05, + "loss": 2.4015, + "theoretical_loss": 3.3461317882038353, + "tokens_seen": 2719947776 + }, + { + "epoch": 9.01, + "learning_rate": 8.890672016048144e-05, + "loss": 2.237, + "theoretical_loss": 3.346125465272814, + "tokens_seen": 2720013312 + }, + { + "epoch": 9.01, + "learning_rate": 8.889669007021063e-05, + "loss": 2.5683, + "theoretical_loss": 3.346119142536791, + "tokens_seen": 2720078848 + }, + { + "epoch": 9.01, + "learning_rate": 8.888665997993981e-05, + "loss": 2.5468, + "theoretical_loss": 3.346112819995755, + "tokens_seen": 2720144384 + }, + { + "epoch": 9.01, + "learning_rate": 8.887662988966901e-05, + "loss": 2.2924, + "theoretical_loss": 3.3461064976496964, + "tokens_seen": 2720209920 + }, + { + "epoch": 9.01, + "learning_rate": 8.886659979939819e-05, + "loss": 2.445, + "theoretical_loss": 3.3461001754986035, + "tokens_seen": 2720275456 + }, + { + "epoch": 9.01, + "learning_rate": 8.885656970912739e-05, + "loss": 2.5636, + "theoretical_loss": 3.346093853542466, + "tokens_seen": 2720340992 + }, + { + "epoch": 9.02, + "learning_rate": 8.884653961885657e-05, + "loss": 2.4154, + "theoretical_loss": 3.3460875317812735, + "tokens_seen": 2720406528 + }, + { + "epoch": 9.02, + "learning_rate": 8.883650952858576e-05, + "loss": 2.3037, + "theoretical_loss": 3.3460812102150146, + "tokens_seen": 2720472064 + }, + { + "epoch": 9.02, + "learning_rate": 8.882647943831494e-05, + "loss": 2.3153, + "theoretical_loss": 3.346074888843679, + "tokens_seen": 2720537600 + }, + { + "epoch": 9.02, + "learning_rate": 8.881644934804412e-05, + "loss": 2.4399, + "theoretical_loss": 3.3460685676672557, + "tokens_seen": 2720603136 + }, + { + "epoch": 9.02, + "learning_rate": 8.880641925777332e-05, + "loss": 2.4557, + "theoretical_loss": 3.346062246685735, + "tokens_seen": 2720668672 + }, + { + "epoch": 9.02, + "learning_rate": 8.87963891675025e-05, + "loss": 2.4132, + "theoretical_loss": 3.3460559258991047, + "tokens_seen": 2720734208 + }, + { + "epoch": 9.02, + "learning_rate": 8.87863590772317e-05, + "loss": 2.564, + "theoretical_loss": 3.3460496053073547, + "tokens_seen": 2720799744 + }, + { + "epoch": 9.02, + "learning_rate": 8.877632898696087e-05, + "loss": 2.5701, + "theoretical_loss": 3.3460432849104746, + "tokens_seen": 2720865280 + }, + { + "epoch": 9.02, + "learning_rate": 8.876629889669008e-05, + "loss": 2.4568, + "theoretical_loss": 3.346036964708454, + "tokens_seen": 2720930816 + }, + { + "epoch": 9.02, + "learning_rate": 8.875626880641926e-05, + "loss": 2.4847, + "theoretical_loss": 3.3460306447012815, + "tokens_seen": 2720996352 + }, + { + "epoch": 9.02, + "learning_rate": 8.874623871614846e-05, + "loss": 2.3425, + "theoretical_loss": 3.3460243248889463, + "tokens_seen": 2721061888 + }, + { + "epoch": 9.02, + "learning_rate": 8.873620862587764e-05, + "loss": 2.2386, + "theoretical_loss": 3.3460180052714383, + "tokens_seen": 2721127424 + }, + { + "epoch": 9.02, + "learning_rate": 8.872617853560682e-05, + "loss": 2.393, + "theoretical_loss": 3.3460116858487465, + "tokens_seen": 2721192960 + }, + { + "epoch": 9.02, + "learning_rate": 8.871614844533602e-05, + "loss": 2.304, + "theoretical_loss": 3.3460053666208602, + "tokens_seen": 2721258496 + }, + { + "epoch": 9.02, + "learning_rate": 8.87061183550652e-05, + "loss": 2.3945, + "theoretical_loss": 3.345999047587769, + "tokens_seen": 2721324032 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3033905, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0698394775390625, + "objective/train/theoretical_loss": 3.345994308440778, + "objective/train/tokens_used": 2741833184, + "theoretical_loss": 3.345994308440778, + "tokens_seen": 2721373184 + }, + { + "epoch": 9.02, + "learning_rate": 8.869608826479439e-05, + "loss": 2.4059, + "theoretical_loss": 3.3459927287494615, + "tokens_seen": 2721389568 + }, + { + "epoch": 9.02, + "learning_rate": 8.868605817452357e-05, + "loss": 2.4346, + "theoretical_loss": 3.3459864101059273, + "tokens_seen": 2721455104 + }, + { + "epoch": 9.02, + "learning_rate": 8.867602808425277e-05, + "loss": 2.4229, + "theoretical_loss": 3.345980091657156, + "tokens_seen": 2721520640 + }, + { + "epoch": 9.02, + "learning_rate": 8.866599799398195e-05, + "loss": 2.6494, + "theoretical_loss": 3.345973773403137, + "tokens_seen": 2721586176 + }, + { + "epoch": 9.02, + "learning_rate": 8.865596790371114e-05, + "loss": 2.3349, + "theoretical_loss": 3.3459674553438594, + "tokens_seen": 2721651712 + }, + { + "epoch": 9.02, + "learning_rate": 8.864593781344032e-05, + "loss": 2.4173, + "theoretical_loss": 3.3459611374793123, + "tokens_seen": 2721717248 + }, + { + "epoch": 9.02, + "learning_rate": 8.86359077231695e-05, + "loss": 2.4095, + "theoretical_loss": 3.345954819809485, + "tokens_seen": 2721782784 + }, + { + "epoch": 9.02, + "learning_rate": 8.86258776328987e-05, + "loss": 2.4723, + "theoretical_loss": 3.3459485023343674, + "tokens_seen": 2721848320 + }, + { + "epoch": 9.02, + "learning_rate": 8.861584754262788e-05, + "loss": 2.3486, + "theoretical_loss": 3.345942185053948, + "tokens_seen": 2721913856 + }, + { + "epoch": 9.02, + "learning_rate": 8.860581745235708e-05, + "loss": 2.2855, + "theoretical_loss": 3.3459358679682167, + "tokens_seen": 2721979392 + }, + { + "epoch": 9.02, + "learning_rate": 8.859578736208626e-05, + "loss": 2.246, + "theoretical_loss": 3.3459295510771625, + "tokens_seen": 2722044928 + }, + { + "epoch": 9.02, + "learning_rate": 8.858575727181545e-05, + "loss": 2.3922, + "theoretical_loss": 3.345923234380775, + "tokens_seen": 2722110464 + }, + { + "epoch": 9.02, + "learning_rate": 8.857572718154463e-05, + "loss": 2.3476, + "theoretical_loss": 3.3459169178790438, + "tokens_seen": 2722176000 + }, + { + "epoch": 9.02, + "learning_rate": 8.856569709127383e-05, + "loss": 2.6362, + "theoretical_loss": 3.345910601571957, + "tokens_seen": 2722241536 + }, + { + "epoch": 9.02, + "learning_rate": 8.855566700100301e-05, + "loss": 2.5481, + "theoretical_loss": 3.345904285459505, + "tokens_seen": 2722307072 + }, + { + "epoch": 9.02, + "learning_rate": 8.85456369107322e-05, + "loss": 2.5078, + "theoretical_loss": 3.3458979695416766, + "tokens_seen": 2722372608 + }, + { + "epoch": 9.02, + "learning_rate": 8.853560682046138e-05, + "loss": 2.3407, + "theoretical_loss": 3.3458916538184615, + "tokens_seen": 2722438144 + }, + { + "epoch": 9.02, + "learning_rate": 8.852557673019057e-05, + "loss": 2.4986, + "theoretical_loss": 3.345885338289849, + "tokens_seen": 2722503680 + }, + { + "epoch": 9.02, + "learning_rate": 8.851554663991976e-05, + "loss": 2.3408, + "theoretical_loss": 3.345879022955828, + "tokens_seen": 2722569216 + }, + { + "epoch": 9.02, + "learning_rate": 8.850551654964894e-05, + "loss": 2.1757, + "theoretical_loss": 3.345872707816388, + "tokens_seen": 2722634752 + }, + { + "epoch": 9.02, + "learning_rate": 8.849548645937814e-05, + "loss": 2.5375, + "theoretical_loss": 3.3458663928715184, + "tokens_seen": 2722700288 + }, + { + "epoch": 9.02, + "learning_rate": 8.848545636910732e-05, + "loss": 2.3659, + "theoretical_loss": 3.345860078121209, + "tokens_seen": 2722765824 + }, + { + "epoch": 9.02, + "learning_rate": 8.847542627883651e-05, + "loss": 2.6168, + "theoretical_loss": 3.345853763565448, + "tokens_seen": 2722831360 + }, + { + "epoch": 9.02, + "learning_rate": 8.846539618856569e-05, + "loss": 2.4486, + "theoretical_loss": 3.345847449204226, + "tokens_seen": 2722896896 + }, + { + "epoch": 9.02, + "learning_rate": 8.845536609829489e-05, + "loss": 2.45, + "theoretical_loss": 3.3458411350375314, + "tokens_seen": 2722962432 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3034626, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.963073968887329, + "objective/train/theoretical_loss": 3.345836399540163, + "objective/train/tokens_used": 2743471584, + "theoretical_loss": 3.345836399540163, + "tokens_seen": 2723011584 + }, + { + "epoch": 9.02, + "learning_rate": 8.844533600802407e-05, + "loss": 2.5032, + "theoretical_loss": 3.345834821065354, + "tokens_seen": 2723027968 + }, + { + "epoch": 9.02, + "learning_rate": 8.843530591775325e-05, + "loss": 2.6076, + "theoretical_loss": 3.3458285072876826, + "tokens_seen": 2723093504 + }, + { + "epoch": 9.02, + "learning_rate": 8.842527582748244e-05, + "loss": 2.5869, + "theoretical_loss": 3.3458221937045067, + "tokens_seen": 2723159040 + }, + { + "epoch": 9.02, + "learning_rate": 8.841524573721163e-05, + "loss": 2.3971, + "theoretical_loss": 3.3458158803158162, + "tokens_seen": 2723224576 + }, + { + "epoch": 9.02, + "learning_rate": 8.840521564694082e-05, + "loss": 2.4375, + "theoretical_loss": 3.3458095671216004, + "tokens_seen": 2723290112 + }, + { + "epoch": 9.02, + "learning_rate": 8.839518555667001e-05, + "loss": 2.2837, + "theoretical_loss": 3.3458032541218476, + "tokens_seen": 2723355648 + }, + { + "epoch": 9.02, + "learning_rate": 8.838515546639921e-05, + "loss": 2.5032, + "theoretical_loss": 3.345796941316548, + "tokens_seen": 2723421184 + }, + { + "epoch": 9.02, + "learning_rate": 8.837512537612839e-05, + "loss": 2.5264, + "theoretical_loss": 3.3457906287056907, + "tokens_seen": 2723486720 + }, + { + "epoch": 9.02, + "learning_rate": 8.836509528585759e-05, + "loss": 2.6092, + "theoretical_loss": 3.345784316289265, + "tokens_seen": 2723552256 + }, + { + "epoch": 9.02, + "learning_rate": 8.835506519558677e-05, + "loss": 2.142, + "theoretical_loss": 3.3457780040672604, + "tokens_seen": 2723617792 + }, + { + "epoch": 9.02, + "learning_rate": 8.834503510531596e-05, + "loss": 2.3826, + "theoretical_loss": 3.345771692039666, + "tokens_seen": 2723683328 + }, + { + "epoch": 9.02, + "learning_rate": 8.833500501504514e-05, + "loss": 2.4013, + "theoretical_loss": 3.3457653802064713, + "tokens_seen": 2723748864 + }, + { + "epoch": 9.02, + "learning_rate": 8.832497492477432e-05, + "loss": 2.2591, + "theoretical_loss": 3.3457590685676655, + "tokens_seen": 2723814400 + }, + { + "epoch": 9.02, + "learning_rate": 8.831494483450352e-05, + "loss": 2.4184, + "theoretical_loss": 3.3457527571232384, + "tokens_seen": 2723879936 + }, + { + "epoch": 9.02, + "learning_rate": 8.83049147442327e-05, + "loss": 2.5094, + "theoretical_loss": 3.3457464458731785, + "tokens_seen": 2723945472 + }, + { + "epoch": 9.02, + "learning_rate": 8.82948846539619e-05, + "loss": 2.5085, + "theoretical_loss": 3.345740134817476, + "tokens_seen": 2724011008 + }, + { + "epoch": 9.02, + "learning_rate": 8.828485456369108e-05, + "loss": 2.3189, + "theoretical_loss": 3.3457338239561194, + "tokens_seen": 2724076544 + }, + { + "epoch": 9.02, + "learning_rate": 8.827482447342027e-05, + "loss": 2.5706, + "theoretical_loss": 3.345727513289099, + "tokens_seen": 2724142080 + }, + { + "epoch": 9.02, + "learning_rate": 8.826479438314945e-05, + "loss": 2.5929, + "theoretical_loss": 3.3457212028164034, + "tokens_seen": 2724207616 + }, + { + "epoch": 9.02, + "learning_rate": 8.825476429287865e-05, + "loss": 2.4153, + "theoretical_loss": 3.3457148925380222, + "tokens_seen": 2724273152 + }, + { + "epoch": 9.02, + "learning_rate": 8.824473420260783e-05, + "loss": 2.3619, + "theoretical_loss": 3.3457085824539443, + "tokens_seen": 2724338688 + }, + { + "epoch": 9.02, + "learning_rate": 8.823470411233701e-05, + "loss": 2.3245, + "theoretical_loss": 3.34570227256416, + "tokens_seen": 2724404224 + }, + { + "epoch": 9.02, + "learning_rate": 8.82246740220662e-05, + "loss": 2.601, + "theoretical_loss": 3.345695962868658, + "tokens_seen": 2724469760 + }, + { + "epoch": 9.02, + "learning_rate": 8.821464393179538e-05, + "loss": 2.6061, + "theoretical_loss": 3.3456896533674274, + "tokens_seen": 2724535296 + }, + { + "epoch": 9.02, + "learning_rate": 8.820461384152458e-05, + "loss": 2.3929, + "theoretical_loss": 3.345683344060458, + "tokens_seen": 2724600832 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3035895, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.7589055299758911, + "objective/train/theoretical_loss": 3.3456786122077085, + "objective/train/tokens_used": 2745109984, + "theoretical_loss": 3.3456786122077085, + "tokens_seen": 2724649984 + }, + { + "epoch": 9.02, + "learning_rate": 8.819458375125376e-05, + "loss": 2.2881, + "theoretical_loss": 3.345677034947739, + "tokens_seen": 2724666368 + }, + { + "epoch": 9.02, + "learning_rate": 8.818455366098295e-05, + "loss": 2.3022, + "theoretical_loss": 3.34567072602926, + "tokens_seen": 2724731904 + }, + { + "epoch": 9.02, + "learning_rate": 8.817452357071214e-05, + "loss": 2.1046, + "theoretical_loss": 3.3456644173050103, + "tokens_seen": 2724797440 + }, + { + "epoch": 9.02, + "learning_rate": 8.816449348044133e-05, + "loss": 2.5029, + "theoretical_loss": 3.3456581087749786, + "tokens_seen": 2724862976 + }, + { + "epoch": 9.02, + "learning_rate": 8.815446339017051e-05, + "loss": 2.2405, + "theoretical_loss": 3.345651800439155, + "tokens_seen": 2724928512 + }, + { + "epoch": 9.02, + "learning_rate": 8.814443329989969e-05, + "loss": 2.2078, + "theoretical_loss": 3.3456454922975283, + "tokens_seen": 2724994048 + }, + { + "epoch": 9.02, + "learning_rate": 8.813440320962889e-05, + "loss": 2.6171, + "theoretical_loss": 3.3456391843500883, + "tokens_seen": 2725059584 + }, + { + "epoch": 9.02, + "learning_rate": 8.812437311935807e-05, + "loss": 2.5739, + "theoretical_loss": 3.345632876596824, + "tokens_seen": 2725125120 + }, + { + "epoch": 9.02, + "learning_rate": 8.811434302908726e-05, + "loss": 2.2381, + "theoretical_loss": 3.3456265690377256, + "tokens_seen": 2725190656 + }, + { + "epoch": 9.02, + "learning_rate": 8.810431293881644e-05, + "loss": 2.4961, + "theoretical_loss": 3.345620261672781, + "tokens_seen": 2725256192 + }, + { + "epoch": 9.02, + "learning_rate": 8.809428284854564e-05, + "loss": 2.3015, + "theoretical_loss": 3.3456139545019807, + "tokens_seen": 2725321728 + }, + { + "epoch": 9.02, + "learning_rate": 8.808425275827482e-05, + "loss": 2.2388, + "theoretical_loss": 3.3456076475253136, + "tokens_seen": 2725387264 + }, + { + "epoch": 9.02, + "learning_rate": 8.807422266800401e-05, + "loss": 2.3501, + "theoretical_loss": 3.3456013407427694, + "tokens_seen": 2725452800 + }, + { + "epoch": 9.02, + "learning_rate": 8.80641925777332e-05, + "loss": 2.6902, + "theoretical_loss": 3.345595034154337, + "tokens_seen": 2725518336 + }, + { + "epoch": 9.02, + "learning_rate": 8.805416248746239e-05, + "loss": 2.3765, + "theoretical_loss": 3.345588727760006, + "tokens_seen": 2725583872 + }, + { + "epoch": 9.02, + "learning_rate": 8.804413239719157e-05, + "loss": 2.7276, + "theoretical_loss": 3.3455824215597656, + "tokens_seen": 2725649408 + }, + { + "epoch": 9.02, + "learning_rate": 8.803410230692075e-05, + "loss": 2.4665, + "theoretical_loss": 3.3455761155536052, + "tokens_seen": 2725714944 + }, + { + "epoch": 9.02, + "learning_rate": 8.802407221664996e-05, + "loss": 2.5525, + "theoretical_loss": 3.345569809741515, + "tokens_seen": 2725780480 + }, + { + "epoch": 9.02, + "learning_rate": 8.801404212637914e-05, + "loss": 2.394, + "theoretical_loss": 3.345563504123483, + "tokens_seen": 2725846016 + }, + { + "epoch": 9.02, + "learning_rate": 8.800401203610834e-05, + "loss": 2.442, + "theoretical_loss": 3.345557198699499, + "tokens_seen": 2725911552 + }, + { + "epoch": 9.02, + "learning_rate": 8.799398194583752e-05, + "loss": 2.4424, + "theoretical_loss": 3.345550893469553, + "tokens_seen": 2725977088 + }, + { + "epoch": 9.02, + "learning_rate": 8.798395185556671e-05, + "loss": 2.3502, + "theoretical_loss": 3.3455445884336337, + "tokens_seen": 2726042624 + }, + { + "epoch": 9.02, + "learning_rate": 8.797392176529589e-05, + "loss": 2.4644, + "theoretical_loss": 3.3455382835917304, + "tokens_seen": 2726108160 + }, + { + "epoch": 9.02, + "learning_rate": 8.796389167502509e-05, + "loss": 2.5218, + "theoretical_loss": 3.345531978943833, + "tokens_seen": 2726173696 + }, + { + "epoch": 9.02, + "learning_rate": 8.795386158475427e-05, + "loss": 2.4493, + "theoretical_loss": 3.3455256744899304, + "tokens_seen": 2726239232 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3036506, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.179093360900879, + "objective/train/theoretical_loss": 3.3455209462768067, + "objective/train/tokens_used": 2746748384, + "theoretical_loss": 3.3455209462768067, + "tokens_seen": 2726288384 + }, + { + "epoch": 9.02, + "learning_rate": 8.794383149448345e-05, + "loss": 2.3196, + "theoretical_loss": 3.3455193702300123, + "tokens_seen": 2726304768 + }, + { + "epoch": 9.02, + "learning_rate": 8.793380140421264e-05, + "loss": 2.3157, + "theoretical_loss": 3.3455130661640684, + "tokens_seen": 2726370304 + }, + { + "epoch": 9.02, + "learning_rate": 8.792377131394183e-05, + "loss": 2.5317, + "theoretical_loss": 3.3455067622920867, + "tokens_seen": 2726435840 + }, + { + "epoch": 9.02, + "learning_rate": 8.791374122367102e-05, + "loss": 2.3654, + "theoretical_loss": 3.345500458614058, + "tokens_seen": 2726501376 + }, + { + "epoch": 9.02, + "learning_rate": 8.79037111334002e-05, + "loss": 2.8115, + "theoretical_loss": 3.3454941551299715, + "tokens_seen": 2726566912 + }, + { + "epoch": 9.02, + "learning_rate": 8.78936810431294e-05, + "loss": 2.5319, + "theoretical_loss": 3.3454878518398155, + "tokens_seen": 2726632448 + }, + { + "epoch": 9.02, + "learning_rate": 8.788365095285858e-05, + "loss": 2.7463, + "theoretical_loss": 3.3454815487435803, + "tokens_seen": 2726697984 + }, + { + "epoch": 9.02, + "learning_rate": 8.787362086258777e-05, + "loss": 2.6278, + "theoretical_loss": 3.3454752458412553, + "tokens_seen": 2726763520 + }, + { + "epoch": 9.02, + "learning_rate": 8.786359077231695e-05, + "loss": 2.4002, + "theoretical_loss": 3.3454689431328295, + "tokens_seen": 2726829056 + }, + { + "epoch": 9.02, + "learning_rate": 8.785356068204613e-05, + "loss": 2.4712, + "theoretical_loss": 3.345462640618292, + "tokens_seen": 2726894592 + }, + { + "epoch": 9.02, + "learning_rate": 8.784353059177533e-05, + "loss": 2.3138, + "theoretical_loss": 3.3454563382976334, + "tokens_seen": 2726960128 + }, + { + "epoch": 9.02, + "learning_rate": 8.783350050150451e-05, + "loss": 2.5643, + "theoretical_loss": 3.345450036170842, + "tokens_seen": 2727025664 + }, + { + "epoch": 9.02, + "learning_rate": 8.78234704112337e-05, + "loss": 2.5678, + "theoretical_loss": 3.3454437342379073, + "tokens_seen": 2727091200 + }, + { + "epoch": 9.02, + "learning_rate": 8.781344032096289e-05, + "loss": 2.4181, + "theoretical_loss": 3.3454374324988185, + "tokens_seen": 2727156736 + }, + { + "epoch": 9.02, + "learning_rate": 8.780341023069208e-05, + "loss": 2.3091, + "theoretical_loss": 3.345431130953566, + "tokens_seen": 2727222272 + }, + { + "epoch": 9.02, + "learning_rate": 8.779338014042126e-05, + "loss": 2.3829, + "theoretical_loss": 3.3454248296021376, + "tokens_seen": 2727287808 + }, + { + "epoch": 9.02, + "learning_rate": 8.778335005015046e-05, + "loss": 1.9371, + "theoretical_loss": 3.345418528444524, + "tokens_seen": 2727353344 + }, + { + "epoch": 9.02, + "learning_rate": 8.777331995987964e-05, + "loss": 2.4161, + "theoretical_loss": 3.3454122274807143, + "tokens_seen": 2727418880 + }, + { + "epoch": 9.02, + "learning_rate": 8.776328986960883e-05, + "loss": 2.4681, + "theoretical_loss": 3.345405926710698, + "tokens_seen": 2727484416 + }, + { + "epoch": 9.02, + "learning_rate": 8.775325977933801e-05, + "loss": 2.5088, + "theoretical_loss": 3.3453996261344634, + "tokens_seen": 2727549952 + }, + { + "epoch": 9.02, + "learning_rate": 8.77432296890672e-05, + "loss": 2.3381, + "theoretical_loss": 3.3453933257520014, + "tokens_seen": 2727615488 + }, + { + "epoch": 9.02, + "learning_rate": 8.773319959879639e-05, + "loss": 2.442, + "theoretical_loss": 3.3453870255633005, + "tokens_seen": 2727681024 + }, + { + "epoch": 9.02, + "learning_rate": 8.772316950852557e-05, + "loss": 2.6436, + "theoretical_loss": 3.34538072556835, + "tokens_seen": 2727746560 + }, + { + "epoch": 9.02, + "learning_rate": 8.771313941825476e-05, + "loss": 2.3266, + "theoretical_loss": 3.3453744257671394, + "tokens_seen": 2727812096 + }, + { + "epoch": 9.02, + "learning_rate": 8.770310932798395e-05, + "loss": 2.4118, + "theoretical_loss": 3.345368126159659, + "tokens_seen": 2727877632 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3038069, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2184360027313232, + "objective/train/theoretical_loss": 3.3453634015811766, + "objective/train/tokens_used": 2748386784, + "theoretical_loss": 3.3453634015811766, + "tokens_seen": 2727926784 + }, + { + "epoch": 9.02, + "learning_rate": 8.769307923771314e-05, + "loss": 2.4523, + "theoretical_loss": 3.3453618267458967, + "tokens_seen": 2727943168 + }, + { + "epoch": 9.02, + "learning_rate": 8.768304914744232e-05, + "loss": 2.3652, + "theoretical_loss": 3.345355527525843, + "tokens_seen": 2728008704 + }, + { + "epoch": 9.02, + "learning_rate": 8.767301905717152e-05, + "loss": 2.5357, + "theoretical_loss": 3.3453492284994866, + "tokens_seen": 2728074240 + }, + { + "epoch": 9.02, + "learning_rate": 8.76629889669007e-05, + "loss": 2.4676, + "theoretical_loss": 3.345342929666818, + "tokens_seen": 2728139776 + }, + { + "epoch": 9.02, + "learning_rate": 8.765295887662988e-05, + "loss": 2.5539, + "theoretical_loss": 3.3453366310278247, + "tokens_seen": 2728205312 + }, + { + "epoch": 9.02, + "learning_rate": 8.764292878635909e-05, + "loss": 2.5368, + "theoretical_loss": 3.3453303325824977, + "tokens_seen": 2728270848 + }, + { + "epoch": 9.02, + "learning_rate": 8.763289869608827e-05, + "loss": 2.3721, + "theoretical_loss": 3.345324034330826, + "tokens_seen": 2728336384 + }, + { + "epoch": 9.02, + "learning_rate": 8.762286860581746e-05, + "loss": 2.4803, + "theoretical_loss": 3.3453177362727984, + "tokens_seen": 2728401920 + }, + { + "epoch": 9.02, + "learning_rate": 8.761283851554664e-05, + "loss": 2.1628, + "theoretical_loss": 3.3453114384084053, + "tokens_seen": 2728467456 + }, + { + "epoch": 9.02, + "learning_rate": 8.760280842527584e-05, + "loss": 2.4607, + "theoretical_loss": 3.345305140737635, + "tokens_seen": 2728532992 + }, + { + "epoch": 9.02, + "learning_rate": 8.759277833500502e-05, + "loss": 2.434, + "theoretical_loss": 3.345298843260478, + "tokens_seen": 2728598528 + }, + { + "epoch": 9.02, + "learning_rate": 8.758274824473421e-05, + "loss": 2.6565, + "theoretical_loss": 3.345292545976923, + "tokens_seen": 2728664064 + }, + { + "epoch": 9.02, + "learning_rate": 8.75727181544634e-05, + "loss": 2.6705, + "theoretical_loss": 3.3452862488869592, + "tokens_seen": 2728729600 + }, + { + "epoch": 9.02, + "learning_rate": 8.756268806419259e-05, + "loss": 2.316, + "theoretical_loss": 3.3452799519905767, + "tokens_seen": 2728795136 + }, + { + "epoch": 9.02, + "learning_rate": 8.755265797392177e-05, + "loss": 2.5316, + "theoretical_loss": 3.3452736552877647, + "tokens_seen": 2728860672 + }, + { + "epoch": 9.02, + "learning_rate": 8.754262788365095e-05, + "loss": 2.3929, + "theoretical_loss": 3.345267358778512, + "tokens_seen": 2728926208 + }, + { + "epoch": 9.02, + "learning_rate": 8.753259779338015e-05, + "loss": 2.1864, + "theoretical_loss": 3.345261062462809, + "tokens_seen": 2728991744 + }, + { + "epoch": 9.02, + "learning_rate": 8.752256770310933e-05, + "loss": 2.6648, + "theoretical_loss": 3.345254766340644, + "tokens_seen": 2729057280 + }, + { + "epoch": 9.02, + "learning_rate": 8.751253761283852e-05, + "loss": 2.4784, + "theoretical_loss": 3.3452484704120073, + "tokens_seen": 2729122816 + }, + { + "epoch": 9.02, + "learning_rate": 8.75025075225677e-05, + "loss": 2.5267, + "theoretical_loss": 3.345242174676888, + "tokens_seen": 2729188352 + }, + { + "epoch": 9.02, + "learning_rate": 8.74924774322969e-05, + "loss": 2.5693, + "theoretical_loss": 3.3452358791352754, + "tokens_seen": 2729253888 + }, + { + "epoch": 9.02, + "learning_rate": 8.748244734202608e-05, + "loss": 2.3759, + "theoretical_loss": 3.345229583787159, + "tokens_seen": 2729319424 + }, + { + "epoch": 9.02, + "learning_rate": 8.747241725175527e-05, + "loss": 2.5627, + "theoretical_loss": 3.3452232886325275, + "tokens_seen": 2729384960 + }, + { + "epoch": 9.02, + "learning_rate": 8.746238716148446e-05, + "loss": 2.5261, + "theoretical_loss": 3.345216993671372, + "tokens_seen": 2729450496 + }, + { + "epoch": 9.02, + "learning_rate": 8.745235707121364e-05, + "loss": 2.6887, + "theoretical_loss": 3.3452106989036805, + "tokens_seen": 2729516032 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3038699, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8688695430755615, + "objective/train/theoretical_loss": 3.3452059779548664, + "objective/train/tokens_used": 2750025184, + "theoretical_loss": 3.3452059779548664, + "tokens_seen": 2729565184 + }, + { + "epoch": 9.02, + "learning_rate": 8.744232698094283e-05, + "loss": 2.6707, + "theoretical_loss": 3.3452044043294427, + "tokens_seen": 2729581568 + }, + { + "epoch": 9.02, + "learning_rate": 8.743229689067201e-05, + "loss": 2.3662, + "theoretical_loss": 3.3451981099486483, + "tokens_seen": 2729647104 + }, + { + "epoch": 9.02, + "learning_rate": 8.742226680040121e-05, + "loss": 2.497, + "theoretical_loss": 3.3451918157612863, + "tokens_seen": 2729712640 + }, + { + "epoch": 9.02, + "learning_rate": 8.741223671013039e-05, + "loss": 2.4127, + "theoretical_loss": 3.3451855217673465, + "tokens_seen": 2729778176 + }, + { + "epoch": 9.02, + "learning_rate": 8.740220661985958e-05, + "loss": 2.4335, + "theoretical_loss": 3.345179227966818, + "tokens_seen": 2729843712 + }, + { + "epoch": 9.02, + "learning_rate": 8.739217652958876e-05, + "loss": 2.4438, + "theoretical_loss": 3.3451729343596908, + "tokens_seen": 2729909248 + }, + { + "epoch": 9.02, + "learning_rate": 8.738214643931796e-05, + "loss": 2.5983, + "theoretical_loss": 3.3451666409459535, + "tokens_seen": 2729974784 + }, + { + "epoch": 9.02, + "learning_rate": 8.737211634904714e-05, + "loss": 2.4841, + "theoretical_loss": 3.345160347725596, + "tokens_seen": 2730040320 + }, + { + "epoch": 9.02, + "learning_rate": 8.736208625877632e-05, + "loss": 2.5532, + "theoretical_loss": 3.3451540546986074, + "tokens_seen": 2730105856 + }, + { + "epoch": 9.02, + "learning_rate": 8.735205616850552e-05, + "loss": 2.5245, + "theoretical_loss": 3.3451477618649776, + "tokens_seen": 2730171392 + }, + { + "epoch": 9.02, + "learning_rate": 8.73420260782347e-05, + "loss": 2.3655, + "theoretical_loss": 3.345141469224696, + "tokens_seen": 2730236928 + }, + { + "epoch": 9.02, + "learning_rate": 8.733199598796389e-05, + "loss": 2.4799, + "theoretical_loss": 3.3451351767777515, + "tokens_seen": 2730302464 + }, + { + "epoch": 9.02, + "learning_rate": 8.732196589769307e-05, + "loss": 2.4953, + "theoretical_loss": 3.345128884524134, + "tokens_seen": 2730368000 + }, + { + "epoch": 9.02, + "learning_rate": 8.731193580742227e-05, + "loss": 2.5956, + "theoretical_loss": 3.345122592463832, + "tokens_seen": 2730433536 + }, + { + "epoch": 9.02, + "learning_rate": 8.730190571715145e-05, + "loss": 2.5069, + "theoretical_loss": 3.3451163005968363, + "tokens_seen": 2730499072 + }, + { + "epoch": 9.02, + "learning_rate": 8.729187562688064e-05, + "loss": 2.2419, + "theoretical_loss": 3.345110008923135, + "tokens_seen": 2730564608 + }, + { + "epoch": 9.02, + "learning_rate": 8.728184553660982e-05, + "loss": 2.4143, + "theoretical_loss": 3.3451037174427185, + "tokens_seen": 2730630144 + }, + { + "epoch": 9.02, + "learning_rate": 8.727181544633903e-05, + "loss": 2.6231, + "theoretical_loss": 3.345097426155576, + "tokens_seen": 2730695680 + }, + { + "epoch": 9.02, + "learning_rate": 8.726178535606821e-05, + "loss": 2.5469, + "theoretical_loss": 3.345091135061697, + "tokens_seen": 2730761216 + }, + { + "epoch": 9.02, + "learning_rate": 8.72517552657974e-05, + "loss": 2.5187, + "theoretical_loss": 3.3450848441610708, + "tokens_seen": 2730826752 + }, + { + "epoch": 9.02, + "learning_rate": 8.724172517552659e-05, + "loss": 2.4043, + "theoretical_loss": 3.3450785534536864, + "tokens_seen": 2730892288 + }, + { + "epoch": 9.02, + "learning_rate": 8.723169508525577e-05, + "loss": 2.4681, + "theoretical_loss": 3.3450722629395337, + "tokens_seen": 2730957824 + }, + { + "epoch": 9.02, + "learning_rate": 8.722166499498496e-05, + "loss": 2.5426, + "theoretical_loss": 3.345065972618602, + "tokens_seen": 2731023360 + }, + { + "epoch": 9.02, + "learning_rate": 8.721163490471415e-05, + "loss": 2.6289, + "theoretical_loss": 3.3450596824908807, + "tokens_seen": 2731088896 + }, + { + "epoch": 9.02, + "learning_rate": 8.720160481444334e-05, + "loss": 2.4455, + "theoretical_loss": 3.3450533925563595, + "tokens_seen": 2731154432 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3039495, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.587313413619995, + "objective/train/theoretical_loss": 3.3450486752322495, + "objective/train/tokens_used": 2751663584, + "theoretical_loss": 3.3450486752322495, + "tokens_seen": 2731203584 + }, + { + "epoch": 9.02, + "learning_rate": 8.719157472417252e-05, + "loss": 2.4664, + "theoretical_loss": 3.3450471028150277, + "tokens_seen": 2731219968 + }, + { + "epoch": 9.02, + "learning_rate": 8.718154463390172e-05, + "loss": 2.4929, + "theoretical_loss": 3.3450408132668743, + "tokens_seen": 2731285504 + }, + { + "epoch": 9.02, + "learning_rate": 8.71715145436309e-05, + "loss": 2.3728, + "theoretical_loss": 3.345034523911889, + "tokens_seen": 2731351040 + }, + { + "epoch": 9.02, + "learning_rate": 8.716148445336008e-05, + "loss": 2.4153, + "theoretical_loss": 3.3450282347500617, + "tokens_seen": 2731416576 + }, + { + "epoch": 9.02, + "learning_rate": 8.715145436308927e-05, + "loss": 2.4335, + "theoretical_loss": 3.3450219457813812, + "tokens_seen": 2731482112 + }, + { + "epoch": 9.02, + "learning_rate": 8.714142427281845e-05, + "loss": 2.3268, + "theoretical_loss": 3.3450156570058374, + "tokens_seen": 2731547648 + }, + { + "epoch": 9.02, + "learning_rate": 8.713139418254765e-05, + "loss": 2.4964, + "theoretical_loss": 3.345009368423419, + "tokens_seen": 2731613184 + }, + { + "epoch": 9.02, + "learning_rate": 8.712136409227683e-05, + "loss": 2.3694, + "theoretical_loss": 3.3450030800341164, + "tokens_seen": 2731678720 + }, + { + "epoch": 9.02, + "learning_rate": 8.711133400200603e-05, + "loss": 2.4122, + "theoretical_loss": 3.3449967918379184, + "tokens_seen": 2731744256 + }, + { + "epoch": 9.02, + "learning_rate": 8.71013039117352e-05, + "loss": 2.4679, + "theoretical_loss": 3.344990503834815, + "tokens_seen": 2731809792 + }, + { + "epoch": 9.02, + "learning_rate": 8.70912738214644e-05, + "loss": 2.5426, + "theoretical_loss": 3.3449842160247947, + "tokens_seen": 2731875328 + }, + { + "epoch": 9.02, + "learning_rate": 8.708124373119358e-05, + "loss": 2.6564, + "theoretical_loss": 3.344977928407848, + "tokens_seen": 2731940864 + }, + { + "epoch": 9.02, + "learning_rate": 8.707121364092276e-05, + "loss": 2.5926, + "theoretical_loss": 3.3449716409839634, + "tokens_seen": 2732006400 + }, + { + "epoch": 9.02, + "learning_rate": 8.706118355065196e-05, + "loss": 2.3289, + "theoretical_loss": 3.344965353753131, + "tokens_seen": 2732071936 + }, + { + "epoch": 9.02, + "learning_rate": 8.705115346038114e-05, + "loss": 2.3142, + "theoretical_loss": 3.34495906671534, + "tokens_seen": 2732137472 + }, + { + "epoch": 9.02, + "learning_rate": 8.704112337011033e-05, + "loss": 2.205, + "theoretical_loss": 3.34495277987058, + "tokens_seen": 2732203008 + }, + { + "epoch": 9.02, + "learning_rate": 8.703109327983951e-05, + "loss": 2.385, + "theoretical_loss": 3.34494649321884, + "tokens_seen": 2732268544 + }, + { + "epoch": 9.02, + "learning_rate": 8.702106318956871e-05, + "loss": 2.2641, + "theoretical_loss": 3.34494020676011, + "tokens_seen": 2732334080 + }, + { + "epoch": 9.02, + "learning_rate": 8.701103309929789e-05, + "loss": 2.369, + "theoretical_loss": 3.344933920494379, + "tokens_seen": 2732399616 + }, + { + "epoch": 9.02, + "learning_rate": 8.700100300902709e-05, + "loss": 2.6799, + "theoretical_loss": 3.3449276344216368, + "tokens_seen": 2732465152 + }, + { + "epoch": 9.02, + "learning_rate": 8.699097291875627e-05, + "loss": 2.3294, + "theoretical_loss": 3.344921348541873, + "tokens_seen": 2732530688 + }, + { + "epoch": 9.02, + "learning_rate": 8.698094282848546e-05, + "loss": 2.406, + "theoretical_loss": 3.344915062855076, + "tokens_seen": 2732596224 + }, + { + "epoch": 9.02, + "learning_rate": 8.697091273821464e-05, + "loss": 2.3385, + "theoretical_loss": 3.3449087773612365, + "tokens_seen": 2732661760 + }, + { + "epoch": 9.02, + "learning_rate": 8.696088264794382e-05, + "loss": 2.4609, + "theoretical_loss": 3.3449024920603434, + "tokens_seen": 2732727296 + }, + { + "epoch": 9.02, + "learning_rate": 8.695085255767302e-05, + "loss": 2.4149, + "theoretical_loss": 3.344896206952386, + "tokens_seen": 2732792832 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3040619, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9567456245422363, + "objective/train/theoretical_loss": 3.344891493248026, + "objective/train/tokens_used": 2753301984, + "theoretical_loss": 3.344891493248026, + "tokens_seen": 2732841984 + }, + { + "epoch": 9.02, + "learning_rate": 8.69408224674022e-05, + "loss": 2.5365, + "theoretical_loss": 3.3448899220373542, + "tokens_seen": 2732858368 + }, + { + "epoch": 9.02, + "learning_rate": 8.69307923771314e-05, + "loss": 2.5683, + "theoretical_loss": 3.344883637315237, + "tokens_seen": 2732923904 + }, + { + "epoch": 9.02, + "learning_rate": 8.692076228686057e-05, + "loss": 2.4893, + "theoretical_loss": 3.344877352786024, + "tokens_seen": 2732989440 + }, + { + "epoch": 9.02, + "learning_rate": 8.691073219658977e-05, + "loss": 2.3949, + "theoretical_loss": 3.344871068449705, + "tokens_seen": 2733054976 + }, + { + "epoch": 9.02, + "learning_rate": 8.690070210631895e-05, + "loss": 2.3622, + "theoretical_loss": 3.3448647843062687, + "tokens_seen": 2733120512 + }, + { + "epoch": 9.02, + "learning_rate": 8.689067201604816e-05, + "loss": 2.4987, + "theoretical_loss": 3.344858500355705, + "tokens_seen": 2733186048 + }, + { + "epoch": 9.02, + "learning_rate": 8.688064192577734e-05, + "loss": 2.5113, + "theoretical_loss": 3.344852216598004, + "tokens_seen": 2733251584 + }, + { + "epoch": 9.02, + "learning_rate": 8.687061183550652e-05, + "loss": 2.495, + "theoretical_loss": 3.3448459330331537, + "tokens_seen": 2733317120 + }, + { + "epoch": 9.02, + "learning_rate": 8.686058174523572e-05, + "loss": 2.478, + "theoretical_loss": 3.3448396496611448, + "tokens_seen": 2733382656 + }, + { + "epoch": 9.02, + "learning_rate": 8.68505516549649e-05, + "loss": 2.5614, + "theoretical_loss": 3.3448333664819665, + "tokens_seen": 2733448192 + }, + { + "epoch": 9.02, + "learning_rate": 8.684052156469409e-05, + "loss": 2.4774, + "theoretical_loss": 3.3448270834956078, + "tokens_seen": 2733513728 + }, + { + "epoch": 9.02, + "learning_rate": 8.683049147442327e-05, + "loss": 2.3193, + "theoretical_loss": 3.344820800702059, + "tokens_seen": 2733579264 + }, + { + "epoch": 9.02, + "learning_rate": 8.682046138415247e-05, + "loss": 2.451, + "theoretical_loss": 3.344814518101308, + "tokens_seen": 2733644800 + }, + { + "epoch": 9.02, + "learning_rate": 8.681043129388165e-05, + "loss": 2.5732, + "theoretical_loss": 3.344808235693346, + "tokens_seen": 2733710336 + }, + { + "epoch": 9.02, + "learning_rate": 8.680040120361084e-05, + "loss": 2.3941, + "theoretical_loss": 3.3448019534781617, + "tokens_seen": 2733775872 + }, + { + "epoch": 9.02, + "learning_rate": 8.679037111334002e-05, + "loss": 2.4164, + "theoretical_loss": 3.3447956714557443, + "tokens_seen": 2733841408 + }, + { + "epoch": 9.02, + "learning_rate": 8.67803410230692e-05, + "loss": 2.4021, + "theoretical_loss": 3.344789389626084, + "tokens_seen": 2733906944 + }, + { + "epoch": 9.02, + "learning_rate": 8.67703109327984e-05, + "loss": 2.4597, + "theoretical_loss": 3.3447831079891692, + "tokens_seen": 2733972480 + }, + { + "epoch": 9.02, + "learning_rate": 8.676028084252758e-05, + "loss": 2.62, + "theoretical_loss": 3.34477682654499, + "tokens_seen": 2734038016 + }, + { + "epoch": 9.02, + "learning_rate": 8.675025075225678e-05, + "loss": 2.6227, + "theoretical_loss": 3.3447705452935366, + "tokens_seen": 2734103552 + }, + { + "epoch": 9.02, + "learning_rate": 8.674022066198596e-05, + "loss": 2.2779, + "theoretical_loss": 3.3447642642347972, + "tokens_seen": 2734169088 + }, + { + "epoch": 9.02, + "learning_rate": 8.673019057171515e-05, + "loss": 2.4283, + "theoretical_loss": 3.344757983368762, + "tokens_seen": 2734234624 + }, + { + "epoch": 9.02, + "learning_rate": 8.672016048144433e-05, + "loss": 2.5683, + "theoretical_loss": 3.3447517026954205, + "tokens_seen": 2734300160 + }, + { + "epoch": 9.02, + "learning_rate": 8.671013039117353e-05, + "loss": 2.4462, + "theoretical_loss": 3.3447454222147615, + "tokens_seen": 2734365696 + }, + { + "epoch": 9.02, + "learning_rate": 8.670010030090271e-05, + "loss": 2.536, + "theoretical_loss": 3.344739141926775, + "tokens_seen": 2734431232 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3042101, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3498635292053223, + "objective/train/theoretical_loss": 3.34473443183722, + "objective/train/tokens_used": 2754940384, + "theoretical_loss": 3.34473443183722, + "tokens_seen": 2734480384 + }, + { + "epoch": 9.02, + "learning_rate": 8.66900702106319e-05, + "loss": 2.3985, + "theoretical_loss": 3.3447328618314502, + "tokens_seen": 2734496768 + }, + { + "epoch": 9.02, + "learning_rate": 8.668004012036108e-05, + "loss": 2.5914, + "theoretical_loss": 3.3447265819287773, + "tokens_seen": 2734562304 + }, + { + "epoch": 9.02, + "learning_rate": 8.667001003009027e-05, + "loss": 2.6638, + "theoretical_loss": 3.344720302218745, + "tokens_seen": 2734627840 + }, + { + "epoch": 9.02, + "learning_rate": 8.665997993981946e-05, + "loss": 2.4705, + "theoretical_loss": 3.3447140227013428, + "tokens_seen": 2734693376 + }, + { + "epoch": 9.02, + "learning_rate": 8.664994984954864e-05, + "loss": 2.3937, + "theoretical_loss": 3.3447077433765604, + "tokens_seen": 2734758912 + }, + { + "epoch": 9.02, + "learning_rate": 8.663991975927784e-05, + "loss": 2.4341, + "theoretical_loss": 3.3447014642443875, + "tokens_seen": 2734824448 + }, + { + "epoch": 9.02, + "learning_rate": 8.662988966900702e-05, + "loss": 2.4658, + "theoretical_loss": 3.344695185304813, + "tokens_seen": 2734889984 + }, + { + "epoch": 9.02, + "learning_rate": 8.661985957873621e-05, + "loss": 2.4541, + "theoretical_loss": 3.3446889065578267, + "tokens_seen": 2734955520 + }, + { + "epoch": 9.02, + "learning_rate": 8.660982948846539e-05, + "loss": 2.2148, + "theoretical_loss": 3.3446826280034183, + "tokens_seen": 2735021056 + }, + { + "epoch": 9.02, + "learning_rate": 8.659979939819459e-05, + "loss": 2.57, + "theoretical_loss": 3.3446763496415772, + "tokens_seen": 2735086592 + }, + { + "epoch": 9.02, + "learning_rate": 8.658976930792377e-05, + "loss": 2.4172, + "theoretical_loss": 3.3446700714722923, + "tokens_seen": 2735152128 + }, + { + "epoch": 9.02, + "learning_rate": 8.657973921765295e-05, + "loss": 2.2085, + "theoretical_loss": 3.344663793495554, + "tokens_seen": 2735217664 + }, + { + "epoch": 9.02, + "learning_rate": 8.656970912738214e-05, + "loss": 2.4196, + "theoretical_loss": 3.3446575157113507, + "tokens_seen": 2735283200 + }, + { + "epoch": 9.02, + "learning_rate": 8.655967903711133e-05, + "loss": 2.3733, + "theoretical_loss": 3.344651238119673, + "tokens_seen": 2735348736 + }, + { + "epoch": 9.02, + "learning_rate": 8.654964894684052e-05, + "loss": 2.3005, + "theoretical_loss": 3.34464496072051, + "tokens_seen": 2735414272 + }, + { + "epoch": 9.02, + "learning_rate": 8.65396188565697e-05, + "loss": 2.2031, + "theoretical_loss": 3.3446386835138506, + "tokens_seen": 2735479808 + }, + { + "epoch": 9.02, + "learning_rate": 8.65295887662989e-05, + "loss": 2.4846, + "theoretical_loss": 3.3446324064996853, + "tokens_seen": 2735545344 + }, + { + "epoch": 9.02, + "learning_rate": 8.651955867602809e-05, + "loss": 2.526, + "theoretical_loss": 3.3446261296780024, + "tokens_seen": 2735610880 + }, + { + "epoch": 9.02, + "learning_rate": 8.650952858575729e-05, + "loss": 2.6556, + "theoretical_loss": 3.3446198530487923, + "tokens_seen": 2735676416 + }, + { + "epoch": 9.02, + "learning_rate": 8.649949849548647e-05, + "loss": 2.6416, + "theoretical_loss": 3.344613576612044, + "tokens_seen": 2735741952 + }, + { + "epoch": 9.02, + "learning_rate": 8.648946840521566e-05, + "loss": 2.2605, + "theoretical_loss": 3.3446073003677474, + "tokens_seen": 2735807488 + }, + { + "epoch": 9.02, + "learning_rate": 8.647943831494484e-05, + "loss": 2.4396, + "theoretical_loss": 3.3446010243158915, + "tokens_seen": 2735873024 + }, + { + "epoch": 9.02, + "learning_rate": 8.646940822467402e-05, + "loss": 2.4657, + "theoretical_loss": 3.3445947484564664, + "tokens_seen": 2735938560 + }, + { + "epoch": 9.02, + "learning_rate": 8.645937813440322e-05, + "loss": 2.5911, + "theoretical_loss": 3.3445884727894613, + "tokens_seen": 2736004096 + }, + { + "epoch": 9.02, + "learning_rate": 8.64493480441324e-05, + "loss": 2.457, + "theoretical_loss": 3.3445821973148653, + "tokens_seen": 2736069632 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3042943, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7071778774261475, + "objective/train/theoretical_loss": 3.3445774908351806, + "objective/train/tokens_used": 2756578784, + "theoretical_loss": 3.3445774908351806, + "tokens_seen": 2736118784 + }, + { + "epoch": 9.02, + "learning_rate": 8.64393179538616e-05, + "loss": 2.5422, + "theoretical_loss": 3.3445759220326687, + "tokens_seen": 2736135168 + }, + { + "epoch": 9.02, + "learning_rate": 8.642928786359077e-05, + "loss": 2.4415, + "theoretical_loss": 3.34456964694286, + "tokens_seen": 2736200704 + }, + { + "epoch": 9.02, + "learning_rate": 8.641925777331997e-05, + "loss": 2.5758, + "theoretical_loss": 3.3445633720454295, + "tokens_seen": 2736266240 + }, + { + "epoch": 9.02, + "learning_rate": 8.640922768304915e-05, + "loss": 2.3861, + "theoretical_loss": 3.3445570973403664, + "tokens_seen": 2736331776 + }, + { + "epoch": 9.02, + "learning_rate": 8.639919759277835e-05, + "loss": 2.6208, + "theoretical_loss": 3.34455082282766, + "tokens_seen": 2736397312 + }, + { + "epoch": 9.02, + "learning_rate": 8.638916750250753e-05, + "loss": 2.5704, + "theoretical_loss": 3.3445445485073004, + "tokens_seen": 2736462848 + }, + { + "epoch": 9.02, + "learning_rate": 8.637913741223671e-05, + "loss": 2.3924, + "theoretical_loss": 3.3445382743792766, + "tokens_seen": 2736528384 + }, + { + "epoch": 9.02, + "learning_rate": 8.63691073219659e-05, + "loss": 2.4768, + "theoretical_loss": 3.344532000443578, + "tokens_seen": 2736593920 + }, + { + "epoch": 9.02, + "learning_rate": 8.635907723169508e-05, + "loss": 2.552, + "theoretical_loss": 3.3445257267001947, + "tokens_seen": 2736659456 + }, + { + "epoch": 9.02, + "learning_rate": 8.634904714142428e-05, + "loss": 2.4516, + "theoretical_loss": 3.3445194531491156, + "tokens_seen": 2736724992 + }, + { + "epoch": 9.02, + "learning_rate": 8.633901705115346e-05, + "loss": 2.5574, + "theoretical_loss": 3.3445131797903302, + "tokens_seen": 2736790528 + }, + { + "epoch": 9.02, + "learning_rate": 8.632898696088265e-05, + "loss": 2.441, + "theoretical_loss": 3.3445069066238284, + "tokens_seen": 2736856064 + }, + { + "epoch": 9.02, + "learning_rate": 8.631895687061183e-05, + "loss": 2.4851, + "theoretical_loss": 3.3445006336495995, + "tokens_seen": 2736921600 + }, + { + "epoch": 9.02, + "learning_rate": 8.630892678034103e-05, + "loss": 2.4619, + "theoretical_loss": 3.3444943608676327, + "tokens_seen": 2736987136 + }, + { + "epoch": 9.02, + "learning_rate": 8.629889669007021e-05, + "loss": 2.4577, + "theoretical_loss": 3.3444880882779184, + "tokens_seen": 2737052672 + }, + { + "epoch": 9.02, + "learning_rate": 8.628886659979939e-05, + "loss": 2.5742, + "theoretical_loss": 3.344481815880445, + "tokens_seen": 2737118208 + }, + { + "epoch": 9.02, + "learning_rate": 8.627883650952859e-05, + "loss": 2.4789, + "theoretical_loss": 3.3444755436752027, + "tokens_seen": 2737183744 + }, + { + "epoch": 9.02, + "learning_rate": 8.626880641925777e-05, + "loss": 2.2583, + "theoretical_loss": 3.344469271662181, + "tokens_seen": 2737249280 + }, + { + "epoch": 9.02, + "learning_rate": 8.625877632898696e-05, + "loss": 2.4647, + "theoretical_loss": 3.3444629998413693, + "tokens_seen": 2737314816 + }, + { + "epoch": 9.02, + "learning_rate": 8.624874623871614e-05, + "loss": 2.4653, + "theoretical_loss": 3.344456728212757, + "tokens_seen": 2737380352 + }, + { + "epoch": 9.02, + "learning_rate": 8.623871614844534e-05, + "loss": 2.4721, + "theoretical_loss": 3.3444504567763333, + "tokens_seen": 2737445888 + }, + { + "epoch": 9.02, + "learning_rate": 8.622868605817452e-05, + "loss": 2.3402, + "theoretical_loss": 3.3444441855320886, + "tokens_seen": 2737511424 + }, + { + "epoch": 9.02, + "learning_rate": 8.621865596790371e-05, + "loss": 2.3718, + "theoretical_loss": 3.3444379144800114, + "tokens_seen": 2737576960 + }, + { + "epoch": 9.02, + "learning_rate": 8.62086258776329e-05, + "loss": 2.5531, + "theoretical_loss": 3.344431643620092, + "tokens_seen": 2737642496 + }, + { + "epoch": 9.02, + "learning_rate": 8.619859578736209e-05, + "loss": 2.3394, + "theoretical_loss": 3.3444253729523195, + "tokens_seen": 2737708032 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3044299, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.315783739089966, + "objective/train/theoretical_loss": 3.34442067007758, + "objective/train/tokens_used": 2758217184, + "theoretical_loss": 3.34442067007758, + "tokens_seen": 2737757184 + }, + { + "epoch": 9.02, + "learning_rate": 8.618856569709127e-05, + "loss": 2.6575, + "theoretical_loss": 3.3444191024766834, + "tokens_seen": 2737773568 + }, + { + "epoch": 9.02, + "learning_rate": 8.617853560682045e-05, + "loss": 2.6944, + "theoretical_loss": 3.344412832193173, + "tokens_seen": 2737839104 + }, + { + "epoch": 9.02, + "learning_rate": 8.616850551654965e-05, + "loss": 2.3452, + "theoretical_loss": 3.344406562101779, + "tokens_seen": 2737904640 + }, + { + "epoch": 9.02, + "learning_rate": 8.615847542627883e-05, + "loss": 2.2995, + "theoretical_loss": 3.3444002922024896, + "tokens_seen": 2737970176 + }, + { + "epoch": 9.02, + "learning_rate": 8.614844533600804e-05, + "loss": 2.4077, + "theoretical_loss": 3.344394022495295, + "tokens_seen": 2738035712 + }, + { + "epoch": 9.02, + "learning_rate": 8.613841524573722e-05, + "loss": 2.3193, + "theoretical_loss": 3.344387752980184, + "tokens_seen": 2738101248 + }, + { + "epoch": 9.02, + "learning_rate": 8.612838515546641e-05, + "loss": 2.5251, + "theoretical_loss": 3.344381483657147, + "tokens_seen": 2738166784 + }, + { + "epoch": 9.02, + "learning_rate": 8.611835506519559e-05, + "loss": 2.542, + "theoretical_loss": 3.344375214526173, + "tokens_seen": 2738232320 + }, + { + "epoch": 9.02, + "learning_rate": 8.610832497492479e-05, + "loss": 2.3662, + "theoretical_loss": 3.344368945587252, + "tokens_seen": 2738297856 + }, + { + "epoch": 9.02, + "learning_rate": 8.609829488465397e-05, + "loss": 2.3774, + "theoretical_loss": 3.3443626768403725, + "tokens_seen": 2738363392 + }, + { + "epoch": 9.02, + "learning_rate": 8.608826479438315e-05, + "loss": 2.4703, + "theoretical_loss": 3.3443564082855253, + "tokens_seen": 2738428928 + }, + { + "epoch": 9.02, + "learning_rate": 8.607823470411234e-05, + "loss": 2.6515, + "theoretical_loss": 3.3443501399226987, + "tokens_seen": 2738494464 + }, + { + "epoch": 9.02, + "learning_rate": 8.606820461384153e-05, + "loss": 2.543, + "theoretical_loss": 3.3443438717518834, + "tokens_seen": 2738560000 + }, + { + "epoch": 9.02, + "learning_rate": 8.605817452357072e-05, + "loss": 2.4228, + "theoretical_loss": 3.3443376037730683, + "tokens_seen": 2738625536 + }, + { + "epoch": 9.02, + "learning_rate": 8.60481444332999e-05, + "loss": 2.3895, + "theoretical_loss": 3.3443313359862428, + "tokens_seen": 2738691072 + }, + { + "epoch": 9.02, + "learning_rate": 8.60381143430291e-05, + "loss": 2.5395, + "theoretical_loss": 3.3443250683913965, + "tokens_seen": 2738756608 + }, + { + "epoch": 9.02, + "learning_rate": 8.602808425275828e-05, + "loss": 2.5847, + "theoretical_loss": 3.3443188009885194, + "tokens_seen": 2738822144 + }, + { + "epoch": 9.02, + "learning_rate": 8.601805416248747e-05, + "loss": 2.4355, + "theoretical_loss": 3.3443125337776003, + "tokens_seen": 2738887680 + }, + { + "epoch": 9.02, + "learning_rate": 8.600802407221665e-05, + "loss": 2.3777, + "theoretical_loss": 3.344306266758629, + "tokens_seen": 2738953216 + }, + { + "epoch": 9.02, + "learning_rate": 8.599799398194583e-05, + "loss": 2.4211, + "theoretical_loss": 3.3442999999315957, + "tokens_seen": 2739018752 + }, + { + "epoch": 9.02, + "learning_rate": 8.598796389167503e-05, + "loss": 2.41, + "theoretical_loss": 3.344293733296489, + "tokens_seen": 2739084288 + }, + { + "epoch": 9.02, + "learning_rate": 8.597793380140421e-05, + "loss": 2.4068, + "theoretical_loss": 3.3442874668532987, + "tokens_seen": 2739149824 + }, + { + "epoch": 9.02, + "learning_rate": 8.59679037111334e-05, + "loss": 2.2186, + "theoretical_loss": 3.344281200602014, + "tokens_seen": 2739215360 + }, + { + "epoch": 9.02, + "learning_rate": 8.595787362086259e-05, + "loss": 2.5145, + "theoretical_loss": 3.3442749345426255, + "tokens_seen": 2739280896 + }, + { + "epoch": 9.02, + "learning_rate": 8.594784353059178e-05, + "loss": 2.4074, + "theoretical_loss": 3.344268668675122, + "tokens_seen": 2739346432 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3044920, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7756268978118896, + "objective/train/theoretical_loss": 3.344263969400412, + "objective/train/tokens_used": 2759855584, + "theoretical_loss": 3.344263969400412, + "tokens_seen": 2739395584 + }, + { + "epoch": 9.02, + "learning_rate": 8.593781344032096e-05, + "loss": 2.4823, + "theoretical_loss": 3.344262402999493, + "tokens_seen": 2739411968 + }, + { + "epoch": 9.02, + "learning_rate": 8.592778335005016e-05, + "loss": 2.4276, + "theoretical_loss": 3.3442561375157283, + "tokens_seen": 2739477504 + }, + { + "epoch": 9.02, + "learning_rate": 8.591775325977934e-05, + "loss": 2.43, + "theoretical_loss": 3.3442498722238168, + "tokens_seen": 2739543040 + }, + { + "epoch": 9.02, + "learning_rate": 8.590772316950853e-05, + "loss": 2.5496, + "theoretical_loss": 3.3442436071237487, + "tokens_seen": 2739608576 + }, + { + "epoch": 9.02, + "learning_rate": 8.589769307923771e-05, + "loss": 2.4101, + "theoretical_loss": 3.3442373422155134, + "tokens_seen": 2739674112 + }, + { + "epoch": 9.02, + "learning_rate": 8.58876629889669e-05, + "loss": 2.5729, + "theoretical_loss": 3.3442310774991006, + "tokens_seen": 2739739648 + }, + { + "epoch": 9.02, + "learning_rate": 8.587763289869609e-05, + "loss": 2.3956, + "theoretical_loss": 3.3442248129744994, + "tokens_seen": 2739805184 + }, + { + "epoch": 9.02, + "learning_rate": 8.586760280842527e-05, + "loss": 2.6488, + "theoretical_loss": 3.3442185486416998, + "tokens_seen": 2739870720 + }, + { + "epoch": 9.02, + "learning_rate": 8.585757271815446e-05, + "loss": 2.468, + "theoretical_loss": 3.3442122845006907, + "tokens_seen": 2739936256 + }, + { + "epoch": 9.02, + "learning_rate": 8.584754262788365e-05, + "loss": 2.119, + "theoretical_loss": 3.3442060205514625, + "tokens_seen": 2740001792 + }, + { + "epoch": 9.02, + "learning_rate": 8.583751253761284e-05, + "loss": 2.5209, + "theoretical_loss": 3.344199756794004, + "tokens_seen": 2740067328 + }, + { + "epoch": 9.02, + "learning_rate": 8.582748244734202e-05, + "loss": 2.348, + "theoretical_loss": 3.344193493228305, + "tokens_seen": 2740132864 + }, + { + "epoch": 9.02, + "learning_rate": 8.581745235707122e-05, + "loss": 2.3846, + "theoretical_loss": 3.344187229854355, + "tokens_seen": 2740198400 + }, + { + "epoch": 9.02, + "learning_rate": 8.58074222668004e-05, + "loss": 2.3726, + "theoretical_loss": 3.344180966672144, + "tokens_seen": 2740263936 + }, + { + "epoch": 9.02, + "learning_rate": 8.579739217652958e-05, + "loss": 2.6364, + "theoretical_loss": 3.3441747036816607, + "tokens_seen": 2740329472 + }, + { + "epoch": 9.02, + "learning_rate": 8.578736208625877e-05, + "loss": 2.6088, + "theoretical_loss": 3.3441684408828953, + "tokens_seen": 2740395008 + }, + { + "epoch": 9.02, + "learning_rate": 8.577733199598795e-05, + "loss": 2.6415, + "theoretical_loss": 3.3441621782758375, + "tokens_seen": 2740460544 + }, + { + "epoch": 9.02, + "learning_rate": 8.576730190571716e-05, + "loss": 2.4186, + "theoretical_loss": 3.344155915860476, + "tokens_seen": 2740526080 + }, + { + "epoch": 9.02, + "learning_rate": 8.575727181544634e-05, + "loss": 2.4393, + "theoretical_loss": 3.344149653636801, + "tokens_seen": 2740591616 + }, + { + "epoch": 9.02, + "learning_rate": 8.574724172517554e-05, + "loss": 2.5174, + "theoretical_loss": 3.344143391604802, + "tokens_seen": 2740657152 + }, + { + "epoch": 9.02, + "learning_rate": 8.573721163490472e-05, + "loss": 2.4571, + "theoretical_loss": 3.3441371297644684, + "tokens_seen": 2740722688 + }, + { + "epoch": 9.02, + "learning_rate": 8.572718154463391e-05, + "loss": 2.4218, + "theoretical_loss": 3.3441308681157897, + "tokens_seen": 2740788224 + }, + { + "epoch": 9.02, + "learning_rate": 8.57171514543631e-05, + "loss": 2.3624, + "theoretical_loss": 3.3441246066587555, + "tokens_seen": 2740853760 + }, + { + "epoch": 9.02, + "learning_rate": 8.570712136409228e-05, + "loss": 2.3664, + "theoretical_loss": 3.3441183453933556, + "tokens_seen": 2740919296 + }, + { + "epoch": 9.02, + "learning_rate": 8.569709127382147e-05, + "loss": 2.6807, + "theoretical_loss": 3.3441120843195793, + "tokens_seen": 2740984832 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3046299, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.602384567260742, + "objective/train/theoretical_loss": 3.3441073886399937, + "objective/train/tokens_used": 2761493984, + "theoretical_loss": 3.3441073886399937, + "tokens_seen": 2741033984 + }, + { + "epoch": 9.02, + "learning_rate": 8.568706118355065e-05, + "loss": 2.5294, + "theoretical_loss": 3.344105823437416, + "tokens_seen": 2741050368 + }, + { + "epoch": 9.02, + "learning_rate": 8.567703109327985e-05, + "loss": 2.6034, + "theoretical_loss": 3.3440995627468557, + "tokens_seen": 2741115904 + }, + { + "epoch": 9.02, + "learning_rate": 8.566700100300903e-05, + "loss": 2.449, + "theoretical_loss": 3.344093302247888, + "tokens_seen": 2741181440 + }, + { + "epoch": 9.02, + "learning_rate": 8.565697091273822e-05, + "loss": 2.4083, + "theoretical_loss": 3.3440870419405018, + "tokens_seen": 2741246976 + }, + { + "epoch": 9.02, + "learning_rate": 8.56469408224674e-05, + "loss": 2.4795, + "theoretical_loss": 3.3440807818246867, + "tokens_seen": 2741312512 + }, + { + "epoch": 9.02, + "learning_rate": 8.56369107321966e-05, + "loss": 2.5531, + "theoretical_loss": 3.344074521900433, + "tokens_seen": 2741378048 + }, + { + "epoch": 9.02, + "learning_rate": 8.562688064192578e-05, + "loss": 2.5257, + "theoretical_loss": 3.3440682621677302, + "tokens_seen": 2741443584 + }, + { + "epoch": 9.02, + "learning_rate": 8.561685055165497e-05, + "loss": 2.3922, + "theoretical_loss": 3.344062002626567, + "tokens_seen": 2741509120 + }, + { + "epoch": 9.02, + "learning_rate": 8.560682046138416e-05, + "loss": 2.6664, + "theoretical_loss": 3.3440557432769333, + "tokens_seen": 2741574656 + }, + { + "epoch": 9.02, + "learning_rate": 8.559679037111334e-05, + "loss": 2.306, + "theoretical_loss": 3.3440494841188193, + "tokens_seen": 2741640192 + }, + { + "epoch": 9.02, + "learning_rate": 8.558676028084253e-05, + "loss": 2.7116, + "theoretical_loss": 3.344043225152214, + "tokens_seen": 2741705728 + }, + { + "epoch": 9.02, + "learning_rate": 8.557673019057171e-05, + "loss": 2.252, + "theoretical_loss": 3.344036966377107, + "tokens_seen": 2741771264 + }, + { + "epoch": 9.02, + "learning_rate": 8.556670010030091e-05, + "loss": 2.6756, + "theoretical_loss": 3.344030707793488, + "tokens_seen": 2741836800 + }, + { + "epoch": 9.02, + "learning_rate": 8.555667001003009e-05, + "loss": 2.4945, + "theoretical_loss": 3.3440244494013465, + "tokens_seen": 2741902336 + }, + { + "epoch": 9.02, + "learning_rate": 8.554663991975928e-05, + "loss": 2.4178, + "theoretical_loss": 3.3440181912006715, + "tokens_seen": 2741967872 + }, + { + "epoch": 9.02, + "learning_rate": 8.553660982948846e-05, + "loss": 2.282, + "theoretical_loss": 3.344011933191454, + "tokens_seen": 2742033408 + }, + { + "epoch": 9.02, + "learning_rate": 8.552657973921766e-05, + "loss": 2.4259, + "theoretical_loss": 3.344005675373682, + "tokens_seen": 2742098944 + }, + { + "epoch": 9.02, + "learning_rate": 8.551654964894684e-05, + "loss": 2.3923, + "theoretical_loss": 3.343999417747346, + "tokens_seen": 2742164480 + }, + { + "epoch": 9.02, + "learning_rate": 8.550651955867602e-05, + "loss": 2.5874, + "theoretical_loss": 3.3439931603124355, + "tokens_seen": 2742230016 + }, + { + "epoch": 9.02, + "learning_rate": 8.549648946840522e-05, + "loss": 2.2923, + "theoretical_loss": 3.3439869030689398, + "tokens_seen": 2742295552 + }, + { + "epoch": 9.02, + "learning_rate": 8.54864593781344e-05, + "loss": 2.4709, + "theoretical_loss": 3.343980646016848, + "tokens_seen": 2742361088 + }, + { + "epoch": 9.02, + "learning_rate": 8.547642928786359e-05, + "loss": 2.3824, + "theoretical_loss": 3.343974389156151, + "tokens_seen": 2742426624 + }, + { + "epoch": 9.02, + "learning_rate": 8.546639919759277e-05, + "loss": 2.2731, + "theoretical_loss": 3.343968132486837, + "tokens_seen": 2742492160 + }, + { + "epoch": 9.02, + "learning_rate": 8.545636910732197e-05, + "loss": 2.4687, + "theoretical_loss": 3.3439618760088967, + "tokens_seen": 2742557696 + }, + { + "epoch": 9.02, + "learning_rate": 8.544633901705115e-05, + "loss": 2.2563, + "theoretical_loss": 3.343955619722319, + "tokens_seen": 2742623232 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3046956, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.401214122772217, + "objective/train/theoretical_loss": 3.343950927632961, + "objective/train/tokens_used": 2763132384, + "theoretical_loss": 3.343950927632961, + "tokens_seen": 2742672384 + }, + { + "epoch": 9.02, + "learning_rate": 8.543630892678034e-05, + "loss": 2.3249, + "theoretical_loss": 3.343949363627093, + "tokens_seen": 2742688768 + }, + { + "epoch": 9.02, + "learning_rate": 8.542627883650952e-05, + "loss": 2.6563, + "theoretical_loss": 3.34394310772321, + "tokens_seen": 2742754304 + }, + { + "epoch": 9.02, + "learning_rate": 8.54162487462387e-05, + "loss": 2.3534, + "theoretical_loss": 3.343936852010658, + "tokens_seen": 2742819840 + }, + { + "epoch": 9.02, + "learning_rate": 8.54062186559679e-05, + "loss": 2.292, + "theoretical_loss": 3.3439305964894266, + "tokens_seen": 2742885376 + }, + { + "epoch": 9.02, + "learning_rate": 8.53961885656971e-05, + "loss": 2.5334, + "theoretical_loss": 3.3439243411595063, + "tokens_seen": 2742950912 + }, + { + "epoch": 9.02, + "learning_rate": 8.538615847542629e-05, + "loss": 2.4177, + "theoretical_loss": 3.343918086020886, + "tokens_seen": 2743016448 + }, + { + "epoch": 9.02, + "learning_rate": 8.537612838515547e-05, + "loss": 2.581, + "theoretical_loss": 3.3439118310735556, + "tokens_seen": 2743081984 + }, + { + "epoch": 9.02, + "learning_rate": 8.536609829488466e-05, + "loss": 2.5198, + "theoretical_loss": 3.3439055763175047, + "tokens_seen": 2743147520 + }, + { + "epoch": 9.02, + "learning_rate": 8.535606820461385e-05, + "loss": 2.356, + "theoretical_loss": 3.3438993217527226, + "tokens_seen": 2743213056 + }, + { + "epoch": 9.02, + "learning_rate": 8.534603811434304e-05, + "loss": 2.4081, + "theoretical_loss": 3.343893067379199, + "tokens_seen": 2743278592 + }, + { + "epoch": 9.02, + "learning_rate": 8.533600802407222e-05, + "loss": 2.4232, + "theoretical_loss": 3.3438868131969235, + "tokens_seen": 2743344128 + }, + { + "epoch": 9.02, + "learning_rate": 8.532597793380142e-05, + "loss": 2.3846, + "theoretical_loss": 3.343880559205886, + "tokens_seen": 2743409664 + }, + { + "epoch": 9.02, + "learning_rate": 8.53159478435306e-05, + "loss": 2.5269, + "theoretical_loss": 3.343874305406075, + "tokens_seen": 2743475200 + }, + { + "epoch": 9.02, + "learning_rate": 8.530591775325978e-05, + "loss": 2.3495, + "theoretical_loss": 3.343868051797482, + "tokens_seen": 2743540736 + }, + { + "epoch": 9.02, + "learning_rate": 8.529588766298897e-05, + "loss": 2.6237, + "theoretical_loss": 3.343861798380095, + "tokens_seen": 2743606272 + }, + { + "epoch": 9.02, + "learning_rate": 8.528585757271815e-05, + "loss": 2.5949, + "theoretical_loss": 3.343855545153904, + "tokens_seen": 2743671808 + }, + { + "epoch": 9.02, + "learning_rate": 8.527582748244735e-05, + "loss": 2.3811, + "theoretical_loss": 3.343849292118898, + "tokens_seen": 2743737344 + }, + { + "epoch": 9.02, + "learning_rate": 8.526579739217653e-05, + "loss": 2.6842, + "theoretical_loss": 3.3438430392750678, + "tokens_seen": 2743802880 + }, + { + "epoch": 9.02, + "learning_rate": 8.525576730190572e-05, + "loss": 2.5466, + "theoretical_loss": 3.3438367866224024, + "tokens_seen": 2743868416 + }, + { + "epoch": 9.02, + "learning_rate": 8.52457372116349e-05, + "loss": 2.31, + "theoretical_loss": 3.343830534160891, + "tokens_seen": 2743933952 + }, + { + "epoch": 9.02, + "learning_rate": 8.52357071213641e-05, + "loss": 2.3793, + "theoretical_loss": 3.3438242818905244, + "tokens_seen": 2743999488 + }, + { + "epoch": 9.02, + "learning_rate": 8.522567703109328e-05, + "loss": 2.4407, + "theoretical_loss": 3.343818029811291, + "tokens_seen": 2744065024 + }, + { + "epoch": 9.02, + "learning_rate": 8.521564694082246e-05, + "loss": 2.5539, + "theoretical_loss": 3.3438117779231806, + "tokens_seen": 2744130560 + }, + { + "epoch": 9.02, + "learning_rate": 8.520561685055166e-05, + "loss": 2.5432, + "theoretical_loss": 3.3438055262261828, + "tokens_seen": 2744196096 + }, + { + "epoch": 9.02, + "learning_rate": 8.519558676028084e-05, + "loss": 2.7001, + "theoretical_loss": 3.3437992747202876, + "tokens_seen": 2744261632 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3047945, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.678570032119751, + "objective/train/theoretical_loss": 3.343794586216271, + "objective/train/tokens_used": 2764770784, + "theoretical_loss": 3.343794586216271, + "tokens_seen": 2744310784 + }, + { + "epoch": 9.02, + "learning_rate": 8.518555667001003e-05, + "loss": 2.5894, + "theoretical_loss": 3.3437930234054845, + "tokens_seen": 2744327168 + }, + { + "epoch": 9.02, + "learning_rate": 8.517552657973921e-05, + "loss": 2.5968, + "theoretical_loss": 3.3437867722817627, + "tokens_seen": 2744392704 + }, + { + "epoch": 9.02, + "learning_rate": 8.516549648946841e-05, + "loss": 2.5166, + "theoretical_loss": 3.343780521349112, + "tokens_seen": 2744458240 + }, + { + "epoch": 9.02, + "learning_rate": 8.515546639919759e-05, + "loss": 2.3543, + "theoretical_loss": 3.3437742706075224, + "tokens_seen": 2744523776 + }, + { + "epoch": 9.02, + "learning_rate": 8.514543630892678e-05, + "loss": 2.2563, + "theoretical_loss": 3.3437680200569826, + "tokens_seen": 2744589312 + }, + { + "epoch": 9.02, + "learning_rate": 8.513540621865597e-05, + "loss": 2.3974, + "theoretical_loss": 3.3437617696974833, + "tokens_seen": 2744654848 + }, + { + "epoch": 9.02, + "learning_rate": 8.512537612838516e-05, + "loss": 2.6487, + "theoretical_loss": 3.3437555195290134, + "tokens_seen": 2744720384 + }, + { + "epoch": 9.02, + "learning_rate": 8.511534603811434e-05, + "loss": 2.5725, + "theoretical_loss": 3.3437492695515623, + "tokens_seen": 2744785920 + }, + { + "epoch": 9.02, + "learning_rate": 8.510531594784352e-05, + "loss": 2.3758, + "theoretical_loss": 3.34374301976512, + "tokens_seen": 2744851456 + }, + { + "epoch": 9.02, + "learning_rate": 8.509528585757272e-05, + "loss": 2.7769, + "theoretical_loss": 3.3437367701696763, + "tokens_seen": 2744916992 + }, + { + "epoch": 9.02, + "learning_rate": 8.50852557673019e-05, + "loss": 2.6974, + "theoretical_loss": 3.3437305207652206, + "tokens_seen": 2744982528 + }, + { + "epoch": 9.02, + "learning_rate": 8.50752256770311e-05, + "loss": 2.4366, + "theoretical_loss": 3.3437242715517423, + "tokens_seen": 2745048064 + }, + { + "epoch": 9.02, + "learning_rate": 8.506519558676027e-05, + "loss": 2.5512, + "theoretical_loss": 3.343718022529231, + "tokens_seen": 2745113600 + }, + { + "epoch": 9.02, + "learning_rate": 8.505516549648947e-05, + "loss": 2.4074, + "theoretical_loss": 3.3437117736976765, + "tokens_seen": 2745179136 + }, + { + "epoch": 9.02, + "learning_rate": 8.504513540621865e-05, + "loss": 2.4888, + "theoretical_loss": 3.343705525057069, + "tokens_seen": 2745244672 + }, + { + "epoch": 9.02, + "learning_rate": 8.503510531594785e-05, + "loss": 2.5963, + "theoretical_loss": 3.3436992766073965, + "tokens_seen": 2745310208 + }, + { + "epoch": 9.02, + "learning_rate": 8.502507522567703e-05, + "loss": 2.4466, + "theoretical_loss": 3.34369302834865, + "tokens_seen": 2745375744 + }, + { + "epoch": 9.02, + "learning_rate": 8.501504513540622e-05, + "loss": 2.4791, + "theoretical_loss": 3.3436867802808186, + "tokens_seen": 2745441280 + }, + { + "epoch": 9.02, + "learning_rate": 8.500501504513542e-05, + "loss": 2.279, + "theoretical_loss": 3.3436805324038925, + "tokens_seen": 2745506816 + }, + { + "epoch": 9.02, + "learning_rate": 8.49949849548646e-05, + "loss": 2.5459, + "theoretical_loss": 3.34367428471786, + "tokens_seen": 2745572352 + }, + { + "epoch": 9.02, + "learning_rate": 8.498495486459379e-05, + "loss": 2.463, + "theoretical_loss": 3.343668037222712, + "tokens_seen": 2745637888 + }, + { + "epoch": 9.02, + "learning_rate": 8.497492477432297e-05, + "loss": 2.6408, + "theoretical_loss": 3.3436617899184378, + "tokens_seen": 2745703424 + }, + { + "epoch": 9.02, + "learning_rate": 8.496489468405217e-05, + "loss": 2.4176, + "theoretical_loss": 3.3436555428050263, + "tokens_seen": 2745768960 + }, + { + "epoch": 9.02, + "learning_rate": 8.495486459378135e-05, + "loss": 2.6099, + "theoretical_loss": 3.343649295882468, + "tokens_seen": 2745834496 + }, + { + "epoch": 9.02, + "learning_rate": 8.494483450351054e-05, + "loss": 2.4963, + "theoretical_loss": 3.343643049150752, + "tokens_seen": 2745900032 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3048529, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.913114309310913, + "objective/train/theoretical_loss": 3.3436383642271994, + "objective/train/tokens_used": 2766409184, + "theoretical_loss": 3.3436383642271994, + "tokens_seen": 2745949184 + }, + { + "epoch": 9.02, + "learning_rate": 8.493480441323972e-05, + "loss": 2.499, + "theoretical_loss": 3.3436368026098684, + "tokens_seen": 2745965568 + }, + { + "epoch": 9.02, + "learning_rate": 8.49247743229689e-05, + "loss": 2.5037, + "theoretical_loss": 3.3436305562598063, + "tokens_seen": 2746031104 + }, + { + "epoch": 9.02, + "learning_rate": 8.49147442326981e-05, + "loss": 2.2587, + "theoretical_loss": 3.3436243101005556, + "tokens_seen": 2746096640 + }, + { + "epoch": 9.02, + "learning_rate": 8.490471414242728e-05, + "loss": 2.6068, + "theoretical_loss": 3.343618064132106, + "tokens_seen": 2746162176 + }, + { + "epoch": 9.02, + "learning_rate": 8.489468405215648e-05, + "loss": 2.34, + "theoretical_loss": 3.3436118183544465, + "tokens_seen": 2746227712 + }, + { + "epoch": 9.02, + "learning_rate": 8.488465396188566e-05, + "loss": 2.3877, + "theoretical_loss": 3.3436055727675673, + "tokens_seen": 2746293248 + }, + { + "epoch": 9.02, + "learning_rate": 8.487462387161485e-05, + "loss": 2.5483, + "theoretical_loss": 3.343599327371458, + "tokens_seen": 2746358784 + }, + { + "epoch": 9.02, + "learning_rate": 8.486459378134403e-05, + "loss": 2.6627, + "theoretical_loss": 3.343593082166108, + "tokens_seen": 2746424320 + }, + { + "epoch": 9.02, + "learning_rate": 8.485456369107323e-05, + "loss": 2.4576, + "theoretical_loss": 3.3435868371515074, + "tokens_seen": 2746489856 + }, + { + "epoch": 9.02, + "learning_rate": 8.484453360080241e-05, + "loss": 2.6211, + "theoretical_loss": 3.343580592327645, + "tokens_seen": 2746555392 + }, + { + "epoch": 9.02, + "learning_rate": 8.48345035105316e-05, + "loss": 2.5661, + "theoretical_loss": 3.3435743476945112, + "tokens_seen": 2746620928 + }, + { + "epoch": 9.02, + "learning_rate": 8.482447342026078e-05, + "loss": 2.4132, + "theoretical_loss": 3.343568103252095, + "tokens_seen": 2746686464 + }, + { + "epoch": 9.02, + "learning_rate": 8.481444332998997e-05, + "loss": 2.3632, + "theoretical_loss": 3.3435618590003866, + "tokens_seen": 2746752000 + }, + { + "epoch": 9.02, + "learning_rate": 8.480441323971916e-05, + "loss": 2.7587, + "theoretical_loss": 3.3435556149393753, + "tokens_seen": 2746817536 + }, + { + "epoch": 9.02, + "learning_rate": 8.479438314944834e-05, + "loss": 2.4132, + "theoretical_loss": 3.343549371069051, + "tokens_seen": 2746883072 + }, + { + "epoch": 9.02, + "learning_rate": 8.478435305917754e-05, + "loss": 2.3777, + "theoretical_loss": 3.343543127389403, + "tokens_seen": 2746948608 + }, + { + "epoch": 9.02, + "learning_rate": 8.477432296890672e-05, + "loss": 2.657, + "theoretical_loss": 3.3435368839004207, + "tokens_seen": 2747014144 + }, + { + "epoch": 9.02, + "learning_rate": 8.476429287863591e-05, + "loss": 2.6184, + "theoretical_loss": 3.3435306406020944, + "tokens_seen": 2747079680 + }, + { + "epoch": 9.02, + "learning_rate": 8.475426278836509e-05, + "loss": 2.3506, + "theoretical_loss": 3.3435243974944133, + "tokens_seen": 2747145216 + }, + { + "epoch": 9.02, + "learning_rate": 8.474423269809429e-05, + "loss": 2.3258, + "theoretical_loss": 3.3435181545773673, + "tokens_seen": 2747210752 + }, + { + "epoch": 9.02, + "learning_rate": 8.473420260782347e-05, + "loss": 2.4646, + "theoretical_loss": 3.3435119118509453, + "tokens_seen": 2747276288 + }, + { + "epoch": 9.02, + "learning_rate": 8.472417251755265e-05, + "loss": 2.5641, + "theoretical_loss": 3.343505669315138, + "tokens_seen": 2747341824 + }, + { + "epoch": 9.02, + "learning_rate": 8.471414242728184e-05, + "loss": 2.47, + "theoretical_loss": 3.3434994269699345, + "tokens_seen": 2747407360 + }, + { + "epoch": 9.02, + "learning_rate": 8.470411233701103e-05, + "loss": 2.5874, + "theoretical_loss": 3.3434931848153244, + "tokens_seen": 2747472896 + }, + { + "epoch": 9.02, + "learning_rate": 8.469408224674022e-05, + "loss": 2.5591, + "theoretical_loss": 3.343486942851297, + "tokens_seen": 2747538432 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3049896, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3291542530059814, + "objective/train/theoretical_loss": 3.343482261503341, + "objective/train/tokens_used": 2768047584, + "theoretical_loss": 3.343482261503341, + "tokens_seen": 2747587584 + }, + { + "epoch": 9.02, + "learning_rate": 8.46840521564694e-05, + "loss": 2.4246, + "theoretical_loss": 3.343480701077843, + "tokens_seen": 2747603968 + }, + { + "epoch": 9.02, + "learning_rate": 8.46740220661986e-05, + "loss": 2.5321, + "theoretical_loss": 3.343474459494951, + "tokens_seen": 2747669504 + }, + { + "epoch": 9.02, + "learning_rate": 8.466399197592778e-05, + "loss": 2.3287, + "theoretical_loss": 3.343468218102611, + "tokens_seen": 2747735040 + }, + { + "epoch": 9.02, + "learning_rate": 8.465396188565697e-05, + "loss": 2.3041, + "theoretical_loss": 3.3434619769008127, + "tokens_seen": 2747800576 + }, + { + "epoch": 9.02, + "learning_rate": 8.464393179538617e-05, + "loss": 2.4648, + "theoretical_loss": 3.343455735889546, + "tokens_seen": 2747866112 + }, + { + "epoch": 9.02, + "learning_rate": 8.463390170511536e-05, + "loss": 2.5683, + "theoretical_loss": 3.3434494950688, + "tokens_seen": 2747931648 + }, + { + "epoch": 9.02, + "learning_rate": 8.462387161484454e-05, + "loss": 2.3942, + "theoretical_loss": 3.343443254438564, + "tokens_seen": 2747997184 + }, + { + "epoch": 9.02, + "learning_rate": 8.461384152457372e-05, + "loss": 2.3454, + "theoretical_loss": 3.343437013998829, + "tokens_seen": 2748062720 + }, + { + "epoch": 9.02, + "learning_rate": 8.460381143430292e-05, + "loss": 2.5215, + "theoretical_loss": 3.343430773749583, + "tokens_seen": 2748128256 + }, + { + "epoch": 9.02, + "learning_rate": 8.45937813440321e-05, + "loss": 2.4878, + "theoretical_loss": 3.3434245336908175, + "tokens_seen": 2748193792 + }, + { + "epoch": 9.02, + "learning_rate": 8.45837512537613e-05, + "loss": 2.4882, + "theoretical_loss": 3.3434182938225203, + "tokens_seen": 2748259328 + }, + { + "epoch": 9.02, + "learning_rate": 8.457372116349047e-05, + "loss": 2.6121, + "theoretical_loss": 3.3434120541446823, + "tokens_seen": 2748324864 + }, + { + "epoch": 9.02, + "learning_rate": 8.456369107321967e-05, + "loss": 2.5605, + "theoretical_loss": 3.3434058146572925, + "tokens_seen": 2748390400 + }, + { + "epoch": 9.02, + "learning_rate": 8.455366098294885e-05, + "loss": 2.4763, + "theoretical_loss": 3.343399575360341, + "tokens_seen": 2748455936 + }, + { + "epoch": 9.02, + "learning_rate": 8.454363089267805e-05, + "loss": 2.5591, + "theoretical_loss": 3.343393336253817, + "tokens_seen": 2748521472 + }, + { + "epoch": 9.02, + "learning_rate": 8.453360080240723e-05, + "loss": 2.4258, + "theoretical_loss": 3.3433870973377107, + "tokens_seen": 2748587008 + }, + { + "epoch": 9.02, + "learning_rate": 8.452357071213641e-05, + "loss": 2.5189, + "theoretical_loss": 3.343380858612011, + "tokens_seen": 2748652544 + }, + { + "epoch": 9.02, + "learning_rate": 8.45135406218656e-05, + "loss": 2.1278, + "theoretical_loss": 3.343374620076708, + "tokens_seen": 2748718080 + }, + { + "epoch": 9.02, + "learning_rate": 8.450351053159478e-05, + "loss": 2.5019, + "theoretical_loss": 3.3433683817317914, + "tokens_seen": 2748783616 + }, + { + "epoch": 9.02, + "learning_rate": 8.449348044132398e-05, + "loss": 2.2668, + "theoretical_loss": 3.343362143577251, + "tokens_seen": 2748849152 + }, + { + "epoch": 9.02, + "learning_rate": 8.448345035105316e-05, + "loss": 2.3786, + "theoretical_loss": 3.343355905613076, + "tokens_seen": 2748914688 + }, + { + "epoch": 9.02, + "learning_rate": 8.447342026078235e-05, + "loss": 2.4304, + "theoretical_loss": 3.3433496678392562, + "tokens_seen": 2748980224 + }, + { + "epoch": 9.02, + "learning_rate": 8.446339017051153e-05, + "loss": 2.2939, + "theoretical_loss": 3.343343430255781, + "tokens_seen": 2749045760 + }, + { + "epoch": 9.02, + "learning_rate": 8.445336008024073e-05, + "loss": 2.486, + "theoretical_loss": 3.3433371928626414, + "tokens_seen": 2749111296 + }, + { + "epoch": 9.02, + "learning_rate": 8.444332998996991e-05, + "loss": 2.3991, + "theoretical_loss": 3.343330955659825, + "tokens_seen": 2749176832 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3050663, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7712154388427734, + "objective/train/theoretical_loss": 3.343326277882607, + "objective/train/tokens_used": 2769685984, + "theoretical_loss": 3.343326277882607, + "tokens_seen": 2749225984 + }, + { + "epoch": 9.02, + "learning_rate": 8.443329989969909e-05, + "loss": 2.629, + "theoretical_loss": 3.343324718647323, + "tokens_seen": 2749242368 + }, + { + "epoch": 9.02, + "learning_rate": 8.442326980942829e-05, + "loss": 2.6172, + "theoretical_loss": 3.3433184818251243, + "tokens_seen": 2749307904 + }, + { + "epoch": 9.02, + "learning_rate": 8.441323971915747e-05, + "loss": 2.6499, + "theoretical_loss": 3.3433122451932187, + "tokens_seen": 2749373440 + }, + { + "epoch": 9.02, + "learning_rate": 8.440320962888666e-05, + "loss": 2.4668, + "theoretical_loss": 3.3433060087515964, + "tokens_seen": 2749438976 + }, + { + "epoch": 9.02, + "learning_rate": 8.439317953861584e-05, + "loss": 2.3916, + "theoretical_loss": 3.3432997725002465, + "tokens_seen": 2749504512 + }, + { + "epoch": 9.02, + "learning_rate": 8.438314944834504e-05, + "loss": 2.4763, + "theoretical_loss": 3.3432935364391585, + "tokens_seen": 2749570048 + }, + { + "epoch": 9.02, + "learning_rate": 8.437311935807422e-05, + "loss": 2.4274, + "theoretical_loss": 3.3432873005683224, + "tokens_seen": 2749635584 + }, + { + "epoch": 9.02, + "learning_rate": 8.436308926780341e-05, + "loss": 2.5979, + "theoretical_loss": 3.343281064887728, + "tokens_seen": 2749701120 + }, + { + "epoch": 9.02, + "learning_rate": 8.43530591775326e-05, + "loss": 2.698, + "theoretical_loss": 3.3432748293973646, + "tokens_seen": 2749766656 + }, + { + "epoch": 9.02, + "learning_rate": 8.434302908726178e-05, + "loss": 2.3288, + "theoretical_loss": 3.3432685940972218, + "tokens_seen": 2749832192 + }, + { + "epoch": 9.02, + "learning_rate": 8.433299899699097e-05, + "loss": 2.7288, + "theoretical_loss": 3.34326235898729, + "tokens_seen": 2749897728 + }, + { + "epoch": 9.02, + "learning_rate": 8.432296890672015e-05, + "loss": 2.5144, + "theoretical_loss": 3.343256124067558, + "tokens_seen": 2749963264 + }, + { + "epoch": 9.02, + "learning_rate": 8.431293881644935e-05, + "loss": 2.3794, + "theoretical_loss": 3.343249889338016, + "tokens_seen": 2750028800 + }, + { + "epoch": 9.02, + "learning_rate": 8.430290872617853e-05, + "loss": 2.6217, + "theoretical_loss": 3.343243654798653, + "tokens_seen": 2750094336 + }, + { + "epoch": 9.02, + "learning_rate": 8.429287863590772e-05, + "loss": 2.6935, + "theoretical_loss": 3.3432374204494595, + "tokens_seen": 2750159872 + }, + { + "epoch": 9.02, + "learning_rate": 8.42828485456369e-05, + "loss": 2.3991, + "theoretical_loss": 3.3432311862904247, + "tokens_seen": 2750225408 + }, + { + "epoch": 9.02, + "learning_rate": 8.42728184553661e-05, + "loss": 2.4905, + "theoretical_loss": 3.3432249523215387, + "tokens_seen": 2750290944 + }, + { + "epoch": 9.02, + "learning_rate": 8.426278836509529e-05, + "loss": 2.6304, + "theoretical_loss": 3.343218718542791, + "tokens_seen": 2750356480 + }, + { + "epoch": 9.02, + "learning_rate": 8.425275827482449e-05, + "loss": 2.4569, + "theoretical_loss": 3.3432124849541704, + "tokens_seen": 2750422016 + }, + { + "epoch": 9.02, + "learning_rate": 8.424272818455367e-05, + "loss": 2.542, + "theoretical_loss": 3.3432062515556678, + "tokens_seen": 2750487552 + }, + { + "epoch": 9.02, + "learning_rate": 8.423269809428285e-05, + "loss": 2.5297, + "theoretical_loss": 3.343200018347272, + "tokens_seen": 2750553088 + }, + { + "epoch": 9.02, + "learning_rate": 8.422266800401204e-05, + "loss": 2.7175, + "theoretical_loss": 3.3431937853289733, + "tokens_seen": 2750618624 + }, + { + "epoch": 9.02, + "learning_rate": 8.421263791374123e-05, + "loss": 2.4088, + "theoretical_loss": 3.343187552500761, + "tokens_seen": 2750684160 + }, + { + "epoch": 9.02, + "learning_rate": 8.420260782347042e-05, + "loss": 2.7552, + "theoretical_loss": 3.343181319862625, + "tokens_seen": 2750749696 + }, + { + "epoch": 9.02, + "learning_rate": 8.41925777331996e-05, + "loss": 2.5118, + "theoretical_loss": 3.3431750874145547, + "tokens_seen": 2750815232 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3051449, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.110750675201416, + "objective/train/theoretical_loss": 3.343170413203227, + "objective/train/tokens_used": 2771324384, + "theoretical_loss": 3.343170413203227, + "tokens_seen": 2750864384 + }, + { + "epoch": 9.02, + "learning_rate": 8.41825476429288e-05, + "loss": 2.585, + "theoretical_loss": 3.34316885515654, + "tokens_seen": 2750880768 + }, + { + "epoch": 9.02, + "learning_rate": 8.417251755265798e-05, + "loss": 2.5309, + "theoretical_loss": 3.3431626230885705, + "tokens_seen": 2750946304 + }, + { + "epoch": 9.02, + "learning_rate": 8.416248746238717e-05, + "loss": 2.4949, + "theoretical_loss": 3.343156391210636, + "tokens_seen": 2751011840 + }, + { + "epoch": 9.02, + "learning_rate": 8.415245737211635e-05, + "loss": 2.5979, + "theoretical_loss": 3.343150159522726, + "tokens_seen": 2751077376 + }, + { + "epoch": 9.02, + "learning_rate": 8.414242728184553e-05, + "loss": 2.294, + "theoretical_loss": 3.3431439280248303, + "tokens_seen": 2751142912 + }, + { + "epoch": 9.02, + "learning_rate": 8.413239719157473e-05, + "loss": 2.5809, + "theoretical_loss": 3.343137696716939, + "tokens_seen": 2751208448 + }, + { + "epoch": 9.02, + "learning_rate": 8.412236710130391e-05, + "loss": 2.5761, + "theoretical_loss": 3.3431314655990407, + "tokens_seen": 2751273984 + }, + { + "epoch": 9.02, + "learning_rate": 8.41123370110331e-05, + "loss": 2.7838, + "theoretical_loss": 3.3431252346711258, + "tokens_seen": 2751339520 + }, + { + "epoch": 9.02, + "learning_rate": 8.410230692076229e-05, + "loss": 2.3589, + "theoretical_loss": 3.3431190039331837, + "tokens_seen": 2751405056 + }, + { + "epoch": 9.02, + "learning_rate": 8.409227683049148e-05, + "loss": 2.4556, + "theoretical_loss": 3.3431127733852044, + "tokens_seen": 2751470592 + }, + { + "epoch": 9.02, + "learning_rate": 8.408224674022066e-05, + "loss": 2.4731, + "theoretical_loss": 3.3431065430271776, + "tokens_seen": 2751536128 + }, + { + "epoch": 9.02, + "learning_rate": 8.407221664994986e-05, + "loss": 2.5256, + "theoretical_loss": 3.3431003128590926, + "tokens_seen": 2751601664 + }, + { + "epoch": 9.02, + "learning_rate": 8.406218655967904e-05, + "loss": 2.7614, + "theoretical_loss": 3.3430940828809392, + "tokens_seen": 2751667200 + }, + { + "epoch": 9.02, + "learning_rate": 8.405215646940823e-05, + "loss": 2.4853, + "theoretical_loss": 3.3430878530927077, + "tokens_seen": 2751732736 + }, + { + "epoch": 9.02, + "learning_rate": 8.404212637913741e-05, + "loss": 2.5413, + "theoretical_loss": 3.343081623494387, + "tokens_seen": 2751798272 + }, + { + "epoch": 9.02, + "learning_rate": 8.40320962888666e-05, + "loss": 2.5087, + "theoretical_loss": 3.343075394085967, + "tokens_seen": 2751863808 + }, + { + "epoch": 9.02, + "learning_rate": 8.402206619859579e-05, + "loss": 2.5232, + "theoretical_loss": 3.3430691648674373, + "tokens_seen": 2751929344 + }, + { + "epoch": 9.02, + "learning_rate": 8.401203610832497e-05, + "loss": 2.6541, + "theoretical_loss": 3.3430629358387884, + "tokens_seen": 2751994880 + }, + { + "epoch": 9.02, + "learning_rate": 8.400200601805416e-05, + "loss": 2.5617, + "theoretical_loss": 3.3430567070000086, + "tokens_seen": 2752060416 + }, + { + "epoch": 9.02, + "learning_rate": 8.399197592778335e-05, + "loss": 2.5308, + "theoretical_loss": 3.343050478351089, + "tokens_seen": 2752125952 + }, + { + "epoch": 9.02, + "learning_rate": 8.398194583751254e-05, + "loss": 2.3949, + "theoretical_loss": 3.343044249892018, + "tokens_seen": 2752191488 + }, + { + "epoch": 9.02, + "learning_rate": 8.397191574724172e-05, + "loss": 2.5149, + "theoretical_loss": 3.343038021622786, + "tokens_seen": 2752257024 + }, + { + "epoch": 9.02, + "learning_rate": 8.396188565697092e-05, + "loss": 2.5325, + "theoretical_loss": 3.343031793543383, + "tokens_seen": 2752322560 + }, + { + "epoch": 9.02, + "learning_rate": 8.39518555667001e-05, + "loss": 2.5266, + "theoretical_loss": 3.3430255656537984, + "tokens_seen": 2752388096 + }, + { + "epoch": 9.02, + "learning_rate": 8.394182547642928e-05, + "loss": 2.4756, + "theoretical_loss": 3.343019337954021, + "tokens_seen": 2752453632 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 3052192, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3216819763183594, + "objective/train/theoretical_loss": 3.343014667303744, + "objective/train/tokens_used": 2772962784, + "theoretical_loss": 3.343014667303744, + "tokens_seen": 2752502784 + }, + { + "epoch": 9.02, + "learning_rate": 8.393179538615847e-05, + "loss": 2.6169, + "theoretical_loss": 3.3430131104440424, + "tokens_seen": 2752519168 + }, + { + "epoch": 9.02, + "learning_rate": 8.392176529588765e-05, + "loss": 2.482, + "theoretical_loss": 3.3430068831238504, + "tokens_seen": 2752584704 + }, + { + "epoch": 9.02, + "learning_rate": 8.391173520561685e-05, + "loss": 2.5109, + "theoretical_loss": 3.3430006559934355, + "tokens_seen": 2752650240 + }, + { + "epoch": 9.02, + "learning_rate": 8.390170511534603e-05, + "loss": 2.3607, + "theoretical_loss": 3.3429944290527875, + "tokens_seen": 2752715776 + }, + { + "epoch": 9.02, + "learning_rate": 8.389167502507524e-05, + "loss": 2.4361, + "theoretical_loss": 3.342988202301896, + "tokens_seen": 2752781312 + }, + { + "epoch": 9.02, + "learning_rate": 8.388164493480442e-05, + "loss": 2.6625, + "theoretical_loss": 3.3429819757407513, + "tokens_seen": 2752846848 + }, + { + "epoch": 9.02, + "learning_rate": 8.387161484453361e-05, + "loss": 2.4013, + "theoretical_loss": 3.3429757493693417, + "tokens_seen": 2752912384 + }, + { + "epoch": 9.02, + "learning_rate": 8.38615847542628e-05, + "loss": 2.4642, + "theoretical_loss": 3.3429695231876577, + "tokens_seen": 2752977920 + }, + { + "epoch": 9.02, + "learning_rate": 8.385155466399198e-05, + "loss": 2.5945, + "theoretical_loss": 3.3429632971956895, + "tokens_seen": 2753043456 + }, + { + "epoch": 9.02, + "learning_rate": 8.384152457372117e-05, + "loss": 2.5531, + "theoretical_loss": 3.342957071393426, + "tokens_seen": 2753108992 + }, + { + "epoch": 9.02, + "learning_rate": 8.383149448345035e-05, + "loss": 2.5267, + "theoretical_loss": 3.342950845780857, + "tokens_seen": 2753174528 + }, + { + "epoch": 9.02, + "learning_rate": 8.382146439317955e-05, + "loss": 2.6416, + "theoretical_loss": 3.342944620357973, + "tokens_seen": 2753240064 + }, + { + "epoch": 9.02, + "learning_rate": 8.381143430290873e-05, + "loss": 2.3934, + "theoretical_loss": 3.3429383951247624, + "tokens_seen": 2753305600 + }, + { + "epoch": 9.03, + "learning_rate": 8.380140421263792e-05, + "loss": 2.5382, + "theoretical_loss": 3.342932170081216, + "tokens_seen": 2753371136 + }, + { + "epoch": 9.03, + "learning_rate": 8.37913741223671e-05, + "loss": 2.4531, + "theoretical_loss": 3.3429259452273232, + "tokens_seen": 2753436672 + }, + { + "epoch": 9.03, + "learning_rate": 8.37813440320963e-05, + "loss": 2.4676, + "theoretical_loss": 3.342919720563074, + "tokens_seen": 2753502208 + }, + { + "epoch": 9.03, + "learning_rate": 8.377131394182548e-05, + "loss": 2.5689, + "theoretical_loss": 3.342913496088457, + "tokens_seen": 2753567744 + }, + { + "epoch": 9.03, + "learning_rate": 8.376128385155467e-05, + "loss": 2.4843, + "theoretical_loss": 3.342907271803463, + "tokens_seen": 2753633280 + }, + { + "epoch": 9.03, + "learning_rate": 8.375125376128386e-05, + "loss": 2.2755, + "theoretical_loss": 3.3429010477080814, + "tokens_seen": 2753698816 + }, + { + "epoch": 9.03, + "learning_rate": 8.374122367101304e-05, + "loss": 2.5847, + "theoretical_loss": 3.3428948238023017, + "tokens_seen": 2753764352 + }, + { + "epoch": 9.03, + "learning_rate": 8.373119358074223e-05, + "loss": 2.432, + "theoretical_loss": 3.3428886000861135, + "tokens_seen": 2753829888 + }, + { + "epoch": 9.03, + "learning_rate": 8.372116349047141e-05, + "loss": 2.6106, + "theoretical_loss": 3.3428823765595075, + "tokens_seen": 2753895424 + }, + { + "epoch": 9.03, + "learning_rate": 8.37111334002006e-05, + "loss": 2.3883, + "theoretical_loss": 3.3428761532224724, + "tokens_seen": 2753960960 + }, + { + "epoch": 9.03, + "learning_rate": 8.370110330992979e-05, + "loss": 2.623, + "theoretical_loss": 3.342869930074998, + "tokens_seen": 2754026496 + }, + { + "epoch": 9.03, + "learning_rate": 8.369107321965898e-05, + "loss": 2.7354, + "theoretical_loss": 3.3428637071170746, + "tokens_seen": 2754092032 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3053299, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4595139026641846, + "objective/train/theoretical_loss": 3.3428590400230185, + "objective/train/tokens_used": 2774601184, + "theoretical_loss": 3.3428590400230185, + "tokens_seen": 2754141184 + }, + { + "epoch": 9.03, + "learning_rate": 8.368104312938816e-05, + "loss": 2.5975, + "theoretical_loss": 3.3428574843486913, + "tokens_seen": 2754157568 + }, + { + "epoch": 9.03, + "learning_rate": 8.367101303911736e-05, + "loss": 2.597, + "theoretical_loss": 3.3428512617698383, + "tokens_seen": 2754223104 + }, + { + "epoch": 9.03, + "learning_rate": 8.366098294884654e-05, + "loss": 2.3425, + "theoretical_loss": 3.342845039380505, + "tokens_seen": 2754288640 + }, + { + "epoch": 9.03, + "learning_rate": 8.365095285857572e-05, + "loss": 2.5006, + "theoretical_loss": 3.3428388171806813, + "tokens_seen": 2754354176 + }, + { + "epoch": 9.03, + "learning_rate": 8.364092276830492e-05, + "loss": 2.3676, + "theoretical_loss": 3.3428325951703566, + "tokens_seen": 2754419712 + }, + { + "epoch": 9.03, + "learning_rate": 8.36308926780341e-05, + "loss": 2.4105, + "theoretical_loss": 3.342826373349521, + "tokens_seen": 2754485248 + }, + { + "epoch": 9.03, + "learning_rate": 8.362086258776329e-05, + "loss": 2.4269, + "theoretical_loss": 3.342820151718164, + "tokens_seen": 2754550784 + }, + { + "epoch": 9.03, + "learning_rate": 8.361083249749247e-05, + "loss": 2.4402, + "theoretical_loss": 3.3428139302762756, + "tokens_seen": 2754616320 + }, + { + "epoch": 9.03, + "learning_rate": 8.360080240722167e-05, + "loss": 2.5332, + "theoretical_loss": 3.3428077090238455, + "tokens_seen": 2754681856 + }, + { + "epoch": 9.03, + "learning_rate": 8.359077231695085e-05, + "loss": 2.2975, + "theoretical_loss": 3.3428014879608625, + "tokens_seen": 2754747392 + }, + { + "epoch": 9.03, + "learning_rate": 8.358074222668004e-05, + "loss": 2.5401, + "theoretical_loss": 3.342795267087318, + "tokens_seen": 2754812928 + }, + { + "epoch": 9.03, + "learning_rate": 8.357071213640922e-05, + "loss": 2.4926, + "theoretical_loss": 3.3427890464031997, + "tokens_seen": 2754878464 + }, + { + "epoch": 9.03, + "learning_rate": 8.35606820461384e-05, + "loss": 2.4376, + "theoretical_loss": 3.342782825908499, + "tokens_seen": 2754944000 + }, + { + "epoch": 9.03, + "learning_rate": 8.35506519558676e-05, + "loss": 2.4278, + "theoretical_loss": 3.342776605603205, + "tokens_seen": 2755009536 + }, + { + "epoch": 9.03, + "learning_rate": 8.354062186559678e-05, + "loss": 2.437, + "theoretical_loss": 3.3427703854873076, + "tokens_seen": 2755075072 + }, + { + "epoch": 9.03, + "learning_rate": 8.353059177532598e-05, + "loss": 2.5139, + "theoretical_loss": 3.3427641655607965, + "tokens_seen": 2755140608 + }, + { + "epoch": 9.03, + "learning_rate": 8.352056168505517e-05, + "loss": 2.5385, + "theoretical_loss": 3.342757945823661, + "tokens_seen": 2755206144 + }, + { + "epoch": 9.03, + "learning_rate": 8.351053159478436e-05, + "loss": 2.6244, + "theoretical_loss": 3.3427517262758912, + "tokens_seen": 2755271680 + }, + { + "epoch": 9.03, + "learning_rate": 8.350050150451355e-05, + "loss": 2.7407, + "theoretical_loss": 3.342745506917477, + "tokens_seen": 2755337216 + }, + { + "epoch": 9.03, + "learning_rate": 8.349047141424274e-05, + "loss": 2.3523, + "theoretical_loss": 3.342739287748408, + "tokens_seen": 2755402752 + }, + { + "epoch": 9.03, + "learning_rate": 8.348044132397192e-05, + "loss": 2.3014, + "theoretical_loss": 3.3427330687686734, + "tokens_seen": 2755468288 + }, + { + "epoch": 9.03, + "learning_rate": 8.347041123370112e-05, + "loss": 2.4967, + "theoretical_loss": 3.3427268499782636, + "tokens_seen": 2755533824 + }, + { + "epoch": 9.03, + "learning_rate": 8.34603811434303e-05, + "loss": 2.1469, + "theoretical_loss": 3.3427206313771682, + "tokens_seen": 2755599360 + }, + { + "epoch": 9.03, + "learning_rate": 8.345035105315948e-05, + "loss": 2.6803, + "theoretical_loss": 3.342714412965377, + "tokens_seen": 2755664896 + }, + { + "epoch": 9.03, + "learning_rate": 8.344032096288867e-05, + "loss": 2.446, + "theoretical_loss": 3.3427081947428796, + "tokens_seen": 2755730432 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3053886, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7006075382232666, + "objective/train/theoretical_loss": 3.342703531200224, + "objective/train/tokens_used": 2776239584, + "theoretical_loss": 3.342703531200224, + "tokens_seen": 2755779584 + }, + { + "epoch": 9.03, + "learning_rate": 8.343029087261785e-05, + "loss": 2.4527, + "theoretical_loss": 3.342701976709665, + "tokens_seen": 2755795968 + }, + { + "epoch": 9.03, + "learning_rate": 8.342026078234705e-05, + "loss": 2.3498, + "theoretical_loss": 3.3426957588657245, + "tokens_seen": 2755861504 + }, + { + "epoch": 9.03, + "learning_rate": 8.341023069207623e-05, + "loss": 2.6792, + "theoretical_loss": 3.3426895412110467, + "tokens_seen": 2755927040 + }, + { + "epoch": 9.03, + "learning_rate": 8.340020060180542e-05, + "loss": 2.7355, + "theoretical_loss": 3.3426833237456215, + "tokens_seen": 2755992576 + }, + { + "epoch": 9.03, + "learning_rate": 8.33901705115346e-05, + "loss": 2.5891, + "theoretical_loss": 3.342677106469439, + "tokens_seen": 2756058112 + }, + { + "epoch": 9.03, + "learning_rate": 8.33801404212638e-05, + "loss": 2.7604, + "theoretical_loss": 3.3426708893824886, + "tokens_seen": 2756123648 + }, + { + "epoch": 9.03, + "learning_rate": 8.337011033099298e-05, + "loss": 2.2506, + "theoretical_loss": 3.3426646724847604, + "tokens_seen": 2756189184 + }, + { + "epoch": 9.03, + "learning_rate": 8.336008024072216e-05, + "loss": 2.594, + "theoretical_loss": 3.3426584557762435, + "tokens_seen": 2756254720 + }, + { + "epoch": 9.03, + "learning_rate": 8.335005015045136e-05, + "loss": 2.597, + "theoretical_loss": 3.3426522392569282, + "tokens_seen": 2756320256 + }, + { + "epoch": 9.03, + "learning_rate": 8.334002006018054e-05, + "loss": 2.245, + "theoretical_loss": 3.3426460229268042, + "tokens_seen": 2756385792 + }, + { + "epoch": 9.03, + "learning_rate": 8.332998996990973e-05, + "loss": 2.4048, + "theoretical_loss": 3.342639806785861, + "tokens_seen": 2756451328 + }, + { + "epoch": 9.03, + "learning_rate": 8.331995987963891e-05, + "loss": 2.6015, + "theoretical_loss": 3.3426335908340885, + "tokens_seen": 2756516864 + }, + { + "epoch": 9.03, + "learning_rate": 8.330992978936811e-05, + "loss": 2.5602, + "theoretical_loss": 3.3426273750714763, + "tokens_seen": 2756582400 + }, + { + "epoch": 9.03, + "learning_rate": 8.329989969909729e-05, + "loss": 2.436, + "theoretical_loss": 3.3426211594980146, + "tokens_seen": 2756647936 + }, + { + "epoch": 9.03, + "learning_rate": 8.328986960882648e-05, + "loss": 2.5681, + "theoretical_loss": 3.3426149441136928, + "tokens_seen": 2756713472 + }, + { + "epoch": 9.03, + "learning_rate": 8.327983951855567e-05, + "loss": 2.6582, + "theoretical_loss": 3.3426087289185005, + "tokens_seen": 2756779008 + }, + { + "epoch": 9.03, + "learning_rate": 8.326980942828486e-05, + "loss": 2.5048, + "theoretical_loss": 3.3426025139124276, + "tokens_seen": 2756844544 + }, + { + "epoch": 9.03, + "learning_rate": 8.325977933801404e-05, + "loss": 2.386, + "theoretical_loss": 3.342596299095464, + "tokens_seen": 2756910080 + }, + { + "epoch": 9.03, + "learning_rate": 8.324974924774322e-05, + "loss": 2.4919, + "theoretical_loss": 3.342590084467599, + "tokens_seen": 2756975616 + }, + { + "epoch": 9.03, + "learning_rate": 8.323971915747242e-05, + "loss": 2.5791, + "theoretical_loss": 3.3425838700288226, + "tokens_seen": 2757041152 + }, + { + "epoch": 9.03, + "learning_rate": 8.32296890672016e-05, + "loss": 2.4118, + "theoretical_loss": 3.342577655779125, + "tokens_seen": 2757106688 + }, + { + "epoch": 9.03, + "learning_rate": 8.32196589769308e-05, + "loss": 2.6311, + "theoretical_loss": 3.3425714417184955, + "tokens_seen": 2757172224 + }, + { + "epoch": 9.03, + "learning_rate": 8.320962888665997e-05, + "loss": 2.6556, + "theoretical_loss": 3.342565227846924, + "tokens_seen": 2757237760 + }, + { + "epoch": 9.03, + "learning_rate": 8.319959879638917e-05, + "loss": 2.5279, + "theoretical_loss": 3.3425590141644, + "tokens_seen": 2757303296 + }, + { + "epoch": 9.03, + "learning_rate": 8.318956870611835e-05, + "loss": 2.5213, + "theoretical_loss": 3.3425528006709135, + "tokens_seen": 2757368832 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3055353, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.100982904434204, + "objective/train/theoretical_loss": 3.342548140674848, + "objective/train/tokens_used": 2777877984, + "theoretical_loss": 3.342548140674848, + "tokens_seen": 2757417984 + }, + { + "epoch": 9.03, + "learning_rate": 8.317953861584754e-05, + "loss": 2.3994, + "theoretical_loss": 3.342546587366454, + "tokens_seen": 2757434368 + }, + { + "epoch": 9.03, + "learning_rate": 8.316950852557673e-05, + "loss": 2.6293, + "theoretical_loss": 3.3425403742510116, + "tokens_seen": 2757499904 + }, + { + "epoch": 9.03, + "learning_rate": 8.315947843530591e-05, + "loss": 2.5488, + "theoretical_loss": 3.342534161324576, + "tokens_seen": 2757565440 + }, + { + "epoch": 9.03, + "learning_rate": 8.31494483450351e-05, + "loss": 2.7169, + "theoretical_loss": 3.342527948587137, + "tokens_seen": 2757630976 + }, + { + "epoch": 9.03, + "learning_rate": 8.31394182547643e-05, + "loss": 2.4209, + "theoretical_loss": 3.342521736038684, + "tokens_seen": 2757696512 + }, + { + "epoch": 9.03, + "learning_rate": 8.312938816449349e-05, + "loss": 2.4986, + "theoretical_loss": 3.342515523679207, + "tokens_seen": 2757762048 + }, + { + "epoch": 9.03, + "learning_rate": 8.311935807422267e-05, + "loss": 2.4322, + "theoretical_loss": 3.3425093115086955, + "tokens_seen": 2757827584 + }, + { + "epoch": 9.03, + "learning_rate": 8.310932798395187e-05, + "loss": 2.4312, + "theoretical_loss": 3.34250309952714, + "tokens_seen": 2757893120 + }, + { + "epoch": 9.03, + "learning_rate": 8.309929789368105e-05, + "loss": 2.4108, + "theoretical_loss": 3.342496887734529, + "tokens_seen": 2757958656 + }, + { + "epoch": 9.03, + "learning_rate": 8.308926780341024e-05, + "loss": 2.4769, + "theoretical_loss": 3.342490676130854, + "tokens_seen": 2758024192 + }, + { + "epoch": 9.03, + "learning_rate": 8.307923771313942e-05, + "loss": 2.5581, + "theoretical_loss": 3.342484464716103, + "tokens_seen": 2758089728 + }, + { + "epoch": 9.03, + "learning_rate": 8.30692076228686e-05, + "loss": 2.429, + "theoretical_loss": 3.3424782534902673, + "tokens_seen": 2758155264 + }, + { + "epoch": 9.03, + "learning_rate": 8.30591775325978e-05, + "loss": 2.4346, + "theoretical_loss": 3.3424720424533354, + "tokens_seen": 2758220800 + }, + { + "epoch": 9.03, + "learning_rate": 8.304914744232698e-05, + "loss": 2.4751, + "theoretical_loss": 3.3424658316052978, + "tokens_seen": 2758286336 + }, + { + "epoch": 9.03, + "learning_rate": 8.303911735205618e-05, + "loss": 2.491, + "theoretical_loss": 3.342459620946144, + "tokens_seen": 2758351872 + }, + { + "epoch": 9.03, + "learning_rate": 8.302908726178536e-05, + "loss": 2.3891, + "theoretical_loss": 3.342453410475864, + "tokens_seen": 2758417408 + }, + { + "epoch": 9.03, + "learning_rate": 8.301905717151455e-05, + "loss": 2.5525, + "theoretical_loss": 3.3424472001944467, + "tokens_seen": 2758482944 + }, + { + "epoch": 9.03, + "learning_rate": 8.300902708124373e-05, + "loss": 2.4217, + "theoretical_loss": 3.342440990101883, + "tokens_seen": 2758548480 + }, + { + "epoch": 9.03, + "learning_rate": 8.299899699097293e-05, + "loss": 2.6379, + "theoretical_loss": 3.3424347801981624, + "tokens_seen": 2758614016 + }, + { + "epoch": 9.03, + "learning_rate": 8.298896690070211e-05, + "loss": 2.428, + "theoretical_loss": 3.342428570483275, + "tokens_seen": 2758679552 + }, + { + "epoch": 9.03, + "learning_rate": 8.29789368104313e-05, + "loss": 2.4668, + "theoretical_loss": 3.342422360957209, + "tokens_seen": 2758745088 + }, + { + "epoch": 9.03, + "learning_rate": 8.296890672016048e-05, + "loss": 2.585, + "theoretical_loss": 3.3424161516199558, + "tokens_seen": 2758810624 + }, + { + "epoch": 9.03, + "learning_rate": 8.295887662988967e-05, + "loss": 2.3631, + "theoretical_loss": 3.342409942471505, + "tokens_seen": 2758876160 + }, + { + "epoch": 9.03, + "learning_rate": 8.294884653961886e-05, + "loss": 2.6248, + "theoretical_loss": 3.342403733511845, + "tokens_seen": 2758941696 + }, + { + "epoch": 9.03, + "learning_rate": 8.293881644934804e-05, + "loss": 2.6587, + "theoretical_loss": 3.342397524740967, + "tokens_seen": 2759007232 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3056240, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.275782823562622, + "objective/train/theoretical_loss": 3.342392868286691, + "objective/train/tokens_used": 2779516384, + "theoretical_loss": 3.342392868286691, + "tokens_seen": 2759056384 + }, + { + "epoch": 9.03, + "learning_rate": 8.292878635907724e-05, + "loss": 2.591, + "theoretical_loss": 3.342391316158861, + "tokens_seen": 2759072768 + }, + { + "epoch": 9.03, + "learning_rate": 8.291875626880642e-05, + "loss": 2.3263, + "theoretical_loss": 3.342385107765516, + "tokens_seen": 2759138304 + }, + { + "epoch": 9.03, + "learning_rate": 8.290872617853561e-05, + "loss": 2.5589, + "theoretical_loss": 3.342378899560921, + "tokens_seen": 2759203840 + }, + { + "epoch": 9.03, + "learning_rate": 8.289869608826479e-05, + "loss": 2.4113, + "theoretical_loss": 3.3423726915450676, + "tokens_seen": 2759269376 + }, + { + "epoch": 9.03, + "learning_rate": 8.288866599799399e-05, + "loss": 2.6454, + "theoretical_loss": 3.342366483717944, + "tokens_seen": 2759334912 + }, + { + "epoch": 9.03, + "learning_rate": 8.287863590772317e-05, + "loss": 2.5523, + "theoretical_loss": 3.3423602760795412, + "tokens_seen": 2759400448 + }, + { + "epoch": 9.03, + "learning_rate": 8.286860581745235e-05, + "loss": 2.5768, + "theoretical_loss": 3.3423540686298483, + "tokens_seen": 2759465984 + }, + { + "epoch": 9.03, + "learning_rate": 8.285857572718154e-05, + "loss": 2.3207, + "theoretical_loss": 3.342347861368855, + "tokens_seen": 2759531520 + }, + { + "epoch": 9.03, + "learning_rate": 8.284854563691073e-05, + "loss": 2.5345, + "theoretical_loss": 3.3423416542965514, + "tokens_seen": 2759597056 + }, + { + "epoch": 9.03, + "learning_rate": 8.283851554663992e-05, + "loss": 2.4999, + "theoretical_loss": 3.3423354474129274, + "tokens_seen": 2759662592 + }, + { + "epoch": 9.03, + "learning_rate": 8.28284854563691e-05, + "loss": 2.4594, + "theoretical_loss": 3.342329240717972, + "tokens_seen": 2759728128 + }, + { + "epoch": 9.03, + "learning_rate": 8.28184553660983e-05, + "loss": 2.4725, + "theoretical_loss": 3.342323034211676, + "tokens_seen": 2759793664 + }, + { + "epoch": 9.03, + "learning_rate": 8.280842527582748e-05, + "loss": 2.3221, + "theoretical_loss": 3.3423168278940287, + "tokens_seen": 2759859200 + }, + { + "epoch": 9.03, + "learning_rate": 8.279839518555667e-05, + "loss": 2.4609, + "theoretical_loss": 3.34231062176502, + "tokens_seen": 2759924736 + }, + { + "epoch": 9.03, + "learning_rate": 8.278836509528585e-05, + "loss": 2.4242, + "theoretical_loss": 3.342304415824639, + "tokens_seen": 2759990272 + }, + { + "epoch": 9.03, + "learning_rate": 8.277833500501503e-05, + "loss": 2.3773, + "theoretical_loss": 3.342298210072877, + "tokens_seen": 2760055808 + }, + { + "epoch": 9.03, + "learning_rate": 8.276830491474424e-05, + "loss": 2.3389, + "theoretical_loss": 3.342292004509722, + "tokens_seen": 2760121344 + }, + { + "epoch": 9.03, + "learning_rate": 8.275827482447342e-05, + "loss": 2.6039, + "theoretical_loss": 3.342285799135165, + "tokens_seen": 2760186880 + }, + { + "epoch": 9.03, + "learning_rate": 8.274824473420262e-05, + "loss": 2.366, + "theoretical_loss": 3.342279593949196, + "tokens_seen": 2760252416 + }, + { + "epoch": 9.03, + "learning_rate": 8.27382146439318e-05, + "loss": 2.6736, + "theoretical_loss": 3.3422733889518037, + "tokens_seen": 2760317952 + }, + { + "epoch": 9.03, + "learning_rate": 8.2728184553661e-05, + "loss": 2.6792, + "theoretical_loss": 3.3422671841429783, + "tokens_seen": 2760383488 + }, + { + "epoch": 9.03, + "learning_rate": 8.271815446339017e-05, + "loss": 2.5877, + "theoretical_loss": 3.34226097952271, + "tokens_seen": 2760449024 + }, + { + "epoch": 9.03, + "learning_rate": 8.270812437311937e-05, + "loss": 2.2223, + "theoretical_loss": 3.3422547750909883, + "tokens_seen": 2760514560 + }, + { + "epoch": 9.03, + "learning_rate": 8.269809428284855e-05, + "loss": 2.3807, + "theoretical_loss": 3.3422485708478034, + "tokens_seen": 2760580096 + }, + { + "epoch": 9.03, + "learning_rate": 8.268806419257775e-05, + "loss": 2.7767, + "theoretical_loss": 3.342242366793144, + "tokens_seen": 2760645632 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3056889, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8164095878601074, + "objective/train/theoretical_loss": 3.342237713875864, + "objective/train/tokens_used": 2781154784, + "theoretical_loss": 3.342237713875864, + "tokens_seen": 2760694784 + }, + { + "epoch": 9.03, + "learning_rate": 8.267803410230693e-05, + "loss": 2.7167, + "theoretical_loss": 3.342236162927001, + "tokens_seen": 2760711168 + }, + { + "epoch": 9.03, + "learning_rate": 8.266800401203611e-05, + "loss": 2.5165, + "theoretical_loss": 3.342229959249364, + "tokens_seen": 2760776704 + }, + { + "epoch": 9.03, + "learning_rate": 8.26579739217653e-05, + "loss": 2.8169, + "theoretical_loss": 3.342223755760222, + "tokens_seen": 2760842240 + }, + { + "epoch": 9.03, + "learning_rate": 8.264794383149448e-05, + "loss": 2.7298, + "theoretical_loss": 3.342217552459566, + "tokens_seen": 2760907776 + }, + { + "epoch": 9.03, + "learning_rate": 8.263791374122368e-05, + "loss": 2.5019, + "theoretical_loss": 3.342211349347385, + "tokens_seen": 2760973312 + }, + { + "epoch": 9.03, + "learning_rate": 8.262788365095286e-05, + "loss": 2.6768, + "theoretical_loss": 3.342205146423669, + "tokens_seen": 2761038848 + }, + { + "epoch": 9.03, + "learning_rate": 8.261785356068205e-05, + "loss": 2.3636, + "theoretical_loss": 3.342198943688408, + "tokens_seen": 2761104384 + }, + { + "epoch": 9.03, + "learning_rate": 8.260782347041123e-05, + "loss": 2.6138, + "theoretical_loss": 3.3421927411415915, + "tokens_seen": 2761169920 + }, + { + "epoch": 9.03, + "learning_rate": 8.259779338014043e-05, + "loss": 2.6135, + "theoretical_loss": 3.342186538783209, + "tokens_seen": 2761235456 + }, + { + "epoch": 9.03, + "learning_rate": 8.258776328986961e-05, + "loss": 2.4694, + "theoretical_loss": 3.342180336613251, + "tokens_seen": 2761300992 + }, + { + "epoch": 9.03, + "learning_rate": 8.257773319959879e-05, + "loss": 2.218, + "theoretical_loss": 3.3421741346317075, + "tokens_seen": 2761366528 + }, + { + "epoch": 9.03, + "learning_rate": 8.256770310932799e-05, + "loss": 2.3897, + "theoretical_loss": 3.342167932838567, + "tokens_seen": 2761432064 + }, + { + "epoch": 9.03, + "learning_rate": 8.255767301905717e-05, + "loss": 2.6079, + "theoretical_loss": 3.342161731233821, + "tokens_seen": 2761497600 + }, + { + "epoch": 9.03, + "learning_rate": 8.254764292878636e-05, + "loss": 2.43, + "theoretical_loss": 3.342155529817458, + "tokens_seen": 2761563136 + }, + { + "epoch": 9.03, + "learning_rate": 8.253761283851554e-05, + "loss": 2.6721, + "theoretical_loss": 3.342149328589468, + "tokens_seen": 2761628672 + }, + { + "epoch": 9.03, + "learning_rate": 8.252758274824474e-05, + "loss": 2.6487, + "theoretical_loss": 3.3421431275498414, + "tokens_seen": 2761694208 + }, + { + "epoch": 9.03, + "learning_rate": 8.251755265797392e-05, + "loss": 2.6246, + "theoretical_loss": 3.3421369266985677, + "tokens_seen": 2761759744 + }, + { + "epoch": 9.03, + "learning_rate": 8.250752256770311e-05, + "loss": 2.6215, + "theoretical_loss": 3.342130726035636, + "tokens_seen": 2761825280 + }, + { + "epoch": 9.03, + "learning_rate": 8.24974924774323e-05, + "loss": 2.5355, + "theoretical_loss": 3.3421245255610375, + "tokens_seen": 2761890816 + }, + { + "epoch": 9.03, + "learning_rate": 8.248746238716148e-05, + "loss": 2.6012, + "theoretical_loss": 3.342118325274761, + "tokens_seen": 2761956352 + }, + { + "epoch": 9.03, + "learning_rate": 8.247743229689067e-05, + "loss": 2.6301, + "theoretical_loss": 3.342112125176797, + "tokens_seen": 2762021888 + }, + { + "epoch": 9.03, + "learning_rate": 8.246740220661985e-05, + "loss": 2.6137, + "theoretical_loss": 3.3421059252671346, + "tokens_seen": 2762087424 + }, + { + "epoch": 9.03, + "learning_rate": 8.245737211634905e-05, + "loss": 2.6044, + "theoretical_loss": 3.342099725545764, + "tokens_seen": 2762152960 + }, + { + "epoch": 9.03, + "learning_rate": 8.244734202607823e-05, + "loss": 2.745, + "theoretical_loss": 3.342093526012675, + "tokens_seen": 2762218496 + }, + { + "epoch": 9.03, + "learning_rate": 8.243731193580742e-05, + "loss": 2.5402, + "theoretical_loss": 3.342087326667857, + "tokens_seen": 2762284032 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3057194, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.808140754699707, + "objective/train/theoretical_loss": 3.3420826772827907, + "objective/train/tokens_used": 2782793184, + "theoretical_loss": 3.3420826772827907, + "tokens_seen": 2762333184 + }, + { + "epoch": 9.03, + "learning_rate": 8.24272818455366e-05, + "loss": 2.6042, + "theoretical_loss": 3.3420811275113005, + "tokens_seen": 2762349568 + }, + { + "epoch": 9.03, + "learning_rate": 8.24172517552658e-05, + "loss": 2.7787, + "theoretical_loss": 3.3420749285429947, + "tokens_seen": 2762415104 + }, + { + "epoch": 9.03, + "learning_rate": 8.240722166499498e-05, + "loss": 2.6483, + "theoretical_loss": 3.3420687297629303, + "tokens_seen": 2762480640 + }, + { + "epoch": 9.03, + "learning_rate": 8.239719157472417e-05, + "loss": 2.5126, + "theoretical_loss": 3.342062531171096, + "tokens_seen": 2762546176 + }, + { + "epoch": 9.03, + "learning_rate": 8.238716148445337e-05, + "loss": 2.8203, + "theoretical_loss": 3.3420563327674824, + "tokens_seen": 2762611712 + }, + { + "epoch": 9.03, + "learning_rate": 8.237713139418255e-05, + "loss": 2.6166, + "theoretical_loss": 3.3420501345520788, + "tokens_seen": 2762677248 + }, + { + "epoch": 9.03, + "learning_rate": 8.236710130391174e-05, + "loss": 2.6048, + "theoretical_loss": 3.3420439365248757, + "tokens_seen": 2762742784 + }, + { + "epoch": 9.03, + "learning_rate": 8.235707121364093e-05, + "loss": 2.6263, + "theoretical_loss": 3.342037738685862, + "tokens_seen": 2762808320 + }, + { + "epoch": 9.03, + "learning_rate": 8.234704112337012e-05, + "loss": 2.5023, + "theoretical_loss": 3.3420315410350283, + "tokens_seen": 2762873856 + }, + { + "epoch": 9.03, + "learning_rate": 8.23370110330993e-05, + "loss": 2.6478, + "theoretical_loss": 3.342025343572364, + "tokens_seen": 2762939392 + }, + { + "epoch": 9.03, + "learning_rate": 8.23269809428285e-05, + "loss": 2.4604, + "theoretical_loss": 3.3420191462978597, + "tokens_seen": 2763004928 + }, + { + "epoch": 9.03, + "learning_rate": 8.231695085255768e-05, + "loss": 2.8117, + "theoretical_loss": 3.342012949211504, + "tokens_seen": 2763070464 + }, + { + "epoch": 9.03, + "learning_rate": 8.230692076228687e-05, + "loss": 2.5738, + "theoretical_loss": 3.3420067523132873, + "tokens_seen": 2763136000 + }, + { + "epoch": 9.03, + "learning_rate": 8.229689067201605e-05, + "loss": 2.3218, + "theoretical_loss": 3.3420005556031995, + "tokens_seen": 2763201536 + }, + { + "epoch": 9.03, + "learning_rate": 8.228686058174523e-05, + "loss": 2.5205, + "theoretical_loss": 3.3419943590812307, + "tokens_seen": 2763267072 + }, + { + "epoch": 9.03, + "learning_rate": 8.227683049147443e-05, + "loss": 2.4808, + "theoretical_loss": 3.34198816274737, + "tokens_seen": 2763332608 + }, + { + "epoch": 9.03, + "learning_rate": 8.226680040120361e-05, + "loss": 2.6017, + "theoretical_loss": 3.341981966601608, + "tokens_seen": 2763398144 + }, + { + "epoch": 9.03, + "learning_rate": 8.22567703109328e-05, + "loss": 2.5444, + "theoretical_loss": 3.341975770643934, + "tokens_seen": 2763463680 + }, + { + "epoch": 9.03, + "learning_rate": 8.224674022066199e-05, + "loss": 2.5646, + "theoretical_loss": 3.341969574874338, + "tokens_seen": 2763529216 + }, + { + "epoch": 9.03, + "learning_rate": 8.223671013039118e-05, + "loss": 2.6438, + "theoretical_loss": 3.34196337929281, + "tokens_seen": 2763594752 + }, + { + "epoch": 9.03, + "learning_rate": 8.222668004012036e-05, + "loss": 2.7176, + "theoretical_loss": 3.341957183899339, + "tokens_seen": 2763660288 + }, + { + "epoch": 9.03, + "learning_rate": 8.221664994984956e-05, + "loss": 2.6072, + "theoretical_loss": 3.3419509886939163, + "tokens_seen": 2763725824 + }, + { + "epoch": 9.03, + "learning_rate": 8.220661985957874e-05, + "loss": 2.4126, + "theoretical_loss": 3.3419447936765305, + "tokens_seen": 2763791360 + }, + { + "epoch": 9.03, + "learning_rate": 8.219658976930793e-05, + "loss": 2.4675, + "theoretical_loss": 3.341938598847172, + "tokens_seen": 2763856896 + }, + { + "epoch": 9.03, + "learning_rate": 8.218655967903711e-05, + "loss": 2.7008, + "theoretical_loss": 3.3419324042058305, + "tokens_seen": 2763922432 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3057194, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.764094829559326, + "objective/train/theoretical_loss": 3.3419277583482043, + "objective/train/tokens_used": 2784431584, + "theoretical_loss": 3.3419277583482043, + "tokens_seen": 2763971584 + }, + { + "epoch": 9.03, + "learning_rate": 8.21765295887663e-05, + "loss": 2.7049, + "theoretical_loss": 3.3419262097524958, + "tokens_seen": 2763987968 + }, + { + "epoch": 9.03, + "learning_rate": 8.216649949849549e-05, + "loss": 2.5855, + "theoretical_loss": 3.341920015487158, + "tokens_seen": 2764053504 + }, + { + "epoch": 9.03, + "learning_rate": 8.215646940822467e-05, + "loss": 2.7729, + "theoretical_loss": 3.3419138214098063, + "tokens_seen": 2764119040 + }, + { + "epoch": 9.03, + "learning_rate": 8.214643931795386e-05, + "loss": 2.5923, + "theoretical_loss": 3.341907627520431, + "tokens_seen": 2764184576 + }, + { + "epoch": 9.03, + "learning_rate": 8.213640922768305e-05, + "loss": 2.4704, + "theoretical_loss": 3.341901433819022, + "tokens_seen": 2764250112 + }, + { + "epoch": 9.03, + "learning_rate": 8.212637913741224e-05, + "loss": 2.5664, + "theoretical_loss": 3.3418952403055693, + "tokens_seen": 2764315648 + }, + { + "epoch": 9.03, + "learning_rate": 8.211634904714142e-05, + "loss": 2.5618, + "theoretical_loss": 3.3418890469800617, + "tokens_seen": 2764381184 + }, + { + "epoch": 9.03, + "learning_rate": 8.210631895687062e-05, + "loss": 2.6369, + "theoretical_loss": 3.3418828538424905, + "tokens_seen": 2764446720 + }, + { + "epoch": 9.03, + "learning_rate": 8.20962888665998e-05, + "loss": 2.7687, + "theoretical_loss": 3.3418766608928445, + "tokens_seen": 2764512256 + }, + { + "epoch": 9.03, + "learning_rate": 8.208625877632898e-05, + "loss": 2.5957, + "theoretical_loss": 3.341870468131114, + "tokens_seen": 2764577792 + }, + { + "epoch": 9.03, + "learning_rate": 8.207622868605817e-05, + "loss": 2.5281, + "theoretical_loss": 3.3418642755572887, + "tokens_seen": 2764643328 + }, + { + "epoch": 9.03, + "learning_rate": 8.206619859578735e-05, + "loss": 2.8613, + "theoretical_loss": 3.3418580831713585, + "tokens_seen": 2764708864 + }, + { + "epoch": 9.03, + "learning_rate": 8.205616850551655e-05, + "loss": 2.5224, + "theoretical_loss": 3.341851890973313, + "tokens_seen": 2764774400 + }, + { + "epoch": 9.03, + "learning_rate": 8.204613841524573e-05, + "loss": 2.7024, + "theoretical_loss": 3.3418456989631427, + "tokens_seen": 2764839936 + }, + { + "epoch": 9.03, + "learning_rate": 8.203610832497492e-05, + "loss": 2.64, + "theoretical_loss": 3.341839507140837, + "tokens_seen": 2764905472 + }, + { + "epoch": 9.03, + "learning_rate": 8.20260782347041e-05, + "loss": 2.455, + "theoretical_loss": 3.3418333155063853, + "tokens_seen": 2764971008 + }, + { + "epoch": 9.03, + "learning_rate": 8.201604814443331e-05, + "loss": 2.7303, + "theoretical_loss": 3.3418271240597783, + "tokens_seen": 2765036544 + }, + { + "epoch": 9.03, + "learning_rate": 8.20060180541625e-05, + "loss": 2.6266, + "theoretical_loss": 3.341820932801005, + "tokens_seen": 2765102080 + }, + { + "epoch": 9.03, + "learning_rate": 8.199598796389168e-05, + "loss": 2.7386, + "theoretical_loss": 3.3418147417300563, + "tokens_seen": 2765167616 + }, + { + "epoch": 9.03, + "learning_rate": 8.198595787362087e-05, + "loss": 2.8061, + "theoretical_loss": 3.3418085508469213, + "tokens_seen": 2765233152 + }, + { + "epoch": 9.03, + "learning_rate": 8.197592778335005e-05, + "loss": 2.5307, + "theoretical_loss": 3.34180236015159, + "tokens_seen": 2765298688 + }, + { + "epoch": 9.03, + "learning_rate": 8.196589769307925e-05, + "loss": 2.4359, + "theoretical_loss": 3.341796169644052, + "tokens_seen": 2765364224 + }, + { + "epoch": 9.03, + "learning_rate": 8.195586760280843e-05, + "loss": 2.6798, + "theoretical_loss": 3.3417899793242976, + "tokens_seen": 2765429760 + }, + { + "epoch": 9.03, + "learning_rate": 8.194583751253762e-05, + "loss": 2.7175, + "theoretical_loss": 3.3417837891923163, + "tokens_seen": 2765495296 + }, + { + "epoch": 9.03, + "learning_rate": 8.19358074222668e-05, + "loss": 2.5466, + "theoretical_loss": 3.3417775992480983, + "tokens_seen": 2765560832 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3057950, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.230217456817627, + "objective/train/theoretical_loss": 3.341772956913148, + "objective/train/tokens_used": 2786069984, + "theoretical_loss": 3.341772956913148, + "tokens_seen": 2765609984 + }, + { + "epoch": 9.03, + "learning_rate": 8.1925777331996e-05, + "loss": 2.4395, + "theoretical_loss": 3.341771409491633, + "tokens_seen": 2765626368 + }, + { + "epoch": 9.03, + "learning_rate": 8.191574724172518e-05, + "loss": 2.687, + "theoretical_loss": 3.3417652199229106, + "tokens_seen": 2765691904 + }, + { + "epoch": 9.03, + "learning_rate": 8.190571715145437e-05, + "loss": 2.7266, + "theoretical_loss": 3.3417590305419216, + "tokens_seen": 2765757440 + }, + { + "epoch": 9.03, + "learning_rate": 8.189568706118356e-05, + "loss": 2.6091, + "theoretical_loss": 3.3417528413486544, + "tokens_seen": 2765822976 + }, + { + "epoch": 9.03, + "learning_rate": 8.188565697091274e-05, + "loss": 2.7915, + "theoretical_loss": 3.3417466523430996, + "tokens_seen": 2765888512 + }, + { + "epoch": 9.03, + "learning_rate": 8.187562688064193e-05, + "loss": 2.7074, + "theoretical_loss": 3.3417404635252472, + "tokens_seen": 2765954048 + }, + { + "epoch": 9.03, + "learning_rate": 8.186559679037111e-05, + "loss": 2.4968, + "theoretical_loss": 3.3417342748950873, + "tokens_seen": 2766019584 + }, + { + "epoch": 9.03, + "learning_rate": 8.18555667001003e-05, + "loss": 2.7055, + "theoretical_loss": 3.341728086452609, + "tokens_seen": 2766085120 + }, + { + "epoch": 9.03, + "learning_rate": 8.184553660982949e-05, + "loss": 2.5665, + "theoretical_loss": 3.3417218981978025, + "tokens_seen": 2766150656 + }, + { + "epoch": 9.03, + "learning_rate": 8.183550651955868e-05, + "loss": 2.6593, + "theoretical_loss": 3.341715710130658, + "tokens_seen": 2766216192 + }, + { + "epoch": 9.03, + "learning_rate": 8.182547642928786e-05, + "loss": 2.6148, + "theoretical_loss": 3.341709522251165, + "tokens_seen": 2766281728 + }, + { + "epoch": 9.03, + "learning_rate": 8.181544633901706e-05, + "loss": 2.4798, + "theoretical_loss": 3.3417033345593135, + "tokens_seen": 2766347264 + }, + { + "epoch": 9.03, + "learning_rate": 8.180541624874624e-05, + "loss": 2.7866, + "theoretical_loss": 3.341697147055093, + "tokens_seen": 2766412800 + }, + { + "epoch": 9.03, + "learning_rate": 8.179538615847542e-05, + "loss": 2.4542, + "theoretical_loss": 3.341690959738494, + "tokens_seen": 2766478336 + }, + { + "epoch": 9.03, + "learning_rate": 8.178535606820462e-05, + "loss": 2.9131, + "theoretical_loss": 3.3416847726095056, + "tokens_seen": 2766543872 + }, + { + "epoch": 9.03, + "learning_rate": 8.17753259779338e-05, + "loss": 2.6036, + "theoretical_loss": 3.3416785856681184, + "tokens_seen": 2766609408 + }, + { + "epoch": 9.03, + "learning_rate": 8.176529588766299e-05, + "loss": 2.7713, + "theoretical_loss": 3.341672398914322, + "tokens_seen": 2766674944 + }, + { + "epoch": 9.03, + "learning_rate": 8.175526579739217e-05, + "loss": 2.627, + "theoretical_loss": 3.3416662123481062, + "tokens_seen": 2766740480 + }, + { + "epoch": 9.03, + "learning_rate": 8.174523570712137e-05, + "loss": 2.5549, + "theoretical_loss": 3.341660025969461, + "tokens_seen": 2766806016 + }, + { + "epoch": 9.03, + "learning_rate": 8.173520561685055e-05, + "loss": 2.5296, + "theoretical_loss": 3.3416538397783757, + "tokens_seen": 2766871552 + }, + { + "epoch": 9.03, + "learning_rate": 8.172517552657974e-05, + "loss": 2.636, + "theoretical_loss": 3.3416476537748414, + "tokens_seen": 2766937088 + }, + { + "epoch": 9.03, + "learning_rate": 8.171514543630892e-05, + "loss": 2.6099, + "theoretical_loss": 3.341641467958847, + "tokens_seen": 2767002624 + }, + { + "epoch": 9.03, + "learning_rate": 8.17051153460381e-05, + "loss": 2.6633, + "theoretical_loss": 3.3416352823303823, + "tokens_seen": 2767068160 + }, + { + "epoch": 9.03, + "learning_rate": 8.16950852557673e-05, + "loss": 2.5735, + "theoretical_loss": 3.3416290968894375, + "tokens_seen": 2767133696 + }, + { + "epoch": 9.03, + "learning_rate": 8.168505516549648e-05, + "loss": 2.3601, + "theoretical_loss": 3.3416229116360023, + "tokens_seen": 2767199232 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3059390, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6719067096710205, + "objective/train/theoretical_loss": 3.3416182728189736, + "objective/train/tokens_used": 2787708384, + "theoretical_loss": 3.3416182728189736, + "tokens_seen": 2767248384 + }, + { + "epoch": 9.03, + "learning_rate": 8.167502507522568e-05, + "loss": 2.5191, + "theoretical_loss": 3.3416167265700674, + "tokens_seen": 2767264768 + }, + { + "epoch": 9.03, + "learning_rate": 8.166499498495486e-05, + "loss": 2.5792, + "theoretical_loss": 3.3416105416916215, + "tokens_seen": 2767330304 + }, + { + "epoch": 9.03, + "learning_rate": 8.165496489468405e-05, + "loss": 2.6278, + "theoretical_loss": 3.341604357000655, + "tokens_seen": 2767395840 + }, + { + "epoch": 9.03, + "learning_rate": 8.164493480441323e-05, + "loss": 2.684, + "theoretical_loss": 3.3415981724971577, + "tokens_seen": 2767461376 + }, + { + "epoch": 9.03, + "learning_rate": 8.163490471414244e-05, + "loss": 2.6501, + "theoretical_loss": 3.3415919881811194, + "tokens_seen": 2767526912 + }, + { + "epoch": 9.03, + "learning_rate": 8.162487462387162e-05, + "loss": 2.4164, + "theoretical_loss": 3.3415858040525306, + "tokens_seen": 2767592448 + }, + { + "epoch": 9.03, + "learning_rate": 8.161484453360082e-05, + "loss": 2.7205, + "theoretical_loss": 3.3415796201113803, + "tokens_seen": 2767657984 + }, + { + "epoch": 9.03, + "learning_rate": 8.160481444333e-05, + "loss": 2.6279, + "theoretical_loss": 3.341573436357659, + "tokens_seen": 2767723520 + }, + { + "epoch": 9.03, + "learning_rate": 8.159478435305918e-05, + "loss": 2.6307, + "theoretical_loss": 3.3415672527913562, + "tokens_seen": 2767789056 + }, + { + "epoch": 9.03, + "learning_rate": 8.158475426278837e-05, + "loss": 2.5388, + "theoretical_loss": 3.341561069412462, + "tokens_seen": 2767854592 + }, + { + "epoch": 9.03, + "learning_rate": 8.157472417251755e-05, + "loss": 2.6335, + "theoretical_loss": 3.341554886220966, + "tokens_seen": 2767920128 + }, + { + "epoch": 9.03, + "learning_rate": 8.156469408224675e-05, + "loss": 2.6052, + "theoretical_loss": 3.3415487032168585, + "tokens_seen": 2767985664 + }, + { + "epoch": 9.03, + "learning_rate": 8.155466399197593e-05, + "loss": 2.7198, + "theoretical_loss": 3.3415425204001292, + "tokens_seen": 2768051200 + }, + { + "epoch": 9.03, + "learning_rate": 8.154463390170512e-05, + "loss": 2.7088, + "theoretical_loss": 3.341536337770768, + "tokens_seen": 2768116736 + }, + { + "epoch": 9.03, + "learning_rate": 8.15346038114343e-05, + "loss": 2.6435, + "theoretical_loss": 3.3415301553287646, + "tokens_seen": 2768182272 + }, + { + "epoch": 9.03, + "learning_rate": 8.15245737211635e-05, + "loss": 2.54, + "theoretical_loss": 3.341523973074109, + "tokens_seen": 2768247808 + }, + { + "epoch": 9.03, + "learning_rate": 8.151454363089268e-05, + "loss": 2.7325, + "theoretical_loss": 3.3415177910067912, + "tokens_seen": 2768313344 + }, + { + "epoch": 9.03, + "learning_rate": 8.150451354062186e-05, + "loss": 2.637, + "theoretical_loss": 3.341511609126801, + "tokens_seen": 2768378880 + }, + { + "epoch": 9.03, + "learning_rate": 8.149448345035106e-05, + "loss": 2.6533, + "theoretical_loss": 3.3415054274341283, + "tokens_seen": 2768444416 + }, + { + "epoch": 9.03, + "learning_rate": 8.148445336008024e-05, + "loss": 2.7405, + "theoretical_loss": 3.341499245928763, + "tokens_seen": 2768509952 + }, + { + "epoch": 9.03, + "learning_rate": 8.147442326980943e-05, + "loss": 2.5407, + "theoretical_loss": 3.3414930646106953, + "tokens_seen": 2768575488 + }, + { + "epoch": 9.03, + "learning_rate": 8.146439317953861e-05, + "loss": 2.7002, + "theoretical_loss": 3.3414868834799143, + "tokens_seen": 2768641024 + }, + { + "epoch": 9.03, + "learning_rate": 8.145436308926781e-05, + "loss": 2.6367, + "theoretical_loss": 3.3414807025364106, + "tokens_seen": 2768706560 + }, + { + "epoch": 9.03, + "learning_rate": 8.144433299899699e-05, + "loss": 2.6046, + "theoretical_loss": 3.3414745217801736, + "tokens_seen": 2768772096 + }, + { + "epoch": 9.03, + "learning_rate": 8.143430290872618e-05, + "loss": 2.5089, + "theoretical_loss": 3.341468341211194, + "tokens_seen": 2768837632 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3060023, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7207846641540527, + "objective/train/theoretical_loss": 3.34146370590734, + "objective/train/tokens_used": 2789346784, + "theoretical_loss": 3.34146370590734, + "tokens_seen": 2768886784 + }, + { + "epoch": 9.03, + "learning_rate": 8.142427281845537e-05, + "loss": 2.4353, + "theoretical_loss": 3.341462160829461, + "tokens_seen": 2768903168 + }, + { + "epoch": 9.03, + "learning_rate": 8.141424272818456e-05, + "loss": 2.6708, + "theoretical_loss": 3.341455980634964, + "tokens_seen": 2768968704 + }, + { + "epoch": 9.03, + "learning_rate": 8.140421263791374e-05, + "loss": 2.6756, + "theoretical_loss": 3.3414498006276943, + "tokens_seen": 2769034240 + }, + { + "epoch": 9.03, + "learning_rate": 8.139418254764292e-05, + "loss": 2.6526, + "theoretical_loss": 3.341443620807641, + "tokens_seen": 2769099776 + }, + { + "epoch": 9.03, + "learning_rate": 8.138415245737212e-05, + "loss": 2.6626, + "theoretical_loss": 3.3414374411747936, + "tokens_seen": 2769165312 + }, + { + "epoch": 9.03, + "learning_rate": 8.13741223671013e-05, + "loss": 2.6203, + "theoretical_loss": 3.3414312617291424, + "tokens_seen": 2769230848 + }, + { + "epoch": 9.03, + "learning_rate": 8.13640922768305e-05, + "loss": 2.5294, + "theoretical_loss": 3.3414250824706775, + "tokens_seen": 2769296384 + }, + { + "epoch": 9.03, + "learning_rate": 8.135406218655967e-05, + "loss": 2.5487, + "theoretical_loss": 3.341418903399389, + "tokens_seen": 2769361920 + }, + { + "epoch": 9.03, + "learning_rate": 8.134403209628887e-05, + "loss": 2.5378, + "theoretical_loss": 3.341412724515266, + "tokens_seen": 2769427456 + }, + { + "epoch": 9.03, + "learning_rate": 8.133400200601805e-05, + "loss": 2.659, + "theoretical_loss": 3.341406545818299, + "tokens_seen": 2769492992 + }, + { + "epoch": 9.03, + "learning_rate": 8.132397191574724e-05, + "loss": 2.7624, + "theoretical_loss": 3.341400367308478, + "tokens_seen": 2769558528 + }, + { + "epoch": 9.03, + "learning_rate": 8.131394182547643e-05, + "loss": 2.4688, + "theoretical_loss": 3.341394188985792, + "tokens_seen": 2769624064 + }, + { + "epoch": 9.03, + "learning_rate": 8.130391173520561e-05, + "loss": 2.6025, + "theoretical_loss": 3.341388010850232, + "tokens_seen": 2769689600 + }, + { + "epoch": 9.03, + "learning_rate": 8.12938816449348e-05, + "loss": 2.5517, + "theoretical_loss": 3.3413818329017877, + "tokens_seen": 2769755136 + }, + { + "epoch": 9.03, + "learning_rate": 8.128385155466398e-05, + "loss": 2.6636, + "theoretical_loss": 3.341375655140448, + "tokens_seen": 2769820672 + }, + { + "epoch": 9.03, + "learning_rate": 8.127382146439318e-05, + "loss": 2.5374, + "theoretical_loss": 3.341369477566204, + "tokens_seen": 2769886208 + }, + { + "epoch": 9.03, + "learning_rate": 8.126379137412237e-05, + "loss": 2.7683, + "theoretical_loss": 3.3413633001790455, + "tokens_seen": 2769951744 + }, + { + "epoch": 9.03, + "learning_rate": 8.125376128385157e-05, + "loss": 2.4044, + "theoretical_loss": 3.3413571229789616, + "tokens_seen": 2770017280 + }, + { + "epoch": 9.03, + "learning_rate": 8.124373119358075e-05, + "loss": 2.7138, + "theoretical_loss": 3.3413509459659427, + "tokens_seen": 2770082816 + }, + { + "epoch": 9.03, + "learning_rate": 8.123370110330994e-05, + "loss": 2.7264, + "theoretical_loss": 3.3413447691399787, + "tokens_seen": 2770148352 + }, + { + "epoch": 9.03, + "learning_rate": 8.122367101303912e-05, + "loss": 2.6645, + "theoretical_loss": 3.34133859250106, + "tokens_seen": 2770213888 + }, + { + "epoch": 9.03, + "learning_rate": 8.12136409227683e-05, + "loss": 2.6139, + "theoretical_loss": 3.3413324160491755, + "tokens_seen": 2770279424 + }, + { + "epoch": 9.03, + "learning_rate": 8.12036108324975e-05, + "loss": 2.6431, + "theoretical_loss": 3.3413262397843155, + "tokens_seen": 2770344960 + }, + { + "epoch": 9.03, + "learning_rate": 8.119358074222668e-05, + "loss": 2.6038, + "theoretical_loss": 3.341320063706471, + "tokens_seen": 2770410496 + }, + { + "epoch": 9.03, + "learning_rate": 8.118355065195588e-05, + "loss": 2.4322, + "theoretical_loss": 3.3413138878156303, + "tokens_seen": 2770476032 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3061251, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7137176990509033, + "objective/train/theoretical_loss": 3.341309256020215, + "objective/train/tokens_used": 2790985184, + "theoretical_loss": 3.341309256020215, + "tokens_seen": 2770525184 + }, + { + "epoch": 9.03, + "learning_rate": 8.117352056168506e-05, + "loss": 2.5654, + "theoretical_loss": 3.341307712111784, + "tokens_seen": 2770541568 + }, + { + "epoch": 9.03, + "learning_rate": 8.116349047141425e-05, + "loss": 2.6307, + "theoretical_loss": 3.341301536594922, + "tokens_seen": 2770607104 + }, + { + "epoch": 9.03, + "learning_rate": 8.115346038114343e-05, + "loss": 2.8404, + "theoretical_loss": 3.3412953612650336, + "tokens_seen": 2770672640 + }, + { + "epoch": 9.03, + "learning_rate": 8.114343029087263e-05, + "loss": 2.5843, + "theoretical_loss": 3.3412891861221103, + "tokens_seen": 2770738176 + }, + { + "epoch": 9.03, + "learning_rate": 8.113340020060181e-05, + "loss": 2.6372, + "theoretical_loss": 3.3412830111661407, + "tokens_seen": 2770803712 + }, + { + "epoch": 9.03, + "learning_rate": 8.1123370110331e-05, + "loss": 2.7179, + "theoretical_loss": 3.341276836397115, + "tokens_seen": 2770869248 + }, + { + "epoch": 9.03, + "learning_rate": 8.111334002006018e-05, + "loss": 2.7756, + "theoretical_loss": 3.341270661815023, + "tokens_seen": 2770934784 + }, + { + "epoch": 9.03, + "learning_rate": 8.110330992978937e-05, + "loss": 2.6066, + "theoretical_loss": 3.341264487419855, + "tokens_seen": 2771000320 + }, + { + "epoch": 9.03, + "learning_rate": 8.109327983951856e-05, + "loss": 2.4583, + "theoretical_loss": 3.3412583132116005, + "tokens_seen": 2771065856 + }, + { + "epoch": 9.03, + "learning_rate": 8.108324974924774e-05, + "loss": 2.6058, + "theoretical_loss": 3.34125213919025, + "tokens_seen": 2771131392 + }, + { + "epoch": 9.03, + "learning_rate": 8.107321965897694e-05, + "loss": 2.781, + "theoretical_loss": 3.341245965355793, + "tokens_seen": 2771196928 + }, + { + "epoch": 9.03, + "learning_rate": 8.106318956870612e-05, + "loss": 2.6181, + "theoretical_loss": 3.3412397917082193, + "tokens_seen": 2771262464 + }, + { + "epoch": 9.03, + "learning_rate": 8.105315947843531e-05, + "loss": 2.4156, + "theoretical_loss": 3.341233618247519, + "tokens_seen": 2771328000 + }, + { + "epoch": 9.03, + "learning_rate": 8.104312938816449e-05, + "loss": 2.5989, + "theoretical_loss": 3.341227444973682, + "tokens_seen": 2771393536 + }, + { + "epoch": 9.03, + "learning_rate": 8.103309929789369e-05, + "loss": 2.6885, + "theoretical_loss": 3.3412212718866985, + "tokens_seen": 2771459072 + }, + { + "epoch": 9.03, + "learning_rate": 8.102306920762287e-05, + "loss": 2.377, + "theoretical_loss": 3.3412150989865577, + "tokens_seen": 2771524608 + }, + { + "epoch": 9.03, + "learning_rate": 8.101303911735205e-05, + "loss": 2.4531, + "theoretical_loss": 3.3412089262732505, + "tokens_seen": 2771590144 + }, + { + "epoch": 9.03, + "learning_rate": 8.100300902708124e-05, + "loss": 2.4547, + "theoretical_loss": 3.3412027537467663, + "tokens_seen": 2771655680 + }, + { + "epoch": 9.03, + "learning_rate": 8.099297893681043e-05, + "loss": 2.6835, + "theoretical_loss": 3.341196581407095, + "tokens_seen": 2771721216 + }, + { + "epoch": 9.03, + "learning_rate": 8.098294884653962e-05, + "loss": 2.5231, + "theoretical_loss": 3.341190409254226, + "tokens_seen": 2771786752 + }, + { + "epoch": 9.03, + "learning_rate": 8.09729187562688e-05, + "loss": 2.5176, + "theoretical_loss": 3.341184237288151, + "tokens_seen": 2771852288 + }, + { + "epoch": 9.03, + "learning_rate": 8.0962888665998e-05, + "loss": 2.5739, + "theoretical_loss": 3.341178065508858, + "tokens_seen": 2771917824 + }, + { + "epoch": 9.03, + "learning_rate": 8.095285857572718e-05, + "loss": 2.1606, + "theoretical_loss": 3.3411718939163375, + "tokens_seen": 2771983360 + }, + { + "epoch": 9.03, + "learning_rate": 8.094282848545637e-05, + "loss": 2.4952, + "theoretical_loss": 3.34116572251058, + "tokens_seen": 2772048896 + }, + { + "epoch": 9.03, + "learning_rate": 8.093279839518555e-05, + "loss": 2.4499, + "theoretical_loss": 3.3411595512915753, + "tokens_seen": 2772114432 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3062042, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7866768836975098, + "objective/train/theoretical_loss": 3.3411549229998716, + "objective/train/tokens_used": 2792623584, + "theoretical_loss": 3.3411549229998716, + "tokens_seen": 2772163584 + }, + { + "epoch": 9.03, + "learning_rate": 8.092276830491473e-05, + "loss": 2.6487, + "theoretical_loss": 3.3411533802593123, + "tokens_seen": 2772179968 + }, + { + "epoch": 9.03, + "learning_rate": 8.091273821464393e-05, + "loss": 2.5793, + "theoretical_loss": 3.3411472094137826, + "tokens_seen": 2772245504 + }, + { + "epoch": 9.03, + "learning_rate": 8.090270812437311e-05, + "loss": 2.4937, + "theoretical_loss": 3.3411410387549747, + "tokens_seen": 2772311040 + }, + { + "epoch": 9.03, + "learning_rate": 8.089267803410232e-05, + "loss": 2.5277, + "theoretical_loss": 3.341134868282879, + "tokens_seen": 2772376576 + }, + { + "epoch": 9.03, + "learning_rate": 8.08826479438315e-05, + "loss": 2.7706, + "theoretical_loss": 3.341128697997486, + "tokens_seen": 2772442112 + }, + { + "epoch": 9.03, + "learning_rate": 8.08726178535607e-05, + "loss": 2.6836, + "theoretical_loss": 3.341122527898785, + "tokens_seen": 2772507648 + }, + { + "epoch": 9.03, + "learning_rate": 8.086258776328987e-05, + "loss": 2.3315, + "theoretical_loss": 3.341116357986766, + "tokens_seen": 2772573184 + }, + { + "epoch": 9.03, + "learning_rate": 8.085255767301907e-05, + "loss": 2.7318, + "theoretical_loss": 3.341110188261419, + "tokens_seen": 2772638720 + }, + { + "epoch": 9.03, + "learning_rate": 8.084252758274825e-05, + "loss": 2.275, + "theoretical_loss": 3.341104018722734, + "tokens_seen": 2772704256 + }, + { + "epoch": 9.03, + "learning_rate": 8.083249749247745e-05, + "loss": 2.7011, + "theoretical_loss": 3.341097849370701, + "tokens_seen": 2772769792 + }, + { + "epoch": 9.03, + "learning_rate": 8.082246740220663e-05, + "loss": 2.52, + "theoretical_loss": 3.34109168020531, + "tokens_seen": 2772835328 + }, + { + "epoch": 9.03, + "learning_rate": 8.081243731193581e-05, + "loss": 2.5819, + "theoretical_loss": 3.3410855112265505, + "tokens_seen": 2772900864 + }, + { + "epoch": 9.03, + "learning_rate": 8.0802407221665e-05, + "loss": 2.3535, + "theoretical_loss": 3.341079342434413, + "tokens_seen": 2772966400 + }, + { + "epoch": 9.03, + "learning_rate": 8.079237713139418e-05, + "loss": 2.2812, + "theoretical_loss": 3.341073173828887, + "tokens_seen": 2773031936 + }, + { + "epoch": 9.03, + "learning_rate": 8.078234704112338e-05, + "loss": 2.6555, + "theoretical_loss": 3.341067005409963, + "tokens_seen": 2773097472 + }, + { + "epoch": 9.03, + "learning_rate": 8.077231695085256e-05, + "loss": 2.4494, + "theoretical_loss": 3.34106083717763, + "tokens_seen": 2773163008 + }, + { + "epoch": 9.03, + "learning_rate": 8.076228686058175e-05, + "loss": 2.6346, + "theoretical_loss": 3.341054669131879, + "tokens_seen": 2773228544 + }, + { + "epoch": 9.03, + "learning_rate": 8.075225677031093e-05, + "loss": 2.4257, + "theoretical_loss": 3.341048501272699, + "tokens_seen": 2773294080 + }, + { + "epoch": 9.03, + "learning_rate": 8.074222668004013e-05, + "loss": 2.7666, + "theoretical_loss": 3.341042333600081, + "tokens_seen": 2773359616 + }, + { + "epoch": 9.03, + "learning_rate": 8.073219658976931e-05, + "loss": 2.4224, + "theoretical_loss": 3.341036166114014, + "tokens_seen": 2773425152 + }, + { + "epoch": 9.03, + "learning_rate": 8.072216649949849e-05, + "loss": 2.5607, + "theoretical_loss": 3.3410299988144887, + "tokens_seen": 2773490688 + }, + { + "epoch": 9.03, + "learning_rate": 8.071213640922769e-05, + "loss": 2.718, + "theoretical_loss": 3.3410238317014946, + "tokens_seen": 2773556224 + }, + { + "epoch": 9.03, + "learning_rate": 8.070210631895687e-05, + "loss": 2.521, + "theoretical_loss": 3.3410176647750216, + "tokens_seen": 2773621760 + }, + { + "epoch": 9.03, + "learning_rate": 8.069207622868606e-05, + "loss": 2.4129, + "theoretical_loss": 3.3410114980350594, + "tokens_seen": 2773687296 + }, + { + "epoch": 9.03, + "learning_rate": 8.068204613841524e-05, + "loss": 2.635, + "theoretical_loss": 3.3410053314815986, + "tokens_seen": 2773752832 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3067302, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2883129119873047, + "objective/train/theoretical_loss": 3.341000706688889, + "objective/train/tokens_used": 2794261984, + "theoretical_loss": 3.341000706688889, + "tokens_seen": 2773801984 + }, + { + "epoch": 9.03, + "learning_rate": 8.067201604814444e-05, + "loss": 2.4711, + "theoretical_loss": 3.340999165114629, + "tokens_seen": 2773818368 + }, + { + "epoch": 9.03, + "learning_rate": 8.066198595787362e-05, + "loss": 2.383, + "theoretical_loss": 3.3409929989341407, + "tokens_seen": 2773883904 + }, + { + "epoch": 9.03, + "learning_rate": 8.065195586760281e-05, + "loss": 2.6253, + "theoretical_loss": 3.340986832940123, + "tokens_seen": 2773949440 + }, + { + "epoch": 9.03, + "learning_rate": 8.0641925777332e-05, + "loss": 2.7796, + "theoretical_loss": 3.3409806671325666, + "tokens_seen": 2774014976 + }, + { + "epoch": 9.03, + "learning_rate": 8.063189568706118e-05, + "loss": 2.6772, + "theoretical_loss": 3.3409745015114605, + "tokens_seen": 2774080512 + }, + { + "epoch": 9.03, + "learning_rate": 8.062186559679037e-05, + "loss": 2.6236, + "theoretical_loss": 3.340968336076796, + "tokens_seen": 2774146048 + }, + { + "epoch": 9.03, + "learning_rate": 8.061183550651955e-05, + "loss": 2.536, + "theoretical_loss": 3.3409621708285617, + "tokens_seen": 2774211584 + }, + { + "epoch": 9.03, + "learning_rate": 8.060180541624875e-05, + "loss": 2.735, + "theoretical_loss": 3.3409560057667482, + "tokens_seen": 2774277120 + }, + { + "epoch": 9.03, + "learning_rate": 8.059177532597793e-05, + "loss": 2.667, + "theoretical_loss": 3.340949840891346, + "tokens_seen": 2774342656 + }, + { + "epoch": 9.03, + "learning_rate": 8.058174523570712e-05, + "loss": 2.7296, + "theoretical_loss": 3.340943676202344, + "tokens_seen": 2774408192 + }, + { + "epoch": 9.03, + "learning_rate": 8.05717151454363e-05, + "loss": 2.5024, + "theoretical_loss": 3.340937511699733, + "tokens_seen": 2774473728 + }, + { + "epoch": 9.03, + "learning_rate": 8.05616850551655e-05, + "loss": 2.6737, + "theoretical_loss": 3.3409313473835023, + "tokens_seen": 2774539264 + }, + { + "epoch": 9.03, + "learning_rate": 8.055165496489468e-05, + "loss": 2.5798, + "theoretical_loss": 3.3409251832536424, + "tokens_seen": 2774604800 + }, + { + "epoch": 9.03, + "learning_rate": 8.054162487462387e-05, + "loss": 2.3861, + "theoretical_loss": 3.3409190193101432, + "tokens_seen": 2774670336 + }, + { + "epoch": 9.03, + "learning_rate": 8.053159478435305e-05, + "loss": 2.8896, + "theoretical_loss": 3.340912855552994, + "tokens_seen": 2774735872 + }, + { + "epoch": 9.03, + "learning_rate": 8.052156469408224e-05, + "loss": 2.5696, + "theoretical_loss": 3.3409066919821857, + "tokens_seen": 2774801408 + }, + { + "epoch": 9.03, + "learning_rate": 8.051153460381144e-05, + "loss": 2.5358, + "theoretical_loss": 3.340900528597708, + "tokens_seen": 2774866944 + }, + { + "epoch": 9.03, + "learning_rate": 8.050150451354063e-05, + "loss": 2.4325, + "theoretical_loss": 3.34089436539955, + "tokens_seen": 2774932480 + }, + { + "epoch": 9.03, + "learning_rate": 8.049147442326982e-05, + "loss": 2.6197, + "theoretical_loss": 3.3408882023877027, + "tokens_seen": 2774998016 + }, + { + "epoch": 9.03, + "learning_rate": 8.0481444332999e-05, + "loss": 2.6506, + "theoretical_loss": 3.340882039562156, + "tokens_seen": 2775063552 + }, + { + "epoch": 9.03, + "learning_rate": 8.04714142427282e-05, + "loss": 2.6295, + "theoretical_loss": 3.3408758769228997, + "tokens_seen": 2775129088 + }, + { + "epoch": 9.03, + "learning_rate": 8.046138415245738e-05, + "loss": 2.5523, + "theoretical_loss": 3.3408697144699233, + "tokens_seen": 2775194624 + }, + { + "epoch": 9.03, + "learning_rate": 8.045135406218657e-05, + "loss": 2.6394, + "theoretical_loss": 3.340863552203217, + "tokens_seen": 2775260160 + }, + { + "epoch": 9.03, + "learning_rate": 8.044132397191575e-05, + "loss": 2.5564, + "theoretical_loss": 3.3408573901227716, + "tokens_seen": 2775325696 + }, + { + "epoch": 9.03, + "learning_rate": 8.043129388164493e-05, + "loss": 2.5599, + "theoretical_loss": 3.340851228228576, + "tokens_seen": 2775391232 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3072487, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6910083293914795, + "objective/train/theoretical_loss": 3.34084660693015, + "objective/train/tokens_used": 2795900384, + "theoretical_loss": 3.34084660693015, + "tokens_seen": 2775440384 + }, + { + "epoch": 9.03, + "learning_rate": 8.042126379137413e-05, + "loss": 2.63, + "theoretical_loss": 3.340845066520621, + "tokens_seen": 2775456768 + }, + { + "epoch": 9.03, + "learning_rate": 8.041123370110331e-05, + "loss": 2.6937, + "theoretical_loss": 3.340838904998895, + "tokens_seen": 2775522304 + }, + { + "epoch": 9.03, + "learning_rate": 8.04012036108325e-05, + "loss": 2.5121, + "theoretical_loss": 3.3408327436633902, + "tokens_seen": 2775587840 + }, + { + "epoch": 9.03, + "learning_rate": 8.039117352056169e-05, + "loss": 2.5008, + "theoretical_loss": 3.3408265825140955, + "tokens_seen": 2775653376 + }, + { + "epoch": 9.03, + "learning_rate": 8.038114343029088e-05, + "loss": 2.4722, + "theoretical_loss": 3.3408204215510002, + "tokens_seen": 2775718912 + }, + { + "epoch": 9.03, + "learning_rate": 8.037111334002006e-05, + "loss": 2.5156, + "theoretical_loss": 3.340814260774095, + "tokens_seen": 2775784448 + }, + { + "epoch": 9.03, + "learning_rate": 8.036108324974926e-05, + "loss": 2.7847, + "theoretical_loss": 3.3408081001833705, + "tokens_seen": 2775849984 + }, + { + "epoch": 9.03, + "learning_rate": 8.035105315947844e-05, + "loss": 2.6316, + "theoretical_loss": 3.3408019397788156, + "tokens_seen": 2775915520 + }, + { + "epoch": 9.03, + "learning_rate": 8.034102306920763e-05, + "loss": 2.4175, + "theoretical_loss": 3.3407957795604206, + "tokens_seen": 2775981056 + }, + { + "epoch": 9.03, + "learning_rate": 8.033099297893681e-05, + "loss": 2.672, + "theoretical_loss": 3.3407896195281754, + "tokens_seen": 2776046592 + }, + { + "epoch": 9.03, + "learning_rate": 8.0320962888666e-05, + "loss": 2.5554, + "theoretical_loss": 3.3407834596820707, + "tokens_seen": 2776112128 + }, + { + "epoch": 9.03, + "learning_rate": 8.031093279839519e-05, + "loss": 2.5287, + "theoretical_loss": 3.3407773000220957, + "tokens_seen": 2776177664 + }, + { + "epoch": 9.03, + "learning_rate": 8.030090270812437e-05, + "loss": 2.6885, + "theoretical_loss": 3.3407711405482403, + "tokens_seen": 2776243200 + }, + { + "epoch": 9.03, + "learning_rate": 8.029087261785356e-05, + "loss": 2.5444, + "theoretical_loss": 3.340764981260495, + "tokens_seen": 2776308736 + }, + { + "epoch": 9.03, + "learning_rate": 8.028084252758275e-05, + "loss": 2.623, + "theoretical_loss": 3.3407588221588496, + "tokens_seen": 2776374272 + }, + { + "epoch": 9.03, + "learning_rate": 8.027081243731194e-05, + "loss": 2.8124, + "theoretical_loss": 3.3407526632432942, + "tokens_seen": 2776439808 + }, + { + "epoch": 9.03, + "learning_rate": 8.026078234704112e-05, + "loss": 2.6795, + "theoretical_loss": 3.3407465045138185, + "tokens_seen": 2776505344 + }, + { + "epoch": 9.03, + "learning_rate": 8.025075225677032e-05, + "loss": 2.6835, + "theoretical_loss": 3.3407403459704126, + "tokens_seen": 2776570880 + }, + { + "epoch": 9.03, + "learning_rate": 8.02407221664995e-05, + "loss": 2.5806, + "theoretical_loss": 3.3407341876130663, + "tokens_seen": 2776636416 + }, + { + "epoch": 9.03, + "learning_rate": 8.023069207622868e-05, + "loss": 2.4968, + "theoretical_loss": 3.3407280294417703, + "tokens_seen": 2776701952 + }, + { + "epoch": 9.03, + "learning_rate": 8.022066198595787e-05, + "loss": 2.6244, + "theoretical_loss": 3.3407218714565134, + "tokens_seen": 2776767488 + }, + { + "epoch": 9.03, + "learning_rate": 8.021063189568705e-05, + "loss": 2.5654, + "theoretical_loss": 3.340715713657287, + "tokens_seen": 2776833024 + }, + { + "epoch": 9.03, + "learning_rate": 8.020060180541625e-05, + "loss": 2.7385, + "theoretical_loss": 3.34070955604408, + "tokens_seen": 2776898560 + }, + { + "epoch": 9.03, + "learning_rate": 8.019057171514543e-05, + "loss": 2.7559, + "theoretical_loss": 3.340703398616883, + "tokens_seen": 2776964096 + }, + { + "epoch": 9.03, + "learning_rate": 8.018054162487462e-05, + "loss": 2.5437, + "theoretical_loss": 3.3406972413756852, + "tokens_seen": 2777029632 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3077510, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.568877935409546, + "objective/train/theoretical_loss": 3.3406926235668437, + "objective/train/tokens_used": 2797538784, + "theoretical_loss": 3.3406926235668437, + "tokens_seen": 2777078784 + }, + { + "epoch": 9.03, + "learning_rate": 8.01705115346038e-05, + "loss": 2.695, + "theoretical_loss": 3.340691084320478, + "tokens_seen": 2777095168 + }, + { + "epoch": 9.03, + "learning_rate": 8.0160481444333e-05, + "loss": 2.543, + "theoretical_loss": 3.3406849274512496, + "tokens_seen": 2777160704 + }, + { + "epoch": 9.03, + "learning_rate": 8.015045135406218e-05, + "loss": 2.6785, + "theoretical_loss": 3.3406787707679912, + "tokens_seen": 2777226240 + }, + { + "epoch": 9.03, + "learning_rate": 8.014042126379138e-05, + "loss": 2.5716, + "theoretical_loss": 3.340672614270693, + "tokens_seen": 2777291776 + }, + { + "epoch": 9.03, + "learning_rate": 8.013039117352057e-05, + "loss": 2.7329, + "theoretical_loss": 3.3406664579593444, + "tokens_seen": 2777357312 + }, + { + "epoch": 9.03, + "learning_rate": 8.012036108324975e-05, + "loss": 2.4811, + "theoretical_loss": 3.340660301833935, + "tokens_seen": 2777422848 + }, + { + "epoch": 9.03, + "learning_rate": 8.011033099297895e-05, + "loss": 2.651, + "theoretical_loss": 3.340654145894456, + "tokens_seen": 2777488384 + }, + { + "epoch": 9.03, + "learning_rate": 8.010030090270813e-05, + "loss": 2.4168, + "theoretical_loss": 3.340647990140896, + "tokens_seen": 2777553920 + }, + { + "epoch": 9.03, + "learning_rate": 8.009027081243732e-05, + "loss": 2.6692, + "theoretical_loss": 3.3406418345732463, + "tokens_seen": 2777619456 + }, + { + "epoch": 9.03, + "learning_rate": 8.00802407221665e-05, + "loss": 2.5574, + "theoretical_loss": 3.3406356791914957, + "tokens_seen": 2777684992 + }, + { + "epoch": 9.03, + "learning_rate": 8.00702106318957e-05, + "loss": 2.4916, + "theoretical_loss": 3.340629523995635, + "tokens_seen": 2777750528 + }, + { + "epoch": 9.03, + "learning_rate": 8.006018054162488e-05, + "loss": 2.5774, + "theoretical_loss": 3.3406233689856544, + "tokens_seen": 2777816064 + }, + { + "epoch": 9.03, + "learning_rate": 8.005015045135407e-05, + "loss": 2.7198, + "theoretical_loss": 3.3406172141615436, + "tokens_seen": 2777881600 + }, + { + "epoch": 9.03, + "learning_rate": 8.004012036108325e-05, + "loss": 2.5228, + "theoretical_loss": 3.340611059523292, + "tokens_seen": 2777947136 + }, + { + "epoch": 9.03, + "learning_rate": 8.003009027081244e-05, + "loss": 2.4927, + "theoretical_loss": 3.34060490507089, + "tokens_seen": 2778012672 + }, + { + "epoch": 9.03, + "learning_rate": 8.002006018054163e-05, + "loss": 2.6844, + "theoretical_loss": 3.3405987508043284, + "tokens_seen": 2778078208 + }, + { + "epoch": 9.03, + "learning_rate": 8.001003009027081e-05, + "loss": 2.6019, + "theoretical_loss": 3.3405925967235963, + "tokens_seen": 2778143744 + }, + { + "epoch": 9.03, + "learning_rate": 8e-05, + "loss": 2.4223, + "theoretical_loss": 3.3405864428286836, + "tokens_seen": 2778209280 + }, + { + "epoch": 9.03, + "learning_rate": 7.998996990972919e-05, + "loss": 2.4582, + "theoretical_loss": 3.340580289119581, + "tokens_seen": 2778274816 + }, + { + "epoch": 9.03, + "learning_rate": 7.997993981945838e-05, + "loss": 2.4935, + "theoretical_loss": 3.340574135596278, + "tokens_seen": 2778340352 + }, + { + "epoch": 9.03, + "learning_rate": 7.996990972918756e-05, + "loss": 2.5698, + "theoretical_loss": 3.3405679822587646, + "tokens_seen": 2778405888 + }, + { + "epoch": 9.03, + "learning_rate": 7.995987963891676e-05, + "loss": 2.5985, + "theoretical_loss": 3.340561829107031, + "tokens_seen": 2778471424 + }, + { + "epoch": 9.03, + "learning_rate": 7.994984954864594e-05, + "loss": 2.3722, + "theoretical_loss": 3.3405556761410673, + "tokens_seen": 2778536960 + }, + { + "epoch": 9.03, + "learning_rate": 7.993981945837512e-05, + "loss": 2.7048, + "theoretical_loss": 3.3405495233608633, + "tokens_seen": 2778602496 + }, + { + "epoch": 9.03, + "learning_rate": 7.992978936810432e-05, + "loss": 2.4426, + "theoretical_loss": 3.340543370766409, + "tokens_seen": 2778668032 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3082685, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6620914936065674, + "objective/train/theoretical_loss": 3.3405387564424607, + "objective/train/tokens_used": 2799177184, + "theoretical_loss": 3.3405387564424607, + "tokens_seen": 2778717184 + }, + { + "epoch": 9.03, + "learning_rate": 7.99197592778335e-05, + "loss": 2.6913, + "theoretical_loss": 3.340537218357695, + "tokens_seen": 2778733568 + }, + { + "epoch": 9.03, + "learning_rate": 7.990972918756269e-05, + "loss": 2.4606, + "theoretical_loss": 3.34053106613471, + "tokens_seen": 2778799104 + }, + { + "epoch": 9.03, + "learning_rate": 7.989969909729187e-05, + "loss": 2.3812, + "theoretical_loss": 3.340524914097445, + "tokens_seen": 2778864640 + }, + { + "epoch": 9.03, + "learning_rate": 7.988966900702107e-05, + "loss": 2.3834, + "theoretical_loss": 3.3405187622458903, + "tokens_seen": 2778930176 + }, + { + "epoch": 9.03, + "learning_rate": 7.987963891675025e-05, + "loss": 2.5216, + "theoretical_loss": 3.3405126105800353, + "tokens_seen": 2778995712 + }, + { + "epoch": 9.03, + "learning_rate": 7.986960882647944e-05, + "loss": 2.7411, + "theoretical_loss": 3.34050645909987, + "tokens_seen": 2779061248 + }, + { + "epoch": 9.03, + "learning_rate": 7.985957873620862e-05, + "loss": 2.4807, + "theoretical_loss": 3.3405003078053848, + "tokens_seen": 2779126784 + }, + { + "epoch": 9.03, + "learning_rate": 7.98495486459378e-05, + "loss": 2.7882, + "theoretical_loss": 3.340494156696569, + "tokens_seen": 2779192320 + }, + { + "epoch": 9.03, + "learning_rate": 7.9839518555667e-05, + "loss": 2.4295, + "theoretical_loss": 3.340488005773414, + "tokens_seen": 2779257856 + }, + { + "epoch": 9.03, + "learning_rate": 7.982948846539618e-05, + "loss": 2.5566, + "theoretical_loss": 3.3404818550359083, + "tokens_seen": 2779323392 + }, + { + "epoch": 9.03, + "learning_rate": 7.981945837512538e-05, + "loss": 2.8394, + "theoretical_loss": 3.3404757044840427, + "tokens_seen": 2779388928 + }, + { + "epoch": 9.03, + "learning_rate": 7.980942828485456e-05, + "loss": 2.5115, + "theoretical_loss": 3.340469554117807, + "tokens_seen": 2779454464 + }, + { + "epoch": 9.03, + "learning_rate": 7.979939819458375e-05, + "loss": 2.5682, + "theoretical_loss": 3.3404634039371914, + "tokens_seen": 2779520000 + }, + { + "epoch": 9.03, + "learning_rate": 7.978936810431293e-05, + "loss": 2.6734, + "theoretical_loss": 3.3404572539421857, + "tokens_seen": 2779585536 + }, + { + "epoch": 9.03, + "learning_rate": 7.977933801404213e-05, + "loss": 2.7588, + "theoretical_loss": 3.34045110413278, + "tokens_seen": 2779651072 + }, + { + "epoch": 9.03, + "learning_rate": 7.976930792377131e-05, + "loss": 2.7321, + "theoretical_loss": 3.3404449545089645, + "tokens_seen": 2779716608 + }, + { + "epoch": 9.03, + "learning_rate": 7.975927783350052e-05, + "loss": 2.5139, + "theoretical_loss": 3.340438805070729, + "tokens_seen": 2779782144 + }, + { + "epoch": 9.03, + "learning_rate": 7.97492477432297e-05, + "loss": 2.3409, + "theoretical_loss": 3.3404326558180637, + "tokens_seen": 2779847680 + }, + { + "epoch": 9.03, + "learning_rate": 7.973921765295888e-05, + "loss": 2.4864, + "theoretical_loss": 3.3404265067509584, + "tokens_seen": 2779913216 + }, + { + "epoch": 9.03, + "learning_rate": 7.972918756268807e-05, + "loss": 2.6374, + "theoretical_loss": 3.3404203578694034, + "tokens_seen": 2779978752 + }, + { + "epoch": 9.03, + "learning_rate": 7.971915747241725e-05, + "loss": 2.502, + "theoretical_loss": 3.340414209173389, + "tokens_seen": 2780044288 + }, + { + "epoch": 9.03, + "learning_rate": 7.970912738214645e-05, + "loss": 2.5791, + "theoretical_loss": 3.340408060662904, + "tokens_seen": 2780109824 + }, + { + "epoch": 9.03, + "learning_rate": 7.969909729187563e-05, + "loss": 2.729, + "theoretical_loss": 3.34040191233794, + "tokens_seen": 2780175360 + }, + { + "epoch": 9.03, + "learning_rate": 7.968906720160482e-05, + "loss": 2.5997, + "theoretical_loss": 3.340395764198486, + "tokens_seen": 2780240896 + }, + { + "epoch": 9.03, + "learning_rate": 7.9679037111334e-05, + "loss": 2.5509, + "theoretical_loss": 3.340389616244532, + "tokens_seen": 2780306432 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3083297, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.752763032913208, + "objective/train/theoretical_loss": 3.340385005400795, + "objective/train/tokens_used": 2800815584, + "theoretical_loss": 3.340385005400795, + "tokens_seen": 2780355584 + }, + { + "epoch": 9.03, + "learning_rate": 7.96690070210632e-05, + "loss": 2.6614, + "theoretical_loss": 3.3403834684760687, + "tokens_seen": 2780371968 + }, + { + "epoch": 9.03, + "learning_rate": 7.965897693079238e-05, + "loss": 2.4063, + "theoretical_loss": 3.3403773208930856, + "tokens_seen": 2780437504 + }, + { + "epoch": 9.03, + "learning_rate": 7.964894684052156e-05, + "loss": 2.6488, + "theoretical_loss": 3.3403711734955728, + "tokens_seen": 2780503040 + }, + { + "epoch": 9.03, + "learning_rate": 7.963891675025076e-05, + "loss": 2.6235, + "theoretical_loss": 3.3403650262835205, + "tokens_seen": 2780568576 + }, + { + "epoch": 9.03, + "learning_rate": 7.962888665997994e-05, + "loss": 2.5696, + "theoretical_loss": 3.3403588792569185, + "tokens_seen": 2780634112 + }, + { + "epoch": 9.03, + "learning_rate": 7.961885656970913e-05, + "loss": 2.5847, + "theoretical_loss": 3.3403527324157576, + "tokens_seen": 2780699648 + }, + { + "epoch": 9.03, + "learning_rate": 7.960882647943831e-05, + "loss": 2.6372, + "theoretical_loss": 3.3403465857600265, + "tokens_seen": 2780765184 + }, + { + "epoch": 9.03, + "learning_rate": 7.959879638916751e-05, + "loss": 2.4585, + "theoretical_loss": 3.3403404392897165, + "tokens_seen": 2780830720 + }, + { + "epoch": 9.03, + "learning_rate": 7.958876629889669e-05, + "loss": 2.6911, + "theoretical_loss": 3.3403342930048168, + "tokens_seen": 2780896256 + }, + { + "epoch": 9.03, + "learning_rate": 7.957873620862588e-05, + "loss": 2.6005, + "theoretical_loss": 3.340328146905318, + "tokens_seen": 2780961792 + }, + { + "epoch": 9.03, + "learning_rate": 7.956870611835507e-05, + "loss": 2.582, + "theoretical_loss": 3.34032200099121, + "tokens_seen": 2781027328 + }, + { + "epoch": 9.03, + "learning_rate": 7.955867602808425e-05, + "loss": 2.5728, + "theoretical_loss": 3.3403158552624825, + "tokens_seen": 2781092864 + }, + { + "epoch": 9.03, + "learning_rate": 7.954864593781344e-05, + "loss": 2.6884, + "theoretical_loss": 3.3403097097191257, + "tokens_seen": 2781158400 + }, + { + "epoch": 9.03, + "learning_rate": 7.953861584754262e-05, + "loss": 2.6768, + "theoretical_loss": 3.34030356436113, + "tokens_seen": 2781223936 + }, + { + "epoch": 9.03, + "learning_rate": 7.952858575727182e-05, + "loss": 2.446, + "theoretical_loss": 3.340297419188485, + "tokens_seen": 2781289472 + }, + { + "epoch": 9.03, + "learning_rate": 7.9518555667001e-05, + "loss": 2.5252, + "theoretical_loss": 3.3402912742011805, + "tokens_seen": 2781355008 + }, + { + "epoch": 9.03, + "learning_rate": 7.950852557673019e-05, + "loss": 2.4793, + "theoretical_loss": 3.3402851293992075, + "tokens_seen": 2781420544 + }, + { + "epoch": 9.03, + "learning_rate": 7.949849548645937e-05, + "loss": 2.4229, + "theoretical_loss": 3.3402789847825547, + "tokens_seen": 2781486080 + }, + { + "epoch": 9.03, + "learning_rate": 7.948846539618857e-05, + "loss": 2.3398, + "theoretical_loss": 3.3402728403512136, + "tokens_seen": 2781551616 + }, + { + "epoch": 9.03, + "learning_rate": 7.947843530591775e-05, + "loss": 2.3789, + "theoretical_loss": 3.3402666961051737, + "tokens_seen": 2781617152 + }, + { + "epoch": 9.03, + "learning_rate": 7.946840521564694e-05, + "loss": 2.3727, + "theoretical_loss": 3.3402605520444246, + "tokens_seen": 2781682688 + }, + { + "epoch": 9.03, + "learning_rate": 7.945837512537613e-05, + "loss": 2.6879, + "theoretical_loss": 3.3402544081689567, + "tokens_seen": 2781748224 + }, + { + "epoch": 9.03, + "learning_rate": 7.944834503510531e-05, + "loss": 2.612, + "theoretical_loss": 3.34024826447876, + "tokens_seen": 2781813760 + }, + { + "epoch": 9.03, + "learning_rate": 7.94383149448345e-05, + "loss": 2.4196, + "theoretical_loss": 3.3402421209738247, + "tokens_seen": 2781879296 + }, + { + "epoch": 9.03, + "learning_rate": 7.942828485456368e-05, + "loss": 2.7975, + "theoretical_loss": 3.3402359776541406, + "tokens_seen": 2781944832 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3084472, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4595320224761963, + "objective/train/theoretical_loss": 3.3402313702859425, + "objective/train/tokens_used": 2802453984, + "theoretical_loss": 3.3402313702859425, + "tokens_seen": 2781993984 + }, + { + "epoch": 9.03, + "learning_rate": 7.941825476429288e-05, + "loss": 2.5628, + "theoretical_loss": 3.3402298345196977, + "tokens_seen": 2782010368 + }, + { + "epoch": 9.03, + "learning_rate": 7.940822467402206e-05, + "loss": 2.4913, + "theoretical_loss": 3.340223691570486, + "tokens_seen": 2782075904 + }, + { + "epoch": 9.03, + "learning_rate": 7.939819458375125e-05, + "loss": 2.3514, + "theoretical_loss": 3.3402175488064962, + "tokens_seen": 2782141440 + }, + { + "epoch": 9.03, + "learning_rate": 7.938816449348045e-05, + "loss": 2.4313, + "theoretical_loss": 3.340211406227718, + "tokens_seen": 2782206976 + }, + { + "epoch": 9.03, + "learning_rate": 7.937813440320964e-05, + "loss": 2.4726, + "theoretical_loss": 3.3402052638341413, + "tokens_seen": 2782272512 + }, + { + "epoch": 9.03, + "learning_rate": 7.936810431293882e-05, + "loss": 2.5166, + "theoretical_loss": 3.3401991216257563, + "tokens_seen": 2782338048 + }, + { + "epoch": 9.03, + "learning_rate": 7.9358074222668e-05, + "loss": 2.6196, + "theoretical_loss": 3.3401929796025525, + "tokens_seen": 2782403584 + }, + { + "epoch": 9.03, + "learning_rate": 7.93480441323972e-05, + "loss": 2.5253, + "theoretical_loss": 3.340186837764521, + "tokens_seen": 2782469120 + }, + { + "epoch": 9.03, + "learning_rate": 7.933801404212638e-05, + "loss": 2.4653, + "theoretical_loss": 3.3401806961116507, + "tokens_seen": 2782534656 + }, + { + "epoch": 9.03, + "learning_rate": 7.932798395185558e-05, + "loss": 2.5544, + "theoretical_loss": 3.3401745546439328, + "tokens_seen": 2782600192 + }, + { + "epoch": 9.03, + "learning_rate": 7.931795386158476e-05, + "loss": 2.4939, + "theoretical_loss": 3.3401684133613565, + "tokens_seen": 2782665728 + }, + { + "epoch": 9.03, + "learning_rate": 7.930792377131395e-05, + "loss": 2.6596, + "theoretical_loss": 3.340162272263912, + "tokens_seen": 2782731264 + }, + { + "epoch": 9.03, + "learning_rate": 7.929789368104313e-05, + "loss": 2.5697, + "theoretical_loss": 3.34015613135159, + "tokens_seen": 2782796800 + }, + { + "epoch": 9.03, + "learning_rate": 7.928786359077233e-05, + "loss": 2.7127, + "theoretical_loss": 3.3401499906243797, + "tokens_seen": 2782862336 + }, + { + "epoch": 9.03, + "learning_rate": 7.927783350050151e-05, + "loss": 2.4114, + "theoretical_loss": 3.3401438500822715, + "tokens_seen": 2782927872 + }, + { + "epoch": 9.03, + "learning_rate": 7.92678034102307e-05, + "loss": 2.6629, + "theoretical_loss": 3.340137709725256, + "tokens_seen": 2782993408 + }, + { + "epoch": 9.03, + "learning_rate": 7.925777331995988e-05, + "loss": 2.4614, + "theoretical_loss": 3.3401315695533222, + "tokens_seen": 2783058944 + }, + { + "epoch": 9.03, + "learning_rate": 7.924774322968906e-05, + "loss": 2.6039, + "theoretical_loss": 3.3401254295664606, + "tokens_seen": 2783124480 + }, + { + "epoch": 9.03, + "learning_rate": 7.923771313941826e-05, + "loss": 2.7114, + "theoretical_loss": 3.3401192897646617, + "tokens_seen": 2783190016 + }, + { + "epoch": 9.03, + "learning_rate": 7.922768304914744e-05, + "loss": 2.4596, + "theoretical_loss": 3.340113150147915, + "tokens_seen": 2783255552 + }, + { + "epoch": 9.03, + "learning_rate": 7.921765295887664e-05, + "loss": 2.7177, + "theoretical_loss": 3.3401070107162116, + "tokens_seen": 2783321088 + }, + { + "epoch": 9.03, + "learning_rate": 7.920762286860582e-05, + "loss": 2.5502, + "theoretical_loss": 3.34010087146954, + "tokens_seen": 2783386624 + }, + { + "epoch": 9.03, + "learning_rate": 7.919759277833501e-05, + "loss": 2.4841, + "theoretical_loss": 3.3400947324078913, + "tokens_seen": 2783452160 + }, + { + "epoch": 9.03, + "learning_rate": 7.918756268806419e-05, + "loss": 2.5702, + "theoretical_loss": 3.3400885935312554, + "tokens_seen": 2783517696 + }, + { + "epoch": 9.03, + "learning_rate": 7.917753259779339e-05, + "loss": 2.5054, + "theoretical_loss": 3.3400824548396226, + "tokens_seen": 2783583232 + }, + { + "epoch": 9.03, + "objective/train/docs_used": 3085060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6211459636688232, + "objective/train/theoretical_loss": 3.3400778509422997, + "objective/train/tokens_used": 2804092384, + "theoretical_loss": 3.3400778509422997, + "tokens_seen": 2783632384 + }, + { + "epoch": 9.03, + "learning_rate": 7.916750250752257e-05, + "loss": 2.6224, + "theoretical_loss": 3.340076316332982, + "tokens_seen": 2783648768 + }, + { + "epoch": 9.03, + "learning_rate": 7.915747241725175e-05, + "loss": 2.6274, + "theoretical_loss": 3.3400701780113247, + "tokens_seen": 2783714304 + }, + { + "epoch": 9.03, + "learning_rate": 7.914744232698094e-05, + "loss": 2.6667, + "theoretical_loss": 3.3400640398746404, + "tokens_seen": 2783779840 + }, + { + "epoch": 9.03, + "learning_rate": 7.913741223671012e-05, + "loss": 2.5809, + "theoretical_loss": 3.340057901922919, + "tokens_seen": 2783845376 + }, + { + "epoch": 9.03, + "learning_rate": 7.912738214643932e-05, + "loss": 2.4727, + "theoretical_loss": 3.3400517641561507, + "tokens_seen": 2783910912 + }, + { + "epoch": 9.03, + "learning_rate": 7.91173520561685e-05, + "loss": 2.4556, + "theoretical_loss": 3.3400456265743257, + "tokens_seen": 2783976448 + }, + { + "epoch": 9.03, + "learning_rate": 7.91073219658977e-05, + "loss": 2.2887, + "theoretical_loss": 3.340039489177434, + "tokens_seen": 2784041984 + }, + { + "epoch": 9.03, + "learning_rate": 7.909729187562688e-05, + "loss": 2.6122, + "theoretical_loss": 3.3400333519654657, + "tokens_seen": 2784107520 + }, + { + "epoch": 9.03, + "learning_rate": 7.908726178535607e-05, + "loss": 2.6889, + "theoretical_loss": 3.340027214938411, + "tokens_seen": 2784173056 + }, + { + "epoch": 9.03, + "learning_rate": 7.907723169508525e-05, + "loss": 2.594, + "theoretical_loss": 3.34002107809626, + "tokens_seen": 2784238592 + }, + { + "epoch": 9.03, + "learning_rate": 7.906720160481443e-05, + "loss": 2.3059, + "theoretical_loss": 3.340014941439002, + "tokens_seen": 2784304128 + }, + { + "epoch": 9.03, + "learning_rate": 7.905717151454363e-05, + "loss": 2.5486, + "theoretical_loss": 3.3400088049666277, + "tokens_seen": 2784369664 + }, + { + "epoch": 9.03, + "learning_rate": 7.904714142427281e-05, + "loss": 2.6686, + "theoretical_loss": 3.3400026686791273, + "tokens_seen": 2784435200 + }, + { + "epoch": 9.03, + "learning_rate": 7.9037111334002e-05, + "loss": 2.5491, + "theoretical_loss": 3.339996532576491, + "tokens_seen": 2784500736 + }, + { + "epoch": 9.03, + "learning_rate": 7.902708124373119e-05, + "loss": 2.5154, + "theoretical_loss": 3.3399903966587083, + "tokens_seen": 2784566272 + }, + { + "epoch": 9.03, + "learning_rate": 7.901705115346038e-05, + "loss": 2.4893, + "theoretical_loss": 3.3399842609257697, + "tokens_seen": 2784631808 + }, + { + "epoch": 9.03, + "learning_rate": 7.900702106318957e-05, + "loss": 2.5609, + "theoretical_loss": 3.3399781253776655, + "tokens_seen": 2784697344 + }, + { + "epoch": 9.03, + "learning_rate": 7.899699097291877e-05, + "loss": 2.6708, + "theoretical_loss": 3.339971990014385, + "tokens_seen": 2784762880 + }, + { + "epoch": 9.03, + "learning_rate": 7.898696088264795e-05, + "loss": 2.5817, + "theoretical_loss": 3.339965854835919, + "tokens_seen": 2784828416 + }, + { + "epoch": 9.03, + "learning_rate": 7.897693079237714e-05, + "loss": 2.5771, + "theoretical_loss": 3.3399597198422573, + "tokens_seen": 2784893952 + }, + { + "epoch": 9.03, + "learning_rate": 7.896690070210633e-05, + "loss": 2.498, + "theoretical_loss": 3.3399535850333897, + "tokens_seen": 2784959488 + }, + { + "epoch": 9.03, + "learning_rate": 7.895687061183551e-05, + "loss": 2.4247, + "theoretical_loss": 3.3399474504093067, + "tokens_seen": 2785025024 + }, + { + "epoch": 9.03, + "learning_rate": 7.89468405215647e-05, + "loss": 2.6537, + "theoretical_loss": 3.339941315969998, + "tokens_seen": 2785090560 + }, + { + "epoch": 9.03, + "learning_rate": 7.893681043129388e-05, + "loss": 2.431, + "theoretical_loss": 3.339935181715455, + "tokens_seen": 2785156096 + }, + { + "epoch": 9.03, + "learning_rate": 7.892678034102308e-05, + "loss": 2.6204, + "theoretical_loss": 3.339929047645666, + "tokens_seen": 2785221632 + }, + { + "debugging/Self-BLEU-5": 0.631565009689488, + "debugging/distinct-1-grams": 0.7724014323998957, + "debugging/distinct-2-grams": 0.9572424727114127, + "debugging/entropy-1-grams": 6.373069813690105, + "debugging/entropy-2-grams": 7.591540182231531, + "debugging/length": 486.71875, + "debugging/num_segments": 32, + "debugging/score": 0.002868440217734781, + "debugging/score_std": 0.005695597969154905, + "epoch": 9.03, + "objective/train/docs_used": 3086413, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.784790515899658, + "objective/train/theoretical_loss": 3.339924447214563, + "objective/train/tokens_used": 2805730784, + "theoretical_loss": 3.339924447214563, + "tokens_seen": 2785270784 + }, + { + "epoch": 9.03, + "learning_rate": 7.891675025075226e-05, + "loss": 2.6857, + "theoretical_loss": 3.3399229137606214, + "tokens_seen": 2785287168 + }, + { + "epoch": 9.03, + "learning_rate": 7.890672016048145e-05, + "loss": 2.4288, + "theoretical_loss": 3.3399167800603125, + "tokens_seen": 2785352704 + }, + { + "epoch": 9.03, + "learning_rate": 7.889669007021063e-05, + "loss": 2.4718, + "theoretical_loss": 3.339910646544728, + "tokens_seen": 2785418240 + }, + { + "epoch": 9.03, + "learning_rate": 7.888665997993983e-05, + "loss": 2.7171, + "theoretical_loss": 3.339904513213859, + "tokens_seen": 2785483776 + }, + { + "epoch": 9.03, + "learning_rate": 7.887662988966901e-05, + "loss": 2.3859, + "theoretical_loss": 3.3398983800676953, + "tokens_seen": 2785549312 + }, + { + "epoch": 9.03, + "learning_rate": 7.886659979939819e-05, + "loss": 2.469, + "theoretical_loss": 3.339892247106227, + "tokens_seen": 2785614848 + }, + { + "epoch": 9.03, + "learning_rate": 7.885656970912739e-05, + "loss": 2.896, + "theoretical_loss": 3.3398861143294436, + "tokens_seen": 2785680384 + }, + { + "epoch": 9.03, + "learning_rate": 7.884653961885657e-05, + "loss": 2.4891, + "theoretical_loss": 3.3398799817373357, + "tokens_seen": 2785745920 + }, + { + "epoch": 9.03, + "learning_rate": 7.883650952858576e-05, + "loss": 2.492, + "theoretical_loss": 3.339873849329894, + "tokens_seen": 2785811456 + }, + { + "epoch": 9.03, + "learning_rate": 7.882647943831494e-05, + "loss": 2.716, + "theoretical_loss": 3.3398677171071074, + "tokens_seen": 2785876992 + }, + { + "epoch": 9.03, + "learning_rate": 7.881644934804414e-05, + "loss": 2.8296, + "theoretical_loss": 3.3398615850689666, + "tokens_seen": 2785942528 + }, + { + "epoch": 9.03, + "learning_rate": 7.880641925777332e-05, + "loss": 2.5828, + "theoretical_loss": 3.3398554532154616, + "tokens_seen": 2786008064 + }, + { + "epoch": 9.03, + "learning_rate": 7.879638916750251e-05, + "loss": 2.7829, + "theoretical_loss": 3.3398493215465828, + "tokens_seen": 2786073600 + }, + { + "epoch": 9.03, + "learning_rate": 7.87863590772317e-05, + "loss": 2.399, + "theoretical_loss": 3.33984319006232, + "tokens_seen": 2786139136 + }, + { + "epoch": 9.03, + "learning_rate": 7.877632898696088e-05, + "loss": 2.6177, + "theoretical_loss": 3.3398370587626633, + "tokens_seen": 2786204672 + }, + { + "epoch": 9.03, + "learning_rate": 7.876629889669007e-05, + "loss": 2.5674, + "theoretical_loss": 3.3398309276476033, + "tokens_seen": 2786270208 + }, + { + "epoch": 9.03, + "learning_rate": 7.875626880641925e-05, + "loss": 2.5447, + "theoretical_loss": 3.339824796717129, + "tokens_seen": 2786335744 + }, + { + "epoch": 9.04, + "learning_rate": 7.874623871614845e-05, + "loss": 2.6547, + "theoretical_loss": 3.3398186659712312, + "tokens_seen": 2786401280 + }, + { + "epoch": 9.04, + "learning_rate": 7.873620862587763e-05, + "loss": 2.7711, + "theoretical_loss": 3.3398125354099, + "tokens_seen": 2786466816 + }, + { + "epoch": 9.04, + "learning_rate": 7.872617853560682e-05, + "loss": 2.5262, + "theoretical_loss": 3.339806405033126, + "tokens_seen": 2786532352 + }, + { + "epoch": 9.04, + "learning_rate": 7.8716148445336e-05, + "loss": 2.3373, + "theoretical_loss": 3.3398002748408984, + "tokens_seen": 2786597888 + }, + { + "epoch": 9.04, + "learning_rate": 7.87061183550652e-05, + "loss": 2.3093, + "theoretical_loss": 3.3397941448332076, + "tokens_seen": 2786663424 + }, + { + "epoch": 9.04, + "learning_rate": 7.869608826479438e-05, + "loss": 2.6707, + "theoretical_loss": 3.339788015010044, + "tokens_seen": 2786728960 + }, + { + "epoch": 9.04, + "learning_rate": 7.868605817452357e-05, + "loss": 2.4969, + "theoretical_loss": 3.339781885371397, + "tokens_seen": 2786794496 + }, + { + "epoch": 9.04, + "learning_rate": 7.867602808425275e-05, + "loss": 2.8057, + "theoretical_loss": 3.3397757559172576, + "tokens_seen": 2786860032 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3087009, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4898712635040283, + "objective/train/theoretical_loss": 3.33977115894773, + "objective/train/tokens_used": 2807369184, + "theoretical_loss": 3.33977115894773, + "tokens_seen": 2786909184 + }, + { + "epoch": 9.04, + "learning_rate": 7.866599799398194e-05, + "loss": 2.6431, + "theoretical_loss": 3.3397696266476156, + "tokens_seen": 2786925568 + }, + { + "epoch": 9.04, + "learning_rate": 7.865596790371113e-05, + "loss": 2.6238, + "theoretical_loss": 3.3397634975624606, + "tokens_seen": 2786991104 + }, + { + "epoch": 9.04, + "learning_rate": 7.864593781344031e-05, + "loss": 2.3314, + "theoretical_loss": 3.3397573686617834, + "tokens_seen": 2787056640 + }, + { + "epoch": 9.04, + "learning_rate": 7.863590772316952e-05, + "loss": 2.3791, + "theoretical_loss": 3.3397512399455738, + "tokens_seen": 2787122176 + }, + { + "epoch": 9.04, + "learning_rate": 7.86258776328987e-05, + "loss": 2.3266, + "theoretical_loss": 3.339745111413822, + "tokens_seen": 2787187712 + }, + { + "epoch": 9.04, + "learning_rate": 7.86158475426279e-05, + "loss": 2.5768, + "theoretical_loss": 3.339738983066518, + "tokens_seen": 2787253248 + }, + { + "epoch": 9.04, + "learning_rate": 7.860581745235708e-05, + "loss": 2.3696, + "theoretical_loss": 3.3397328549036516, + "tokens_seen": 2787318784 + }, + { + "epoch": 9.04, + "learning_rate": 7.859578736208627e-05, + "loss": 2.5868, + "theoretical_loss": 3.339726726925214, + "tokens_seen": 2787384320 + }, + { + "epoch": 9.04, + "learning_rate": 7.858575727181545e-05, + "loss": 2.6459, + "theoretical_loss": 3.339720599131194, + "tokens_seen": 2787449856 + }, + { + "epoch": 9.04, + "learning_rate": 7.857572718154463e-05, + "loss": 2.6205, + "theoretical_loss": 3.3397144715215825, + "tokens_seen": 2787515392 + }, + { + "epoch": 9.04, + "learning_rate": 7.856569709127383e-05, + "loss": 2.4393, + "theoretical_loss": 3.339708344096369, + "tokens_seen": 2787580928 + }, + { + "epoch": 9.04, + "learning_rate": 7.855566700100301e-05, + "loss": 2.6849, + "theoretical_loss": 3.3397022168555446, + "tokens_seen": 2787646464 + }, + { + "epoch": 9.04, + "learning_rate": 7.85456369107322e-05, + "loss": 2.496, + "theoretical_loss": 3.339696089799099, + "tokens_seen": 2787712000 + }, + { + "epoch": 9.04, + "learning_rate": 7.853560682046139e-05, + "loss": 2.5335, + "theoretical_loss": 3.3396899629270216, + "tokens_seen": 2787777536 + }, + { + "epoch": 9.04, + "learning_rate": 7.852557673019058e-05, + "loss": 2.6411, + "theoretical_loss": 3.339683836239303, + "tokens_seen": 2787843072 + }, + { + "epoch": 9.04, + "learning_rate": 7.851554663991976e-05, + "loss": 2.5157, + "theoretical_loss": 3.339677709735934, + "tokens_seen": 2787908608 + }, + { + "epoch": 9.04, + "learning_rate": 7.850551654964896e-05, + "loss": 2.6938, + "theoretical_loss": 3.3396715834169037, + "tokens_seen": 2787974144 + }, + { + "epoch": 9.04, + "learning_rate": 7.849548645937814e-05, + "loss": 2.4611, + "theoretical_loss": 3.339665457282203, + "tokens_seen": 2788039680 + }, + { + "epoch": 9.04, + "learning_rate": 7.848545636910733e-05, + "loss": 2.3717, + "theoretical_loss": 3.3396593313318212, + "tokens_seen": 2788105216 + }, + { + "epoch": 9.04, + "learning_rate": 7.847542627883651e-05, + "loss": 2.4462, + "theoretical_loss": 3.339653205565749, + "tokens_seen": 2788170752 + }, + { + "epoch": 9.04, + "learning_rate": 7.84653961885657e-05, + "loss": 2.4147, + "theoretical_loss": 3.3396470799839766, + "tokens_seen": 2788236288 + }, + { + "epoch": 9.04, + "learning_rate": 7.845536609829489e-05, + "loss": 2.4938, + "theoretical_loss": 3.3396409545864936, + "tokens_seen": 2788301824 + }, + { + "epoch": 9.04, + "learning_rate": 7.844533600802407e-05, + "loss": 2.5327, + "theoretical_loss": 3.3396348293732907, + "tokens_seen": 2788367360 + }, + { + "epoch": 9.04, + "learning_rate": 7.843530591775326e-05, + "loss": 2.5907, + "theoretical_loss": 3.339628704344358, + "tokens_seen": 2788432896 + }, + { + "epoch": 9.04, + "learning_rate": 7.842527582748245e-05, + "loss": 2.6772, + "theoretical_loss": 3.339622579499685, + "tokens_seen": 2788498432 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3088542, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6083874702453613, + "objective/train/theoretical_loss": 3.339617985987095, + "objective/train/tokens_used": 2809007584, + "theoretical_loss": 3.339617985987095, + "tokens_seen": 2788547584 + }, + { + "epoch": 9.04, + "learning_rate": 7.841524573721164e-05, + "loss": 2.6183, + "theoretical_loss": 3.3396164548392626, + "tokens_seen": 2788563968 + }, + { + "epoch": 9.04, + "learning_rate": 7.840521564694082e-05, + "loss": 2.5305, + "theoretical_loss": 3.3396103303630804, + "tokens_seen": 2788629504 + }, + { + "epoch": 9.04, + "learning_rate": 7.839518555667002e-05, + "loss": 2.3468, + "theoretical_loss": 3.3396042060711286, + "tokens_seen": 2788695040 + }, + { + "epoch": 9.04, + "learning_rate": 7.83851554663992e-05, + "loss": 2.448, + "theoretical_loss": 3.3395980819633975, + "tokens_seen": 2788760576 + }, + { + "epoch": 9.04, + "learning_rate": 7.837512537612838e-05, + "loss": 2.5464, + "theoretical_loss": 3.3395919580398767, + "tokens_seen": 2788826112 + }, + { + "epoch": 9.04, + "learning_rate": 7.836509528585757e-05, + "loss": 2.6832, + "theoretical_loss": 3.3395858343005576, + "tokens_seen": 2788891648 + }, + { + "epoch": 9.04, + "learning_rate": 7.835506519558675e-05, + "loss": 2.4476, + "theoretical_loss": 3.339579710745429, + "tokens_seen": 2788957184 + }, + { + "epoch": 9.04, + "learning_rate": 7.834503510531595e-05, + "loss": 2.4611, + "theoretical_loss": 3.3395735873744816, + "tokens_seen": 2789022720 + }, + { + "epoch": 9.04, + "learning_rate": 7.833500501504513e-05, + "loss": 2.5181, + "theoretical_loss": 3.3395674641877053, + "tokens_seen": 2789088256 + }, + { + "epoch": 9.04, + "learning_rate": 7.832497492477432e-05, + "loss": 2.339, + "theoretical_loss": 3.3395613411850906, + "tokens_seen": 2789153792 + }, + { + "epoch": 9.04, + "learning_rate": 7.83149448345035e-05, + "loss": 2.5866, + "theoretical_loss": 3.3395552183666277, + "tokens_seen": 2789219328 + }, + { + "epoch": 9.04, + "learning_rate": 7.83049147442327e-05, + "loss": 2.3584, + "theoretical_loss": 3.339549095732306, + "tokens_seen": 2789284864 + }, + { + "epoch": 9.04, + "learning_rate": 7.829488465396188e-05, + "loss": 2.5508, + "theoretical_loss": 3.3395429732821165, + "tokens_seen": 2789350400 + }, + { + "epoch": 9.04, + "learning_rate": 7.828485456369106e-05, + "loss": 2.6086, + "theoretical_loss": 3.3395368510160486, + "tokens_seen": 2789415936 + }, + { + "epoch": 9.04, + "learning_rate": 7.827482447342026e-05, + "loss": 2.6453, + "theoretical_loss": 3.339530728934093, + "tokens_seen": 2789481472 + }, + { + "epoch": 9.04, + "learning_rate": 7.826479438314945e-05, + "loss": 2.5547, + "theoretical_loss": 3.33952460703624, + "tokens_seen": 2789547008 + }, + { + "epoch": 9.04, + "learning_rate": 7.825476429287865e-05, + "loss": 2.5398, + "theoretical_loss": 3.3395184853224786, + "tokens_seen": 2789612544 + }, + { + "epoch": 9.04, + "learning_rate": 7.824473420260783e-05, + "loss": 2.3062, + "theoretical_loss": 3.3395123637928, + "tokens_seen": 2789678080 + }, + { + "epoch": 9.04, + "learning_rate": 7.823470411233702e-05, + "loss": 2.5321, + "theoretical_loss": 3.339506242447194, + "tokens_seen": 2789743616 + }, + { + "epoch": 9.04, + "learning_rate": 7.82246740220662e-05, + "loss": 2.7037, + "theoretical_loss": 3.339500121285651, + "tokens_seen": 2789809152 + }, + { + "epoch": 9.04, + "learning_rate": 7.82146439317954e-05, + "loss": 2.6475, + "theoretical_loss": 3.339494000308161, + "tokens_seen": 2789874688 + }, + { + "epoch": 9.04, + "learning_rate": 7.820461384152458e-05, + "loss": 2.8356, + "theoretical_loss": 3.3394878795147136, + "tokens_seen": 2789940224 + }, + { + "epoch": 9.04, + "learning_rate": 7.819458375125377e-05, + "loss": 2.5824, + "theoretical_loss": 3.3394817589052996, + "tokens_seen": 2790005760 + }, + { + "epoch": 9.04, + "learning_rate": 7.818455366098295e-05, + "loss": 2.6188, + "theoretical_loss": 3.339475638479909, + "tokens_seen": 2790071296 + }, + { + "epoch": 9.04, + "learning_rate": 7.817452357071214e-05, + "loss": 2.4857, + "theoretical_loss": 3.339469518238532, + "tokens_seen": 2790136832 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3088910, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5234055519104004, + "objective/train/theoretical_loss": 3.339464928178252, + "objective/train/tokens_used": 2810645984, + "theoretical_loss": 3.339464928178252, + "tokens_seen": 2790185984 + }, + { + "epoch": 9.04, + "learning_rate": 7.816449348044133e-05, + "loss": 2.3729, + "theoretical_loss": 3.3394633981811586, + "tokens_seen": 2790202368 + }, + { + "epoch": 9.04, + "learning_rate": 7.815446339017051e-05, + "loss": 2.4597, + "theoretical_loss": 3.339457278307779, + "tokens_seen": 2790267904 + }, + { + "epoch": 9.04, + "learning_rate": 7.81444332998997e-05, + "loss": 2.5623, + "theoretical_loss": 3.3394511586183833, + "tokens_seen": 2790333440 + }, + { + "epoch": 9.04, + "learning_rate": 7.813440320962889e-05, + "loss": 2.3841, + "theoretical_loss": 3.3394450391129613, + "tokens_seen": 2790398976 + }, + { + "epoch": 9.04, + "learning_rate": 7.812437311935808e-05, + "loss": 2.477, + "theoretical_loss": 3.3394389197915038, + "tokens_seen": 2790464512 + }, + { + "epoch": 9.04, + "learning_rate": 7.811434302908726e-05, + "loss": 2.6174, + "theoretical_loss": 3.3394328006540013, + "tokens_seen": 2790530048 + }, + { + "epoch": 9.04, + "learning_rate": 7.810431293881646e-05, + "loss": 2.4909, + "theoretical_loss": 3.3394266817004423, + "tokens_seen": 2790595584 + }, + { + "epoch": 9.04, + "learning_rate": 7.809428284854564e-05, + "loss": 2.4409, + "theoretical_loss": 3.339420562930819, + "tokens_seen": 2790661120 + }, + { + "epoch": 9.04, + "learning_rate": 7.808425275827482e-05, + "loss": 2.6899, + "theoretical_loss": 3.3394144443451195, + "tokens_seen": 2790726656 + }, + { + "epoch": 9.04, + "learning_rate": 7.807422266800401e-05, + "loss": 2.6488, + "theoretical_loss": 3.3394083259433356, + "tokens_seen": 2790792192 + }, + { + "epoch": 9.04, + "learning_rate": 7.80641925777332e-05, + "loss": 2.5213, + "theoretical_loss": 3.339402207725457, + "tokens_seen": 2790857728 + }, + { + "epoch": 9.04, + "learning_rate": 7.805416248746239e-05, + "loss": 2.6839, + "theoretical_loss": 3.339396089691473, + "tokens_seen": 2790923264 + }, + { + "epoch": 9.04, + "learning_rate": 7.804413239719157e-05, + "loss": 2.5514, + "theoretical_loss": 3.339389971841375, + "tokens_seen": 2790988800 + }, + { + "epoch": 9.04, + "learning_rate": 7.803410230692077e-05, + "loss": 2.4911, + "theoretical_loss": 3.3393838541751526, + "tokens_seen": 2791054336 + }, + { + "epoch": 9.04, + "learning_rate": 7.802407221664995e-05, + "loss": 2.5271, + "theoretical_loss": 3.339377736692796, + "tokens_seen": 2791119872 + }, + { + "epoch": 9.04, + "learning_rate": 7.801404212637914e-05, + "loss": 2.5731, + "theoretical_loss": 3.339371619394295, + "tokens_seen": 2791185408 + }, + { + "epoch": 9.04, + "learning_rate": 7.800401203610832e-05, + "loss": 2.4974, + "theoretical_loss": 3.33936550227964, + "tokens_seen": 2791250944 + }, + { + "epoch": 9.04, + "learning_rate": 7.79939819458375e-05, + "loss": 2.5777, + "theoretical_loss": 3.3393593853488217, + "tokens_seen": 2791316480 + }, + { + "epoch": 9.04, + "learning_rate": 7.79839518555667e-05, + "loss": 2.5156, + "theoretical_loss": 3.3393532686018297, + "tokens_seen": 2791382016 + }, + { + "epoch": 9.04, + "learning_rate": 7.797392176529588e-05, + "loss": 2.4002, + "theoretical_loss": 3.339347152038654, + "tokens_seen": 2791447552 + }, + { + "epoch": 9.04, + "learning_rate": 7.796389167502508e-05, + "loss": 2.472, + "theoretical_loss": 3.339341035659285, + "tokens_seen": 2791513088 + }, + { + "epoch": 9.04, + "learning_rate": 7.795386158475426e-05, + "loss": 2.5738, + "theoretical_loss": 3.339334919463713, + "tokens_seen": 2791578624 + }, + { + "epoch": 9.04, + "learning_rate": 7.794383149448345e-05, + "loss": 2.6255, + "theoretical_loss": 3.339328803451928, + "tokens_seen": 2791644160 + }, + { + "epoch": 9.04, + "learning_rate": 7.793380140421263e-05, + "loss": 2.5841, + "theoretical_loss": 3.3393226876239206, + "tokens_seen": 2791709696 + }, + { + "epoch": 9.04, + "learning_rate": 7.792377131394183e-05, + "loss": 2.549, + "theoretical_loss": 3.33931657197968, + "tokens_seen": 2791775232 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3090356, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8051693439483643, + "objective/train/theoretical_loss": 3.3393119853670914, + "objective/train/tokens_used": 2812284384, + "theoretical_loss": 3.3393119853670914, + "tokens_seen": 2791824384 + }, + { + "epoch": 9.04, + "learning_rate": 7.791374122367101e-05, + "loss": 2.5421, + "theoretical_loss": 3.3393104565191973, + "tokens_seen": 2791840768 + }, + { + "epoch": 9.04, + "learning_rate": 7.79037111334002e-05, + "loss": 2.4464, + "theoretical_loss": 3.339304341242462, + "tokens_seen": 2791906304 + }, + { + "epoch": 9.04, + "learning_rate": 7.789368104312938e-05, + "loss": 2.6234, + "theoretical_loss": 3.339298226149465, + "tokens_seen": 2791971840 + }, + { + "epoch": 9.04, + "learning_rate": 7.788365095285858e-05, + "loss": 2.4585, + "theoretical_loss": 3.339292111240195, + "tokens_seen": 2792037376 + }, + { + "epoch": 9.04, + "learning_rate": 7.787362086258777e-05, + "loss": 2.5813, + "theoretical_loss": 3.3392859965146444, + "tokens_seen": 2792102912 + }, + { + "epoch": 9.04, + "learning_rate": 7.786359077231695e-05, + "loss": 2.7484, + "theoretical_loss": 3.3392798819728013, + "tokens_seen": 2792168448 + }, + { + "epoch": 9.04, + "learning_rate": 7.785356068204615e-05, + "loss": 2.2367, + "theoretical_loss": 3.3392737676146576, + "tokens_seen": 2792233984 + }, + { + "epoch": 9.04, + "learning_rate": 7.784353059177533e-05, + "loss": 2.5697, + "theoretical_loss": 3.339267653440202, + "tokens_seen": 2792299520 + }, + { + "epoch": 9.04, + "learning_rate": 7.783350050150452e-05, + "loss": 2.5588, + "theoretical_loss": 3.339261539449425, + "tokens_seen": 2792365056 + }, + { + "epoch": 9.04, + "learning_rate": 7.78234704112337e-05, + "loss": 2.5926, + "theoretical_loss": 3.3392554256423175, + "tokens_seen": 2792430592 + }, + { + "epoch": 9.04, + "learning_rate": 7.78134403209629e-05, + "loss": 2.3587, + "theoretical_loss": 3.339249312018869, + "tokens_seen": 2792496128 + }, + { + "epoch": 9.04, + "learning_rate": 7.780341023069208e-05, + "loss": 2.4122, + "theoretical_loss": 3.3392431985790703, + "tokens_seen": 2792561664 + }, + { + "epoch": 9.04, + "learning_rate": 7.779338014042126e-05, + "loss": 2.5629, + "theoretical_loss": 3.3392370853229107, + "tokens_seen": 2792627200 + }, + { + "epoch": 9.04, + "learning_rate": 7.778335005015046e-05, + "loss": 2.5419, + "theoretical_loss": 3.3392309722503812, + "tokens_seen": 2792692736 + }, + { + "epoch": 9.04, + "learning_rate": 7.777331995987964e-05, + "loss": 2.7639, + "theoretical_loss": 3.3392248593614715, + "tokens_seen": 2792758272 + }, + { + "epoch": 9.04, + "learning_rate": 7.776328986960883e-05, + "loss": 2.3276, + "theoretical_loss": 3.3392187466561714, + "tokens_seen": 2792823808 + }, + { + "epoch": 9.04, + "learning_rate": 7.775325977933801e-05, + "loss": 2.57, + "theoretical_loss": 3.339212634134472, + "tokens_seen": 2792889344 + }, + { + "epoch": 9.04, + "learning_rate": 7.774322968906721e-05, + "loss": 2.4958, + "theoretical_loss": 3.3392065217963633, + "tokens_seen": 2792954880 + }, + { + "epoch": 9.04, + "learning_rate": 7.773319959879639e-05, + "loss": 2.5493, + "theoretical_loss": 3.339200409641835, + "tokens_seen": 2793020416 + }, + { + "epoch": 9.04, + "learning_rate": 7.772316950852558e-05, + "loss": 2.4168, + "theoretical_loss": 3.3391942976708773, + "tokens_seen": 2793085952 + }, + { + "epoch": 9.04, + "learning_rate": 7.771313941825477e-05, + "loss": 2.5198, + "theoretical_loss": 3.3391881858834807, + "tokens_seen": 2793151488 + }, + { + "epoch": 9.04, + "learning_rate": 7.770310932798395e-05, + "loss": 2.4985, + "theoretical_loss": 3.339182074279635, + "tokens_seen": 2793217024 + }, + { + "epoch": 9.04, + "learning_rate": 7.769307923771314e-05, + "loss": 2.3755, + "theoretical_loss": 3.339175962859331, + "tokens_seen": 2793282560 + }, + { + "epoch": 9.04, + "learning_rate": 7.768304914744232e-05, + "loss": 2.5783, + "theoretical_loss": 3.3391698516225583, + "tokens_seen": 2793348096 + }, + { + "epoch": 9.04, + "learning_rate": 7.767301905717152e-05, + "loss": 2.4142, + "theoretical_loss": 3.3391637405693078, + "tokens_seen": 2793413632 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3091122, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.811616897583008, + "objective/train/theoretical_loss": 3.3391591573997994, + "objective/train/tokens_used": 2813922784, + "theoretical_loss": 3.3391591573997994, + "tokens_seen": 2793462784 + }, + { + "epoch": 9.04, + "learning_rate": 7.76629889669007e-05, + "loss": 2.592, + "theoretical_loss": 3.3391576296995686, + "tokens_seen": 2793479168 + }, + { + "epoch": 9.04, + "learning_rate": 7.765295887662989e-05, + "loss": 2.4272, + "theoretical_loss": 3.3391515190133316, + "tokens_seen": 2793544704 + }, + { + "epoch": 9.04, + "learning_rate": 7.764292878635907e-05, + "loss": 2.4838, + "theoretical_loss": 3.339145408510587, + "tokens_seen": 2793610240 + }, + { + "epoch": 9.04, + "learning_rate": 7.763289869608827e-05, + "loss": 2.3595, + "theoretical_loss": 3.339139298191325, + "tokens_seen": 2793675776 + }, + { + "epoch": 9.04, + "learning_rate": 7.762286860581745e-05, + "loss": 2.5583, + "theoretical_loss": 3.339133188055535, + "tokens_seen": 2793741312 + }, + { + "epoch": 9.04, + "learning_rate": 7.761283851554664e-05, + "loss": 2.5324, + "theoretical_loss": 3.3391270781032083, + "tokens_seen": 2793806848 + }, + { + "epoch": 9.04, + "learning_rate": 7.760280842527583e-05, + "loss": 2.643, + "theoretical_loss": 3.339120968334335, + "tokens_seen": 2793872384 + }, + { + "epoch": 9.04, + "learning_rate": 7.759277833500501e-05, + "loss": 2.6001, + "theoretical_loss": 3.339114858748904, + "tokens_seen": 2793937920 + }, + { + "epoch": 9.04, + "learning_rate": 7.75827482447342e-05, + "loss": 2.6193, + "theoretical_loss": 3.3391087493469067, + "tokens_seen": 2794003456 + }, + { + "epoch": 9.04, + "learning_rate": 7.757271815446338e-05, + "loss": 2.5896, + "theoretical_loss": 3.339102640128333, + "tokens_seen": 2794068992 + }, + { + "epoch": 9.04, + "learning_rate": 7.756268806419258e-05, + "loss": 2.4544, + "theoretical_loss": 3.3390965310931735, + "tokens_seen": 2794134528 + }, + { + "epoch": 9.04, + "learning_rate": 7.755265797392176e-05, + "loss": 2.4438, + "theoretical_loss": 3.3390904222414175, + "tokens_seen": 2794200064 + }, + { + "epoch": 9.04, + "learning_rate": 7.754262788365095e-05, + "loss": 2.4965, + "theoretical_loss": 3.3390843135730552, + "tokens_seen": 2794265600 + }, + { + "epoch": 9.04, + "learning_rate": 7.753259779338013e-05, + "loss": 2.3546, + "theoretical_loss": 3.339078205088078, + "tokens_seen": 2794331136 + }, + { + "epoch": 9.04, + "learning_rate": 7.752256770310933e-05, + "loss": 2.5354, + "theoretical_loss": 3.339072096786475, + "tokens_seen": 2794396672 + }, + { + "epoch": 9.04, + "learning_rate": 7.751253761283852e-05, + "loss": 2.6099, + "theoretical_loss": 3.339065988668237, + "tokens_seen": 2794462208 + }, + { + "epoch": 9.04, + "learning_rate": 7.75025075225677e-05, + "loss": 2.5188, + "theoretical_loss": 3.3390598807333536, + "tokens_seen": 2794527744 + }, + { + "epoch": 9.04, + "learning_rate": 7.74924774322969e-05, + "loss": 2.3495, + "theoretical_loss": 3.3390537729818157, + "tokens_seen": 2794593280 + }, + { + "epoch": 9.04, + "learning_rate": 7.748244734202608e-05, + "loss": 2.5788, + "theoretical_loss": 3.339047665413613, + "tokens_seen": 2794658816 + }, + { + "epoch": 9.04, + "learning_rate": 7.747241725175528e-05, + "loss": 2.4375, + "theoretical_loss": 3.3390415580287356, + "tokens_seen": 2794724352 + }, + { + "epoch": 9.04, + "learning_rate": 7.746238716148446e-05, + "loss": 2.3476, + "theoretical_loss": 3.339035450827174, + "tokens_seen": 2794789888 + }, + { + "epoch": 9.04, + "learning_rate": 7.745235707121365e-05, + "loss": 2.655, + "theoretical_loss": 3.3390293438089182, + "tokens_seen": 2794855424 + }, + { + "epoch": 9.04, + "learning_rate": 7.744232698094283e-05, + "loss": 2.4789, + "theoretical_loss": 3.3390232369739588, + "tokens_seen": 2794920960 + }, + { + "epoch": 9.04, + "learning_rate": 7.743229689067203e-05, + "loss": 2.5063, + "theoretical_loss": 3.3390171303222855, + "tokens_seen": 2794986496 + }, + { + "epoch": 9.04, + "learning_rate": 7.742226680040121e-05, + "loss": 2.5743, + "theoretical_loss": 3.339011023853889, + "tokens_seen": 2795052032 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3092623, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5476949214935303, + "objective/train/theoretical_loss": 3.3390064441228606, + "objective/train/tokens_used": 2815561184, + "theoretical_loss": 3.3390064441228606, + "tokens_seen": 2795101184 + }, + { + "epoch": 9.04, + "learning_rate": 7.74122367101304e-05, + "loss": 2.5317, + "theoretical_loss": 3.339004917568759, + "tokens_seen": 2795117568 + }, + { + "epoch": 9.04, + "learning_rate": 7.740220661985958e-05, + "loss": 2.4174, + "theoretical_loss": 3.3389988114668854, + "tokens_seen": 2795183104 + }, + { + "epoch": 9.04, + "learning_rate": 7.739217652958876e-05, + "loss": 2.4816, + "theoretical_loss": 3.3389927055482596, + "tokens_seen": 2795248640 + }, + { + "epoch": 9.04, + "learning_rate": 7.738214643931796e-05, + "loss": 2.4839, + "theoretical_loss": 3.3389865998128707, + "tokens_seen": 2795314176 + }, + { + "epoch": 9.04, + "learning_rate": 7.737211634904714e-05, + "loss": 2.5497, + "theoretical_loss": 3.33898049426071, + "tokens_seen": 2795379712 + }, + { + "epoch": 9.04, + "learning_rate": 7.736208625877634e-05, + "loss": 2.3331, + "theoretical_loss": 3.3389743888917662, + "tokens_seen": 2795445248 + }, + { + "epoch": 9.04, + "learning_rate": 7.735205616850552e-05, + "loss": 2.6082, + "theoretical_loss": 3.338968283706031, + "tokens_seen": 2795510784 + }, + { + "epoch": 9.04, + "learning_rate": 7.734202607823471e-05, + "loss": 2.3801, + "theoretical_loss": 3.3389621787034938, + "tokens_seen": 2795576320 + }, + { + "epoch": 9.04, + "learning_rate": 7.733199598796389e-05, + "loss": 2.5962, + "theoretical_loss": 3.3389560738841446, + "tokens_seen": 2795641856 + }, + { + "epoch": 9.04, + "learning_rate": 7.732196589769309e-05, + "loss": 2.5972, + "theoretical_loss": 3.3389499692479747, + "tokens_seen": 2795707392 + }, + { + "epoch": 9.04, + "learning_rate": 7.731193580742227e-05, + "loss": 2.359, + "theoretical_loss": 3.338943864794973, + "tokens_seen": 2795772928 + }, + { + "epoch": 9.04, + "learning_rate": 7.730190571715145e-05, + "loss": 2.5763, + "theoretical_loss": 3.3389377605251305, + "tokens_seen": 2795838464 + }, + { + "epoch": 9.04, + "learning_rate": 7.729187562688064e-05, + "loss": 2.5547, + "theoretical_loss": 3.338931656438437, + "tokens_seen": 2795904000 + }, + { + "epoch": 9.04, + "learning_rate": 7.728184553660982e-05, + "loss": 2.4348, + "theoretical_loss": 3.338925552534883, + "tokens_seen": 2795969536 + }, + { + "epoch": 9.04, + "learning_rate": 7.727181544633902e-05, + "loss": 2.4218, + "theoretical_loss": 3.3389194488144587, + "tokens_seen": 2796035072 + }, + { + "epoch": 9.04, + "learning_rate": 7.72617853560682e-05, + "loss": 2.5711, + "theoretical_loss": 3.338913345277154, + "tokens_seen": 2796100608 + }, + { + "epoch": 9.04, + "learning_rate": 7.72517552657974e-05, + "loss": 2.3784, + "theoretical_loss": 3.3389072419229597, + "tokens_seen": 2796166144 + }, + { + "epoch": 9.04, + "learning_rate": 7.724172517552658e-05, + "loss": 2.3532, + "theoretical_loss": 3.338901138751866, + "tokens_seen": 2796231680 + }, + { + "epoch": 9.04, + "learning_rate": 7.723169508525577e-05, + "loss": 2.288, + "theoretical_loss": 3.338895035763862, + "tokens_seen": 2796297216 + }, + { + "epoch": 9.04, + "learning_rate": 7.722166499498495e-05, + "loss": 2.4424, + "theoretical_loss": 3.338888932958939, + "tokens_seen": 2796362752 + }, + { + "epoch": 9.04, + "learning_rate": 7.721163490471413e-05, + "loss": 2.5778, + "theoretical_loss": 3.338882830337087, + "tokens_seen": 2796428288 + }, + { + "epoch": 9.04, + "learning_rate": 7.720160481444333e-05, + "loss": 2.4005, + "theoretical_loss": 3.3388767278982963, + "tokens_seen": 2796493824 + }, + { + "epoch": 9.04, + "learning_rate": 7.719157472417251e-05, + "loss": 2.4741, + "theoretical_loss": 3.3388706256425564, + "tokens_seen": 2796559360 + }, + { + "epoch": 9.04, + "learning_rate": 7.71815446339017e-05, + "loss": 2.2968, + "theoretical_loss": 3.3388645235698586, + "tokens_seen": 2796624896 + }, + { + "epoch": 9.04, + "learning_rate": 7.717151454363088e-05, + "loss": 2.714, + "theoretical_loss": 3.338858421680192, + "tokens_seen": 2796690432 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3093254, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.578052043914795, + "objective/train/theoretical_loss": 3.3388538453830514, + "objective/train/tokens_used": 2817199584, + "theoretical_loss": 3.3388538453830514, + "tokens_seen": 2796739584 + }, + { + "epoch": 9.04, + "learning_rate": 7.716148445336008e-05, + "loss": 2.4706, + "theoretical_loss": 3.3388523199735483, + "tokens_seen": 2796755968 + }, + { + "epoch": 9.04, + "learning_rate": 7.715145436308926e-05, + "loss": 2.4972, + "theoretical_loss": 3.338846218449916, + "tokens_seen": 2796821504 + }, + { + "epoch": 9.04, + "learning_rate": 7.714142427281846e-05, + "loss": 2.5701, + "theoretical_loss": 3.338840117109287, + "tokens_seen": 2796887040 + }, + { + "epoch": 9.04, + "learning_rate": 7.713139418254765e-05, + "loss": 2.4464, + "theoretical_loss": 3.33883401595165, + "tokens_seen": 2796952576 + }, + { + "epoch": 9.04, + "learning_rate": 7.712136409227684e-05, + "loss": 2.5816, + "theoretical_loss": 3.338827914976996, + "tokens_seen": 2797018112 + }, + { + "epoch": 9.04, + "learning_rate": 7.711133400200603e-05, + "loss": 2.4814, + "theoretical_loss": 3.3388218141853154, + "tokens_seen": 2797083648 + }, + { + "epoch": 9.04, + "learning_rate": 7.710130391173521e-05, + "loss": 2.2842, + "theoretical_loss": 3.338815713576598, + "tokens_seen": 2797149184 + }, + { + "epoch": 9.04, + "learning_rate": 7.70912738214644e-05, + "loss": 2.5101, + "theoretical_loss": 3.3388096131508345, + "tokens_seen": 2797214720 + }, + { + "epoch": 9.04, + "learning_rate": 7.708124373119358e-05, + "loss": 2.3743, + "theoretical_loss": 3.3388035129080142, + "tokens_seen": 2797280256 + }, + { + "epoch": 9.04, + "learning_rate": 7.707121364092278e-05, + "loss": 2.2384, + "theoretical_loss": 3.3387974128481286, + "tokens_seen": 2797345792 + }, + { + "epoch": 9.04, + "learning_rate": 7.706118355065196e-05, + "loss": 2.5621, + "theoretical_loss": 3.338791312971167, + "tokens_seen": 2797411328 + }, + { + "epoch": 9.04, + "learning_rate": 7.705115346038115e-05, + "loss": 2.5326, + "theoretical_loss": 3.3387852132771196, + "tokens_seen": 2797476864 + }, + { + "epoch": 9.04, + "learning_rate": 7.704112337011033e-05, + "loss": 2.4578, + "theoretical_loss": 3.3387791137659772, + "tokens_seen": 2797542400 + }, + { + "epoch": 9.04, + "learning_rate": 7.703109327983953e-05, + "loss": 2.469, + "theoretical_loss": 3.33877301443773, + "tokens_seen": 2797607936 + }, + { + "epoch": 9.04, + "learning_rate": 7.702106318956871e-05, + "loss": 2.5364, + "theoretical_loss": 3.3387669152923674, + "tokens_seen": 2797673472 + }, + { + "epoch": 9.04, + "learning_rate": 7.701103309929789e-05, + "loss": 2.5915, + "theoretical_loss": 3.3387608163298808, + "tokens_seen": 2797739008 + }, + { + "epoch": 9.04, + "learning_rate": 7.700100300902709e-05, + "loss": 2.4775, + "theoretical_loss": 3.3387547175502594, + "tokens_seen": 2797804544 + }, + { + "epoch": 9.04, + "learning_rate": 7.699097291875627e-05, + "loss": 2.3477, + "theoretical_loss": 3.338748618953494, + "tokens_seen": 2797870080 + }, + { + "epoch": 9.04, + "learning_rate": 7.698094282848546e-05, + "loss": 2.6582, + "theoretical_loss": 3.338742520539575, + "tokens_seen": 2797935616 + }, + { + "epoch": 9.04, + "learning_rate": 7.697091273821464e-05, + "loss": 2.5221, + "theoretical_loss": 3.338736422308492, + "tokens_seen": 2798001152 + }, + { + "epoch": 9.04, + "learning_rate": 7.696088264794384e-05, + "loss": 2.3072, + "theoretical_loss": 3.338730324260236, + "tokens_seen": 2798066688 + }, + { + "epoch": 9.04, + "learning_rate": 7.695085255767302e-05, + "loss": 2.3291, + "theoretical_loss": 3.3387242263947967, + "tokens_seen": 2798132224 + }, + { + "epoch": 9.04, + "learning_rate": 7.694082246740221e-05, + "loss": 2.47, + "theoretical_loss": 3.3387181287121646, + "tokens_seen": 2798197760 + }, + { + "epoch": 9.04, + "learning_rate": 7.69307923771314e-05, + "loss": 2.587, + "theoretical_loss": 3.3387120312123293, + "tokens_seen": 2798263296 + }, + { + "epoch": 9.04, + "learning_rate": 7.692076228686058e-05, + "loss": 2.4359, + "theoretical_loss": 3.338705933895282, + "tokens_seen": 2798328832 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3094900, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0245330333709717, + "objective/train/theoretical_loss": 3.3387013610274447, + "objective/train/tokens_used": 2818837984, + "theoretical_loss": 3.3387013610274447, + "tokens_seen": 2798377984 + }, + { + "epoch": 9.04, + "learning_rate": 7.691073219658977e-05, + "loss": 2.3763, + "theoretical_loss": 3.3386998367610126, + "tokens_seen": 2798394368 + }, + { + "epoch": 9.04, + "learning_rate": 7.690070210631895e-05, + "loss": 2.5684, + "theoretical_loss": 3.3386937398095107, + "tokens_seen": 2798459904 + }, + { + "epoch": 9.04, + "learning_rate": 7.689067201604815e-05, + "loss": 2.6606, + "theoretical_loss": 3.3386876430407675, + "tokens_seen": 2798525440 + }, + { + "epoch": 9.04, + "learning_rate": 7.688064192577733e-05, + "loss": 2.4088, + "theoretical_loss": 3.3386815464547728, + "tokens_seen": 2798590976 + }, + { + "epoch": 9.04, + "learning_rate": 7.687061183550652e-05, + "loss": 2.489, + "theoretical_loss": 3.3386754500515163, + "tokens_seen": 2798656512 + }, + { + "epoch": 9.04, + "learning_rate": 7.68605817452357e-05, + "loss": 2.3903, + "theoretical_loss": 3.3386693538309897, + "tokens_seen": 2798722048 + }, + { + "epoch": 9.04, + "learning_rate": 7.68505516549649e-05, + "loss": 2.4245, + "theoretical_loss": 3.338663257793182, + "tokens_seen": 2798787584 + }, + { + "epoch": 9.04, + "learning_rate": 7.684052156469408e-05, + "loss": 2.2944, + "theoretical_loss": 3.3386571619380834, + "tokens_seen": 2798853120 + }, + { + "epoch": 9.04, + "learning_rate": 7.683049147442327e-05, + "loss": 2.3576, + "theoretical_loss": 3.3386510662656845, + "tokens_seen": 2798918656 + }, + { + "epoch": 9.04, + "learning_rate": 7.682046138415245e-05, + "loss": 2.4516, + "theoretical_loss": 3.338644970775976, + "tokens_seen": 2798984192 + }, + { + "epoch": 9.04, + "learning_rate": 7.681043129388164e-05, + "loss": 2.5686, + "theoretical_loss": 3.3386388754689476, + "tokens_seen": 2799049728 + }, + { + "epoch": 9.04, + "learning_rate": 7.680040120361083e-05, + "loss": 2.4763, + "theoretical_loss": 3.33863278034459, + "tokens_seen": 2799115264 + }, + { + "epoch": 9.04, + "learning_rate": 7.679037111334001e-05, + "loss": 2.592, + "theoretical_loss": 3.3386266854028928, + "tokens_seen": 2799180800 + }, + { + "epoch": 9.04, + "learning_rate": 7.67803410230692e-05, + "loss": 2.4338, + "theoretical_loss": 3.3386205906438464, + "tokens_seen": 2799246336 + }, + { + "epoch": 9.04, + "learning_rate": 7.677031093279839e-05, + "loss": 2.4883, + "theoretical_loss": 3.3386144960674415, + "tokens_seen": 2799311872 + }, + { + "epoch": 9.04, + "learning_rate": 7.67602808425276e-05, + "loss": 2.4985, + "theoretical_loss": 3.338608401673668, + "tokens_seen": 2799377408 + }, + { + "epoch": 9.04, + "learning_rate": 7.675025075225678e-05, + "loss": 2.448, + "theoretical_loss": 3.338602307462516, + "tokens_seen": 2799442944 + }, + { + "epoch": 9.04, + "learning_rate": 7.674022066198597e-05, + "loss": 2.4229, + "theoretical_loss": 3.338596213433976, + "tokens_seen": 2799508480 + }, + { + "epoch": 9.04, + "learning_rate": 7.673019057171515e-05, + "loss": 2.471, + "theoretical_loss": 3.338590119588039, + "tokens_seen": 2799574016 + }, + { + "epoch": 9.04, + "learning_rate": 7.672016048144433e-05, + "loss": 2.3485, + "theoretical_loss": 3.338584025924694, + "tokens_seen": 2799639552 + }, + { + "epoch": 9.04, + "learning_rate": 7.671013039117353e-05, + "loss": 2.5994, + "theoretical_loss": 3.3385779324439313, + "tokens_seen": 2799705088 + }, + { + "epoch": 9.04, + "learning_rate": 7.670010030090271e-05, + "loss": 2.4617, + "theoretical_loss": 3.338571839145742, + "tokens_seen": 2799770624 + }, + { + "epoch": 9.04, + "learning_rate": 7.66900702106319e-05, + "loss": 2.3616, + "theoretical_loss": 3.338565746030116, + "tokens_seen": 2799836160 + }, + { + "epoch": 9.04, + "learning_rate": 7.668004012036109e-05, + "loss": 2.4393, + "theoretical_loss": 3.3385596530970436, + "tokens_seen": 2799901696 + }, + { + "epoch": 9.04, + "learning_rate": 7.667001003009028e-05, + "loss": 2.3614, + "theoretical_loss": 3.338553560346515, + "tokens_seen": 2799967232 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3095607, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.203073501586914, + "objective/train/theoretical_loss": 3.3385489909034067, + "objective/train/tokens_used": 2820476384, + "theoretical_loss": 3.3385489909034067, + "tokens_seen": 2800016384 + }, + { + "epoch": 9.04, + "learning_rate": 7.665997993981946e-05, + "loss": 2.4064, + "theoretical_loss": 3.33854746777852, + "tokens_seen": 2800032768 + }, + { + "epoch": 9.04, + "learning_rate": 7.664994984954866e-05, + "loss": 2.5696, + "theoretical_loss": 3.3385413753930497, + "tokens_seen": 2800098304 + }, + { + "epoch": 9.04, + "learning_rate": 7.663991975927784e-05, + "loss": 2.4297, + "theoretical_loss": 3.338535283190094, + "tokens_seen": 2800163840 + }, + { + "epoch": 9.04, + "learning_rate": 7.662988966900703e-05, + "loss": 2.6976, + "theoretical_loss": 3.3385291911696426, + "tokens_seen": 2800229376 + }, + { + "epoch": 9.04, + "learning_rate": 7.661985957873621e-05, + "loss": 2.6404, + "theoretical_loss": 3.3385230993316863, + "tokens_seen": 2800294912 + }, + { + "epoch": 9.04, + "learning_rate": 7.66098294884654e-05, + "loss": 2.4123, + "theoretical_loss": 3.3385170076762156, + "tokens_seen": 2800360448 + }, + { + "epoch": 9.04, + "learning_rate": 7.659979939819459e-05, + "loss": 2.6177, + "theoretical_loss": 3.3385109162032207, + "tokens_seen": 2800425984 + }, + { + "epoch": 9.04, + "learning_rate": 7.658976930792377e-05, + "loss": 2.3342, + "theoretical_loss": 3.3385048249126914, + "tokens_seen": 2800491520 + }, + { + "epoch": 9.04, + "learning_rate": 7.657973921765296e-05, + "loss": 2.4227, + "theoretical_loss": 3.3384987338046184, + "tokens_seen": 2800557056 + }, + { + "epoch": 9.04, + "learning_rate": 7.656970912738215e-05, + "loss": 2.2767, + "theoretical_loss": 3.338492642878992, + "tokens_seen": 2800622592 + }, + { + "epoch": 9.04, + "learning_rate": 7.655967903711134e-05, + "loss": 2.3927, + "theoretical_loss": 3.3384865521358016, + "tokens_seen": 2800688128 + }, + { + "epoch": 9.04, + "learning_rate": 7.654964894684052e-05, + "loss": 2.3275, + "theoretical_loss": 3.3384804615750383, + "tokens_seen": 2800753664 + }, + { + "epoch": 9.04, + "learning_rate": 7.653961885656972e-05, + "loss": 2.6519, + "theoretical_loss": 3.3384743711966927, + "tokens_seen": 2800819200 + }, + { + "epoch": 9.04, + "learning_rate": 7.65295887662989e-05, + "loss": 2.6585, + "theoretical_loss": 3.338468281000754, + "tokens_seen": 2800884736 + }, + { + "epoch": 9.04, + "learning_rate": 7.651955867602808e-05, + "loss": 2.3957, + "theoretical_loss": 3.3384621909872134, + "tokens_seen": 2800950272 + }, + { + "epoch": 9.04, + "learning_rate": 7.650952858575727e-05, + "loss": 2.3969, + "theoretical_loss": 3.3384561011560603, + "tokens_seen": 2801015808 + }, + { + "epoch": 9.04, + "learning_rate": 7.649949849548645e-05, + "loss": 2.3789, + "theoretical_loss": 3.338450011507286, + "tokens_seen": 2801081344 + }, + { + "epoch": 9.04, + "learning_rate": 7.648946840521565e-05, + "loss": 2.4124, + "theoretical_loss": 3.33844392204088, + "tokens_seen": 2801146880 + }, + { + "epoch": 9.04, + "learning_rate": 7.647943831494483e-05, + "loss": 2.6135, + "theoretical_loss": 3.338437832756833, + "tokens_seen": 2801212416 + }, + { + "epoch": 9.04, + "learning_rate": 7.646940822467402e-05, + "loss": 2.4642, + "theoretical_loss": 3.3384317436551347, + "tokens_seen": 2801277952 + }, + { + "epoch": 9.04, + "learning_rate": 7.64593781344032e-05, + "loss": 2.1682, + "theoretical_loss": 3.3384256547357762, + "tokens_seen": 2801343488 + }, + { + "epoch": 9.04, + "learning_rate": 7.64493480441324e-05, + "loss": 2.5387, + "theoretical_loss": 3.3384195659987475, + "tokens_seen": 2801409024 + }, + { + "epoch": 9.04, + "learning_rate": 7.643931795386158e-05, + "loss": 2.3915, + "theoretical_loss": 3.338413477444038, + "tokens_seen": 2801474560 + }, + { + "epoch": 9.04, + "learning_rate": 7.642928786359076e-05, + "loss": 2.3521, + "theoretical_loss": 3.338407389071639, + "tokens_seen": 2801540096 + }, + { + "epoch": 9.04, + "learning_rate": 7.641925777331996e-05, + "loss": 2.4513, + "theoretical_loss": 3.3384013008815403, + "tokens_seen": 2801605632 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3097101, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5937111377716064, + "objective/train/theoretical_loss": 3.3383967348585957, + "objective/train/tokens_used": 2822114784, + "theoretical_loss": 3.3383967348585957, + "tokens_seen": 2801654784 + }, + { + "epoch": 9.04, + "learning_rate": 7.640922768304914e-05, + "loss": 2.4766, + "theoretical_loss": 3.3383952128737326, + "tokens_seen": 2801671168 + }, + { + "epoch": 9.04, + "learning_rate": 7.639919759277833e-05, + "loss": 2.3462, + "theoretical_loss": 3.338389125048206, + "tokens_seen": 2801736704 + }, + { + "epoch": 9.04, + "learning_rate": 7.638916750250751e-05, + "loss": 2.5073, + "theoretical_loss": 3.3383830374049506, + "tokens_seen": 2801802240 + }, + { + "epoch": 9.04, + "learning_rate": 7.637913741223672e-05, + "loss": 2.5348, + "theoretical_loss": 3.3383769499439566, + "tokens_seen": 2801867776 + }, + { + "epoch": 9.04, + "learning_rate": 7.63691073219659e-05, + "loss": 2.3951, + "theoretical_loss": 3.3383708626652147, + "tokens_seen": 2801933312 + }, + { + "epoch": 9.04, + "learning_rate": 7.63590772316951e-05, + "loss": 2.4191, + "theoretical_loss": 3.3383647755687145, + "tokens_seen": 2801998848 + }, + { + "epoch": 9.04, + "learning_rate": 7.634904714142428e-05, + "loss": 2.5451, + "theoretical_loss": 3.338358688654447, + "tokens_seen": 2802064384 + }, + { + "epoch": 9.04, + "learning_rate": 7.633901705115347e-05, + "loss": 2.5796, + "theoretical_loss": 3.338352601922402, + "tokens_seen": 2802129920 + }, + { + "epoch": 9.04, + "learning_rate": 7.632898696088265e-05, + "loss": 2.5613, + "theoretical_loss": 3.338346515372571, + "tokens_seen": 2802195456 + }, + { + "epoch": 9.04, + "learning_rate": 7.631895687061184e-05, + "loss": 2.444, + "theoretical_loss": 3.3383404290049423, + "tokens_seen": 2802260992 + }, + { + "epoch": 9.04, + "learning_rate": 7.630892678034103e-05, + "loss": 2.3734, + "theoretical_loss": 3.338334342819507, + "tokens_seen": 2802326528 + }, + { + "epoch": 9.04, + "learning_rate": 7.629889669007021e-05, + "loss": 2.6362, + "theoretical_loss": 3.338328256816256, + "tokens_seen": 2802392064 + }, + { + "epoch": 9.04, + "learning_rate": 7.62888665997994e-05, + "loss": 2.4708, + "theoretical_loss": 3.3383221709951787, + "tokens_seen": 2802457600 + }, + { + "epoch": 9.04, + "learning_rate": 7.627883650952859e-05, + "loss": 2.6388, + "theoretical_loss": 3.3383160853562663, + "tokens_seen": 2802523136 + }, + { + "epoch": 9.04, + "learning_rate": 7.626880641925778e-05, + "loss": 2.4284, + "theoretical_loss": 3.3383099998995083, + "tokens_seen": 2802588672 + }, + { + "epoch": 9.04, + "learning_rate": 7.625877632898696e-05, + "loss": 2.6033, + "theoretical_loss": 3.3383039146248956, + "tokens_seen": 2802654208 + }, + { + "epoch": 9.04, + "learning_rate": 7.624874623871616e-05, + "loss": 2.35, + "theoretical_loss": 3.3382978295324177, + "tokens_seen": 2802719744 + }, + { + "epoch": 9.04, + "learning_rate": 7.623871614844534e-05, + "loss": 2.4869, + "theoretical_loss": 3.3382917446220657, + "tokens_seen": 2802785280 + }, + { + "epoch": 9.04, + "learning_rate": 7.622868605817452e-05, + "loss": 2.3306, + "theoretical_loss": 3.3382856598938293, + "tokens_seen": 2802850816 + }, + { + "epoch": 9.04, + "learning_rate": 7.621865596790371e-05, + "loss": 2.3669, + "theoretical_loss": 3.3382795753476993, + "tokens_seen": 2802916352 + }, + { + "epoch": 9.04, + "learning_rate": 7.62086258776329e-05, + "loss": 2.5122, + "theoretical_loss": 3.3382734909836653, + "tokens_seen": 2802981888 + }, + { + "epoch": 9.04, + "learning_rate": 7.619859578736209e-05, + "loss": 2.5048, + "theoretical_loss": 3.3382674068017186, + "tokens_seen": 2803047424 + }, + { + "epoch": 9.04, + "learning_rate": 7.618856569709127e-05, + "loss": 2.4671, + "theoretical_loss": 3.3382613228018485, + "tokens_seen": 2803112960 + }, + { + "epoch": 9.04, + "learning_rate": 7.617853560682047e-05, + "loss": 2.4367, + "theoretical_loss": 3.338255238984046, + "tokens_seen": 2803178496 + }, + { + "epoch": 9.04, + "learning_rate": 7.616850551654965e-05, + "loss": 2.2542, + "theoretical_loss": 3.338249155348301, + "tokens_seen": 2803244032 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3097855, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.132349967956543, + "objective/train/theoretical_loss": 3.338244592740962, + "objective/train/tokens_used": 2823753184, + "theoretical_loss": 3.338244592740962, + "tokens_seen": 2803293184 + }, + { + "epoch": 9.04, + "learning_rate": 7.615847542627884e-05, + "loss": 2.1831, + "theoretical_loss": 3.3382430718946043, + "tokens_seen": 2803309568 + }, + { + "epoch": 9.04, + "learning_rate": 7.614844533600802e-05, + "loss": 2.1396, + "theoretical_loss": 3.3382369886229455, + "tokens_seen": 2803375104 + }, + { + "epoch": 9.04, + "learning_rate": 7.61384152457372e-05, + "loss": 2.5765, + "theoretical_loss": 3.3382309055333153, + "tokens_seen": 2803440640 + }, + { + "epoch": 9.04, + "learning_rate": 7.61283851554664e-05, + "loss": 2.6951, + "theoretical_loss": 3.3382248226257034, + "tokens_seen": 2803506176 + }, + { + "epoch": 9.04, + "learning_rate": 7.611835506519558e-05, + "loss": 2.6146, + "theoretical_loss": 3.338218739900101, + "tokens_seen": 2803571712 + }, + { + "epoch": 9.04, + "learning_rate": 7.610832497492477e-05, + "loss": 2.3565, + "theoretical_loss": 3.338212657356498, + "tokens_seen": 2803637248 + }, + { + "epoch": 9.04, + "learning_rate": 7.609829488465396e-05, + "loss": 2.245, + "theoretical_loss": 3.3382065749948846, + "tokens_seen": 2803702784 + }, + { + "epoch": 9.04, + "learning_rate": 7.608826479438315e-05, + "loss": 2.48, + "theoretical_loss": 3.3382004928152513, + "tokens_seen": 2803768320 + }, + { + "epoch": 9.04, + "learning_rate": 7.607823470411233e-05, + "loss": 2.6141, + "theoretical_loss": 3.3381944108175885, + "tokens_seen": 2803833856 + }, + { + "epoch": 9.04, + "learning_rate": 7.606820461384153e-05, + "loss": 2.424, + "theoretical_loss": 3.338188329001886, + "tokens_seen": 2803899392 + }, + { + "epoch": 9.04, + "learning_rate": 7.605817452357071e-05, + "loss": 2.4551, + "theoretical_loss": 3.338182247368134, + "tokens_seen": 2803964928 + }, + { + "epoch": 9.04, + "learning_rate": 7.60481444332999e-05, + "loss": 2.6384, + "theoretical_loss": 3.338176165916324, + "tokens_seen": 2804030464 + }, + { + "epoch": 9.04, + "learning_rate": 7.603811434302908e-05, + "loss": 2.4951, + "theoretical_loss": 3.3381700846464453, + "tokens_seen": 2804096000 + }, + { + "epoch": 9.04, + "learning_rate": 7.602808425275826e-05, + "loss": 2.8007, + "theoretical_loss": 3.338164003558488, + "tokens_seen": 2804161536 + }, + { + "epoch": 9.04, + "learning_rate": 7.601805416248746e-05, + "loss": 2.5352, + "theoretical_loss": 3.338157922652443, + "tokens_seen": 2804227072 + }, + { + "epoch": 9.04, + "learning_rate": 7.600802407221665e-05, + "loss": 2.4421, + "theoretical_loss": 3.338151841928301, + "tokens_seen": 2804292608 + }, + { + "epoch": 9.04, + "learning_rate": 7.599799398194585e-05, + "loss": 2.4983, + "theoretical_loss": 3.3381457613860515, + "tokens_seen": 2804358144 + }, + { + "epoch": 9.04, + "learning_rate": 7.598796389167503e-05, + "loss": 2.5024, + "theoretical_loss": 3.3381396810256847, + "tokens_seen": 2804423680 + }, + { + "epoch": 9.04, + "learning_rate": 7.597793380140422e-05, + "loss": 2.3933, + "theoretical_loss": 3.3381336008471916, + "tokens_seen": 2804489216 + }, + { + "epoch": 9.04, + "learning_rate": 7.59679037111334e-05, + "loss": 2.3014, + "theoretical_loss": 3.338127520850562, + "tokens_seen": 2804554752 + }, + { + "epoch": 9.04, + "learning_rate": 7.59578736208626e-05, + "loss": 2.5412, + "theoretical_loss": 3.3381214410357867, + "tokens_seen": 2804620288 + }, + { + "epoch": 9.04, + "learning_rate": 7.594784353059178e-05, + "loss": 2.4495, + "theoretical_loss": 3.3381153614028554, + "tokens_seen": 2804685824 + }, + { + "epoch": 9.04, + "learning_rate": 7.593781344032096e-05, + "loss": 2.4091, + "theoretical_loss": 3.3381092819517586, + "tokens_seen": 2804751360 + }, + { + "epoch": 9.04, + "learning_rate": 7.592778335005016e-05, + "loss": 2.5146, + "theoretical_loss": 3.3381032026824866, + "tokens_seen": 2804816896 + }, + { + "epoch": 9.04, + "learning_rate": 7.591775325977934e-05, + "loss": 2.7033, + "theoretical_loss": 3.3380971235950305, + "tokens_seen": 2804882432 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3098475, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3949697017669678, + "objective/train/theoretical_loss": 3.3380925643987482, + "objective/train/tokens_used": 2825391584, + "theoretical_loss": 3.3380925643987482, + "tokens_seen": 2804931584 + }, + { + "epoch": 9.04, + "learning_rate": 7.590772316950853e-05, + "loss": 2.5413, + "theoretical_loss": 3.3380910446893792, + "tokens_seen": 2804947968 + }, + { + "epoch": 9.04, + "learning_rate": 7.589769307923771e-05, + "loss": 2.401, + "theoretical_loss": 3.3380849659655243, + "tokens_seen": 2805013504 + }, + { + "epoch": 9.04, + "learning_rate": 7.588766298896691e-05, + "loss": 2.5116, + "theoretical_loss": 3.338078887423455, + "tokens_seen": 2805079040 + }, + { + "epoch": 9.04, + "learning_rate": 7.587763289869609e-05, + "loss": 2.3715, + "theoretical_loss": 3.338072809063163, + "tokens_seen": 2805144576 + }, + { + "epoch": 9.04, + "learning_rate": 7.586760280842528e-05, + "loss": 2.352, + "theoretical_loss": 3.3380667308846372, + "tokens_seen": 2805210112 + }, + { + "epoch": 9.04, + "learning_rate": 7.585757271815447e-05, + "loss": 2.6919, + "theoretical_loss": 3.3380606528878687, + "tokens_seen": 2805275648 + }, + { + "epoch": 9.04, + "learning_rate": 7.584754262788365e-05, + "loss": 2.3773, + "theoretical_loss": 3.3380545750728476, + "tokens_seen": 2805341184 + }, + { + "epoch": 9.04, + "learning_rate": 7.583751253761284e-05, + "loss": 2.506, + "theoretical_loss": 3.3380484974395648, + "tokens_seen": 2805406720 + }, + { + "epoch": 9.04, + "learning_rate": 7.582748244734202e-05, + "loss": 2.3581, + "theoretical_loss": 3.3380424199880094, + "tokens_seen": 2805472256 + }, + { + "epoch": 9.04, + "learning_rate": 7.581745235707122e-05, + "loss": 2.4248, + "theoretical_loss": 3.3380363427181727, + "tokens_seen": 2805537792 + }, + { + "epoch": 9.04, + "learning_rate": 7.58074222668004e-05, + "loss": 2.5646, + "theoretical_loss": 3.338030265630045, + "tokens_seen": 2805603328 + }, + { + "epoch": 9.04, + "learning_rate": 7.579739217652959e-05, + "loss": 2.5261, + "theoretical_loss": 3.3380241887236157, + "tokens_seen": 2805668864 + }, + { + "epoch": 9.04, + "learning_rate": 7.578736208625877e-05, + "loss": 2.5823, + "theoretical_loss": 3.3380181119988763, + "tokens_seen": 2805734400 + }, + { + "epoch": 9.04, + "learning_rate": 7.577733199598797e-05, + "loss": 2.5409, + "theoretical_loss": 3.338012035455816, + "tokens_seen": 2805799936 + }, + { + "epoch": 9.04, + "learning_rate": 7.576730190571715e-05, + "loss": 2.5619, + "theoretical_loss": 3.3380059590944264, + "tokens_seen": 2805865472 + }, + { + "epoch": 9.04, + "learning_rate": 7.575727181544634e-05, + "loss": 2.3542, + "theoretical_loss": 3.337999882914697, + "tokens_seen": 2805931008 + }, + { + "epoch": 9.04, + "learning_rate": 7.574724172517553e-05, + "loss": 2.4134, + "theoretical_loss": 3.337993806916618, + "tokens_seen": 2805996544 + }, + { + "epoch": 9.04, + "learning_rate": 7.57372116349047e-05, + "loss": 2.3433, + "theoretical_loss": 3.3379877311001804, + "tokens_seen": 2806062080 + }, + { + "epoch": 9.04, + "learning_rate": 7.57271815446339e-05, + "loss": 2.5471, + "theoretical_loss": 3.337981655465374, + "tokens_seen": 2806127616 + }, + { + "epoch": 9.04, + "learning_rate": 7.571715145436308e-05, + "loss": 2.4294, + "theoretical_loss": 3.337975580012189, + "tokens_seen": 2806193152 + }, + { + "epoch": 9.04, + "learning_rate": 7.570712136409228e-05, + "loss": 2.6443, + "theoretical_loss": 3.3379695047406166, + "tokens_seen": 2806258688 + }, + { + "epoch": 9.04, + "learning_rate": 7.569709127382146e-05, + "loss": 2.309, + "theoretical_loss": 3.337963429650646, + "tokens_seen": 2806324224 + }, + { + "epoch": 9.04, + "learning_rate": 7.568706118355065e-05, + "loss": 2.5333, + "theoretical_loss": 3.3379573547422683, + "tokens_seen": 2806389760 + }, + { + "epoch": 9.04, + "learning_rate": 7.567703109327983e-05, + "loss": 2.2402, + "theoretical_loss": 3.3379512800154734, + "tokens_seen": 2806455296 + }, + { + "epoch": 9.04, + "learning_rate": 7.566700100300903e-05, + "loss": 2.2134, + "theoretical_loss": 3.337945205470252, + "tokens_seen": 2806520832 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3099968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.338284730911255, + "objective/train/theoretical_loss": 3.3379406496804873, + "objective/train/tokens_used": 2827029984, + "theoretical_loss": 3.3379406496804873, + "tokens_seen": 2806569984 + }, + { + "epoch": 9.04, + "learning_rate": 7.565697091273821e-05, + "loss": 2.5216, + "theoretical_loss": 3.337939131106594, + "tokens_seen": 2806586368 + }, + { + "epoch": 9.04, + "learning_rate": 7.564694082246739e-05, + "loss": 2.6206, + "theoretical_loss": 3.33793305692449, + "tokens_seen": 2806651904 + }, + { + "epoch": 9.04, + "learning_rate": 7.56369107321966e-05, + "loss": 2.5285, + "theoretical_loss": 3.3379269829239306, + "tokens_seen": 2806717440 + }, + { + "epoch": 9.04, + "learning_rate": 7.562688064192578e-05, + "loss": 2.4885, + "theoretical_loss": 3.3379209091049056, + "tokens_seen": 2806782976 + }, + { + "epoch": 9.04, + "learning_rate": 7.561685055165498e-05, + "loss": 2.5203, + "theoretical_loss": 3.337914835467406, + "tokens_seen": 2806848512 + }, + { + "epoch": 9.04, + "learning_rate": 7.560682046138416e-05, + "loss": 2.3913, + "theoretical_loss": 3.337908762011421, + "tokens_seen": 2806914048 + }, + { + "epoch": 9.04, + "learning_rate": 7.559679037111335e-05, + "loss": 2.4652, + "theoretical_loss": 3.3379026887369423, + "tokens_seen": 2806979584 + }, + { + "epoch": 9.04, + "learning_rate": 7.558676028084253e-05, + "loss": 2.5681, + "theoretical_loss": 3.3378966156439596, + "tokens_seen": 2807045120 + }, + { + "epoch": 9.04, + "learning_rate": 7.557673019057173e-05, + "loss": 2.7705, + "theoretical_loss": 3.3378905427324628, + "tokens_seen": 2807110656 + }, + { + "epoch": 9.04, + "learning_rate": 7.556670010030091e-05, + "loss": 2.6694, + "theoretical_loss": 3.337884470002443, + "tokens_seen": 2807176192 + }, + { + "epoch": 9.04, + "learning_rate": 7.55566700100301e-05, + "loss": 2.5442, + "theoretical_loss": 3.33787839745389, + "tokens_seen": 2807241728 + }, + { + "epoch": 9.04, + "learning_rate": 7.554663991975928e-05, + "loss": 2.476, + "theoretical_loss": 3.3378723250867948, + "tokens_seen": 2807307264 + }, + { + "epoch": 9.04, + "learning_rate": 7.553660982948846e-05, + "loss": 2.377, + "theoretical_loss": 3.3378662529011467, + "tokens_seen": 2807372800 + }, + { + "epoch": 9.04, + "learning_rate": 7.552657973921766e-05, + "loss": 2.5089, + "theoretical_loss": 3.337860180896937, + "tokens_seen": 2807438336 + }, + { + "epoch": 9.04, + "learning_rate": 7.551654964894684e-05, + "loss": 2.5643, + "theoretical_loss": 3.3378541090741556, + "tokens_seen": 2807503872 + }, + { + "epoch": 9.04, + "learning_rate": 7.550651955867604e-05, + "loss": 2.5229, + "theoretical_loss": 3.337848037432793, + "tokens_seen": 2807569408 + }, + { + "epoch": 9.04, + "learning_rate": 7.549648946840522e-05, + "loss": 2.5231, + "theoretical_loss": 3.3378419659728396, + "tokens_seen": 2807634944 + }, + { + "epoch": 9.04, + "learning_rate": 7.548645937813441e-05, + "loss": 2.3669, + "theoretical_loss": 3.3378358946942854, + "tokens_seen": 2807700480 + }, + { + "epoch": 9.04, + "learning_rate": 7.547642928786359e-05, + "loss": 2.4584, + "theoretical_loss": 3.3378298235971213, + "tokens_seen": 2807766016 + }, + { + "epoch": 9.04, + "learning_rate": 7.546639919759279e-05, + "loss": 2.601, + "theoretical_loss": 3.337823752681337, + "tokens_seen": 2807831552 + }, + { + "epoch": 9.04, + "learning_rate": 7.545636910732197e-05, + "loss": 2.4989, + "theoretical_loss": 3.337817681946923, + "tokens_seen": 2807897088 + }, + { + "epoch": 9.04, + "learning_rate": 7.544633901705115e-05, + "loss": 2.539, + "theoretical_loss": 3.33781161139387, + "tokens_seen": 2807962624 + }, + { + "epoch": 9.04, + "learning_rate": 7.543630892678034e-05, + "loss": 2.502, + "theoretical_loss": 3.3378055410221683, + "tokens_seen": 2808028160 + }, + { + "epoch": 9.04, + "learning_rate": 7.542627883650952e-05, + "loss": 2.6034, + "theoretical_loss": 3.337799470831808, + "tokens_seen": 2808093696 + }, + { + "epoch": 9.04, + "learning_rate": 7.541624874623872e-05, + "loss": 2.6315, + "theoretical_loss": 3.33779340082278, + "tokens_seen": 2808159232 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3101199, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.767174482345581, + "objective/train/theoretical_loss": 3.3377888484350016, + "objective/train/tokens_used": 2828668384, + "theoretical_loss": 3.3377888484350016, + "tokens_seen": 2808208384 + }, + { + "epoch": 9.04, + "learning_rate": 7.54062186559679e-05, + "loss": 2.4264, + "theoretical_loss": 3.3377873309950736, + "tokens_seen": 2808224768 + }, + { + "epoch": 9.04, + "learning_rate": 7.53961885656971e-05, + "loss": 2.4804, + "theoretical_loss": 3.33778126134868, + "tokens_seen": 2808290304 + }, + { + "epoch": 9.04, + "learning_rate": 7.538615847542628e-05, + "loss": 2.5069, + "theoretical_loss": 3.337775191883589, + "tokens_seen": 2808355840 + }, + { + "epoch": 9.04, + "learning_rate": 7.537612838515547e-05, + "loss": 2.5731, + "theoretical_loss": 3.337769122599792, + "tokens_seen": 2808421376 + }, + { + "epoch": 9.04, + "learning_rate": 7.536609829488465e-05, + "loss": 2.6204, + "theoretical_loss": 3.337763053497278, + "tokens_seen": 2808486912 + }, + { + "epoch": 9.04, + "learning_rate": 7.535606820461383e-05, + "loss": 2.2722, + "theoretical_loss": 3.337756984576038, + "tokens_seen": 2808552448 + }, + { + "epoch": 9.04, + "learning_rate": 7.534603811434303e-05, + "loss": 2.3416, + "theoretical_loss": 3.3377509158360628, + "tokens_seen": 2808617984 + }, + { + "epoch": 9.04, + "learning_rate": 7.533600802407221e-05, + "loss": 2.6535, + "theoretical_loss": 3.3377448472773423, + "tokens_seen": 2808683520 + }, + { + "epoch": 9.04, + "learning_rate": 7.53259779338014e-05, + "loss": 2.3627, + "theoretical_loss": 3.3377387788998663, + "tokens_seen": 2808749056 + }, + { + "epoch": 9.04, + "learning_rate": 7.531594784353058e-05, + "loss": 2.5533, + "theoretical_loss": 3.337732710703626, + "tokens_seen": 2808814592 + }, + { + "epoch": 9.04, + "learning_rate": 7.530591775325978e-05, + "loss": 2.6144, + "theoretical_loss": 3.3377266426886116, + "tokens_seen": 2808880128 + }, + { + "epoch": 9.04, + "learning_rate": 7.529588766298896e-05, + "loss": 2.6395, + "theoretical_loss": 3.3377205748548135, + "tokens_seen": 2808945664 + }, + { + "epoch": 9.04, + "learning_rate": 7.528585757271816e-05, + "loss": 2.411, + "theoretical_loss": 3.3377145072022216, + "tokens_seen": 2809011200 + }, + { + "epoch": 9.04, + "learning_rate": 7.527582748244734e-05, + "loss": 2.4311, + "theoretical_loss": 3.337708439730826, + "tokens_seen": 2809076736 + }, + { + "epoch": 9.04, + "learning_rate": 7.526579739217652e-05, + "loss": 2.4575, + "theoretical_loss": 3.3377023724406185, + "tokens_seen": 2809142272 + }, + { + "epoch": 9.04, + "learning_rate": 7.525576730190573e-05, + "loss": 2.4331, + "theoretical_loss": 3.3376963053315882, + "tokens_seen": 2809207808 + }, + { + "epoch": 9.04, + "learning_rate": 7.524573721163491e-05, + "loss": 2.7503, + "theoretical_loss": 3.337690238403726, + "tokens_seen": 2809273344 + }, + { + "epoch": 9.04, + "learning_rate": 7.52357071213641e-05, + "loss": 2.5649, + "theoretical_loss": 3.3376841716570222, + "tokens_seen": 2809338880 + }, + { + "epoch": 9.04, + "learning_rate": 7.522567703109328e-05, + "loss": 2.7501, + "theoretical_loss": 3.3376781050914666, + "tokens_seen": 2809404416 + }, + { + "epoch": 9.04, + "learning_rate": 7.521564694082248e-05, + "loss": 2.3003, + "theoretical_loss": 3.3376720387070504, + "tokens_seen": 2809469952 + }, + { + "epoch": 9.04, + "learning_rate": 7.520561685055166e-05, + "loss": 2.4429, + "theoretical_loss": 3.3376659725037636, + "tokens_seen": 2809535488 + }, + { + "epoch": 9.04, + "learning_rate": 7.519558676028085e-05, + "loss": 2.4707, + "theoretical_loss": 3.3376599064815964, + "tokens_seen": 2809601024 + }, + { + "epoch": 9.04, + "learning_rate": 7.518555667001003e-05, + "loss": 2.449, + "theoretical_loss": 3.3376538406405394, + "tokens_seen": 2809666560 + }, + { + "epoch": 9.04, + "learning_rate": 7.517552657973923e-05, + "loss": 2.6315, + "theoretical_loss": 3.337647774980583, + "tokens_seen": 2809732096 + }, + { + "epoch": 9.04, + "learning_rate": 7.516549648946841e-05, + "loss": 2.4781, + "theoretical_loss": 3.3376417095017175, + "tokens_seen": 2809797632 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3101997, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5690271854400635, + "objective/train/theoretical_loss": 3.3376371605114032, + "objective/train/tokens_used": 2830306784, + "theoretical_loss": 3.3376371605114032, + "tokens_seen": 2809846784 + }, + { + "epoch": 9.04, + "learning_rate": 7.515546639919759e-05, + "loss": 2.4925, + "theoretical_loss": 3.337635644203933, + "tokens_seen": 2809863168 + }, + { + "epoch": 9.04, + "learning_rate": 7.514543630892679e-05, + "loss": 2.275, + "theoretical_loss": 3.33762957908722, + "tokens_seen": 2809928704 + }, + { + "epoch": 9.04, + "learning_rate": 7.513540621865597e-05, + "loss": 2.5908, + "theoretical_loss": 3.3376235141515695, + "tokens_seen": 2809994240 + }, + { + "epoch": 9.04, + "learning_rate": 7.512537612838516e-05, + "loss": 2.3907, + "theoretical_loss": 3.337617449396971, + "tokens_seen": 2810059776 + }, + { + "epoch": 9.04, + "learning_rate": 7.511534603811434e-05, + "loss": 2.626, + "theoretical_loss": 3.3376113848234152, + "tokens_seen": 2810125312 + }, + { + "epoch": 9.04, + "learning_rate": 7.510531594784354e-05, + "loss": 2.3395, + "theoretical_loss": 3.337605320430893, + "tokens_seen": 2810190848 + }, + { + "epoch": 9.04, + "learning_rate": 7.509528585757272e-05, + "loss": 2.6132, + "theoretical_loss": 3.3375992562193937, + "tokens_seen": 2810256384 + }, + { + "epoch": 9.04, + "learning_rate": 7.508525576730191e-05, + "loss": 2.4492, + "theoretical_loss": 3.3375931921889084, + "tokens_seen": 2810321920 + }, + { + "epoch": 9.04, + "learning_rate": 7.50752256770311e-05, + "loss": 2.5417, + "theoretical_loss": 3.3375871283394276, + "tokens_seen": 2810387456 + }, + { + "epoch": 9.04, + "learning_rate": 7.506519558676028e-05, + "loss": 2.6433, + "theoretical_loss": 3.337581064670941, + "tokens_seen": 2810452992 + }, + { + "epoch": 9.04, + "learning_rate": 7.505516549648947e-05, + "loss": 2.4419, + "theoretical_loss": 3.3375750011834397, + "tokens_seen": 2810518528 + }, + { + "epoch": 9.04, + "learning_rate": 7.504513540621865e-05, + "loss": 2.5748, + "theoretical_loss": 3.337568937876914, + "tokens_seen": 2810584064 + }, + { + "epoch": 9.04, + "learning_rate": 7.503510531594785e-05, + "loss": 2.4087, + "theoretical_loss": 3.3375628747513533, + "tokens_seen": 2810649600 + }, + { + "epoch": 9.04, + "learning_rate": 7.502507522567703e-05, + "loss": 2.4356, + "theoretical_loss": 3.337556811806749, + "tokens_seen": 2810715136 + }, + { + "epoch": 9.04, + "learning_rate": 7.501504513540622e-05, + "loss": 2.361, + "theoretical_loss": 3.3375507490430913, + "tokens_seen": 2810780672 + }, + { + "epoch": 9.04, + "learning_rate": 7.50050150451354e-05, + "loss": 2.6224, + "theoretical_loss": 3.3375446864603706, + "tokens_seen": 2810846208 + }, + { + "epoch": 9.04, + "learning_rate": 7.49949849548646e-05, + "loss": 2.5423, + "theoretical_loss": 3.337538624058577, + "tokens_seen": 2810911744 + }, + { + "epoch": 9.04, + "learning_rate": 7.498495486459378e-05, + "loss": 2.3949, + "theoretical_loss": 3.337532561837701, + "tokens_seen": 2810977280 + }, + { + "epoch": 9.04, + "learning_rate": 7.497492477432297e-05, + "loss": 2.5578, + "theoretical_loss": 3.3375264997977334, + "tokens_seen": 2811042816 + }, + { + "epoch": 9.04, + "learning_rate": 7.496489468405215e-05, + "loss": 2.5988, + "theoretical_loss": 3.337520437938664, + "tokens_seen": 2811108352 + }, + { + "epoch": 9.04, + "learning_rate": 7.495486459378134e-05, + "loss": 2.4388, + "theoretical_loss": 3.3375143762604833, + "tokens_seen": 2811173888 + }, + { + "epoch": 9.04, + "learning_rate": 7.494483450351053e-05, + "loss": 2.3702, + "theoretical_loss": 3.337508314763182, + "tokens_seen": 2811239424 + }, + { + "epoch": 9.04, + "learning_rate": 7.493480441323971e-05, + "loss": 2.5228, + "theoretical_loss": 3.33750225344675, + "tokens_seen": 2811304960 + }, + { + "epoch": 9.04, + "learning_rate": 7.49247743229689e-05, + "loss": 2.3634, + "theoretical_loss": 3.337496192311178, + "tokens_seen": 2811370496 + }, + { + "epoch": 9.04, + "learning_rate": 7.491474423269809e-05, + "loss": 2.4638, + "theoretical_loss": 3.3374901313564562, + "tokens_seen": 2811436032 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3102724, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.393054485321045, + "objective/train/theoretical_loss": 3.3374855857590924, + "objective/train/tokens_used": 2831945184, + "theoretical_loss": 3.3374855857590924, + "tokens_seen": 2811485184 + }, + { + "epoch": 9.04, + "learning_rate": 7.490471414242728e-05, + "loss": 2.3449, + "theoretical_loss": 3.3374840705825752, + "tokens_seen": 2811501568 + }, + { + "epoch": 9.04, + "learning_rate": 7.489468405215646e-05, + "loss": 2.3937, + "theoretical_loss": 3.3374780099895256, + "tokens_seen": 2811567104 + }, + { + "epoch": 9.04, + "learning_rate": 7.488465396188567e-05, + "loss": 2.4124, + "theoretical_loss": 3.3374719495772975, + "tokens_seen": 2811632640 + }, + { + "epoch": 9.04, + "learning_rate": 7.487462387161485e-05, + "loss": 2.5584, + "theoretical_loss": 3.3374658893458813, + "tokens_seen": 2811698176 + }, + { + "epoch": 9.04, + "learning_rate": 7.486459378134403e-05, + "loss": 2.5408, + "theoretical_loss": 3.337459829295267, + "tokens_seen": 2811763712 + }, + { + "epoch": 9.04, + "learning_rate": 7.485456369107323e-05, + "loss": 2.4343, + "theoretical_loss": 3.3374537694254456, + "tokens_seen": 2811829248 + }, + { + "epoch": 9.04, + "learning_rate": 7.484453360080241e-05, + "loss": 2.5624, + "theoretical_loss": 3.3374477097364075, + "tokens_seen": 2811894784 + }, + { + "epoch": 9.04, + "learning_rate": 7.48345035105316e-05, + "loss": 2.553, + "theoretical_loss": 3.3374416502281425, + "tokens_seen": 2811960320 + }, + { + "epoch": 9.04, + "learning_rate": 7.482447342026079e-05, + "loss": 2.2969, + "theoretical_loss": 3.3374355909006415, + "tokens_seen": 2812025856 + }, + { + "epoch": 9.04, + "learning_rate": 7.481444332998998e-05, + "loss": 2.5402, + "theoretical_loss": 3.337429531753895, + "tokens_seen": 2812091392 + }, + { + "epoch": 9.04, + "learning_rate": 7.480441323971916e-05, + "loss": 2.3277, + "theoretical_loss": 3.3374234727878926, + "tokens_seen": 2812156928 + }, + { + "epoch": 9.04, + "learning_rate": 7.479438314944836e-05, + "loss": 2.5683, + "theoretical_loss": 3.337417414002626, + "tokens_seen": 2812222464 + }, + { + "epoch": 9.04, + "learning_rate": 7.478435305917754e-05, + "loss": 2.4426, + "theoretical_loss": 3.337411355398084, + "tokens_seen": 2812288000 + }, + { + "epoch": 9.04, + "learning_rate": 7.477432296890672e-05, + "loss": 2.6851, + "theoretical_loss": 3.3374052969742585, + "tokens_seen": 2812353536 + }, + { + "epoch": 9.04, + "learning_rate": 7.476429287863591e-05, + "loss": 2.5371, + "theoretical_loss": 3.3373992387311393, + "tokens_seen": 2812419072 + }, + { + "epoch": 9.04, + "learning_rate": 7.47542627883651e-05, + "loss": 2.5624, + "theoretical_loss": 3.337393180668716, + "tokens_seen": 2812484608 + }, + { + "epoch": 9.04, + "learning_rate": 7.474423269809429e-05, + "loss": 2.5337, + "theoretical_loss": 3.3373871227869807, + "tokens_seen": 2812550144 + }, + { + "epoch": 9.04, + "learning_rate": 7.473420260782347e-05, + "loss": 2.4708, + "theoretical_loss": 3.337381065085922, + "tokens_seen": 2812615680 + }, + { + "epoch": 9.04, + "learning_rate": 7.472417251755266e-05, + "loss": 2.6607, + "theoretical_loss": 3.3373750075655315, + "tokens_seen": 2812681216 + }, + { + "epoch": 9.04, + "learning_rate": 7.471414242728185e-05, + "loss": 2.4814, + "theoretical_loss": 3.3373689502257995, + "tokens_seen": 2812746752 + }, + { + "epoch": 9.04, + "learning_rate": 7.470411233701104e-05, + "loss": 2.5545, + "theoretical_loss": 3.337362893066716, + "tokens_seen": 2812812288 + }, + { + "epoch": 9.04, + "learning_rate": 7.469408224674022e-05, + "loss": 2.3502, + "theoretical_loss": 3.3373568360882713, + "tokens_seen": 2812877824 + }, + { + "epoch": 9.04, + "learning_rate": 7.468405215646942e-05, + "loss": 2.4651, + "theoretical_loss": 3.3373507792904564, + "tokens_seen": 2812943360 + }, + { + "epoch": 9.04, + "learning_rate": 7.46740220661986e-05, + "loss": 2.5462, + "theoretical_loss": 3.337344722673261, + "tokens_seen": 2813008896 + }, + { + "epoch": 9.04, + "learning_rate": 7.466399197592778e-05, + "loss": 2.3839, + "theoretical_loss": 3.3373386662366764, + "tokens_seen": 2813074432 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3104026, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1686153411865234, + "objective/train/theoretical_loss": 3.337334124027757, + "objective/train/tokens_used": 2833583584, + "theoretical_loss": 3.337334124027757, + "tokens_seen": 2813123584 + }, + { + "epoch": 9.04, + "learning_rate": 7.465396188565697e-05, + "loss": 2.5068, + "theoretical_loss": 3.3373326099806917, + "tokens_seen": 2813139968 + }, + { + "epoch": 9.04, + "learning_rate": 7.464393179538615e-05, + "loss": 2.3425, + "theoretical_loss": 3.3373265539052985, + "tokens_seen": 2813205504 + }, + { + "epoch": 9.04, + "learning_rate": 7.463390170511535e-05, + "loss": 2.7318, + "theoretical_loss": 3.337320498010487, + "tokens_seen": 2813271040 + }, + { + "epoch": 9.04, + "learning_rate": 7.462387161484453e-05, + "loss": 2.4421, + "theoretical_loss": 3.3373144422962473, + "tokens_seen": 2813336576 + }, + { + "epoch": 9.04, + "learning_rate": 7.461384152457372e-05, + "loss": 2.5153, + "theoretical_loss": 3.3373083867625697, + "tokens_seen": 2813402112 + }, + { + "epoch": 9.04, + "learning_rate": 7.46038114343029e-05, + "loss": 2.494, + "theoretical_loss": 3.337302331409445, + "tokens_seen": 2813467648 + }, + { + "epoch": 9.04, + "learning_rate": 7.45937813440321e-05, + "loss": 2.5239, + "theoretical_loss": 3.3372962762368634, + "tokens_seen": 2813533184 + }, + { + "epoch": 9.04, + "learning_rate": 7.458375125376128e-05, + "loss": 2.4961, + "theoretical_loss": 3.3372902212448157, + "tokens_seen": 2813598720 + }, + { + "epoch": 9.04, + "learning_rate": 7.457372116349046e-05, + "loss": 2.6105, + "theoretical_loss": 3.3372841664332915, + "tokens_seen": 2813664256 + }, + { + "epoch": 9.04, + "learning_rate": 7.456369107321966e-05, + "loss": 2.4737, + "theoretical_loss": 3.3372781118022816, + "tokens_seen": 2813729792 + }, + { + "epoch": 9.04, + "learning_rate": 7.455366098294884e-05, + "loss": 2.6632, + "theoretical_loss": 3.3372720573517767, + "tokens_seen": 2813795328 + }, + { + "epoch": 9.04, + "learning_rate": 7.454363089267803e-05, + "loss": 2.2944, + "theoretical_loss": 3.337266003081767, + "tokens_seen": 2813860864 + }, + { + "epoch": 9.04, + "learning_rate": 7.453360080240721e-05, + "loss": 2.5736, + "theoretical_loss": 3.337259948992243, + "tokens_seen": 2813926400 + }, + { + "epoch": 9.04, + "learning_rate": 7.452357071213641e-05, + "loss": 2.436, + "theoretical_loss": 3.3372538950831947, + "tokens_seen": 2813991936 + }, + { + "epoch": 9.04, + "learning_rate": 7.451354062186559e-05, + "loss": 2.3356, + "theoretical_loss": 3.337247841354613, + "tokens_seen": 2814057472 + }, + { + "epoch": 9.04, + "learning_rate": 7.45035105315948e-05, + "loss": 2.3164, + "theoretical_loss": 3.337241787806488, + "tokens_seen": 2814123008 + }, + { + "epoch": 9.04, + "learning_rate": 7.449348044132398e-05, + "loss": 2.4074, + "theoretical_loss": 3.337235734438811, + "tokens_seen": 2814188544 + }, + { + "epoch": 9.04, + "learning_rate": 7.448345035105317e-05, + "loss": 2.5546, + "theoretical_loss": 3.337229681251571, + "tokens_seen": 2814254080 + }, + { + "epoch": 9.04, + "learning_rate": 7.447342026078235e-05, + "loss": 2.4158, + "theoretical_loss": 3.337223628244759, + "tokens_seen": 2814319616 + }, + { + "epoch": 9.04, + "learning_rate": 7.446339017051154e-05, + "loss": 2.6238, + "theoretical_loss": 3.337217575418366, + "tokens_seen": 2814385152 + }, + { + "epoch": 9.04, + "learning_rate": 7.445336008024073e-05, + "loss": 2.6175, + "theoretical_loss": 3.3372115227723818, + "tokens_seen": 2814450688 + }, + { + "epoch": 9.04, + "learning_rate": 7.444332998996991e-05, + "loss": 2.2484, + "theoretical_loss": 3.337205470306797, + "tokens_seen": 2814516224 + }, + { + "epoch": 9.04, + "learning_rate": 7.44332998996991e-05, + "loss": 2.4198, + "theoretical_loss": 3.3371994180216022, + "tokens_seen": 2814581760 + }, + { + "epoch": 9.04, + "learning_rate": 7.442326980942829e-05, + "loss": 2.5911, + "theoretical_loss": 3.3371933659167876, + "tokens_seen": 2814647296 + }, + { + "epoch": 9.04, + "learning_rate": 7.441323971915748e-05, + "loss": 2.5859, + "theoretical_loss": 3.3371873139923434, + "tokens_seen": 2814712832 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3105372, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.217341184616089, + "objective/train/theoretical_loss": 3.337182775167373, + "objective/train/tokens_used": 2835221984, + "theoretical_loss": 3.337182775167373, + "tokens_seen": 2814761984 + }, + { + "epoch": 9.04, + "learning_rate": 7.440320962888666e-05, + "loss": 2.4945, + "theoretical_loss": 3.3371812622482606, + "tokens_seen": 2814778368 + }, + { + "epoch": 9.04, + "learning_rate": 7.439317953861586e-05, + "loss": 2.6398, + "theoretical_loss": 3.3371752106845287, + "tokens_seen": 2814843904 + }, + { + "epoch": 9.04, + "learning_rate": 7.438314944834504e-05, + "loss": 2.56, + "theoretical_loss": 3.3371691593011397, + "tokens_seen": 2814909440 + }, + { + "epoch": 9.04, + "learning_rate": 7.437311935807422e-05, + "loss": 2.5215, + "theoretical_loss": 3.3371631080980824, + "tokens_seen": 2814974976 + }, + { + "epoch": 9.04, + "learning_rate": 7.436308926780341e-05, + "loss": 2.2919, + "theoretical_loss": 3.3371570570753475, + "tokens_seen": 2815040512 + }, + { + "epoch": 9.04, + "learning_rate": 7.43530591775326e-05, + "loss": 2.7935, + "theoretical_loss": 3.3371510062329266, + "tokens_seen": 2815106048 + }, + { + "epoch": 9.04, + "learning_rate": 7.434302908726179e-05, + "loss": 2.4562, + "theoretical_loss": 3.337144955570809, + "tokens_seen": 2815171584 + }, + { + "epoch": 9.04, + "learning_rate": 7.433299899699097e-05, + "loss": 2.4948, + "theoretical_loss": 3.3371389050889855, + "tokens_seen": 2815237120 + }, + { + "epoch": 9.04, + "learning_rate": 7.432296890672017e-05, + "loss": 2.4038, + "theoretical_loss": 3.3371328547874466, + "tokens_seen": 2815302656 + }, + { + "epoch": 9.04, + "learning_rate": 7.431293881644935e-05, + "loss": 2.552, + "theoretical_loss": 3.3371268046661826, + "tokens_seen": 2815368192 + }, + { + "epoch": 9.04, + "learning_rate": 7.430290872617854e-05, + "loss": 2.3863, + "theoretical_loss": 3.337120754725184, + "tokens_seen": 2815433728 + }, + { + "epoch": 9.04, + "learning_rate": 7.429287863590772e-05, + "loss": 2.3661, + "theoretical_loss": 3.337114704964441, + "tokens_seen": 2815499264 + }, + { + "epoch": 9.04, + "learning_rate": 7.42828485456369e-05, + "loss": 2.3206, + "theoretical_loss": 3.3371086553839446, + "tokens_seen": 2815564800 + }, + { + "epoch": 9.04, + "learning_rate": 7.42728184553661e-05, + "loss": 2.5749, + "theoretical_loss": 3.337102605983685, + "tokens_seen": 2815630336 + }, + { + "epoch": 9.04, + "learning_rate": 7.426278836509528e-05, + "loss": 2.4239, + "theoretical_loss": 3.337096556763652, + "tokens_seen": 2815695872 + }, + { + "epoch": 9.04, + "learning_rate": 7.425275827482447e-05, + "loss": 2.44, + "theoretical_loss": 3.3370905077238366, + "tokens_seen": 2815761408 + }, + { + "epoch": 9.04, + "learning_rate": 7.424272818455366e-05, + "loss": 2.4795, + "theoretical_loss": 3.337084458864229, + "tokens_seen": 2815826944 + }, + { + "epoch": 9.04, + "learning_rate": 7.423269809428285e-05, + "loss": 2.3251, + "theoretical_loss": 3.3370784101848203, + "tokens_seen": 2815892480 + }, + { + "epoch": 9.04, + "learning_rate": 7.422266800401203e-05, + "loss": 2.2276, + "theoretical_loss": 3.3370723616856006, + "tokens_seen": 2815958016 + }, + { + "epoch": 9.04, + "learning_rate": 7.421263791374123e-05, + "loss": 2.6106, + "theoretical_loss": 3.33706631336656, + "tokens_seen": 2816023552 + }, + { + "epoch": 9.04, + "learning_rate": 7.420260782347041e-05, + "loss": 2.5259, + "theoretical_loss": 3.337060265227689, + "tokens_seen": 2816089088 + }, + { + "epoch": 9.04, + "learning_rate": 7.41925777331996e-05, + "loss": 2.5082, + "theoretical_loss": 3.337054217268978, + "tokens_seen": 2816154624 + }, + { + "epoch": 9.04, + "learning_rate": 7.418254764292878e-05, + "loss": 2.3782, + "theoretical_loss": 3.337048169490418, + "tokens_seen": 2816220160 + }, + { + "epoch": 9.04, + "learning_rate": 7.417251755265796e-05, + "loss": 2.3555, + "theoretical_loss": 3.3370421218919986, + "tokens_seen": 2816285696 + }, + { + "epoch": 9.04, + "learning_rate": 7.416248746238716e-05, + "loss": 2.4316, + "theoretical_loss": 3.337036074473711, + "tokens_seen": 2816351232 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3106163, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4927053451538086, + "objective/train/theoretical_loss": 3.337031539028201, + "objective/train/tokens_used": 2836860384, + "theoretical_loss": 3.337031539028201, + "tokens_seen": 2816400384 + }, + { + "epoch": 9.04, + "learning_rate": 7.415245737211634e-05, + "loss": 2.4994, + "theoretical_loss": 3.3370300272355453, + "tokens_seen": 2816416768 + }, + { + "epoch": 9.04, + "learning_rate": 7.414242728184553e-05, + "loss": 2.3026, + "theoretical_loss": 3.337023980177492, + "tokens_seen": 2816482304 + }, + { + "epoch": 9.04, + "learning_rate": 7.413239719157473e-05, + "loss": 2.4086, + "theoretical_loss": 3.3370179332995415, + "tokens_seen": 2816547840 + }, + { + "epoch": 9.04, + "learning_rate": 7.412236710130392e-05, + "loss": 2.4192, + "theoretical_loss": 3.3370118866016845, + "tokens_seen": 2816613376 + }, + { + "epoch": 9.04, + "learning_rate": 7.41123370110331e-05, + "loss": 2.3815, + "theoretical_loss": 3.337005840083911, + "tokens_seen": 2816678912 + }, + { + "epoch": 9.04, + "learning_rate": 7.41023069207623e-05, + "loss": 2.4755, + "theoretical_loss": 3.3369997937462115, + "tokens_seen": 2816744448 + }, + { + "epoch": 9.04, + "learning_rate": 7.409227683049148e-05, + "loss": 2.3886, + "theoretical_loss": 3.336993747588577, + "tokens_seen": 2816809984 + }, + { + "epoch": 9.04, + "learning_rate": 7.408224674022066e-05, + "loss": 2.4995, + "theoretical_loss": 3.3369877016109974, + "tokens_seen": 2816875520 + }, + { + "epoch": 9.04, + "learning_rate": 7.407221664994986e-05, + "loss": 2.7162, + "theoretical_loss": 3.3369816558134633, + "tokens_seen": 2816941056 + }, + { + "epoch": 9.04, + "learning_rate": 7.406218655967904e-05, + "loss": 2.5368, + "theoretical_loss": 3.336975610195965, + "tokens_seen": 2817006592 + }, + { + "epoch": 9.04, + "learning_rate": 7.405215646940823e-05, + "loss": 2.4987, + "theoretical_loss": 3.3369695647584936, + "tokens_seen": 2817072128 + }, + { + "epoch": 9.04, + "learning_rate": 7.404212637913741e-05, + "loss": 2.4377, + "theoretical_loss": 3.3369635195010385, + "tokens_seen": 2817137664 + }, + { + "epoch": 9.04, + "learning_rate": 7.403209628886661e-05, + "loss": 2.5971, + "theoretical_loss": 3.336957474423591, + "tokens_seen": 2817203200 + }, + { + "epoch": 9.04, + "learning_rate": 7.402206619859579e-05, + "loss": 2.3369, + "theoretical_loss": 3.3369514295261413, + "tokens_seen": 2817268736 + }, + { + "epoch": 9.04, + "learning_rate": 7.401203610832498e-05, + "loss": 2.2675, + "theoretical_loss": 3.33694538480868, + "tokens_seen": 2817334272 + }, + { + "epoch": 9.04, + "learning_rate": 7.400200601805417e-05, + "loss": 2.4411, + "theoretical_loss": 3.336939340271197, + "tokens_seen": 2817399808 + }, + { + "epoch": 9.04, + "learning_rate": 7.399197592778335e-05, + "loss": 2.4291, + "theoretical_loss": 3.3369332959136835, + "tokens_seen": 2817465344 + }, + { + "epoch": 9.04, + "learning_rate": 7.398194583751254e-05, + "loss": 2.56, + "theoretical_loss": 3.336927251736129, + "tokens_seen": 2817530880 + }, + { + "epoch": 9.04, + "learning_rate": 7.397191574724172e-05, + "loss": 2.2283, + "theoretical_loss": 3.3369212077385253, + "tokens_seen": 2817596416 + }, + { + "epoch": 9.04, + "learning_rate": 7.396188565697092e-05, + "loss": 2.5146, + "theoretical_loss": 3.3369151639208616, + "tokens_seen": 2817661952 + }, + { + "epoch": 9.04, + "learning_rate": 7.39518555667001e-05, + "loss": 2.509, + "theoretical_loss": 3.336909120283129, + "tokens_seen": 2817727488 + }, + { + "epoch": 9.04, + "learning_rate": 7.394182547642929e-05, + "loss": 2.5419, + "theoretical_loss": 3.336903076825318, + "tokens_seen": 2817793024 + }, + { + "epoch": 9.04, + "learning_rate": 7.393179538615847e-05, + "loss": 2.5549, + "theoretical_loss": 3.3368970335474186, + "tokens_seen": 2817858560 + }, + { + "epoch": 9.04, + "learning_rate": 7.392176529588767e-05, + "loss": 2.3819, + "theoretical_loss": 3.3368909904494215, + "tokens_seen": 2817924096 + }, + { + "epoch": 9.04, + "learning_rate": 7.391173520561685e-05, + "loss": 2.4538, + "theoretical_loss": 3.3368849475313174, + "tokens_seen": 2817989632 + }, + { + "epoch": 9.04, + "objective/train/docs_used": 3107003, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3604273796081543, + "objective/train/theoretical_loss": 3.3368804154607883, + "objective/train/tokens_used": 2838498784, + "theoretical_loss": 3.3368804154607883, + "tokens_seen": 2818038784 + }, + { + "epoch": 9.04, + "learning_rate": 7.390170511534604e-05, + "loss": 2.4977, + "theoretical_loss": 3.3368789047930965, + "tokens_seen": 2818055168 + }, + { + "epoch": 9.04, + "learning_rate": 7.389167502507523e-05, + "loss": 2.5878, + "theoretical_loss": 3.3368728622347494, + "tokens_seen": 2818120704 + }, + { + "epoch": 9.04, + "learning_rate": 7.38816449348044e-05, + "loss": 2.4945, + "theoretical_loss": 3.336866819856266, + "tokens_seen": 2818186240 + }, + { + "epoch": 9.04, + "learning_rate": 7.38716148445336e-05, + "loss": 2.3901, + "theoretical_loss": 3.336860777657638, + "tokens_seen": 2818251776 + }, + { + "epoch": 9.04, + "learning_rate": 7.386158475426278e-05, + "loss": 2.4752, + "theoretical_loss": 3.336854735638855, + "tokens_seen": 2818317312 + }, + { + "epoch": 9.04, + "learning_rate": 7.385155466399198e-05, + "loss": 2.3849, + "theoretical_loss": 3.336848693799907, + "tokens_seen": 2818382848 + }, + { + "epoch": 9.04, + "learning_rate": 7.384152457372116e-05, + "loss": 2.4204, + "theoretical_loss": 3.3368426521407857, + "tokens_seen": 2818448384 + }, + { + "epoch": 9.04, + "learning_rate": 7.383149448345035e-05, + "loss": 2.585, + "theoretical_loss": 3.3368366106614804, + "tokens_seen": 2818513920 + }, + { + "epoch": 9.04, + "learning_rate": 7.382146439317953e-05, + "loss": 2.4135, + "theoretical_loss": 3.3368305693619824, + "tokens_seen": 2818579456 + }, + { + "epoch": 9.04, + "learning_rate": 7.381143430290873e-05, + "loss": 2.283, + "theoretical_loss": 3.336824528242282, + "tokens_seen": 2818644992 + }, + { + "epoch": 9.04, + "learning_rate": 7.380140421263791e-05, + "loss": 2.6764, + "theoretical_loss": 3.336818487302369, + "tokens_seen": 2818710528 + }, + { + "epoch": 9.04, + "learning_rate": 7.379137412236709e-05, + "loss": 2.6035, + "theoretical_loss": 3.3368124465422344, + "tokens_seen": 2818776064 + }, + { + "epoch": 9.04, + "learning_rate": 7.378134403209629e-05, + "loss": 2.6027, + "theoretical_loss": 3.336806405961869, + "tokens_seen": 2818841600 + }, + { + "epoch": 9.04, + "learning_rate": 7.377131394182547e-05, + "loss": 2.628, + "theoretical_loss": 3.3368003655612632, + "tokens_seen": 2818907136 + }, + { + "epoch": 9.04, + "learning_rate": 7.376128385155466e-05, + "loss": 2.499, + "theoretical_loss": 3.3367943253404064, + "tokens_seen": 2818972672 + }, + { + "epoch": 9.04, + "learning_rate": 7.375125376128386e-05, + "loss": 2.333, + "theoretical_loss": 3.3367882852992903, + "tokens_seen": 2819038208 + }, + { + "epoch": 9.04, + "learning_rate": 7.374122367101305e-05, + "loss": 2.3418, + "theoretical_loss": 3.336782245437905, + "tokens_seen": 2819103744 + }, + { + "epoch": 9.04, + "learning_rate": 7.373119358074223e-05, + "loss": 2.4791, + "theoretical_loss": 3.336776205756241, + "tokens_seen": 2819169280 + }, + { + "epoch": 9.04, + "learning_rate": 7.372116349047143e-05, + "loss": 2.4482, + "theoretical_loss": 3.3367701662542886, + "tokens_seen": 2819234816 + }, + { + "epoch": 9.04, + "learning_rate": 7.371113340020061e-05, + "loss": 2.635, + "theoretical_loss": 3.3367641269320383, + "tokens_seen": 2819300352 + }, + { + "epoch": 9.05, + "learning_rate": 7.37011033099298e-05, + "loss": 2.278, + "theoretical_loss": 3.3367580877894807, + "tokens_seen": 2819365888 + }, + { + "epoch": 9.05, + "learning_rate": 7.369107321965898e-05, + "loss": 2.4185, + "theoretical_loss": 3.336752048826606, + "tokens_seen": 2819431424 + }, + { + "epoch": 9.05, + "learning_rate": 7.368104312938816e-05, + "loss": 2.3721, + "theoretical_loss": 3.336746010043405, + "tokens_seen": 2819496960 + }, + { + "epoch": 9.05, + "learning_rate": 7.367101303911736e-05, + "loss": 2.6377, + "theoretical_loss": 3.3367399714398687, + "tokens_seen": 2819562496 + }, + { + "epoch": 9.05, + "learning_rate": 7.366098294884654e-05, + "loss": 2.4099, + "theoretical_loss": 3.336733933015986, + "tokens_seen": 2819628032 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3108427, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3784573078155518, + "objective/train/theoretical_loss": 3.336729404315967, + "objective/train/tokens_used": 2840137184, + "theoretical_loss": 3.336729404315967, + "tokens_seen": 2819677184 + }, + { + "epoch": 9.05, + "learning_rate": 7.365095285857574e-05, + "loss": 2.419, + "theoretical_loss": 3.336727894771749, + "tokens_seen": 2819693568 + }, + { + "epoch": 9.05, + "learning_rate": 7.364092276830492e-05, + "loss": 2.5594, + "theoretical_loss": 3.3367218567071473, + "tokens_seen": 2819759104 + }, + { + "epoch": 9.05, + "learning_rate": 7.363089267803411e-05, + "loss": 2.5385, + "theoretical_loss": 3.3367158188221713, + "tokens_seen": 2819824640 + }, + { + "epoch": 9.05, + "learning_rate": 7.362086258776329e-05, + "loss": 2.4969, + "theoretical_loss": 3.336709781116812, + "tokens_seen": 2819890176 + }, + { + "epoch": 9.05, + "learning_rate": 7.361083249749249e-05, + "loss": 2.3974, + "theoretical_loss": 3.33670374359106, + "tokens_seen": 2819955712 + }, + { + "epoch": 9.05, + "learning_rate": 7.360080240722167e-05, + "loss": 2.4244, + "theoretical_loss": 3.336697706244905, + "tokens_seen": 2820021248 + }, + { + "epoch": 9.05, + "learning_rate": 7.359077231695085e-05, + "loss": 2.3286, + "theoretical_loss": 3.3366916690783377, + "tokens_seen": 2820086784 + }, + { + "epoch": 9.05, + "learning_rate": 7.358074222668004e-05, + "loss": 2.4967, + "theoretical_loss": 3.336685632091349, + "tokens_seen": 2820152320 + }, + { + "epoch": 9.05, + "learning_rate": 7.357071213640922e-05, + "loss": 2.5199, + "theoretical_loss": 3.33667959528393, + "tokens_seen": 2820217856 + }, + { + "epoch": 9.05, + "learning_rate": 7.356068204613842e-05, + "loss": 2.4097, + "theoretical_loss": 3.3366735586560696, + "tokens_seen": 2820283392 + }, + { + "epoch": 9.05, + "learning_rate": 7.35506519558676e-05, + "loss": 2.1904, + "theoretical_loss": 3.336667522207759, + "tokens_seen": 2820348928 + }, + { + "epoch": 9.05, + "learning_rate": 7.35406218655968e-05, + "loss": 2.5278, + "theoretical_loss": 3.336661485938989, + "tokens_seen": 2820414464 + }, + { + "epoch": 9.05, + "learning_rate": 7.353059177532598e-05, + "loss": 2.437, + "theoretical_loss": 3.33665544984975, + "tokens_seen": 2820480000 + }, + { + "epoch": 9.05, + "learning_rate": 7.352056168505517e-05, + "loss": 2.6543, + "theoretical_loss": 3.336649413940032, + "tokens_seen": 2820545536 + }, + { + "epoch": 9.05, + "learning_rate": 7.351053159478435e-05, + "loss": 2.3701, + "theoretical_loss": 3.336643378209826, + "tokens_seen": 2820611072 + }, + { + "epoch": 9.05, + "learning_rate": 7.350050150451353e-05, + "loss": 2.4875, + "theoretical_loss": 3.3366373426591225, + "tokens_seen": 2820676608 + }, + { + "epoch": 9.05, + "learning_rate": 7.349047141424273e-05, + "loss": 2.3664, + "theoretical_loss": 3.3366313072879117, + "tokens_seen": 2820742144 + }, + { + "epoch": 9.05, + "learning_rate": 7.348044132397191e-05, + "loss": 2.339, + "theoretical_loss": 3.336625272096184, + "tokens_seen": 2820807680 + }, + { + "epoch": 9.05, + "learning_rate": 7.34704112337011e-05, + "loss": 2.7247, + "theoretical_loss": 3.33661923708393, + "tokens_seen": 2820873216 + }, + { + "epoch": 9.05, + "learning_rate": 7.346038114343028e-05, + "loss": 2.5163, + "theoretical_loss": 3.3366132022511406, + "tokens_seen": 2820938752 + }, + { + "epoch": 9.05, + "learning_rate": 7.345035105315948e-05, + "loss": 2.2932, + "theoretical_loss": 3.3366071675978057, + "tokens_seen": 2821004288 + }, + { + "epoch": 9.05, + "learning_rate": 7.344032096288866e-05, + "loss": 2.424, + "theoretical_loss": 3.3366011331239163, + "tokens_seen": 2821069824 + }, + { + "epoch": 9.05, + "learning_rate": 7.343029087261786e-05, + "loss": 2.4845, + "theoretical_loss": 3.336595098829463, + "tokens_seen": 2821135360 + }, + { + "epoch": 9.05, + "learning_rate": 7.342026078234704e-05, + "loss": 2.4685, + "theoretical_loss": 3.336589064714435, + "tokens_seen": 2821200896 + }, + { + "epoch": 9.05, + "learning_rate": 7.341023069207622e-05, + "loss": 2.3847, + "theoretical_loss": 3.336583030778825, + "tokens_seen": 2821266432 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3109158, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8144352436065674, + "objective/train/theoretical_loss": 3.3365785054448533, + "objective/train/tokens_used": 2841775584, + "theoretical_loss": 3.3365785054448533, + "tokens_seen": 2821315584 + }, + { + "epoch": 9.05, + "learning_rate": 7.340020060180541e-05, + "loss": 2.5031, + "theoretical_loss": 3.3365769970226213, + "tokens_seen": 2821331968 + }, + { + "epoch": 9.05, + "learning_rate": 7.33901705115346e-05, + "loss": 2.3664, + "theoretical_loss": 3.3365709634458156, + "tokens_seen": 2821397504 + }, + { + "epoch": 9.05, + "learning_rate": 7.33801404212638e-05, + "loss": 2.16, + "theoretical_loss": 3.3365649300483984, + "tokens_seen": 2821463040 + }, + { + "epoch": 9.05, + "learning_rate": 7.337011033099298e-05, + "loss": 2.4165, + "theoretical_loss": 3.3365588968303594, + "tokens_seen": 2821528576 + }, + { + "epoch": 9.05, + "learning_rate": 7.336008024072218e-05, + "loss": 2.407, + "theoretical_loss": 3.33655286379169, + "tokens_seen": 2821594112 + }, + { + "epoch": 9.05, + "learning_rate": 7.335005015045136e-05, + "loss": 2.2609, + "theoretical_loss": 3.3365468309323805, + "tokens_seen": 2821659648 + }, + { + "epoch": 9.05, + "learning_rate": 7.334002006018055e-05, + "loss": 2.367, + "theoretical_loss": 3.3365407982524213, + "tokens_seen": 2821725184 + }, + { + "epoch": 9.05, + "learning_rate": 7.332998996990973e-05, + "loss": 2.4216, + "theoretical_loss": 3.3365347657518027, + "tokens_seen": 2821790720 + }, + { + "epoch": 9.05, + "learning_rate": 7.331995987963893e-05, + "loss": 2.4341, + "theoretical_loss": 3.3365287334305154, + "tokens_seen": 2821856256 + }, + { + "epoch": 9.05, + "learning_rate": 7.330992978936811e-05, + "loss": 2.5748, + "theoretical_loss": 3.33652270128855, + "tokens_seen": 2821921792 + }, + { + "epoch": 9.05, + "learning_rate": 7.329989969909729e-05, + "loss": 2.3453, + "theoretical_loss": 3.3365166693258965, + "tokens_seen": 2821987328 + }, + { + "epoch": 9.05, + "learning_rate": 7.328986960882649e-05, + "loss": 2.6206, + "theoretical_loss": 3.336510637542546, + "tokens_seen": 2822052864 + }, + { + "epoch": 9.05, + "learning_rate": 7.327983951855567e-05, + "loss": 2.5509, + "theoretical_loss": 3.336504605938489, + "tokens_seen": 2822118400 + }, + { + "epoch": 9.05, + "learning_rate": 7.326980942828486e-05, + "loss": 2.4037, + "theoretical_loss": 3.3364985745137155, + "tokens_seen": 2822183936 + }, + { + "epoch": 9.05, + "learning_rate": 7.325977933801404e-05, + "loss": 2.4929, + "theoretical_loss": 3.3364925432682164, + "tokens_seen": 2822249472 + }, + { + "epoch": 9.05, + "learning_rate": 7.324974924774324e-05, + "loss": 2.5365, + "theoretical_loss": 3.336486512201982, + "tokens_seen": 2822315008 + }, + { + "epoch": 9.05, + "learning_rate": 7.323971915747242e-05, + "loss": 2.3508, + "theoretical_loss": 3.336480481315003, + "tokens_seen": 2822380544 + }, + { + "epoch": 9.05, + "learning_rate": 7.322968906720161e-05, + "loss": 2.3649, + "theoretical_loss": 3.33647445060727, + "tokens_seen": 2822446080 + }, + { + "epoch": 9.05, + "learning_rate": 7.32196589769308e-05, + "loss": 2.4036, + "theoretical_loss": 3.336468420078773, + "tokens_seen": 2822511616 + }, + { + "epoch": 9.05, + "learning_rate": 7.320962888665998e-05, + "loss": 2.2957, + "theoretical_loss": 3.3364623897295034, + "tokens_seen": 2822577152 + }, + { + "epoch": 9.05, + "learning_rate": 7.319959879638917e-05, + "loss": 2.2367, + "theoretical_loss": 3.3364563595594507, + "tokens_seen": 2822642688 + }, + { + "epoch": 9.05, + "learning_rate": 7.318956870611835e-05, + "loss": 2.4229, + "theoretical_loss": 3.336450329568606, + "tokens_seen": 2822708224 + }, + { + "epoch": 9.05, + "learning_rate": 7.317953861584755e-05, + "loss": 2.6107, + "theoretical_loss": 3.33644429975696, + "tokens_seen": 2822773760 + }, + { + "epoch": 9.05, + "learning_rate": 7.316950852557673e-05, + "loss": 2.5266, + "theoretical_loss": 3.3364382701245026, + "tokens_seen": 2822839296 + }, + { + "epoch": 9.05, + "learning_rate": 7.315947843530592e-05, + "loss": 2.4865, + "theoretical_loss": 3.3364322406712246, + "tokens_seen": 2822904832 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3109712, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.819840431213379, + "objective/train/theoretical_loss": 3.3364277186988467, + "objective/train/tokens_used": 2843413984, + "theoretical_loss": 3.3364277186988467, + "tokens_seen": 2822953984 + }, + { + "epoch": 9.05, + "learning_rate": 7.31494483450351e-05, + "loss": 2.426, + "theoretical_loss": 3.336426211397116, + "tokens_seen": 2822970368 + }, + { + "epoch": 9.05, + "learning_rate": 7.31394182547643e-05, + "loss": 2.6115, + "theoretical_loss": 3.336420182302169, + "tokens_seen": 2823035904 + }, + { + "epoch": 9.05, + "learning_rate": 7.312938816449348e-05, + "loss": 2.5873, + "theoretical_loss": 3.336414153386372, + "tokens_seen": 2823101440 + }, + { + "epoch": 9.05, + "learning_rate": 7.311935807422267e-05, + "loss": 2.6309, + "theoretical_loss": 3.336408124649717, + "tokens_seen": 2823166976 + }, + { + "epoch": 9.05, + "learning_rate": 7.310932798395185e-05, + "loss": 2.497, + "theoretical_loss": 3.336402096092194, + "tokens_seen": 2823232512 + }, + { + "epoch": 9.05, + "learning_rate": 7.309929789368104e-05, + "loss": 2.3235, + "theoretical_loss": 3.3363960677137934, + "tokens_seen": 2823298048 + }, + { + "epoch": 9.05, + "learning_rate": 7.308926780341023e-05, + "loss": 2.5845, + "theoretical_loss": 3.336390039514506, + "tokens_seen": 2823363584 + }, + { + "epoch": 9.05, + "learning_rate": 7.307923771313941e-05, + "loss": 2.4049, + "theoretical_loss": 3.3363840114943217, + "tokens_seen": 2823429120 + }, + { + "epoch": 9.05, + "learning_rate": 7.30692076228686e-05, + "loss": 2.3738, + "theoretical_loss": 3.336377983653232, + "tokens_seen": 2823494656 + }, + { + "epoch": 9.05, + "learning_rate": 7.305917753259779e-05, + "loss": 2.4578, + "theoretical_loss": 3.336371955991227, + "tokens_seen": 2823560192 + }, + { + "epoch": 9.05, + "learning_rate": 7.304914744232698e-05, + "loss": 2.4537, + "theoretical_loss": 3.3363659285082967, + "tokens_seen": 2823625728 + }, + { + "epoch": 9.05, + "learning_rate": 7.303911735205616e-05, + "loss": 2.4202, + "theoretical_loss": 3.3363599012044323, + "tokens_seen": 2823691264 + }, + { + "epoch": 9.05, + "learning_rate": 7.302908726178536e-05, + "loss": 2.4976, + "theoretical_loss": 3.336353874079624, + "tokens_seen": 2823756800 + }, + { + "epoch": 9.05, + "learning_rate": 7.301905717151454e-05, + "loss": 2.4362, + "theoretical_loss": 3.3363478471338626, + "tokens_seen": 2823822336 + }, + { + "epoch": 9.05, + "learning_rate": 7.300902708124373e-05, + "loss": 2.4602, + "theoretical_loss": 3.336341820367138, + "tokens_seen": 2823887872 + }, + { + "epoch": 9.05, + "learning_rate": 7.299899699097293e-05, + "loss": 2.5308, + "theoretical_loss": 3.3363357937794413, + "tokens_seen": 2823953408 + }, + { + "epoch": 9.05, + "learning_rate": 7.298896690070211e-05, + "loss": 2.7138, + "theoretical_loss": 3.3363297673707635, + "tokens_seen": 2824018944 + }, + { + "epoch": 9.05, + "learning_rate": 7.29789368104313e-05, + "loss": 2.2875, + "theoretical_loss": 3.3363237411410935, + "tokens_seen": 2824084480 + }, + { + "epoch": 9.05, + "learning_rate": 7.296890672016048e-05, + "loss": 2.5003, + "theoretical_loss": 3.3363177150904235, + "tokens_seen": 2824150016 + }, + { + "epoch": 9.05, + "learning_rate": 7.295887662988968e-05, + "loss": 2.4404, + "theoretical_loss": 3.3363116892187437, + "tokens_seen": 2824215552 + }, + { + "epoch": 9.05, + "learning_rate": 7.294884653961886e-05, + "loss": 2.4719, + "theoretical_loss": 3.3363056635260437, + "tokens_seen": 2824281088 + }, + { + "epoch": 9.05, + "learning_rate": 7.293881644934806e-05, + "loss": 2.4763, + "theoretical_loss": 3.3362996380123144, + "tokens_seen": 2824346624 + }, + { + "epoch": 9.05, + "learning_rate": 7.292878635907724e-05, + "loss": 2.2853, + "theoretical_loss": 3.3362936126775473, + "tokens_seen": 2824412160 + }, + { + "epoch": 9.05, + "learning_rate": 7.291875626880642e-05, + "loss": 2.5973, + "theoretical_loss": 3.336287587521732, + "tokens_seen": 2824477696 + }, + { + "epoch": 9.05, + "learning_rate": 7.290872617853561e-05, + "loss": 2.4975, + "theoretical_loss": 3.336281562544859, + "tokens_seen": 2824543232 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3109712, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6142570972442627, + "objective/train/theoretical_loss": 3.3362770439296296, + "objective/train/tokens_used": 2845052384, + "theoretical_loss": 3.3362770439296296, + "tokens_seen": 2824592384 + }, + { + "epoch": 9.05, + "learning_rate": 7.28986960882648e-05, + "loss": 2.7737, + "theoretical_loss": 3.336275537746919, + "tokens_seen": 2824608768 + }, + { + "epoch": 9.05, + "learning_rate": 7.288866599799399e-05, + "loss": 2.4958, + "theoretical_loss": 3.336269513127903, + "tokens_seen": 2824674304 + }, + { + "epoch": 9.05, + "learning_rate": 7.287863590772317e-05, + "loss": 2.5299, + "theoretical_loss": 3.336263488687801, + "tokens_seen": 2824739840 + }, + { + "epoch": 9.05, + "learning_rate": 7.286860581745236e-05, + "loss": 2.6303, + "theoretical_loss": 3.3362574644266036, + "tokens_seen": 2824805376 + }, + { + "epoch": 9.05, + "learning_rate": 7.285857572718154e-05, + "loss": 2.5422, + "theoretical_loss": 3.3362514403443013, + "tokens_seen": 2824870912 + }, + { + "epoch": 9.05, + "learning_rate": 7.284854563691074e-05, + "loss": 2.2946, + "theoretical_loss": 3.336245416440885, + "tokens_seen": 2824936448 + }, + { + "epoch": 9.05, + "learning_rate": 7.283851554663992e-05, + "loss": 2.5583, + "theoretical_loss": 3.3362393927163447, + "tokens_seen": 2825001984 + }, + { + "epoch": 9.05, + "learning_rate": 7.282848545636912e-05, + "loss": 2.4954, + "theoretical_loss": 3.3362333691706714, + "tokens_seen": 2825067520 + }, + { + "epoch": 9.05, + "learning_rate": 7.28184553660983e-05, + "loss": 2.5508, + "theoretical_loss": 3.3362273458038554, + "tokens_seen": 2825133056 + }, + { + "epoch": 9.05, + "learning_rate": 7.280842527582748e-05, + "loss": 2.5063, + "theoretical_loss": 3.3362213226158874, + "tokens_seen": 2825198592 + }, + { + "epoch": 9.05, + "learning_rate": 7.279839518555667e-05, + "loss": 2.5041, + "theoretical_loss": 3.336215299606758, + "tokens_seen": 2825264128 + }, + { + "epoch": 9.05, + "learning_rate": 7.278836509528585e-05, + "loss": 2.4361, + "theoretical_loss": 3.336209276776457, + "tokens_seen": 2825329664 + }, + { + "epoch": 9.05, + "learning_rate": 7.277833500501505e-05, + "loss": 2.5883, + "theoretical_loss": 3.336203254124976, + "tokens_seen": 2825395200 + }, + { + "epoch": 9.05, + "learning_rate": 7.276830491474423e-05, + "loss": 2.359, + "theoretical_loss": 3.3361972316523048, + "tokens_seen": 2825460736 + }, + { + "epoch": 9.05, + "learning_rate": 7.275827482447342e-05, + "loss": 2.7164, + "theoretical_loss": 3.3361912093584345, + "tokens_seen": 2825526272 + }, + { + "epoch": 9.05, + "learning_rate": 7.27482447342026e-05, + "loss": 2.2164, + "theoretical_loss": 3.3361851872433554, + "tokens_seen": 2825591808 + }, + { + "epoch": 9.05, + "learning_rate": 7.27382146439318e-05, + "loss": 2.4792, + "theoretical_loss": 3.336179165307058, + "tokens_seen": 2825657344 + }, + { + "epoch": 9.05, + "learning_rate": 7.272818455366098e-05, + "loss": 2.4037, + "theoretical_loss": 3.3361731435495328, + "tokens_seen": 2825722880 + }, + { + "epoch": 9.05, + "learning_rate": 7.271815446339016e-05, + "loss": 2.4928, + "theoretical_loss": 3.33616712197077, + "tokens_seen": 2825788416 + }, + { + "epoch": 9.05, + "learning_rate": 7.270812437311936e-05, + "loss": 2.3418, + "theoretical_loss": 3.336161100570761, + "tokens_seen": 2825853952 + }, + { + "epoch": 9.05, + "learning_rate": 7.269809428284854e-05, + "loss": 2.4452, + "theoretical_loss": 3.3361550793494956, + "tokens_seen": 2825919488 + }, + { + "epoch": 9.05, + "learning_rate": 7.268806419257773e-05, + "loss": 2.6341, + "theoretical_loss": 3.3361490583069653, + "tokens_seen": 2825985024 + }, + { + "epoch": 9.05, + "learning_rate": 7.267803410230691e-05, + "loss": 2.456, + "theoretical_loss": 3.336143037443159, + "tokens_seen": 2826050560 + }, + { + "epoch": 9.05, + "learning_rate": 7.266800401203611e-05, + "loss": 2.6128, + "theoretical_loss": 3.336137016758069, + "tokens_seen": 2826116096 + }, + { + "epoch": 9.05, + "learning_rate": 7.265797392176529e-05, + "loss": 2.5758, + "theoretical_loss": 3.3361309962516845, + "tokens_seen": 2826181632 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3110476, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6412525177001953, + "objective/train/theoretical_loss": 3.336126480989167, + "objective/train/tokens_used": 2846690784, + "theoretical_loss": 3.336126480989167, + "tokens_seen": 2826230784 + }, + { + "epoch": 9.05, + "learning_rate": 7.264794383149448e-05, + "loss": 2.3107, + "theoretical_loss": 3.3361249759239975, + "tokens_seen": 2826247168 + }, + { + "epoch": 9.05, + "learning_rate": 7.263791374122367e-05, + "loss": 2.3819, + "theoretical_loss": 3.336118955774997, + "tokens_seen": 2826312704 + }, + { + "epoch": 9.05, + "learning_rate": 7.262788365095287e-05, + "loss": 2.476, + "theoretical_loss": 3.3361129358046746, + "tokens_seen": 2826378240 + }, + { + "epoch": 9.05, + "learning_rate": 7.261785356068205e-05, + "loss": 2.4325, + "theoretical_loss": 3.33610691601302, + "tokens_seen": 2826443776 + }, + { + "epoch": 9.05, + "learning_rate": 7.260782347041124e-05, + "loss": 2.4252, + "theoretical_loss": 3.336100896400025, + "tokens_seen": 2826509312 + }, + { + "epoch": 9.05, + "learning_rate": 7.259779338014043e-05, + "loss": 2.5878, + "theoretical_loss": 3.336094876965679, + "tokens_seen": 2826574848 + }, + { + "epoch": 9.05, + "learning_rate": 7.258776328986961e-05, + "loss": 2.4458, + "theoretical_loss": 3.336088857709973, + "tokens_seen": 2826640384 + }, + { + "epoch": 9.05, + "learning_rate": 7.25777331995988e-05, + "loss": 2.4333, + "theoretical_loss": 3.3360828386328976, + "tokens_seen": 2826705920 + }, + { + "epoch": 9.05, + "learning_rate": 7.256770310932799e-05, + "loss": 2.3542, + "theoretical_loss": 3.3360768197344433, + "tokens_seen": 2826771456 + }, + { + "epoch": 9.05, + "learning_rate": 7.255767301905718e-05, + "loss": 2.6672, + "theoretical_loss": 3.3360708010146003, + "tokens_seen": 2826836992 + }, + { + "epoch": 9.05, + "learning_rate": 7.254764292878636e-05, + "loss": 2.6666, + "theoretical_loss": 3.33606478247336, + "tokens_seen": 2826902528 + }, + { + "epoch": 9.05, + "learning_rate": 7.253761283851556e-05, + "loss": 2.6276, + "theoretical_loss": 3.336058764110712, + "tokens_seen": 2826968064 + }, + { + "epoch": 9.05, + "learning_rate": 7.252758274824474e-05, + "loss": 2.5024, + "theoretical_loss": 3.336052745926648, + "tokens_seen": 2827033600 + }, + { + "epoch": 9.05, + "learning_rate": 7.251755265797392e-05, + "loss": 2.6741, + "theoretical_loss": 3.336046727921157, + "tokens_seen": 2827099136 + }, + { + "epoch": 9.05, + "learning_rate": 7.250752256770311e-05, + "loss": 2.5814, + "theoretical_loss": 3.336040710094231, + "tokens_seen": 2827164672 + }, + { + "epoch": 9.05, + "learning_rate": 7.24974924774323e-05, + "loss": 2.5774, + "theoretical_loss": 3.33603469244586, + "tokens_seen": 2827230208 + }, + { + "epoch": 9.05, + "learning_rate": 7.248746238716149e-05, + "loss": 2.3766, + "theoretical_loss": 3.3360286749760344, + "tokens_seen": 2827295744 + }, + { + "epoch": 9.05, + "learning_rate": 7.247743229689067e-05, + "loss": 2.5361, + "theoretical_loss": 3.336022657684745, + "tokens_seen": 2827361280 + }, + { + "epoch": 9.05, + "learning_rate": 7.246740220661987e-05, + "loss": 2.49, + "theoretical_loss": 3.3360166405719824, + "tokens_seen": 2827426816 + }, + { + "epoch": 9.05, + "learning_rate": 7.245737211634905e-05, + "loss": 2.7176, + "theoretical_loss": 3.336010623637737, + "tokens_seen": 2827492352 + }, + { + "epoch": 9.05, + "learning_rate": 7.244734202607824e-05, + "loss": 2.2236, + "theoretical_loss": 3.3360046068819997, + "tokens_seen": 2827557888 + }, + { + "epoch": 9.05, + "learning_rate": 7.243731193580742e-05, + "loss": 2.4232, + "theoretical_loss": 3.3359985903047606, + "tokens_seen": 2827623424 + }, + { + "epoch": 9.05, + "learning_rate": 7.24272818455366e-05, + "loss": 2.4257, + "theoretical_loss": 3.3359925739060103, + "tokens_seen": 2827688960 + }, + { + "epoch": 9.05, + "learning_rate": 7.24172517552658e-05, + "loss": 2.4383, + "theoretical_loss": 3.3359865576857395, + "tokens_seen": 2827754496 + }, + { + "epoch": 9.05, + "learning_rate": 7.240722166499498e-05, + "loss": 2.5196, + "theoretical_loss": 3.335980541643939, + "tokens_seen": 2827820032 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3111771, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.474888563156128, + "objective/train/theoretical_loss": 3.335976029729704, + "objective/train/tokens_used": 2848329184, + "theoretical_loss": 3.335976029729704, + "tokens_seen": 2827869184 + }, + { + "epoch": 9.05, + "learning_rate": 7.239719157472417e-05, + "loss": 2.5369, + "theoretical_loss": 3.3359745257805993, + "tokens_seen": 2827885568 + }, + { + "epoch": 9.05, + "learning_rate": 7.238716148445336e-05, + "loss": 2.4891, + "theoretical_loss": 3.335968510095711, + "tokens_seen": 2827951104 + }, + { + "epoch": 9.05, + "learning_rate": 7.237713139418255e-05, + "loss": 2.4773, + "theoretical_loss": 3.335962494589264, + "tokens_seen": 2828016640 + }, + { + "epoch": 9.05, + "learning_rate": 7.236710130391173e-05, + "loss": 2.5334, + "theoretical_loss": 3.3359564792612497, + "tokens_seen": 2828082176 + }, + { + "epoch": 9.05, + "learning_rate": 7.235707121364093e-05, + "loss": 2.3856, + "theoretical_loss": 3.335950464111658, + "tokens_seen": 2828147712 + }, + { + "epoch": 9.05, + "learning_rate": 7.234704112337011e-05, + "loss": 2.6345, + "theoretical_loss": 3.33594444914048, + "tokens_seen": 2828213248 + }, + { + "epoch": 9.05, + "learning_rate": 7.23370110330993e-05, + "loss": 2.3716, + "theoretical_loss": 3.3359384343477063, + "tokens_seen": 2828278784 + }, + { + "epoch": 9.05, + "learning_rate": 7.232698094282848e-05, + "loss": 2.578, + "theoretical_loss": 3.3359324197333273, + "tokens_seen": 2828344320 + }, + { + "epoch": 9.05, + "learning_rate": 7.231695085255766e-05, + "loss": 2.4609, + "theoretical_loss": 3.3359264052973336, + "tokens_seen": 2828409856 + }, + { + "epoch": 9.05, + "learning_rate": 7.230692076228686e-05, + "loss": 2.3574, + "theoretical_loss": 3.3359203910397155, + "tokens_seen": 2828475392 + }, + { + "epoch": 9.05, + "learning_rate": 7.229689067201604e-05, + "loss": 2.3589, + "theoretical_loss": 3.335914376960464, + "tokens_seen": 2828540928 + }, + { + "epoch": 9.05, + "learning_rate": 7.228686058174523e-05, + "loss": 2.5108, + "theoretical_loss": 3.3359083630595694, + "tokens_seen": 2828606464 + }, + { + "epoch": 9.05, + "learning_rate": 7.227683049147442e-05, + "loss": 2.4254, + "theoretical_loss": 3.3359023493370223, + "tokens_seen": 2828672000 + }, + { + "epoch": 9.05, + "learning_rate": 7.226680040120361e-05, + "loss": 2.3433, + "theoretical_loss": 3.3358963357928135, + "tokens_seen": 2828737536 + }, + { + "epoch": 9.05, + "learning_rate": 7.22567703109328e-05, + "loss": 2.4847, + "theoretical_loss": 3.335890322426933, + "tokens_seen": 2828803072 + }, + { + "epoch": 9.05, + "learning_rate": 7.2246740220662e-05, + "loss": 2.5292, + "theoretical_loss": 3.3358843092393724, + "tokens_seen": 2828868608 + }, + { + "epoch": 9.05, + "learning_rate": 7.223671013039118e-05, + "loss": 2.5681, + "theoretical_loss": 3.3358782962301214, + "tokens_seen": 2828934144 + }, + { + "epoch": 9.05, + "learning_rate": 7.222668004012036e-05, + "loss": 2.643, + "theoretical_loss": 3.335872283399171, + "tokens_seen": 2828999680 + }, + { + "epoch": 9.05, + "learning_rate": 7.221664994984956e-05, + "loss": 2.4738, + "theoretical_loss": 3.3358662707465117, + "tokens_seen": 2829065216 + }, + { + "epoch": 9.05, + "learning_rate": 7.220661985957874e-05, + "loss": 2.5504, + "theoretical_loss": 3.3358602582721337, + "tokens_seen": 2829130752 + }, + { + "epoch": 9.05, + "learning_rate": 7.219658976930793e-05, + "loss": 2.3837, + "theoretical_loss": 3.3358542459760283, + "tokens_seen": 2829196288 + }, + { + "epoch": 9.05, + "learning_rate": 7.218655967903711e-05, + "loss": 2.4726, + "theoretical_loss": 3.3358482338581856, + "tokens_seen": 2829261824 + }, + { + "epoch": 9.05, + "learning_rate": 7.217652958876631e-05, + "loss": 2.5591, + "theoretical_loss": 3.3358422219185964, + "tokens_seen": 2829327360 + }, + { + "epoch": 9.05, + "learning_rate": 7.216649949849549e-05, + "loss": 2.4347, + "theoretical_loss": 3.335836210157251, + "tokens_seen": 2829392896 + }, + { + "epoch": 9.05, + "learning_rate": 7.215646940822468e-05, + "loss": 2.3011, + "theoretical_loss": 3.3358301985741403, + "tokens_seen": 2829458432 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3112456, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.512845516204834, + "objective/train/theoretical_loss": 3.3358256900037677, + "objective/train/tokens_used": 2849967584, + "theoretical_loss": 3.3358256900037677, + "tokens_seen": 2829507584 + }, + { + "epoch": 9.05, + "learning_rate": 7.214643931795387e-05, + "loss": 2.5023, + "theoretical_loss": 3.3358241871692544, + "tokens_seen": 2829523968 + }, + { + "epoch": 9.05, + "learning_rate": 7.213640922768305e-05, + "loss": 2.4557, + "theoretical_loss": 3.3358181759425847, + "tokens_seen": 2829589504 + }, + { + "epoch": 9.05, + "learning_rate": 7.212637913741224e-05, + "loss": 2.4216, + "theoretical_loss": 3.3358121648941212, + "tokens_seen": 2829655040 + }, + { + "epoch": 9.05, + "learning_rate": 7.211634904714142e-05, + "loss": 2.3943, + "theoretical_loss": 3.3358061540238544, + "tokens_seen": 2829720576 + }, + { + "epoch": 9.05, + "learning_rate": 7.210631895687062e-05, + "loss": 2.4175, + "theoretical_loss": 3.335800143331775, + "tokens_seen": 2829786112 + }, + { + "epoch": 9.05, + "learning_rate": 7.20962888665998e-05, + "loss": 2.4382, + "theoretical_loss": 3.3357941328178744, + "tokens_seen": 2829851648 + }, + { + "epoch": 9.05, + "learning_rate": 7.208625877632899e-05, + "loss": 2.5505, + "theoretical_loss": 3.3357881224821417, + "tokens_seen": 2829917184 + }, + { + "epoch": 9.05, + "learning_rate": 7.207622868605817e-05, + "loss": 2.5105, + "theoretical_loss": 3.335782112324569, + "tokens_seen": 2829982720 + }, + { + "epoch": 9.05, + "learning_rate": 7.206619859578737e-05, + "loss": 2.6769, + "theoretical_loss": 3.3357761023451458, + "tokens_seen": 2830048256 + }, + { + "epoch": 9.05, + "learning_rate": 7.205616850551655e-05, + "loss": 2.5972, + "theoretical_loss": 3.335770092543863, + "tokens_seen": 2830113792 + }, + { + "epoch": 9.05, + "learning_rate": 7.204613841524574e-05, + "loss": 2.5399, + "theoretical_loss": 3.3357640829207114, + "tokens_seen": 2830179328 + }, + { + "epoch": 9.05, + "learning_rate": 7.203610832497493e-05, + "loss": 2.5439, + "theoretical_loss": 3.3357580734756813, + "tokens_seen": 2830244864 + }, + { + "epoch": 9.05, + "learning_rate": 7.20260782347041e-05, + "loss": 2.3918, + "theoretical_loss": 3.335752064208764, + "tokens_seen": 2830310400 + }, + { + "epoch": 9.05, + "learning_rate": 7.20160481444333e-05, + "loss": 2.4668, + "theoretical_loss": 3.335746055119949, + "tokens_seen": 2830375936 + }, + { + "epoch": 9.05, + "learning_rate": 7.200601805416248e-05, + "loss": 2.4866, + "theoretical_loss": 3.3357400462092275, + "tokens_seen": 2830441472 + }, + { + "epoch": 9.05, + "learning_rate": 7.199598796389168e-05, + "loss": 2.717, + "theoretical_loss": 3.3357340374765903, + "tokens_seen": 2830507008 + }, + { + "epoch": 9.05, + "learning_rate": 7.198595787362086e-05, + "loss": 2.4148, + "theoretical_loss": 3.335728028922028, + "tokens_seen": 2830572544 + }, + { + "epoch": 9.05, + "learning_rate": 7.197592778335005e-05, + "loss": 2.5159, + "theoretical_loss": 3.3357220205455302, + "tokens_seen": 2830638080 + }, + { + "epoch": 9.05, + "learning_rate": 7.196589769307923e-05, + "loss": 2.5961, + "theoretical_loss": 3.335716012347089, + "tokens_seen": 2830703616 + }, + { + "epoch": 9.05, + "learning_rate": 7.195586760280843e-05, + "loss": 2.2895, + "theoretical_loss": 3.3357100043266934, + "tokens_seen": 2830769152 + }, + { + "epoch": 9.05, + "learning_rate": 7.194583751253761e-05, + "loss": 2.4945, + "theoretical_loss": 3.3357039964843356, + "tokens_seen": 2830834688 + }, + { + "epoch": 9.05, + "learning_rate": 7.193580742226679e-05, + "loss": 2.5031, + "theoretical_loss": 3.335697988820005, + "tokens_seen": 2830900224 + }, + { + "epoch": 9.05, + "learning_rate": 7.192577733199599e-05, + "loss": 2.6141, + "theoretical_loss": 3.335691981333693, + "tokens_seen": 2830965760 + }, + { + "epoch": 9.05, + "learning_rate": 7.191574724172517e-05, + "loss": 2.7367, + "theoretical_loss": 3.33568597402539, + "tokens_seen": 2831031296 + }, + { + "epoch": 9.05, + "learning_rate": 7.190571715145436e-05, + "loss": 2.5713, + "theoretical_loss": 3.335679966895086, + "tokens_seen": 2831096832 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3113798, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2083704471588135, + "objective/train/theoretical_loss": 3.335675461664165, + "objective/train/tokens_used": 2851605984, + "theoretical_loss": 3.335675461664165, + "tokens_seen": 2831145984 + }, + { + "epoch": 9.05, + "learning_rate": 7.189568706118354e-05, + "loss": 2.2972, + "theoretical_loss": 3.3356739599427723, + "tokens_seen": 2831162368 + }, + { + "epoch": 9.05, + "learning_rate": 7.188565697091274e-05, + "loss": 2.6682, + "theoretical_loss": 3.3356679531684392, + "tokens_seen": 2831227904 + }, + { + "epoch": 9.05, + "learning_rate": 7.187562688064193e-05, + "loss": 2.4884, + "theoretical_loss": 3.3356619465720776, + "tokens_seen": 2831293440 + }, + { + "epoch": 9.05, + "learning_rate": 7.186559679037113e-05, + "loss": 2.6148, + "theoretical_loss": 3.3356559401536776, + "tokens_seen": 2831358976 + }, + { + "epoch": 9.05, + "learning_rate": 7.185556670010031e-05, + "loss": 2.3565, + "theoretical_loss": 3.3356499339132304, + "tokens_seen": 2831424512 + }, + { + "epoch": 9.05, + "learning_rate": 7.18455366098295e-05, + "loss": 2.689, + "theoretical_loss": 3.335643927850726, + "tokens_seen": 2831490048 + }, + { + "epoch": 9.05, + "learning_rate": 7.183550651955868e-05, + "loss": 2.5171, + "theoretical_loss": 3.335637921966156, + "tokens_seen": 2831555584 + }, + { + "epoch": 9.05, + "learning_rate": 7.182547642928786e-05, + "loss": 2.4447, + "theoretical_loss": 3.33563191625951, + "tokens_seen": 2831621120 + }, + { + "epoch": 9.05, + "learning_rate": 7.181544633901706e-05, + "loss": 2.4241, + "theoretical_loss": 3.3356259107307786, + "tokens_seen": 2831686656 + }, + { + "epoch": 9.05, + "learning_rate": 7.180541624874624e-05, + "loss": 2.5041, + "theoretical_loss": 3.3356199053799527, + "tokens_seen": 2831752192 + }, + { + "epoch": 9.05, + "learning_rate": 7.179538615847543e-05, + "loss": 2.5991, + "theoretical_loss": 3.3356139002070235, + "tokens_seen": 2831817728 + }, + { + "epoch": 9.05, + "learning_rate": 7.178535606820462e-05, + "loss": 2.4936, + "theoretical_loss": 3.335607895211981, + "tokens_seen": 2831883264 + }, + { + "epoch": 9.05, + "learning_rate": 7.177532597793381e-05, + "loss": 2.6421, + "theoretical_loss": 3.3356018903948157, + "tokens_seen": 2831948800 + }, + { + "epoch": 9.05, + "learning_rate": 7.176529588766299e-05, + "loss": 2.3729, + "theoretical_loss": 3.3355958857555184, + "tokens_seen": 2832014336 + }, + { + "epoch": 9.05, + "learning_rate": 7.175526579739219e-05, + "loss": 2.5688, + "theoretical_loss": 3.3355898812940796, + "tokens_seen": 2832079872 + }, + { + "epoch": 9.05, + "learning_rate": 7.174523570712137e-05, + "loss": 2.6224, + "theoretical_loss": 3.33558387701049, + "tokens_seen": 2832145408 + }, + { + "epoch": 9.05, + "learning_rate": 7.173520561685055e-05, + "loss": 2.5301, + "theoretical_loss": 3.335577872904741, + "tokens_seen": 2832210944 + }, + { + "epoch": 9.05, + "learning_rate": 7.172517552657974e-05, + "loss": 2.6476, + "theoretical_loss": 3.335571868976822, + "tokens_seen": 2832276480 + }, + { + "epoch": 9.05, + "learning_rate": 7.171514543630892e-05, + "loss": 2.7141, + "theoretical_loss": 3.335565865226724, + "tokens_seen": 2832342016 + }, + { + "epoch": 9.05, + "learning_rate": 7.170511534603812e-05, + "loss": 2.4636, + "theoretical_loss": 3.3355598616544375, + "tokens_seen": 2832407552 + }, + { + "epoch": 9.05, + "learning_rate": 7.16950852557673e-05, + "loss": 2.6452, + "theoretical_loss": 3.3355538582599538, + "tokens_seen": 2832473088 + }, + { + "epoch": 9.05, + "learning_rate": 7.16850551654965e-05, + "loss": 2.642, + "theoretical_loss": 3.335547855043263, + "tokens_seen": 2832538624 + }, + { + "epoch": 9.05, + "learning_rate": 7.167502507522568e-05, + "loss": 2.3198, + "theoretical_loss": 3.3355418520043556, + "tokens_seen": 2832604160 + }, + { + "epoch": 9.05, + "learning_rate": 7.166499498495487e-05, + "loss": 2.5708, + "theoretical_loss": 3.3355358491432225, + "tokens_seen": 2832669696 + }, + { + "epoch": 9.05, + "learning_rate": 7.165496489468405e-05, + "loss": 2.6009, + "theoretical_loss": 3.3355298464598544, + "tokens_seen": 2832735232 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3114259, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.892911911010742, + "objective/train/theoretical_loss": 3.3355253445639805, + "objective/train/tokens_used": 2853244384, + "theoretical_loss": 3.3355253445639805, + "tokens_seen": 2832784384 + }, + { + "epoch": 9.05, + "learning_rate": 7.164493480441323e-05, + "loss": 2.629, + "theoretical_loss": 3.3355238439542414, + "tokens_seen": 2832800768 + }, + { + "epoch": 9.05, + "learning_rate": 7.163490471414243e-05, + "loss": 2.2791, + "theoretical_loss": 3.3355178416263747, + "tokens_seen": 2832866304 + }, + { + "epoch": 9.05, + "learning_rate": 7.162487462387161e-05, + "loss": 2.3665, + "theoretical_loss": 3.3355118394762444, + "tokens_seen": 2832931840 + }, + { + "epoch": 9.05, + "learning_rate": 7.16148445336008e-05, + "loss": 2.5986, + "theoretical_loss": 3.3355058375038418, + "tokens_seen": 2832997376 + }, + { + "epoch": 9.05, + "learning_rate": 7.160481444332998e-05, + "loss": 2.3972, + "theoretical_loss": 3.335499835709157, + "tokens_seen": 2833062912 + }, + { + "epoch": 9.05, + "learning_rate": 7.159478435305918e-05, + "loss": 2.5189, + "theoretical_loss": 3.3354938340921807, + "tokens_seen": 2833128448 + }, + { + "epoch": 9.05, + "learning_rate": 7.158475426278836e-05, + "loss": 2.4092, + "theoretical_loss": 3.3354878326529036, + "tokens_seen": 2833193984 + }, + { + "epoch": 9.05, + "learning_rate": 7.157472417251756e-05, + "loss": 2.3758, + "theoretical_loss": 3.3354818313913164, + "tokens_seen": 2833259520 + }, + { + "epoch": 9.05, + "learning_rate": 7.156469408224674e-05, + "loss": 2.4183, + "theoretical_loss": 3.33547583030741, + "tokens_seen": 2833325056 + }, + { + "epoch": 9.05, + "learning_rate": 7.155466399197592e-05, + "loss": 2.7433, + "theoretical_loss": 3.335469829401174, + "tokens_seen": 2833390592 + }, + { + "epoch": 9.05, + "learning_rate": 7.154463390170511e-05, + "loss": 2.3742, + "theoretical_loss": 3.3354638286726, + "tokens_seen": 2833456128 + }, + { + "epoch": 9.05, + "learning_rate": 7.153460381143429e-05, + "loss": 2.4386, + "theoretical_loss": 3.3354578281216787, + "tokens_seen": 2833521664 + }, + { + "epoch": 9.05, + "learning_rate": 7.152457372116349e-05, + "loss": 2.5075, + "theoretical_loss": 3.3354518277484004, + "tokens_seen": 2833587200 + }, + { + "epoch": 9.05, + "learning_rate": 7.151454363089267e-05, + "loss": 2.6572, + "theoretical_loss": 3.335445827552755, + "tokens_seen": 2833652736 + }, + { + "epoch": 9.05, + "learning_rate": 7.150451354062188e-05, + "loss": 2.4277, + "theoretical_loss": 3.3354398275347346, + "tokens_seen": 2833718272 + }, + { + "epoch": 9.05, + "learning_rate": 7.149448345035106e-05, + "loss": 2.497, + "theoretical_loss": 3.3354338276943287, + "tokens_seen": 2833783808 + }, + { + "epoch": 9.05, + "learning_rate": 7.148445336008025e-05, + "loss": 2.43, + "theoretical_loss": 3.3354278280315284, + "tokens_seen": 2833849344 + }, + { + "epoch": 9.05, + "learning_rate": 7.147442326980943e-05, + "loss": 2.5561, + "theoretical_loss": 3.335421828546324, + "tokens_seen": 2833914880 + }, + { + "epoch": 9.05, + "learning_rate": 7.146439317953863e-05, + "loss": 2.6864, + "theoretical_loss": 3.335415829238707, + "tokens_seen": 2833980416 + }, + { + "epoch": 9.05, + "learning_rate": 7.145436308926781e-05, + "loss": 2.4556, + "theoretical_loss": 3.335409830108667, + "tokens_seen": 2834045952 + }, + { + "epoch": 9.05, + "learning_rate": 7.144433299899699e-05, + "loss": 2.6418, + "theoretical_loss": 3.335403831156195, + "tokens_seen": 2834111488 + }, + { + "epoch": 9.05, + "learning_rate": 7.143430290872619e-05, + "loss": 2.5709, + "theoretical_loss": 3.335397832381282, + "tokens_seen": 2834177024 + }, + { + "epoch": 9.05, + "learning_rate": 7.142427281845537e-05, + "loss": 2.3779, + "theoretical_loss": 3.335391833783918, + "tokens_seen": 2834242560 + }, + { + "epoch": 9.05, + "learning_rate": 7.141424272818456e-05, + "loss": 2.3517, + "theoretical_loss": 3.335385835364094, + "tokens_seen": 2834308096 + }, + { + "epoch": 9.05, + "learning_rate": 7.140421263791374e-05, + "loss": 2.3169, + "theoretical_loss": 3.335379837121801, + "tokens_seen": 2834373632 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3115758, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6162445545196533, + "objective/train/theoretical_loss": 3.33537533855658, + "objective/train/tokens_used": 2854882784, + "theoretical_loss": 3.33537533855658, + "tokens_seen": 2834422784 + }, + { + "epoch": 9.05, + "learning_rate": 7.139418254764294e-05, + "loss": 2.5001, + "theoretical_loss": 3.335373839057029, + "tokens_seen": 2834439168 + }, + { + "epoch": 9.05, + "learning_rate": 7.138415245737212e-05, + "loss": 2.4818, + "theoretical_loss": 3.335367841169769, + "tokens_seen": 2834504704 + }, + { + "epoch": 9.05, + "learning_rate": 7.137412236710131e-05, + "loss": 2.4575, + "theoretical_loss": 3.3353618434600114, + "tokens_seen": 2834570240 + }, + { + "epoch": 9.05, + "learning_rate": 7.13640922768305e-05, + "loss": 2.5612, + "theoretical_loss": 3.3353558459277473, + "tokens_seen": 2834635776 + }, + { + "epoch": 9.05, + "learning_rate": 7.135406218655968e-05, + "loss": 2.5293, + "theoretical_loss": 3.3353498485729665, + "tokens_seen": 2834701312 + }, + { + "epoch": 9.05, + "learning_rate": 7.134403209628887e-05, + "loss": 2.4063, + "theoretical_loss": 3.3353438513956606, + "tokens_seen": 2834766848 + }, + { + "epoch": 9.05, + "learning_rate": 7.133400200601805e-05, + "loss": 2.3225, + "theoretical_loss": 3.33533785439582, + "tokens_seen": 2834832384 + }, + { + "epoch": 9.05, + "learning_rate": 7.132397191574725e-05, + "loss": 2.2525, + "theoretical_loss": 3.3353318575734345, + "tokens_seen": 2834897920 + }, + { + "epoch": 9.05, + "learning_rate": 7.131394182547643e-05, + "loss": 2.4538, + "theoretical_loss": 3.3353258609284957, + "tokens_seen": 2834963456 + }, + { + "epoch": 9.05, + "learning_rate": 7.130391173520562e-05, + "loss": 2.6547, + "theoretical_loss": 3.3353198644609945, + "tokens_seen": 2835028992 + }, + { + "epoch": 9.05, + "learning_rate": 7.12938816449348e-05, + "loss": 2.5011, + "theoretical_loss": 3.3353138681709202, + "tokens_seen": 2835094528 + }, + { + "epoch": 9.05, + "learning_rate": 7.1283851554664e-05, + "loss": 2.5753, + "theoretical_loss": 3.335307872058265, + "tokens_seen": 2835160064 + }, + { + "epoch": 9.05, + "learning_rate": 7.127382146439318e-05, + "loss": 2.4553, + "theoretical_loss": 3.3353018761230184, + "tokens_seen": 2835225600 + }, + { + "epoch": 9.05, + "learning_rate": 7.126379137412237e-05, + "loss": 2.4132, + "theoretical_loss": 3.3352958803651713, + "tokens_seen": 2835291136 + }, + { + "epoch": 9.05, + "learning_rate": 7.125376128385155e-05, + "loss": 2.5063, + "theoretical_loss": 3.3352898847847148, + "tokens_seen": 2835356672 + }, + { + "epoch": 9.05, + "learning_rate": 7.124373119358074e-05, + "loss": 2.487, + "theoretical_loss": 3.335283889381639, + "tokens_seen": 2835422208 + }, + { + "epoch": 9.05, + "learning_rate": 7.123370110330993e-05, + "loss": 2.4572, + "theoretical_loss": 3.3352778941559356, + "tokens_seen": 2835487744 + }, + { + "epoch": 9.05, + "learning_rate": 7.122367101303911e-05, + "loss": 2.5384, + "theoretical_loss": 3.3352718991075934, + "tokens_seen": 2835553280 + }, + { + "epoch": 9.05, + "learning_rate": 7.12136409227683e-05, + "loss": 2.6556, + "theoretical_loss": 3.335265904236605, + "tokens_seen": 2835618816 + }, + { + "epoch": 9.05, + "learning_rate": 7.120361083249749e-05, + "loss": 2.526, + "theoretical_loss": 3.33525990954296, + "tokens_seen": 2835684352 + }, + { + "epoch": 9.05, + "learning_rate": 7.119358074222668e-05, + "loss": 2.6132, + "theoretical_loss": 3.3352539150266485, + "tokens_seen": 2835749888 + }, + { + "epoch": 9.05, + "learning_rate": 7.118355065195586e-05, + "loss": 2.6482, + "theoretical_loss": 3.3352479206876624, + "tokens_seen": 2835815424 + }, + { + "epoch": 9.05, + "learning_rate": 7.117352056168506e-05, + "loss": 2.4331, + "theoretical_loss": 3.335241926525992, + "tokens_seen": 2835880960 + }, + { + "epoch": 9.05, + "learning_rate": 7.116349047141424e-05, + "loss": 2.4222, + "theoretical_loss": 3.3352359325416274, + "tokens_seen": 2835946496 + }, + { + "epoch": 9.05, + "learning_rate": 7.115346038114342e-05, + "loss": 2.7726, + "theoretical_loss": 3.3352299387345603, + "tokens_seen": 2836012032 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3116453, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3123247623443604, + "objective/train/theoretical_loss": 3.335225443495605, + "objective/train/tokens_used": 2856521184, + "theoretical_loss": 3.335225443495605, + "tokens_seen": 2836061184 + }, + { + "epoch": 9.05, + "learning_rate": 7.114343029087261e-05, + "loss": 2.5464, + "theoretical_loss": 3.3352239451047803, + "tokens_seen": 2836077568 + }, + { + "epoch": 9.05, + "learning_rate": 7.113340020060181e-05, + "loss": 2.5186, + "theoretical_loss": 3.3352179516522784, + "tokens_seen": 2836143104 + }, + { + "epoch": 9.05, + "learning_rate": 7.1123370110331e-05, + "loss": 2.1754, + "theoretical_loss": 3.3352119583770454, + "tokens_seen": 2836208640 + }, + { + "epoch": 9.05, + "learning_rate": 7.111334002006018e-05, + "loss": 2.3704, + "theoretical_loss": 3.335205965279072, + "tokens_seen": 2836274176 + }, + { + "epoch": 9.05, + "learning_rate": 7.110330992978938e-05, + "loss": 2.5163, + "theoretical_loss": 3.3351999723583488, + "tokens_seen": 2836339712 + }, + { + "epoch": 9.05, + "learning_rate": 7.109327983951856e-05, + "loss": 2.5422, + "theoretical_loss": 3.3351939796148664, + "tokens_seen": 2836405248 + }, + { + "epoch": 9.05, + "learning_rate": 7.108324974924776e-05, + "loss": 2.5639, + "theoretical_loss": 3.3351879870486156, + "tokens_seen": 2836470784 + }, + { + "epoch": 9.05, + "learning_rate": 7.107321965897694e-05, + "loss": 2.6502, + "theoretical_loss": 3.3351819946595866, + "tokens_seen": 2836536320 + }, + { + "epoch": 9.05, + "learning_rate": 7.106318956870612e-05, + "loss": 2.6018, + "theoretical_loss": 3.3351760024477706, + "tokens_seen": 2836601856 + }, + { + "epoch": 9.05, + "learning_rate": 7.105315947843531e-05, + "loss": 2.4992, + "theoretical_loss": 3.335170010413158, + "tokens_seen": 2836667392 + }, + { + "epoch": 9.05, + "learning_rate": 7.10431293881645e-05, + "loss": 2.4526, + "theoretical_loss": 3.33516401855574, + "tokens_seen": 2836732928 + }, + { + "epoch": 9.05, + "learning_rate": 7.103309929789369e-05, + "loss": 2.5207, + "theoretical_loss": 3.335158026875506, + "tokens_seen": 2836798464 + }, + { + "epoch": 9.05, + "learning_rate": 7.102306920762287e-05, + "loss": 2.3642, + "theoretical_loss": 3.335152035372448, + "tokens_seen": 2836864000 + }, + { + "epoch": 9.05, + "learning_rate": 7.101303911735206e-05, + "loss": 2.4575, + "theoretical_loss": 3.335146044046556, + "tokens_seen": 2836929536 + }, + { + "epoch": 9.05, + "learning_rate": 7.100300902708124e-05, + "loss": 2.5457, + "theoretical_loss": 3.3351400528978212, + "tokens_seen": 2836995072 + }, + { + "epoch": 9.05, + "learning_rate": 7.099297893681044e-05, + "loss": 2.5745, + "theoretical_loss": 3.3351340619262335, + "tokens_seen": 2837060608 + }, + { + "epoch": 9.05, + "learning_rate": 7.098294884653962e-05, + "loss": 2.5769, + "theoretical_loss": 3.335128071131784, + "tokens_seen": 2837126144 + }, + { + "epoch": 9.05, + "learning_rate": 7.097291875626882e-05, + "loss": 2.7244, + "theoretical_loss": 3.3351220805144637, + "tokens_seen": 2837191680 + }, + { + "epoch": 9.05, + "learning_rate": 7.0962888665998e-05, + "loss": 2.4498, + "theoretical_loss": 3.3351160900742625, + "tokens_seen": 2837257216 + }, + { + "epoch": 9.05, + "learning_rate": 7.095285857572718e-05, + "loss": 2.6485, + "theoretical_loss": 3.3351100998111716, + "tokens_seen": 2837322752 + }, + { + "epoch": 9.05, + "learning_rate": 7.094282848545637e-05, + "loss": 2.4012, + "theoretical_loss": 3.3351041097251812, + "tokens_seen": 2837388288 + }, + { + "epoch": 9.05, + "learning_rate": 7.093279839518555e-05, + "loss": 2.5412, + "theoretical_loss": 3.3350981198162826, + "tokens_seen": 2837453824 + }, + { + "epoch": 9.05, + "learning_rate": 7.092276830491475e-05, + "loss": 2.3694, + "theoretical_loss": 3.3350921300844663, + "tokens_seen": 2837519360 + }, + { + "epoch": 9.05, + "learning_rate": 7.091273821464393e-05, + "loss": 2.3489, + "theoretical_loss": 3.335086140529723, + "tokens_seen": 2837584896 + }, + { + "epoch": 9.05, + "learning_rate": 7.090270812437312e-05, + "loss": 2.3931, + "theoretical_loss": 3.335080151152043, + "tokens_seen": 2837650432 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3117693, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3868918418884277, + "objective/train/theoretical_loss": 3.3350756592349753, + "objective/train/tokens_used": 2858159584, + "theoretical_loss": 3.3350756592349753, + "tokens_seen": 2837699584 + }, + { + "epoch": 9.05, + "learning_rate": 7.08926780341023e-05, + "loss": 2.4442, + "theoretical_loss": 3.3350741619514173, + "tokens_seen": 2837715968 + }, + { + "epoch": 9.05, + "learning_rate": 7.08826479438315e-05, + "loss": 2.5692, + "theoretical_loss": 3.335068172927836, + "tokens_seen": 2837781504 + }, + { + "epoch": 9.05, + "learning_rate": 7.087261785356068e-05, + "loss": 2.63, + "theoretical_loss": 3.3350621840812913, + "tokens_seen": 2837847040 + }, + { + "epoch": 9.05, + "learning_rate": 7.086258776328986e-05, + "loss": 2.4414, + "theoretical_loss": 3.335056195411772, + "tokens_seen": 2837912576 + }, + { + "epoch": 9.05, + "learning_rate": 7.085255767301906e-05, + "loss": 2.3309, + "theoretical_loss": 3.33505020691927, + "tokens_seen": 2837978112 + }, + { + "epoch": 9.05, + "learning_rate": 7.084252758274824e-05, + "loss": 2.6522, + "theoretical_loss": 3.335044218603776, + "tokens_seen": 2838043648 + }, + { + "epoch": 9.05, + "learning_rate": 7.083249749247743e-05, + "loss": 2.4978, + "theoretical_loss": 3.3350382304652797, + "tokens_seen": 2838109184 + }, + { + "epoch": 9.05, + "learning_rate": 7.082246740220661e-05, + "loss": 2.6873, + "theoretical_loss": 3.3350322425037726, + "tokens_seen": 2838174720 + }, + { + "epoch": 9.05, + "learning_rate": 7.081243731193581e-05, + "loss": 2.5506, + "theoretical_loss": 3.335026254719245, + "tokens_seen": 2838240256 + }, + { + "epoch": 9.05, + "learning_rate": 7.080240722166499e-05, + "loss": 2.4422, + "theoretical_loss": 3.3350202671116875, + "tokens_seen": 2838305792 + }, + { + "epoch": 9.05, + "learning_rate": 7.079237713139418e-05, + "loss": 2.1816, + "theoretical_loss": 3.3350142796810913, + "tokens_seen": 2838371328 + }, + { + "epoch": 9.05, + "learning_rate": 7.078234704112337e-05, + "loss": 2.5189, + "theoretical_loss": 3.335008292427447, + "tokens_seen": 2838436864 + }, + { + "epoch": 9.05, + "learning_rate": 7.077231695085255e-05, + "loss": 2.5292, + "theoretical_loss": 3.335002305350745, + "tokens_seen": 2838502400 + }, + { + "epoch": 9.05, + "learning_rate": 7.076228686058174e-05, + "loss": 2.6072, + "theoretical_loss": 3.334996318450976, + "tokens_seen": 2838567936 + }, + { + "epoch": 9.05, + "learning_rate": 7.075225677031094e-05, + "loss": 2.4624, + "theoretical_loss": 3.3349903317281306, + "tokens_seen": 2838633472 + }, + { + "epoch": 9.05, + "learning_rate": 7.074222668004013e-05, + "loss": 2.3923, + "theoretical_loss": 3.3349843451822, + "tokens_seen": 2838699008 + }, + { + "epoch": 9.05, + "learning_rate": 7.073219658976931e-05, + "loss": 2.7007, + "theoretical_loss": 3.3349783588131743, + "tokens_seen": 2838764544 + }, + { + "epoch": 9.05, + "learning_rate": 7.07221664994985e-05, + "loss": 2.5387, + "theoretical_loss": 3.3349723726210447, + "tokens_seen": 2838830080 + }, + { + "epoch": 9.05, + "learning_rate": 7.071213640922769e-05, + "loss": 2.4441, + "theoretical_loss": 3.3349663866058012, + "tokens_seen": 2838895616 + }, + { + "epoch": 9.05, + "learning_rate": 7.070210631895688e-05, + "loss": 2.4919, + "theoretical_loss": 3.3349604007674354, + "tokens_seen": 2838961152 + }, + { + "epoch": 9.05, + "learning_rate": 7.069207622868606e-05, + "loss": 2.3713, + "theoretical_loss": 3.3349544151059374, + "tokens_seen": 2839026688 + }, + { + "epoch": 9.05, + "learning_rate": 7.068204613841526e-05, + "loss": 2.4058, + "theoretical_loss": 3.3349484296212974, + "tokens_seen": 2839092224 + }, + { + "epoch": 9.05, + "learning_rate": 7.067201604814444e-05, + "loss": 2.6764, + "theoretical_loss": 3.3349424443135076, + "tokens_seen": 2839157760 + }, + { + "epoch": 9.05, + "learning_rate": 7.066198595787362e-05, + "loss": 2.5437, + "theoretical_loss": 3.334936459182557, + "tokens_seen": 2839223296 + }, + { + "epoch": 9.05, + "learning_rate": 7.065195586760281e-05, + "loss": 2.2872, + "theoretical_loss": 3.334930474228438, + "tokens_seen": 2839288832 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3118361, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2741293907165527, + "objective/train/theoretical_loss": 3.3349259856288875, + "objective/train/tokens_used": 2859797984, + "theoretical_loss": 3.3349259856288875, + "tokens_seen": 2839337984 + }, + { + "epoch": 9.05, + "learning_rate": 7.0641925777332e-05, + "loss": 2.4527, + "theoretical_loss": 3.334924489451139, + "tokens_seen": 2839354368 + }, + { + "epoch": 9.05, + "learning_rate": 7.063189568706119e-05, + "loss": 2.6236, + "theoretical_loss": 3.334918504850653, + "tokens_seen": 2839419904 + }, + { + "epoch": 9.05, + "learning_rate": 7.062186559679037e-05, + "loss": 2.3621, + "theoretical_loss": 3.3349125204269696, + "tokens_seen": 2839485440 + }, + { + "epoch": 9.05, + "learning_rate": 7.061183550651957e-05, + "loss": 2.2026, + "theoretical_loss": 3.3349065361800796, + "tokens_seen": 2839550976 + }, + { + "epoch": 9.05, + "learning_rate": 7.060180541624875e-05, + "loss": 2.6059, + "theoretical_loss": 3.334900552109974, + "tokens_seen": 2839616512 + }, + { + "epoch": 9.05, + "learning_rate": 7.059177532597794e-05, + "loss": 2.3048, + "theoretical_loss": 3.334894568216643, + "tokens_seen": 2839682048 + }, + { + "epoch": 9.05, + "learning_rate": 7.058174523570712e-05, + "loss": 2.4875, + "theoretical_loss": 3.334888584500077, + "tokens_seen": 2839747584 + }, + { + "epoch": 9.05, + "learning_rate": 7.05717151454363e-05, + "loss": 2.371, + "theoretical_loss": 3.334882600960268, + "tokens_seen": 2839813120 + }, + { + "epoch": 9.05, + "learning_rate": 7.05616850551655e-05, + "loss": 2.4643, + "theoretical_loss": 3.3348766175972058, + "tokens_seen": 2839878656 + }, + { + "epoch": 9.05, + "learning_rate": 7.055165496489468e-05, + "loss": 2.2583, + "theoretical_loss": 3.334870634410881, + "tokens_seen": 2839944192 + }, + { + "epoch": 9.05, + "learning_rate": 7.054162487462387e-05, + "loss": 2.3101, + "theoretical_loss": 3.334864651401285, + "tokens_seen": 2840009728 + }, + { + "epoch": 9.05, + "learning_rate": 7.053159478435306e-05, + "loss": 2.2473, + "theoretical_loss": 3.3348586685684074, + "tokens_seen": 2840075264 + }, + { + "epoch": 9.05, + "learning_rate": 7.052156469408225e-05, + "loss": 2.3893, + "theoretical_loss": 3.3348526859122405, + "tokens_seen": 2840140800 + }, + { + "epoch": 9.05, + "learning_rate": 7.051153460381143e-05, + "loss": 2.6443, + "theoretical_loss": 3.334846703432773, + "tokens_seen": 2840206336 + }, + { + "epoch": 9.05, + "learning_rate": 7.050150451354063e-05, + "loss": 2.5428, + "theoretical_loss": 3.3348407211299973, + "tokens_seen": 2840271872 + }, + { + "epoch": 9.05, + "learning_rate": 7.049147442326981e-05, + "loss": 2.5685, + "theoretical_loss": 3.3348347390039033, + "tokens_seen": 2840337408 + }, + { + "epoch": 9.05, + "learning_rate": 7.0481444332999e-05, + "loss": 2.7043, + "theoretical_loss": 3.334828757054482, + "tokens_seen": 2840402944 + }, + { + "epoch": 9.05, + "learning_rate": 7.047141424272818e-05, + "loss": 2.5532, + "theoretical_loss": 3.3348227752817237, + "tokens_seen": 2840468480 + }, + { + "epoch": 9.05, + "learning_rate": 7.046138415245736e-05, + "loss": 2.6344, + "theoretical_loss": 3.3348167936856195, + "tokens_seen": 2840534016 + }, + { + "epoch": 9.05, + "learning_rate": 7.045135406218656e-05, + "loss": 2.6371, + "theoretical_loss": 3.33481081226616, + "tokens_seen": 2840599552 + }, + { + "epoch": 9.05, + "learning_rate": 7.044132397191574e-05, + "loss": 2.2211, + "theoretical_loss": 3.3348048310233356, + "tokens_seen": 2840665088 + }, + { + "epoch": 9.05, + "learning_rate": 7.043129388164493e-05, + "loss": 2.4027, + "theoretical_loss": 3.3347988499571377, + "tokens_seen": 2840730624 + }, + { + "epoch": 9.05, + "learning_rate": 7.042126379137412e-05, + "loss": 2.5921, + "theoretical_loss": 3.3347928690675563, + "tokens_seen": 2840796160 + }, + { + "epoch": 9.05, + "learning_rate": 7.041123370110331e-05, + "loss": 2.4748, + "theoretical_loss": 3.334786888354583, + "tokens_seen": 2840861696 + }, + { + "epoch": 9.05, + "learning_rate": 7.040120361083249e-05, + "loss": 2.5208, + "theoretical_loss": 3.3347809078182076, + "tokens_seen": 2840927232 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3119862, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.945129871368408, + "objective/train/theoretical_loss": 3.334776422531813, + "objective/train/tokens_used": 2861436384, + "theoretical_loss": 3.334776422531813, + "tokens_seen": 2840976384 + }, + { + "epoch": 9.05, + "learning_rate": 7.039117352056169e-05, + "loss": 2.7065, + "theoretical_loss": 3.3347749274584206, + "tokens_seen": 2840992768 + }, + { + "epoch": 9.05, + "learning_rate": 7.038114343029088e-05, + "loss": 2.6135, + "theoretical_loss": 3.3347689472752142, + "tokens_seen": 2841058304 + }, + { + "epoch": 9.05, + "learning_rate": 7.037111334002006e-05, + "loss": 2.503, + "theoretical_loss": 3.3347629672685777, + "tokens_seen": 2841123840 + }, + { + "epoch": 9.05, + "learning_rate": 7.036108324974926e-05, + "loss": 2.1928, + "theoretical_loss": 3.334756987438502, + "tokens_seen": 2841189376 + }, + { + "epoch": 9.05, + "learning_rate": 7.035105315947844e-05, + "loss": 2.5751, + "theoretical_loss": 3.3347510077849787, + "tokens_seen": 2841254912 + }, + { + "epoch": 9.05, + "learning_rate": 7.034102306920763e-05, + "loss": 2.3265, + "theoretical_loss": 3.3347450283079976, + "tokens_seen": 2841320448 + }, + { + "epoch": 9.05, + "learning_rate": 7.033099297893681e-05, + "loss": 2.5345, + "theoretical_loss": 3.33473904900755, + "tokens_seen": 2841385984 + }, + { + "epoch": 9.05, + "learning_rate": 7.032096288866601e-05, + "loss": 2.3925, + "theoretical_loss": 3.334733069883626, + "tokens_seen": 2841451520 + }, + { + "epoch": 9.05, + "learning_rate": 7.031093279839519e-05, + "loss": 2.4786, + "theoretical_loss": 3.3347270909362168, + "tokens_seen": 2841517056 + }, + { + "epoch": 9.05, + "learning_rate": 7.030090270812438e-05, + "loss": 2.3908, + "theoretical_loss": 3.3347211121653126, + "tokens_seen": 2841582592 + }, + { + "epoch": 9.05, + "learning_rate": 7.029087261785357e-05, + "loss": 2.4434, + "theoretical_loss": 3.3347151335709047, + "tokens_seen": 2841648128 + }, + { + "epoch": 9.05, + "learning_rate": 7.028084252758275e-05, + "loss": 2.4898, + "theoretical_loss": 3.334709155152984, + "tokens_seen": 2841713664 + }, + { + "epoch": 9.05, + "learning_rate": 7.027081243731194e-05, + "loss": 2.4476, + "theoretical_loss": 3.3347031769115407, + "tokens_seen": 2841779200 + }, + { + "epoch": 9.05, + "learning_rate": 7.026078234704112e-05, + "loss": 2.4393, + "theoretical_loss": 3.3346971988465657, + "tokens_seen": 2841844736 + }, + { + "epoch": 9.05, + "learning_rate": 7.025075225677032e-05, + "loss": 2.4699, + "theoretical_loss": 3.3346912209580495, + "tokens_seen": 2841910272 + }, + { + "epoch": 9.05, + "learning_rate": 7.02407221664995e-05, + "loss": 2.5499, + "theoretical_loss": 3.334685243245983, + "tokens_seen": 2841975808 + }, + { + "epoch": 9.05, + "learning_rate": 7.023069207622869e-05, + "loss": 2.4967, + "theoretical_loss": 3.3346792657103568, + "tokens_seen": 2842041344 + }, + { + "epoch": 9.05, + "learning_rate": 7.022066198595787e-05, + "loss": 2.4077, + "theoretical_loss": 3.334673288351162, + "tokens_seen": 2842106880 + }, + { + "epoch": 9.05, + "learning_rate": 7.021063189568707e-05, + "loss": 2.3918, + "theoretical_loss": 3.3346673111683893, + "tokens_seen": 2842172416 + }, + { + "epoch": 9.05, + "learning_rate": 7.020060180541625e-05, + "loss": 2.4316, + "theoretical_loss": 3.334661334162029, + "tokens_seen": 2842237952 + }, + { + "epoch": 9.05, + "learning_rate": 7.019057171514544e-05, + "loss": 2.6871, + "theoretical_loss": 3.3346553573320716, + "tokens_seen": 2842303488 + }, + { + "epoch": 9.05, + "learning_rate": 7.018054162487463e-05, + "loss": 2.7274, + "theoretical_loss": 3.334649380678509, + "tokens_seen": 2842369024 + }, + { + "epoch": 9.05, + "learning_rate": 7.01705115346038e-05, + "loss": 2.544, + "theoretical_loss": 3.334643404201331, + "tokens_seen": 2842434560 + }, + { + "epoch": 9.05, + "learning_rate": 7.0160481444333e-05, + "loss": 2.4854, + "theoretical_loss": 3.3346374279005278, + "tokens_seen": 2842500096 + }, + { + "epoch": 9.05, + "learning_rate": 7.015045135406218e-05, + "loss": 2.5916, + "theoretical_loss": 3.334631451776091, + "tokens_seen": 2842565632 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3120543, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.583648443222046, + "objective/train/theoretical_loss": 3.334626969798499, + "objective/train/tokens_used": 2863074784, + "theoretical_loss": 3.334626969798499, + "tokens_seen": 2842614784 + }, + { + "epoch": 9.05, + "learning_rate": 7.014042126379138e-05, + "loss": 2.3906, + "theoretical_loss": 3.334625475828012, + "tokens_seen": 2842631168 + }, + { + "epoch": 9.05, + "learning_rate": 7.013039117352056e-05, + "loss": 2.2878, + "theoretical_loss": 3.33461950005628, + "tokens_seen": 2842696704 + }, + { + "epoch": 9.05, + "learning_rate": 7.012036108324975e-05, + "loss": 2.3834, + "theoretical_loss": 3.3346135244608868, + "tokens_seen": 2842762240 + }, + { + "epoch": 9.05, + "learning_rate": 7.011033099297893e-05, + "loss": 2.7113, + "theoretical_loss": 3.3346075490418228, + "tokens_seen": 2842827776 + }, + { + "epoch": 9.05, + "learning_rate": 7.010030090270813e-05, + "loss": 2.4254, + "theoretical_loss": 3.334601573799078, + "tokens_seen": 2842893312 + }, + { + "epoch": 9.05, + "learning_rate": 7.009027081243731e-05, + "loss": 2.3905, + "theoretical_loss": 3.3345955987326446, + "tokens_seen": 2842958848 + }, + { + "epoch": 9.05, + "learning_rate": 7.008024072216649e-05, + "loss": 2.2589, + "theoretical_loss": 3.334589623842512, + "tokens_seen": 2843024384 + }, + { + "epoch": 9.05, + "learning_rate": 7.007021063189569e-05, + "loss": 2.449, + "theoretical_loss": 3.334583649128672, + "tokens_seen": 2843089920 + }, + { + "epoch": 9.05, + "learning_rate": 7.006018054162487e-05, + "loss": 2.5977, + "theoretical_loss": 3.334577674591115, + "tokens_seen": 2843155456 + }, + { + "epoch": 9.05, + "learning_rate": 7.005015045135406e-05, + "loss": 2.4547, + "theoretical_loss": 3.334571700229831, + "tokens_seen": 2843220992 + }, + { + "epoch": 9.05, + "learning_rate": 7.004012036108324e-05, + "loss": 2.2587, + "theoretical_loss": 3.3345657260448114, + "tokens_seen": 2843286528 + }, + { + "epoch": 9.05, + "learning_rate": 7.003009027081244e-05, + "loss": 2.6789, + "theoretical_loss": 3.334559752036047, + "tokens_seen": 2843352064 + }, + { + "epoch": 9.05, + "learning_rate": 7.002006018054162e-05, + "loss": 2.3901, + "theoretical_loss": 3.334553778203528, + "tokens_seen": 2843417600 + }, + { + "epoch": 9.05, + "learning_rate": 7.001003009027081e-05, + "loss": 2.5352, + "theoretical_loss": 3.334547804547246, + "tokens_seen": 2843483136 + }, + { + "epoch": 9.05, + "learning_rate": 7.000000000000001e-05, + "loss": 2.5053, + "theoretical_loss": 3.334541831067191, + "tokens_seen": 2843548672 + }, + { + "epoch": 9.05, + "learning_rate": 6.998996990972919e-05, + "loss": 2.4485, + "theoretical_loss": 3.334535857763354, + "tokens_seen": 2843614208 + }, + { + "epoch": 9.05, + "learning_rate": 6.997993981945838e-05, + "loss": 2.5533, + "theoretical_loss": 3.3345298846357254, + "tokens_seen": 2843679744 + }, + { + "epoch": 9.05, + "learning_rate": 6.996990972918756e-05, + "loss": 2.4259, + "theoretical_loss": 3.334523911684297, + "tokens_seen": 2843745280 + }, + { + "epoch": 9.05, + "learning_rate": 6.995987963891676e-05, + "loss": 2.4391, + "theoretical_loss": 3.3345179389090585, + "tokens_seen": 2843810816 + }, + { + "epoch": 9.05, + "learning_rate": 6.994984954864594e-05, + "loss": 2.4519, + "theoretical_loss": 3.334511966310001, + "tokens_seen": 2843876352 + }, + { + "epoch": 9.05, + "learning_rate": 6.993981945837513e-05, + "loss": 2.3061, + "theoretical_loss": 3.334505993887115, + "tokens_seen": 2843941888 + }, + { + "epoch": 9.05, + "learning_rate": 6.992978936810432e-05, + "loss": 2.4432, + "theoretical_loss": 3.3345000216403915, + "tokens_seen": 2844007424 + }, + { + "epoch": 9.05, + "learning_rate": 6.991975927783351e-05, + "loss": 2.5059, + "theoretical_loss": 3.3344940495698214, + "tokens_seen": 2844072960 + }, + { + "epoch": 9.05, + "learning_rate": 6.990972918756269e-05, + "loss": 2.6217, + "theoretical_loss": 3.334488077675395, + "tokens_seen": 2844138496 + }, + { + "epoch": 9.05, + "learning_rate": 6.989969909729189e-05, + "loss": 2.5369, + "theoretical_loss": 3.334482105957103, + "tokens_seen": 2844204032 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3121594, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.596513032913208, + "objective/train/theoretical_loss": 3.334477627283967, + "objective/train/tokens_used": 2864713184, + "theoretical_loss": 3.334477627283967, + "tokens_seen": 2844253184 + }, + { + "epoch": 9.05, + "learning_rate": 6.988966900702107e-05, + "loss": 2.4556, + "theoretical_loss": 3.334476134414937, + "tokens_seen": 2844269568 + }, + { + "epoch": 9.05, + "learning_rate": 6.987963891675025e-05, + "loss": 2.4493, + "theoretical_loss": 3.334470163048887, + "tokens_seen": 2844335104 + }, + { + "epoch": 9.05, + "learning_rate": 6.986960882647944e-05, + "loss": 2.5537, + "theoretical_loss": 3.334464191858944, + "tokens_seen": 2844400640 + }, + { + "epoch": 9.05, + "learning_rate": 6.985957873620862e-05, + "loss": 2.5302, + "theoretical_loss": 3.3344582208450984, + "tokens_seen": 2844466176 + }, + { + "epoch": 9.05, + "learning_rate": 6.984954864593782e-05, + "loss": 2.484, + "theoretical_loss": 3.3344522500073412, + "tokens_seen": 2844531712 + }, + { + "epoch": 9.05, + "learning_rate": 6.9839518555667e-05, + "loss": 2.311, + "theoretical_loss": 3.3344462793456633, + "tokens_seen": 2844597248 + }, + { + "epoch": 9.05, + "learning_rate": 6.98294884653962e-05, + "loss": 2.4302, + "theoretical_loss": 3.3344403088600556, + "tokens_seen": 2844662784 + }, + { + "epoch": 9.05, + "learning_rate": 6.981945837512538e-05, + "loss": 2.4406, + "theoretical_loss": 3.3344343385505084, + "tokens_seen": 2844728320 + }, + { + "epoch": 9.05, + "learning_rate": 6.980942828485457e-05, + "loss": 2.556, + "theoretical_loss": 3.3344283684170124, + "tokens_seen": 2844793856 + }, + { + "epoch": 9.05, + "learning_rate": 6.979939819458375e-05, + "loss": 2.3753, + "theoretical_loss": 3.3344223984595587, + "tokens_seen": 2844859392 + }, + { + "epoch": 9.05, + "learning_rate": 6.978936810431293e-05, + "loss": 2.5256, + "theoretical_loss": 3.334416428678138, + "tokens_seen": 2844924928 + }, + { + "epoch": 9.05, + "learning_rate": 6.977933801404213e-05, + "loss": 2.5719, + "theoretical_loss": 3.334410459072741, + "tokens_seen": 2844990464 + }, + { + "epoch": 9.05, + "learning_rate": 6.976930792377131e-05, + "loss": 2.5068, + "theoretical_loss": 3.3344044896433584, + "tokens_seen": 2845056000 + }, + { + "epoch": 9.05, + "learning_rate": 6.97592778335005e-05, + "loss": 2.351, + "theoretical_loss": 3.334398520389981, + "tokens_seen": 2845121536 + }, + { + "epoch": 9.05, + "learning_rate": 6.974924774322968e-05, + "loss": 2.6112, + "theoretical_loss": 3.3343925513125994, + "tokens_seen": 2845187072 + }, + { + "epoch": 9.05, + "learning_rate": 6.973921765295888e-05, + "loss": 2.4918, + "theoretical_loss": 3.334386582411205, + "tokens_seen": 2845252608 + }, + { + "epoch": 9.05, + "learning_rate": 6.972918756268806e-05, + "loss": 2.3206, + "theoretical_loss": 3.334380613685788, + "tokens_seen": 2845318144 + }, + { + "epoch": 9.05, + "learning_rate": 6.971915747241725e-05, + "loss": 2.4723, + "theoretical_loss": 3.3343746451363385, + "tokens_seen": 2845383680 + }, + { + "epoch": 9.05, + "learning_rate": 6.970912738214644e-05, + "loss": 2.6329, + "theoretical_loss": 3.334368676762849, + "tokens_seen": 2845449216 + }, + { + "epoch": 9.05, + "learning_rate": 6.969909729187562e-05, + "loss": 2.4107, + "theoretical_loss": 3.3343627085653083, + "tokens_seen": 2845514752 + }, + { + "epoch": 9.05, + "learning_rate": 6.968906720160481e-05, + "loss": 2.4036, + "theoretical_loss": 3.3343567405437087, + "tokens_seen": 2845580288 + }, + { + "epoch": 9.05, + "learning_rate": 6.967903711133399e-05, + "loss": 2.5171, + "theoretical_loss": 3.33435077269804, + "tokens_seen": 2845645824 + }, + { + "epoch": 9.05, + "learning_rate": 6.966900702106319e-05, + "loss": 2.3274, + "theoretical_loss": 3.3343448050282936, + "tokens_seen": 2845711360 + }, + { + "epoch": 9.05, + "learning_rate": 6.965897693079237e-05, + "loss": 2.5703, + "theoretical_loss": 3.33433883753446, + "tokens_seen": 2845776896 + }, + { + "epoch": 9.05, + "learning_rate": 6.964894684052156e-05, + "loss": 2.4829, + "theoretical_loss": 3.33433287021653, + "tokens_seen": 2845842432 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3122339, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2980520725250244, + "objective/train/theoretical_loss": 3.3343283948435136, + "objective/train/tokens_used": 2866351584, + "theoretical_loss": 3.3343283948435136, + "tokens_seen": 2845891584 + }, + { + "epoch": 9.05, + "learning_rate": 6.963891675025074e-05, + "loss": 2.2539, + "theoretical_loss": 3.334326903074494, + "tokens_seen": 2845907968 + }, + { + "epoch": 9.05, + "learning_rate": 6.962888665997995e-05, + "loss": 2.4441, + "theoretical_loss": 3.3343209361083432, + "tokens_seen": 2845973504 + }, + { + "epoch": 9.05, + "learning_rate": 6.961885656970913e-05, + "loss": 2.4975, + "theoretical_loss": 3.3343149693180685, + "tokens_seen": 2846039040 + }, + { + "epoch": 9.05, + "learning_rate": 6.960882647943833e-05, + "loss": 2.4827, + "theoretical_loss": 3.33430900270366, + "tokens_seen": 2846104576 + }, + { + "epoch": 9.05, + "learning_rate": 6.959879638916751e-05, + "loss": 2.4191, + "theoretical_loss": 3.3343030362651094, + "tokens_seen": 2846170112 + }, + { + "epoch": 9.05, + "learning_rate": 6.958876629889669e-05, + "loss": 2.5447, + "theoretical_loss": 3.334297070002407, + "tokens_seen": 2846235648 + }, + { + "epoch": 9.05, + "learning_rate": 6.957873620862589e-05, + "loss": 2.4339, + "theoretical_loss": 3.334291103915543, + "tokens_seen": 2846301184 + }, + { + "epoch": 9.05, + "learning_rate": 6.956870611835507e-05, + "loss": 2.665, + "theoretical_loss": 3.334285138004509, + "tokens_seen": 2846366720 + }, + { + "epoch": 9.05, + "learning_rate": 6.955867602808426e-05, + "loss": 2.3733, + "theoretical_loss": 3.3342791722692953, + "tokens_seen": 2846432256 + }, + { + "epoch": 9.05, + "learning_rate": 6.954864593781344e-05, + "loss": 2.4047, + "theoretical_loss": 3.3342732067098932, + "tokens_seen": 2846497792 + }, + { + "epoch": 9.05, + "learning_rate": 6.953861584754264e-05, + "loss": 2.6059, + "theoretical_loss": 3.334267241326293, + "tokens_seen": 2846563328 + }, + { + "epoch": 9.05, + "learning_rate": 6.952858575727182e-05, + "loss": 2.7371, + "theoretical_loss": 3.334261276118485, + "tokens_seen": 2846628864 + }, + { + "epoch": 9.05, + "learning_rate": 6.951855566700101e-05, + "loss": 2.4302, + "theoretical_loss": 3.3342553110864612, + "tokens_seen": 2846694400 + }, + { + "epoch": 9.05, + "learning_rate": 6.95085255767302e-05, + "loss": 2.5707, + "theoretical_loss": 3.3342493462302114, + "tokens_seen": 2846759936 + }, + { + "epoch": 9.05, + "learning_rate": 6.949849548645938e-05, + "loss": 2.5062, + "theoretical_loss": 3.3342433815497268, + "tokens_seen": 2846825472 + }, + { + "epoch": 9.05, + "learning_rate": 6.948846539618857e-05, + "loss": 2.6396, + "theoretical_loss": 3.334237417044998, + "tokens_seen": 2846891008 + }, + { + "epoch": 9.05, + "learning_rate": 6.947843530591775e-05, + "loss": 2.4697, + "theoretical_loss": 3.3342314527160157, + "tokens_seen": 2846956544 + }, + { + "epoch": 9.05, + "learning_rate": 6.946840521564695e-05, + "loss": 2.4715, + "theoretical_loss": 3.334225488562771, + "tokens_seen": 2847022080 + }, + { + "epoch": 9.05, + "learning_rate": 6.945837512537613e-05, + "loss": 2.5129, + "theoretical_loss": 3.334219524585255, + "tokens_seen": 2847087616 + }, + { + "epoch": 9.05, + "learning_rate": 6.944834503510532e-05, + "loss": 2.4431, + "theoretical_loss": 3.334213560783457, + "tokens_seen": 2847153152 + }, + { + "epoch": 9.05, + "learning_rate": 6.94383149448345e-05, + "loss": 2.5643, + "theoretical_loss": 3.3342075971573695, + "tokens_seen": 2847218688 + }, + { + "epoch": 9.05, + "learning_rate": 6.94282848545637e-05, + "loss": 2.4, + "theoretical_loss": 3.334201633706982, + "tokens_seen": 2847284224 + }, + { + "epoch": 9.05, + "learning_rate": 6.941825476429288e-05, + "loss": 2.4614, + "theoretical_loss": 3.334195670432287, + "tokens_seen": 2847349760 + }, + { + "epoch": 9.05, + "learning_rate": 6.940822467402207e-05, + "loss": 2.4409, + "theoretical_loss": 3.3341897073332727, + "tokens_seen": 2847415296 + }, + { + "epoch": 9.05, + "learning_rate": 6.939819458375125e-05, + "loss": 2.318, + "theoretical_loss": 3.334183744409932, + "tokens_seen": 2847480832 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3123600, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.336703062057495, + "objective/train/theoretical_loss": 3.334179272332706, + "objective/train/tokens_used": 2867989984, + "theoretical_loss": 3.334179272332706, + "tokens_seen": 2847529984 + }, + { + "epoch": 9.05, + "learning_rate": 6.938816449348044e-05, + "loss": 2.554, + "theoretical_loss": 3.3341777816622544, + "tokens_seen": 2847546368 + }, + { + "epoch": 9.05, + "learning_rate": 6.937813440320963e-05, + "loss": 2.5298, + "theoretical_loss": 3.3341718190902316, + "tokens_seen": 2847611904 + }, + { + "epoch": 9.05, + "learning_rate": 6.936810431293881e-05, + "loss": 2.4457, + "theoretical_loss": 3.334165856693854, + "tokens_seen": 2847677440 + }, + { + "epoch": 9.05, + "learning_rate": 6.9358074222668e-05, + "loss": 2.5899, + "theoretical_loss": 3.3341598944731126, + "tokens_seen": 2847742976 + }, + { + "epoch": 9.05, + "learning_rate": 6.934804413239719e-05, + "loss": 2.4573, + "theoretical_loss": 3.3341539324279976, + "tokens_seen": 2847808512 + }, + { + "epoch": 9.05, + "learning_rate": 6.933801404212638e-05, + "loss": 2.6027, + "theoretical_loss": 3.3341479705585004, + "tokens_seen": 2847874048 + }, + { + "epoch": 9.05, + "learning_rate": 6.932798395185556e-05, + "loss": 2.6306, + "theoretical_loss": 3.3341420088646117, + "tokens_seen": 2847939584 + }, + { + "epoch": 9.05, + "learning_rate": 6.931795386158476e-05, + "loss": 2.558, + "theoretical_loss": 3.334136047346322, + "tokens_seen": 2848005120 + }, + { + "epoch": 9.05, + "learning_rate": 6.930792377131394e-05, + "loss": 2.4538, + "theoretical_loss": 3.334130086003622, + "tokens_seen": 2848070656 + }, + { + "epoch": 9.05, + "learning_rate": 6.929789368104312e-05, + "loss": 2.6169, + "theoretical_loss": 3.334124124836503, + "tokens_seen": 2848136192 + }, + { + "epoch": 9.05, + "learning_rate": 6.928786359077231e-05, + "loss": 2.8541, + "theoretical_loss": 3.3341181638449555, + "tokens_seen": 2848201728 + }, + { + "epoch": 9.05, + "learning_rate": 6.92778335005015e-05, + "loss": 2.7151, + "theoretical_loss": 3.3341122030289707, + "tokens_seen": 2848267264 + }, + { + "epoch": 9.05, + "learning_rate": 6.926780341023069e-05, + "loss": 2.276, + "theoretical_loss": 3.3341062423885384, + "tokens_seen": 2848332800 + }, + { + "epoch": 9.05, + "learning_rate": 6.925777331995987e-05, + "loss": 2.7338, + "theoretical_loss": 3.33410028192365, + "tokens_seen": 2848398336 + }, + { + "epoch": 9.05, + "learning_rate": 6.924774322968908e-05, + "loss": 2.5919, + "theoretical_loss": 3.3340943216342964, + "tokens_seen": 2848463872 + }, + { + "epoch": 9.05, + "learning_rate": 6.923771313941826e-05, + "loss": 2.5803, + "theoretical_loss": 3.334088361520468, + "tokens_seen": 2848529408 + }, + { + "epoch": 9.05, + "learning_rate": 6.922768304914746e-05, + "loss": 2.2421, + "theoretical_loss": 3.3340824015821564, + "tokens_seen": 2848594944 + }, + { + "epoch": 9.05, + "learning_rate": 6.921765295887664e-05, + "loss": 2.592, + "theoretical_loss": 3.334076441819352, + "tokens_seen": 2848660480 + }, + { + "epoch": 9.05, + "learning_rate": 6.920762286860582e-05, + "loss": 2.5655, + "theoretical_loss": 3.3340704822320446, + "tokens_seen": 2848726016 + }, + { + "epoch": 9.05, + "learning_rate": 6.919759277833501e-05, + "loss": 2.8106, + "theoretical_loss": 3.3340645228202264, + "tokens_seen": 2848791552 + }, + { + "epoch": 9.05, + "learning_rate": 6.918756268806419e-05, + "loss": 2.6399, + "theoretical_loss": 3.3340585635838877, + "tokens_seen": 2848857088 + }, + { + "epoch": 9.05, + "learning_rate": 6.917753259779339e-05, + "loss": 2.5535, + "theoretical_loss": 3.334052604523019, + "tokens_seen": 2848922624 + }, + { + "epoch": 9.05, + "learning_rate": 6.916750250752257e-05, + "loss": 2.5134, + "theoretical_loss": 3.334046645637611, + "tokens_seen": 2848988160 + }, + { + "epoch": 9.05, + "learning_rate": 6.915747241725176e-05, + "loss": 2.3421, + "theoretical_loss": 3.3340406869276555, + "tokens_seen": 2849053696 + }, + { + "epoch": 9.05, + "learning_rate": 6.914744232698094e-05, + "loss": 2.5463, + "theoretical_loss": 3.334034728393142, + "tokens_seen": 2849119232 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3124335, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5036303997039795, + "objective/train/theoretical_loss": 3.334030259607386, + "objective/train/tokens_used": 2869628384, + "theoretical_loss": 3.334030259607386, + "tokens_seen": 2849168384 + }, + { + "epoch": 9.05, + "learning_rate": 6.913741223671014e-05, + "loss": 2.4691, + "theoretical_loss": 3.3340287700340623, + "tokens_seen": 2849184768 + }, + { + "epoch": 9.05, + "learning_rate": 6.912738214643932e-05, + "loss": 2.6984, + "theoretical_loss": 3.334022811850407, + "tokens_seen": 2849250304 + }, + { + "epoch": 9.05, + "learning_rate": 6.911735205616852e-05, + "loss": 2.4179, + "theoretical_loss": 3.3340168538421664, + "tokens_seen": 2849315840 + }, + { + "epoch": 9.05, + "learning_rate": 6.91073219658977e-05, + "loss": 2.4168, + "theoretical_loss": 3.3340108960093318, + "tokens_seen": 2849381376 + }, + { + "epoch": 9.05, + "learning_rate": 6.909729187562688e-05, + "loss": 2.6349, + "theoretical_loss": 3.3340049383518937, + "tokens_seen": 2849446912 + }, + { + "epoch": 9.05, + "learning_rate": 6.908726178535607e-05, + "loss": 2.452, + "theoretical_loss": 3.3339989808698434, + "tokens_seen": 2849512448 + }, + { + "epoch": 9.05, + "learning_rate": 6.907723169508525e-05, + "loss": 2.6119, + "theoretical_loss": 3.3339930235631705, + "tokens_seen": 2849577984 + }, + { + "epoch": 9.05, + "learning_rate": 6.906720160481445e-05, + "loss": 2.418, + "theoretical_loss": 3.3339870664318676, + "tokens_seen": 2849643520 + }, + { + "epoch": 9.05, + "learning_rate": 6.905717151454363e-05, + "loss": 2.7169, + "theoretical_loss": 3.333981109475924, + "tokens_seen": 2849709056 + }, + { + "epoch": 9.05, + "learning_rate": 6.904714142427282e-05, + "loss": 2.6007, + "theoretical_loss": 3.333975152695331, + "tokens_seen": 2849774592 + }, + { + "epoch": 9.05, + "learning_rate": 6.9037111334002e-05, + "loss": 2.4138, + "theoretical_loss": 3.33396919609008, + "tokens_seen": 2849840128 + }, + { + "epoch": 9.05, + "learning_rate": 6.90270812437312e-05, + "loss": 2.5448, + "theoretical_loss": 3.3339632396601604, + "tokens_seen": 2849905664 + }, + { + "epoch": 9.05, + "learning_rate": 6.901705115346038e-05, + "loss": 2.4245, + "theoretical_loss": 3.3339572834055646, + "tokens_seen": 2849971200 + }, + { + "epoch": 9.05, + "learning_rate": 6.900702106318956e-05, + "loss": 2.6705, + "theoretical_loss": 3.3339513273262824, + "tokens_seen": 2850036736 + }, + { + "epoch": 9.05, + "learning_rate": 6.899699097291876e-05, + "loss": 2.2287, + "theoretical_loss": 3.333945371422305, + "tokens_seen": 2850102272 + }, + { + "epoch": 9.05, + "learning_rate": 6.898696088264794e-05, + "loss": 2.3452, + "theoretical_loss": 3.3339394156936226, + "tokens_seen": 2850167808 + }, + { + "epoch": 9.05, + "learning_rate": 6.897693079237713e-05, + "loss": 2.4594, + "theoretical_loss": 3.3339334601402273, + "tokens_seen": 2850233344 + }, + { + "epoch": 9.05, + "learning_rate": 6.896690070210631e-05, + "loss": 2.4433, + "theoretical_loss": 3.3339275047621086, + "tokens_seen": 2850298880 + }, + { + "epoch": 9.05, + "learning_rate": 6.895687061183551e-05, + "loss": 2.2988, + "theoretical_loss": 3.333921549559258, + "tokens_seen": 2850364416 + }, + { + "epoch": 9.05, + "learning_rate": 6.894684052156469e-05, + "loss": 2.6014, + "theoretical_loss": 3.333915594531666, + "tokens_seen": 2850429952 + }, + { + "epoch": 9.05, + "learning_rate": 6.893681043129388e-05, + "loss": 2.5682, + "theoretical_loss": 3.3339096396793235, + "tokens_seen": 2850495488 + }, + { + "epoch": 9.05, + "learning_rate": 6.892678034102306e-05, + "loss": 2.5384, + "theoretical_loss": 3.3339036850022215, + "tokens_seen": 2850561024 + }, + { + "epoch": 9.05, + "learning_rate": 6.891675025075225e-05, + "loss": 2.4143, + "theoretical_loss": 3.3338977305003503, + "tokens_seen": 2850626560 + }, + { + "epoch": 9.05, + "learning_rate": 6.890672016048144e-05, + "loss": 2.5048, + "theoretical_loss": 3.3338917761737017, + "tokens_seen": 2850692096 + }, + { + "epoch": 9.05, + "learning_rate": 6.889669007021062e-05, + "loss": 2.4951, + "theoretical_loss": 3.333885822022266, + "tokens_seen": 2850757632 + }, + { + "epoch": 9.05, + "objective/train/docs_used": 3125555, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.345184326171875, + "objective/train/theoretical_loss": 3.3338813565236665, + "objective/train/tokens_used": 2871266784, + "theoretical_loss": 3.3338813565236665, + "tokens_seen": 2850806784 + }, + { + "epoch": 9.05, + "learning_rate": 6.888665997993982e-05, + "loss": 2.4705, + "theoretical_loss": 3.3338798680460333, + "tokens_seen": 2850823168 + }, + { + "epoch": 9.05, + "learning_rate": 6.887662988966901e-05, + "loss": 2.4939, + "theoretical_loss": 3.3338739142449954, + "tokens_seen": 2850888704 + }, + { + "epoch": 9.05, + "learning_rate": 6.88665997993982e-05, + "loss": 2.5153, + "theoretical_loss": 3.3338679606191426, + "tokens_seen": 2850954240 + }, + { + "epoch": 9.05, + "learning_rate": 6.885656970912739e-05, + "loss": 2.501, + "theoretical_loss": 3.3338620071684657, + "tokens_seen": 2851019776 + }, + { + "epoch": 9.05, + "learning_rate": 6.884653961885658e-05, + "loss": 2.5784, + "theoretical_loss": 3.3338560538929563, + "tokens_seen": 2851085312 + }, + { + "epoch": 9.05, + "learning_rate": 6.883650952858576e-05, + "loss": 2.4804, + "theoretical_loss": 3.3338501007926045, + "tokens_seen": 2851150848 + }, + { + "epoch": 9.05, + "learning_rate": 6.882647943831496e-05, + "loss": 2.5069, + "theoretical_loss": 3.333844147867401, + "tokens_seen": 2851216384 + }, + { + "epoch": 9.05, + "learning_rate": 6.881644934804414e-05, + "loss": 2.5071, + "theoretical_loss": 3.3338381951173366, + "tokens_seen": 2851281920 + }, + { + "epoch": 9.05, + "learning_rate": 6.880641925777332e-05, + "loss": 2.4296, + "theoretical_loss": 3.3338322425424027, + "tokens_seen": 2851347456 + }, + { + "epoch": 9.05, + "learning_rate": 6.879638916750251e-05, + "loss": 2.4729, + "theoretical_loss": 3.3338262901425897, + "tokens_seen": 2851412992 + }, + { + "epoch": 9.05, + "learning_rate": 6.87863590772317e-05, + "loss": 2.4877, + "theoretical_loss": 3.3338203379178886, + "tokens_seen": 2851478528 + }, + { + "epoch": 9.05, + "learning_rate": 6.877632898696089e-05, + "loss": 2.507, + "theoretical_loss": 3.3338143858682905, + "tokens_seen": 2851544064 + }, + { + "epoch": 9.05, + "learning_rate": 6.876629889669007e-05, + "loss": 2.2931, + "theoretical_loss": 3.3338084339937852, + "tokens_seen": 2851609600 + }, + { + "epoch": 9.05, + "learning_rate": 6.875626880641927e-05, + "loss": 2.5694, + "theoretical_loss": 3.333802482294365, + "tokens_seen": 2851675136 + }, + { + "epoch": 9.05, + "learning_rate": 6.874623871614845e-05, + "loss": 2.6808, + "theoretical_loss": 3.333796530770019, + "tokens_seen": 2851740672 + }, + { + "epoch": 9.05, + "learning_rate": 6.873620862587764e-05, + "loss": 2.5263, + "theoretical_loss": 3.3337905794207394, + "tokens_seen": 2851806208 + }, + { + "epoch": 9.05, + "learning_rate": 6.872617853560682e-05, + "loss": 2.5938, + "theoretical_loss": 3.3337846282465167, + "tokens_seen": 2851871744 + }, + { + "epoch": 9.05, + "learning_rate": 6.8716148445336e-05, + "loss": 2.4617, + "theoretical_loss": 3.3337786772473414, + "tokens_seen": 2851937280 + }, + { + "epoch": 9.05, + "learning_rate": 6.87061183550652e-05, + "loss": 2.5219, + "theoretical_loss": 3.333772726423205, + "tokens_seen": 2852002816 + }, + { + "epoch": 9.05, + "learning_rate": 6.869608826479438e-05, + "loss": 2.5063, + "theoretical_loss": 3.333766775774097, + "tokens_seen": 2852068352 + }, + { + "epoch": 9.05, + "learning_rate": 6.868605817452357e-05, + "loss": 2.4076, + "theoretical_loss": 3.33376082530001, + "tokens_seen": 2852133888 + }, + { + "epoch": 9.05, + "learning_rate": 6.867602808425276e-05, + "loss": 2.5417, + "theoretical_loss": 3.333754875000934, + "tokens_seen": 2852199424 + }, + { + "epoch": 9.05, + "learning_rate": 6.866599799398195e-05, + "loss": 2.4632, + "theoretical_loss": 3.3337489248768586, + "tokens_seen": 2852264960 + }, + { + "epoch": 9.05, + "learning_rate": 6.865596790371113e-05, + "loss": 2.4324, + "theoretical_loss": 3.3337429749277767, + "tokens_seen": 2852330496 + }, + { + "epoch": 9.06, + "learning_rate": 6.864593781344033e-05, + "loss": 2.3288, + "theoretical_loss": 3.3337370251536784, + "tokens_seen": 2852396032 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3126270, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7467286586761475, + "objective/train/theoretical_loss": 3.3337325629379313, + "objective/train/tokens_used": 2872905184, + "theoretical_loss": 3.3337325629379313, + "tokens_seen": 2852445184 + }, + { + "epoch": 9.06, + "learning_rate": 6.863590772316951e-05, + "loss": 2.6581, + "theoretical_loss": 3.333731075554554, + "tokens_seen": 2852461568 + }, + { + "epoch": 9.06, + "learning_rate": 6.862587763289869e-05, + "loss": 2.4176, + "theoretical_loss": 3.3337251261303944, + "tokens_seen": 2852527104 + }, + { + "epoch": 9.06, + "learning_rate": 6.861584754262788e-05, + "loss": 2.5473, + "theoretical_loss": 3.333719176881191, + "tokens_seen": 2852592640 + }, + { + "epoch": 9.06, + "learning_rate": 6.860581745235706e-05, + "loss": 2.4497, + "theoretical_loss": 3.3337132278069346, + "tokens_seen": 2852658176 + }, + { + "epoch": 9.06, + "learning_rate": 6.859578736208626e-05, + "loss": 2.3774, + "theoretical_loss": 3.3337072789076156, + "tokens_seen": 2852723712 + }, + { + "epoch": 9.06, + "learning_rate": 6.858575727181544e-05, + "loss": 2.5611, + "theoretical_loss": 3.333701330183225, + "tokens_seen": 2852789248 + }, + { + "epoch": 9.06, + "learning_rate": 6.857572718154463e-05, + "loss": 2.3091, + "theoretical_loss": 3.3336953816337536, + "tokens_seen": 2852854784 + }, + { + "epoch": 9.06, + "learning_rate": 6.856569709127382e-05, + "loss": 2.4156, + "theoretical_loss": 3.3336894332591926, + "tokens_seen": 2852920320 + }, + { + "epoch": 9.06, + "learning_rate": 6.855566700100301e-05, + "loss": 2.8666, + "theoretical_loss": 3.3336834850595323, + "tokens_seen": 2852985856 + }, + { + "epoch": 9.06, + "learning_rate": 6.854563691073219e-05, + "loss": 2.3223, + "theoretical_loss": 3.333677537034764, + "tokens_seen": 2853051392 + }, + { + "epoch": 9.06, + "learning_rate": 6.853560682046139e-05, + "loss": 2.3363, + "theoretical_loss": 3.333671589184878, + "tokens_seen": 2853116928 + }, + { + "epoch": 9.06, + "learning_rate": 6.852557673019057e-05, + "loss": 2.5387, + "theoretical_loss": 3.3336656415098656, + "tokens_seen": 2853182464 + }, + { + "epoch": 9.06, + "learning_rate": 6.851554663991975e-05, + "loss": 2.2602, + "theoretical_loss": 3.3336596940097176, + "tokens_seen": 2853248000 + }, + { + "epoch": 9.06, + "learning_rate": 6.850551654964896e-05, + "loss": 2.6177, + "theoretical_loss": 3.333653746684425, + "tokens_seen": 2853313536 + }, + { + "epoch": 9.06, + "learning_rate": 6.849548645937814e-05, + "loss": 2.6236, + "theoretical_loss": 3.333647799533978, + "tokens_seen": 2853379072 + }, + { + "epoch": 9.06, + "learning_rate": 6.848545636910733e-05, + "loss": 2.6675, + "theoretical_loss": 3.333641852558368, + "tokens_seen": 2853444608 + }, + { + "epoch": 9.06, + "learning_rate": 6.847542627883651e-05, + "loss": 2.5198, + "theoretical_loss": 3.333635905757586, + "tokens_seen": 2853510144 + }, + { + "epoch": 9.06, + "learning_rate": 6.846539618856571e-05, + "loss": 2.418, + "theoretical_loss": 3.3336299591316223, + "tokens_seen": 2853575680 + }, + { + "epoch": 9.06, + "learning_rate": 6.845536609829489e-05, + "loss": 2.573, + "theoretical_loss": 3.3336240126804677, + "tokens_seen": 2853641216 + }, + { + "epoch": 9.06, + "learning_rate": 6.844533600802408e-05, + "loss": 2.45, + "theoretical_loss": 3.3336180664041137, + "tokens_seen": 2853706752 + }, + { + "epoch": 9.06, + "learning_rate": 6.843530591775327e-05, + "loss": 2.606, + "theoretical_loss": 3.3336121203025506, + "tokens_seen": 2853772288 + }, + { + "epoch": 9.06, + "learning_rate": 6.842527582748245e-05, + "loss": 2.4482, + "theoretical_loss": 3.3336061743757694, + "tokens_seen": 2853837824 + }, + { + "epoch": 9.06, + "learning_rate": 6.841524573721164e-05, + "loss": 2.5252, + "theoretical_loss": 3.333600228623761, + "tokens_seen": 2853903360 + }, + { + "epoch": 9.06, + "learning_rate": 6.840521564694082e-05, + "loss": 2.6504, + "theoretical_loss": 3.333594283046516, + "tokens_seen": 2853968896 + }, + { + "epoch": 9.06, + "learning_rate": 6.839518555667002e-05, + "loss": 2.3391, + "theoretical_loss": 3.3335883376440263, + "tokens_seen": 2854034432 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3127355, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.068079710006714, + "objective/train/theoretical_loss": 3.3335838787068353, + "objective/train/tokens_used": 2874543584, + "theoretical_loss": 3.3335838787068353, + "tokens_seen": 2854083584 + }, + { + "epoch": 9.06, + "learning_rate": 6.83851554663992e-05, + "loss": 2.4132, + "theoretical_loss": 3.333582392416281, + "tokens_seen": 2854099968 + }, + { + "epoch": 9.06, + "learning_rate": 6.837512537612839e-05, + "loss": 2.6333, + "theoretical_loss": 3.3335764473632725, + "tokens_seen": 2854165504 + }, + { + "epoch": 9.06, + "learning_rate": 6.836509528585757e-05, + "loss": 2.506, + "theoretical_loss": 3.3335705024849904, + "tokens_seen": 2854231040 + }, + { + "epoch": 9.06, + "learning_rate": 6.835506519558677e-05, + "loss": 2.4315, + "theoretical_loss": 3.333564557781427, + "tokens_seen": 2854296576 + }, + { + "epoch": 9.06, + "learning_rate": 6.834503510531595e-05, + "loss": 2.4952, + "theoretical_loss": 3.333558613252572, + "tokens_seen": 2854362112 + }, + { + "epoch": 9.06, + "learning_rate": 6.833500501504514e-05, + "loss": 2.5511, + "theoretical_loss": 3.3335526688984163, + "tokens_seen": 2854427648 + }, + { + "epoch": 9.06, + "learning_rate": 6.832497492477433e-05, + "loss": 2.627, + "theoretical_loss": 3.333546724718951, + "tokens_seen": 2854493184 + }, + { + "epoch": 9.06, + "learning_rate": 6.83149448345035e-05, + "loss": 2.4496, + "theoretical_loss": 3.3335407807141673, + "tokens_seen": 2854558720 + }, + { + "epoch": 9.06, + "learning_rate": 6.83049147442327e-05, + "loss": 2.602, + "theoretical_loss": 3.333534836884056, + "tokens_seen": 2854624256 + }, + { + "epoch": 9.06, + "learning_rate": 6.829488465396188e-05, + "loss": 2.4458, + "theoretical_loss": 3.3335288932286073, + "tokens_seen": 2854689792 + }, + { + "epoch": 9.06, + "learning_rate": 6.828485456369108e-05, + "loss": 2.4877, + "theoretical_loss": 3.3335229497478127, + "tokens_seen": 2854755328 + }, + { + "epoch": 9.06, + "learning_rate": 6.827482447342026e-05, + "loss": 2.5011, + "theoretical_loss": 3.333517006441663, + "tokens_seen": 2854820864 + }, + { + "epoch": 9.06, + "learning_rate": 6.826479438314945e-05, + "loss": 2.427, + "theoretical_loss": 3.3335110633101483, + "tokens_seen": 2854886400 + }, + { + "epoch": 9.06, + "learning_rate": 6.825476429287863e-05, + "loss": 2.3785, + "theoretical_loss": 3.3335051203532604, + "tokens_seen": 2854951936 + }, + { + "epoch": 9.06, + "learning_rate": 6.824473420260783e-05, + "loss": 2.5949, + "theoretical_loss": 3.3334991775709897, + "tokens_seen": 2855017472 + }, + { + "epoch": 9.06, + "learning_rate": 6.823470411233701e-05, + "loss": 2.3144, + "theoretical_loss": 3.3334932349633273, + "tokens_seen": 2855083008 + }, + { + "epoch": 9.06, + "learning_rate": 6.822467402206619e-05, + "loss": 2.5453, + "theoretical_loss": 3.333487292530264, + "tokens_seen": 2855148544 + }, + { + "epoch": 9.06, + "learning_rate": 6.821464393179539e-05, + "loss": 2.6195, + "theoretical_loss": 3.33348135027179, + "tokens_seen": 2855214080 + }, + { + "epoch": 9.06, + "learning_rate": 6.820461384152457e-05, + "loss": 2.6285, + "theoretical_loss": 3.3334754081878977, + "tokens_seen": 2855279616 + }, + { + "epoch": 9.06, + "learning_rate": 6.819458375125376e-05, + "loss": 2.5403, + "theoretical_loss": 3.3334694662785767, + "tokens_seen": 2855345152 + }, + { + "epoch": 9.06, + "learning_rate": 6.818455366098294e-05, + "loss": 2.2033, + "theoretical_loss": 3.333463524543818, + "tokens_seen": 2855410688 + }, + { + "epoch": 9.06, + "learning_rate": 6.817452357071214e-05, + "loss": 2.608, + "theoretical_loss": 3.3334575829836126, + "tokens_seen": 2855476224 + }, + { + "epoch": 9.06, + "learning_rate": 6.816449348044132e-05, + "loss": 2.3024, + "theoretical_loss": 3.333451641597952, + "tokens_seen": 2855541760 + }, + { + "epoch": 9.06, + "learning_rate": 6.815446339017051e-05, + "loss": 2.4655, + "theoretical_loss": 3.3334457003868256, + "tokens_seen": 2855607296 + }, + { + "epoch": 9.06, + "learning_rate": 6.81444332998997e-05, + "loss": 2.4184, + "theoretical_loss": 3.3334397593502256, + "tokens_seen": 2855672832 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3128119, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.611710548400879, + "objective/train/theoretical_loss": 3.3334353036873026, + "objective/train/tokens_used": 2876181984, + "theoretical_loss": 3.3334353036873026, + "tokens_seen": 2855721984 + }, + { + "epoch": 9.06, + "learning_rate": 6.813440320962887e-05, + "loss": 2.5344, + "theoretical_loss": 3.3334338184881425, + "tokens_seen": 2855738368 + }, + { + "epoch": 9.06, + "learning_rate": 6.812437311935808e-05, + "loss": 2.3691, + "theoretical_loss": 3.333427877800567, + "tokens_seen": 2855803904 + }, + { + "epoch": 9.06, + "learning_rate": 6.811434302908726e-05, + "loss": 2.5387, + "theoretical_loss": 3.33342193728749, + "tokens_seen": 2855869440 + }, + { + "epoch": 9.06, + "learning_rate": 6.810431293881646e-05, + "loss": 2.5228, + "theoretical_loss": 3.3334159969489026, + "tokens_seen": 2855934976 + }, + { + "epoch": 9.06, + "learning_rate": 6.809428284854564e-05, + "loss": 2.4064, + "theoretical_loss": 3.3334100567847953, + "tokens_seen": 2856000512 + }, + { + "epoch": 9.06, + "learning_rate": 6.808425275827483e-05, + "loss": 2.4085, + "theoretical_loss": 3.333404116795159, + "tokens_seen": 2856066048 + }, + { + "epoch": 9.06, + "learning_rate": 6.807422266800402e-05, + "loss": 2.7246, + "theoretical_loss": 3.333398176979985, + "tokens_seen": 2856131584 + }, + { + "epoch": 9.06, + "learning_rate": 6.806419257773321e-05, + "loss": 2.4456, + "theoretical_loss": 3.333392237339264, + "tokens_seen": 2856197120 + }, + { + "epoch": 9.06, + "learning_rate": 6.805416248746239e-05, + "loss": 2.6483, + "theoretical_loss": 3.333386297872987, + "tokens_seen": 2856262656 + }, + { + "epoch": 9.06, + "learning_rate": 6.804413239719159e-05, + "loss": 2.5908, + "theoretical_loss": 3.3333803585811443, + "tokens_seen": 2856328192 + }, + { + "epoch": 9.06, + "learning_rate": 6.803410230692077e-05, + "loss": 2.5531, + "theoretical_loss": 3.3333744194637274, + "tokens_seen": 2856393728 + }, + { + "epoch": 9.06, + "learning_rate": 6.802407221664995e-05, + "loss": 2.3757, + "theoretical_loss": 3.3333684805207264, + "tokens_seen": 2856459264 + }, + { + "epoch": 9.06, + "learning_rate": 6.801404212637914e-05, + "loss": 2.3152, + "theoretical_loss": 3.333362541752133, + "tokens_seen": 2856524800 + }, + { + "epoch": 9.06, + "learning_rate": 6.800401203610832e-05, + "loss": 2.4897, + "theoretical_loss": 3.333356603157938, + "tokens_seen": 2856590336 + }, + { + "epoch": 9.06, + "learning_rate": 6.799398194583752e-05, + "loss": 2.5406, + "theoretical_loss": 3.3333506647381324, + "tokens_seen": 2856655872 + }, + { + "epoch": 9.06, + "learning_rate": 6.79839518555667e-05, + "loss": 2.4855, + "theoretical_loss": 3.333344726492706, + "tokens_seen": 2856721408 + }, + { + "epoch": 9.06, + "learning_rate": 6.79739217652959e-05, + "loss": 2.6684, + "theoretical_loss": 3.3333387884216505, + "tokens_seen": 2856786944 + }, + { + "epoch": 9.06, + "learning_rate": 6.796389167502508e-05, + "loss": 2.5807, + "theoretical_loss": 3.3333328505249566, + "tokens_seen": 2856852480 + }, + { + "epoch": 9.06, + "learning_rate": 6.795386158475427e-05, + "loss": 2.5407, + "theoretical_loss": 3.3333269128026157, + "tokens_seen": 2856918016 + }, + { + "epoch": 9.06, + "learning_rate": 6.794383149448345e-05, + "loss": 2.4283, + "theoretical_loss": 3.333320975254618, + "tokens_seen": 2856983552 + }, + { + "epoch": 9.06, + "learning_rate": 6.793380140421263e-05, + "loss": 2.5575, + "theoretical_loss": 3.3333150378809546, + "tokens_seen": 2857049088 + }, + { + "epoch": 9.06, + "learning_rate": 6.792377131394183e-05, + "loss": 2.5777, + "theoretical_loss": 3.3333091006816167, + "tokens_seen": 2857114624 + }, + { + "epoch": 9.06, + "learning_rate": 6.791374122367101e-05, + "loss": 2.4471, + "theoretical_loss": 3.3333031636565944, + "tokens_seen": 2857180160 + }, + { + "epoch": 9.06, + "learning_rate": 6.79037111334002e-05, + "loss": 2.3267, + "theoretical_loss": 3.3332972268058794, + "tokens_seen": 2857245696 + }, + { + "epoch": 9.06, + "learning_rate": 6.789368104312938e-05, + "loss": 2.5967, + "theoretical_loss": 3.3332912901294622, + "tokens_seen": 2857311232 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3132157, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2394497394561768, + "objective/train/theoretical_loss": 3.333286837736527, + "objective/train/tokens_used": 2877820384, + "theoretical_loss": 3.333286837736527, + "tokens_seen": 2857360384 + }, + { + "epoch": 9.06, + "learning_rate": 6.788365095285858e-05, + "loss": 2.2583, + "theoretical_loss": 3.3332853536273337, + "tokens_seen": 2857376768 + }, + { + "epoch": 9.06, + "learning_rate": 6.787362086258776e-05, + "loss": 2.5216, + "theoretical_loss": 3.333279417299485, + "tokens_seen": 2857442304 + }, + { + "epoch": 9.06, + "learning_rate": 6.786359077231695e-05, + "loss": 2.6943, + "theoretical_loss": 3.333273481145907, + "tokens_seen": 2857507840 + }, + { + "epoch": 9.06, + "learning_rate": 6.785356068204614e-05, + "loss": 2.5525, + "theoretical_loss": 3.33326754516659, + "tokens_seen": 2857573376 + }, + { + "epoch": 9.06, + "learning_rate": 6.784353059177532e-05, + "loss": 2.4234, + "theoretical_loss": 3.333261609361525, + "tokens_seen": 2857638912 + }, + { + "epoch": 9.06, + "learning_rate": 6.783350050150451e-05, + "loss": 2.4454, + "theoretical_loss": 3.333255673730704, + "tokens_seen": 2857704448 + }, + { + "epoch": 9.06, + "learning_rate": 6.782347041123369e-05, + "loss": 2.5654, + "theoretical_loss": 3.3332497382741164, + "tokens_seen": 2857769984 + }, + { + "epoch": 9.06, + "learning_rate": 6.781344032096289e-05, + "loss": 2.5573, + "theoretical_loss": 3.3332438029917544, + "tokens_seen": 2857835520 + }, + { + "epoch": 9.06, + "learning_rate": 6.780341023069207e-05, + "loss": 2.4256, + "theoretical_loss": 3.3332378678836077, + "tokens_seen": 2857901056 + }, + { + "epoch": 9.06, + "learning_rate": 6.779338014042126e-05, + "loss": 2.4517, + "theoretical_loss": 3.333231932949668, + "tokens_seen": 2857966592 + }, + { + "epoch": 9.06, + "learning_rate": 6.778335005015044e-05, + "loss": 2.5645, + "theoretical_loss": 3.333225998189926, + "tokens_seen": 2858032128 + }, + { + "epoch": 9.06, + "learning_rate": 6.777331995987964e-05, + "loss": 2.2893, + "theoretical_loss": 3.3332200636043723, + "tokens_seen": 2858097664 + }, + { + "epoch": 9.06, + "learning_rate": 6.776328986960882e-05, + "loss": 2.5464, + "theoretical_loss": 3.3332141291929984, + "tokens_seen": 2858163200 + }, + { + "epoch": 9.06, + "learning_rate": 6.775325977933803e-05, + "loss": 2.4848, + "theoretical_loss": 3.3332081949557946, + "tokens_seen": 2858228736 + }, + { + "epoch": 9.06, + "learning_rate": 6.774322968906721e-05, + "loss": 2.492, + "theoretical_loss": 3.333202260892752, + "tokens_seen": 2858294272 + }, + { + "epoch": 9.06, + "learning_rate": 6.773319959879639e-05, + "loss": 2.7494, + "theoretical_loss": 3.3331963270038614, + "tokens_seen": 2858359808 + }, + { + "epoch": 9.06, + "learning_rate": 6.772316950852559e-05, + "loss": 2.573, + "theoretical_loss": 3.333190393289114, + "tokens_seen": 2858425344 + }, + { + "epoch": 9.06, + "learning_rate": 6.771313941825477e-05, + "loss": 2.1454, + "theoretical_loss": 3.3331844597485003, + "tokens_seen": 2858490880 + }, + { + "epoch": 9.06, + "learning_rate": 6.770310932798396e-05, + "loss": 2.5408, + "theoretical_loss": 3.3331785263820115, + "tokens_seen": 2858556416 + }, + { + "epoch": 9.06, + "learning_rate": 6.769307923771314e-05, + "loss": 2.5224, + "theoretical_loss": 3.333172593189638, + "tokens_seen": 2858621952 + }, + { + "epoch": 9.06, + "learning_rate": 6.768304914744234e-05, + "loss": 2.4533, + "theoretical_loss": 3.3331666601713716, + "tokens_seen": 2858687488 + }, + { + "epoch": 9.06, + "learning_rate": 6.767301905717152e-05, + "loss": 2.4133, + "theoretical_loss": 3.333160727327203, + "tokens_seen": 2858753024 + }, + { + "epoch": 9.06, + "learning_rate": 6.766298896690071e-05, + "loss": 2.5077, + "theoretical_loss": 3.3331547946571223, + "tokens_seen": 2858818560 + }, + { + "epoch": 9.06, + "learning_rate": 6.76529588766299e-05, + "loss": 2.6148, + "theoretical_loss": 3.3331488621611207, + "tokens_seen": 2858884096 + }, + { + "epoch": 9.06, + "learning_rate": 6.764292878635908e-05, + "loss": 2.4195, + "theoretical_loss": 3.33314292983919, + "tokens_seen": 2858949632 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3137113, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2119882106781006, + "objective/train/theoretical_loss": 3.3331384807119697, + "objective/train/tokens_used": 2879458784, + "theoretical_loss": 3.3331384807119697, + "tokens_seen": 2858998784 + }, + { + "epoch": 9.06, + "learning_rate": 6.763289869608827e-05, + "loss": 2.4812, + "theoretical_loss": 3.33313699769132, + "tokens_seen": 2859015168 + }, + { + "epoch": 9.06, + "learning_rate": 6.762286860581745e-05, + "loss": 2.3022, + "theoretical_loss": 3.333131065717502, + "tokens_seen": 2859080704 + }, + { + "epoch": 9.06, + "learning_rate": 6.761283851554665e-05, + "loss": 2.2923, + "theoretical_loss": 3.333125133917727, + "tokens_seen": 2859146240 + }, + { + "epoch": 9.06, + "learning_rate": 6.760280842527583e-05, + "loss": 2.7675, + "theoretical_loss": 3.3331192022919858, + "tokens_seen": 2859211776 + }, + { + "epoch": 9.06, + "learning_rate": 6.759277833500502e-05, + "loss": 2.6004, + "theoretical_loss": 3.3331132708402693, + "tokens_seen": 2859277312 + }, + { + "epoch": 9.06, + "learning_rate": 6.75827482447342e-05, + "loss": 2.4464, + "theoretical_loss": 3.3331073395625683, + "tokens_seen": 2859342848 + }, + { + "epoch": 9.06, + "learning_rate": 6.75727181544634e-05, + "loss": 2.5301, + "theoretical_loss": 3.333101408458874, + "tokens_seen": 2859408384 + }, + { + "epoch": 9.06, + "learning_rate": 6.756268806419258e-05, + "loss": 2.4387, + "theoretical_loss": 3.3330954775291772, + "tokens_seen": 2859473920 + }, + { + "epoch": 9.06, + "learning_rate": 6.755265797392177e-05, + "loss": 2.5108, + "theoretical_loss": 3.3330895467734685, + "tokens_seen": 2859539456 + }, + { + "epoch": 9.06, + "learning_rate": 6.754262788365095e-05, + "loss": 2.4655, + "theoretical_loss": 3.333083616191739, + "tokens_seen": 2859604992 + }, + { + "epoch": 9.06, + "learning_rate": 6.753259779338014e-05, + "loss": 2.4918, + "theoretical_loss": 3.33307768578398, + "tokens_seen": 2859670528 + }, + { + "epoch": 9.06, + "learning_rate": 6.752256770310933e-05, + "loss": 2.5753, + "theoretical_loss": 3.333071755550182, + "tokens_seen": 2859736064 + }, + { + "epoch": 9.06, + "learning_rate": 6.751253761283851e-05, + "loss": 2.4908, + "theoretical_loss": 3.333065825490336, + "tokens_seen": 2859801600 + }, + { + "epoch": 9.06, + "learning_rate": 6.75025075225677e-05, + "loss": 2.4651, + "theoretical_loss": 3.333059895604433, + "tokens_seen": 2859867136 + }, + { + "epoch": 9.06, + "learning_rate": 6.749247743229689e-05, + "loss": 2.5839, + "theoretical_loss": 3.3330539658924634, + "tokens_seen": 2859932672 + }, + { + "epoch": 9.06, + "learning_rate": 6.748244734202608e-05, + "loss": 2.5681, + "theoretical_loss": 3.333048036354419, + "tokens_seen": 2859998208 + }, + { + "epoch": 9.06, + "learning_rate": 6.747241725175526e-05, + "loss": 2.3637, + "theoretical_loss": 3.3330421069902894, + "tokens_seen": 2860063744 + }, + { + "epoch": 9.06, + "learning_rate": 6.746238716148446e-05, + "loss": 2.5068, + "theoretical_loss": 3.333036177800067, + "tokens_seen": 2860129280 + }, + { + "epoch": 9.06, + "learning_rate": 6.745235707121364e-05, + "loss": 2.6546, + "theoretical_loss": 3.333030248783742, + "tokens_seen": 2860194816 + }, + { + "epoch": 9.06, + "learning_rate": 6.744232698094282e-05, + "loss": 2.4288, + "theoretical_loss": 3.3330243199413054, + "tokens_seen": 2860260352 + }, + { + "epoch": 9.06, + "learning_rate": 6.743229689067201e-05, + "loss": 2.6828, + "theoretical_loss": 3.333018391272748, + "tokens_seen": 2860325888 + }, + { + "epoch": 9.06, + "learning_rate": 6.74222668004012e-05, + "loss": 2.478, + "theoretical_loss": 3.3330124627780613, + "tokens_seen": 2860391424 + }, + { + "epoch": 9.06, + "learning_rate": 6.741223671013039e-05, + "loss": 2.319, + "theoretical_loss": 3.333006534457235, + "tokens_seen": 2860456960 + }, + { + "epoch": 9.06, + "learning_rate": 6.740220661985957e-05, + "loss": 2.4792, + "theoretical_loss": 3.3330006063102613, + "tokens_seen": 2860522496 + }, + { + "epoch": 9.06, + "learning_rate": 6.739217652958877e-05, + "loss": 2.3884, + "theoretical_loss": 3.3329946783371303, + "tokens_seen": 2860588032 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3142005, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6601314544677734, + "objective/train/theoretical_loss": 3.332990232471361, + "objective/train/tokens_used": 2881097184, + "theoretical_loss": 3.332990232471361, + "tokens_seen": 2860637184 + }, + { + "epoch": 9.06, + "learning_rate": 6.738214643931795e-05, + "loss": 2.5702, + "theoretical_loss": 3.332988750537833, + "tokens_seen": 2860653568 + }, + { + "epoch": 9.06, + "learning_rate": 6.737211634904716e-05, + "loss": 2.5787, + "theoretical_loss": 3.332982822912361, + "tokens_seen": 2860719104 + }, + { + "epoch": 9.06, + "learning_rate": 6.736208625877634e-05, + "loss": 2.4182, + "theoretical_loss": 3.332976895460704, + "tokens_seen": 2860784640 + }, + { + "epoch": 9.06, + "learning_rate": 6.735205616850552e-05, + "loss": 2.3265, + "theoretical_loss": 3.3329709681828543, + "tokens_seen": 2860850176 + }, + { + "epoch": 9.06, + "learning_rate": 6.734202607823471e-05, + "loss": 2.4604, + "theoretical_loss": 3.3329650410788023, + "tokens_seen": 2860915712 + }, + { + "epoch": 9.06, + "learning_rate": 6.733199598796389e-05, + "loss": 2.5948, + "theoretical_loss": 3.3329591141485384, + "tokens_seen": 2860981248 + }, + { + "epoch": 9.06, + "learning_rate": 6.732196589769309e-05, + "loss": 2.3667, + "theoretical_loss": 3.3329531873920537, + "tokens_seen": 2861046784 + }, + { + "epoch": 9.06, + "learning_rate": 6.731193580742227e-05, + "loss": 2.3541, + "theoretical_loss": 3.33294726080934, + "tokens_seen": 2861112320 + }, + { + "epoch": 9.06, + "learning_rate": 6.730190571715146e-05, + "loss": 2.4925, + "theoretical_loss": 3.3329413344003873, + "tokens_seen": 2861177856 + }, + { + "epoch": 9.06, + "learning_rate": 6.729187562688064e-05, + "loss": 2.7094, + "theoretical_loss": 3.3329354081651865, + "tokens_seen": 2861243392 + }, + { + "epoch": 9.06, + "learning_rate": 6.728184553660984e-05, + "loss": 2.6538, + "theoretical_loss": 3.332929482103729, + "tokens_seen": 2861308928 + }, + { + "epoch": 9.06, + "learning_rate": 6.727181544633902e-05, + "loss": 2.3126, + "theoretical_loss": 3.332923556216006, + "tokens_seen": 2861374464 + }, + { + "epoch": 9.06, + "learning_rate": 6.726178535606822e-05, + "loss": 2.429, + "theoretical_loss": 3.332917630502007, + "tokens_seen": 2861440000 + }, + { + "epoch": 9.06, + "learning_rate": 6.72517552657974e-05, + "loss": 2.4755, + "theoretical_loss": 3.3329117049617247, + "tokens_seen": 2861505536 + }, + { + "epoch": 9.06, + "learning_rate": 6.724172517552658e-05, + "loss": 2.5942, + "theoretical_loss": 3.3329057795951487, + "tokens_seen": 2861571072 + }, + { + "epoch": 9.06, + "learning_rate": 6.723169508525577e-05, + "loss": 2.5126, + "theoretical_loss": 3.332899854402271, + "tokens_seen": 2861636608 + }, + { + "epoch": 9.06, + "learning_rate": 6.722166499498495e-05, + "loss": 2.5825, + "theoretical_loss": 3.3328939293830815, + "tokens_seen": 2861702144 + }, + { + "epoch": 9.06, + "learning_rate": 6.721163490471415e-05, + "loss": 2.5213, + "theoretical_loss": 3.3328880045375717, + "tokens_seen": 2861767680 + }, + { + "epoch": 9.06, + "learning_rate": 6.720160481444333e-05, + "loss": 2.4334, + "theoretical_loss": 3.332882079865733, + "tokens_seen": 2861833216 + }, + { + "epoch": 9.06, + "learning_rate": 6.719157472417252e-05, + "loss": 2.6621, + "theoretical_loss": 3.332876155367555, + "tokens_seen": 2861898752 + }, + { + "epoch": 9.06, + "learning_rate": 6.71815446339017e-05, + "loss": 2.5232, + "theoretical_loss": 3.33287023104303, + "tokens_seen": 2861964288 + }, + { + "epoch": 9.06, + "learning_rate": 6.71715145436309e-05, + "loss": 2.6383, + "theoretical_loss": 3.332864306892148, + "tokens_seen": 2862029824 + }, + { + "epoch": 9.06, + "learning_rate": 6.716148445336008e-05, + "loss": 2.6445, + "theoretical_loss": 3.3328583829149006, + "tokens_seen": 2862095360 + }, + { + "epoch": 9.06, + "learning_rate": 6.715145436308926e-05, + "loss": 2.763, + "theoretical_loss": 3.3328524591112783, + "tokens_seen": 2862160896 + }, + { + "epoch": 9.06, + "learning_rate": 6.714142427281846e-05, + "loss": 2.5569, + "theoretical_loss": 3.332846535481272, + "tokens_seen": 2862226432 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3147060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.084378242492676, + "objective/train/theoretical_loss": 3.332842092872698, + "objective/train/tokens_used": 2882735584, + "theoretical_loss": 3.332842092872698, + "tokens_seen": 2862275584 + }, + { + "epoch": 9.06, + "learning_rate": 6.713139418254764e-05, + "loss": 2.4888, + "theoretical_loss": 3.332840612024873, + "tokens_seen": 2862291968 + }, + { + "epoch": 9.06, + "learning_rate": 6.712136409227683e-05, + "loss": 2.4615, + "theoretical_loss": 3.3328346887420723, + "tokens_seen": 2862357504 + }, + { + "epoch": 9.06, + "learning_rate": 6.711133400200601e-05, + "loss": 2.2559, + "theoretical_loss": 3.3328287656328603, + "tokens_seen": 2862423040 + }, + { + "epoch": 9.06, + "learning_rate": 6.710130391173521e-05, + "loss": 2.3213, + "theoretical_loss": 3.3328228426972277, + "tokens_seen": 2862488576 + }, + { + "epoch": 9.06, + "learning_rate": 6.709127382146439e-05, + "loss": 2.4881, + "theoretical_loss": 3.3328169199351665, + "tokens_seen": 2862554112 + }, + { + "epoch": 9.06, + "learning_rate": 6.708124373119358e-05, + "loss": 2.4275, + "theoretical_loss": 3.332810997346667, + "tokens_seen": 2862619648 + }, + { + "epoch": 9.06, + "learning_rate": 6.707121364092276e-05, + "loss": 2.5605, + "theoretical_loss": 3.3328050749317204, + "tokens_seen": 2862685184 + }, + { + "epoch": 9.06, + "learning_rate": 6.706118355065195e-05, + "loss": 2.4149, + "theoretical_loss": 3.3327991526903173, + "tokens_seen": 2862750720 + }, + { + "epoch": 9.06, + "learning_rate": 6.705115346038114e-05, + "loss": 2.5469, + "theoretical_loss": 3.332793230622449, + "tokens_seen": 2862816256 + }, + { + "epoch": 9.06, + "learning_rate": 6.704112337011032e-05, + "loss": 2.7063, + "theoretical_loss": 3.332787308728106, + "tokens_seen": 2862881792 + }, + { + "epoch": 9.06, + "learning_rate": 6.703109327983952e-05, + "loss": 2.593, + "theoretical_loss": 3.3327813870072793, + "tokens_seen": 2862947328 + }, + { + "epoch": 9.06, + "learning_rate": 6.70210631895687e-05, + "loss": 2.4969, + "theoretical_loss": 3.33277546545996, + "tokens_seen": 2863012864 + }, + { + "epoch": 9.06, + "learning_rate": 6.701103309929789e-05, + "loss": 2.4576, + "theoretical_loss": 3.3327695440861396, + "tokens_seen": 2863078400 + }, + { + "epoch": 9.06, + "learning_rate": 6.700100300902709e-05, + "loss": 2.4056, + "theoretical_loss": 3.3327636228858086, + "tokens_seen": 2863143936 + }, + { + "epoch": 9.06, + "learning_rate": 6.699097291875628e-05, + "loss": 2.5119, + "theoretical_loss": 3.3327577018589576, + "tokens_seen": 2863209472 + }, + { + "epoch": 9.06, + "learning_rate": 6.698094282848546e-05, + "loss": 2.7036, + "theoretical_loss": 3.3327517810055776, + "tokens_seen": 2863275008 + }, + { + "epoch": 9.06, + "learning_rate": 6.697091273821466e-05, + "loss": 2.4854, + "theoretical_loss": 3.33274586032566, + "tokens_seen": 2863340544 + }, + { + "epoch": 9.06, + "learning_rate": 6.696088264794384e-05, + "loss": 2.457, + "theoretical_loss": 3.3327399398191955, + "tokens_seen": 2863406080 + }, + { + "epoch": 9.06, + "learning_rate": 6.695085255767302e-05, + "loss": 2.5288, + "theoretical_loss": 3.332734019486175, + "tokens_seen": 2863471616 + }, + { + "epoch": 9.06, + "learning_rate": 6.694082246740221e-05, + "loss": 2.5955, + "theoretical_loss": 3.33272809932659, + "tokens_seen": 2863537152 + }, + { + "epoch": 9.06, + "learning_rate": 6.69307923771314e-05, + "loss": 2.4453, + "theoretical_loss": 3.3327221793404305, + "tokens_seen": 2863602688 + }, + { + "epoch": 9.06, + "learning_rate": 6.692076228686059e-05, + "loss": 2.3644, + "theoretical_loss": 3.332716259527688, + "tokens_seen": 2863668224 + }, + { + "epoch": 9.06, + "learning_rate": 6.691073219658977e-05, + "loss": 2.5976, + "theoretical_loss": 3.332710339888353, + "tokens_seen": 2863733760 + }, + { + "epoch": 9.06, + "learning_rate": 6.690070210631897e-05, + "loss": 2.5624, + "theoretical_loss": 3.3327044204224174, + "tokens_seen": 2863799296 + }, + { + "epoch": 9.06, + "learning_rate": 6.689067201604815e-05, + "loss": 2.6193, + "theoretical_loss": 3.3326985011298715, + "tokens_seen": 2863864832 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3152143, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8077213764190674, + "objective/train/theoretical_loss": 3.3326940617742435, + "objective/train/tokens_used": 2884373984, + "theoretical_loss": 3.3326940617742435, + "tokens_seen": 2863913984 + }, + { + "epoch": 9.06, + "learning_rate": 6.688064192577734e-05, + "loss": 2.8222, + "theoretical_loss": 3.332692582010706, + "tokens_seen": 2863930368 + }, + { + "epoch": 9.06, + "learning_rate": 6.687061183550652e-05, + "loss": 2.6721, + "theoretical_loss": 3.332686663064912, + "tokens_seen": 2863995904 + }, + { + "epoch": 9.06, + "learning_rate": 6.68605817452357e-05, + "loss": 2.4493, + "theoretical_loss": 3.3326807442924813, + "tokens_seen": 2864061440 + }, + { + "epoch": 9.06, + "learning_rate": 6.68505516549649e-05, + "loss": 2.2301, + "theoretical_loss": 3.3326748256934042, + "tokens_seen": 2864126976 + }, + { + "epoch": 9.06, + "learning_rate": 6.684052156469408e-05, + "loss": 2.5563, + "theoretical_loss": 3.332668907267671, + "tokens_seen": 2864192512 + }, + { + "epoch": 9.06, + "learning_rate": 6.683049147442327e-05, + "loss": 2.3402, + "theoretical_loss": 3.332662989015274, + "tokens_seen": 2864258048 + }, + { + "epoch": 9.06, + "learning_rate": 6.682046138415246e-05, + "loss": 2.4769, + "theoretical_loss": 3.332657070936203, + "tokens_seen": 2864323584 + }, + { + "epoch": 9.06, + "learning_rate": 6.681043129388165e-05, + "loss": 2.5793, + "theoretical_loss": 3.3326511530304495, + "tokens_seen": 2864389120 + }, + { + "epoch": 9.06, + "learning_rate": 6.680040120361083e-05, + "loss": 2.4926, + "theoretical_loss": 3.3326452352980045, + "tokens_seen": 2864454656 + }, + { + "epoch": 9.06, + "learning_rate": 6.679037111334003e-05, + "loss": 2.5124, + "theoretical_loss": 3.3326393177388587, + "tokens_seen": 2864520192 + }, + { + "epoch": 9.06, + "learning_rate": 6.678034102306921e-05, + "loss": 2.5227, + "theoretical_loss": 3.3326334003530036, + "tokens_seen": 2864585728 + }, + { + "epoch": 9.06, + "learning_rate": 6.677031093279839e-05, + "loss": 2.4948, + "theoretical_loss": 3.3326274831404294, + "tokens_seen": 2864651264 + }, + { + "epoch": 9.06, + "learning_rate": 6.676028084252758e-05, + "loss": 2.4942, + "theoretical_loss": 3.3326215661011274, + "tokens_seen": 2864716800 + }, + { + "epoch": 9.06, + "learning_rate": 6.675025075225676e-05, + "loss": 2.479, + "theoretical_loss": 3.3326156492350885, + "tokens_seen": 2864782336 + }, + { + "epoch": 9.06, + "learning_rate": 6.674022066198596e-05, + "loss": 2.4767, + "theoretical_loss": 3.332609732542304, + "tokens_seen": 2864847872 + }, + { + "epoch": 9.06, + "learning_rate": 6.673019057171514e-05, + "loss": 2.657, + "theoretical_loss": 3.3326038160227647, + "tokens_seen": 2864913408 + }, + { + "epoch": 9.06, + "learning_rate": 6.672016048144433e-05, + "loss": 2.459, + "theoretical_loss": 3.332597899676461, + "tokens_seen": 2864978944 + }, + { + "epoch": 9.06, + "learning_rate": 6.671013039117352e-05, + "loss": 2.6108, + "theoretical_loss": 3.332591983503385, + "tokens_seen": 2865044480 + }, + { + "epoch": 9.06, + "learning_rate": 6.670010030090271e-05, + "loss": 2.6327, + "theoretical_loss": 3.3325860675035264, + "tokens_seen": 2865110016 + }, + { + "epoch": 9.06, + "learning_rate": 6.669007021063189e-05, + "loss": 2.5517, + "theoretical_loss": 3.3325801516768774, + "tokens_seen": 2865175552 + }, + { + "epoch": 9.06, + "learning_rate": 6.668004012036109e-05, + "loss": 2.4616, + "theoretical_loss": 3.3325742360234276, + "tokens_seen": 2865241088 + }, + { + "epoch": 9.06, + "learning_rate": 6.667001003009027e-05, + "loss": 2.2434, + "theoretical_loss": 3.332568320543169, + "tokens_seen": 2865306624 + }, + { + "epoch": 9.06, + "learning_rate": 6.665997993981945e-05, + "loss": 2.5207, + "theoretical_loss": 3.332562405236093, + "tokens_seen": 2865372160 + }, + { + "epoch": 9.06, + "learning_rate": 6.664994984954864e-05, + "loss": 2.3578, + "theoretical_loss": 3.332556490102189, + "tokens_seen": 2865437696 + }, + { + "epoch": 9.06, + "learning_rate": 6.663991975927782e-05, + "loss": 2.4631, + "theoretical_loss": 3.332550575141449, + "tokens_seen": 2865503232 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3155084, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4753198623657227, + "objective/train/theoretical_loss": 3.3325461390345272, + "objective/train/tokens_used": 2886012384, + "theoretical_loss": 3.3325461390345272, + "tokens_seen": 2865552384 + }, + { + "epoch": 9.06, + "learning_rate": 6.662988966900702e-05, + "loss": 2.4282, + "theoretical_loss": 3.3325446603538635, + "tokens_seen": 2865568768 + }, + { + "epoch": 9.06, + "learning_rate": 6.661985957873621e-05, + "loss": 2.4072, + "theoretical_loss": 3.3325387457394244, + "tokens_seen": 2865634304 + }, + { + "epoch": 9.06, + "learning_rate": 6.660982948846541e-05, + "loss": 2.5356, + "theoretical_loss": 3.3325328312981215, + "tokens_seen": 2865699840 + }, + { + "epoch": 9.06, + "learning_rate": 6.659979939819459e-05, + "loss": 2.4144, + "theoretical_loss": 3.3325269170299467, + "tokens_seen": 2865765376 + }, + { + "epoch": 9.06, + "learning_rate": 6.658976930792378e-05, + "loss": 2.4587, + "theoretical_loss": 3.3325210029348904, + "tokens_seen": 2865830912 + }, + { + "epoch": 9.06, + "learning_rate": 6.657973921765296e-05, + "loss": 2.5245, + "theoretical_loss": 3.332515089012944, + "tokens_seen": 2865896448 + }, + { + "epoch": 9.06, + "learning_rate": 6.656970912738215e-05, + "loss": 2.4431, + "theoretical_loss": 3.332509175264098, + "tokens_seen": 2865961984 + }, + { + "epoch": 9.06, + "learning_rate": 6.655967903711134e-05, + "loss": 2.5193, + "theoretical_loss": 3.3325032616883434, + "tokens_seen": 2866027520 + }, + { + "epoch": 9.06, + "learning_rate": 6.654964894684052e-05, + "loss": 2.5198, + "theoretical_loss": 3.3324973482856715, + "tokens_seen": 2866093056 + }, + { + "epoch": 9.06, + "learning_rate": 6.653961885656972e-05, + "loss": 2.303, + "theoretical_loss": 3.332491435056073, + "tokens_seen": 2866158592 + }, + { + "epoch": 9.06, + "learning_rate": 6.65295887662989e-05, + "loss": 2.416, + "theoretical_loss": 3.3324855219995397, + "tokens_seen": 2866224128 + }, + { + "epoch": 9.06, + "learning_rate": 6.651955867602809e-05, + "loss": 2.4879, + "theoretical_loss": 3.3324796091160613, + "tokens_seen": 2866289664 + }, + { + "epoch": 9.06, + "learning_rate": 6.650952858575727e-05, + "loss": 2.4726, + "theoretical_loss": 3.3324736964056294, + "tokens_seen": 2866355200 + }, + { + "epoch": 9.06, + "learning_rate": 6.649949849548647e-05, + "loss": 2.4955, + "theoretical_loss": 3.3324677838682355, + "tokens_seen": 2866420736 + }, + { + "epoch": 9.06, + "learning_rate": 6.648946840521565e-05, + "loss": 2.665, + "theoretical_loss": 3.3324618715038694, + "tokens_seen": 2866486272 + }, + { + "epoch": 9.06, + "learning_rate": 6.647943831494484e-05, + "loss": 2.6182, + "theoretical_loss": 3.332455959312523, + "tokens_seen": 2866551808 + }, + { + "epoch": 9.06, + "learning_rate": 6.646940822467403e-05, + "loss": 2.6111, + "theoretical_loss": 3.332450047294187, + "tokens_seen": 2866617344 + }, + { + "epoch": 9.06, + "learning_rate": 6.64593781344032e-05, + "loss": 2.521, + "theoretical_loss": 3.3324441354488523, + "tokens_seen": 2866682880 + }, + { + "epoch": 9.06, + "learning_rate": 6.64493480441324e-05, + "loss": 2.6337, + "theoretical_loss": 3.33243822377651, + "tokens_seen": 2866748416 + }, + { + "epoch": 9.06, + "learning_rate": 6.643931795386158e-05, + "loss": 2.5176, + "theoretical_loss": 3.3324323122771515, + "tokens_seen": 2866813952 + }, + { + "epoch": 9.06, + "learning_rate": 6.642928786359078e-05, + "loss": 2.6229, + "theoretical_loss": 3.3324264009507667, + "tokens_seen": 2866879488 + }, + { + "epoch": 9.06, + "learning_rate": 6.641925777331996e-05, + "loss": 2.4876, + "theoretical_loss": 3.3324204897973475, + "tokens_seen": 2866945024 + }, + { + "epoch": 9.06, + "learning_rate": 6.640922768304915e-05, + "loss": 2.4092, + "theoretical_loss": 3.3324145788168846, + "tokens_seen": 2867010560 + }, + { + "epoch": 9.06, + "learning_rate": 6.639919759277833e-05, + "loss": 2.6421, + "theoretical_loss": 3.3324086680093687, + "tokens_seen": 2867076096 + }, + { + "epoch": 9.06, + "learning_rate": 6.638916750250753e-05, + "loss": 2.533, + "theoretical_loss": 3.3324027573747914, + "tokens_seen": 2867141632 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3155765, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.516918897628784, + "objective/train/theoretical_loss": 3.332398324512344, + "objective/train/tokens_used": 2887650784, + "theoretical_loss": 3.332398324512344, + "tokens_seen": 2867190784 + }, + { + "epoch": 9.06, + "learning_rate": 6.637913741223671e-05, + "loss": 2.5648, + "theoretical_loss": 3.3323968469131438, + "tokens_seen": 2867207168 + }, + { + "epoch": 9.06, + "learning_rate": 6.636910732196589e-05, + "loss": 2.4542, + "theoretical_loss": 3.332390936624416, + "tokens_seen": 2867272704 + }, + { + "epoch": 9.06, + "learning_rate": 6.635907723169509e-05, + "loss": 2.4724, + "theoretical_loss": 3.3323850265085992, + "tokens_seen": 2867338240 + }, + { + "epoch": 9.06, + "learning_rate": 6.634904714142427e-05, + "loss": 2.4243, + "theoretical_loss": 3.3323791165656846, + "tokens_seen": 2867403776 + }, + { + "epoch": 9.06, + "learning_rate": 6.633901705115346e-05, + "loss": 2.5752, + "theoretical_loss": 3.3323732067956637, + "tokens_seen": 2867469312 + }, + { + "epoch": 9.06, + "learning_rate": 6.632898696088264e-05, + "loss": 2.5192, + "theoretical_loss": 3.3323672971985268, + "tokens_seen": 2867534848 + }, + { + "epoch": 9.06, + "learning_rate": 6.631895687061184e-05, + "loss": 2.6973, + "theoretical_loss": 3.3323613877742653, + "tokens_seen": 2867600384 + }, + { + "epoch": 9.06, + "learning_rate": 6.630892678034102e-05, + "loss": 2.3265, + "theoretical_loss": 3.33235547852287, + "tokens_seen": 2867665920 + }, + { + "epoch": 9.06, + "learning_rate": 6.629889669007021e-05, + "loss": 2.5246, + "theoretical_loss": 3.3323495694443315, + "tokens_seen": 2867731456 + }, + { + "epoch": 9.06, + "learning_rate": 6.62888665997994e-05, + "loss": 2.5364, + "theoretical_loss": 3.3323436605386414, + "tokens_seen": 2867796992 + }, + { + "epoch": 9.06, + "learning_rate": 6.627883650952857e-05, + "loss": 2.5202, + "theoretical_loss": 3.332337751805791, + "tokens_seen": 2867862528 + }, + { + "epoch": 9.06, + "learning_rate": 6.626880641925777e-05, + "loss": 2.2394, + "theoretical_loss": 3.33233184324577, + "tokens_seen": 2867928064 + }, + { + "epoch": 9.06, + "learning_rate": 6.625877632898695e-05, + "loss": 2.4426, + "theoretical_loss": 3.3323259348585705, + "tokens_seen": 2867993600 + }, + { + "epoch": 9.06, + "learning_rate": 6.624874623871616e-05, + "loss": 2.4274, + "theoretical_loss": 3.332320026644183, + "tokens_seen": 2868059136 + }, + { + "epoch": 9.06, + "learning_rate": 6.623871614844534e-05, + "loss": 2.5463, + "theoretical_loss": 3.332314118602599, + "tokens_seen": 2868124672 + }, + { + "epoch": 9.06, + "learning_rate": 6.622868605817453e-05, + "loss": 2.5708, + "theoretical_loss": 3.332308210733809, + "tokens_seen": 2868190208 + }, + { + "epoch": 9.06, + "learning_rate": 6.621865596790372e-05, + "loss": 2.4985, + "theoretical_loss": 3.332302303037804, + "tokens_seen": 2868255744 + }, + { + "epoch": 9.06, + "learning_rate": 6.620862587763291e-05, + "loss": 2.3956, + "theoretical_loss": 3.3322963955145752, + "tokens_seen": 2868321280 + }, + { + "epoch": 9.06, + "learning_rate": 6.619859578736209e-05, + "loss": 2.4385, + "theoretical_loss": 3.3322904881641136, + "tokens_seen": 2868386816 + }, + { + "epoch": 9.06, + "learning_rate": 6.618856569709129e-05, + "loss": 2.5918, + "theoretical_loss": 3.33228458098641, + "tokens_seen": 2868452352 + }, + { + "epoch": 9.06, + "learning_rate": 6.617853560682047e-05, + "loss": 2.5443, + "theoretical_loss": 3.332278673981456, + "tokens_seen": 2868517888 + }, + { + "epoch": 9.06, + "learning_rate": 6.616850551654965e-05, + "loss": 2.2584, + "theoretical_loss": 3.3322727671492416, + "tokens_seen": 2868583424 + }, + { + "epoch": 9.06, + "learning_rate": 6.615847542627884e-05, + "loss": 2.3988, + "theoretical_loss": 3.3322668604897587, + "tokens_seen": 2868648960 + }, + { + "epoch": 9.06, + "learning_rate": 6.614844533600802e-05, + "loss": 2.5244, + "theoretical_loss": 3.332260954002998, + "tokens_seen": 2868714496 + }, + { + "epoch": 9.06, + "learning_rate": 6.613841524573722e-05, + "loss": 2.5164, + "theoretical_loss": 3.3322550476889505, + "tokens_seen": 2868780032 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3156418, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6902074813842773, + "objective/train/theoretical_loss": 3.3322506180667526, + "objective/train/tokens_used": 2889289184, + "theoretical_loss": 3.3322506180667526, + "tokens_seen": 2868829184 + }, + { + "epoch": 9.06, + "learning_rate": 6.61283851554664e-05, + "loss": 2.716, + "theoretical_loss": 3.3322491415476074, + "tokens_seen": 2868845568 + }, + { + "epoch": 9.06, + "learning_rate": 6.61183550651956e-05, + "loss": 2.6165, + "theoretical_loss": 3.332243235578959, + "tokens_seen": 2868911104 + }, + { + "epoch": 9.06, + "learning_rate": 6.610832497492478e-05, + "loss": 2.6926, + "theoretical_loss": 3.332237329782997, + "tokens_seen": 2868976640 + }, + { + "epoch": 9.06, + "learning_rate": 6.609829488465397e-05, + "loss": 2.4864, + "theoretical_loss": 3.332231424159712, + "tokens_seen": 2869042176 + }, + { + "epoch": 9.06, + "learning_rate": 6.608826479438315e-05, + "loss": 2.4133, + "theoretical_loss": 3.3322255187090954, + "tokens_seen": 2869107712 + }, + { + "epoch": 9.06, + "learning_rate": 6.607823470411233e-05, + "loss": 2.6438, + "theoretical_loss": 3.332219613431138, + "tokens_seen": 2869173248 + }, + { + "epoch": 9.06, + "learning_rate": 6.606820461384153e-05, + "loss": 2.3934, + "theoretical_loss": 3.3322137083258307, + "tokens_seen": 2869238784 + }, + { + "epoch": 9.06, + "learning_rate": 6.605817452357071e-05, + "loss": 2.4803, + "theoretical_loss": 3.332207803393165, + "tokens_seen": 2869304320 + }, + { + "epoch": 9.06, + "learning_rate": 6.60481444332999e-05, + "loss": 2.5877, + "theoretical_loss": 3.332201898633131, + "tokens_seen": 2869369856 + }, + { + "epoch": 9.06, + "learning_rate": 6.603811434302908e-05, + "loss": 2.5397, + "theoretical_loss": 3.3321959940457204, + "tokens_seen": 2869435392 + }, + { + "epoch": 9.06, + "learning_rate": 6.602808425275828e-05, + "loss": 2.5144, + "theoretical_loss": 3.3321900896309242, + "tokens_seen": 2869500928 + }, + { + "epoch": 9.06, + "learning_rate": 6.601805416248746e-05, + "loss": 2.3375, + "theoretical_loss": 3.332184185388733, + "tokens_seen": 2869566464 + }, + { + "epoch": 9.06, + "learning_rate": 6.600802407221665e-05, + "loss": 2.3468, + "theoretical_loss": 3.3321782813191385, + "tokens_seen": 2869632000 + }, + { + "epoch": 9.06, + "learning_rate": 6.599799398194584e-05, + "loss": 2.5754, + "theoretical_loss": 3.332172377422131, + "tokens_seen": 2869697536 + }, + { + "epoch": 9.06, + "learning_rate": 6.598796389167502e-05, + "loss": 2.5621, + "theoretical_loss": 3.332166473697702, + "tokens_seen": 2869763072 + }, + { + "epoch": 9.06, + "learning_rate": 6.597793380140421e-05, + "loss": 2.5392, + "theoretical_loss": 3.332160570145842, + "tokens_seen": 2869828608 + }, + { + "epoch": 9.06, + "learning_rate": 6.596790371113339e-05, + "loss": 2.6597, + "theoretical_loss": 3.3321546667665425, + "tokens_seen": 2869894144 + }, + { + "epoch": 9.06, + "learning_rate": 6.595787362086259e-05, + "loss": 2.4456, + "theoretical_loss": 3.332148763559794, + "tokens_seen": 2869959680 + }, + { + "epoch": 9.06, + "learning_rate": 6.594784353059177e-05, + "loss": 2.4729, + "theoretical_loss": 3.3321428605255887, + "tokens_seen": 2870025216 + }, + { + "epoch": 9.06, + "learning_rate": 6.593781344032096e-05, + "loss": 2.3923, + "theoretical_loss": 3.3321369576639164, + "tokens_seen": 2870090752 + }, + { + "epoch": 9.06, + "learning_rate": 6.592778335005014e-05, + "loss": 2.5375, + "theoretical_loss": 3.3321310549747682, + "tokens_seen": 2870156288 + }, + { + "epoch": 9.06, + "learning_rate": 6.591775325977934e-05, + "loss": 2.5318, + "theoretical_loss": 3.332125152458136, + "tokens_seen": 2870221824 + }, + { + "epoch": 9.06, + "learning_rate": 6.590772316950852e-05, + "loss": 2.4205, + "theoretical_loss": 3.3321192501140096, + "tokens_seen": 2870287360 + }, + { + "epoch": 9.06, + "learning_rate": 6.589769307923771e-05, + "loss": 2.4232, + "theoretical_loss": 3.3321133479423812, + "tokens_seen": 2870352896 + }, + { + "epoch": 9.06, + "learning_rate": 6.58876629889669e-05, + "loss": 2.5827, + "theoretical_loss": 3.332107445943241, + "tokens_seen": 2870418432 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3158008, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7475223541259766, + "objective/train/theoretical_loss": 3.332103019557076, + "objective/train/tokens_used": 2890927584, + "theoretical_loss": 3.332103019557076, + "tokens_seen": 2870467584 + }, + { + "epoch": 9.06, + "learning_rate": 6.587763289869609e-05, + "loss": 2.4819, + "theoretical_loss": 3.33210154411658, + "tokens_seen": 2870483968 + }, + { + "epoch": 9.06, + "learning_rate": 6.586760280842529e-05, + "loss": 2.5383, + "theoretical_loss": 3.3320956424623898, + "tokens_seen": 2870549504 + }, + { + "epoch": 9.06, + "learning_rate": 6.585757271815447e-05, + "loss": 2.324, + "theoretical_loss": 3.3320897409806616, + "tokens_seen": 2870615040 + }, + { + "epoch": 9.06, + "learning_rate": 6.584754262788366e-05, + "loss": 2.583, + "theoretical_loss": 3.3320838396713857, + "tokens_seen": 2870680576 + }, + { + "epoch": 9.06, + "learning_rate": 6.583751253761284e-05, + "loss": 2.4099, + "theoretical_loss": 3.332077938534553, + "tokens_seen": 2870746112 + }, + { + "epoch": 9.06, + "learning_rate": 6.582748244734204e-05, + "loss": 2.3563, + "theoretical_loss": 3.332072037570155, + "tokens_seen": 2870811648 + }, + { + "epoch": 9.06, + "learning_rate": 6.581745235707122e-05, + "loss": 2.454, + "theoretical_loss": 3.3320661367781828, + "tokens_seen": 2870877184 + }, + { + "epoch": 9.06, + "learning_rate": 6.580742226680041e-05, + "loss": 2.4977, + "theoretical_loss": 3.332060236158627, + "tokens_seen": 2870942720 + }, + { + "epoch": 9.06, + "learning_rate": 6.57973921765296e-05, + "loss": 2.5304, + "theoretical_loss": 3.3320543357114794, + "tokens_seen": 2871008256 + }, + { + "epoch": 9.06, + "learning_rate": 6.578736208625877e-05, + "loss": 2.4688, + "theoretical_loss": 3.3320484354367306, + "tokens_seen": 2871073792 + }, + { + "epoch": 9.06, + "learning_rate": 6.577733199598797e-05, + "loss": 2.0925, + "theoretical_loss": 3.332042535334371, + "tokens_seen": 2871139328 + }, + { + "epoch": 9.06, + "learning_rate": 6.576730190571715e-05, + "loss": 2.4728, + "theoretical_loss": 3.3320366354043927, + "tokens_seen": 2871204864 + }, + { + "epoch": 9.06, + "learning_rate": 6.575727181544635e-05, + "loss": 2.4925, + "theoretical_loss": 3.3320307356467858, + "tokens_seen": 2871270400 + }, + { + "epoch": 9.06, + "learning_rate": 6.574724172517553e-05, + "loss": 2.4684, + "theoretical_loss": 3.332024836061542, + "tokens_seen": 2871335936 + }, + { + "epoch": 9.06, + "learning_rate": 6.573721163490472e-05, + "loss": 2.3979, + "theoretical_loss": 3.3320189366486517, + "tokens_seen": 2871401472 + }, + { + "epoch": 9.06, + "learning_rate": 6.57271815446339e-05, + "loss": 2.4599, + "theoretical_loss": 3.3320130374081067, + "tokens_seen": 2871467008 + }, + { + "epoch": 9.06, + "learning_rate": 6.57171514543631e-05, + "loss": 2.4855, + "theoretical_loss": 3.3320071383398973, + "tokens_seen": 2871532544 + }, + { + "epoch": 9.06, + "learning_rate": 6.570712136409228e-05, + "loss": 2.473, + "theoretical_loss": 3.332001239444015, + "tokens_seen": 2871598080 + }, + { + "epoch": 9.06, + "learning_rate": 6.569709127382147e-05, + "loss": 2.4407, + "theoretical_loss": 3.3319953407204506, + "tokens_seen": 2871663616 + }, + { + "epoch": 9.06, + "learning_rate": 6.568706118355065e-05, + "loss": 2.3886, + "theoretical_loss": 3.3319894421691956, + "tokens_seen": 2871729152 + }, + { + "epoch": 9.06, + "learning_rate": 6.567703109327984e-05, + "loss": 2.562, + "theoretical_loss": 3.3319835437902405, + "tokens_seen": 2871794688 + }, + { + "epoch": 9.06, + "learning_rate": 6.566700100300903e-05, + "loss": 2.4828, + "theoretical_loss": 3.331977645583576, + "tokens_seen": 2871860224 + }, + { + "epoch": 9.06, + "learning_rate": 6.565697091273821e-05, + "loss": 2.6112, + "theoretical_loss": 3.3319717475491943, + "tokens_seen": 2871925760 + }, + { + "epoch": 9.06, + "learning_rate": 6.56469408224674e-05, + "loss": 2.3374, + "theoretical_loss": 3.3319658496870854, + "tokens_seen": 2871991296 + }, + { + "epoch": 9.06, + "learning_rate": 6.563691073219659e-05, + "loss": 2.4156, + "theoretical_loss": 3.331959951997241, + "tokens_seen": 2872056832 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3158755, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3322813510894775, + "objective/train/theoretical_loss": 3.3319555288429004, + "objective/train/tokens_used": 2892565984, + "theoretical_loss": 3.3319555288429004, + "tokens_seen": 2872105984 + }, + { + "epoch": 9.06, + "learning_rate": 6.562688064192578e-05, + "loss": 2.4209, + "theoretical_loss": 3.3319540544796515, + "tokens_seen": 2872122368 + }, + { + "epoch": 9.06, + "learning_rate": 6.561685055165496e-05, + "loss": 2.4444, + "theoretical_loss": 3.3319481571343084, + "tokens_seen": 2872187904 + }, + { + "epoch": 9.06, + "learning_rate": 6.560682046138416e-05, + "loss": 2.4408, + "theoretical_loss": 3.3319422599612025, + "tokens_seen": 2872253440 + }, + { + "epoch": 9.06, + "learning_rate": 6.559679037111334e-05, + "loss": 2.5797, + "theoretical_loss": 3.3319363629603256, + "tokens_seen": 2872318976 + }, + { + "epoch": 9.06, + "learning_rate": 6.558676028084252e-05, + "loss": 2.3991, + "theoretical_loss": 3.3319304661316673, + "tokens_seen": 2872384512 + }, + { + "epoch": 9.06, + "learning_rate": 6.557673019057171e-05, + "loss": 2.5284, + "theoretical_loss": 3.3319245694752198, + "tokens_seen": 2872450048 + }, + { + "epoch": 9.06, + "learning_rate": 6.55667001003009e-05, + "loss": 2.4352, + "theoretical_loss": 3.331918672990974, + "tokens_seen": 2872515584 + }, + { + "epoch": 9.06, + "learning_rate": 6.555667001003009e-05, + "loss": 2.5254, + "theoretical_loss": 3.3319127766789203, + "tokens_seen": 2872581120 + }, + { + "epoch": 9.06, + "learning_rate": 6.554663991975927e-05, + "loss": 2.5239, + "theoretical_loss": 3.3319068805390506, + "tokens_seen": 2872646656 + }, + { + "epoch": 9.06, + "learning_rate": 6.553660982948847e-05, + "loss": 2.652, + "theoretical_loss": 3.331900984571355, + "tokens_seen": 2872712192 + }, + { + "epoch": 9.06, + "learning_rate": 6.552657973921765e-05, + "loss": 2.6851, + "theoretical_loss": 3.3318950887758256, + "tokens_seen": 2872777728 + }, + { + "epoch": 9.06, + "learning_rate": 6.551654964894684e-05, + "loss": 2.3715, + "theoretical_loss": 3.3318891931524526, + "tokens_seen": 2872843264 + }, + { + "epoch": 9.06, + "learning_rate": 6.550651955867602e-05, + "loss": 2.5542, + "theoretical_loss": 3.3318832977012276, + "tokens_seen": 2872908800 + }, + { + "epoch": 9.06, + "learning_rate": 6.549648946840522e-05, + "loss": 2.4889, + "theoretical_loss": 3.3318774024221414, + "tokens_seen": 2872974336 + }, + { + "epoch": 9.06, + "learning_rate": 6.548645937813441e-05, + "loss": 2.5007, + "theoretical_loss": 3.3318715073151846, + "tokens_seen": 2873039872 + }, + { + "epoch": 9.06, + "learning_rate": 6.547642928786359e-05, + "loss": 2.6971, + "theoretical_loss": 3.3318656123803487, + "tokens_seen": 2873105408 + }, + { + "epoch": 9.06, + "learning_rate": 6.546639919759279e-05, + "loss": 2.6553, + "theoretical_loss": 3.3318597176176254, + "tokens_seen": 2873170944 + }, + { + "epoch": 9.06, + "learning_rate": 6.545636910732197e-05, + "loss": 2.3546, + "theoretical_loss": 3.3318538230270045, + "tokens_seen": 2873236480 + }, + { + "epoch": 9.06, + "learning_rate": 6.544633901705116e-05, + "loss": 2.4249, + "theoretical_loss": 3.331847928608478, + "tokens_seen": 2873302016 + }, + { + "epoch": 9.06, + "learning_rate": 6.543630892678034e-05, + "loss": 2.2473, + "theoretical_loss": 3.3318420343620363, + "tokens_seen": 2873367552 + }, + { + "epoch": 9.06, + "learning_rate": 6.542627883650954e-05, + "loss": 2.5999, + "theoretical_loss": 3.331836140287671, + "tokens_seen": 2873433088 + }, + { + "epoch": 9.06, + "learning_rate": 6.541624874623872e-05, + "loss": 2.4504, + "theoretical_loss": 3.3318302463853726, + "tokens_seen": 2873498624 + }, + { + "epoch": 9.06, + "learning_rate": 6.540621865596791e-05, + "loss": 2.3901, + "theoretical_loss": 3.3318243526551328, + "tokens_seen": 2873564160 + }, + { + "epoch": 9.06, + "learning_rate": 6.53961885656971e-05, + "loss": 2.634, + "theoretical_loss": 3.331818459096942, + "tokens_seen": 2873629696 + }, + { + "epoch": 9.06, + "learning_rate": 6.538615847542628e-05, + "loss": 2.5559, + "theoretical_loss": 3.3318125657107918, + "tokens_seen": 2873695232 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3160138, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6905925273895264, + "objective/train/theoretical_loss": 3.331808145784075, + "objective/train/tokens_used": 2894204384, + "theoretical_loss": 3.331808145784075, + "tokens_seen": 2873744384 + }, + { + "epoch": 9.06, + "learning_rate": 6.537612838515547e-05, + "loss": 2.5407, + "theoretical_loss": 3.331806672496673, + "tokens_seen": 2873760768 + }, + { + "epoch": 9.06, + "learning_rate": 6.536609829488465e-05, + "loss": 2.5148, + "theoretical_loss": 3.3318007794545763, + "tokens_seen": 2873826304 + }, + { + "epoch": 9.06, + "learning_rate": 6.535606820461385e-05, + "loss": 2.4546, + "theoretical_loss": 3.3317948865844933, + "tokens_seen": 2873891840 + }, + { + "epoch": 9.06, + "learning_rate": 6.534603811434303e-05, + "loss": 2.323, + "theoretical_loss": 3.3317889938864154, + "tokens_seen": 2873957376 + }, + { + "epoch": 9.06, + "learning_rate": 6.533600802407222e-05, + "loss": 2.6555, + "theoretical_loss": 3.3317831013603323, + "tokens_seen": 2874022912 + }, + { + "epoch": 9.06, + "learning_rate": 6.53259779338014e-05, + "loss": 2.6756, + "theoretical_loss": 3.3317772090062365, + "tokens_seen": 2874088448 + }, + { + "epoch": 9.06, + "learning_rate": 6.53159478435306e-05, + "loss": 2.4425, + "theoretical_loss": 3.3317713168241183, + "tokens_seen": 2874153984 + }, + { + "epoch": 9.06, + "learning_rate": 6.530591775325978e-05, + "loss": 2.3572, + "theoretical_loss": 3.3317654248139688, + "tokens_seen": 2874219520 + }, + { + "epoch": 9.06, + "learning_rate": 6.529588766298896e-05, + "loss": 2.4111, + "theoretical_loss": 3.331759532975779, + "tokens_seen": 2874285056 + }, + { + "epoch": 9.06, + "learning_rate": 6.528585757271816e-05, + "loss": 2.5379, + "theoretical_loss": 3.3317536413095405, + "tokens_seen": 2874350592 + }, + { + "epoch": 9.06, + "learning_rate": 6.527582748244734e-05, + "loss": 2.4636, + "theoretical_loss": 3.3317477498152437, + "tokens_seen": 2874416128 + }, + { + "epoch": 9.06, + "learning_rate": 6.526579739217653e-05, + "loss": 2.4652, + "theoretical_loss": 3.3317418584928804, + "tokens_seen": 2874481664 + }, + { + "epoch": 9.06, + "learning_rate": 6.525576730190571e-05, + "loss": 2.4893, + "theoretical_loss": 3.331735967342441, + "tokens_seen": 2874547200 + }, + { + "epoch": 9.06, + "learning_rate": 6.524573721163491e-05, + "loss": 2.583, + "theoretical_loss": 3.3317300763639164, + "tokens_seen": 2874612736 + }, + { + "epoch": 9.06, + "learning_rate": 6.523570712136409e-05, + "loss": 2.3742, + "theoretical_loss": 3.3317241855572988, + "tokens_seen": 2874678272 + }, + { + "epoch": 9.06, + "learning_rate": 6.522567703109328e-05, + "loss": 2.5567, + "theoretical_loss": 3.331718294922578, + "tokens_seen": 2874743808 + }, + { + "epoch": 9.06, + "learning_rate": 6.521564694082246e-05, + "loss": 2.2579, + "theoretical_loss": 3.3317124044597453, + "tokens_seen": 2874809344 + }, + { + "epoch": 9.06, + "learning_rate": 6.520561685055165e-05, + "loss": 2.7352, + "theoretical_loss": 3.3317065141687925, + "tokens_seen": 2874874880 + }, + { + "epoch": 9.06, + "learning_rate": 6.519558676028084e-05, + "loss": 2.4214, + "theoretical_loss": 3.3317006240497102, + "tokens_seen": 2874940416 + }, + { + "epoch": 9.06, + "learning_rate": 6.518555667001002e-05, + "loss": 2.4621, + "theoretical_loss": 3.3316947341024896, + "tokens_seen": 2875005952 + }, + { + "epoch": 9.06, + "learning_rate": 6.517552657973922e-05, + "loss": 2.5238, + "theoretical_loss": 3.3316888443271213, + "tokens_seen": 2875071488 + }, + { + "epoch": 9.06, + "learning_rate": 6.51654964894684e-05, + "loss": 2.4557, + "theoretical_loss": 3.331682954723597, + "tokens_seen": 2875137024 + }, + { + "epoch": 9.06, + "learning_rate": 6.515546639919759e-05, + "loss": 2.613, + "theoretical_loss": 3.331677065291907, + "tokens_seen": 2875202560 + }, + { + "epoch": 9.06, + "learning_rate": 6.514543630892677e-05, + "loss": 2.5674, + "theoretical_loss": 3.331671176032043, + "tokens_seen": 2875268096 + }, + { + "epoch": 9.06, + "learning_rate": 6.513540621865597e-05, + "loss": 2.6122, + "theoretical_loss": 3.3316652869439967, + "tokens_seen": 2875333632 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3160660, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.253877878189087, + "objective/train/theoretical_loss": 3.331660870240711, + "objective/train/tokens_used": 2895842784, + "theoretical_loss": 3.331660870240711, + "tokens_seen": 2875382784 + }, + { + "epoch": 9.06, + "learning_rate": 6.512537612838516e-05, + "loss": 2.5391, + "theoretical_loss": 3.3316593980277576, + "tokens_seen": 2875399168 + }, + { + "epoch": 9.06, + "learning_rate": 6.511534603811436e-05, + "loss": 2.3998, + "theoretical_loss": 3.3316535092833175, + "tokens_seen": 2875464704 + }, + { + "epoch": 9.06, + "learning_rate": 6.510531594784354e-05, + "loss": 2.3606, + "theoretical_loss": 3.331647620710668, + "tokens_seen": 2875530240 + }, + { + "epoch": 9.06, + "learning_rate": 6.509528585757272e-05, + "loss": 2.6188, + "theoretical_loss": 3.3316417323097993, + "tokens_seen": 2875595776 + }, + { + "epoch": 9.06, + "learning_rate": 6.508525576730191e-05, + "loss": 2.6883, + "theoretical_loss": 3.3316358440807035, + "tokens_seen": 2875661312 + }, + { + "epoch": 9.06, + "learning_rate": 6.50752256770311e-05, + "loss": 2.4971, + "theoretical_loss": 3.33162995602337, + "tokens_seen": 2875726848 + }, + { + "epoch": 9.06, + "learning_rate": 6.506519558676029e-05, + "loss": 2.3351, + "theoretical_loss": 3.331624068137792, + "tokens_seen": 2875792384 + }, + { + "epoch": 9.06, + "learning_rate": 6.505516549648947e-05, + "loss": 2.4437, + "theoretical_loss": 3.331618180423959, + "tokens_seen": 2875857920 + }, + { + "epoch": 9.06, + "learning_rate": 6.504513540621867e-05, + "loss": 2.4519, + "theoretical_loss": 3.331612292881863, + "tokens_seen": 2875923456 + }, + { + "epoch": 9.06, + "learning_rate": 6.503510531594785e-05, + "loss": 2.3857, + "theoretical_loss": 3.3316064055114945, + "tokens_seen": 2875988992 + }, + { + "epoch": 9.06, + "learning_rate": 6.502507522567704e-05, + "loss": 2.5666, + "theoretical_loss": 3.331600518312844, + "tokens_seen": 2876054528 + }, + { + "epoch": 9.06, + "learning_rate": 6.501504513540622e-05, + "loss": 2.5026, + "theoretical_loss": 3.3315946312859044, + "tokens_seen": 2876120064 + }, + { + "epoch": 9.06, + "learning_rate": 6.50050150451354e-05, + "loss": 2.5694, + "theoretical_loss": 3.331588744430665, + "tokens_seen": 2876185600 + }, + { + "epoch": 9.06, + "learning_rate": 6.49949849548646e-05, + "loss": 2.3816, + "theoretical_loss": 3.331582857747118, + "tokens_seen": 2876251136 + }, + { + "epoch": 9.06, + "learning_rate": 6.498495486459378e-05, + "loss": 2.4189, + "theoretical_loss": 3.331576971235254, + "tokens_seen": 2876316672 + }, + { + "epoch": 9.06, + "learning_rate": 6.497492477432297e-05, + "loss": 2.3695, + "theoretical_loss": 3.3315710848950637, + "tokens_seen": 2876382208 + }, + { + "epoch": 9.06, + "learning_rate": 6.496489468405216e-05, + "loss": 2.3016, + "theoretical_loss": 3.331565198726539, + "tokens_seen": 2876447744 + }, + { + "epoch": 9.06, + "learning_rate": 6.495486459378135e-05, + "loss": 2.4693, + "theoretical_loss": 3.3315593127296705, + "tokens_seen": 2876513280 + }, + { + "epoch": 9.06, + "learning_rate": 6.494483450351053e-05, + "loss": 2.647, + "theoretical_loss": 3.3315534269044496, + "tokens_seen": 2876578816 + }, + { + "epoch": 9.06, + "learning_rate": 6.493480441323973e-05, + "loss": 2.4497, + "theoretical_loss": 3.331547541250867, + "tokens_seen": 2876644352 + }, + { + "epoch": 9.06, + "learning_rate": 6.492477432296891e-05, + "loss": 2.6928, + "theoretical_loss": 3.331541655768914, + "tokens_seen": 2876709888 + }, + { + "epoch": 9.06, + "learning_rate": 6.491474423269809e-05, + "loss": 2.3208, + "theoretical_loss": 3.331535770458582, + "tokens_seen": 2876775424 + }, + { + "epoch": 9.06, + "learning_rate": 6.490471414242728e-05, + "loss": 2.5594, + "theoretical_loss": 3.331529885319861, + "tokens_seen": 2876840960 + }, + { + "epoch": 9.06, + "learning_rate": 6.489468405215646e-05, + "loss": 2.4064, + "theoretical_loss": 3.331524000352743, + "tokens_seen": 2876906496 + }, + { + "epoch": 9.06, + "learning_rate": 6.488465396188566e-05, + "loss": 2.4342, + "theoretical_loss": 3.3315181155572193, + "tokens_seen": 2876972032 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3161920, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.757772445678711, + "objective/train/theoretical_loss": 3.3315137020731793, + "objective/train/tokens_used": 2897481184, + "theoretical_loss": 3.3315137020731793, + "tokens_seen": 2877021184 + }, + { + "epoch": 9.06, + "learning_rate": 6.487462387161484e-05, + "loss": 2.4191, + "theoretical_loss": 3.3315122309332805, + "tokens_seen": 2877037568 + }, + { + "epoch": 9.06, + "learning_rate": 6.486459378134403e-05, + "loss": 2.4256, + "theoretical_loss": 3.331506346480918, + "tokens_seen": 2877103104 + }, + { + "epoch": 9.06, + "learning_rate": 6.485456369107322e-05, + "loss": 2.329, + "theoretical_loss": 3.331500462200122, + "tokens_seen": 2877168640 + }, + { + "epoch": 9.06, + "learning_rate": 6.484453360080241e-05, + "loss": 2.4589, + "theoretical_loss": 3.3314945780908847, + "tokens_seen": 2877234176 + }, + { + "epoch": 9.06, + "learning_rate": 6.483450351053159e-05, + "loss": 2.4518, + "theoretical_loss": 3.3314886941531965, + "tokens_seen": 2877299712 + }, + { + "epoch": 9.06, + "learning_rate": 6.482447342026079e-05, + "loss": 2.6108, + "theoretical_loss": 3.331482810387049, + "tokens_seen": 2877365248 + }, + { + "epoch": 9.06, + "learning_rate": 6.481444332998997e-05, + "loss": 2.4799, + "theoretical_loss": 3.331476926792433, + "tokens_seen": 2877430784 + }, + { + "epoch": 9.06, + "learning_rate": 6.480441323971915e-05, + "loss": 2.3894, + "theoretical_loss": 3.3314710433693397, + "tokens_seen": 2877496320 + }, + { + "epoch": 9.06, + "learning_rate": 6.479438314944834e-05, + "loss": 2.4588, + "theoretical_loss": 3.3314651601177596, + "tokens_seen": 2877561856 + }, + { + "epoch": 9.06, + "learning_rate": 6.478435305917752e-05, + "loss": 2.3016, + "theoretical_loss": 3.331459277037685, + "tokens_seen": 2877627392 + }, + { + "epoch": 9.06, + "learning_rate": 6.477432296890672e-05, + "loss": 2.3938, + "theoretical_loss": 3.331453394129106, + "tokens_seen": 2877692928 + }, + { + "epoch": 9.06, + "learning_rate": 6.47642928786359e-05, + "loss": 2.5702, + "theoretical_loss": 3.331447511392014, + "tokens_seen": 2877758464 + }, + { + "epoch": 9.06, + "learning_rate": 6.47542627883651e-05, + "loss": 2.496, + "theoretical_loss": 3.3314416288264, + "tokens_seen": 2877824000 + }, + { + "epoch": 9.06, + "learning_rate": 6.474423269809429e-05, + "loss": 2.677, + "theoretical_loss": 3.3314357464322555, + "tokens_seen": 2877889536 + }, + { + "epoch": 9.06, + "learning_rate": 6.473420260782348e-05, + "loss": 2.6278, + "theoretical_loss": 3.331429864209571, + "tokens_seen": 2877955072 + }, + { + "epoch": 9.06, + "learning_rate": 6.472417251755266e-05, + "loss": 2.4673, + "theoretical_loss": 3.3314239821583382, + "tokens_seen": 2878020608 + }, + { + "epoch": 9.06, + "learning_rate": 6.471414242728185e-05, + "loss": 2.5217, + "theoretical_loss": 3.3314181002785475, + "tokens_seen": 2878086144 + }, + { + "epoch": 9.06, + "learning_rate": 6.470411233701104e-05, + "loss": 2.4034, + "theoretical_loss": 3.3314122185701907, + "tokens_seen": 2878151680 + }, + { + "epoch": 9.06, + "learning_rate": 6.469408224674022e-05, + "loss": 2.3116, + "theoretical_loss": 3.3314063370332585, + "tokens_seen": 2878217216 + }, + { + "epoch": 9.06, + "learning_rate": 6.468405215646942e-05, + "loss": 2.3997, + "theoretical_loss": 3.331400455667742, + "tokens_seen": 2878282752 + }, + { + "epoch": 9.06, + "learning_rate": 6.46740220661986e-05, + "loss": 2.5869, + "theoretical_loss": 3.3313945744736326, + "tokens_seen": 2878348288 + }, + { + "epoch": 9.06, + "learning_rate": 6.466399197592779e-05, + "loss": 2.6553, + "theoretical_loss": 3.331388693450921, + "tokens_seen": 2878413824 + }, + { + "epoch": 9.06, + "learning_rate": 6.465396188565697e-05, + "loss": 2.2364, + "theoretical_loss": 3.331382812599599, + "tokens_seen": 2878479360 + }, + { + "epoch": 9.06, + "learning_rate": 6.464393179538617e-05, + "loss": 2.6375, + "theoretical_loss": 3.3313769319196567, + "tokens_seen": 2878544896 + }, + { + "epoch": 9.06, + "learning_rate": 6.463390170511535e-05, + "loss": 2.5607, + "theoretical_loss": 3.331371051411086, + "tokens_seen": 2878610432 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3162632, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.179271936416626, + "objective/train/theoretical_loss": 3.3313666411421146, + "objective/train/tokens_used": 2899119584, + "theoretical_loss": 3.3313666411421146, + "tokens_seen": 2878659584 + }, + { + "epoch": 9.06, + "learning_rate": 6.462387161484454e-05, + "loss": 2.3759, + "theoretical_loss": 3.331365171073877, + "tokens_seen": 2878675968 + }, + { + "epoch": 9.06, + "learning_rate": 6.461384152457372e-05, + "loss": 2.506, + "theoretical_loss": 3.331359290908022, + "tokens_seen": 2878741504 + }, + { + "epoch": 9.06, + "learning_rate": 6.46038114343029e-05, + "loss": 2.5714, + "theoretical_loss": 3.331353410913512, + "tokens_seen": 2878807040 + }, + { + "epoch": 9.06, + "learning_rate": 6.45937813440321e-05, + "loss": 2.4713, + "theoretical_loss": 3.331347531090337, + "tokens_seen": 2878872576 + }, + { + "epoch": 9.06, + "learning_rate": 6.458375125376128e-05, + "loss": 2.3313, + "theoretical_loss": 3.3313416514384895, + "tokens_seen": 2878938112 + }, + { + "epoch": 9.06, + "learning_rate": 6.457372116349048e-05, + "loss": 2.5598, + "theoretical_loss": 3.33133577195796, + "tokens_seen": 2879003648 + }, + { + "epoch": 9.06, + "learning_rate": 6.456369107321966e-05, + "loss": 2.6142, + "theoretical_loss": 3.331329892648739, + "tokens_seen": 2879069184 + }, + { + "epoch": 9.06, + "learning_rate": 6.455366098294885e-05, + "loss": 2.455, + "theoretical_loss": 3.331324013510818, + "tokens_seen": 2879134720 + }, + { + "epoch": 9.06, + "learning_rate": 6.454363089267803e-05, + "loss": 2.4538, + "theoretical_loss": 3.331318134544189, + "tokens_seen": 2879200256 + }, + { + "epoch": 9.06, + "learning_rate": 6.453360080240723e-05, + "loss": 2.3307, + "theoretical_loss": 3.331312255748842, + "tokens_seen": 2879265792 + }, + { + "epoch": 9.06, + "learning_rate": 6.452357071213641e-05, + "loss": 2.1474, + "theoretical_loss": 3.3313063771247684, + "tokens_seen": 2879331328 + }, + { + "epoch": 9.06, + "learning_rate": 6.451354062186559e-05, + "loss": 2.3233, + "theoretical_loss": 3.3313004986719594, + "tokens_seen": 2879396864 + }, + { + "epoch": 9.06, + "learning_rate": 6.450351053159479e-05, + "loss": 2.5089, + "theoretical_loss": 3.3312946203904064, + "tokens_seen": 2879462400 + }, + { + "epoch": 9.06, + "learning_rate": 6.449348044132397e-05, + "loss": 2.425, + "theoretical_loss": 3.3312887422801003, + "tokens_seen": 2879527936 + }, + { + "epoch": 9.06, + "learning_rate": 6.448345035105316e-05, + "loss": 2.5226, + "theoretical_loss": 3.331282864341032, + "tokens_seen": 2879593472 + }, + { + "epoch": 9.06, + "learning_rate": 6.447342026078234e-05, + "loss": 2.5041, + "theoretical_loss": 3.331276986573193, + "tokens_seen": 2879659008 + }, + { + "epoch": 9.06, + "learning_rate": 6.446339017051154e-05, + "loss": 2.6398, + "theoretical_loss": 3.3312711089765736, + "tokens_seen": 2879724544 + }, + { + "epoch": 9.06, + "learning_rate": 6.445336008024072e-05, + "loss": 2.6832, + "theoretical_loss": 3.3312652315511655, + "tokens_seen": 2879790080 + }, + { + "epoch": 9.06, + "learning_rate": 6.444332998996991e-05, + "loss": 2.39, + "theoretical_loss": 3.33125935429696, + "tokens_seen": 2879855616 + }, + { + "epoch": 9.06, + "learning_rate": 6.44332998996991e-05, + "loss": 2.5598, + "theoretical_loss": 3.331253477213948, + "tokens_seen": 2879921152 + }, + { + "epoch": 9.06, + "learning_rate": 6.442326980942827e-05, + "loss": 2.2599, + "theoretical_loss": 3.331247600302121, + "tokens_seen": 2879986688 + }, + { + "epoch": 9.06, + "learning_rate": 6.441323971915747e-05, + "loss": 2.3975, + "theoretical_loss": 3.3312417235614697, + "tokens_seen": 2880052224 + }, + { + "epoch": 9.06, + "learning_rate": 6.440320962888665e-05, + "loss": 2.5418, + "theoretical_loss": 3.331235846991985, + "tokens_seen": 2880117760 + }, + { + "epoch": 9.06, + "learning_rate": 6.439317953861585e-05, + "loss": 2.2777, + "theoretical_loss": 3.3312299705936583, + "tokens_seen": 2880183296 + }, + { + "epoch": 9.06, + "learning_rate": 6.438314944834503e-05, + "loss": 2.3966, + "theoretical_loss": 3.3312240943664806, + "tokens_seen": 2880248832 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3164046, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.804119348526001, + "objective/train/theoretical_loss": 3.331219687308409, + "objective/train/tokens_used": 2900757984, + "theoretical_loss": 3.331219687308409, + "tokens_seen": 2880297984 + }, + { + "epoch": 9.06, + "learning_rate": 6.437311935807423e-05, + "loss": 2.4945, + "theoretical_loss": 3.3312182183104437, + "tokens_seen": 2880314368 + }, + { + "epoch": 9.06, + "learning_rate": 6.436308926780342e-05, + "loss": 2.5769, + "theoretical_loss": 3.3312123424255375, + "tokens_seen": 2880379904 + }, + { + "epoch": 9.06, + "learning_rate": 6.435305917753261e-05, + "loss": 2.4816, + "theoretical_loss": 3.3312064667117545, + "tokens_seen": 2880445440 + }, + { + "epoch": 9.06, + "learning_rate": 6.434302908726179e-05, + "loss": 2.6016, + "theoretical_loss": 3.3312005911690843, + "tokens_seen": 2880510976 + }, + { + "epoch": 9.06, + "learning_rate": 6.433299899699099e-05, + "loss": 2.3692, + "theoretical_loss": 3.3311947157975195, + "tokens_seen": 2880576512 + }, + { + "epoch": 9.06, + "learning_rate": 6.432296890672017e-05, + "loss": 2.634, + "theoretical_loss": 3.3311888405970502, + "tokens_seen": 2880642048 + }, + { + "epoch": 9.06, + "learning_rate": 6.431293881644935e-05, + "loss": 2.331, + "theoretical_loss": 3.331182965567668, + "tokens_seen": 2880707584 + }, + { + "epoch": 9.06, + "learning_rate": 6.430290872617854e-05, + "loss": 2.3178, + "theoretical_loss": 3.331177090709364, + "tokens_seen": 2880773120 + }, + { + "epoch": 9.06, + "learning_rate": 6.429287863590772e-05, + "loss": 2.6035, + "theoretical_loss": 3.331171216022129, + "tokens_seen": 2880838656 + }, + { + "epoch": 9.06, + "learning_rate": 6.428284854563692e-05, + "loss": 2.6059, + "theoretical_loss": 3.3311653415059546, + "tokens_seen": 2880904192 + }, + { + "epoch": 9.06, + "learning_rate": 6.42728184553661e-05, + "loss": 2.437, + "theoretical_loss": 3.3311594671608313, + "tokens_seen": 2880969728 + }, + { + "epoch": 9.06, + "learning_rate": 6.42627883650953e-05, + "loss": 2.3289, + "theoretical_loss": 3.3311535929867513, + "tokens_seen": 2881035264 + }, + { + "epoch": 9.06, + "learning_rate": 6.425275827482448e-05, + "loss": 2.4126, + "theoretical_loss": 3.3311477189837047, + "tokens_seen": 2881100800 + }, + { + "epoch": 9.06, + "learning_rate": 6.424272818455367e-05, + "loss": 2.3074, + "theoretical_loss": 3.3311418451516825, + "tokens_seen": 2881166336 + }, + { + "epoch": 9.06, + "learning_rate": 6.423269809428285e-05, + "loss": 2.4564, + "theoretical_loss": 3.331135971490677, + "tokens_seen": 2881231872 + }, + { + "epoch": 9.06, + "learning_rate": 6.422266800401203e-05, + "loss": 2.3835, + "theoretical_loss": 3.3311300980006786, + "tokens_seen": 2881297408 + }, + { + "epoch": 9.06, + "learning_rate": 6.421263791374123e-05, + "loss": 2.4012, + "theoretical_loss": 3.331124224681678, + "tokens_seen": 2881362944 + }, + { + "epoch": 9.06, + "learning_rate": 6.420260782347041e-05, + "loss": 2.4469, + "theoretical_loss": 3.331118351533667, + "tokens_seen": 2881428480 + }, + { + "epoch": 9.06, + "learning_rate": 6.41925777331996e-05, + "loss": 2.401, + "theoretical_loss": 3.3311124785566366, + "tokens_seen": 2881494016 + }, + { + "epoch": 9.06, + "learning_rate": 6.418254764292878e-05, + "loss": 2.2299, + "theoretical_loss": 3.331106605750578, + "tokens_seen": 2881559552 + }, + { + "epoch": 9.06, + "learning_rate": 6.417251755265798e-05, + "loss": 2.524, + "theoretical_loss": 3.3311007331154823, + "tokens_seen": 2881625088 + }, + { + "epoch": 9.06, + "learning_rate": 6.416248746238716e-05, + "loss": 2.4346, + "theoretical_loss": 3.33109486065134, + "tokens_seen": 2881690624 + }, + { + "epoch": 9.06, + "learning_rate": 6.415245737211635e-05, + "loss": 2.4026, + "theoretical_loss": 3.331088988358143, + "tokens_seen": 2881756160 + }, + { + "epoch": 9.06, + "learning_rate": 6.414242728184554e-05, + "loss": 2.2235, + "theoretical_loss": 3.3310831162358827, + "tokens_seen": 2881821696 + }, + { + "epoch": 9.06, + "learning_rate": 6.413239719157472e-05, + "loss": 2.5858, + "theoretical_loss": 3.331077244284549, + "tokens_seen": 2881887232 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3164773, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6361265182495117, + "objective/train/theoretical_loss": 3.3310728404332153, + "objective/train/tokens_used": 2902396384, + "theoretical_loss": 3.3310728404332153, + "tokens_seen": 2881936384 + }, + { + "epoch": 9.06, + "learning_rate": 6.412236710130391e-05, + "loss": 2.6775, + "theoretical_loss": 3.3310713725041343, + "tokens_seen": 2881952768 + }, + { + "epoch": 9.06, + "learning_rate": 6.411233701103309e-05, + "loss": 2.5041, + "theoretical_loss": 3.3310655008946295, + "tokens_seen": 2882018304 + }, + { + "epoch": 9.06, + "learning_rate": 6.410230692076229e-05, + "loss": 2.5245, + "theoretical_loss": 3.331059629456025, + "tokens_seen": 2882083840 + }, + { + "epoch": 9.06, + "learning_rate": 6.409227683049147e-05, + "loss": 2.5258, + "theoretical_loss": 3.3310537581883124, + "tokens_seen": 2882149376 + }, + { + "epoch": 9.06, + "learning_rate": 6.408224674022066e-05, + "loss": 2.5013, + "theoretical_loss": 3.331047887091483, + "tokens_seen": 2882214912 + }, + { + "epoch": 9.06, + "learning_rate": 6.407221664994984e-05, + "loss": 2.2928, + "theoretical_loss": 3.3310420161655276, + "tokens_seen": 2882280448 + }, + { + "epoch": 9.06, + "learning_rate": 6.406218655967904e-05, + "loss": 2.4285, + "theoretical_loss": 3.3310361454104376, + "tokens_seen": 2882345984 + }, + { + "epoch": 9.06, + "learning_rate": 6.405215646940822e-05, + "loss": 2.4369, + "theoretical_loss": 3.3310302748262037, + "tokens_seen": 2882411520 + }, + { + "epoch": 9.06, + "learning_rate": 6.404212637913741e-05, + "loss": 2.5429, + "theoretical_loss": 3.331024404412818, + "tokens_seen": 2882477056 + }, + { + "epoch": 9.06, + "learning_rate": 6.40320962888666e-05, + "loss": 2.3675, + "theoretical_loss": 3.331018534170271, + "tokens_seen": 2882542592 + }, + { + "epoch": 9.06, + "learning_rate": 6.402206619859578e-05, + "loss": 2.4348, + "theoretical_loss": 3.331012664098554, + "tokens_seen": 2882608128 + }, + { + "epoch": 9.06, + "learning_rate": 6.401203610832497e-05, + "loss": 2.4904, + "theoretical_loss": 3.3310067941976573, + "tokens_seen": 2882673664 + }, + { + "epoch": 9.06, + "learning_rate": 6.400200601805415e-05, + "loss": 2.394, + "theoretical_loss": 3.3310009244675736, + "tokens_seen": 2882739200 + }, + { + "epoch": 9.06, + "learning_rate": 6.399197592778336e-05, + "loss": 2.4595, + "theoretical_loss": 3.3309950549082927, + "tokens_seen": 2882804736 + }, + { + "epoch": 9.06, + "learning_rate": 6.398194583751254e-05, + "loss": 2.5951, + "theoretical_loss": 3.330989185519807, + "tokens_seen": 2882870272 + }, + { + "epoch": 9.06, + "learning_rate": 6.397191574724174e-05, + "loss": 2.484, + "theoretical_loss": 3.330983316302106, + "tokens_seen": 2882935808 + }, + { + "epoch": 9.06, + "learning_rate": 6.396188565697092e-05, + "loss": 2.6293, + "theoretical_loss": 3.3309774472551825, + "tokens_seen": 2883001344 + }, + { + "epoch": 9.06, + "learning_rate": 6.395185556670011e-05, + "loss": 2.417, + "theoretical_loss": 3.3309715783790264, + "tokens_seen": 2883066880 + }, + { + "epoch": 9.06, + "learning_rate": 6.39418254764293e-05, + "loss": 2.3788, + "theoretical_loss": 3.33096570967363, + "tokens_seen": 2883132416 + }, + { + "epoch": 9.06, + "learning_rate": 6.393179538615847e-05, + "loss": 2.3809, + "theoretical_loss": 3.3309598411389834, + "tokens_seen": 2883197952 + }, + { + "epoch": 9.06, + "learning_rate": 6.392176529588767e-05, + "loss": 2.4596, + "theoretical_loss": 3.3309539727750783, + "tokens_seen": 2883263488 + }, + { + "epoch": 9.06, + "learning_rate": 6.391173520561685e-05, + "loss": 2.2959, + "theoretical_loss": 3.330948104581906, + "tokens_seen": 2883329024 + }, + { + "epoch": 9.06, + "learning_rate": 6.390170511534605e-05, + "loss": 2.5506, + "theoretical_loss": 3.3309422365594568, + "tokens_seen": 2883394560 + }, + { + "epoch": 9.06, + "learning_rate": 6.389167502507523e-05, + "loss": 2.3988, + "theoretical_loss": 3.3309363687077225, + "tokens_seen": 2883460096 + }, + { + "epoch": 9.06, + "learning_rate": 6.388164493480442e-05, + "loss": 2.5207, + "theoretical_loss": 3.3309305010266947, + "tokens_seen": 2883525632 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3166053, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.405791759490967, + "objective/train/theoretical_loss": 3.330926100377944, + "objective/train/tokens_used": 2904034784, + "theoretical_loss": 3.330926100377944, + "tokens_seen": 2883574784 + }, + { + "epoch": 9.06, + "learning_rate": 6.38716148445336e-05, + "loss": 2.3887, + "theoretical_loss": 3.330924633516364, + "tokens_seen": 2883591168 + }, + { + "epoch": 9.06, + "learning_rate": 6.38615847542628e-05, + "loss": 2.3877, + "theoretical_loss": 3.3309187661767212, + "tokens_seen": 2883656704 + }, + { + "epoch": 9.06, + "learning_rate": 6.385155466399198e-05, + "loss": 2.5413, + "theoretical_loss": 3.330912899007758, + "tokens_seen": 2883722240 + }, + { + "epoch": 9.06, + "learning_rate": 6.384152457372116e-05, + "loss": 2.4213, + "theoretical_loss": 3.3309070320094656, + "tokens_seen": 2883787776 + }, + { + "epoch": 9.06, + "learning_rate": 6.383149448345035e-05, + "loss": 2.701, + "theoretical_loss": 3.3309011651818348, + "tokens_seen": 2883853312 + }, + { + "epoch": 9.06, + "learning_rate": 6.382146439317953e-05, + "loss": 2.6159, + "theoretical_loss": 3.3308952985248568, + "tokens_seen": 2883918848 + }, + { + "epoch": 9.06, + "learning_rate": 6.381143430290873e-05, + "loss": 2.3296, + "theoretical_loss": 3.330889432038523, + "tokens_seen": 2883984384 + }, + { + "epoch": 9.06, + "learning_rate": 6.380140421263791e-05, + "loss": 2.3713, + "theoretical_loss": 3.3308835657228246, + "tokens_seen": 2884049920 + }, + { + "epoch": 9.06, + "learning_rate": 6.37913741223671e-05, + "loss": 2.1748, + "theoretical_loss": 3.3308776995777523, + "tokens_seen": 2884115456 + }, + { + "epoch": 9.06, + "learning_rate": 6.378134403209629e-05, + "loss": 2.5209, + "theoretical_loss": 3.330871833603298, + "tokens_seen": 2884180992 + }, + { + "epoch": 9.06, + "learning_rate": 6.377131394182548e-05, + "loss": 2.4565, + "theoretical_loss": 3.3308659677994523, + "tokens_seen": 2884246528 + }, + { + "epoch": 9.06, + "learning_rate": 6.376128385155466e-05, + "loss": 2.492, + "theoretical_loss": 3.3308601021662065, + "tokens_seen": 2884312064 + }, + { + "epoch": 9.06, + "learning_rate": 6.375125376128386e-05, + "loss": 2.4598, + "theoretical_loss": 3.3308542367035514, + "tokens_seen": 2884377600 + }, + { + "epoch": 9.06, + "learning_rate": 6.374122367101304e-05, + "loss": 2.4345, + "theoretical_loss": 3.330848371411479, + "tokens_seen": 2884443136 + }, + { + "epoch": 9.06, + "learning_rate": 6.373119358074222e-05, + "loss": 2.3754, + "theoretical_loss": 3.3308425062899794, + "tokens_seen": 2884508672 + }, + { + "epoch": 9.06, + "learning_rate": 6.372116349047141e-05, + "loss": 2.3832, + "theoretical_loss": 3.3308366413390447, + "tokens_seen": 2884574208 + }, + { + "epoch": 9.06, + "learning_rate": 6.37111334002006e-05, + "loss": 2.3712, + "theoretical_loss": 3.330830776558666, + "tokens_seen": 2884639744 + }, + { + "epoch": 9.06, + "learning_rate": 6.370110330992979e-05, + "loss": 2.343, + "theoretical_loss": 3.3308249119488336, + "tokens_seen": 2884705280 + }, + { + "epoch": 9.06, + "learning_rate": 6.369107321965897e-05, + "loss": 2.3637, + "theoretical_loss": 3.3308190475095394, + "tokens_seen": 2884770816 + }, + { + "epoch": 9.06, + "learning_rate": 6.368104312938817e-05, + "loss": 2.5453, + "theoretical_loss": 3.3308131832407746, + "tokens_seen": 2884836352 + }, + { + "epoch": 9.06, + "learning_rate": 6.367101303911735e-05, + "loss": 2.3727, + "theoretical_loss": 3.33080731914253, + "tokens_seen": 2884901888 + }, + { + "epoch": 9.06, + "learning_rate": 6.366098294884654e-05, + "loss": 2.3636, + "theoretical_loss": 3.330801455214797, + "tokens_seen": 2884967424 + }, + { + "epoch": 9.06, + "learning_rate": 6.365095285857572e-05, + "loss": 2.3483, + "theoretical_loss": 3.3307955914575667, + "tokens_seen": 2885032960 + }, + { + "epoch": 9.06, + "learning_rate": 6.36409227683049e-05, + "loss": 2.4805, + "theoretical_loss": 3.33078972787083, + "tokens_seen": 2885098496 + }, + { + "epoch": 9.06, + "learning_rate": 6.36308926780341e-05, + "loss": 2.5655, + "theoretical_loss": 3.330783864454579, + "tokens_seen": 2885164032 + }, + { + "epoch": 9.06, + "objective/train/docs_used": 3166535, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7018685340881348, + "objective/train/theoretical_loss": 3.330779467004266, + "objective/train/tokens_used": 2905673184, + "theoretical_loss": 3.330779467004266, + "tokens_seen": 2885213184 + }, + { + "epoch": 9.06, + "learning_rate": 6.362086258776329e-05, + "loss": 2.597, + "theoretical_loss": 3.3307780012088037, + "tokens_seen": 2885229568 + }, + { + "epoch": 9.06, + "learning_rate": 6.361083249749249e-05, + "loss": 2.1375, + "theoretical_loss": 3.330772138133496, + "tokens_seen": 2885295104 + }, + { + "epoch": 9.06, + "learning_rate": 6.360080240722167e-05, + "loss": 2.4751, + "theoretical_loss": 3.3307662752286467, + "tokens_seen": 2885360640 + }, + { + "epoch": 9.07, + "learning_rate": 6.359077231695086e-05, + "loss": 2.1657, + "theoretical_loss": 3.3307604124942474, + "tokens_seen": 2885426176 + }, + { + "epoch": 9.07, + "learning_rate": 6.358074222668004e-05, + "loss": 2.386, + "theoretical_loss": 3.3307545499302886, + "tokens_seen": 2885491712 + }, + { + "epoch": 9.07, + "learning_rate": 6.357071213640924e-05, + "loss": 2.3062, + "theoretical_loss": 3.3307486875367625, + "tokens_seen": 2885557248 + }, + { + "epoch": 9.07, + "learning_rate": 6.356068204613842e-05, + "loss": 2.4952, + "theoretical_loss": 3.330742825313659, + "tokens_seen": 2885622784 + }, + { + "epoch": 9.07, + "learning_rate": 6.355065195586761e-05, + "loss": 2.459, + "theoretical_loss": 3.3307369632609705, + "tokens_seen": 2885688320 + }, + { + "epoch": 9.07, + "learning_rate": 6.35406218655968e-05, + "loss": 2.5718, + "theoretical_loss": 3.330731101378687, + "tokens_seen": 2885753856 + }, + { + "epoch": 9.07, + "learning_rate": 6.353059177532598e-05, + "loss": 2.199, + "theoretical_loss": 3.3307252396668003, + "tokens_seen": 2885819392 + }, + { + "epoch": 9.07, + "learning_rate": 6.352056168505517e-05, + "loss": 2.3627, + "theoretical_loss": 3.3307193781253024, + "tokens_seen": 2885884928 + }, + { + "epoch": 9.07, + "learning_rate": 6.351053159478435e-05, + "loss": 2.4131, + "theoretical_loss": 3.330713516754183, + "tokens_seen": 2885950464 + }, + { + "epoch": 9.07, + "learning_rate": 6.350050150451355e-05, + "loss": 2.4946, + "theoretical_loss": 3.330707655553434, + "tokens_seen": 2886016000 + }, + { + "epoch": 9.07, + "learning_rate": 6.349047141424273e-05, + "loss": 2.3777, + "theoretical_loss": 3.330701794523047, + "tokens_seen": 2886081536 + }, + { + "epoch": 9.07, + "learning_rate": 6.348044132397192e-05, + "loss": 2.3754, + "theoretical_loss": 3.3306959336630118, + "tokens_seen": 2886147072 + }, + { + "epoch": 9.07, + "learning_rate": 6.34704112337011e-05, + "loss": 2.5423, + "theoretical_loss": 3.330690072973321, + "tokens_seen": 2886212608 + }, + { + "epoch": 9.07, + "learning_rate": 6.34603811434303e-05, + "loss": 2.5509, + "theoretical_loss": 3.330684212453965, + "tokens_seen": 2886278144 + }, + { + "epoch": 9.07, + "learning_rate": 6.345035105315948e-05, + "loss": 2.6709, + "theoretical_loss": 3.3306783521049352, + "tokens_seen": 2886343680 + }, + { + "epoch": 9.07, + "learning_rate": 6.344032096288866e-05, + "loss": 2.5673, + "theoretical_loss": 3.3306724919262227, + "tokens_seen": 2886409216 + }, + { + "epoch": 9.07, + "learning_rate": 6.343029087261786e-05, + "loss": 2.4125, + "theoretical_loss": 3.3306666319178193, + "tokens_seen": 2886474752 + }, + { + "epoch": 9.07, + "learning_rate": 6.342026078234704e-05, + "loss": 2.4085, + "theoretical_loss": 3.3306607720797152, + "tokens_seen": 2886540288 + }, + { + "epoch": 9.07, + "learning_rate": 6.341023069207623e-05, + "loss": 2.438, + "theoretical_loss": 3.3306549124119025, + "tokens_seen": 2886605824 + }, + { + "epoch": 9.07, + "learning_rate": 6.340020060180541e-05, + "loss": 2.6097, + "theoretical_loss": 3.3306490529143717, + "tokens_seen": 2886671360 + }, + { + "epoch": 9.07, + "learning_rate": 6.339017051153461e-05, + "loss": 2.452, + "theoretical_loss": 3.330643193587114, + "tokens_seen": 2886736896 + }, + { + "epoch": 9.07, + "learning_rate": 6.338014042126379e-05, + "loss": 2.3173, + "theoretical_loss": 3.330637334430121, + "tokens_seen": 2886802432 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3167833, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4216222763061523, + "objective/train/theoretical_loss": 3.330632940174107, + "objective/train/tokens_used": 2907311584, + "theoretical_loss": 3.330632940174107, + "tokens_seen": 2886851584 + }, + { + "epoch": 9.07, + "learning_rate": 6.337011033099298e-05, + "loss": 2.4267, + "theoretical_loss": 3.3306314754433837, + "tokens_seen": 2886867968 + }, + { + "epoch": 9.07, + "learning_rate": 6.336008024072216e-05, + "loss": 2.5534, + "theoretical_loss": 3.3306256166268935, + "tokens_seen": 2886933504 + }, + { + "epoch": 9.07, + "learning_rate": 6.335005015045135e-05, + "loss": 2.6131, + "theoretical_loss": 3.3306197579806414, + "tokens_seen": 2886999040 + }, + { + "epoch": 9.07, + "learning_rate": 6.334002006018054e-05, + "loss": 2.3525, + "theoretical_loss": 3.330613899504618, + "tokens_seen": 2887064576 + }, + { + "epoch": 9.07, + "learning_rate": 6.332998996990972e-05, + "loss": 2.5285, + "theoretical_loss": 3.3306080411988157, + "tokens_seen": 2887130112 + }, + { + "epoch": 9.07, + "learning_rate": 6.331995987963892e-05, + "loss": 2.4893, + "theoretical_loss": 3.3306021830632244, + "tokens_seen": 2887195648 + }, + { + "epoch": 9.07, + "learning_rate": 6.33099297893681e-05, + "loss": 2.4252, + "theoretical_loss": 3.3305963250978365, + "tokens_seen": 2887261184 + }, + { + "epoch": 9.07, + "learning_rate": 6.329989969909729e-05, + "loss": 2.5183, + "theoretical_loss": 3.3305904673026423, + "tokens_seen": 2887326720 + }, + { + "epoch": 9.07, + "learning_rate": 6.328986960882647e-05, + "loss": 2.5603, + "theoretical_loss": 3.3305846096776337, + "tokens_seen": 2887392256 + }, + { + "epoch": 9.07, + "learning_rate": 6.327983951855567e-05, + "loss": 2.3244, + "theoretical_loss": 3.330578752222801, + "tokens_seen": 2887457792 + }, + { + "epoch": 9.07, + "learning_rate": 6.326980942828485e-05, + "loss": 2.4572, + "theoretical_loss": 3.330572894938136, + "tokens_seen": 2887523328 + }, + { + "epoch": 9.07, + "learning_rate": 6.325977933801404e-05, + "loss": 2.3013, + "theoretical_loss": 3.3305670378236303, + "tokens_seen": 2887588864 + }, + { + "epoch": 9.07, + "learning_rate": 6.324974924774324e-05, + "loss": 2.4974, + "theoretical_loss": 3.330561180879274, + "tokens_seen": 2887654400 + }, + { + "epoch": 9.07, + "learning_rate": 6.323971915747242e-05, + "loss": 2.3403, + "theoretical_loss": 3.3305553241050596, + "tokens_seen": 2887719936 + }, + { + "epoch": 9.07, + "learning_rate": 6.322968906720161e-05, + "loss": 2.3679, + "theoretical_loss": 3.330549467500977, + "tokens_seen": 2887785472 + }, + { + "epoch": 9.07, + "learning_rate": 6.32196589769308e-05, + "loss": 2.3539, + "theoretical_loss": 3.330543611067018, + "tokens_seen": 2887851008 + }, + { + "epoch": 9.07, + "learning_rate": 6.320962888665999e-05, + "loss": 2.3458, + "theoretical_loss": 3.3305377548031743, + "tokens_seen": 2887916544 + }, + { + "epoch": 9.07, + "learning_rate": 6.319959879638917e-05, + "loss": 2.541, + "theoretical_loss": 3.330531898709436, + "tokens_seen": 2887982080 + }, + { + "epoch": 9.07, + "learning_rate": 6.318956870611837e-05, + "loss": 2.4861, + "theoretical_loss": 3.3305260427857952, + "tokens_seen": 2888047616 + }, + { + "epoch": 9.07, + "learning_rate": 6.317953861584755e-05, + "loss": 2.6761, + "theoretical_loss": 3.3305201870322425, + "tokens_seen": 2888113152 + }, + { + "epoch": 9.07, + "learning_rate": 6.316950852557674e-05, + "loss": 2.634, + "theoretical_loss": 3.3305143314487697, + "tokens_seen": 2888178688 + }, + { + "epoch": 9.07, + "learning_rate": 6.315947843530592e-05, + "loss": 2.6197, + "theoretical_loss": 3.3305084760353676, + "tokens_seen": 2888244224 + }, + { + "epoch": 9.07, + "learning_rate": 6.31494483450351e-05, + "loss": 2.5153, + "theoretical_loss": 3.330502620792027, + "tokens_seen": 2888309760 + }, + { + "epoch": 9.07, + "learning_rate": 6.31394182547643e-05, + "loss": 2.4049, + "theoretical_loss": 3.33049676571874, + "tokens_seen": 2888375296 + }, + { + "epoch": 9.07, + "learning_rate": 6.312938816449348e-05, + "loss": 2.5198, + "theoretical_loss": 3.330490910815498, + "tokens_seen": 2888440832 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3168499, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.473315954208374, + "objective/train/theoretical_loss": 3.330486519749652, + "objective/train/tokens_used": 2908949984, + "theoretical_loss": 3.330486519749652, + "tokens_seen": 2888489984 + }, + { + "epoch": 9.07, + "learning_rate": 6.311935807422267e-05, + "loss": 2.4572, + "theoretical_loss": 3.3304850560822907, + "tokens_seen": 2888506368 + }, + { + "epoch": 9.07, + "learning_rate": 6.310932798395186e-05, + "loss": 2.4719, + "theoretical_loss": 3.3304792015191103, + "tokens_seen": 2888571904 + }, + { + "epoch": 9.07, + "learning_rate": 6.309929789368105e-05, + "loss": 2.5284, + "theoretical_loss": 3.330473347125948, + "tokens_seen": 2888637440 + }, + { + "epoch": 9.07, + "learning_rate": 6.308926780341023e-05, + "loss": 2.7058, + "theoretical_loss": 3.3304674929027955, + "tokens_seen": 2888702976 + }, + { + "epoch": 9.07, + "learning_rate": 6.307923771313943e-05, + "loss": 2.592, + "theoretical_loss": 3.3304616388496426, + "tokens_seen": 2888768512 + }, + { + "epoch": 9.07, + "learning_rate": 6.306920762286861e-05, + "loss": 2.4699, + "theoretical_loss": 3.3304557849664818, + "tokens_seen": 2888834048 + }, + { + "epoch": 9.07, + "learning_rate": 6.305917753259779e-05, + "loss": 2.342, + "theoretical_loss": 3.3304499312533036, + "tokens_seen": 2888899584 + }, + { + "epoch": 9.07, + "learning_rate": 6.304914744232698e-05, + "loss": 2.5059, + "theoretical_loss": 3.3304440777100996, + "tokens_seen": 2888965120 + }, + { + "epoch": 9.07, + "learning_rate": 6.303911735205616e-05, + "loss": 2.3133, + "theoretical_loss": 3.3304382243368607, + "tokens_seen": 2889030656 + }, + { + "epoch": 9.07, + "learning_rate": 6.302908726178536e-05, + "loss": 2.374, + "theoretical_loss": 3.330432371133578, + "tokens_seen": 2889096192 + }, + { + "epoch": 9.07, + "learning_rate": 6.301905717151454e-05, + "loss": 2.5582, + "theoretical_loss": 3.3304265181002437, + "tokens_seen": 2889161728 + }, + { + "epoch": 9.07, + "learning_rate": 6.300902708124373e-05, + "loss": 2.4725, + "theoretical_loss": 3.3304206652368475, + "tokens_seen": 2889227264 + }, + { + "epoch": 9.07, + "learning_rate": 6.299899699097292e-05, + "loss": 2.5037, + "theoretical_loss": 3.330414812543382, + "tokens_seen": 2889292800 + }, + { + "epoch": 9.07, + "learning_rate": 6.298896690070211e-05, + "loss": 2.4058, + "theoretical_loss": 3.330408960019837, + "tokens_seen": 2889358336 + }, + { + "epoch": 9.07, + "learning_rate": 6.297893681043129e-05, + "loss": 2.7017, + "theoretical_loss": 3.3304031076662053, + "tokens_seen": 2889423872 + }, + { + "epoch": 9.07, + "learning_rate": 6.296890672016049e-05, + "loss": 2.2997, + "theoretical_loss": 3.3303972554824774, + "tokens_seen": 2889489408 + }, + { + "epoch": 9.07, + "learning_rate": 6.295887662988967e-05, + "loss": 2.4587, + "theoretical_loss": 3.330391403468644, + "tokens_seen": 2889554944 + }, + { + "epoch": 9.07, + "learning_rate": 6.294884653961885e-05, + "loss": 2.6964, + "theoretical_loss": 3.3303855516246967, + "tokens_seen": 2889620480 + }, + { + "epoch": 9.07, + "learning_rate": 6.293881644934804e-05, + "loss": 2.3359, + "theoretical_loss": 3.330379699950627, + "tokens_seen": 2889686016 + }, + { + "epoch": 9.07, + "learning_rate": 6.292878635907722e-05, + "loss": 2.5284, + "theoretical_loss": 3.330373848446426, + "tokens_seen": 2889751552 + }, + { + "epoch": 9.07, + "learning_rate": 6.291875626880642e-05, + "loss": 2.4084, + "theoretical_loss": 3.3303679971120843, + "tokens_seen": 2889817088 + }, + { + "epoch": 9.07, + "learning_rate": 6.29087261785356e-05, + "loss": 2.3329, + "theoretical_loss": 3.330362145947594, + "tokens_seen": 2889882624 + }, + { + "epoch": 9.07, + "learning_rate": 6.28986960882648e-05, + "loss": 2.4144, + "theoretical_loss": 3.3303562949529457, + "tokens_seen": 2889948160 + }, + { + "epoch": 9.07, + "learning_rate": 6.288866599799398e-05, + "loss": 2.4324, + "theoretical_loss": 3.3303504441281313, + "tokens_seen": 2890013696 + }, + { + "epoch": 9.07, + "learning_rate": 6.287863590772317e-05, + "loss": 2.5212, + "theoretical_loss": 3.330344593473141, + "tokens_seen": 2890079232 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3169578, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.701667070388794, + "objective/train/theoretical_loss": 3.330340205593341, + "objective/train/tokens_used": 2910588384, + "theoretical_loss": 3.330340205593341, + "tokens_seen": 2890128384 + }, + { + "epoch": 9.07, + "learning_rate": 6.286860581745236e-05, + "loss": 2.5265, + "theoretical_loss": 3.3303387429879674, + "tokens_seen": 2890144768 + }, + { + "epoch": 9.07, + "learning_rate": 6.285857572718155e-05, + "loss": 2.4791, + "theoretical_loss": 3.3303328926726, + "tokens_seen": 2890210304 + }, + { + "epoch": 9.07, + "learning_rate": 6.284854563691074e-05, + "loss": 2.3908, + "theoretical_loss": 3.3303270425270313, + "tokens_seen": 2890275840 + }, + { + "epoch": 9.07, + "learning_rate": 6.283851554663992e-05, + "loss": 2.4094, + "theoretical_loss": 3.3303211925512524, + "tokens_seen": 2890341376 + }, + { + "epoch": 9.07, + "learning_rate": 6.282848545636912e-05, + "loss": 2.274, + "theoretical_loss": 3.330315342745254, + "tokens_seen": 2890406912 + }, + { + "epoch": 9.07, + "learning_rate": 6.28184553660983e-05, + "loss": 2.4913, + "theoretical_loss": 3.330309493109028, + "tokens_seen": 2890472448 + }, + { + "epoch": 9.07, + "learning_rate": 6.280842527582749e-05, + "loss": 2.3407, + "theoretical_loss": 3.330303643642565, + "tokens_seen": 2890537984 + }, + { + "epoch": 9.07, + "learning_rate": 6.279839518555667e-05, + "loss": 2.4615, + "theoretical_loss": 3.330297794345856, + "tokens_seen": 2890603520 + }, + { + "epoch": 9.07, + "learning_rate": 6.278836509528587e-05, + "loss": 2.391, + "theoretical_loss": 3.3302919452188933, + "tokens_seen": 2890669056 + }, + { + "epoch": 9.07, + "learning_rate": 6.277833500501505e-05, + "loss": 2.509, + "theoretical_loss": 3.330286096261667, + "tokens_seen": 2890734592 + }, + { + "epoch": 9.07, + "learning_rate": 6.276830491474424e-05, + "loss": 2.404, + "theoretical_loss": 3.3302802474741693, + "tokens_seen": 2890800128 + }, + { + "epoch": 9.07, + "learning_rate": 6.275827482447342e-05, + "loss": 2.2916, + "theoretical_loss": 3.3302743988563903, + "tokens_seen": 2890865664 + }, + { + "epoch": 9.07, + "learning_rate": 6.27482447342026e-05, + "loss": 2.3204, + "theoretical_loss": 3.3302685504083227, + "tokens_seen": 2890931200 + }, + { + "epoch": 9.07, + "learning_rate": 6.27382146439318e-05, + "loss": 2.2827, + "theoretical_loss": 3.3302627021299562, + "tokens_seen": 2890996736 + }, + { + "epoch": 9.07, + "learning_rate": 6.272818455366098e-05, + "loss": 2.4137, + "theoretical_loss": 3.3302568540212834, + "tokens_seen": 2891062272 + }, + { + "epoch": 9.07, + "learning_rate": 6.271815446339018e-05, + "loss": 2.4718, + "theoretical_loss": 3.3302510060822943, + "tokens_seen": 2891127808 + }, + { + "epoch": 9.07, + "learning_rate": 6.270812437311936e-05, + "loss": 2.4071, + "theoretical_loss": 3.330245158312981, + "tokens_seen": 2891193344 + }, + { + "epoch": 9.07, + "learning_rate": 6.269809428284855e-05, + "loss": 2.3999, + "theoretical_loss": 3.3302393107133343, + "tokens_seen": 2891258880 + }, + { + "epoch": 9.07, + "learning_rate": 6.268806419257773e-05, + "loss": 2.3854, + "theoretical_loss": 3.3302334632833457, + "tokens_seen": 2891324416 + }, + { + "epoch": 9.07, + "learning_rate": 6.267803410230693e-05, + "loss": 2.5526, + "theoretical_loss": 3.330227616023006, + "tokens_seen": 2891389952 + }, + { + "epoch": 9.07, + "learning_rate": 6.266800401203611e-05, + "loss": 2.5465, + "theoretical_loss": 3.330221768932307, + "tokens_seen": 2891455488 + }, + { + "epoch": 9.07, + "learning_rate": 6.265797392176529e-05, + "loss": 2.3876, + "theoretical_loss": 3.3302159220112397, + "tokens_seen": 2891521024 + }, + { + "epoch": 9.07, + "learning_rate": 6.264794383149448e-05, + "loss": 2.4539, + "theoretical_loss": 3.3302100752597954, + "tokens_seen": 2891586560 + }, + { + "epoch": 9.07, + "learning_rate": 6.263791374122367e-05, + "loss": 2.4152, + "theoretical_loss": 3.3302042286779647, + "tokens_seen": 2891652096 + }, + { + "epoch": 9.07, + "learning_rate": 6.262788365095286e-05, + "loss": 2.5599, + "theoretical_loss": 3.33019838226574, + "tokens_seen": 2891717632 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3170128, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2916345596313477, + "objective/train/theoretical_loss": 3.3301939975678696, + "objective/train/tokens_used": 2912226784, + "theoretical_loss": 3.3301939975678696, + "tokens_seen": 2891766784 + }, + { + "epoch": 9.07, + "learning_rate": 6.261785356068204e-05, + "loss": 2.3525, + "theoretical_loss": 3.3301925360231115, + "tokens_seen": 2891783168 + }, + { + "epoch": 9.07, + "learning_rate": 6.260782347041124e-05, + "loss": 2.4128, + "theoretical_loss": 3.330186689950071, + "tokens_seen": 2891848704 + }, + { + "epoch": 9.07, + "learning_rate": 6.259779338014042e-05, + "loss": 2.4114, + "theoretical_loss": 3.33018084404661, + "tokens_seen": 2891914240 + }, + { + "epoch": 9.07, + "learning_rate": 6.258776328986961e-05, + "loss": 2.4945, + "theoretical_loss": 3.3301749983127182, + "tokens_seen": 2891979776 + }, + { + "epoch": 9.07, + "learning_rate": 6.25777331995988e-05, + "loss": 2.1564, + "theoretical_loss": 3.3301691527483888, + "tokens_seen": 2892045312 + }, + { + "epoch": 9.07, + "learning_rate": 6.256770310932797e-05, + "loss": 2.4542, + "theoretical_loss": 3.330163307353612, + "tokens_seen": 2892110848 + }, + { + "epoch": 9.07, + "learning_rate": 6.255767301905717e-05, + "loss": 2.5137, + "theoretical_loss": 3.3301574621283794, + "tokens_seen": 2892176384 + }, + { + "epoch": 9.07, + "learning_rate": 6.254764292878635e-05, + "loss": 2.4448, + "theoretical_loss": 3.3301516170726817, + "tokens_seen": 2892241920 + }, + { + "epoch": 9.07, + "learning_rate": 6.253761283851554e-05, + "loss": 2.5755, + "theoretical_loss": 3.330145772186511, + "tokens_seen": 2892307456 + }, + { + "epoch": 9.07, + "learning_rate": 6.252758274824473e-05, + "loss": 2.3907, + "theoretical_loss": 3.3301399274698578, + "tokens_seen": 2892372992 + }, + { + "epoch": 9.07, + "learning_rate": 6.251755265797392e-05, + "loss": 2.3285, + "theoretical_loss": 3.3301340829227137, + "tokens_seen": 2892438528 + }, + { + "epoch": 9.07, + "learning_rate": 6.25075225677031e-05, + "loss": 2.2929, + "theoretical_loss": 3.3301282385450697, + "tokens_seen": 2892504064 + }, + { + "epoch": 9.07, + "learning_rate": 6.24974924774323e-05, + "loss": 2.5332, + "theoretical_loss": 3.330122394336917, + "tokens_seen": 2892569600 + }, + { + "epoch": 9.07, + "learning_rate": 6.248746238716149e-05, + "loss": 2.6279, + "theoretical_loss": 3.3301165502982473, + "tokens_seen": 2892635136 + }, + { + "epoch": 9.07, + "learning_rate": 6.247743229689067e-05, + "loss": 2.3998, + "theoretical_loss": 3.3301107064290516, + "tokens_seen": 2892700672 + }, + { + "epoch": 9.07, + "learning_rate": 6.246740220661985e-05, + "loss": 2.4477, + "theoretical_loss": 3.3301048627293213, + "tokens_seen": 2892766208 + }, + { + "epoch": 9.07, + "learning_rate": 6.245737211634905e-05, + "loss": 2.451, + "theoretical_loss": 3.3300990191990474, + "tokens_seen": 2892831744 + }, + { + "epoch": 9.07, + "learning_rate": 6.244734202607823e-05, + "loss": 2.2109, + "theoretical_loss": 3.330093175838221, + "tokens_seen": 2892897280 + }, + { + "epoch": 9.07, + "learning_rate": 6.243731193580742e-05, + "loss": 2.5827, + "theoretical_loss": 3.3300873326468334, + "tokens_seen": 2892962816 + }, + { + "epoch": 9.07, + "learning_rate": 6.24272818455366e-05, + "loss": 2.5365, + "theoretical_loss": 3.3300814896248765, + "tokens_seen": 2893028352 + }, + { + "epoch": 9.07, + "learning_rate": 6.24172517552658e-05, + "loss": 2.559, + "theoretical_loss": 3.3300756467723405, + "tokens_seen": 2893093888 + }, + { + "epoch": 9.07, + "learning_rate": 6.240722166499498e-05, + "loss": 2.5851, + "theoretical_loss": 3.330069804089218, + "tokens_seen": 2893159424 + }, + { + "epoch": 9.07, + "learning_rate": 6.239719157472418e-05, + "loss": 2.4748, + "theoretical_loss": 3.3300639615754988, + "tokens_seen": 2893224960 + }, + { + "epoch": 9.07, + "learning_rate": 6.238716148445337e-05, + "loss": 2.5398, + "theoretical_loss": 3.330058119231175, + "tokens_seen": 2893290496 + }, + { + "epoch": 9.07, + "learning_rate": 6.237713139418255e-05, + "loss": 2.3941, + "theoretical_loss": 3.3300522770562377, + "tokens_seen": 2893356032 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3171580, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8437039852142334, + "objective/train/theoretical_loss": 3.330047895536189, + "objective/train/tokens_used": 2913865184, + "theoretical_loss": 3.330047895536189, + "tokens_seen": 2893405184 + }, + { + "epoch": 9.07, + "learning_rate": 6.236710130391173e-05, + "loss": 2.4525, + "theoretical_loss": 3.330046435050678, + "tokens_seen": 2893421568 + }, + { + "epoch": 9.07, + "learning_rate": 6.235707121364093e-05, + "loss": 2.3613, + "theoretical_loss": 3.330040593214487, + "tokens_seen": 2893487104 + }, + { + "epoch": 9.07, + "learning_rate": 6.234704112337011e-05, + "loss": 2.4057, + "theoretical_loss": 3.330034751547657, + "tokens_seen": 2893552640 + }, + { + "epoch": 9.07, + "learning_rate": 6.23370110330993e-05, + "loss": 2.2033, + "theoretical_loss": 3.330028910050178, + "tokens_seen": 2893618176 + }, + { + "epoch": 9.07, + "learning_rate": 6.232698094282848e-05, + "loss": 2.3393, + "theoretical_loss": 3.330023068722042, + "tokens_seen": 2893683712 + }, + { + "epoch": 9.07, + "learning_rate": 6.231695085255768e-05, + "loss": 2.6296, + "theoretical_loss": 3.3300172275632396, + "tokens_seen": 2893749248 + }, + { + "epoch": 9.07, + "learning_rate": 6.230692076228686e-05, + "loss": 2.6424, + "theoretical_loss": 3.3300113865737626, + "tokens_seen": 2893814784 + }, + { + "epoch": 9.07, + "learning_rate": 6.229689067201605e-05, + "loss": 2.4173, + "theoretical_loss": 3.330005545753602, + "tokens_seen": 2893880320 + }, + { + "epoch": 9.07, + "learning_rate": 6.228686058174524e-05, + "loss": 2.4357, + "theoretical_loss": 3.3299997051027495, + "tokens_seen": 2893945856 + }, + { + "epoch": 9.07, + "learning_rate": 6.227683049147442e-05, + "loss": 2.5212, + "theoretical_loss": 3.329993864621196, + "tokens_seen": 2894011392 + }, + { + "epoch": 9.07, + "learning_rate": 6.226680040120361e-05, + "loss": 2.5522, + "theoretical_loss": 3.329988024308933, + "tokens_seen": 2894076928 + }, + { + "epoch": 9.07, + "learning_rate": 6.225677031093279e-05, + "loss": 2.5049, + "theoretical_loss": 3.3299821841659507, + "tokens_seen": 2894142464 + }, + { + "epoch": 9.07, + "learning_rate": 6.224674022066199e-05, + "loss": 2.3214, + "theoretical_loss": 3.329976344192242, + "tokens_seen": 2894208000 + }, + { + "epoch": 9.07, + "learning_rate": 6.223671013039117e-05, + "loss": 2.3938, + "theoretical_loss": 3.3299705043877967, + "tokens_seen": 2894273536 + }, + { + "epoch": 9.07, + "learning_rate": 6.222668004012036e-05, + "loss": 2.445, + "theoretical_loss": 3.329964664752607, + "tokens_seen": 2894339072 + }, + { + "epoch": 9.07, + "learning_rate": 6.221664994984956e-05, + "loss": 2.3797, + "theoretical_loss": 3.3299588252866643, + "tokens_seen": 2894404608 + }, + { + "epoch": 9.07, + "learning_rate": 6.220661985957874e-05, + "loss": 2.416, + "theoretical_loss": 3.329952985989959, + "tokens_seen": 2894470144 + }, + { + "epoch": 9.07, + "learning_rate": 6.219658976930793e-05, + "loss": 2.5841, + "theoretical_loss": 3.329947146862483, + "tokens_seen": 2894535680 + }, + { + "epoch": 9.07, + "learning_rate": 6.218655967903711e-05, + "loss": 2.5312, + "theoretical_loss": 3.3299413079042273, + "tokens_seen": 2894601216 + }, + { + "epoch": 9.07, + "learning_rate": 6.21765295887663e-05, + "loss": 2.2988, + "theoretical_loss": 3.329935469115183, + "tokens_seen": 2894666752 + }, + { + "epoch": 9.07, + "learning_rate": 6.216649949849549e-05, + "loss": 2.467, + "theoretical_loss": 3.329929630495342, + "tokens_seen": 2894732288 + }, + { + "epoch": 9.07, + "learning_rate": 6.215646940822467e-05, + "loss": 2.4634, + "theoretical_loss": 3.3299237920446947, + "tokens_seen": 2894797824 + }, + { + "epoch": 9.07, + "learning_rate": 6.214643931795387e-05, + "loss": 2.5023, + "theoretical_loss": 3.3299179537632333, + "tokens_seen": 2894863360 + }, + { + "epoch": 9.07, + "learning_rate": 6.213640922768305e-05, + "loss": 2.599, + "theoretical_loss": 3.3299121156509486, + "tokens_seen": 2894928896 + }, + { + "epoch": 9.07, + "learning_rate": 6.212637913741224e-05, + "loss": 2.2438, + "theoretical_loss": 3.3299062777078317, + "tokens_seen": 2894994432 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3172290, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6508660316467285, + "objective/train/theoretical_loss": 3.3299018993615053, + "objective/train/tokens_used": 2915503584, + "theoretical_loss": 3.3299018993615053, + "tokens_seen": 2895043584 + }, + { + "epoch": 9.07, + "learning_rate": 6.211634904714142e-05, + "loss": 2.5646, + "theoretical_loss": 3.329900439933874, + "tokens_seen": 2895059968 + }, + { + "epoch": 9.07, + "learning_rate": 6.210631895687062e-05, + "loss": 2.232, + "theoretical_loss": 3.329894602329067, + "tokens_seen": 2895125504 + }, + { + "epoch": 9.07, + "learning_rate": 6.20962888665998e-05, + "loss": 2.2096, + "theoretical_loss": 3.3298887648934015, + "tokens_seen": 2895191040 + }, + { + "epoch": 9.07, + "learning_rate": 6.2086258776329e-05, + "loss": 2.3148, + "theoretical_loss": 3.329882927626869, + "tokens_seen": 2895256576 + }, + { + "epoch": 9.07, + "learning_rate": 6.207622868605817e-05, + "loss": 2.5211, + "theoretical_loss": 3.3298770905294615, + "tokens_seen": 2895322112 + }, + { + "epoch": 9.07, + "learning_rate": 6.206619859578736e-05, + "loss": 2.4092, + "theoretical_loss": 3.329871253601169, + "tokens_seen": 2895387648 + }, + { + "epoch": 9.07, + "learning_rate": 6.205616850551655e-05, + "loss": 2.4765, + "theoretical_loss": 3.3298654168419834, + "tokens_seen": 2895453184 + }, + { + "epoch": 9.07, + "learning_rate": 6.204613841524573e-05, + "loss": 2.383, + "theoretical_loss": 3.329859580251896, + "tokens_seen": 2895518720 + }, + { + "epoch": 9.07, + "learning_rate": 6.203610832497493e-05, + "loss": 2.4401, + "theoretical_loss": 3.329853743830898, + "tokens_seen": 2895584256 + }, + { + "epoch": 9.07, + "learning_rate": 6.202607823470412e-05, + "loss": 2.358, + "theoretical_loss": 3.329847907578981, + "tokens_seen": 2895649792 + }, + { + "epoch": 9.07, + "learning_rate": 6.20160481444333e-05, + "loss": 2.4323, + "theoretical_loss": 3.329842071496136, + "tokens_seen": 2895715328 + }, + { + "epoch": 9.07, + "learning_rate": 6.20060180541625e-05, + "loss": 2.5192, + "theoretical_loss": 3.329836235582354, + "tokens_seen": 2895780864 + }, + { + "epoch": 9.07, + "learning_rate": 6.199598796389168e-05, + "loss": 2.5782, + "theoretical_loss": 3.3298303998376264, + "tokens_seen": 2895846400 + }, + { + "epoch": 9.07, + "learning_rate": 6.198595787362086e-05, + "loss": 2.5453, + "theoretical_loss": 3.3298245642619446, + "tokens_seen": 2895911936 + }, + { + "epoch": 9.07, + "learning_rate": 6.197592778335005e-05, + "loss": 2.3898, + "theoretical_loss": 3.3298187288553, + "tokens_seen": 2895977472 + }, + { + "epoch": 9.07, + "learning_rate": 6.196589769307923e-05, + "loss": 2.5557, + "theoretical_loss": 3.3298128936176834, + "tokens_seen": 2896043008 + }, + { + "epoch": 9.07, + "learning_rate": 6.195586760280843e-05, + "loss": 2.3628, + "theoretical_loss": 3.329807058549087, + "tokens_seen": 2896108544 + }, + { + "epoch": 9.07, + "learning_rate": 6.194583751253761e-05, + "loss": 2.4414, + "theoretical_loss": 3.3298012236495014, + "tokens_seen": 2896174080 + }, + { + "epoch": 9.07, + "learning_rate": 6.19358074222668e-05, + "loss": 2.4585, + "theoretical_loss": 3.3297953889189174, + "tokens_seen": 2896239616 + }, + { + "epoch": 9.07, + "learning_rate": 6.192577733199599e-05, + "loss": 2.4599, + "theoretical_loss": 3.3297895543573275, + "tokens_seen": 2896305152 + }, + { + "epoch": 9.07, + "learning_rate": 6.191574724172518e-05, + "loss": 2.4121, + "theoretical_loss": 3.3297837199647224, + "tokens_seen": 2896370688 + }, + { + "epoch": 9.07, + "learning_rate": 6.190571715145436e-05, + "loss": 2.3598, + "theoretical_loss": 3.329777885741093, + "tokens_seen": 2896436224 + }, + { + "epoch": 9.07, + "learning_rate": 6.189568706118356e-05, + "loss": 2.3445, + "theoretical_loss": 3.329772051686431, + "tokens_seen": 2896501760 + }, + { + "epoch": 9.07, + "learning_rate": 6.188565697091274e-05, + "loss": 2.3619, + "theoretical_loss": 3.3297662178007275, + "tokens_seen": 2896567296 + }, + { + "epoch": 9.07, + "learning_rate": 6.187562688064192e-05, + "loss": 2.4112, + "theoretical_loss": 3.3297603840839742, + "tokens_seen": 2896632832 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3173597, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2258427143096924, + "objective/train/theoretical_loss": 3.3297560089072773, + "objective/train/tokens_used": 2917141984, + "theoretical_loss": 3.3297560089072773, + "tokens_seen": 2896681984 + }, + { + "epoch": 9.07, + "learning_rate": 6.186559679037111e-05, + "loss": 2.4295, + "theoretical_loss": 3.329754550536162, + "tokens_seen": 2896698368 + }, + { + "epoch": 9.07, + "learning_rate": 6.18555667001003e-05, + "loss": 2.3815, + "theoretical_loss": 3.329748717157282, + "tokens_seen": 2896763904 + }, + { + "epoch": 9.07, + "learning_rate": 6.184553660982949e-05, + "loss": 2.2586, + "theoretical_loss": 3.329742883947326, + "tokens_seen": 2896829440 + }, + { + "epoch": 9.07, + "learning_rate": 6.183550651955868e-05, + "loss": 2.3454, + "theoretical_loss": 3.3297370509062847, + "tokens_seen": 2896894976 + }, + { + "epoch": 9.07, + "learning_rate": 6.182547642928787e-05, + "loss": 2.4437, + "theoretical_loss": 3.32973121803415, + "tokens_seen": 2896960512 + }, + { + "epoch": 9.07, + "learning_rate": 6.181544633901706e-05, + "loss": 2.4624, + "theoretical_loss": 3.329725385330913, + "tokens_seen": 2897026048 + }, + { + "epoch": 9.07, + "learning_rate": 6.180541624874624e-05, + "loss": 2.4642, + "theoretical_loss": 3.3297195527965644, + "tokens_seen": 2897091584 + }, + { + "epoch": 9.07, + "learning_rate": 6.179538615847544e-05, + "loss": 2.5207, + "theoretical_loss": 3.3297137204310965, + "tokens_seen": 2897157120 + }, + { + "epoch": 9.07, + "learning_rate": 6.178535606820462e-05, + "loss": 2.4349, + "theoretical_loss": 3.3297078882345, + "tokens_seen": 2897222656 + }, + { + "epoch": 9.07, + "learning_rate": 6.17753259779338e-05, + "loss": 2.4606, + "theoretical_loss": 3.329702056206766, + "tokens_seen": 2897288192 + }, + { + "epoch": 9.07, + "learning_rate": 6.176529588766299e-05, + "loss": 2.2376, + "theoretical_loss": 3.3296962243478863, + "tokens_seen": 2897353728 + }, + { + "epoch": 9.07, + "learning_rate": 6.175526579739217e-05, + "loss": 2.3956, + "theoretical_loss": 3.329690392657852, + "tokens_seen": 2897419264 + }, + { + "epoch": 9.07, + "learning_rate": 6.174523570712137e-05, + "loss": 2.4358, + "theoretical_loss": 3.329684561136654, + "tokens_seen": 2897484800 + }, + { + "epoch": 9.07, + "learning_rate": 6.173520561685055e-05, + "loss": 2.3953, + "theoretical_loss": 3.329678729784284, + "tokens_seen": 2897550336 + }, + { + "epoch": 9.07, + "learning_rate": 6.172517552657974e-05, + "loss": 2.225, + "theoretical_loss": 3.3296728986007333, + "tokens_seen": 2897615872 + }, + { + "epoch": 9.07, + "learning_rate": 6.171514543630893e-05, + "loss": 2.4473, + "theoretical_loss": 3.3296670675859934, + "tokens_seen": 2897681408 + }, + { + "epoch": 9.07, + "learning_rate": 6.170511534603812e-05, + "loss": 2.3496, + "theoretical_loss": 3.329661236740055, + "tokens_seen": 2897746944 + }, + { + "epoch": 9.07, + "learning_rate": 6.16950852557673e-05, + "loss": 2.3467, + "theoretical_loss": 3.3296554060629098, + "tokens_seen": 2897812480 + }, + { + "epoch": 9.07, + "learning_rate": 6.168505516549648e-05, + "loss": 2.6116, + "theoretical_loss": 3.329649575554549, + "tokens_seen": 2897878016 + }, + { + "epoch": 9.07, + "learning_rate": 6.167502507522568e-05, + "loss": 2.3001, + "theoretical_loss": 3.3296437452149643, + "tokens_seen": 2897943552 + }, + { + "epoch": 9.07, + "learning_rate": 6.166499498495486e-05, + "loss": 2.4106, + "theoretical_loss": 3.329637915044146, + "tokens_seen": 2898009088 + }, + { + "epoch": 9.07, + "learning_rate": 6.165496489468405e-05, + "loss": 2.4971, + "theoretical_loss": 3.3296320850420864, + "tokens_seen": 2898074624 + }, + { + "epoch": 9.07, + "learning_rate": 6.164493480441325e-05, + "loss": 2.2364, + "theoretical_loss": 3.329626255208776, + "tokens_seen": 2898140160 + }, + { + "epoch": 9.07, + "learning_rate": 6.163490471414243e-05, + "loss": 2.5337, + "theoretical_loss": 3.329620425544207, + "tokens_seen": 2898205696 + }, + { + "epoch": 9.07, + "learning_rate": 6.162487462387162e-05, + "loss": 2.3719, + "theoretical_loss": 3.32961459604837, + "tokens_seen": 2898271232 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3173991, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.439528703689575, + "objective/train/theoretical_loss": 3.3296102240372174, + "objective/train/tokens_used": 2918780384, + "theoretical_loss": 3.3296102240372174, + "tokens_seen": 2898320384 + }, + { + "epoch": 9.07, + "learning_rate": 6.16148445336008e-05, + "loss": 2.4844, + "theoretical_loss": 3.329608766721256, + "tokens_seen": 2898336768 + }, + { + "epoch": 9.07, + "learning_rate": 6.160481444333e-05, + "loss": 2.1354, + "theoretical_loss": 3.3296029375628575, + "tokens_seen": 2898402304 + }, + { + "epoch": 9.07, + "learning_rate": 6.159478435305918e-05, + "loss": 2.4934, + "theoretical_loss": 3.329597108573165, + "tokens_seen": 2898467840 + }, + { + "epoch": 9.07, + "learning_rate": 6.158475426278836e-05, + "loss": 2.4927, + "theoretical_loss": 3.3295912797521696, + "tokens_seen": 2898533376 + }, + { + "epoch": 9.07, + "learning_rate": 6.157472417251756e-05, + "loss": 2.6583, + "theoretical_loss": 3.329585451099863, + "tokens_seen": 2898598912 + }, + { + "epoch": 9.07, + "learning_rate": 6.156469408224674e-05, + "loss": 2.4881, + "theoretical_loss": 3.3295796226162366, + "tokens_seen": 2898664448 + }, + { + "epoch": 9.07, + "learning_rate": 6.155466399197593e-05, + "loss": 2.4411, + "theoretical_loss": 3.3295737943012815, + "tokens_seen": 2898729984 + }, + { + "epoch": 9.07, + "learning_rate": 6.154463390170511e-05, + "loss": 2.5919, + "theoretical_loss": 3.329567966154989, + "tokens_seen": 2898795520 + }, + { + "epoch": 9.07, + "learning_rate": 6.153460381143431e-05, + "loss": 2.3754, + "theoretical_loss": 3.3295621381773506, + "tokens_seen": 2898861056 + }, + { + "epoch": 9.07, + "learning_rate": 6.152457372116349e-05, + "loss": 2.5933, + "theoretical_loss": 3.329556310368357, + "tokens_seen": 2898926592 + }, + { + "epoch": 9.07, + "learning_rate": 6.151454363089268e-05, + "loss": 2.4038, + "theoretical_loss": 3.329550482728, + "tokens_seen": 2898992128 + }, + { + "epoch": 9.07, + "learning_rate": 6.150451354062186e-05, + "loss": 2.4749, + "theoretical_loss": 3.3295446552562713, + "tokens_seen": 2899057664 + }, + { + "epoch": 9.07, + "learning_rate": 6.149448345035105e-05, + "loss": 2.459, + "theoretical_loss": 3.3295388279531615, + "tokens_seen": 2899123200 + }, + { + "epoch": 9.07, + "learning_rate": 6.148445336008024e-05, + "loss": 2.5021, + "theoretical_loss": 3.329533000818662, + "tokens_seen": 2899188736 + }, + { + "epoch": 9.07, + "learning_rate": 6.147442326980942e-05, + "loss": 2.2734, + "theoretical_loss": 3.3295271738527648, + "tokens_seen": 2899254272 + }, + { + "epoch": 9.07, + "learning_rate": 6.146439317953862e-05, + "loss": 2.4604, + "theoretical_loss": 3.32952134705546, + "tokens_seen": 2899319808 + }, + { + "epoch": 9.07, + "learning_rate": 6.145436308926781e-05, + "loss": 2.4665, + "theoretical_loss": 3.32951552042674, + "tokens_seen": 2899385344 + }, + { + "epoch": 9.07, + "learning_rate": 6.144433299899699e-05, + "loss": 2.39, + "theoretical_loss": 3.329509693966596, + "tokens_seen": 2899450880 + }, + { + "epoch": 9.07, + "learning_rate": 6.143430290872619e-05, + "loss": 2.3322, + "theoretical_loss": 3.329503867675019, + "tokens_seen": 2899516416 + }, + { + "epoch": 9.07, + "learning_rate": 6.142427281845537e-05, + "loss": 2.4247, + "theoretical_loss": 3.3294980415519997, + "tokens_seen": 2899581952 + }, + { + "epoch": 9.07, + "learning_rate": 6.141424272818456e-05, + "loss": 2.4677, + "theoretical_loss": 3.3294922155975306, + "tokens_seen": 2899647488 + }, + { + "epoch": 9.07, + "learning_rate": 6.140421263791374e-05, + "loss": 2.3491, + "theoretical_loss": 3.3294863898116023, + "tokens_seen": 2899713024 + }, + { + "epoch": 9.07, + "learning_rate": 6.139418254764292e-05, + "loss": 2.6223, + "theoretical_loss": 3.3294805641942062, + "tokens_seen": 2899778560 + }, + { + "epoch": 9.07, + "learning_rate": 6.138415245737212e-05, + "loss": 2.5528, + "theoretical_loss": 3.3294747387453336, + "tokens_seen": 2899844096 + }, + { + "epoch": 9.07, + "learning_rate": 6.13741223671013e-05, + "loss": 2.4694, + "theoretical_loss": 3.3294689134649764, + "tokens_seen": 2899909632 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3175284, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3615097999572754, + "objective/train/theoretical_loss": 3.3294645446152913, + "objective/train/tokens_used": 2920418784, + "theoretical_loss": 3.3294645446152913, + "tokens_seen": 2899958784 + }, + { + "epoch": 9.07, + "learning_rate": 6.13640922768305e-05, + "loss": 2.3717, + "theoretical_loss": 3.3294630883531253, + "tokens_seen": 2899975168 + }, + { + "epoch": 9.07, + "learning_rate": 6.135406218655968e-05, + "loss": 2.403, + "theoretical_loss": 3.3294572634097714, + "tokens_seen": 2900040704 + }, + { + "epoch": 9.07, + "learning_rate": 6.134403209628887e-05, + "loss": 2.6048, + "theoretical_loss": 3.329451438634907, + "tokens_seen": 2900106240 + }, + { + "epoch": 9.07, + "learning_rate": 6.133400200601805e-05, + "loss": 2.3188, + "theoretical_loss": 3.329445614028522, + "tokens_seen": 2900171776 + }, + { + "epoch": 9.07, + "learning_rate": 6.132397191574725e-05, + "loss": 2.5303, + "theoretical_loss": 3.329439789590609, + "tokens_seen": 2900237312 + }, + { + "epoch": 9.07, + "learning_rate": 6.131394182547643e-05, + "loss": 2.2499, + "theoretical_loss": 3.329433965321159, + "tokens_seen": 2900302848 + }, + { + "epoch": 9.07, + "learning_rate": 6.130391173520561e-05, + "loss": 2.4942, + "theoretical_loss": 3.3294281412201627, + "tokens_seen": 2900368384 + }, + { + "epoch": 9.07, + "learning_rate": 6.12938816449348e-05, + "loss": 2.383, + "theoretical_loss": 3.3294223172876123, + "tokens_seen": 2900433920 + }, + { + "epoch": 9.07, + "learning_rate": 6.128385155466398e-05, + "loss": 2.4627, + "theoretical_loss": 3.329416493523498, + "tokens_seen": 2900499456 + }, + { + "epoch": 9.07, + "learning_rate": 6.127382146439318e-05, + "loss": 2.4453, + "theoretical_loss": 3.3294106699278125, + "tokens_seen": 2900564992 + }, + { + "epoch": 9.07, + "learning_rate": 6.126379137412237e-05, + "loss": 2.4201, + "theoretical_loss": 3.3294048465005464, + "tokens_seen": 2900630528 + }, + { + "epoch": 9.07, + "learning_rate": 6.125376128385156e-05, + "loss": 2.5341, + "theoretical_loss": 3.329399023241691, + "tokens_seen": 2900696064 + }, + { + "epoch": 9.07, + "learning_rate": 6.124373119358075e-05, + "loss": 2.4928, + "theoretical_loss": 3.3293932001512374, + "tokens_seen": 2900761600 + }, + { + "epoch": 9.07, + "learning_rate": 6.123370110330993e-05, + "loss": 2.5143, + "theoretical_loss": 3.3293873772291773, + "tokens_seen": 2900827136 + }, + { + "epoch": 9.07, + "learning_rate": 6.122367101303913e-05, + "loss": 2.533, + "theoretical_loss": 3.3293815544755025, + "tokens_seen": 2900892672 + }, + { + "epoch": 9.07, + "learning_rate": 6.12136409227683e-05, + "loss": 2.4609, + "theoretical_loss": 3.3293757318902033, + "tokens_seen": 2900958208 + }, + { + "epoch": 9.07, + "learning_rate": 6.120361083249749e-05, + "loss": 2.3463, + "theoretical_loss": 3.3293699094732716, + "tokens_seen": 2901023744 + }, + { + "epoch": 9.07, + "learning_rate": 6.119358074222668e-05, + "loss": 2.3158, + "theoretical_loss": 3.3293640872246986, + "tokens_seen": 2901089280 + }, + { + "epoch": 9.07, + "learning_rate": 6.118355065195586e-05, + "loss": 2.2956, + "theoretical_loss": 3.3293582651444753, + "tokens_seen": 2901154816 + }, + { + "epoch": 9.07, + "learning_rate": 6.117352056168506e-05, + "loss": 2.5509, + "theoretical_loss": 3.329352443232594, + "tokens_seen": 2901220352 + }, + { + "epoch": 9.07, + "learning_rate": 6.116349047141424e-05, + "loss": 2.4414, + "theoretical_loss": 3.3293466214890453, + "tokens_seen": 2901285888 + }, + { + "epoch": 9.07, + "learning_rate": 6.115346038114343e-05, + "loss": 2.3498, + "theoretical_loss": 3.3293407999138203, + "tokens_seen": 2901351424 + }, + { + "epoch": 9.07, + "learning_rate": 6.114343029087262e-05, + "loss": 2.3638, + "theoretical_loss": 3.329334978506911, + "tokens_seen": 2901416960 + }, + { + "epoch": 9.07, + "learning_rate": 6.113340020060181e-05, + "loss": 2.3424, + "theoretical_loss": 3.329329157268308, + "tokens_seen": 2901482496 + }, + { + "epoch": 9.07, + "learning_rate": 6.112337011033099e-05, + "loss": 2.3908, + "theoretical_loss": 3.3293233361980032, + "tokens_seen": 2901548032 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3175858, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7570996284484863, + "objective/train/theoretical_loss": 3.3293189705057156, + "objective/train/tokens_used": 2922057184, + "theoretical_loss": 3.3293189705057156, + "tokens_seen": 2901597184 + }, + { + "epoch": 9.07, + "learning_rate": 6.111334002006019e-05, + "loss": 2.4188, + "theoretical_loss": 3.329317515295988, + "tokens_seen": 2901613568 + }, + { + "epoch": 9.07, + "learning_rate": 6.110330992978937e-05, + "loss": 2.2831, + "theoretical_loss": 3.3293116945622536, + "tokens_seen": 2901679104 + }, + { + "epoch": 9.07, + "learning_rate": 6.109327983951855e-05, + "loss": 2.3658, + "theoretical_loss": 3.329305873996791, + "tokens_seen": 2901744640 + }, + { + "epoch": 9.07, + "learning_rate": 6.108324974924774e-05, + "loss": 2.4462, + "theoretical_loss": 3.329300053599592, + "tokens_seen": 2901810176 + }, + { + "epoch": 9.07, + "learning_rate": 6.107321965897694e-05, + "loss": 2.5108, + "theoretical_loss": 3.3292942333706477, + "tokens_seen": 2901875712 + }, + { + "epoch": 9.07, + "learning_rate": 6.106318956870612e-05, + "loss": 2.5717, + "theoretical_loss": 3.329288413309949, + "tokens_seen": 2901941248 + }, + { + "epoch": 9.07, + "learning_rate": 6.105315947843531e-05, + "loss": 2.4831, + "theoretical_loss": 3.329282593417488, + "tokens_seen": 2902006784 + }, + { + "epoch": 9.07, + "learning_rate": 6.10431293881645e-05, + "loss": 2.4726, + "theoretical_loss": 3.329276773693256, + "tokens_seen": 2902072320 + }, + { + "epoch": 9.07, + "learning_rate": 6.103309929789368e-05, + "loss": 2.4899, + "theoretical_loss": 3.3292709541372436, + "tokens_seen": 2902137856 + }, + { + "epoch": 9.07, + "learning_rate": 6.102306920762287e-05, + "loss": 2.4292, + "theoretical_loss": 3.329265134749443, + "tokens_seen": 2902203392 + }, + { + "epoch": 9.07, + "learning_rate": 6.101303911735206e-05, + "loss": 2.3318, + "theoretical_loss": 3.329259315529845, + "tokens_seen": 2902268928 + }, + { + "epoch": 9.07, + "learning_rate": 6.1003009027081246e-05, + "loss": 2.3978, + "theoretical_loss": 3.329253496478441, + "tokens_seen": 2902334464 + }, + { + "epoch": 9.07, + "learning_rate": 6.0992978936810434e-05, + "loss": 2.4596, + "theoretical_loss": 3.329247677595222, + "tokens_seen": 2902400000 + }, + { + "epoch": 9.07, + "learning_rate": 6.098294884653962e-05, + "loss": 2.6298, + "theoretical_loss": 3.3292418588801804, + "tokens_seen": 2902465536 + }, + { + "epoch": 9.07, + "learning_rate": 6.097291875626881e-05, + "loss": 2.2436, + "theoretical_loss": 3.329236040333307, + "tokens_seen": 2902531072 + }, + { + "epoch": 9.07, + "learning_rate": 6.096288866599799e-05, + "loss": 2.4176, + "theoretical_loss": 3.329230221954593, + "tokens_seen": 2902596608 + }, + { + "epoch": 9.07, + "learning_rate": 6.095285857572718e-05, + "loss": 2.544, + "theoretical_loss": 3.3292244037440293, + "tokens_seen": 2902662144 + }, + { + "epoch": 9.07, + "learning_rate": 6.0942828485456366e-05, + "loss": 2.5, + "theoretical_loss": 3.329218585701608, + "tokens_seen": 2902727680 + }, + { + "epoch": 9.07, + "learning_rate": 6.0932798395185554e-05, + "loss": 2.3278, + "theoretical_loss": 3.32921276782732, + "tokens_seen": 2902793216 + }, + { + "epoch": 9.07, + "learning_rate": 6.092276830491474e-05, + "loss": 2.509, + "theoretical_loss": 3.3292069501211574, + "tokens_seen": 2902858752 + }, + { + "epoch": 9.07, + "learning_rate": 6.091273821464393e-05, + "loss": 2.2189, + "theoretical_loss": 3.3292011325831106, + "tokens_seen": 2902924288 + }, + { + "epoch": 9.07, + "learning_rate": 6.0902708124373125e-05, + "loss": 2.6563, + "theoretical_loss": 3.3291953152131715, + "tokens_seen": 2902989824 + }, + { + "epoch": 9.07, + "learning_rate": 6.089267803410231e-05, + "loss": 2.5138, + "theoretical_loss": 3.329189498011331, + "tokens_seen": 2903055360 + }, + { + "epoch": 9.07, + "learning_rate": 6.08826479438315e-05, + "loss": 2.4773, + "theoretical_loss": 3.3291836809775806, + "tokens_seen": 2903120896 + }, + { + "epoch": 9.07, + "learning_rate": 6.087261785356069e-05, + "loss": 2.4827, + "theoretical_loss": 3.3291778641119123, + "tokens_seen": 2903186432 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3176945, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.489175796508789, + "objective/train/theoretical_loss": 3.329173501572959, + "objective/train/tokens_used": 2923695584, + "theoretical_loss": 3.329173501572959, + "tokens_seen": 2903235584 + }, + { + "epoch": 9.07, + "learning_rate": 6.086258776328987e-05, + "loss": 2.4279, + "theoretical_loss": 3.3291720474143163, + "tokens_seen": 2903251968 + }, + { + "epoch": 9.07, + "learning_rate": 6.085255767301906e-05, + "loss": 2.3663, + "theoretical_loss": 3.329166230884785, + "tokens_seen": 2903317504 + }, + { + "epoch": 9.07, + "learning_rate": 6.0842527582748245e-05, + "loss": 2.258, + "theoretical_loss": 3.329160414523309, + "tokens_seen": 2903383040 + }, + { + "epoch": 9.07, + "learning_rate": 6.083249749247743e-05, + "loss": 2.4679, + "theoretical_loss": 3.3291545983298803, + "tokens_seen": 2903448576 + }, + { + "epoch": 9.07, + "learning_rate": 6.082246740220662e-05, + "loss": 2.3197, + "theoretical_loss": 3.3291487823044896, + "tokens_seen": 2903514112 + }, + { + "epoch": 9.07, + "learning_rate": 6.081243731193581e-05, + "loss": 2.5264, + "theoretical_loss": 3.329142966447129, + "tokens_seen": 2903579648 + }, + { + "epoch": 9.07, + "learning_rate": 6.0802407221665e-05, + "loss": 2.6898, + "theoretical_loss": 3.329137150757789, + "tokens_seen": 2903645184 + }, + { + "epoch": 9.07, + "learning_rate": 6.0792377131394185e-05, + "loss": 2.4411, + "theoretical_loss": 3.329131335236462, + "tokens_seen": 2903710720 + }, + { + "epoch": 9.07, + "learning_rate": 6.078234704112337e-05, + "loss": 2.2743, + "theoretical_loss": 3.3291255198831378, + "tokens_seen": 2903776256 + }, + { + "epoch": 9.07, + "learning_rate": 6.077231695085256e-05, + "loss": 2.505, + "theoretical_loss": 3.3291197046978094, + "tokens_seen": 2903841792 + }, + { + "epoch": 9.07, + "learning_rate": 6.076228686058174e-05, + "loss": 2.3671, + "theoretical_loss": 3.329113889680467, + "tokens_seen": 2903907328 + }, + { + "epoch": 9.07, + "learning_rate": 6.075225677031093e-05, + "loss": 2.7271, + "theoretical_loss": 3.329108074831103, + "tokens_seen": 2903972864 + }, + { + "epoch": 9.07, + "learning_rate": 6.074222668004012e-05, + "loss": 2.3596, + "theoretical_loss": 3.329102260149708, + "tokens_seen": 2904038400 + }, + { + "epoch": 9.07, + "learning_rate": 6.0732196589769305e-05, + "loss": 2.2941, + "theoretical_loss": 3.329096445636273, + "tokens_seen": 2904103936 + }, + { + "epoch": 9.07, + "learning_rate": 6.072216649949849e-05, + "loss": 2.4536, + "theoretical_loss": 3.32909063129079, + "tokens_seen": 2904169472 + }, + { + "epoch": 9.07, + "learning_rate": 6.071213640922769e-05, + "loss": 2.6015, + "theoretical_loss": 3.3290848171132503, + "tokens_seen": 2904235008 + }, + { + "epoch": 9.07, + "learning_rate": 6.0702106318956876e-05, + "loss": 2.5071, + "theoretical_loss": 3.3290790031036455, + "tokens_seen": 2904300544 + }, + { + "epoch": 9.07, + "learning_rate": 6.0692076228686064e-05, + "loss": 2.4556, + "theoretical_loss": 3.3290731892619663, + "tokens_seen": 2904366080 + }, + { + "epoch": 9.07, + "learning_rate": 6.068204613841525e-05, + "loss": 2.5503, + "theoretical_loss": 3.3290673755882048, + "tokens_seen": 2904431616 + }, + { + "epoch": 9.07, + "learning_rate": 6.067201604814444e-05, + "loss": 2.4604, + "theoretical_loss": 3.3290615620823516, + "tokens_seen": 2904497152 + }, + { + "epoch": 9.07, + "learning_rate": 6.066198595787362e-05, + "loss": 2.5995, + "theoretical_loss": 3.3290557487443984, + "tokens_seen": 2904562688 + }, + { + "epoch": 9.07, + "learning_rate": 6.065195586760281e-05, + "loss": 2.4066, + "theoretical_loss": 3.329049935574337, + "tokens_seen": 2904628224 + }, + { + "epoch": 9.07, + "learning_rate": 6.0641925777331996e-05, + "loss": 2.4131, + "theoretical_loss": 3.3290441225721583, + "tokens_seen": 2904693760 + }, + { + "epoch": 9.07, + "learning_rate": 6.0631895687061184e-05, + "loss": 2.3585, + "theoretical_loss": 3.3290383097378533, + "tokens_seen": 2904759296 + }, + { + "epoch": 9.07, + "learning_rate": 6.062186559679037e-05, + "loss": 2.6653, + "theoretical_loss": 3.329032497071414, + "tokens_seen": 2904824832 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3177664, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5481338500976562, + "objective/train/theoretical_loss": 3.329028137681741, + "objective/train/tokens_used": 2925333984, + "theoretical_loss": 3.329028137681741, + "tokens_seen": 2904873984 + }, + { + "epoch": 9.07, + "learning_rate": 6.061183550651956e-05, + "loss": 2.454, + "theoretical_loss": 3.3290266845728316, + "tokens_seen": 2904890368 + }, + { + "epoch": 9.07, + "learning_rate": 6.060180541624875e-05, + "loss": 2.2843, + "theoretical_loss": 3.3290208722420975, + "tokens_seen": 2904955904 + }, + { + "epoch": 9.07, + "learning_rate": 6.0591775325977936e-05, + "loss": 2.2749, + "theoretical_loss": 3.329015060079203, + "tokens_seen": 2905021440 + }, + { + "epoch": 9.07, + "learning_rate": 6.0581745235707124e-05, + "loss": 2.4602, + "theoretical_loss": 3.329009248084139, + "tokens_seen": 2905086976 + }, + { + "epoch": 9.07, + "learning_rate": 6.0571715145436305e-05, + "loss": 2.6142, + "theoretical_loss": 3.329003436256898, + "tokens_seen": 2905152512 + }, + { + "epoch": 9.07, + "learning_rate": 6.056168505516549e-05, + "loss": 2.5398, + "theoretical_loss": 3.32899762459747, + "tokens_seen": 2905218048 + }, + { + "epoch": 9.07, + "learning_rate": 6.055165496489468e-05, + "loss": 2.5177, + "theoretical_loss": 3.328991813105848, + "tokens_seen": 2905283584 + }, + { + "epoch": 9.07, + "learning_rate": 6.054162487462387e-05, + "loss": 2.4972, + "theoretical_loss": 3.3289860017820216, + "tokens_seen": 2905349120 + }, + { + "epoch": 9.07, + "learning_rate": 6.0531594784353057e-05, + "loss": 2.6195, + "theoretical_loss": 3.328980190625983, + "tokens_seen": 2905414656 + }, + { + "epoch": 9.07, + "learning_rate": 6.052156469408225e-05, + "loss": 2.5525, + "theoretical_loss": 3.328974379637724, + "tokens_seen": 2905480192 + }, + { + "epoch": 9.07, + "learning_rate": 6.051153460381144e-05, + "loss": 2.0746, + "theoretical_loss": 3.3289685688172352, + "tokens_seen": 2905545728 + }, + { + "epoch": 9.07, + "learning_rate": 6.050150451354063e-05, + "loss": 2.6982, + "theoretical_loss": 3.3289627581645087, + "tokens_seen": 2905611264 + }, + { + "epoch": 9.07, + "learning_rate": 6.0491474423269815e-05, + "loss": 2.5745, + "theoretical_loss": 3.328956947679535, + "tokens_seen": 2905676800 + }, + { + "epoch": 9.07, + "learning_rate": 6.0481444332999e-05, + "loss": 2.5584, + "theoretical_loss": 3.3289511373623064, + "tokens_seen": 2905742336 + }, + { + "epoch": 9.07, + "learning_rate": 6.0471414242728184e-05, + "loss": 2.4092, + "theoretical_loss": 3.3289453272128138, + "tokens_seen": 2905807872 + }, + { + "epoch": 9.07, + "learning_rate": 6.046138415245737e-05, + "loss": 2.5949, + "theoretical_loss": 3.3289395172310483, + "tokens_seen": 2905873408 + }, + { + "epoch": 9.07, + "learning_rate": 6.045135406218656e-05, + "loss": 2.5623, + "theoretical_loss": 3.328933707417002, + "tokens_seen": 2905938944 + }, + { + "epoch": 9.07, + "learning_rate": 6.044132397191575e-05, + "loss": 2.2534, + "theoretical_loss": 3.3289278977706656, + "tokens_seen": 2906004480 + }, + { + "epoch": 9.07, + "learning_rate": 6.0431293881644936e-05, + "loss": 2.534, + "theoretical_loss": 3.328922088292031, + "tokens_seen": 2906070016 + }, + { + "epoch": 9.07, + "learning_rate": 6.0421263791374123e-05, + "loss": 2.1918, + "theoretical_loss": 3.328916278981089, + "tokens_seen": 2906135552 + }, + { + "epoch": 9.07, + "learning_rate": 6.041123370110331e-05, + "loss": 2.4276, + "theoretical_loss": 3.3289104698378313, + "tokens_seen": 2906201088 + }, + { + "epoch": 9.07, + "learning_rate": 6.04012036108325e-05, + "loss": 2.5511, + "theoretical_loss": 3.3289046608622495, + "tokens_seen": 2906266624 + }, + { + "epoch": 9.07, + "learning_rate": 6.039117352056169e-05, + "loss": 2.592, + "theoretical_loss": 3.3288988520543343, + "tokens_seen": 2906332160 + }, + { + "epoch": 9.07, + "learning_rate": 6.0381143430290875e-05, + "loss": 2.6124, + "theoretical_loss": 3.328893043414078, + "tokens_seen": 2906397696 + }, + { + "epoch": 9.07, + "learning_rate": 6.0371113340020056e-05, + "loss": 2.5185, + "theoretical_loss": 3.3288872349414715, + "tokens_seen": 2906463232 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3179058, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2793128490448, + "objective/train/theoretical_loss": 3.3288828786970313, + "objective/train/tokens_used": 2926972384, + "theoretical_loss": 3.3288828786970313, + "tokens_seen": 2906512384 + }, + { + "epoch": 9.07, + "learning_rate": 6.0361083249749244e-05, + "loss": 2.4264, + "theoretical_loss": 3.3288814266365057, + "tokens_seen": 2906528768 + }, + { + "epoch": 9.07, + "learning_rate": 6.035105315947843e-05, + "loss": 2.4566, + "theoretical_loss": 3.328875618499173, + "tokens_seen": 2906594304 + }, + { + "epoch": 9.07, + "learning_rate": 6.0341023069207627e-05, + "loss": 2.5508, + "theoretical_loss": 3.328869810529464, + "tokens_seen": 2906659840 + }, + { + "epoch": 9.07, + "learning_rate": 6.0330992978936814e-05, + "loss": 2.0603, + "theoretical_loss": 3.3288640027273706, + "tokens_seen": 2906725376 + }, + { + "epoch": 9.07, + "learning_rate": 6.0320962888666e-05, + "loss": 2.3875, + "theoretical_loss": 3.3288581950928835, + "tokens_seen": 2906790912 + }, + { + "epoch": 9.07, + "learning_rate": 6.031093279839519e-05, + "loss": 2.4521, + "theoretical_loss": 3.328852387625995, + "tokens_seen": 2906856448 + }, + { + "epoch": 9.07, + "learning_rate": 6.030090270812438e-05, + "loss": 2.554, + "theoretical_loss": 3.3288465803266956, + "tokens_seen": 2906921984 + }, + { + "epoch": 9.07, + "learning_rate": 6.0290872617853566e-05, + "loss": 2.4288, + "theoretical_loss": 3.3288407731949774, + "tokens_seen": 2906987520 + }, + { + "epoch": 9.07, + "learning_rate": 6.0280842527582754e-05, + "loss": 2.4813, + "theoretical_loss": 3.328834966230831, + "tokens_seen": 2907053056 + }, + { + "epoch": 9.07, + "learning_rate": 6.0270812437311935e-05, + "loss": 2.6155, + "theoretical_loss": 3.3288291594342487, + "tokens_seen": 2907118592 + }, + { + "epoch": 9.07, + "learning_rate": 6.026078234704112e-05, + "loss": 2.3577, + "theoretical_loss": 3.3288233528052213, + "tokens_seen": 2907184128 + }, + { + "epoch": 9.07, + "learning_rate": 6.025075225677031e-05, + "loss": 2.3797, + "theoretical_loss": 3.3288175463437404, + "tokens_seen": 2907249664 + }, + { + "epoch": 9.07, + "learning_rate": 6.02407221664995e-05, + "loss": 2.579, + "theoretical_loss": 3.3288117400497974, + "tokens_seen": 2907315200 + }, + { + "epoch": 9.07, + "learning_rate": 6.023069207622869e-05, + "loss": 2.5094, + "theoretical_loss": 3.328805933923383, + "tokens_seen": 2907380736 + }, + { + "epoch": 9.07, + "learning_rate": 6.0220661985957875e-05, + "loss": 2.3754, + "theoretical_loss": 3.32880012796449, + "tokens_seen": 2907446272 + }, + { + "epoch": 9.07, + "learning_rate": 6.021063189568706e-05, + "loss": 2.5206, + "theoretical_loss": 3.3287943221731084, + "tokens_seen": 2907511808 + }, + { + "epoch": 9.07, + "learning_rate": 6.020060180541625e-05, + "loss": 2.5055, + "theoretical_loss": 3.3287885165492304, + "tokens_seen": 2907577344 + }, + { + "epoch": 9.07, + "learning_rate": 6.019057171514544e-05, + "loss": 2.4717, + "theoretical_loss": 3.328782711092847, + "tokens_seen": 2907642880 + }, + { + "epoch": 9.07, + "learning_rate": 6.018054162487462e-05, + "loss": 2.5029, + "theoretical_loss": 3.3287769058039505, + "tokens_seen": 2907708416 + }, + { + "epoch": 9.07, + "learning_rate": 6.017051153460381e-05, + "loss": 2.6453, + "theoretical_loss": 3.328771100682531, + "tokens_seen": 2907773952 + }, + { + "epoch": 9.07, + "learning_rate": 6.0160481444332995e-05, + "loss": 2.592, + "theoretical_loss": 3.32876529572858, + "tokens_seen": 2907839488 + }, + { + "epoch": 9.07, + "learning_rate": 6.015045135406219e-05, + "loss": 2.3966, + "theoretical_loss": 3.32875949094209, + "tokens_seen": 2907905024 + }, + { + "epoch": 9.07, + "learning_rate": 6.014042126379138e-05, + "loss": 2.5379, + "theoretical_loss": 3.3287536863230516, + "tokens_seen": 2907970560 + }, + { + "epoch": 9.07, + "learning_rate": 6.0130391173520566e-05, + "loss": 2.4006, + "theoretical_loss": 3.328747881871456, + "tokens_seen": 2908036096 + }, + { + "epoch": 9.07, + "learning_rate": 6.0120361083249754e-05, + "loss": 2.4724, + "theoretical_loss": 3.328742077587296, + "tokens_seen": 2908101632 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3179768, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6399989128112793, + "objective/train/theoretical_loss": 3.3287377244840486, + "objective/train/tokens_used": 2928610784, + "theoretical_loss": 3.3287377244840486, + "tokens_seen": 2908150784 + }, + { + "epoch": 9.07, + "learning_rate": 6.011033099297894e-05, + "loss": 2.5356, + "theoretical_loss": 3.3287362734705606, + "tokens_seen": 2908167168 + }, + { + "epoch": 9.07, + "learning_rate": 6.010030090270813e-05, + "loss": 2.3892, + "theoretical_loss": 3.328730469521243, + "tokens_seen": 2908232704 + }, + { + "epoch": 9.07, + "learning_rate": 6.009027081243732e-05, + "loss": 2.5909, + "theoretical_loss": 3.328724665739334, + "tokens_seen": 2908298240 + }, + { + "epoch": 9.07, + "learning_rate": 6.00802407221665e-05, + "loss": 2.4562, + "theoretical_loss": 3.3287188621248256, + "tokens_seen": 2908363776 + }, + { + "epoch": 9.07, + "learning_rate": 6.0070210631895686e-05, + "loss": 2.4314, + "theoretical_loss": 3.3287130586777085, + "tokens_seen": 2908429312 + }, + { + "epoch": 9.07, + "learning_rate": 6.0060180541624874e-05, + "loss": 2.5365, + "theoretical_loss": 3.3287072553979744, + "tokens_seen": 2908494848 + }, + { + "epoch": 9.07, + "learning_rate": 6.005015045135406e-05, + "loss": 2.6208, + "theoretical_loss": 3.3287014522856144, + "tokens_seen": 2908560384 + }, + { + "epoch": 9.07, + "learning_rate": 6.004012036108325e-05, + "loss": 2.4658, + "theoretical_loss": 3.32869564934062, + "tokens_seen": 2908625920 + }, + { + "epoch": 9.07, + "learning_rate": 6.003009027081244e-05, + "loss": 2.5347, + "theoretical_loss": 3.328689846562983, + "tokens_seen": 2908691456 + }, + { + "epoch": 9.07, + "learning_rate": 6.0020060180541626e-05, + "loss": 2.3573, + "theoretical_loss": 3.328684043952695, + "tokens_seen": 2908756992 + }, + { + "epoch": 9.07, + "learning_rate": 6.0010030090270814e-05, + "loss": 2.6479, + "theoretical_loss": 3.3286782415097464, + "tokens_seen": 2908822528 + }, + { + "epoch": 9.07, + "learning_rate": 6e-05, + "loss": 2.3747, + "theoretical_loss": 3.328672439234129, + "tokens_seen": 2908888064 + }, + { + "epoch": 9.07, + "learning_rate": 5.998996990972919e-05, + "loss": 2.5276, + "theoretical_loss": 3.3286666371258344, + "tokens_seen": 2908953600 + }, + { + "epoch": 9.07, + "learning_rate": 5.997993981945837e-05, + "loss": 2.6598, + "theoretical_loss": 3.328660835184854, + "tokens_seen": 2909019136 + }, + { + "epoch": 9.07, + "learning_rate": 5.996990972918756e-05, + "loss": 2.5203, + "theoretical_loss": 3.3286550334111795, + "tokens_seen": 2909084672 + }, + { + "epoch": 9.07, + "learning_rate": 5.995987963891675e-05, + "loss": 2.4985, + "theoretical_loss": 3.3286492318048015, + "tokens_seen": 2909150208 + }, + { + "epoch": 9.07, + "learning_rate": 5.994984954864594e-05, + "loss": 2.471, + "theoretical_loss": 3.3286434303657124, + "tokens_seen": 2909215744 + }, + { + "epoch": 9.07, + "learning_rate": 5.993981945837513e-05, + "loss": 2.5527, + "theoretical_loss": 3.328637629093903, + "tokens_seen": 2909281280 + }, + { + "epoch": 9.07, + "learning_rate": 5.992978936810432e-05, + "loss": 2.5844, + "theoretical_loss": 3.328631827989364, + "tokens_seen": 2909346816 + }, + { + "epoch": 9.07, + "learning_rate": 5.9919759277833505e-05, + "loss": 2.247, + "theoretical_loss": 3.3286260270520884, + "tokens_seen": 2909412352 + }, + { + "epoch": 9.07, + "learning_rate": 5.990972918756269e-05, + "loss": 2.5833, + "theoretical_loss": 3.3286202262820663, + "tokens_seen": 2909477888 + }, + { + "epoch": 9.07, + "learning_rate": 5.989969909729188e-05, + "loss": 2.4237, + "theoretical_loss": 3.32861442567929, + "tokens_seen": 2909543424 + }, + { + "epoch": 9.07, + "learning_rate": 5.988966900702107e-05, + "loss": 2.5424, + "theoretical_loss": 3.32860862524375, + "tokens_seen": 2909608960 + }, + { + "epoch": 9.07, + "learning_rate": 5.987963891675025e-05, + "loss": 2.5548, + "theoretical_loss": 3.328602824975439, + "tokens_seen": 2909674496 + }, + { + "epoch": 9.07, + "learning_rate": 5.986960882647944e-05, + "loss": 2.3574, + "theoretical_loss": 3.3285970248743473, + "tokens_seen": 2909740032 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3181217, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.8237876892089844, + "objective/train/theoretical_loss": 3.3285926749082613, + "objective/train/tokens_used": 2930249184, + "theoretical_loss": 3.3285926749082613, + "tokens_seen": 2909789184 + }, + { + "epoch": 9.07, + "learning_rate": 5.9859578736208625e-05, + "loss": 2.4214, + "theoretical_loss": 3.3285912249404666, + "tokens_seen": 2909805568 + }, + { + "epoch": 9.07, + "learning_rate": 5.984954864593781e-05, + "loss": 2.6996, + "theoretical_loss": 3.3285854251737885, + "tokens_seen": 2909871104 + }, + { + "epoch": 9.07, + "learning_rate": 5.9839518555667e-05, + "loss": 2.3315, + "theoretical_loss": 3.328579625574304, + "tokens_seen": 2909936640 + }, + { + "epoch": 9.07, + "learning_rate": 5.982948846539619e-05, + "loss": 2.3731, + "theoretical_loss": 3.328573826142005, + "tokens_seen": 2910002176 + }, + { + "epoch": 9.07, + "learning_rate": 5.981945837512538e-05, + "loss": 2.4194, + "theoretical_loss": 3.3285680268768827, + "tokens_seen": 2910067712 + }, + { + "epoch": 9.07, + "learning_rate": 5.9809428284854565e-05, + "loss": 2.5907, + "theoretical_loss": 3.328562227778929, + "tokens_seen": 2910133248 + }, + { + "epoch": 9.07, + "learning_rate": 5.979939819458375e-05, + "loss": 2.4531, + "theoretical_loss": 3.328556428848134, + "tokens_seen": 2910198784 + }, + { + "epoch": 9.07, + "learning_rate": 5.9789368104312934e-05, + "loss": 2.4692, + "theoretical_loss": 3.3285506300844907, + "tokens_seen": 2910264320 + }, + { + "epoch": 9.07, + "learning_rate": 5.977933801404212e-05, + "loss": 2.484, + "theoretical_loss": 3.3285448314879895, + "tokens_seen": 2910329856 + }, + { + "epoch": 9.07, + "learning_rate": 5.9769307923771316e-05, + "loss": 2.4204, + "theoretical_loss": 3.328539033058622, + "tokens_seen": 2910395392 + }, + { + "epoch": 9.07, + "learning_rate": 5.9759277833500504e-05, + "loss": 2.7438, + "theoretical_loss": 3.32853323479638, + "tokens_seen": 2910460928 + }, + { + "epoch": 9.07, + "learning_rate": 5.974924774322969e-05, + "loss": 2.4441, + "theoretical_loss": 3.3285274367012545, + "tokens_seen": 2910526464 + }, + { + "epoch": 9.07, + "learning_rate": 5.973921765295888e-05, + "loss": 2.6035, + "theoretical_loss": 3.3285216387732373, + "tokens_seen": 2910592000 + }, + { + "epoch": 9.07, + "learning_rate": 5.972918756268807e-05, + "loss": 2.7593, + "theoretical_loss": 3.3285158410123192, + "tokens_seen": 2910657536 + }, + { + "epoch": 9.07, + "learning_rate": 5.9719157472417256e-05, + "loss": 2.3935, + "theoretical_loss": 3.328510043418492, + "tokens_seen": 2910723072 + }, + { + "epoch": 9.07, + "learning_rate": 5.9709127382146444e-05, + "loss": 2.4071, + "theoretical_loss": 3.3285042459917475, + "tokens_seen": 2910788608 + }, + { + "epoch": 9.07, + "learning_rate": 5.969909729187563e-05, + "loss": 2.3359, + "theoretical_loss": 3.3284984487320766, + "tokens_seen": 2910854144 + }, + { + "epoch": 9.07, + "learning_rate": 5.968906720160481e-05, + "loss": 2.4416, + "theoretical_loss": 3.328492651639471, + "tokens_seen": 2910919680 + }, + { + "epoch": 9.07, + "learning_rate": 5.9679037111334e-05, + "loss": 2.4118, + "theoretical_loss": 3.3284868547139217, + "tokens_seen": 2910985216 + }, + { + "epoch": 9.07, + "learning_rate": 5.966900702106319e-05, + "loss": 2.5473, + "theoretical_loss": 3.3284810579554205, + "tokens_seen": 2911050752 + }, + { + "epoch": 9.07, + "learning_rate": 5.9658976930792376e-05, + "loss": 2.1323, + "theoretical_loss": 3.328475261363959, + "tokens_seen": 2911116288 + }, + { + "epoch": 9.07, + "learning_rate": 5.9648946840521564e-05, + "loss": 2.5053, + "theoretical_loss": 3.328469464939528, + "tokens_seen": 2911181824 + }, + { + "epoch": 9.07, + "learning_rate": 5.963891675025075e-05, + "loss": 2.3707, + "theoretical_loss": 3.3284636686821196, + "tokens_seen": 2911247360 + }, + { + "epoch": 9.07, + "learning_rate": 5.962888665997994e-05, + "loss": 2.4931, + "theoretical_loss": 3.328457872591725, + "tokens_seen": 2911312896 + }, + { + "epoch": 9.07, + "learning_rate": 5.961885656970913e-05, + "loss": 2.2802, + "theoretical_loss": 3.328452076668335, + "tokens_seen": 2911378432 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3181970, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.228590965270996, + "objective/train/theoretical_loss": 3.3284477298353856, + "objective/train/tokens_used": 2931887584, + "theoretical_loss": 3.3284477298353856, + "tokens_seen": 2911427584 + }, + { + "epoch": 9.07, + "learning_rate": 5.9608826479438316e-05, + "loss": 2.5439, + "theoretical_loss": 3.3284462809119426, + "tokens_seen": 2911443968 + }, + { + "epoch": 9.07, + "learning_rate": 5.9598796389167504e-05, + "loss": 2.5203, + "theoretical_loss": 3.328440485322538, + "tokens_seen": 2911509504 + }, + { + "epoch": 9.07, + "learning_rate": 5.958876629889669e-05, + "loss": 2.3464, + "theoretical_loss": 3.3284346899001123, + "tokens_seen": 2911575040 + }, + { + "epoch": 9.07, + "learning_rate": 5.957873620862588e-05, + "loss": 2.3308, + "theoretical_loss": 3.3284288946446576, + "tokens_seen": 2911640576 + }, + { + "epoch": 9.07, + "learning_rate": 5.956870611835507e-05, + "loss": 2.4408, + "theoretical_loss": 3.3284230995561654, + "tokens_seen": 2911706112 + }, + { + "epoch": 9.07, + "learning_rate": 5.9558676028084255e-05, + "loss": 2.4496, + "theoretical_loss": 3.328417304634627, + "tokens_seen": 2911771648 + }, + { + "epoch": 9.07, + "learning_rate": 5.954864593781344e-05, + "loss": 2.3647, + "theoretical_loss": 3.3284115098800333, + "tokens_seen": 2911837184 + }, + { + "epoch": 9.07, + "learning_rate": 5.953861584754263e-05, + "loss": 2.5354, + "theoretical_loss": 3.328405715292377, + "tokens_seen": 2911902720 + }, + { + "epoch": 9.07, + "learning_rate": 5.952858575727182e-05, + "loss": 2.5889, + "theoretical_loss": 3.3283999208716484, + "tokens_seen": 2911968256 + }, + { + "epoch": 9.07, + "learning_rate": 5.951855566700101e-05, + "loss": 2.5641, + "theoretical_loss": 3.328394126617839, + "tokens_seen": 2912033792 + }, + { + "epoch": 9.07, + "learning_rate": 5.9508525576730195e-05, + "loss": 2.5027, + "theoretical_loss": 3.328388332530941, + "tokens_seen": 2912099328 + }, + { + "epoch": 9.07, + "learning_rate": 5.949849548645938e-05, + "loss": 2.5675, + "theoretical_loss": 3.3283825386109447, + "tokens_seen": 2912164864 + }, + { + "epoch": 9.07, + "learning_rate": 5.9488465396188564e-05, + "loss": 2.5957, + "theoretical_loss": 3.3283767448578425, + "tokens_seen": 2912230400 + }, + { + "epoch": 9.07, + "learning_rate": 5.947843530591775e-05, + "loss": 2.588, + "theoretical_loss": 3.328370951271626, + "tokens_seen": 2912295936 + }, + { + "epoch": 9.07, + "learning_rate": 5.946840521564694e-05, + "loss": 2.4548, + "theoretical_loss": 3.3283651578522857, + "tokens_seen": 2912361472 + }, + { + "epoch": 9.07, + "learning_rate": 5.945837512537613e-05, + "loss": 2.4554, + "theoretical_loss": 3.3283593645998133, + "tokens_seen": 2912427008 + }, + { + "epoch": 9.07, + "learning_rate": 5.9448345035105316e-05, + "loss": 2.5187, + "theoretical_loss": 3.328353571514201, + "tokens_seen": 2912492544 + }, + { + "epoch": 9.07, + "learning_rate": 5.9438314944834503e-05, + "loss": 2.5348, + "theoretical_loss": 3.3283477785954396, + "tokens_seen": 2912558080 + }, + { + "epoch": 9.07, + "learning_rate": 5.942828485456369e-05, + "loss": 2.3792, + "theoretical_loss": 3.32834198584352, + "tokens_seen": 2912623616 + }, + { + "epoch": 9.07, + "learning_rate": 5.941825476429288e-05, + "loss": 2.6121, + "theoretical_loss": 3.328336193258435, + "tokens_seen": 2912689152 + }, + { + "epoch": 9.07, + "learning_rate": 5.940822467402207e-05, + "loss": 2.5572, + "theoretical_loss": 3.328330400840175, + "tokens_seen": 2912754688 + }, + { + "epoch": 9.07, + "learning_rate": 5.9398194583751255e-05, + "loss": 2.5473, + "theoretical_loss": 3.3283246085887317, + "tokens_seen": 2912820224 + }, + { + "epoch": 9.07, + "learning_rate": 5.938816449348044e-05, + "loss": 2.3097, + "theoretical_loss": 3.3283188165040967, + "tokens_seen": 2912885760 + }, + { + "epoch": 9.07, + "learning_rate": 5.937813440320963e-05, + "loss": 2.5139, + "theoretical_loss": 3.3283130245862615, + "tokens_seen": 2912951296 + }, + { + "epoch": 9.07, + "learning_rate": 5.936810431293882e-05, + "loss": 2.4241, + "theoretical_loss": 3.328307232835217, + "tokens_seen": 2913016832 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3183437, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.830458164215088, + "objective/train/theoretical_loss": 3.3283028891313853, + "objective/train/tokens_used": 2933525984, + "theoretical_loss": 3.3283028891313853, + "tokens_seen": 2913065984 + }, + { + "epoch": 9.07, + "learning_rate": 5.9358074222668007e-05, + "loss": 2.5202, + "theoretical_loss": 3.328301441250955, + "tokens_seen": 2913082368 + }, + { + "epoch": 9.07, + "learning_rate": 5.9348044132397194e-05, + "loss": 2.4887, + "theoretical_loss": 3.3282956498334673, + "tokens_seen": 2913147904 + }, + { + "epoch": 9.07, + "learning_rate": 5.933801404212638e-05, + "loss": 2.4625, + "theoretical_loss": 3.328289858582745, + "tokens_seen": 2913213440 + }, + { + "epoch": 9.07, + "learning_rate": 5.932798395185557e-05, + "loss": 2.4696, + "theoretical_loss": 3.3282840674987795, + "tokens_seen": 2913278976 + }, + { + "epoch": 9.07, + "learning_rate": 5.931795386158476e-05, + "loss": 2.6023, + "theoretical_loss": 3.328278276581562, + "tokens_seen": 2913344512 + }, + { + "epoch": 9.07, + "learning_rate": 5.9307923771313946e-05, + "loss": 2.3806, + "theoretical_loss": 3.328272485831085, + "tokens_seen": 2913410048 + }, + { + "epoch": 9.07, + "learning_rate": 5.929789368104313e-05, + "loss": 2.6458, + "theoretical_loss": 3.328266695247338, + "tokens_seen": 2913475584 + }, + { + "epoch": 9.07, + "learning_rate": 5.9287863590772315e-05, + "loss": 2.6375, + "theoretical_loss": 3.328260904830315, + "tokens_seen": 2913541120 + }, + { + "epoch": 9.07, + "learning_rate": 5.92778335005015e-05, + "loss": 2.4261, + "theoretical_loss": 3.328255114580005, + "tokens_seen": 2913606656 + }, + { + "epoch": 9.07, + "learning_rate": 5.926780341023069e-05, + "loss": 2.4304, + "theoretical_loss": 3.328249324496401, + "tokens_seen": 2913672192 + }, + { + "epoch": 9.07, + "learning_rate": 5.925777331995988e-05, + "loss": 2.4938, + "theoretical_loss": 3.328243534579494, + "tokens_seen": 2913737728 + }, + { + "epoch": 9.07, + "learning_rate": 5.924774322968907e-05, + "loss": 2.4699, + "theoretical_loss": 3.3282377448292757, + "tokens_seen": 2913803264 + }, + { + "epoch": 9.07, + "learning_rate": 5.9237713139418255e-05, + "loss": 2.2721, + "theoretical_loss": 3.328231955245737, + "tokens_seen": 2913868800 + }, + { + "epoch": 9.07, + "learning_rate": 5.922768304914744e-05, + "loss": 2.5848, + "theoretical_loss": 3.3282261658288697, + "tokens_seen": 2913934336 + }, + { + "epoch": 9.07, + "learning_rate": 5.921765295887663e-05, + "loss": 2.601, + "theoretical_loss": 3.328220376578665, + "tokens_seen": 2913999872 + }, + { + "epoch": 9.07, + "learning_rate": 5.9207622868605825e-05, + "loss": 2.3826, + "theoretical_loss": 3.328214587495115, + "tokens_seen": 2914065408 + }, + { + "epoch": 9.07, + "learning_rate": 5.9197592778335006e-05, + "loss": 2.5458, + "theoretical_loss": 3.3282087985782107, + "tokens_seen": 2914130944 + }, + { + "epoch": 9.07, + "learning_rate": 5.9187562688064194e-05, + "loss": 2.5971, + "theoretical_loss": 3.328203009827943, + "tokens_seen": 2914196480 + }, + { + "epoch": 9.07, + "learning_rate": 5.917753259779338e-05, + "loss": 2.5305, + "theoretical_loss": 3.3281972212443045, + "tokens_seen": 2914262016 + }, + { + "epoch": 9.07, + "learning_rate": 5.916750250752257e-05, + "loss": 2.4164, + "theoretical_loss": 3.3281914328272864, + "tokens_seen": 2914327552 + }, + { + "epoch": 9.07, + "learning_rate": 5.915747241725176e-05, + "loss": 2.3876, + "theoretical_loss": 3.3281856445768794, + "tokens_seen": 2914393088 + }, + { + "epoch": 9.07, + "learning_rate": 5.9147442326980946e-05, + "loss": 2.427, + "theoretical_loss": 3.328179856493075, + "tokens_seen": 2914458624 + }, + { + "epoch": 9.07, + "learning_rate": 5.9137412236710134e-05, + "loss": 2.5686, + "theoretical_loss": 3.3281740685758656, + "tokens_seen": 2914524160 + }, + { + "epoch": 9.07, + "learning_rate": 5.912738214643932e-05, + "loss": 2.4895, + "theoretical_loss": 3.328168280825242, + "tokens_seen": 2914589696 + }, + { + "epoch": 9.07, + "learning_rate": 5.911735205616851e-05, + "loss": 2.4575, + "theoretical_loss": 3.3281624932411957, + "tokens_seen": 2914655232 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3184006, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4618325233459473, + "objective/train/theoretical_loss": 3.328158152662472, + "objective/train/tokens_used": 2935164384, + "theoretical_loss": 3.328158152662472, + "tokens_seen": 2914704384 + }, + { + "epoch": 9.07, + "learning_rate": 5.910732196589769e-05, + "loss": 2.4872, + "theoretical_loss": 3.3281567058237185, + "tokens_seen": 2914720768 + }, + { + "epoch": 9.07, + "learning_rate": 5.909729187562688e-05, + "loss": 2.3049, + "theoretical_loss": 3.3281509185728013, + "tokens_seen": 2914786304 + }, + { + "epoch": 9.07, + "learning_rate": 5.9087261785356066e-05, + "loss": 2.5257, + "theoretical_loss": 3.3281451314884363, + "tokens_seen": 2914851840 + }, + { + "epoch": 9.07, + "learning_rate": 5.9077231695085254e-05, + "loss": 2.4462, + "theoretical_loss": 3.328139344570614, + "tokens_seen": 2914917376 + }, + { + "epoch": 9.07, + "learning_rate": 5.906720160481444e-05, + "loss": 2.363, + "theoretical_loss": 3.3281335578193265, + "tokens_seen": 2914982912 + }, + { + "epoch": 9.07, + "learning_rate": 5.905717151454363e-05, + "loss": 2.4381, + "theoretical_loss": 3.3281277712345654, + "tokens_seen": 2915048448 + }, + { + "epoch": 9.07, + "learning_rate": 5.904714142427282e-05, + "loss": 2.2576, + "theoretical_loss": 3.3281219848163217, + "tokens_seen": 2915113984 + }, + { + "epoch": 9.07, + "learning_rate": 5.9037111334002006e-05, + "loss": 2.4472, + "theoretical_loss": 3.328116198564587, + "tokens_seen": 2915179520 + }, + { + "epoch": 9.07, + "learning_rate": 5.90270812437312e-05, + "loss": 2.3489, + "theoretical_loss": 3.328110412479353, + "tokens_seen": 2915245056 + }, + { + "epoch": 9.07, + "learning_rate": 5.901705115346039e-05, + "loss": 2.3371, + "theoretical_loss": 3.328104626560611, + "tokens_seen": 2915310592 + }, + { + "epoch": 9.07, + "learning_rate": 5.900702106318957e-05, + "loss": 2.3563, + "theoretical_loss": 3.3280988408083525, + "tokens_seen": 2915376128 + }, + { + "epoch": 9.07, + "learning_rate": 5.899699097291876e-05, + "loss": 2.401, + "theoretical_loss": 3.328093055222569, + "tokens_seen": 2915441664 + }, + { + "epoch": 9.07, + "learning_rate": 5.8986960882647945e-05, + "loss": 2.4234, + "theoretical_loss": 3.3280872698032518, + "tokens_seen": 2915507200 + }, + { + "epoch": 9.07, + "learning_rate": 5.897693079237713e-05, + "loss": 2.5898, + "theoretical_loss": 3.3280814845503928, + "tokens_seen": 2915572736 + }, + { + "epoch": 9.07, + "learning_rate": 5.896690070210632e-05, + "loss": 2.3825, + "theoretical_loss": 3.328075699463983, + "tokens_seen": 2915638272 + }, + { + "epoch": 9.07, + "learning_rate": 5.895687061183551e-05, + "loss": 2.4975, + "theoretical_loss": 3.328069914544014, + "tokens_seen": 2915703808 + }, + { + "epoch": 9.07, + "learning_rate": 5.89468405215647e-05, + "loss": 2.4993, + "theoretical_loss": 3.3280641297904774, + "tokens_seen": 2915769344 + }, + { + "epoch": 9.07, + "learning_rate": 5.8936810431293885e-05, + "loss": 2.4857, + "theoretical_loss": 3.328058345203364, + "tokens_seen": 2915834880 + }, + { + "epoch": 9.07, + "learning_rate": 5.892678034102307e-05, + "loss": 2.5442, + "theoretical_loss": 3.328052560782666, + "tokens_seen": 2915900416 + }, + { + "epoch": 9.07, + "learning_rate": 5.891675025075226e-05, + "loss": 2.4741, + "theoretical_loss": 3.3280467765283754, + "tokens_seen": 2915965952 + }, + { + "epoch": 9.07, + "learning_rate": 5.890672016048144e-05, + "loss": 2.5635, + "theoretical_loss": 3.3280409924404823, + "tokens_seen": 2916031488 + }, + { + "epoch": 9.07, + "learning_rate": 5.889669007021063e-05, + "loss": 2.3069, + "theoretical_loss": 3.3280352085189793, + "tokens_seen": 2916097024 + }, + { + "epoch": 9.07, + "learning_rate": 5.888665997993982e-05, + "loss": 2.4531, + "theoretical_loss": 3.3280294247638573, + "tokens_seen": 2916162560 + }, + { + "epoch": 9.07, + "learning_rate": 5.8876629889669005e-05, + "loss": 2.4927, + "theoretical_loss": 3.3280236411751076, + "tokens_seen": 2916228096 + }, + { + "epoch": 9.07, + "learning_rate": 5.886659979939819e-05, + "loss": 2.3066, + "theoretical_loss": 3.328017857752722, + "tokens_seen": 2916293632 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3184644, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7624504566192627, + "objective/train/theoretical_loss": 3.3280135202951047, + "objective/train/tokens_used": 2936802784, + "theoretical_loss": 3.3280135202951047, + "tokens_seen": 2916342784 + }, + { + "epoch": 9.07, + "learning_rate": 5.885656970912738e-05, + "loss": 2.4121, + "theoretical_loss": 3.3280120744966926, + "tokens_seen": 2916359168 + }, + { + "epoch": 9.07, + "learning_rate": 5.884653961885657e-05, + "loss": 2.2487, + "theoretical_loss": 3.32800629140701, + "tokens_seen": 2916424704 + }, + { + "epoch": 9.07, + "learning_rate": 5.8836509528585764e-05, + "loss": 2.5542, + "theoretical_loss": 3.3280005084836652, + "tokens_seen": 2916490240 + }, + { + "epoch": 9.07, + "learning_rate": 5.882647943831495e-05, + "loss": 2.2568, + "theoretical_loss": 3.327994725726651, + "tokens_seen": 2916555776 + }, + { + "epoch": 9.07, + "learning_rate": 5.881644934804414e-05, + "loss": 2.6009, + "theoretical_loss": 3.327988943135958, + "tokens_seen": 2916621312 + }, + { + "epoch": 9.07, + "learning_rate": 5.880641925777332e-05, + "loss": 2.4226, + "theoretical_loss": 3.3279831607115784, + "tokens_seen": 2916686848 + }, + { + "epoch": 9.07, + "learning_rate": 5.879638916750251e-05, + "loss": 2.6951, + "theoretical_loss": 3.327977378453503, + "tokens_seen": 2916752384 + }, + { + "epoch": 9.07, + "learning_rate": 5.8786359077231696e-05, + "loss": 2.3365, + "theoretical_loss": 3.3279715963617233, + "tokens_seen": 2916817920 + }, + { + "epoch": 9.07, + "learning_rate": 5.8776328986960884e-05, + "loss": 2.3628, + "theoretical_loss": 3.327965814436231, + "tokens_seen": 2916883456 + }, + { + "epoch": 9.07, + "learning_rate": 5.876629889669007e-05, + "loss": 2.4404, + "theoretical_loss": 3.3279600326770185, + "tokens_seen": 2916948992 + }, + { + "epoch": 9.07, + "learning_rate": 5.875626880641926e-05, + "loss": 2.3079, + "theoretical_loss": 3.3279542510840754, + "tokens_seen": 2917014528 + }, + { + "epoch": 9.07, + "learning_rate": 5.874623871614845e-05, + "loss": 2.5071, + "theoretical_loss": 3.327948469657394, + "tokens_seen": 2917080064 + }, + { + "epoch": 9.07, + "learning_rate": 5.8736208625877636e-05, + "loss": 2.3524, + "theoretical_loss": 3.3279426883969663, + "tokens_seen": 2917145600 + }, + { + "epoch": 9.07, + "learning_rate": 5.8726178535606824e-05, + "loss": 2.1278, + "theoretical_loss": 3.3279369073027834, + "tokens_seen": 2917211136 + }, + { + "epoch": 9.07, + "learning_rate": 5.8716148445336005e-05, + "loss": 2.531, + "theoretical_loss": 3.327931126374837, + "tokens_seen": 2917276672 + }, + { + "epoch": 9.07, + "learning_rate": 5.870611835506519e-05, + "loss": 2.5179, + "theoretical_loss": 3.327925345613118, + "tokens_seen": 2917342208 + }, + { + "epoch": 9.07, + "learning_rate": 5.869608826479438e-05, + "loss": 2.4834, + "theoretical_loss": 3.327919565017618, + "tokens_seen": 2917407744 + }, + { + "epoch": 9.07, + "learning_rate": 5.868605817452357e-05, + "loss": 2.6499, + "theoretical_loss": 3.3279137845883295, + "tokens_seen": 2917473280 + }, + { + "epoch": 9.07, + "learning_rate": 5.8676028084252756e-05, + "loss": 2.5896, + "theoretical_loss": 3.3279080043252427, + "tokens_seen": 2917538816 + }, + { + "epoch": 9.07, + "learning_rate": 5.8665997993981944e-05, + "loss": 2.653, + "theoretical_loss": 3.32790222422835, + "tokens_seen": 2917604352 + }, + { + "epoch": 9.07, + "learning_rate": 5.865596790371113e-05, + "loss": 2.5331, + "theoretical_loss": 3.327896444297642, + "tokens_seen": 2917669888 + }, + { + "epoch": 9.07, + "learning_rate": 5.864593781344033e-05, + "loss": 2.3884, + "theoretical_loss": 3.327890664533111, + "tokens_seen": 2917735424 + }, + { + "epoch": 9.07, + "learning_rate": 5.8635907723169515e-05, + "loss": 2.3241, + "theoretical_loss": 3.3278848849347487, + "tokens_seen": 2917800960 + }, + { + "epoch": 9.07, + "learning_rate": 5.86258776328987e-05, + "loss": 2.319, + "theoretical_loss": 3.327879105502545, + "tokens_seen": 2917866496 + }, + { + "epoch": 9.07, + "learning_rate": 5.8615847542627884e-05, + "loss": 2.6176, + "theoretical_loss": 3.3278733262364932, + "tokens_seen": 2917932032 + }, + { + "epoch": 9.07, + "objective/train/docs_used": 3185653, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6092700958251953, + "objective/train/theoretical_loss": 3.3278689918959863, + "objective/train/tokens_used": 2938441184, + "theoretical_loss": 3.3278689918959863, + "tokens_seen": 2917981184 + }, + { + "epoch": 9.07, + "learning_rate": 5.860581745235707e-05, + "loss": 2.3958, + "theoretical_loss": 3.327867547136584, + "tokens_seen": 2917997568 + }, + { + "epoch": 9.07, + "learning_rate": 5.859578736208626e-05, + "loss": 2.3222, + "theoretical_loss": 3.327861768202809, + "tokens_seen": 2918063104 + }, + { + "epoch": 9.07, + "learning_rate": 5.858575727181545e-05, + "loss": 2.5188, + "theoretical_loss": 3.327855989435159, + "tokens_seen": 2918128640 + }, + { + "epoch": 9.07, + "learning_rate": 5.8575727181544635e-05, + "loss": 2.5023, + "theoretical_loss": 3.327850210833627, + "tokens_seen": 2918194176 + }, + { + "epoch": 9.07, + "learning_rate": 5.856569709127382e-05, + "loss": 2.3104, + "theoretical_loss": 3.3278444323982033, + "tokens_seen": 2918259712 + }, + { + "epoch": 9.07, + "learning_rate": 5.855566700100301e-05, + "loss": 2.4811, + "theoretical_loss": 3.32783865412888, + "tokens_seen": 2918325248 + }, + { + "epoch": 9.08, + "learning_rate": 5.85456369107322e-05, + "loss": 2.5021, + "theoretical_loss": 3.327832876025648, + "tokens_seen": 2918390784 + }, + { + "epoch": 9.08, + "learning_rate": 5.853560682046139e-05, + "loss": 2.4567, + "theoretical_loss": 3.327827098088499, + "tokens_seen": 2918456320 + }, + { + "epoch": 9.08, + "learning_rate": 5.8525576730190575e-05, + "loss": 2.4957, + "theoretical_loss": 3.327821320317425, + "tokens_seen": 2918521856 + }, + { + "epoch": 9.08, + "learning_rate": 5.8515546639919756e-05, + "loss": 2.2592, + "theoretical_loss": 3.327815542712417, + "tokens_seen": 2918587392 + }, + { + "epoch": 9.08, + "learning_rate": 5.8505516549648944e-05, + "loss": 2.5847, + "theoretical_loss": 3.3278097652734666, + "tokens_seen": 2918652928 + }, + { + "epoch": 9.08, + "learning_rate": 5.849548645937813e-05, + "loss": 2.6541, + "theoretical_loss": 3.327803988000565, + "tokens_seen": 2918718464 + }, + { + "epoch": 9.08, + "learning_rate": 5.848545636910732e-05, + "loss": 2.3075, + "theoretical_loss": 3.3277982108937048, + "tokens_seen": 2918784000 + }, + { + "epoch": 9.08, + "learning_rate": 5.847542627883651e-05, + "loss": 2.4886, + "theoretical_loss": 3.327792433952876, + "tokens_seen": 2918849536 + }, + { + "epoch": 9.08, + "learning_rate": 5.8465396188565695e-05, + "loss": 2.6214, + "theoretical_loss": 3.3277866571780708, + "tokens_seen": 2918915072 + }, + { + "epoch": 9.08, + "learning_rate": 5.845536609829489e-05, + "loss": 2.4507, + "theoretical_loss": 3.3277808805692812, + "tokens_seen": 2918980608 + }, + { + "epoch": 9.08, + "learning_rate": 5.844533600802408e-05, + "loss": 2.2114, + "theoretical_loss": 3.3277751041264976, + "tokens_seen": 2919046144 + }, + { + "epoch": 9.08, + "learning_rate": 5.8435305917753266e-05, + "loss": 2.3425, + "theoretical_loss": 3.3277693278497127, + "tokens_seen": 2919111680 + }, + { + "epoch": 9.08, + "learning_rate": 5.8425275827482454e-05, + "loss": 2.4477, + "theoretical_loss": 3.3277635517389172, + "tokens_seen": 2919177216 + }, + { + "epoch": 9.08, + "learning_rate": 5.8415245737211635e-05, + "loss": 2.4676, + "theoretical_loss": 3.327757775794103, + "tokens_seen": 2919242752 + }, + { + "epoch": 9.08, + "learning_rate": 5.840521564694082e-05, + "loss": 2.518, + "theoretical_loss": 3.327752000015261, + "tokens_seen": 2919308288 + }, + { + "epoch": 9.08, + "learning_rate": 5.839518555667001e-05, + "loss": 2.6685, + "theoretical_loss": 3.3277462244023837, + "tokens_seen": 2919373824 + }, + { + "epoch": 9.08, + "learning_rate": 5.83851554663992e-05, + "loss": 2.4287, + "theoretical_loss": 3.3277404489554616, + "tokens_seen": 2919439360 + }, + { + "epoch": 9.08, + "learning_rate": 5.8375125376128387e-05, + "loss": 2.4631, + "theoretical_loss": 3.327734673674487, + "tokens_seen": 2919504896 + }, + { + "epoch": 9.08, + "learning_rate": 5.8365095285857574e-05, + "loss": 2.3416, + "theoretical_loss": 3.3277288985594504, + "tokens_seen": 2919570432 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3186318, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.215847969055176, + "objective/train/theoretical_loss": 3.3277245673320657, + "objective/train/tokens_used": 2940079584, + "theoretical_loss": 3.3277245673320657, + "tokens_seen": 2919619584 + }, + { + "epoch": 9.08, + "learning_rate": 5.835506519558676e-05, + "loss": 2.439, + "theoretical_loss": 3.327723123610345, + "tokens_seen": 2919635968 + }, + { + "epoch": 9.08, + "learning_rate": 5.834503510531595e-05, + "loss": 2.3144, + "theoretical_loss": 3.3277173488271603, + "tokens_seen": 2919701504 + }, + { + "epoch": 9.08, + "learning_rate": 5.833500501504514e-05, + "loss": 2.5491, + "theoretical_loss": 3.3277115742098893, + "tokens_seen": 2919767040 + }, + { + "epoch": 9.08, + "learning_rate": 5.832497492477432e-05, + "loss": 2.5223, + "theoretical_loss": 3.327705799758523, + "tokens_seen": 2919832576 + }, + { + "epoch": 9.08, + "learning_rate": 5.831494483450351e-05, + "loss": 2.3788, + "theoretical_loss": 3.3277000254730527, + "tokens_seen": 2919898112 + }, + { + "epoch": 9.08, + "learning_rate": 5.8304914744232695e-05, + "loss": 2.369, + "theoretical_loss": 3.32769425135347, + "tokens_seen": 2919963648 + }, + { + "epoch": 9.08, + "learning_rate": 5.829488465396188e-05, + "loss": 2.481, + "theoretical_loss": 3.327688477399767, + "tokens_seen": 2920029184 + }, + { + "epoch": 9.08, + "learning_rate": 5.828485456369107e-05, + "loss": 2.5504, + "theoretical_loss": 3.3276827036119343, + "tokens_seen": 2920094720 + }, + { + "epoch": 9.08, + "learning_rate": 5.8274824473420266e-05, + "loss": 2.51, + "theoretical_loss": 3.327676929989964, + "tokens_seen": 2920160256 + }, + { + "epoch": 9.08, + "learning_rate": 5.8264794383149453e-05, + "loss": 2.3852, + "theoretical_loss": 3.3276711565338477, + "tokens_seen": 2920225792 + }, + { + "epoch": 9.08, + "learning_rate": 5.825476429287864e-05, + "loss": 2.4615, + "theoretical_loss": 3.327665383243576, + "tokens_seen": 2920291328 + }, + { + "epoch": 9.08, + "learning_rate": 5.824473420260783e-05, + "loss": 2.386, + "theoretical_loss": 3.3276596101191416, + "tokens_seen": 2920356864 + }, + { + "epoch": 9.08, + "learning_rate": 5.823470411233702e-05, + "loss": 2.422, + "theoretical_loss": 3.3276538371605353, + "tokens_seen": 2920422400 + }, + { + "epoch": 9.08, + "learning_rate": 5.82246740220662e-05, + "loss": 2.5049, + "theoretical_loss": 3.3276480643677493, + "tokens_seen": 2920487936 + }, + { + "epoch": 9.08, + "learning_rate": 5.8214643931795386e-05, + "loss": 2.6177, + "theoretical_loss": 3.327642291740774, + "tokens_seen": 2920553472 + }, + { + "epoch": 9.08, + "learning_rate": 5.8204613841524574e-05, + "loss": 2.2505, + "theoretical_loss": 3.327636519279602, + "tokens_seen": 2920619008 + }, + { + "epoch": 9.08, + "learning_rate": 5.819458375125376e-05, + "loss": 2.2994, + "theoretical_loss": 3.327630746984224, + "tokens_seen": 2920684544 + }, + { + "epoch": 9.08, + "learning_rate": 5.818455366098295e-05, + "loss": 2.4731, + "theoretical_loss": 3.3276249748546323, + "tokens_seen": 2920750080 + }, + { + "epoch": 9.08, + "learning_rate": 5.817452357071214e-05, + "loss": 2.4923, + "theoretical_loss": 3.327619202890818, + "tokens_seen": 2920815616 + }, + { + "epoch": 9.08, + "learning_rate": 5.8164493480441326e-05, + "loss": 2.4744, + "theoretical_loss": 3.327613431092772, + "tokens_seen": 2920881152 + }, + { + "epoch": 9.08, + "learning_rate": 5.8154463390170514e-05, + "loss": 2.4528, + "theoretical_loss": 3.3276076594604866, + "tokens_seen": 2920946688 + }, + { + "epoch": 9.08, + "learning_rate": 5.81444332998997e-05, + "loss": 2.4723, + "theoretical_loss": 3.327601887993954, + "tokens_seen": 2921012224 + }, + { + "epoch": 9.08, + "learning_rate": 5.813440320962889e-05, + "loss": 2.5781, + "theoretical_loss": 3.3275961166931642, + "tokens_seen": 2921077760 + }, + { + "epoch": 9.08, + "learning_rate": 5.812437311935807e-05, + "loss": 2.161, + "theoretical_loss": 3.3275903455581095, + "tokens_seen": 2921143296 + }, + { + "epoch": 9.08, + "learning_rate": 5.811434302908726e-05, + "loss": 2.4884, + "theoretical_loss": 3.3275845745887813, + "tokens_seen": 2921208832 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3187752, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1229896545410156, + "objective/train/theoretical_loss": 3.327580246470538, + "objective/train/tokens_used": 2941717984, + "theoretical_loss": 3.327580246470538, + "tokens_seen": 2921257984 + }, + { + "epoch": 9.08, + "learning_rate": 5.8104312938816446e-05, + "loss": 2.2241, + "theoretical_loss": 3.3275788037851712, + "tokens_seen": 2921274368 + }, + { + "epoch": 9.08, + "learning_rate": 5.8094282848545634e-05, + "loss": 2.3043, + "theoretical_loss": 3.327573033147271, + "tokens_seen": 2921339904 + }, + { + "epoch": 9.08, + "learning_rate": 5.808425275827483e-05, + "loss": 2.5524, + "theoretical_loss": 3.327567262675071, + "tokens_seen": 2921405440 + }, + { + "epoch": 9.08, + "learning_rate": 5.807422266800402e-05, + "loss": 2.4963, + "theoretical_loss": 3.3275614923685644, + "tokens_seen": 2921470976 + }, + { + "epoch": 9.08, + "learning_rate": 5.8064192577733205e-05, + "loss": 2.6286, + "theoretical_loss": 3.327555722227742, + "tokens_seen": 2921536512 + }, + { + "epoch": 9.08, + "learning_rate": 5.805416248746239e-05, + "loss": 2.3508, + "theoretical_loss": 3.327549952252595, + "tokens_seen": 2921602048 + }, + { + "epoch": 9.08, + "learning_rate": 5.804413239719158e-05, + "loss": 2.6069, + "theoretical_loss": 3.3275441824431153, + "tokens_seen": 2921667584 + }, + { + "epoch": 9.08, + "learning_rate": 5.803410230692077e-05, + "loss": 2.4854, + "theoretical_loss": 3.327538412799294, + "tokens_seen": 2921733120 + }, + { + "epoch": 9.08, + "learning_rate": 5.802407221664995e-05, + "loss": 2.3443, + "theoretical_loss": 3.327532643321123, + "tokens_seen": 2921798656 + }, + { + "epoch": 9.08, + "learning_rate": 5.801404212637914e-05, + "loss": 2.5497, + "theoretical_loss": 3.327526874008594, + "tokens_seen": 2921864192 + }, + { + "epoch": 9.08, + "learning_rate": 5.8004012036108325e-05, + "loss": 2.3419, + "theoretical_loss": 3.327521104861699, + "tokens_seen": 2921929728 + }, + { + "epoch": 9.08, + "learning_rate": 5.799398194583751e-05, + "loss": 2.4474, + "theoretical_loss": 3.327515335880428, + "tokens_seen": 2921995264 + }, + { + "epoch": 9.08, + "learning_rate": 5.79839518555667e-05, + "loss": 2.4505, + "theoretical_loss": 3.3275095670647734, + "tokens_seen": 2922060800 + }, + { + "epoch": 9.08, + "learning_rate": 5.797392176529589e-05, + "loss": 2.4749, + "theoretical_loss": 3.327503798414727, + "tokens_seen": 2922126336 + }, + { + "epoch": 9.08, + "learning_rate": 5.796389167502508e-05, + "loss": 2.3446, + "theoretical_loss": 3.32749802993028, + "tokens_seen": 2922191872 + }, + { + "epoch": 9.08, + "learning_rate": 5.7953861584754265e-05, + "loss": 2.4443, + "theoretical_loss": 3.327492261611424, + "tokens_seen": 2922257408 + }, + { + "epoch": 9.08, + "learning_rate": 5.794383149448345e-05, + "loss": 2.4028, + "theoretical_loss": 3.3274864934581503, + "tokens_seen": 2922322944 + }, + { + "epoch": 9.08, + "learning_rate": 5.7933801404212634e-05, + "loss": 2.4986, + "theoretical_loss": 3.3274807254704504, + "tokens_seen": 2922388480 + }, + { + "epoch": 9.08, + "learning_rate": 5.792377131394182e-05, + "loss": 2.4674, + "theoretical_loss": 3.3274749576483167, + "tokens_seen": 2922454016 + }, + { + "epoch": 9.08, + "learning_rate": 5.791374122367101e-05, + "loss": 2.4369, + "theoretical_loss": 3.32746918999174, + "tokens_seen": 2922519552 + }, + { + "epoch": 9.08, + "learning_rate": 5.79037111334002e-05, + "loss": 2.536, + "theoretical_loss": 3.3274634225007116, + "tokens_seen": 2922585088 + }, + { + "epoch": 9.08, + "learning_rate": 5.789368104312939e-05, + "loss": 2.5527, + "theoretical_loss": 3.3274576551752237, + "tokens_seen": 2922650624 + }, + { + "epoch": 9.08, + "learning_rate": 5.788365095285858e-05, + "loss": 2.6033, + "theoretical_loss": 3.3274518880152675, + "tokens_seen": 2922716160 + }, + { + "epoch": 9.08, + "learning_rate": 5.787362086258777e-05, + "loss": 2.3705, + "theoretical_loss": 3.3274461210208344, + "tokens_seen": 2922781696 + }, + { + "epoch": 9.08, + "learning_rate": 5.7863590772316956e-05, + "loss": 2.424, + "theoretical_loss": 3.327440354191916, + "tokens_seen": 2922847232 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3188294, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5335986614227295, + "objective/train/theoretical_loss": 3.327436029178841, + "objective/train/tokens_used": 2943356384, + "theoretical_loss": 3.327436029178841, + "tokens_seen": 2922896384 + }, + { + "epoch": 9.08, + "learning_rate": 5.7853560682046144e-05, + "loss": 2.3625, + "theoretical_loss": 3.3274345875285043, + "tokens_seen": 2922912768 + }, + { + "epoch": 9.08, + "learning_rate": 5.784353059177533e-05, + "loss": 2.5204, + "theoretical_loss": 3.32742882103059, + "tokens_seen": 2922978304 + }, + { + "epoch": 9.08, + "learning_rate": 5.783350050150451e-05, + "loss": 2.416, + "theoretical_loss": 3.3274230546981656, + "tokens_seen": 2923043840 + }, + { + "epoch": 9.08, + "learning_rate": 5.78234704112337e-05, + "loss": 2.3499, + "theoretical_loss": 3.3274172885312217, + "tokens_seen": 2923109376 + }, + { + "epoch": 9.08, + "learning_rate": 5.781344032096289e-05, + "loss": 2.765, + "theoretical_loss": 3.3274115225297507, + "tokens_seen": 2923174912 + }, + { + "epoch": 9.08, + "learning_rate": 5.7803410230692076e-05, + "loss": 2.3316, + "theoretical_loss": 3.3274057566937434, + "tokens_seen": 2923240448 + }, + { + "epoch": 9.08, + "learning_rate": 5.7793380140421264e-05, + "loss": 2.5128, + "theoretical_loss": 3.327399991023192, + "tokens_seen": 2923305984 + }, + { + "epoch": 9.08, + "learning_rate": 5.778335005015045e-05, + "loss": 2.6064, + "theoretical_loss": 3.3273942255180877, + "tokens_seen": 2923371520 + }, + { + "epoch": 9.08, + "learning_rate": 5.777331995987964e-05, + "loss": 2.3058, + "theoretical_loss": 3.3273884601784216, + "tokens_seen": 2923437056 + }, + { + "epoch": 9.08, + "learning_rate": 5.776328986960883e-05, + "loss": 2.5212, + "theoretical_loss": 3.3273826950041863, + "tokens_seen": 2923502592 + }, + { + "epoch": 9.08, + "learning_rate": 5.7753259779338016e-05, + "loss": 2.3035, + "theoretical_loss": 3.3273769299953724, + "tokens_seen": 2923568128 + }, + { + "epoch": 9.08, + "learning_rate": 5.7743229689067204e-05, + "loss": 2.6173, + "theoretical_loss": 3.327371165151972, + "tokens_seen": 2923633664 + }, + { + "epoch": 9.08, + "learning_rate": 5.7733199598796385e-05, + "loss": 2.4149, + "theoretical_loss": 3.327365400473976, + "tokens_seen": 2923699200 + }, + { + "epoch": 9.08, + "learning_rate": 5.772316950852557e-05, + "loss": 2.5376, + "theoretical_loss": 3.327359635961377, + "tokens_seen": 2923764736 + }, + { + "epoch": 9.08, + "learning_rate": 5.771313941825477e-05, + "loss": 2.6772, + "theoretical_loss": 3.3273538716141653, + "tokens_seen": 2923830272 + }, + { + "epoch": 9.08, + "learning_rate": 5.7703109327983955e-05, + "loss": 2.4855, + "theoretical_loss": 3.327348107432334, + "tokens_seen": 2923895808 + }, + { + "epoch": 9.08, + "learning_rate": 5.769307923771314e-05, + "loss": 2.5848, + "theoretical_loss": 3.327342343415873, + "tokens_seen": 2923961344 + }, + { + "epoch": 9.08, + "learning_rate": 5.768304914744233e-05, + "loss": 2.3676, + "theoretical_loss": 3.3273365795647747, + "tokens_seen": 2924026880 + }, + { + "epoch": 9.08, + "learning_rate": 5.767301905717152e-05, + "loss": 2.4678, + "theoretical_loss": 3.327330815879031, + "tokens_seen": 2924092416 + }, + { + "epoch": 9.08, + "learning_rate": 5.766298896690071e-05, + "loss": 2.4584, + "theoretical_loss": 3.327325052358632, + "tokens_seen": 2924157952 + }, + { + "epoch": 9.08, + "learning_rate": 5.7652958876629895e-05, + "loss": 2.5906, + "theoretical_loss": 3.327319289003571, + "tokens_seen": 2924223488 + }, + { + "epoch": 9.08, + "learning_rate": 5.7642928786359076e-05, + "loss": 2.4989, + "theoretical_loss": 3.327313525813839, + "tokens_seen": 2924289024 + }, + { + "epoch": 9.08, + "learning_rate": 5.7632898696088264e-05, + "loss": 2.3755, + "theoretical_loss": 3.3273077627894265, + "tokens_seen": 2924354560 + }, + { + "epoch": 9.08, + "learning_rate": 5.762286860581745e-05, + "loss": 2.4809, + "theoretical_loss": 3.3273019999303264, + "tokens_seen": 2924420096 + }, + { + "epoch": 9.08, + "learning_rate": 5.761283851554664e-05, + "loss": 2.5385, + "theoretical_loss": 3.3272962372365296, + "tokens_seen": 2924485632 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3189723, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3976027965545654, + "objective/train/theoretical_loss": 3.3272919153246576, + "objective/train/tokens_used": 2944994784, + "theoretical_loss": 3.3272919153246576, + "tokens_seen": 2924534784 + }, + { + "epoch": 9.08, + "learning_rate": 5.760280842527583e-05, + "loss": 2.3237, + "theoretical_loss": 3.327290474708028, + "tokens_seen": 2924551168 + }, + { + "epoch": 9.08, + "learning_rate": 5.7592778335005015e-05, + "loss": 2.6361, + "theoretical_loss": 3.327284712344813, + "tokens_seen": 2924616704 + }, + { + "epoch": 9.08, + "learning_rate": 5.75827482447342e-05, + "loss": 2.4922, + "theoretical_loss": 3.327278950146876, + "tokens_seen": 2924682240 + }, + { + "epoch": 9.08, + "learning_rate": 5.757271815446339e-05, + "loss": 2.4255, + "theoretical_loss": 3.3272731881142086, + "tokens_seen": 2924747776 + }, + { + "epoch": 9.08, + "learning_rate": 5.756268806419258e-05, + "loss": 2.5259, + "theoretical_loss": 3.3272674262468023, + "tokens_seen": 2924813312 + }, + { + "epoch": 9.08, + "learning_rate": 5.755265797392177e-05, + "loss": 2.3722, + "theoretical_loss": 3.3272616645446496, + "tokens_seen": 2924878848 + }, + { + "epoch": 9.08, + "learning_rate": 5.754262788365095e-05, + "loss": 2.4664, + "theoretical_loss": 3.32725590300774, + "tokens_seen": 2924944384 + }, + { + "epoch": 9.08, + "learning_rate": 5.7532597793380136e-05, + "loss": 2.4234, + "theoretical_loss": 3.327250141636067, + "tokens_seen": 2925009920 + }, + { + "epoch": 9.08, + "learning_rate": 5.752256770310933e-05, + "loss": 2.4229, + "theoretical_loss": 3.3272443804296215, + "tokens_seen": 2925075456 + }, + { + "epoch": 9.08, + "learning_rate": 5.751253761283852e-05, + "loss": 2.5517, + "theoretical_loss": 3.327238619388395, + "tokens_seen": 2925140992 + }, + { + "epoch": 9.08, + "learning_rate": 5.7502507522567706e-05, + "loss": 2.3912, + "theoretical_loss": 3.327232858512379, + "tokens_seen": 2925206528 + }, + { + "epoch": 9.08, + "learning_rate": 5.7492477432296894e-05, + "loss": 2.6237, + "theoretical_loss": 3.327227097801565, + "tokens_seen": 2925272064 + }, + { + "epoch": 9.08, + "learning_rate": 5.748244734202608e-05, + "loss": 2.2216, + "theoretical_loss": 3.3272213372559447, + "tokens_seen": 2925337600 + }, + { + "epoch": 9.08, + "learning_rate": 5.747241725175527e-05, + "loss": 2.4866, + "theoretical_loss": 3.32721557687551, + "tokens_seen": 2925403136 + }, + { + "epoch": 9.08, + "learning_rate": 5.746238716148446e-05, + "loss": 2.4854, + "theoretical_loss": 3.3272098166602517, + "tokens_seen": 2925468672 + }, + { + "epoch": 9.08, + "learning_rate": 5.7452357071213646e-05, + "loss": 2.5975, + "theoretical_loss": 3.327204056610162, + "tokens_seen": 2925534208 + }, + { + "epoch": 9.08, + "learning_rate": 5.744232698094283e-05, + "loss": 2.5597, + "theoretical_loss": 3.327198296725232, + "tokens_seen": 2925599744 + }, + { + "epoch": 9.08, + "learning_rate": 5.7432296890672015e-05, + "loss": 2.1435, + "theoretical_loss": 3.327192537005454, + "tokens_seen": 2925665280 + }, + { + "epoch": 9.08, + "learning_rate": 5.74222668004012e-05, + "loss": 2.2401, + "theoretical_loss": 3.3271867774508186, + "tokens_seen": 2925730816 + }, + { + "epoch": 9.08, + "learning_rate": 5.741223671013039e-05, + "loss": 2.5085, + "theoretical_loss": 3.327181018061318, + "tokens_seen": 2925796352 + }, + { + "epoch": 9.08, + "learning_rate": 5.740220661985958e-05, + "loss": 2.3167, + "theoretical_loss": 3.327175258836944, + "tokens_seen": 2925861888 + }, + { + "epoch": 9.08, + "learning_rate": 5.7392176529588767e-05, + "loss": 2.4162, + "theoretical_loss": 3.3271694997776873, + "tokens_seen": 2925927424 + }, + { + "epoch": 9.08, + "learning_rate": 5.7382146439317954e-05, + "loss": 2.3352, + "theoretical_loss": 3.3271637408835404, + "tokens_seen": 2925992960 + }, + { + "epoch": 9.08, + "learning_rate": 5.737211634904714e-05, + "loss": 2.3316, + "theoretical_loss": 3.3271579821544943, + "tokens_seen": 2926058496 + }, + { + "epoch": 9.08, + "learning_rate": 5.736208625877633e-05, + "loss": 2.3587, + "theoretical_loss": 3.3271522235905406, + "tokens_seen": 2926124032 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3190488, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.610664129257202, + "objective/train/theoretical_loss": 3.327147904775912, + "objective/train/tokens_used": 2946633184, + "theoretical_loss": 3.327147904775912, + "tokens_seen": 2926173184 + }, + { + "epoch": 9.08, + "learning_rate": 5.735205616850552e-05, + "loss": 2.3602, + "theoretical_loss": 3.327146465191671, + "tokens_seen": 2926189568 + }, + { + "epoch": 9.08, + "learning_rate": 5.73420260782347e-05, + "loss": 2.4943, + "theoretical_loss": 3.3271407069578767, + "tokens_seen": 2926255104 + }, + { + "epoch": 9.08, + "learning_rate": 5.7331995987963894e-05, + "loss": 2.445, + "theoretical_loss": 3.3271349488891495, + "tokens_seen": 2926320640 + }, + { + "epoch": 9.08, + "learning_rate": 5.732196589769308e-05, + "loss": 2.3607, + "theoretical_loss": 3.327129190985482, + "tokens_seen": 2926386176 + }, + { + "epoch": 9.08, + "learning_rate": 5.731193580742227e-05, + "loss": 2.348, + "theoretical_loss": 3.327123433246864, + "tokens_seen": 2926451712 + }, + { + "epoch": 9.08, + "learning_rate": 5.730190571715146e-05, + "loss": 2.4271, + "theoretical_loss": 3.3271176756732883, + "tokens_seen": 2926517248 + }, + { + "epoch": 9.08, + "learning_rate": 5.7291875626880646e-05, + "loss": 2.4196, + "theoretical_loss": 3.3271119182647455, + "tokens_seen": 2926582784 + }, + { + "epoch": 9.08, + "learning_rate": 5.7281845536609833e-05, + "loss": 2.4746, + "theoretical_loss": 3.3271061610212285, + "tokens_seen": 2926648320 + }, + { + "epoch": 9.08, + "learning_rate": 5.727181544633902e-05, + "loss": 2.5479, + "theoretical_loss": 3.327100403942728, + "tokens_seen": 2926713856 + }, + { + "epoch": 9.08, + "learning_rate": 5.726178535606821e-05, + "loss": 2.5, + "theoretical_loss": 3.3270946470292353, + "tokens_seen": 2926779392 + }, + { + "epoch": 9.08, + "learning_rate": 5.725175526579739e-05, + "loss": 2.5109, + "theoretical_loss": 3.3270888902807427, + "tokens_seen": 2926844928 + }, + { + "epoch": 9.08, + "learning_rate": 5.724172517552658e-05, + "loss": 2.5407, + "theoretical_loss": 3.3270831336972417, + "tokens_seen": 2926910464 + }, + { + "epoch": 9.08, + "learning_rate": 5.7231695085255766e-05, + "loss": 2.2877, + "theoretical_loss": 3.327077377278723, + "tokens_seen": 2926976000 + }, + { + "epoch": 9.08, + "learning_rate": 5.7221664994984954e-05, + "loss": 2.5192, + "theoretical_loss": 3.3270716210251794, + "tokens_seen": 2927041536 + }, + { + "epoch": 9.08, + "learning_rate": 5.721163490471414e-05, + "loss": 2.3932, + "theoretical_loss": 3.3270658649366016, + "tokens_seen": 2927107072 + }, + { + "epoch": 9.08, + "learning_rate": 5.720160481444333e-05, + "loss": 2.3773, + "theoretical_loss": 3.3270601090129817, + "tokens_seen": 2927172608 + }, + { + "epoch": 9.08, + "learning_rate": 5.719157472417252e-05, + "loss": 2.4653, + "theoretical_loss": 3.327054353254311, + "tokens_seen": 2927238144 + }, + { + "epoch": 9.08, + "learning_rate": 5.7181544633901706e-05, + "loss": 2.6643, + "theoretical_loss": 3.327048597660581, + "tokens_seen": 2927303680 + }, + { + "epoch": 9.08, + "learning_rate": 5.7171514543630893e-05, + "loss": 2.5236, + "theoretical_loss": 3.3270428422317835, + "tokens_seen": 2927369216 + }, + { + "epoch": 9.08, + "learning_rate": 5.716148445336008e-05, + "loss": 2.5213, + "theoretical_loss": 3.32703708696791, + "tokens_seen": 2927434752 + }, + { + "epoch": 9.08, + "learning_rate": 5.715145436308926e-05, + "loss": 2.5188, + "theoretical_loss": 3.327031331868952, + "tokens_seen": 2927500288 + }, + { + "epoch": 9.08, + "learning_rate": 5.714142427281846e-05, + "loss": 2.4086, + "theoretical_loss": 3.3270255769349015, + "tokens_seen": 2927565824 + }, + { + "epoch": 9.08, + "learning_rate": 5.7131394182547645e-05, + "loss": 2.4717, + "theoretical_loss": 3.3270198221657497, + "tokens_seen": 2927631360 + }, + { + "epoch": 9.08, + "learning_rate": 5.712136409227683e-05, + "loss": 2.5102, + "theoretical_loss": 3.3270140675614877, + "tokens_seen": 2927696896 + }, + { + "epoch": 9.08, + "learning_rate": 5.711133400200602e-05, + "loss": 2.3876, + "theoretical_loss": 3.327008313122108, + "tokens_seen": 2927762432 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3191818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2598812580108643, + "objective/train/theoretical_loss": 3.3270039974007717, + "objective/train/tokens_used": 2948271584, + "theoretical_loss": 3.3270039974007717, + "tokens_seen": 2927811584 + }, + { + "epoch": 9.08, + "learning_rate": 5.710130391173521e-05, + "loss": 2.5312, + "theoretical_loss": 3.3270025588476018, + "tokens_seen": 2927827968 + }, + { + "epoch": 9.08, + "learning_rate": 5.70912738214644e-05, + "loss": 2.43, + "theoretical_loss": 3.3269968047379606, + "tokens_seen": 2927893504 + }, + { + "epoch": 9.08, + "learning_rate": 5.7081243731193585e-05, + "loss": 2.4082, + "theoretical_loss": 3.326991050793176, + "tokens_seen": 2927959040 + }, + { + "epoch": 9.08, + "learning_rate": 5.707121364092277e-05, + "loss": 2.6866, + "theoretical_loss": 3.32698529701324, + "tokens_seen": 2928024576 + }, + { + "epoch": 9.08, + "learning_rate": 5.706118355065196e-05, + "loss": 2.5519, + "theoretical_loss": 3.3269795433981435, + "tokens_seen": 2928090112 + }, + { + "epoch": 9.08, + "learning_rate": 5.705115346038114e-05, + "loss": 2.6882, + "theoretical_loss": 3.3269737899478784, + "tokens_seen": 2928155648 + }, + { + "epoch": 9.08, + "learning_rate": 5.704112337011033e-05, + "loss": 2.5227, + "theoretical_loss": 3.3269680366624366, + "tokens_seen": 2928221184 + }, + { + "epoch": 9.08, + "learning_rate": 5.703109327983952e-05, + "loss": 2.4699, + "theoretical_loss": 3.326962283541809, + "tokens_seen": 2928286720 + }, + { + "epoch": 9.08, + "learning_rate": 5.7021063189568705e-05, + "loss": 2.4169, + "theoretical_loss": 3.326956530585988, + "tokens_seen": 2928352256 + }, + { + "epoch": 9.08, + "learning_rate": 5.701103309929789e-05, + "loss": 2.2983, + "theoretical_loss": 3.326950777794965, + "tokens_seen": 2928417792 + }, + { + "epoch": 9.08, + "learning_rate": 5.700100300902708e-05, + "loss": 2.5344, + "theoretical_loss": 3.326945025168731, + "tokens_seen": 2928483328 + }, + { + "epoch": 9.08, + "learning_rate": 5.699097291875627e-05, + "loss": 2.3298, + "theoretical_loss": 3.326939272707278, + "tokens_seen": 2928548864 + }, + { + "epoch": 9.08, + "learning_rate": 5.698094282848546e-05, + "loss": 2.4099, + "theoretical_loss": 3.3269335204105976, + "tokens_seen": 2928614400 + }, + { + "epoch": 9.08, + "learning_rate": 5.6970912738214645e-05, + "loss": 2.3439, + "theoretical_loss": 3.3269277682786815, + "tokens_seen": 2928679936 + }, + { + "epoch": 9.08, + "learning_rate": 5.696088264794384e-05, + "loss": 2.463, + "theoretical_loss": 3.326922016311521, + "tokens_seen": 2928745472 + }, + { + "epoch": 9.08, + "learning_rate": 5.695085255767302e-05, + "loss": 2.7432, + "theoretical_loss": 3.326916264509108, + "tokens_seen": 2928811008 + }, + { + "epoch": 9.08, + "learning_rate": 5.694082246740221e-05, + "loss": 2.702, + "theoretical_loss": 3.326910512871434, + "tokens_seen": 2928876544 + }, + { + "epoch": 9.08, + "learning_rate": 5.6930792377131396e-05, + "loss": 2.4397, + "theoretical_loss": 3.3269047613984903, + "tokens_seen": 2928942080 + }, + { + "epoch": 9.08, + "learning_rate": 5.6920762286860584e-05, + "loss": 2.4252, + "theoretical_loss": 3.3268990100902687, + "tokens_seen": 2929007616 + }, + { + "epoch": 9.08, + "learning_rate": 5.691073219658977e-05, + "loss": 2.495, + "theoretical_loss": 3.326893258946761, + "tokens_seen": 2929073152 + }, + { + "epoch": 9.08, + "learning_rate": 5.690070210631896e-05, + "loss": 2.3708, + "theoretical_loss": 3.326887507967959, + "tokens_seen": 2929138688 + }, + { + "epoch": 9.08, + "learning_rate": 5.689067201604815e-05, + "loss": 2.4151, + "theoretical_loss": 3.3268817571538536, + "tokens_seen": 2929204224 + }, + { + "epoch": 9.08, + "learning_rate": 5.6880641925777336e-05, + "loss": 2.2346, + "theoretical_loss": 3.3268760065044365, + "tokens_seen": 2929269760 + }, + { + "epoch": 9.08, + "learning_rate": 5.6870611835506524e-05, + "loss": 2.4267, + "theoretical_loss": 3.3268702560196997, + "tokens_seen": 2929335296 + }, + { + "epoch": 9.08, + "learning_rate": 5.6860581745235705e-05, + "loss": 2.46, + "theoretical_loss": 3.3268645056996347, + "tokens_seen": 2929400832 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3192546, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1894571781158447, + "objective/train/theoretical_loss": 3.3268601930676467, + "objective/train/tokens_used": 2949909984, + "theoretical_loss": 3.3268601930676467, + "tokens_seen": 2929449984 + }, + { + "epoch": 9.08, + "learning_rate": 5.685055165496489e-05, + "loss": 2.4776, + "theoretical_loss": 3.326858755544233, + "tokens_seen": 2929466368 + }, + { + "epoch": 9.08, + "learning_rate": 5.684052156469408e-05, + "loss": 2.3369, + "theoretical_loss": 3.3268530055534864, + "tokens_seen": 2929531904 + }, + { + "epoch": 9.08, + "learning_rate": 5.683049147442327e-05, + "loss": 2.5274, + "theoretical_loss": 3.326847255727386, + "tokens_seen": 2929597440 + }, + { + "epoch": 9.08, + "learning_rate": 5.6820461384152456e-05, + "loss": 2.2952, + "theoretical_loss": 3.3268415060659238, + "tokens_seen": 2929662976 + }, + { + "epoch": 9.08, + "learning_rate": 5.6810431293881644e-05, + "loss": 2.4816, + "theoretical_loss": 3.3268357565690914, + "tokens_seen": 2929728512 + }, + { + "epoch": 9.08, + "learning_rate": 5.680040120361083e-05, + "loss": 2.4588, + "theoretical_loss": 3.3268300072368806, + "tokens_seen": 2929794048 + }, + { + "epoch": 9.08, + "learning_rate": 5.679037111334002e-05, + "loss": 2.3212, + "theoretical_loss": 3.3268242580692826, + "tokens_seen": 2929859584 + }, + { + "epoch": 9.08, + "learning_rate": 5.678034102306921e-05, + "loss": 2.5454, + "theoretical_loss": 3.326818509066289, + "tokens_seen": 2929925120 + }, + { + "epoch": 9.08, + "learning_rate": 5.67703109327984e-05, + "loss": 2.321, + "theoretical_loss": 3.3268127602278916, + "tokens_seen": 2929990656 + }, + { + "epoch": 9.08, + "learning_rate": 5.6760280842527584e-05, + "loss": 2.2121, + "theoretical_loss": 3.3268070115540818, + "tokens_seen": 2930056192 + }, + { + "epoch": 9.08, + "learning_rate": 5.675025075225677e-05, + "loss": 2.4355, + "theoretical_loss": 3.326801263044852, + "tokens_seen": 2930121728 + }, + { + "epoch": 9.08, + "learning_rate": 5.674022066198596e-05, + "loss": 2.3702, + "theoretical_loss": 3.3267955147001924, + "tokens_seen": 2930187264 + }, + { + "epoch": 9.08, + "learning_rate": 5.673019057171515e-05, + "loss": 2.5837, + "theoretical_loss": 3.326789766520096, + "tokens_seen": 2930252800 + }, + { + "epoch": 9.08, + "learning_rate": 5.6720160481444335e-05, + "loss": 2.5498, + "theoretical_loss": 3.3267840185045534, + "tokens_seen": 2930318336 + }, + { + "epoch": 9.08, + "learning_rate": 5.671013039117352e-05, + "loss": 2.2919, + "theoretical_loss": 3.326778270653557, + "tokens_seen": 2930383872 + }, + { + "epoch": 9.08, + "learning_rate": 5.670010030090271e-05, + "loss": 2.6543, + "theoretical_loss": 3.3267725229670977, + "tokens_seen": 2930449408 + }, + { + "epoch": 9.08, + "learning_rate": 5.66900702106319e-05, + "loss": 2.503, + "theoretical_loss": 3.3267667754451673, + "tokens_seen": 2930514944 + }, + { + "epoch": 9.08, + "learning_rate": 5.668004012036109e-05, + "loss": 2.4618, + "theoretical_loss": 3.3267610280877578, + "tokens_seen": 2930580480 + }, + { + "epoch": 9.08, + "learning_rate": 5.6670010030090275e-05, + "loss": 2.563, + "theoretical_loss": 3.3267552808948606, + "tokens_seen": 2930646016 + }, + { + "epoch": 9.08, + "learning_rate": 5.6659979939819456e-05, + "loss": 2.4816, + "theoretical_loss": 3.326749533866467, + "tokens_seen": 2930711552 + }, + { + "epoch": 9.08, + "learning_rate": 5.6649949849548644e-05, + "loss": 2.6618, + "theoretical_loss": 3.326743787002569, + "tokens_seen": 2930777088 + }, + { + "epoch": 9.08, + "learning_rate": 5.663991975927783e-05, + "loss": 2.1551, + "theoretical_loss": 3.326738040303158, + "tokens_seen": 2930842624 + }, + { + "epoch": 9.08, + "learning_rate": 5.662988966900702e-05, + "loss": 2.3727, + "theoretical_loss": 3.326732293768226, + "tokens_seen": 2930908160 + }, + { + "epoch": 9.08, + "learning_rate": 5.661985957873621e-05, + "loss": 2.5549, + "theoretical_loss": 3.326726547397764, + "tokens_seen": 2930973696 + }, + { + "epoch": 9.08, + "learning_rate": 5.6609829488465395e-05, + "loss": 2.3076, + "theoretical_loss": 3.326720801191764, + "tokens_seen": 2931039232 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3193665, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3379178047180176, + "objective/train/theoretical_loss": 3.326716491645187, + "objective/train/tokens_used": 2951548384, + "theoretical_loss": 3.326716491645187, + "tokens_seen": 2931088384 + }, + { + "epoch": 9.08, + "learning_rate": 5.659979939819458e-05, + "loss": 2.1983, + "theoretical_loss": 3.326715055150218, + "tokens_seen": 2931104768 + }, + { + "epoch": 9.08, + "learning_rate": 5.658976930792377e-05, + "loss": 2.3597, + "theoretical_loss": 3.3267093092731166, + "tokens_seen": 2931170304 + }, + { + "epoch": 9.08, + "learning_rate": 5.6579739217652966e-05, + "loss": 2.3941, + "theoretical_loss": 3.326703563560452, + "tokens_seen": 2931235840 + }, + { + "epoch": 9.08, + "learning_rate": 5.6569709127382154e-05, + "loss": 2.5807, + "theoretical_loss": 3.326697818012216, + "tokens_seen": 2931301376 + }, + { + "epoch": 9.08, + "learning_rate": 5.6559679037111335e-05, + "loss": 2.4978, + "theoretical_loss": 3.3266920726283997, + "tokens_seen": 2931366912 + }, + { + "epoch": 9.08, + "learning_rate": 5.654964894684052e-05, + "loss": 2.3507, + "theoretical_loss": 3.3266863274089955, + "tokens_seen": 2931432448 + }, + { + "epoch": 9.08, + "learning_rate": 5.653961885656971e-05, + "loss": 2.6635, + "theoretical_loss": 3.3266805823539944, + "tokens_seen": 2931497984 + }, + { + "epoch": 9.08, + "learning_rate": 5.65295887662989e-05, + "loss": 2.5065, + "theoretical_loss": 3.326674837463388, + "tokens_seen": 2931563520 + }, + { + "epoch": 9.08, + "learning_rate": 5.6519558676028086e-05, + "loss": 2.4677, + "theoretical_loss": 3.326669092737168, + "tokens_seen": 2931629056 + }, + { + "epoch": 9.08, + "learning_rate": 5.6509528585757274e-05, + "loss": 2.2968, + "theoretical_loss": 3.3266633481753267, + "tokens_seen": 2931694592 + }, + { + "epoch": 9.08, + "learning_rate": 5.649949849548646e-05, + "loss": 2.4249, + "theoretical_loss": 3.326657603777855, + "tokens_seen": 2931760128 + }, + { + "epoch": 9.08, + "learning_rate": 5.648946840521565e-05, + "loss": 2.4154, + "theoretical_loss": 3.326651859544744, + "tokens_seen": 2931825664 + }, + { + "epoch": 9.08, + "learning_rate": 5.647943831494484e-05, + "loss": 2.5522, + "theoretical_loss": 3.3266461154759863, + "tokens_seen": 2931891200 + }, + { + "epoch": 9.08, + "learning_rate": 5.646940822467402e-05, + "loss": 2.3864, + "theoretical_loss": 3.3266403715715738, + "tokens_seen": 2931956736 + }, + { + "epoch": 9.08, + "learning_rate": 5.645937813440321e-05, + "loss": 2.5131, + "theoretical_loss": 3.326634627831497, + "tokens_seen": 2932022272 + }, + { + "epoch": 9.08, + "learning_rate": 5.6449348044132395e-05, + "loss": 2.4257, + "theoretical_loss": 3.326628884255748, + "tokens_seen": 2932087808 + }, + { + "epoch": 9.08, + "learning_rate": 5.643931795386158e-05, + "loss": 2.4475, + "theoretical_loss": 3.3266231408443185, + "tokens_seen": 2932153344 + }, + { + "epoch": 9.08, + "learning_rate": 5.642928786359077e-05, + "loss": 2.5088, + "theoretical_loss": 3.3266173975972, + "tokens_seen": 2932218880 + }, + { + "epoch": 9.08, + "learning_rate": 5.641925777331996e-05, + "loss": 2.4845, + "theoretical_loss": 3.3266116545143847, + "tokens_seen": 2932284416 + }, + { + "epoch": 9.08, + "learning_rate": 5.6409227683049147e-05, + "loss": 2.2236, + "theoretical_loss": 3.3266059115958635, + "tokens_seen": 2932349952 + }, + { + "epoch": 9.08, + "learning_rate": 5.639919759277834e-05, + "loss": 2.6277, + "theoretical_loss": 3.3266001688416287, + "tokens_seen": 2932415488 + }, + { + "epoch": 9.08, + "learning_rate": 5.638916750250753e-05, + "loss": 2.5023, + "theoretical_loss": 3.326594426251671, + "tokens_seen": 2932481024 + }, + { + "epoch": 9.08, + "learning_rate": 5.637913741223672e-05, + "loss": 2.2623, + "theoretical_loss": 3.3265886838259826, + "tokens_seen": 2932546560 + }, + { + "epoch": 9.08, + "learning_rate": 5.63691073219659e-05, + "loss": 2.3434, + "theoretical_loss": 3.3265829415645554, + "tokens_seen": 2932612096 + }, + { + "epoch": 9.08, + "learning_rate": 5.6359077231695086e-05, + "loss": 2.2962, + "theoretical_loss": 3.3265771994673803, + "tokens_seen": 2932677632 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3194094, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.189481258392334, + "objective/train/theoretical_loss": 3.326572893002285, + "objective/train/tokens_used": 2953186784, + "theoretical_loss": 3.326572893002285, + "tokens_seen": 2932726784 + }, + { + "epoch": 9.08, + "learning_rate": 5.6349047141424274e-05, + "loss": 2.4371, + "theoretical_loss": 3.32657145753445, + "tokens_seen": 2932743168 + }, + { + "epoch": 9.08, + "learning_rate": 5.633901705115346e-05, + "loss": 2.3472, + "theoretical_loss": 3.3265657157657547, + "tokens_seen": 2932808704 + }, + { + "epoch": 9.08, + "learning_rate": 5.632898696088265e-05, + "loss": 2.267, + "theoretical_loss": 3.3265599741612872, + "tokens_seen": 2932874240 + }, + { + "epoch": 9.08, + "learning_rate": 5.631895687061184e-05, + "loss": 2.5474, + "theoretical_loss": 3.3265542327210387, + "tokens_seen": 2932939776 + }, + { + "epoch": 9.08, + "learning_rate": 5.6308926780341025e-05, + "loss": 2.3154, + "theoretical_loss": 3.326548491445001, + "tokens_seen": 2933005312 + }, + { + "epoch": 9.08, + "learning_rate": 5.6298896690070213e-05, + "loss": 2.4903, + "theoretical_loss": 3.3265427503331653, + "tokens_seen": 2933070848 + }, + { + "epoch": 9.08, + "learning_rate": 5.62888665997994e-05, + "loss": 2.3617, + "theoretical_loss": 3.326537009385524, + "tokens_seen": 2933136384 + }, + { + "epoch": 9.08, + "learning_rate": 5.627883650952859e-05, + "loss": 2.506, + "theoretical_loss": 3.326531268602068, + "tokens_seen": 2933201920 + }, + { + "epoch": 9.08, + "learning_rate": 5.626880641925777e-05, + "loss": 2.5087, + "theoretical_loss": 3.326525527982789, + "tokens_seen": 2933267456 + }, + { + "epoch": 9.08, + "learning_rate": 5.625877632898696e-05, + "loss": 2.5754, + "theoretical_loss": 3.3265197875276793, + "tokens_seen": 2933332992 + }, + { + "epoch": 9.08, + "learning_rate": 5.6248746238716146e-05, + "loss": 2.7431, + "theoretical_loss": 3.3265140472367296, + "tokens_seen": 2933398528 + }, + { + "epoch": 9.08, + "learning_rate": 5.6238716148445334e-05, + "loss": 2.246, + "theoretical_loss": 3.3265083071099326, + "tokens_seen": 2933464064 + }, + { + "epoch": 9.08, + "learning_rate": 5.622868605817452e-05, + "loss": 2.3131, + "theoretical_loss": 3.326502567147279, + "tokens_seen": 2933529600 + }, + { + "epoch": 9.08, + "learning_rate": 5.621865596790371e-05, + "loss": 2.6221, + "theoretical_loss": 3.3264968273487607, + "tokens_seen": 2933595136 + }, + { + "epoch": 9.08, + "learning_rate": 5.6208625877632904e-05, + "loss": 2.4454, + "theoretical_loss": 3.32649108771437, + "tokens_seen": 2933660672 + }, + { + "epoch": 9.08, + "learning_rate": 5.619859578736209e-05, + "loss": 2.4322, + "theoretical_loss": 3.3264853482440975, + "tokens_seen": 2933726208 + }, + { + "epoch": 9.08, + "learning_rate": 5.618856569709128e-05, + "loss": 2.1765, + "theoretical_loss": 3.3264796089379356, + "tokens_seen": 2933791744 + }, + { + "epoch": 9.08, + "learning_rate": 5.617853560682047e-05, + "loss": 2.5184, + "theoretical_loss": 3.3264738697958753, + "tokens_seen": 2933857280 + }, + { + "epoch": 9.08, + "learning_rate": 5.616850551654965e-05, + "loss": 2.4694, + "theoretical_loss": 3.3264681308179087, + "tokens_seen": 2933922816 + }, + { + "epoch": 9.08, + "learning_rate": 5.615847542627884e-05, + "loss": 2.596, + "theoretical_loss": 3.3264623920040277, + "tokens_seen": 2933988352 + }, + { + "epoch": 9.08, + "learning_rate": 5.6148445336008025e-05, + "loss": 2.6109, + "theoretical_loss": 3.3264566533542235, + "tokens_seen": 2934053888 + }, + { + "epoch": 9.08, + "learning_rate": 5.613841524573721e-05, + "loss": 2.3455, + "theoretical_loss": 3.3264509148684875, + "tokens_seen": 2934119424 + }, + { + "epoch": 9.08, + "learning_rate": 5.61283851554664e-05, + "loss": 2.5765, + "theoretical_loss": 3.3264451765468124, + "tokens_seen": 2934184960 + }, + { + "epoch": 9.08, + "learning_rate": 5.611835506519559e-05, + "loss": 2.6766, + "theoretical_loss": 3.3264394383891887, + "tokens_seen": 2934250496 + }, + { + "epoch": 9.08, + "learning_rate": 5.610832497492478e-05, + "loss": 2.504, + "theoretical_loss": 3.3264337003956084, + "tokens_seen": 2934316032 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3195329, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3111469745635986, + "objective/train/theoretical_loss": 3.326429397008072, + "objective/train/tokens_used": 2954825184, + "theoretical_loss": 3.326429397008072, + "tokens_seen": 2934365184 + }, + { + "epoch": 9.08, + "learning_rate": 5.6098294884653965e-05, + "loss": 2.5136, + "theoretical_loss": 3.326427962566063, + "tokens_seen": 2934381568 + }, + { + "epoch": 9.08, + "learning_rate": 5.608826479438315e-05, + "loss": 2.2908, + "theoretical_loss": 3.326422224900545, + "tokens_seen": 2934447104 + }, + { + "epoch": 9.08, + "learning_rate": 5.6078234704112334e-05, + "loss": 2.473, + "theoretical_loss": 3.3264164873990447, + "tokens_seen": 2934512640 + }, + { + "epoch": 9.08, + "learning_rate": 5.606820461384152e-05, + "loss": 2.6491, + "theoretical_loss": 3.326410750061555, + "tokens_seen": 2934578176 + }, + { + "epoch": 9.08, + "learning_rate": 5.605817452357071e-05, + "loss": 2.3962, + "theoretical_loss": 3.326405012888067, + "tokens_seen": 2934643712 + }, + { + "epoch": 9.08, + "learning_rate": 5.60481444332999e-05, + "loss": 2.4595, + "theoretical_loss": 3.3263992758785723, + "tokens_seen": 2934709248 + }, + { + "epoch": 9.08, + "learning_rate": 5.6038114343029085e-05, + "loss": 2.6483, + "theoretical_loss": 3.3263935390330626, + "tokens_seen": 2934774784 + }, + { + "epoch": 9.08, + "learning_rate": 5.602808425275827e-05, + "loss": 2.4001, + "theoretical_loss": 3.3263878023515296, + "tokens_seen": 2934840320 + }, + { + "epoch": 9.08, + "learning_rate": 5.601805416248747e-05, + "loss": 2.4839, + "theoretical_loss": 3.3263820658339647, + "tokens_seen": 2934905856 + }, + { + "epoch": 9.08, + "learning_rate": 5.6008024072216656e-05, + "loss": 2.6799, + "theoretical_loss": 3.32637632948036, + "tokens_seen": 2934971392 + }, + { + "epoch": 9.08, + "learning_rate": 5.5997993981945844e-05, + "loss": 2.4471, + "theoretical_loss": 3.326370593290707, + "tokens_seen": 2935036928 + }, + { + "epoch": 9.08, + "learning_rate": 5.598796389167503e-05, + "loss": 2.3955, + "theoretical_loss": 3.326364857264997, + "tokens_seen": 2935102464 + }, + { + "epoch": 9.08, + "learning_rate": 5.597793380140421e-05, + "loss": 2.4075, + "theoretical_loss": 3.326359121403222, + "tokens_seen": 2935168000 + }, + { + "epoch": 9.08, + "learning_rate": 5.59679037111334e-05, + "loss": 2.3189, + "theoretical_loss": 3.326353385705374, + "tokens_seen": 2935233536 + }, + { + "epoch": 9.08, + "learning_rate": 5.595787362086259e-05, + "loss": 2.425, + "theoretical_loss": 3.326347650171444, + "tokens_seen": 2935299072 + }, + { + "epoch": 9.08, + "learning_rate": 5.5947843530591776e-05, + "loss": 2.6134, + "theoretical_loss": 3.3263419148014237, + "tokens_seen": 2935364608 + }, + { + "epoch": 9.08, + "learning_rate": 5.5937813440320964e-05, + "loss": 2.4654, + "theoretical_loss": 3.3263361795953053, + "tokens_seen": 2935430144 + }, + { + "epoch": 9.08, + "learning_rate": 5.592778335005015e-05, + "loss": 2.4024, + "theoretical_loss": 3.3263304445530797, + "tokens_seen": 2935495680 + }, + { + "epoch": 9.08, + "learning_rate": 5.591775325977934e-05, + "loss": 2.6017, + "theoretical_loss": 3.326324709674739, + "tokens_seen": 2935561216 + }, + { + "epoch": 9.08, + "learning_rate": 5.590772316950853e-05, + "loss": 2.3102, + "theoretical_loss": 3.3263189749602753, + "tokens_seen": 2935626752 + }, + { + "epoch": 9.08, + "learning_rate": 5.5897693079237716e-05, + "loss": 2.2987, + "theoretical_loss": 3.3263132404096796, + "tokens_seen": 2935692288 + }, + { + "epoch": 9.08, + "learning_rate": 5.5887662988966904e-05, + "loss": 2.5647, + "theoretical_loss": 3.3263075060229434, + "tokens_seen": 2935757824 + }, + { + "epoch": 9.08, + "learning_rate": 5.5877632898696085e-05, + "loss": 2.5274, + "theoretical_loss": 3.3263017718000594, + "tokens_seen": 2935823360 + }, + { + "epoch": 9.08, + "learning_rate": 5.586760280842527e-05, + "loss": 2.3353, + "theoretical_loss": 3.326296037741018, + "tokens_seen": 2935888896 + }, + { + "epoch": 9.08, + "learning_rate": 5.585757271815446e-05, + "loss": 2.4274, + "theoretical_loss": 3.3262903038458114, + "tokens_seen": 2935954432 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3195765, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5897316932678223, + "objective/train/theoretical_loss": 3.3262860035319184, + "objective/train/tokens_used": 2956463584, + "theoretical_loss": 3.3262860035319184, + "tokens_seen": 2936003584 + }, + { + "epoch": 9.08, + "learning_rate": 5.584754262788365e-05, + "loss": 2.509, + "theoretical_loss": 3.3262845701144315, + "tokens_seen": 2936019968 + }, + { + "epoch": 9.08, + "learning_rate": 5.5837512537612836e-05, + "loss": 2.4557, + "theoretical_loss": 3.32627883654687, + "tokens_seen": 2936085504 + }, + { + "epoch": 9.08, + "learning_rate": 5.582748244734203e-05, + "loss": 2.4548, + "theoretical_loss": 3.3262731031431176, + "tokens_seen": 2936151040 + }, + { + "epoch": 9.08, + "learning_rate": 5.581745235707122e-05, + "loss": 2.5145, + "theoretical_loss": 3.3262673699031673, + "tokens_seen": 2936216576 + }, + { + "epoch": 9.08, + "learning_rate": 5.580742226680041e-05, + "loss": 2.6217, + "theoretical_loss": 3.32626163682701, + "tokens_seen": 2936282112 + }, + { + "epoch": 9.08, + "learning_rate": 5.5797392176529595e-05, + "loss": 2.5123, + "theoretical_loss": 3.3262559039146375, + "tokens_seen": 2936347648 + }, + { + "epoch": 9.08, + "learning_rate": 5.5787362086258776e-05, + "loss": 2.5006, + "theoretical_loss": 3.3262501711660413, + "tokens_seen": 2936413184 + }, + { + "epoch": 9.08, + "learning_rate": 5.5777331995987964e-05, + "loss": 2.4704, + "theoretical_loss": 3.3262444385812135, + "tokens_seen": 2936478720 + }, + { + "epoch": 9.08, + "learning_rate": 5.576730190571715e-05, + "loss": 2.3399, + "theoretical_loss": 3.3262387061601455, + "tokens_seen": 2936544256 + }, + { + "epoch": 9.08, + "learning_rate": 5.575727181544634e-05, + "loss": 2.3132, + "theoretical_loss": 3.3262329739028287, + "tokens_seen": 2936609792 + }, + { + "epoch": 9.08, + "learning_rate": 5.574724172517553e-05, + "loss": 2.5072, + "theoretical_loss": 3.3262272418092556, + "tokens_seen": 2936675328 + }, + { + "epoch": 9.08, + "learning_rate": 5.5737211634904715e-05, + "loss": 2.4508, + "theoretical_loss": 3.326221509879417, + "tokens_seen": 2936740864 + }, + { + "epoch": 9.08, + "learning_rate": 5.57271815446339e-05, + "loss": 2.4702, + "theoretical_loss": 3.326215778113305, + "tokens_seen": 2936806400 + }, + { + "epoch": 9.08, + "learning_rate": 5.571715145436309e-05, + "loss": 2.1823, + "theoretical_loss": 3.326210046510911, + "tokens_seen": 2936871936 + }, + { + "epoch": 9.08, + "learning_rate": 5.570712136409228e-05, + "loss": 2.281, + "theoretical_loss": 3.3262043150722267, + "tokens_seen": 2936937472 + }, + { + "epoch": 9.08, + "learning_rate": 5.569709127382147e-05, + "loss": 2.4711, + "theoretical_loss": 3.326198583797244, + "tokens_seen": 2937003008 + }, + { + "epoch": 9.08, + "learning_rate": 5.568706118355065e-05, + "loss": 2.4181, + "theoretical_loss": 3.3261928526859545, + "tokens_seen": 2937068544 + }, + { + "epoch": 9.08, + "learning_rate": 5.5677031093279836e-05, + "loss": 2.2782, + "theoretical_loss": 3.3261871217383496, + "tokens_seen": 2937134080 + }, + { + "epoch": 9.08, + "learning_rate": 5.5667001003009024e-05, + "loss": 2.5314, + "theoretical_loss": 3.326181390954422, + "tokens_seen": 2937199616 + }, + { + "epoch": 9.08, + "learning_rate": 5.565697091273821e-05, + "loss": 2.4165, + "theoretical_loss": 3.3261756603341617, + "tokens_seen": 2937265152 + }, + { + "epoch": 9.08, + "learning_rate": 5.5646940822467406e-05, + "loss": 2.3188, + "theoretical_loss": 3.3261699298775613, + "tokens_seen": 2937330688 + }, + { + "epoch": 9.08, + "learning_rate": 5.5636910732196594e-05, + "loss": 2.5046, + "theoretical_loss": 3.3261641995846127, + "tokens_seen": 2937396224 + }, + { + "epoch": 9.08, + "learning_rate": 5.562688064192578e-05, + "loss": 2.1964, + "theoretical_loss": 3.3261584694553075, + "tokens_seen": 2937461760 + }, + { + "epoch": 9.08, + "learning_rate": 5.561685055165497e-05, + "loss": 2.4279, + "theoretical_loss": 3.326152739489637, + "tokens_seen": 2937527296 + }, + { + "epoch": 9.08, + "learning_rate": 5.560682046138416e-05, + "loss": 2.597, + "theoretical_loss": 3.326147009687593, + "tokens_seen": 2937592832 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3197186, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3860552310943604, + "objective/train/theoretical_loss": 3.326142712443435, + "objective/train/tokens_used": 2958101984, + "theoretical_loss": 3.326142712443435, + "tokens_seen": 2937641984 + }, + { + "epoch": 9.08, + "learning_rate": 5.5596790371113346e-05, + "loss": 2.445, + "theoretical_loss": 3.3261412800491676, + "tokens_seen": 2937658368 + }, + { + "epoch": 9.08, + "learning_rate": 5.558676028084253e-05, + "loss": 2.4494, + "theoretical_loss": 3.3261355505743517, + "tokens_seen": 2937723904 + }, + { + "epoch": 9.08, + "learning_rate": 5.5576730190571715e-05, + "loss": 2.603, + "theoretical_loss": 3.3261298212631374, + "tokens_seen": 2937789440 + }, + { + "epoch": 9.08, + "learning_rate": 5.55667001003009e-05, + "loss": 2.4685, + "theoretical_loss": 3.3261240921155166, + "tokens_seen": 2937854976 + }, + { + "epoch": 9.08, + "learning_rate": 5.555667001003009e-05, + "loss": 2.2927, + "theoretical_loss": 3.3261183631314806, + "tokens_seen": 2937920512 + }, + { + "epoch": 9.08, + "learning_rate": 5.554663991975928e-05, + "loss": 2.3264, + "theoretical_loss": 3.3261126343110217, + "tokens_seen": 2937986048 + }, + { + "epoch": 9.08, + "learning_rate": 5.5536609829488466e-05, + "loss": 2.2551, + "theoretical_loss": 3.3261069056541306, + "tokens_seen": 2938051584 + }, + { + "epoch": 9.08, + "learning_rate": 5.5526579739217654e-05, + "loss": 2.413, + "theoretical_loss": 3.3261011771607993, + "tokens_seen": 2938117120 + }, + { + "epoch": 9.08, + "learning_rate": 5.551654964894684e-05, + "loss": 2.4587, + "theoretical_loss": 3.3260954488310204, + "tokens_seen": 2938182656 + }, + { + "epoch": 9.08, + "learning_rate": 5.550651955867603e-05, + "loss": 2.4029, + "theoretical_loss": 3.3260897206647844, + "tokens_seen": 2938248192 + }, + { + "epoch": 9.08, + "learning_rate": 5.549648946840521e-05, + "loss": 2.4982, + "theoretical_loss": 3.3260839926620833, + "tokens_seen": 2938313728 + }, + { + "epoch": 9.08, + "learning_rate": 5.54864593781344e-05, + "loss": 2.3934, + "theoretical_loss": 3.3260782648229092, + "tokens_seen": 2938379264 + }, + { + "epoch": 9.08, + "learning_rate": 5.547642928786359e-05, + "loss": 2.3241, + "theoretical_loss": 3.3260725371472537, + "tokens_seen": 2938444800 + }, + { + "epoch": 9.08, + "learning_rate": 5.5466399197592775e-05, + "loss": 2.4509, + "theoretical_loss": 3.326066809635108, + "tokens_seen": 2938510336 + }, + { + "epoch": 9.08, + "learning_rate": 5.545636910732197e-05, + "loss": 2.4761, + "theoretical_loss": 3.326061082286464, + "tokens_seen": 2938575872 + }, + { + "epoch": 9.08, + "learning_rate": 5.544633901705116e-05, + "loss": 2.4123, + "theoretical_loss": 3.326055355101314, + "tokens_seen": 2938641408 + }, + { + "epoch": 9.08, + "learning_rate": 5.5436308926780345e-05, + "loss": 2.384, + "theoretical_loss": 3.3260496280796485, + "tokens_seen": 2938706944 + }, + { + "epoch": 9.08, + "learning_rate": 5.542627883650953e-05, + "loss": 2.2793, + "theoretical_loss": 3.32604390122146, + "tokens_seen": 2938772480 + }, + { + "epoch": 9.08, + "learning_rate": 5.541624874623872e-05, + "loss": 2.4321, + "theoretical_loss": 3.32603817452674, + "tokens_seen": 2938838016 + }, + { + "epoch": 9.08, + "learning_rate": 5.540621865596791e-05, + "loss": 2.5987, + "theoretical_loss": 3.3260324479954804, + "tokens_seen": 2938903552 + }, + { + "epoch": 9.08, + "learning_rate": 5.539618856569709e-05, + "loss": 2.3018, + "theoretical_loss": 3.326026721627673, + "tokens_seen": 2938969088 + }, + { + "epoch": 9.08, + "learning_rate": 5.538615847542628e-05, + "loss": 2.4039, + "theoretical_loss": 3.326020995423309, + "tokens_seen": 2939034624 + }, + { + "epoch": 9.08, + "learning_rate": 5.5376128385155466e-05, + "loss": 2.5365, + "theoretical_loss": 3.32601526938238, + "tokens_seen": 2939100160 + }, + { + "epoch": 9.08, + "learning_rate": 5.5366098294884654e-05, + "loss": 2.3026, + "theoretical_loss": 3.326009543504878, + "tokens_seen": 2939165696 + }, + { + "epoch": 9.08, + "learning_rate": 5.535606820461384e-05, + "loss": 2.3658, + "theoretical_loss": 3.326003817790795, + "tokens_seen": 2939231232 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3197965, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1224710941314697, + "objective/train/theoretical_loss": 3.325999523612471, + "objective/train/tokens_used": 2959740384, + "theoretical_loss": 3.325999523612471, + "tokens_seen": 2939280384 + }, + { + "epoch": 9.08, + "learning_rate": 5.534603811434303e-05, + "loss": 2.4232, + "theoretical_loss": 3.325998092240122, + "tokens_seen": 2939296768 + }, + { + "epoch": 9.08, + "learning_rate": 5.533600802407222e-05, + "loss": 2.2382, + "theoretical_loss": 3.3259923668528515, + "tokens_seen": 2939362304 + }, + { + "epoch": 9.08, + "learning_rate": 5.5325977933801405e-05, + "loss": 2.5642, + "theoretical_loss": 3.3259866416289743, + "tokens_seen": 2939427840 + }, + { + "epoch": 9.08, + "learning_rate": 5.531594784353059e-05, + "loss": 2.239, + "theoretical_loss": 3.325980916568483, + "tokens_seen": 2939493376 + }, + { + "epoch": 9.08, + "learning_rate": 5.530591775325978e-05, + "loss": 2.508, + "theoretical_loss": 3.3259751916713682, + "tokens_seen": 2939558912 + }, + { + "epoch": 9.08, + "learning_rate": 5.529588766298896e-05, + "loss": 2.1737, + "theoretical_loss": 3.325969466937623, + "tokens_seen": 2939624448 + }, + { + "epoch": 9.08, + "learning_rate": 5.528585757271815e-05, + "loss": 2.5377, + "theoretical_loss": 3.3259637423672377, + "tokens_seen": 2939689984 + }, + { + "epoch": 9.08, + "learning_rate": 5.527582748244734e-05, + "loss": 2.3683, + "theoretical_loss": 3.325958017960205, + "tokens_seen": 2939755520 + }, + { + "epoch": 9.08, + "learning_rate": 5.526579739217653e-05, + "loss": 2.4816, + "theoretical_loss": 3.325952293716516, + "tokens_seen": 2939821056 + }, + { + "epoch": 9.08, + "learning_rate": 5.525576730190572e-05, + "loss": 2.2526, + "theoretical_loss": 3.3259465696361628, + "tokens_seen": 2939886592 + }, + { + "epoch": 9.08, + "learning_rate": 5.524573721163491e-05, + "loss": 2.5634, + "theoretical_loss": 3.325940845719137, + "tokens_seen": 2939952128 + }, + { + "epoch": 9.08, + "learning_rate": 5.5235707121364097e-05, + "loss": 2.4132, + "theoretical_loss": 3.32593512196543, + "tokens_seen": 2940017664 + }, + { + "epoch": 9.08, + "learning_rate": 5.5225677031093284e-05, + "loss": 2.5093, + "theoretical_loss": 3.3259293983750338, + "tokens_seen": 2940083200 + }, + { + "epoch": 9.08, + "learning_rate": 5.521564694082247e-05, + "loss": 2.4712, + "theoretical_loss": 3.32592367494794, + "tokens_seen": 2940148736 + }, + { + "epoch": 9.08, + "learning_rate": 5.520561685055166e-05, + "loss": 2.5517, + "theoretical_loss": 3.325917951684141, + "tokens_seen": 2940214272 + }, + { + "epoch": 9.08, + "learning_rate": 5.519558676028084e-05, + "loss": 2.2979, + "theoretical_loss": 3.325912228583627, + "tokens_seen": 2940279808 + }, + { + "epoch": 9.08, + "learning_rate": 5.518555667001003e-05, + "loss": 2.4503, + "theoretical_loss": 3.3259065056463903, + "tokens_seen": 2940345344 + }, + { + "epoch": 9.08, + "learning_rate": 5.517552657973922e-05, + "loss": 2.2376, + "theoretical_loss": 3.3259007828724236, + "tokens_seen": 2940410880 + }, + { + "epoch": 9.08, + "learning_rate": 5.5165496489468405e-05, + "loss": 2.2547, + "theoretical_loss": 3.3258950602617174, + "tokens_seen": 2940476416 + }, + { + "epoch": 9.08, + "learning_rate": 5.515546639919759e-05, + "loss": 2.1802, + "theoretical_loss": 3.3258893378142638, + "tokens_seen": 2940541952 + }, + { + "epoch": 9.08, + "learning_rate": 5.514543630892678e-05, + "loss": 2.5044, + "theoretical_loss": 3.3258836155300546, + "tokens_seen": 2940607488 + }, + { + "epoch": 9.08, + "learning_rate": 5.513540621865597e-05, + "loss": 2.4286, + "theoretical_loss": 3.325877893409082, + "tokens_seen": 2940673024 + }, + { + "epoch": 9.08, + "learning_rate": 5.512537612838516e-05, + "loss": 2.4864, + "theoretical_loss": 3.3258721714513366, + "tokens_seen": 2940738560 + }, + { + "epoch": 9.08, + "learning_rate": 5.5115346038114345e-05, + "loss": 2.2856, + "theoretical_loss": 3.325866449656811, + "tokens_seen": 2940804096 + }, + { + "epoch": 9.08, + "learning_rate": 5.5105315947843526e-05, + "loss": 2.4276, + "theoretical_loss": 3.325860728025496, + "tokens_seen": 2940869632 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3199042, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.9589678049087524, + "objective/train/theoretical_loss": 3.3258564369091124, + "objective/train/tokens_used": 2961378784, + "theoretical_loss": 3.3258564369091124, + "tokens_seen": 2940918784 + }, + { + "epoch": 9.08, + "learning_rate": 5.5095285857572714e-05, + "loss": 2.4018, + "theoretical_loss": 3.325855006557384, + "tokens_seen": 2940935168 + }, + { + "epoch": 9.08, + "learning_rate": 5.508525576730191e-05, + "loss": 2.5352, + "theoretical_loss": 3.325849285252467, + "tokens_seen": 2941000704 + }, + { + "epoch": 9.08, + "learning_rate": 5.5075225677031096e-05, + "loss": 2.3707, + "theoretical_loss": 3.325843564110736, + "tokens_seen": 2941066240 + }, + { + "epoch": 9.08, + "learning_rate": 5.5065195586760284e-05, + "loss": 2.4792, + "theoretical_loss": 3.325837843132183, + "tokens_seen": 2941131776 + }, + { + "epoch": 9.08, + "learning_rate": 5.505516549648947e-05, + "loss": 2.4504, + "theoretical_loss": 3.3258321223168, + "tokens_seen": 2941197312 + }, + { + "epoch": 9.08, + "learning_rate": 5.504513540621866e-05, + "loss": 2.4902, + "theoretical_loss": 3.325826401664578, + "tokens_seen": 2941262848 + }, + { + "epoch": 9.08, + "learning_rate": 5.503510531594785e-05, + "loss": 2.3686, + "theoretical_loss": 3.3258206811755096, + "tokens_seen": 2941328384 + }, + { + "epoch": 9.08, + "learning_rate": 5.5025075225677036e-05, + "loss": 2.5264, + "theoretical_loss": 3.3258149608495855, + "tokens_seen": 2941393920 + }, + { + "epoch": 9.08, + "learning_rate": 5.5015045135406224e-05, + "loss": 2.4768, + "theoretical_loss": 3.325809240686798, + "tokens_seen": 2941459456 + }, + { + "epoch": 9.08, + "learning_rate": 5.5005015045135405e-05, + "loss": 2.2841, + "theoretical_loss": 3.3258035206871392, + "tokens_seen": 2941524992 + }, + { + "epoch": 9.08, + "learning_rate": 5.499498495486459e-05, + "loss": 2.2972, + "theoretical_loss": 3.3257978008506, + "tokens_seen": 2941590528 + }, + { + "epoch": 9.08, + "learning_rate": 5.498495486459378e-05, + "loss": 2.1124, + "theoretical_loss": 3.325792081177173, + "tokens_seen": 2941656064 + }, + { + "epoch": 9.08, + "learning_rate": 5.497492477432297e-05, + "loss": 2.2636, + "theoretical_loss": 3.3257863616668493, + "tokens_seen": 2941721600 + }, + { + "epoch": 9.08, + "learning_rate": 5.4964894684052156e-05, + "loss": 2.298, + "theoretical_loss": 3.32578064231962, + "tokens_seen": 2941787136 + }, + { + "epoch": 9.08, + "learning_rate": 5.4954864593781344e-05, + "loss": 2.3985, + "theoretical_loss": 3.3257749231354783, + "tokens_seen": 2941852672 + }, + { + "epoch": 9.08, + "learning_rate": 5.494483450351053e-05, + "loss": 2.4643, + "theoretical_loss": 3.325769204114415, + "tokens_seen": 2941918208 + }, + { + "epoch": 9.08, + "learning_rate": 5.493480441323972e-05, + "loss": 2.3341, + "theoretical_loss": 3.325763485256422, + "tokens_seen": 2941983744 + }, + { + "epoch": 9.08, + "learning_rate": 5.492477432296891e-05, + "loss": 2.4197, + "theoretical_loss": 3.3257577665614906, + "tokens_seen": 2942049280 + }, + { + "epoch": 9.08, + "learning_rate": 5.4914744232698096e-05, + "loss": 2.2435, + "theoretical_loss": 3.325752048029613, + "tokens_seen": 2942114816 + }, + { + "epoch": 9.08, + "learning_rate": 5.490471414242728e-05, + "loss": 2.1588, + "theoretical_loss": 3.3257463296607814, + "tokens_seen": 2942180352 + }, + { + "epoch": 9.08, + "learning_rate": 5.489468405215647e-05, + "loss": 2.6303, + "theoretical_loss": 3.3257406114549863, + "tokens_seen": 2942245888 + }, + { + "epoch": 9.08, + "learning_rate": 5.488465396188566e-05, + "loss": 2.2577, + "theoretical_loss": 3.3257348934122204, + "tokens_seen": 2942311424 + }, + { + "epoch": 9.08, + "learning_rate": 5.487462387161485e-05, + "loss": 2.2812, + "theoretical_loss": 3.325729175532475, + "tokens_seen": 2942376960 + }, + { + "epoch": 9.08, + "learning_rate": 5.4864593781344035e-05, + "loss": 2.136, + "theoretical_loss": 3.325723457815742, + "tokens_seen": 2942442496 + }, + { + "epoch": 9.08, + "learning_rate": 5.485456369107322e-05, + "loss": 2.4322, + "theoretical_loss": 3.3257177402620135, + "tokens_seen": 2942508032 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3199743, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.703411340713501, + "objective/train/theoretical_loss": 3.325713452203683, + "objective/train/tokens_used": 2963017184, + "theoretical_loss": 3.325713452203683, + "tokens_seen": 2942557184 + }, + { + "epoch": 9.08, + "learning_rate": 5.484453360080241e-05, + "loss": 2.4725, + "theoretical_loss": 3.32571202287128, + "tokens_seen": 2942573568 + }, + { + "epoch": 9.08, + "learning_rate": 5.48345035105316e-05, + "loss": 2.5155, + "theoretical_loss": 3.3257063056435343, + "tokens_seen": 2942639104 + }, + { + "epoch": 9.08, + "learning_rate": 5.482447342026079e-05, + "loss": 2.5674, + "theoretical_loss": 3.325700588578768, + "tokens_seen": 2942704640 + }, + { + "epoch": 9.08, + "learning_rate": 5.4814443329989975e-05, + "loss": 2.3328, + "theoretical_loss": 3.325694871676972, + "tokens_seen": 2942770176 + }, + { + "epoch": 9.08, + "learning_rate": 5.4804413239719156e-05, + "loss": 2.513, + "theoretical_loss": 3.3256891549381393, + "tokens_seen": 2942835712 + }, + { + "epoch": 9.08, + "learning_rate": 5.4794383149448344e-05, + "loss": 2.3163, + "theoretical_loss": 3.3256834383622604, + "tokens_seen": 2942901248 + }, + { + "epoch": 9.08, + "learning_rate": 5.478435305917753e-05, + "loss": 2.5203, + "theoretical_loss": 3.325677721949328, + "tokens_seen": 2942966784 + }, + { + "epoch": 9.08, + "learning_rate": 5.477432296890672e-05, + "loss": 2.6005, + "theoretical_loss": 3.3256720056993334, + "tokens_seen": 2943032320 + }, + { + "epoch": 9.08, + "learning_rate": 5.476429287863591e-05, + "loss": 2.6194, + "theoretical_loss": 3.3256662896122684, + "tokens_seen": 2943097856 + }, + { + "epoch": 9.08, + "learning_rate": 5.4754262788365095e-05, + "loss": 2.4018, + "theoretical_loss": 3.3256605736881246, + "tokens_seen": 2943163392 + }, + { + "epoch": 9.08, + "learning_rate": 5.474423269809428e-05, + "loss": 2.5039, + "theoretical_loss": 3.325654857926894, + "tokens_seen": 2943228928 + }, + { + "epoch": 9.08, + "learning_rate": 5.473420260782347e-05, + "loss": 2.3638, + "theoretical_loss": 3.325649142328568, + "tokens_seen": 2943294464 + }, + { + "epoch": 9.08, + "learning_rate": 5.472417251755266e-05, + "loss": 2.4165, + "theoretical_loss": 3.3256434268931385, + "tokens_seen": 2943360000 + }, + { + "epoch": 9.08, + "learning_rate": 5.471414242728184e-05, + "loss": 2.3914, + "theoretical_loss": 3.325637711620597, + "tokens_seen": 2943425536 + }, + { + "epoch": 9.08, + "learning_rate": 5.4704112337011035e-05, + "loss": 2.3637, + "theoretical_loss": 3.3256319965109356, + "tokens_seen": 2943491072 + }, + { + "epoch": 9.08, + "learning_rate": 5.469408224674022e-05, + "loss": 2.3293, + "theoretical_loss": 3.325626281564146, + "tokens_seen": 2943556608 + }, + { + "epoch": 9.08, + "learning_rate": 5.468405215646941e-05, + "loss": 2.388, + "theoretical_loss": 3.32562056678022, + "tokens_seen": 2943622144 + }, + { + "epoch": 9.08, + "learning_rate": 5.46740220661986e-05, + "loss": 2.4127, + "theoretical_loss": 3.3256148521591484, + "tokens_seen": 2943687680 + }, + { + "epoch": 9.08, + "learning_rate": 5.4663991975927786e-05, + "loss": 2.2213, + "theoretical_loss": 3.3256091377009245, + "tokens_seen": 2943753216 + }, + { + "epoch": 9.08, + "learning_rate": 5.4653961885656974e-05, + "loss": 2.1404, + "theoretical_loss": 3.3256034234055387, + "tokens_seen": 2943818752 + }, + { + "epoch": 9.08, + "learning_rate": 5.464393179538616e-05, + "loss": 2.2038, + "theoretical_loss": 3.3255977092729836, + "tokens_seen": 2943884288 + }, + { + "epoch": 9.08, + "learning_rate": 5.463390170511535e-05, + "loss": 2.4087, + "theoretical_loss": 3.3255919953032502, + "tokens_seen": 2943949824 + }, + { + "epoch": 9.08, + "learning_rate": 5.462387161484454e-05, + "loss": 2.3485, + "theoretical_loss": 3.3255862814963306, + "tokens_seen": 2944015360 + }, + { + "epoch": 9.08, + "learning_rate": 5.461384152457372e-05, + "loss": 2.2758, + "theoretical_loss": 3.325580567852217, + "tokens_seen": 2944080896 + }, + { + "epoch": 9.08, + "learning_rate": 5.460381143430291e-05, + "loss": 2.4036, + "theoretical_loss": 3.3255748543709003, + "tokens_seen": 2944146432 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3200276, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2421863079071045, + "objective/train/theoretical_loss": 3.3255705693667434, + "objective/train/tokens_used": 2964655584, + "theoretical_loss": 3.3255705693667434, + "tokens_seen": 2944195584 + }, + { + "epoch": 9.08, + "learning_rate": 5.4593781344032095e-05, + "loss": 2.2372, + "theoretical_loss": 3.3255691410523727, + "tokens_seen": 2944211968 + }, + { + "epoch": 9.08, + "learning_rate": 5.458375125376128e-05, + "loss": 2.4962, + "theoretical_loss": 3.3255634278966255, + "tokens_seen": 2944277504 + }, + { + "epoch": 9.08, + "learning_rate": 5.457372116349047e-05, + "loss": 2.5054, + "theoretical_loss": 3.3255577149036517, + "tokens_seen": 2944343040 + }, + { + "epoch": 9.08, + "learning_rate": 5.456369107321966e-05, + "loss": 2.4563, + "theoretical_loss": 3.3255520020734415, + "tokens_seen": 2944408576 + }, + { + "epoch": 9.08, + "learning_rate": 5.4553660982948846e-05, + "loss": 2.1507, + "theoretical_loss": 3.3255462894059873, + "tokens_seen": 2944474112 + }, + { + "epoch": 9.08, + "learning_rate": 5.4543630892678034e-05, + "loss": 2.219, + "theoretical_loss": 3.3255405769012807, + "tokens_seen": 2944539648 + }, + { + "epoch": 9.08, + "learning_rate": 5.453360080240722e-05, + "loss": 2.3633, + "theoretical_loss": 3.325534864559314, + "tokens_seen": 2944605184 + }, + { + "epoch": 9.08, + "learning_rate": 5.452357071213641e-05, + "loss": 2.3985, + "theoretical_loss": 3.3255291523800783, + "tokens_seen": 2944670720 + }, + { + "epoch": 9.08, + "learning_rate": 5.45135406218656e-05, + "loss": 2.4662, + "theoretical_loss": 3.3255234403635656, + "tokens_seen": 2944736256 + }, + { + "epoch": 9.08, + "learning_rate": 5.4503510531594786e-05, + "loss": 2.4303, + "theoretical_loss": 3.3255177285097677, + "tokens_seen": 2944801792 + }, + { + "epoch": 9.08, + "learning_rate": 5.4493480441323974e-05, + "loss": 2.2693, + "theoretical_loss": 3.325512016818676, + "tokens_seen": 2944867328 + }, + { + "epoch": 9.08, + "learning_rate": 5.448345035105316e-05, + "loss": 2.2636, + "theoretical_loss": 3.3255063052902822, + "tokens_seen": 2944932864 + }, + { + "epoch": 9.08, + "learning_rate": 5.447342026078235e-05, + "loss": 2.5638, + "theoretical_loss": 3.3255005939245788, + "tokens_seen": 2944998400 + }, + { + "epoch": 9.08, + "learning_rate": 5.446339017051154e-05, + "loss": 2.3476, + "theoretical_loss": 3.325494882721557, + "tokens_seen": 2945063936 + }, + { + "epoch": 9.08, + "learning_rate": 5.4453360080240725e-05, + "loss": 2.3696, + "theoretical_loss": 3.3254891716812085, + "tokens_seen": 2945129472 + }, + { + "epoch": 9.08, + "learning_rate": 5.444332998996991e-05, + "loss": 2.3103, + "theoretical_loss": 3.3254834608035253, + "tokens_seen": 2945195008 + }, + { + "epoch": 9.08, + "learning_rate": 5.44332998996991e-05, + "loss": 2.2874, + "theoretical_loss": 3.325477750088499, + "tokens_seen": 2945260544 + }, + { + "epoch": 9.08, + "learning_rate": 5.442326980942829e-05, + "loss": 2.3273, + "theoretical_loss": 3.325472039536121, + "tokens_seen": 2945326080 + }, + { + "epoch": 9.08, + "learning_rate": 5.441323971915747e-05, + "loss": 2.3361, + "theoretical_loss": 3.3254663291463835, + "tokens_seen": 2945391616 + }, + { + "epoch": 9.08, + "learning_rate": 5.440320962888666e-05, + "loss": 2.2555, + "theoretical_loss": 3.3254606189192786, + "tokens_seen": 2945457152 + }, + { + "epoch": 9.08, + "learning_rate": 5.4393179538615846e-05, + "loss": 2.4716, + "theoretical_loss": 3.3254549088547973, + "tokens_seen": 2945522688 + }, + { + "epoch": 9.08, + "learning_rate": 5.4383149448345034e-05, + "loss": 2.2686, + "theoretical_loss": 3.325449198952932, + "tokens_seen": 2945588224 + }, + { + "epoch": 9.08, + "learning_rate": 5.437311935807422e-05, + "loss": 2.0326, + "theoretical_loss": 3.3254434892136735, + "tokens_seen": 2945653760 + }, + { + "epoch": 9.08, + "learning_rate": 5.436308926780341e-05, + "loss": 2.217, + "theoretical_loss": 3.3254377796370145, + "tokens_seen": 2945719296 + }, + { + "epoch": 9.08, + "learning_rate": 5.43530591775326e-05, + "loss": 2.2272, + "theoretical_loss": 3.3254320702229467, + "tokens_seen": 2945784832 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3203828, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.265639543533325, + "objective/train/theoretical_loss": 3.325427788269091, + "objective/train/tokens_used": 2966293984, + "theoretical_loss": 3.325427788269091, + "tokens_seen": 2945833984 + }, + { + "epoch": 9.08, + "learning_rate": 5.4343029087261785e-05, + "loss": 2.4098, + "theoretical_loss": 3.325426360971461, + "tokens_seen": 2945850368 + }, + { + "epoch": 9.08, + "learning_rate": 5.433299899699098e-05, + "loss": 2.4369, + "theoretical_loss": 3.32542065188255, + "tokens_seen": 2945915904 + }, + { + "epoch": 9.08, + "learning_rate": 5.432296890672016e-05, + "loss": 2.3926, + "theoretical_loss": 3.325414942956205, + "tokens_seen": 2945981440 + }, + { + "epoch": 9.08, + "learning_rate": 5.431293881644935e-05, + "loss": 2.5853, + "theoretical_loss": 3.3254092341924184, + "tokens_seen": 2946046976 + }, + { + "epoch": 9.08, + "learning_rate": 5.430290872617854e-05, + "loss": 2.5635, + "theoretical_loss": 3.325403525591181, + "tokens_seen": 2946112512 + }, + { + "epoch": 9.08, + "learning_rate": 5.4292878635907725e-05, + "loss": 2.5757, + "theoretical_loss": 3.325397817152485, + "tokens_seen": 2946178048 + }, + { + "epoch": 9.08, + "learning_rate": 5.428284854563691e-05, + "loss": 2.2682, + "theoretical_loss": 3.3253921088763225, + "tokens_seen": 2946243584 + }, + { + "epoch": 9.08, + "learning_rate": 5.42728184553661e-05, + "loss": 2.3352, + "theoretical_loss": 3.3253864007626848, + "tokens_seen": 2946309120 + }, + { + "epoch": 9.08, + "learning_rate": 5.426278836509529e-05, + "loss": 2.5665, + "theoretical_loss": 3.3253806928115637, + "tokens_seen": 2946374656 + }, + { + "epoch": 9.08, + "learning_rate": 5.4252758274824477e-05, + "loss": 2.4224, + "theoretical_loss": 3.3253749850229513, + "tokens_seen": 2946440192 + }, + { + "epoch": 9.08, + "learning_rate": 5.4242728184553664e-05, + "loss": 2.612, + "theoretical_loss": 3.325369277396839, + "tokens_seen": 2946505728 + }, + { + "epoch": 9.08, + "learning_rate": 5.423269809428285e-05, + "loss": 2.4714, + "theoretical_loss": 3.3253635699332187, + "tokens_seen": 2946571264 + }, + { + "epoch": 9.08, + "learning_rate": 5.4222668004012033e-05, + "loss": 2.4198, + "theoretical_loss": 3.325357862632082, + "tokens_seen": 2946636800 + }, + { + "epoch": 9.08, + "learning_rate": 5.421263791374122e-05, + "loss": 2.6401, + "theoretical_loss": 3.3253521554934213, + "tokens_seen": 2946702336 + }, + { + "epoch": 9.08, + "learning_rate": 5.420260782347041e-05, + "loss": 2.4555, + "theoretical_loss": 3.3253464485172275, + "tokens_seen": 2946767872 + }, + { + "epoch": 9.08, + "learning_rate": 5.41925777331996e-05, + "loss": 2.4211, + "theoretical_loss": 3.325340741703493, + "tokens_seen": 2946833408 + }, + { + "epoch": 9.08, + "learning_rate": 5.4182547642928785e-05, + "loss": 2.2403, + "theoretical_loss": 3.3253350350522086, + "tokens_seen": 2946898944 + }, + { + "epoch": 9.08, + "learning_rate": 5.417251755265797e-05, + "loss": 2.318, + "theoretical_loss": 3.3253293285633676, + "tokens_seen": 2946964480 + }, + { + "epoch": 9.08, + "learning_rate": 5.416248746238716e-05, + "loss": 2.4564, + "theoretical_loss": 3.32532362223696, + "tokens_seen": 2947030016 + }, + { + "epoch": 9.08, + "learning_rate": 5.415245737211635e-05, + "loss": 2.5141, + "theoretical_loss": 3.3253179160729793, + "tokens_seen": 2947095552 + }, + { + "epoch": 9.08, + "learning_rate": 5.4142427281845543e-05, + "loss": 2.5381, + "theoretical_loss": 3.325312210071416, + "tokens_seen": 2947161088 + }, + { + "epoch": 9.08, + "learning_rate": 5.413239719157473e-05, + "loss": 2.2808, + "theoretical_loss": 3.3253065042322625, + "tokens_seen": 2947226624 + }, + { + "epoch": 9.08, + "learning_rate": 5.412236710130391e-05, + "loss": 2.2145, + "theoretical_loss": 3.3253007985555105, + "tokens_seen": 2947292160 + }, + { + "epoch": 9.08, + "learning_rate": 5.41123370110331e-05, + "loss": 2.4295, + "theoretical_loss": 3.3252950930411513, + "tokens_seen": 2947357696 + }, + { + "epoch": 9.08, + "learning_rate": 5.410230692076229e-05, + "loss": 2.3993, + "theoretical_loss": 3.3252893876891774, + "tokens_seen": 2947423232 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5690431594848633, + "objective/train/theoretical_loss": 3.325285108781757, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.325285108781757, + "tokens_seen": 2947472384 + }, + { + "epoch": 9.08, + "learning_rate": 5.4092276830491476e-05, + "loss": 2.2549, + "theoretical_loss": 3.32528368249958, + "tokens_seen": 2947488768 + }, + { + "epoch": 9.08, + "learning_rate": 5.4082246740220664e-05, + "loss": 2.1982, + "theoretical_loss": 3.325277977472351, + "tokens_seen": 2947554304 + }, + { + "epoch": 9.08, + "learning_rate": 5.407221664994985e-05, + "loss": 2.5184, + "theoretical_loss": 3.3252722726074824, + "tokens_seen": 2947619840 + }, + { + "epoch": 9.08, + "learning_rate": 5.406218655967904e-05, + "loss": 2.3927, + "theoretical_loss": 3.3252665679049658, + "tokens_seen": 2947685376 + }, + { + "epoch": 9.08, + "learning_rate": 5.405215646940823e-05, + "loss": 2.3045, + "theoretical_loss": 3.3252608633647927, + "tokens_seen": 2947750912 + }, + { + "epoch": 9.08, + "learning_rate": 5.4042126379137416e-05, + "loss": 2.4727, + "theoretical_loss": 3.325255158986955, + "tokens_seen": 2947816448 + }, + { + "epoch": 9.08, + "learning_rate": 5.4032096288866603e-05, + "loss": 2.4168, + "theoretical_loss": 3.325249454771445, + "tokens_seen": 2947881984 + }, + { + "epoch": 9.08, + "learning_rate": 5.4022066198595785e-05, + "loss": 2.4056, + "theoretical_loss": 3.3252437507182537, + "tokens_seen": 2947947520 + }, + { + "epoch": 9.08, + "learning_rate": 5.401203610832497e-05, + "loss": 2.6808, + "theoretical_loss": 3.3252380468273737, + "tokens_seen": 2948013056 + }, + { + "epoch": 9.08, + "learning_rate": 5.400200601805416e-05, + "loss": 2.3156, + "theoretical_loss": 3.325232343098796, + "tokens_seen": 2948078592 + }, + { + "epoch": 9.08, + "learning_rate": 5.399197592778335e-05, + "loss": 2.586, + "theoretical_loss": 3.325226639532513, + "tokens_seen": 2948144128 + }, + { + "epoch": 9.08, + "learning_rate": 5.3981945837512536e-05, + "loss": 2.4163, + "theoretical_loss": 3.325220936128516, + "tokens_seen": 2948209664 + }, + { + "epoch": 9.08, + "learning_rate": 5.3971915747241724e-05, + "loss": 2.3448, + "theoretical_loss": 3.3252152328867965, + "tokens_seen": 2948275200 + }, + { + "epoch": 9.08, + "learning_rate": 5.396188565697091e-05, + "loss": 2.3836, + "theoretical_loss": 3.325209529807347, + "tokens_seen": 2948340736 + }, + { + "epoch": 9.08, + "learning_rate": 5.395185556670011e-05, + "loss": 2.1909, + "theoretical_loss": 3.325203826890159, + "tokens_seen": 2948406272 + }, + { + "epoch": 9.08, + "learning_rate": 5.3941825476429295e-05, + "loss": 2.3682, + "theoretical_loss": 3.3251981241352246, + "tokens_seen": 2948471808 + }, + { + "epoch": 9.08, + "learning_rate": 5.3931795386158476e-05, + "loss": 2.41, + "theoretical_loss": 3.325192421542535, + "tokens_seen": 2948537344 + }, + { + "epoch": 9.08, + "learning_rate": 5.3921765295887664e-05, + "loss": 2.3598, + "theoretical_loss": 3.3251867191120823, + "tokens_seen": 2948602880 + }, + { + "epoch": 9.08, + "learning_rate": 5.391173520561685e-05, + "loss": 2.2521, + "theoretical_loss": 3.325181016843858, + "tokens_seen": 2948668416 + }, + { + "epoch": 9.08, + "learning_rate": 5.390170511534604e-05, + "loss": 2.3243, + "theoretical_loss": 3.325175314737854, + "tokens_seen": 2948733952 + }, + { + "epoch": 9.08, + "learning_rate": 5.389167502507523e-05, + "loss": 2.3245, + "theoretical_loss": 3.3251696127940624, + "tokens_seen": 2948799488 + }, + { + "epoch": 9.08, + "learning_rate": 5.3881644934804415e-05, + "loss": 2.4956, + "theoretical_loss": 3.325163911012475, + "tokens_seen": 2948865024 + }, + { + "epoch": 9.08, + "learning_rate": 5.38716148445336e-05, + "loss": 2.4595, + "theoretical_loss": 3.325158209393083, + "tokens_seen": 2948930560 + }, + { + "epoch": 9.08, + "learning_rate": 5.386158475426279e-05, + "loss": 2.3643, + "theoretical_loss": 3.325152507935879, + "tokens_seen": 2948996096 + }, + { + "epoch": 9.08, + "learning_rate": 5.385155466399198e-05, + "loss": 2.2632, + "theoretical_loss": 3.3251468066408534, + "tokens_seen": 2949061632 + }, + { + "debugging/Self-BLEU-5": 0.5316186977046168, + "debugging/distinct-1-grams": 0.7543802754715114, + "debugging/distinct-2-grams": 0.9651798356604075, + "debugging/entropy-1-grams": 5.988581925757856, + "debugging/entropy-2-grams": 7.129173159247395, + "debugging/length": 501.1666666666667, + "debugging/num_segments": 18, + "debugging/score": 0.0029595937199075875, + "debugging/score_std": 0.0037509977829476063, + "epoch": 9.08, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.682790994644165, + "objective/train/theoretical_loss": 3.32514253077601, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.32514253077601, + "tokens_seen": 2949110784 + }, + { + "epoch": 9.08, + "learning_rate": 5.384152457372117e-05, + "loss": 2.4137, + "theoretical_loss": 3.325141105507999, + "tokens_seen": 2949127168 + }, + { + "epoch": 9.08, + "learning_rate": 5.383149448345035e-05, + "loss": 2.528, + "theoretical_loss": 3.325135404537308, + "tokens_seen": 2949192704 + }, + { + "epoch": 9.08, + "learning_rate": 5.3821464393179536e-05, + "loss": 2.3492, + "theoretical_loss": 3.3251297037287717, + "tokens_seen": 2949258240 + }, + { + "epoch": 9.08, + "learning_rate": 5.3811434302908724e-05, + "loss": 2.2635, + "theoretical_loss": 3.325124003082381, + "tokens_seen": 2949323776 + }, + { + "epoch": 9.08, + "learning_rate": 5.380140421263791e-05, + "loss": 2.4316, + "theoretical_loss": 3.3251183025981295, + "tokens_seen": 2949389312 + }, + { + "epoch": 9.08, + "learning_rate": 5.37913741223671e-05, + "loss": 2.3915, + "theoretical_loss": 3.3251126022760076, + "tokens_seen": 2949454848 + }, + { + "epoch": 9.08, + "learning_rate": 5.378134403209629e-05, + "loss": 2.2034, + "theoretical_loss": 3.3251069021160076, + "tokens_seen": 2949520384 + }, + { + "epoch": 9.08, + "learning_rate": 5.377131394182548e-05, + "loss": 2.1939, + "theoretical_loss": 3.3251012021181205, + "tokens_seen": 2949585920 + }, + { + "epoch": 9.08, + "learning_rate": 5.376128385155467e-05, + "loss": 2.4435, + "theoretical_loss": 3.3250955022823394, + "tokens_seen": 2949651456 + }, + { + "epoch": 9.08, + "learning_rate": 5.375125376128386e-05, + "loss": 2.5353, + "theoretical_loss": 3.3250898026086553, + "tokens_seen": 2949716992 + }, + { + "epoch": 9.08, + "learning_rate": 5.3741223671013046e-05, + "loss": 2.4775, + "theoretical_loss": 3.32508410309706, + "tokens_seen": 2949782528 + }, + { + "epoch": 9.08, + "learning_rate": 5.373119358074223e-05, + "loss": 2.5483, + "theoretical_loss": 3.325078403747546, + "tokens_seen": 2949848064 + }, + { + "epoch": 9.08, + "learning_rate": 5.3721163490471415e-05, + "loss": 2.2494, + "theoretical_loss": 3.325072704560104, + "tokens_seen": 2949913600 + }, + { + "epoch": 9.08, + "learning_rate": 5.37111334002006e-05, + "loss": 2.2541, + "theoretical_loss": 3.3250670055347262, + "tokens_seen": 2949979136 + }, + { + "epoch": 9.08, + "learning_rate": 5.370110330992979e-05, + "loss": 2.4078, + "theoretical_loss": 3.325061306671405, + "tokens_seen": 2950044672 + }, + { + "epoch": 9.08, + "learning_rate": 5.369107321965898e-05, + "loss": 2.3115, + "theoretical_loss": 3.325055607970131, + "tokens_seen": 2950110208 + }, + { + "epoch": 9.08, + "learning_rate": 5.3681043129388166e-05, + "loss": 2.1734, + "theoretical_loss": 3.325049909430897, + "tokens_seen": 2950175744 + }, + { + "epoch": 9.08, + "learning_rate": 5.3671013039117354e-05, + "loss": 2.5142, + "theoretical_loss": 3.3250442110536946, + "tokens_seen": 2950241280 + }, + { + "epoch": 9.08, + "learning_rate": 5.366098294884654e-05, + "loss": 2.1921, + "theoretical_loss": 3.325038512838516, + "tokens_seen": 2950306816 + }, + { + "epoch": 9.08, + "learning_rate": 5.365095285857573e-05, + "loss": 2.6635, + "theoretical_loss": 3.3250328147853514, + "tokens_seen": 2950372352 + }, + { + "epoch": 9.08, + "learning_rate": 5.364092276830491e-05, + "loss": 2.3846, + "theoretical_loss": 3.3250271168941943, + "tokens_seen": 2950437888 + }, + { + "epoch": 9.08, + "learning_rate": 5.36308926780341e-05, + "loss": 2.3881, + "theoretical_loss": 3.3250214191650356, + "tokens_seen": 2950503424 + }, + { + "epoch": 9.08, + "learning_rate": 5.362086258776329e-05, + "loss": 2.6127, + "theoretical_loss": 3.3250157215978673, + "tokens_seen": 2950568960 + }, + { + "epoch": 9.08, + "learning_rate": 5.3610832497492475e-05, + "loss": 2.2387, + "theoretical_loss": 3.3250100241926814, + "tokens_seen": 2950634496 + }, + { + "epoch": 9.08, + "learning_rate": 5.360080240722166e-05, + "loss": 2.2206, + "theoretical_loss": 3.3250043269494696, + "tokens_seen": 2950700032 + }, + { + "epoch": 9.08, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1176373958587646, + "objective/train/theoretical_loss": 3.3250000541233513, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.3250000541233513, + "tokens_seen": 2950749184 + }, + { + "epoch": 9.08, + "learning_rate": 5.359077231695085e-05, + "loss": 2.4182, + "theoretical_loss": 3.3249986298682233, + "tokens_seen": 2950765568 + }, + { + "epoch": 9.08, + "learning_rate": 5.3580742226680045e-05, + "loss": 2.2395, + "theoretical_loss": 3.324992932948935, + "tokens_seen": 2950831104 + }, + { + "epoch": 9.08, + "learning_rate": 5.357071213640923e-05, + "loss": 2.3242, + "theoretical_loss": 3.324987236191596, + "tokens_seen": 2950896640 + }, + { + "epoch": 9.08, + "learning_rate": 5.356068204613842e-05, + "loss": 2.3836, + "theoretical_loss": 3.324981539596198, + "tokens_seen": 2950962176 + }, + { + "epoch": 9.08, + "learning_rate": 5.355065195586761e-05, + "loss": 2.493, + "theoretical_loss": 3.3249758431627336, + "tokens_seen": 2951027712 + }, + { + "epoch": 9.08, + "learning_rate": 5.354062186559679e-05, + "loss": 2.4027, + "theoretical_loss": 3.324970146891194, + "tokens_seen": 2951093248 + }, + { + "epoch": 9.08, + "learning_rate": 5.353059177532598e-05, + "loss": 2.4256, + "theoretical_loss": 3.3249644507815703, + "tokens_seen": 2951158784 + }, + { + "epoch": 9.08, + "learning_rate": 5.3520561685055166e-05, + "loss": 2.4427, + "theoretical_loss": 3.3249587548338555, + "tokens_seen": 2951224320 + }, + { + "epoch": 9.08, + "learning_rate": 5.3510531594784354e-05, + "loss": 2.2661, + "theoretical_loss": 3.324953059048041, + "tokens_seen": 2951289856 + }, + { + "epoch": 9.08, + "learning_rate": 5.350050150451354e-05, + "loss": 2.276, + "theoretical_loss": 3.3249473634241182, + "tokens_seen": 2951355392 + }, + { + "epoch": 9.09, + "learning_rate": 5.349047141424273e-05, + "loss": 2.407, + "theoretical_loss": 3.32494166796208, + "tokens_seen": 2951420928 + }, + { + "epoch": 9.09, + "learning_rate": 5.348044132397192e-05, + "loss": 2.2531, + "theoretical_loss": 3.3249359726619168, + "tokens_seen": 2951486464 + }, + { + "epoch": 9.09, + "learning_rate": 5.3470411233701105e-05, + "loss": 2.5542, + "theoretical_loss": 3.324930277523621, + "tokens_seen": 2951552000 + }, + { + "epoch": 9.09, + "learning_rate": 5.346038114343029e-05, + "loss": 2.3694, + "theoretical_loss": 3.324924582547185, + "tokens_seen": 2951617536 + }, + { + "epoch": 9.09, + "learning_rate": 5.345035105315948e-05, + "loss": 2.3227, + "theoretical_loss": 3.3249188877325997, + "tokens_seen": 2951683072 + }, + { + "epoch": 9.09, + "learning_rate": 5.344032096288866e-05, + "loss": 2.3584, + "theoretical_loss": 3.3249131930798574, + "tokens_seen": 2951748608 + }, + { + "epoch": 9.09, + "learning_rate": 5.343029087261785e-05, + "loss": 2.3153, + "theoretical_loss": 3.3249074985889497, + "tokens_seen": 2951814144 + }, + { + "epoch": 9.09, + "learning_rate": 5.342026078234704e-05, + "loss": 2.2985, + "theoretical_loss": 3.3249018042598686, + "tokens_seen": 2951879680 + }, + { + "epoch": 9.09, + "learning_rate": 5.3410230692076226e-05, + "loss": 2.3989, + "theoretical_loss": 3.3248961100926056, + "tokens_seen": 2951945216 + }, + { + "epoch": 9.09, + "learning_rate": 5.3400200601805414e-05, + "loss": 2.1593, + "theoretical_loss": 3.324890416087153, + "tokens_seen": 2952010752 + }, + { + "epoch": 9.09, + "learning_rate": 5.339017051153461e-05, + "loss": 2.118, + "theoretical_loss": 3.3248847222435023, + "tokens_seen": 2952076288 + }, + { + "epoch": 9.09, + "learning_rate": 5.3380140421263796e-05, + "loss": 2.2399, + "theoretical_loss": 3.324879028561645, + "tokens_seen": 2952141824 + }, + { + "epoch": 9.09, + "learning_rate": 5.3370110330992984e-05, + "loss": 2.4669, + "theoretical_loss": 3.3248733350415733, + "tokens_seen": 2952207360 + }, + { + "epoch": 9.09, + "learning_rate": 5.336008024072217e-05, + "loss": 2.713, + "theoretical_loss": 3.324867641683279, + "tokens_seen": 2952272896 + }, + { + "epoch": 9.09, + "learning_rate": 5.335005015045136e-05, + "loss": 2.4297, + "theoretical_loss": 3.3248619484867543, + "tokens_seen": 2952338432 + }, + { + "epoch": 9.09, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.643620729446411, + "objective/train/theoretical_loss": 3.3248576786955164, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.3248576786955164, + "tokens_seen": 2952387584 + }, + { + "epoch": 9.09, + "learning_rate": 5.334002006018054e-05, + "loss": 2.3739, + "theoretical_loss": 3.3248562554519903, + "tokens_seen": 2952403968 + }, + { + "epoch": 9.09, + "learning_rate": 5.332998996990973e-05, + "loss": 2.4056, + "theoretical_loss": 3.324850562578979, + "tokens_seen": 2952469504 + }, + { + "epoch": 9.09, + "learning_rate": 5.331995987963892e-05, + "loss": 2.5086, + "theoretical_loss": 3.324844869867712, + "tokens_seen": 2952535040 + }, + { + "epoch": 9.09, + "learning_rate": 5.3309929789368105e-05, + "loss": 2.6663, + "theoretical_loss": 3.3248391773181822, + "tokens_seen": 2952600576 + }, + { + "epoch": 9.09, + "learning_rate": 5.329989969909729e-05, + "loss": 2.2631, + "theoretical_loss": 3.3248334849303802, + "tokens_seen": 2952666112 + }, + { + "epoch": 9.09, + "learning_rate": 5.328986960882648e-05, + "loss": 2.4749, + "theoretical_loss": 3.324827792704298, + "tokens_seen": 2952731648 + }, + { + "epoch": 9.09, + "learning_rate": 5.327983951855567e-05, + "loss": 2.514, + "theoretical_loss": 3.324822100639928, + "tokens_seen": 2952797184 + }, + { + "epoch": 9.09, + "learning_rate": 5.3269809428284857e-05, + "loss": 2.2805, + "theoretical_loss": 3.3248164087372616, + "tokens_seen": 2952862720 + }, + { + "epoch": 9.09, + "learning_rate": 5.3259779338014044e-05, + "loss": 2.4763, + "theoretical_loss": 3.324810716996291, + "tokens_seen": 2952928256 + }, + { + "epoch": 9.09, + "learning_rate": 5.3249749247743226e-05, + "loss": 2.2186, + "theoretical_loss": 3.324805025417007, + "tokens_seen": 2952993792 + }, + { + "epoch": 9.09, + "learning_rate": 5.3239719157472413e-05, + "loss": 2.3333, + "theoretical_loss": 3.324799333999403, + "tokens_seen": 2953059328 + }, + { + "epoch": 9.09, + "learning_rate": 5.32296890672016e-05, + "loss": 2.3801, + "theoretical_loss": 3.3247936427434692, + "tokens_seen": 2953124864 + }, + { + "epoch": 9.09, + "learning_rate": 5.321965897693079e-05, + "loss": 2.3388, + "theoretical_loss": 3.3247879516491987, + "tokens_seen": 2953190400 + }, + { + "epoch": 9.09, + "learning_rate": 5.320962888665998e-05, + "loss": 2.3237, + "theoretical_loss": 3.3247822607165824, + "tokens_seen": 2953255936 + }, + { + "epoch": 9.09, + "learning_rate": 5.319959879638917e-05, + "loss": 2.2789, + "theoretical_loss": 3.324776569945613, + "tokens_seen": 2953321472 + }, + { + "epoch": 9.09, + "learning_rate": 5.318956870611836e-05, + "loss": 2.4965, + "theoretical_loss": 3.324770879336281, + "tokens_seen": 2953387008 + }, + { + "epoch": 9.09, + "learning_rate": 5.317953861584755e-05, + "loss": 2.3367, + "theoretical_loss": 3.32476518888858, + "tokens_seen": 2953452544 + }, + { + "epoch": 9.09, + "learning_rate": 5.3169508525576735e-05, + "loss": 2.2743, + "theoretical_loss": 3.3247594986025004, + "tokens_seen": 2953518080 + }, + { + "epoch": 9.09, + "learning_rate": 5.315947843530592e-05, + "loss": 2.6207, + "theoretical_loss": 3.3247538084780346, + "tokens_seen": 2953583616 + }, + { + "epoch": 9.09, + "learning_rate": 5.3149448345035105e-05, + "loss": 2.33, + "theoretical_loss": 3.3247481185151746, + "tokens_seen": 2953649152 + }, + { + "epoch": 9.09, + "learning_rate": 5.313941825476429e-05, + "loss": 2.387, + "theoretical_loss": 3.3247424287139116, + "tokens_seen": 2953714688 + }, + { + "epoch": 9.09, + "learning_rate": 5.312938816449348e-05, + "loss": 2.3238, + "theoretical_loss": 3.324736739074238, + "tokens_seen": 2953780224 + }, + { + "epoch": 9.09, + "learning_rate": 5.311935807422267e-05, + "loss": 2.3393, + "theoretical_loss": 3.3247310495961453, + "tokens_seen": 2953845760 + }, + { + "epoch": 9.09, + "learning_rate": 5.3109327983951856e-05, + "loss": 2.3503, + "theoretical_loss": 3.3247253602796256, + "tokens_seen": 2953911296 + }, + { + "epoch": 9.09, + "learning_rate": 5.3099297893681044e-05, + "loss": 2.5864, + "theoretical_loss": 3.3247196711246705, + "tokens_seen": 2953976832 + }, + { + "epoch": 9.09, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.465939521789551, + "objective/train/theoretical_loss": 3.3247154043644755, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.3247154043644755, + "tokens_seen": 2954025984 + }, + { + "epoch": 9.09, + "learning_rate": 5.308926780341023e-05, + "loss": 2.5187, + "theoretical_loss": 3.3247139821312715, + "tokens_seen": 2954042368 + }, + { + "epoch": 9.09, + "learning_rate": 5.307923771313942e-05, + "loss": 2.1866, + "theoretical_loss": 3.3247082932994214, + "tokens_seen": 2954107904 + }, + { + "epoch": 9.09, + "learning_rate": 5.306920762286861e-05, + "loss": 2.5285, + "theoretical_loss": 3.324702604629111, + "tokens_seen": 2954173440 + }, + { + "epoch": 9.09, + "learning_rate": 5.3059177532597796e-05, + "loss": 2.3996, + "theoretical_loss": 3.324696916120333, + "tokens_seen": 2954238976 + }, + { + "epoch": 9.09, + "learning_rate": 5.304914744232698e-05, + "loss": 2.4222, + "theoretical_loss": 3.3246912277730782, + "tokens_seen": 2954304512 + }, + { + "epoch": 9.09, + "learning_rate": 5.3039117352056165e-05, + "loss": 2.0511, + "theoretical_loss": 3.3246855395873394, + "tokens_seen": 2954370048 + }, + { + "epoch": 9.09, + "learning_rate": 5.302908726178535e-05, + "loss": 2.4345, + "theoretical_loss": 3.324679851563108, + "tokens_seen": 2954435584 + }, + { + "epoch": 9.09, + "learning_rate": 5.301905717151455e-05, + "loss": 2.232, + "theoretical_loss": 3.324674163700376, + "tokens_seen": 2954501120 + }, + { + "epoch": 9.09, + "learning_rate": 5.3009027081243735e-05, + "loss": 2.3712, + "theoretical_loss": 3.324668475999135, + "tokens_seen": 2954566656 + }, + { + "epoch": 9.09, + "learning_rate": 5.299899699097292e-05, + "loss": 2.0431, + "theoretical_loss": 3.3246627884593773, + "tokens_seen": 2954632192 + }, + { + "epoch": 9.09, + "learning_rate": 5.298896690070211e-05, + "loss": 2.3942, + "theoretical_loss": 3.324657101081094, + "tokens_seen": 2954697728 + }, + { + "epoch": 9.09, + "learning_rate": 5.29789368104313e-05, + "loss": 2.4437, + "theoretical_loss": 3.3246514138642778, + "tokens_seen": 2954763264 + }, + { + "epoch": 9.09, + "learning_rate": 5.296890672016049e-05, + "loss": 2.4692, + "theoretical_loss": 3.3246457268089196, + "tokens_seen": 2954828800 + }, + { + "epoch": 9.09, + "learning_rate": 5.2958876629889675e-05, + "loss": 2.1889, + "theoretical_loss": 3.324640039915012, + "tokens_seen": 2954894336 + }, + { + "epoch": 9.09, + "learning_rate": 5.2948846539618856e-05, + "loss": 2.1485, + "theoretical_loss": 3.3246343531825464, + "tokens_seen": 2954959872 + }, + { + "epoch": 9.09, + "learning_rate": 5.2938816449348044e-05, + "loss": 2.2931, + "theoretical_loss": 3.3246286666115146, + "tokens_seen": 2955025408 + }, + { + "epoch": 9.09, + "learning_rate": 5.292878635907723e-05, + "loss": 2.3934, + "theoretical_loss": 3.3246229802019087, + "tokens_seen": 2955090944 + }, + { + "epoch": 9.09, + "learning_rate": 5.291875626880642e-05, + "loss": 2.2404, + "theoretical_loss": 3.324617293953721, + "tokens_seen": 2955156480 + }, + { + "epoch": 9.09, + "learning_rate": 5.290872617853561e-05, + "loss": 2.3267, + "theoretical_loss": 3.324611607866942, + "tokens_seen": 2955222016 + }, + { + "epoch": 9.09, + "learning_rate": 5.2898696088264795e-05, + "loss": 2.3472, + "theoretical_loss": 3.324605921941565, + "tokens_seen": 2955287552 + }, + { + "epoch": 9.09, + "learning_rate": 5.288866599799398e-05, + "loss": 2.1311, + "theoretical_loss": 3.3246002361775804, + "tokens_seen": 2955353088 + }, + { + "epoch": 9.09, + "learning_rate": 5.287863590772317e-05, + "loss": 2.1092, + "theoretical_loss": 3.3245945505749814, + "tokens_seen": 2955418624 + }, + { + "epoch": 9.09, + "learning_rate": 5.286860581745236e-05, + "loss": 2.3554, + "theoretical_loss": 3.324588865133759, + "tokens_seen": 2955484160 + }, + { + "epoch": 9.09, + "learning_rate": 5.285857572718154e-05, + "loss": 2.3532, + "theoretical_loss": 3.3245831798539056, + "tokens_seen": 2955549696 + }, + { + "epoch": 9.09, + "learning_rate": 5.284854563691073e-05, + "loss": 2.1539, + "theoretical_loss": 3.3245774947354123, + "tokens_seen": 2955615232 + }, + { + "epoch": 9.09, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.45613694190979, + "objective/train/theoretical_loss": 3.3245732310024305, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.3245732310024305, + "tokens_seen": 2955664384 + }, + { + "epoch": 9.09, + "learning_rate": 5.2838515546639916e-05, + "loss": 2.3324, + "theoretical_loss": 3.3245718097782717, + "tokens_seen": 2955680768 + }, + { + "epoch": 9.09, + "learning_rate": 5.282848545636911e-05, + "loss": 2.4114, + "theoretical_loss": 3.3245661249824754, + "tokens_seen": 2955746304 + }, + { + "epoch": 9.09, + "learning_rate": 5.28184553660983e-05, + "loss": 2.4219, + "theoretical_loss": 3.324560440348015, + "tokens_seen": 2955811840 + }, + { + "epoch": 9.09, + "learning_rate": 5.2808425275827486e-05, + "loss": 2.4433, + "theoretical_loss": 3.3245547558748823, + "tokens_seen": 2955877376 + }, + { + "epoch": 9.09, + "learning_rate": 5.2798395185556674e-05, + "loss": 2.5098, + "theoretical_loss": 3.3245490715630694, + "tokens_seen": 2955942912 + }, + { + "epoch": 9.09, + "learning_rate": 5.278836509528586e-05, + "loss": 2.3556, + "theoretical_loss": 3.324543387412568, + "tokens_seen": 2956008448 + }, + { + "epoch": 9.09, + "learning_rate": 5.277833500501505e-05, + "loss": 2.3497, + "theoretical_loss": 3.3245377034233705, + "tokens_seen": 2956073984 + }, + { + "epoch": 9.09, + "learning_rate": 5.276830491474424e-05, + "loss": 2.5724, + "theoretical_loss": 3.324532019595468, + "tokens_seen": 2956139520 + }, + { + "epoch": 9.09, + "learning_rate": 5.275827482447342e-05, + "loss": 2.0553, + "theoretical_loss": 3.324526335928853, + "tokens_seen": 2956205056 + }, + { + "epoch": 9.09, + "learning_rate": 5.274824473420261e-05, + "loss": 2.3336, + "theoretical_loss": 3.3245206524235167, + "tokens_seen": 2956270592 + }, + { + "epoch": 9.09, + "learning_rate": 5.2738214643931795e-05, + "loss": 2.1209, + "theoretical_loss": 3.324514969079451, + "tokens_seen": 2956336128 + }, + { + "epoch": 9.09, + "learning_rate": 5.272818455366098e-05, + "loss": 2.2628, + "theoretical_loss": 3.3245092858966485, + "tokens_seen": 2956401664 + }, + { + "epoch": 9.09, + "learning_rate": 5.271815446339017e-05, + "loss": 2.3879, + "theoretical_loss": 3.3245036028751, + "tokens_seen": 2956467200 + }, + { + "epoch": 9.09, + "learning_rate": 5.270812437311936e-05, + "loss": 2.3524, + "theoretical_loss": 3.324497920014798, + "tokens_seen": 2956532736 + }, + { + "epoch": 9.09, + "learning_rate": 5.2698094282848546e-05, + "loss": 2.3667, + "theoretical_loss": 3.3244922373157344, + "tokens_seen": 2956598272 + }, + { + "epoch": 9.09, + "learning_rate": 5.2688064192577734e-05, + "loss": 2.5585, + "theoretical_loss": 3.324486554777901, + "tokens_seen": 2956663808 + }, + { + "epoch": 9.09, + "learning_rate": 5.267803410230692e-05, + "loss": 2.4104, + "theoretical_loss": 3.3244808724012893, + "tokens_seen": 2956729344 + }, + { + "epoch": 9.09, + "learning_rate": 5.266800401203611e-05, + "loss": 2.2812, + "theoretical_loss": 3.3244751901858915, + "tokens_seen": 2956794880 + }, + { + "epoch": 9.09, + "learning_rate": 5.265797392176529e-05, + "loss": 2.1258, + "theoretical_loss": 3.324469508131699, + "tokens_seen": 2956860416 + }, + { + "epoch": 9.09, + "learning_rate": 5.264794383149448e-05, + "loss": 2.4369, + "theoretical_loss": 3.3244638262387047, + "tokens_seen": 2956925952 + }, + { + "epoch": 9.09, + "learning_rate": 5.2637913741223674e-05, + "loss": 2.5692, + "theoretical_loss": 3.3244581445068992, + "tokens_seen": 2956991488 + }, + { + "epoch": 9.09, + "learning_rate": 5.262788365095286e-05, + "loss": 2.4134, + "theoretical_loss": 3.324452462936275, + "tokens_seen": 2957057024 + }, + { + "epoch": 9.09, + "learning_rate": 5.261785356068205e-05, + "loss": 2.2269, + "theoretical_loss": 3.3244467815268237, + "tokens_seen": 2957122560 + }, + { + "epoch": 9.09, + "learning_rate": 5.260782347041124e-05, + "loss": 2.4854, + "theoretical_loss": 3.3244411002785377, + "tokens_seen": 2957188096 + }, + { + "epoch": 9.09, + "learning_rate": 5.2597793380140425e-05, + "loss": 2.3368, + "theoretical_loss": 3.324435419191408, + "tokens_seen": 2957253632 + }, + { + "epoch": 9.09, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0863213539123535, + "objective/train/theoretical_loss": 3.3244311584818154, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.3244311584818154, + "tokens_seen": 2957302784 + }, + { + "epoch": 9.09, + "learning_rate": 5.258776328986961e-05, + "loss": 2.1362, + "theoretical_loss": 3.3244297382654273, + "tokens_seen": 2957319168 + }, + { + "epoch": 9.09, + "learning_rate": 5.25777331995988e-05, + "loss": 2.6204, + "theoretical_loss": 3.324424057500587, + "tokens_seen": 2957384704 + }, + { + "epoch": 9.09, + "learning_rate": 5.256770310932799e-05, + "loss": 2.5663, + "theoretical_loss": 3.324418376896879, + "tokens_seen": 2957450240 + }, + { + "epoch": 9.09, + "learning_rate": 5.255767301905717e-05, + "loss": 2.3867, + "theoretical_loss": 3.3244126964542957, + "tokens_seen": 2957515776 + }, + { + "epoch": 9.09, + "learning_rate": 5.254764292878636e-05, + "loss": 2.4402, + "theoretical_loss": 3.324407016172828, + "tokens_seen": 2957581312 + }, + { + "epoch": 9.09, + "learning_rate": 5.2537612838515546e-05, + "loss": 2.491, + "theoretical_loss": 3.324401336052468, + "tokens_seen": 2957646848 + }, + { + "epoch": 9.09, + "learning_rate": 5.2527582748244734e-05, + "loss": 2.3939, + "theoretical_loss": 3.324395656093208, + "tokens_seen": 2957712384 + }, + { + "epoch": 9.09, + "learning_rate": 5.251755265797392e-05, + "loss": 2.4361, + "theoretical_loss": 3.3243899762950395, + "tokens_seen": 2957777920 + }, + { + "epoch": 9.09, + "learning_rate": 5.250752256770311e-05, + "loss": 2.3839, + "theoretical_loss": 3.324384296657955, + "tokens_seen": 2957843456 + }, + { + "epoch": 9.09, + "learning_rate": 5.24974924774323e-05, + "loss": 2.316, + "theoretical_loss": 3.3243786171819454, + "tokens_seen": 2957908992 + }, + { + "epoch": 9.09, + "learning_rate": 5.2487462387161485e-05, + "loss": 2.3321, + "theoretical_loss": 3.324372937867003, + "tokens_seen": 2957974528 + }, + { + "epoch": 9.09, + "learning_rate": 5.247743229689067e-05, + "loss": 2.3909, + "theoretical_loss": 3.32436725871312, + "tokens_seen": 2958040064 + }, + { + "epoch": 9.09, + "learning_rate": 5.2467402206619854e-05, + "loss": 2.3548, + "theoretical_loss": 3.324361579720288, + "tokens_seen": 2958105600 + }, + { + "epoch": 9.09, + "learning_rate": 5.245737211634905e-05, + "loss": 2.0473, + "theoretical_loss": 3.3243559008884986, + "tokens_seen": 2958171136 + }, + { + "epoch": 9.09, + "learning_rate": 5.244734202607824e-05, + "loss": 2.0965, + "theoretical_loss": 3.324350222217744, + "tokens_seen": 2958236672 + }, + { + "epoch": 9.09, + "learning_rate": 5.2437311935807425e-05, + "loss": 2.1825, + "theoretical_loss": 3.324344543708016, + "tokens_seen": 2958302208 + }, + { + "epoch": 9.09, + "learning_rate": 5.242728184553661e-05, + "loss": 2.4942, + "theoretical_loss": 3.3243388653593064, + "tokens_seen": 2958367744 + }, + { + "epoch": 9.09, + "learning_rate": 5.24172517552658e-05, + "loss": 2.262, + "theoretical_loss": 3.324333187171607, + "tokens_seen": 2958433280 + }, + { + "epoch": 9.09, + "learning_rate": 5.240722166499499e-05, + "loss": 2.4097, + "theoretical_loss": 3.3243275091449096, + "tokens_seen": 2958498816 + }, + { + "epoch": 9.09, + "learning_rate": 5.2397191574724176e-05, + "loss": 2.5324, + "theoretical_loss": 3.3243218312792067, + "tokens_seen": 2958564352 + }, + { + "epoch": 9.09, + "learning_rate": 5.2387161484453364e-05, + "loss": 2.2398, + "theoretical_loss": 3.3243161535744896, + "tokens_seen": 2958629888 + }, + { + "epoch": 9.09, + "learning_rate": 5.237713139418255e-05, + "loss": 2.4639, + "theoretical_loss": 3.32431047603075, + "tokens_seen": 2958695424 + }, + { + "epoch": 9.09, + "learning_rate": 5.236710130391173e-05, + "loss": 2.4294, + "theoretical_loss": 3.32430479864798, + "tokens_seen": 2958760960 + }, + { + "epoch": 9.09, + "learning_rate": 5.235707121364092e-05, + "loss": 2.4508, + "theoretical_loss": 3.3242991214261717, + "tokens_seen": 2958826496 + }, + { + "epoch": 9.09, + "learning_rate": 5.234704112337011e-05, + "loss": 2.3826, + "theoretical_loss": 3.324293444365317, + "tokens_seen": 2958892032 + }, + { + "epoch": 9.09, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.517104148864746, + "objective/train/theoretical_loss": 3.3242891866752964, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.3242891866752964, + "tokens_seen": 2958941184 + }, + { + "epoch": 9.09, + "learning_rate": 5.23370110330993e-05, + "loss": 2.5481, + "theoretical_loss": 3.324287767465407, + "tokens_seen": 2958957568 + }, + { + "epoch": 9.09, + "learning_rate": 5.2326980942828485e-05, + "loss": 2.4875, + "theoretical_loss": 3.3242820907264345, + "tokens_seen": 2959023104 + }, + { + "epoch": 9.09, + "learning_rate": 5.231695085255767e-05, + "loss": 2.2916, + "theoretical_loss": 3.324276414148391, + "tokens_seen": 2959088640 + }, + { + "epoch": 9.09, + "learning_rate": 5.230692076228686e-05, + "loss": 2.4885, + "theoretical_loss": 3.3242707377312684, + "tokens_seen": 2959154176 + }, + { + "epoch": 9.09, + "learning_rate": 5.229689067201605e-05, + "loss": 2.5152, + "theoretical_loss": 3.3242650614750584, + "tokens_seen": 2959219712 + }, + { + "epoch": 9.09, + "learning_rate": 5.2286860581745237e-05, + "loss": 2.3135, + "theoretical_loss": 3.324259385379753, + "tokens_seen": 2959285248 + }, + { + "epoch": 9.09, + "learning_rate": 5.2276830491474424e-05, + "loss": 2.1493, + "theoretical_loss": 3.3242537094453444, + "tokens_seen": 2959350784 + }, + { + "epoch": 9.09, + "learning_rate": 5.226680040120361e-05, + "loss": 2.3847, + "theoretical_loss": 3.324248033671824, + "tokens_seen": 2959416320 + }, + { + "epoch": 9.09, + "learning_rate": 5.22567703109328e-05, + "loss": 2.2044, + "theoretical_loss": 3.3242423580591836, + "tokens_seen": 2959481856 + }, + { + "epoch": 9.09, + "learning_rate": 5.224674022066199e-05, + "loss": 2.6055, + "theoretical_loss": 3.3242366826074154, + "tokens_seen": 2959547392 + }, + { + "epoch": 9.09, + "learning_rate": 5.2236710130391176e-05, + "loss": 2.0583, + "theoretical_loss": 3.3242310073165116, + "tokens_seen": 2959612928 + }, + { + "epoch": 9.09, + "learning_rate": 5.2226680040120364e-05, + "loss": 2.2063, + "theoretical_loss": 3.3242253321864634, + "tokens_seen": 2959678464 + }, + { + "epoch": 9.09, + "learning_rate": 5.221664994984955e-05, + "loss": 2.2307, + "theoretical_loss": 3.324219657217263, + "tokens_seen": 2959744000 + }, + { + "epoch": 9.09, + "learning_rate": 5.220661985957874e-05, + "loss": 2.2943, + "theoretical_loss": 3.324213982408902, + "tokens_seen": 2959809536 + }, + { + "epoch": 9.09, + "learning_rate": 5.219658976930793e-05, + "loss": 2.5729, + "theoretical_loss": 3.324208307761373, + "tokens_seen": 2959875072 + }, + { + "epoch": 9.09, + "learning_rate": 5.2186559679037115e-05, + "loss": 2.2755, + "theoretical_loss": 3.324202633274667, + "tokens_seen": 2959940608 + }, + { + "epoch": 9.09, + "learning_rate": 5.2176529588766297e-05, + "loss": 2.2312, + "theoretical_loss": 3.3241969589487765, + "tokens_seen": 2960006144 + }, + { + "epoch": 9.09, + "learning_rate": 5.2166499498495484e-05, + "loss": 2.4512, + "theoretical_loss": 3.324191284783693, + "tokens_seen": 2960071680 + }, + { + "epoch": 9.09, + "learning_rate": 5.215646940822467e-05, + "loss": 2.3185, + "theoretical_loss": 3.3241856107794088, + "tokens_seen": 2960137216 + }, + { + "epoch": 9.09, + "learning_rate": 5.214643931795386e-05, + "loss": 2.3667, + "theoretical_loss": 3.3241799369359155, + "tokens_seen": 2960202752 + }, + { + "epoch": 9.09, + "learning_rate": 5.213640922768305e-05, + "loss": 2.2262, + "theoretical_loss": 3.3241742632532048, + "tokens_seen": 2960268288 + }, + { + "epoch": 9.09, + "learning_rate": 5.2126379137412236e-05, + "loss": 2.4096, + "theoretical_loss": 3.324168589731269, + "tokens_seen": 2960333824 + }, + { + "epoch": 9.09, + "learning_rate": 5.2116349047141424e-05, + "loss": 2.3142, + "theoretical_loss": 3.3241629163700996, + "tokens_seen": 2960399360 + }, + { + "epoch": 9.09, + "learning_rate": 5.210631895687061e-05, + "loss": 2.182, + "theoretical_loss": 3.324157243169689, + "tokens_seen": 2960464896 + }, + { + "epoch": 9.09, + "learning_rate": 5.20962888665998e-05, + "loss": 2.171, + "theoretical_loss": 3.3241515701300286, + "tokens_seen": 2960530432 + }, + { + "epoch": 9.09, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2459299564361572, + "objective/train/theoretical_loss": 3.324147315455771, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.324147315455771, + "tokens_seen": 2960579584 + }, + { + "epoch": 9.09, + "learning_rate": 5.208625877632899e-05, + "loss": 2.1486, + "theoretical_loss": 3.3241458972511104, + "tokens_seen": 2960595968 + }, + { + "epoch": 9.09, + "learning_rate": 5.2076228686058176e-05, + "loss": 2.5178, + "theoretical_loss": 3.3241402245329263, + "tokens_seen": 2960661504 + }, + { + "epoch": 9.09, + "learning_rate": 5.2066198595787363e-05, + "loss": 2.5044, + "theoretical_loss": 3.3241345519754684, + "tokens_seen": 2960727040 + }, + { + "epoch": 9.09, + "learning_rate": 5.205616850551655e-05, + "loss": 2.2173, + "theoretical_loss": 3.324128879578728, + "tokens_seen": 2960792576 + }, + { + "epoch": 9.09, + "learning_rate": 5.204613841524574e-05, + "loss": 2.0722, + "theoretical_loss": 3.324123207342698, + "tokens_seen": 2960858112 + }, + { + "epoch": 9.09, + "learning_rate": 5.203610832497493e-05, + "loss": 2.3411, + "theoretical_loss": 3.3241175352673693, + "tokens_seen": 2960923648 + }, + { + "epoch": 9.09, + "learning_rate": 5.2026078234704115e-05, + "loss": 2.4109, + "theoretical_loss": 3.324111863352734, + "tokens_seen": 2960989184 + }, + { + "epoch": 9.09, + "learning_rate": 5.20160481444333e-05, + "loss": 2.6697, + "theoretical_loss": 3.3241061915987844, + "tokens_seen": 2961054720 + }, + { + "epoch": 9.09, + "learning_rate": 5.200601805416249e-05, + "loss": 2.3489, + "theoretical_loss": 3.3241005200055125, + "tokens_seen": 2961120256 + }, + { + "epoch": 9.09, + "learning_rate": 5.199598796389168e-05, + "loss": 2.3759, + "theoretical_loss": 3.3240948485729094, + "tokens_seen": 2961185792 + }, + { + "epoch": 9.09, + "learning_rate": 5.198595787362087e-05, + "loss": 2.4356, + "theoretical_loss": 3.3240891773009675, + "tokens_seen": 2961251328 + }, + { + "epoch": 9.09, + "learning_rate": 5.197592778335005e-05, + "loss": 2.313, + "theoretical_loss": 3.324083506189679, + "tokens_seen": 2961316864 + }, + { + "epoch": 9.09, + "learning_rate": 5.1965897693079236e-05, + "loss": 2.4975, + "theoretical_loss": 3.324077835239035, + "tokens_seen": 2961382400 + }, + { + "epoch": 9.09, + "learning_rate": 5.1955867602808424e-05, + "loss": 2.1923, + "theoretical_loss": 3.324072164449028, + "tokens_seen": 2961447936 + }, + { + "epoch": 9.09, + "learning_rate": 5.194583751253761e-05, + "loss": 2.5787, + "theoretical_loss": 3.32406649381965, + "tokens_seen": 2961513472 + }, + { + "epoch": 9.09, + "learning_rate": 5.19358074222668e-05, + "loss": 2.6657, + "theoretical_loss": 3.324060823350892, + "tokens_seen": 2961579008 + }, + { + "epoch": 9.09, + "learning_rate": 5.192577733199599e-05, + "loss": 2.3622, + "theoretical_loss": 3.324055153042747, + "tokens_seen": 2961644544 + }, + { + "epoch": 9.09, + "learning_rate": 5.1915747241725175e-05, + "loss": 2.4808, + "theoretical_loss": 3.324049482895206, + "tokens_seen": 2961710080 + }, + { + "epoch": 9.09, + "learning_rate": 5.190571715145436e-05, + "loss": 2.1185, + "theoretical_loss": 3.324043812908262, + "tokens_seen": 2961775616 + }, + { + "epoch": 9.09, + "learning_rate": 5.189568706118355e-05, + "loss": 2.2189, + "theoretical_loss": 3.3240381430819057, + "tokens_seen": 2961841152 + }, + { + "epoch": 9.09, + "learning_rate": 5.1885656970912746e-05, + "loss": 2.3141, + "theoretical_loss": 3.3240324734161297, + "tokens_seen": 2961906688 + }, + { + "epoch": 9.09, + "learning_rate": 5.187562688064193e-05, + "loss": 2.2298, + "theoretical_loss": 3.3240268039109253, + "tokens_seen": 2961972224 + }, + { + "epoch": 9.09, + "learning_rate": 5.1865596790371115e-05, + "loss": 2.4249, + "theoretical_loss": 3.3240211345662853, + "tokens_seen": 2962037760 + }, + { + "epoch": 9.09, + "learning_rate": 5.18555667001003e-05, + "loss": 2.0393, + "theoretical_loss": 3.324015465382201, + "tokens_seen": 2962103296 + }, + { + "epoch": 9.09, + "learning_rate": 5.184553660982949e-05, + "loss": 2.5585, + "theoretical_loss": 3.3240097963586646, + "tokens_seen": 2962168832 + }, + { + "epoch": 9.09, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2247915267944336, + "objective/train/theoretical_loss": 3.3240055446963663, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.3240055446963663, + "tokens_seen": 2962217984 + }, + { + "epoch": 9.09, + "learning_rate": 5.183550651955868e-05, + "loss": 2.2407, + "theoretical_loss": 3.3240041274956673, + "tokens_seen": 2962234368 + }, + { + "epoch": 9.09, + "learning_rate": 5.1825476429287866e-05, + "loss": 2.2645, + "theoretical_loss": 3.3239984587932017, + "tokens_seen": 2962299904 + }, + { + "epoch": 9.09, + "learning_rate": 5.1815446339017054e-05, + "loss": 2.2506, + "theoretical_loss": 3.3239927902512596, + "tokens_seen": 2962365440 + }, + { + "epoch": 9.09, + "learning_rate": 5.180541624874624e-05, + "loss": 2.1434, + "theoretical_loss": 3.323987121869833, + "tokens_seen": 2962430976 + }, + { + "epoch": 9.09, + "learning_rate": 5.179538615847543e-05, + "loss": 2.469, + "theoretical_loss": 3.3239814536489134, + "tokens_seen": 2962496512 + }, + { + "epoch": 9.09, + "learning_rate": 5.178535606820461e-05, + "loss": 2.3629, + "theoretical_loss": 3.323975785588493, + "tokens_seen": 2962562048 + }, + { + "epoch": 9.09, + "learning_rate": 5.17753259779338e-05, + "loss": 2.4847, + "theoretical_loss": 3.3239701176885634, + "tokens_seen": 2962627584 + }, + { + "epoch": 9.09, + "learning_rate": 5.176529588766299e-05, + "loss": 2.569, + "theoretical_loss": 3.3239644499491168, + "tokens_seen": 2962693120 + }, + { + "epoch": 9.09, + "learning_rate": 5.1755265797392175e-05, + "loss": 2.3965, + "theoretical_loss": 3.3239587823701453, + "tokens_seen": 2962758656 + }, + { + "epoch": 9.09, + "learning_rate": 5.174523570712136e-05, + "loss": 2.3931, + "theoretical_loss": 3.3239531149516406, + "tokens_seen": 2962824192 + }, + { + "epoch": 9.09, + "learning_rate": 5.173520561685055e-05, + "loss": 2.2451, + "theoretical_loss": 3.323947447693594, + "tokens_seen": 2962889728 + }, + { + "epoch": 9.09, + "learning_rate": 5.172517552657974e-05, + "loss": 2.3871, + "theoretical_loss": 3.3239417805959985, + "tokens_seen": 2962955264 + }, + { + "epoch": 9.09, + "learning_rate": 5.1715145436308926e-05, + "loss": 2.4007, + "theoretical_loss": 3.3239361136588452, + "tokens_seen": 2963020800 + }, + { + "epoch": 9.09, + "learning_rate": 5.170511534603812e-05, + "loss": 2.5923, + "theoretical_loss": 3.3239304468821262, + "tokens_seen": 2963086336 + }, + { + "epoch": 9.09, + "learning_rate": 5.169508525576731e-05, + "loss": 2.4855, + "theoretical_loss": 3.323924780265834, + "tokens_seen": 2963151872 + }, + { + "epoch": 9.09, + "learning_rate": 5.168505516549649e-05, + "loss": 2.1145, + "theoretical_loss": 3.3239191138099597, + "tokens_seen": 2963217408 + }, + { + "epoch": 9.09, + "learning_rate": 5.167502507522568e-05, + "loss": 2.3997, + "theoretical_loss": 3.3239134475144954, + "tokens_seen": 2963282944 + }, + { + "epoch": 9.09, + "learning_rate": 5.1664994984954866e-05, + "loss": 2.3001, + "theoretical_loss": 3.3239077813794333, + "tokens_seen": 2963348480 + }, + { + "epoch": 9.09, + "learning_rate": 5.1654964894684054e-05, + "loss": 2.3909, + "theoretical_loss": 3.323902115404765, + "tokens_seen": 2963414016 + }, + { + "epoch": 9.09, + "learning_rate": 5.164493480441324e-05, + "loss": 2.292, + "theoretical_loss": 3.3238964495904826, + "tokens_seen": 2963479552 + }, + { + "epoch": 9.09, + "learning_rate": 5.163490471414243e-05, + "loss": 2.2477, + "theoretical_loss": 3.323890783936578, + "tokens_seen": 2963545088 + }, + { + "epoch": 9.09, + "learning_rate": 5.162487462387162e-05, + "loss": 2.2586, + "theoretical_loss": 3.323885118443043, + "tokens_seen": 2963610624 + }, + { + "epoch": 9.09, + "learning_rate": 5.1614844533600805e-05, + "loss": 2.4668, + "theoretical_loss": 3.3238794531098694, + "tokens_seen": 2963676160 + }, + { + "epoch": 9.09, + "learning_rate": 5.160481444332999e-05, + "loss": 2.2911, + "theoretical_loss": 3.3238737879370497, + "tokens_seen": 2963741696 + }, + { + "epoch": 9.09, + "learning_rate": 5.159478435305918e-05, + "loss": 2.1156, + "theoretical_loss": 3.3238681229245755, + "tokens_seen": 2963807232 + }, + { + "epoch": 9.09, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2037250995635986, + "objective/train/theoretical_loss": 3.323863874270441, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.323863874270441, + "tokens_seen": 2963856384 + }, + { + "epoch": 9.09, + "learning_rate": 5.158475426278836e-05, + "loss": 2.4562, + "theoretical_loss": 3.3238624580724383, + "tokens_seen": 2963872768 + }, + { + "epoch": 9.09, + "learning_rate": 5.157472417251755e-05, + "loss": 2.423, + "theoretical_loss": 3.3238567933806307, + "tokens_seen": 2963938304 + }, + { + "epoch": 9.09, + "learning_rate": 5.156469408224674e-05, + "loss": 2.4967, + "theoretical_loss": 3.3238511288491437, + "tokens_seen": 2964003840 + }, + { + "epoch": 9.09, + "learning_rate": 5.1554663991975926e-05, + "loss": 2.4125, + "theoretical_loss": 3.3238454644779702, + "tokens_seen": 2964069376 + }, + { + "epoch": 9.09, + "learning_rate": 5.1544633901705114e-05, + "loss": 2.4888, + "theoretical_loss": 3.323839800267102, + "tokens_seen": 2964134912 + }, + { + "epoch": 9.09, + "learning_rate": 5.15346038114343e-05, + "loss": 2.391, + "theoretical_loss": 3.3238341362165302, + "tokens_seen": 2964200448 + }, + { + "epoch": 9.09, + "learning_rate": 5.152457372116349e-05, + "loss": 2.2209, + "theoretical_loss": 3.3238284723262472, + "tokens_seen": 2964265984 + }, + { + "epoch": 9.09, + "learning_rate": 5.1514543630892684e-05, + "loss": 2.1985, + "theoretical_loss": 3.3238228085962453, + "tokens_seen": 2964331520 + }, + { + "epoch": 9.09, + "learning_rate": 5.150451354062187e-05, + "loss": 2.2873, + "theoretical_loss": 3.323817145026516, + "tokens_seen": 2964397056 + }, + { + "epoch": 9.09, + "learning_rate": 5.149448345035106e-05, + "loss": 2.5129, + "theoretical_loss": 3.323811481617051, + "tokens_seen": 2964462592 + }, + { + "epoch": 9.09, + "learning_rate": 5.148445336008024e-05, + "loss": 2.2605, + "theoretical_loss": 3.323805818367843, + "tokens_seen": 2964528128 + }, + { + "epoch": 9.09, + "learning_rate": 5.147442326980943e-05, + "loss": 2.0683, + "theoretical_loss": 3.3238001552788834, + "tokens_seen": 2964593664 + }, + { + "epoch": 9.09, + "learning_rate": 5.146439317953862e-05, + "loss": 2.1675, + "theoretical_loss": 3.323794492350164, + "tokens_seen": 2964659200 + }, + { + "epoch": 9.09, + "learning_rate": 5.146439317953862e-05, + "loss": 2.1783, + "theoretical_loss": 3.323788829581677, + "tokens_seen": 2964724736 + }, + { + "epoch": 9.09, + "learning_rate": 5.1454363089267805e-05, + "loss": 2.5103, + "theoretical_loss": 3.3237831669734144, + "tokens_seen": 2964790272 + }, + { + "epoch": 9.09, + "learning_rate": 5.144433299899699e-05, + "loss": 2.2048, + "theoretical_loss": 3.3237775045253675, + "tokens_seen": 2964855808 + }, + { + "epoch": 9.09, + "learning_rate": 5.143430290872618e-05, + "loss": 2.3011, + "theoretical_loss": 3.323771842237529, + "tokens_seen": 2964921344 + }, + { + "epoch": 9.09, + "learning_rate": 5.142427281845537e-05, + "loss": 2.5314, + "theoretical_loss": 3.3237661801098906, + "tokens_seen": 2964986880 + }, + { + "epoch": 9.09, + "learning_rate": 5.1414242728184556e-05, + "loss": 2.3889, + "theoretical_loss": 3.3237605181424437, + "tokens_seen": 2965052416 + }, + { + "epoch": 9.09, + "learning_rate": 5.1404212637913744e-05, + "loss": 2.1979, + "theoretical_loss": 3.3237548563351806, + "tokens_seen": 2965117952 + }, + { + "epoch": 9.09, + "learning_rate": 5.1394182547642925e-05, + "loss": 2.3718, + "theoretical_loss": 3.323749194688094, + "tokens_seen": 2965183488 + }, + { + "epoch": 9.09, + "learning_rate": 5.138415245737211e-05, + "loss": 2.3003, + "theoretical_loss": 3.3237435332011747, + "tokens_seen": 2965249024 + }, + { + "epoch": 9.09, + "learning_rate": 5.13741223671013e-05, + "loss": 2.273, + "theoretical_loss": 3.323737871874415, + "tokens_seen": 2965314560 + }, + { + "epoch": 9.09, + "learning_rate": 5.136409227683049e-05, + "loss": 2.256, + "theoretical_loss": 3.323732210707807, + "tokens_seen": 2965380096 + }, + { + "epoch": 9.09, + "learning_rate": 5.135406218655968e-05, + "loss": 2.2662, + "theoretical_loss": 3.3237265497013424, + "tokens_seen": 2965445632 + }, + { + "epoch": 9.09, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7142558097839355, + "objective/train/theoretical_loss": 3.323722304051583, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.323722304051583, + "tokens_seen": 2965494784 + }, + { + "epoch": 9.09, + "learning_rate": 5.1344032096288865e-05, + "loss": 2.2654, + "theoretical_loss": 3.3237208888550134, + "tokens_seen": 2965511168 + }, + { + "epoch": 9.09, + "learning_rate": 5.133400200601805e-05, + "loss": 2.4913, + "theoretical_loss": 3.3237152281688114, + "tokens_seen": 2965576704 + }, + { + "epoch": 9.09, + "learning_rate": 5.132397191574725e-05, + "loss": 2.375, + "theoretical_loss": 3.323709567642729, + "tokens_seen": 2965642240 + }, + { + "epoch": 9.09, + "learning_rate": 5.1313941825476435e-05, + "loss": 2.1354, + "theoretical_loss": 3.323703907276758, + "tokens_seen": 2965707776 + }, + { + "epoch": 9.09, + "learning_rate": 5.130391173520562e-05, + "loss": 2.5194, + "theoretical_loss": 3.32369824707089, + "tokens_seen": 2965773312 + }, + { + "epoch": 9.09, + "learning_rate": 5.1293881644934804e-05, + "loss": 2.2214, + "theoretical_loss": 3.323692587025117, + "tokens_seen": 2965838848 + }, + { + "epoch": 9.09, + "learning_rate": 5.128385155466399e-05, + "loss": 2.2814, + "theoretical_loss": 3.3236869271394305, + "tokens_seen": 2965904384 + }, + { + "epoch": 9.09, + "learning_rate": 5.127382146439318e-05, + "loss": 2.3457, + "theoretical_loss": 3.3236812674138236, + "tokens_seen": 2965969920 + }, + { + "epoch": 9.09, + "learning_rate": 5.126379137412237e-05, + "loss": 2.2978, + "theoretical_loss": 3.323675607848288, + "tokens_seen": 2966035456 + }, + { + "epoch": 9.09, + "learning_rate": 5.1253761283851556e-05, + "loss": 2.505, + "theoretical_loss": 3.3236699484428147, + "tokens_seen": 2966100992 + }, + { + "epoch": 9.09, + "learning_rate": 5.1243731193580744e-05, + "loss": 2.3146, + "theoretical_loss": 3.3236642891973958, + "tokens_seen": 2966166528 + }, + { + "epoch": 9.09, + "learning_rate": 5.123370110330993e-05, + "loss": 2.4471, + "theoretical_loss": 3.323658630112024, + "tokens_seen": 2966232064 + }, + { + "epoch": 9.09, + "learning_rate": 5.122367101303912e-05, + "loss": 2.444, + "theoretical_loss": 3.323652971186691, + "tokens_seen": 2966297600 + }, + { + "epoch": 9.09, + "learning_rate": 5.121364092276831e-05, + "loss": 2.4987, + "theoretical_loss": 3.3236473124213886, + "tokens_seen": 2966363136 + }, + { + "epoch": 9.09, + "learning_rate": 5.1203610832497495e-05, + "loss": 2.2715, + "theoretical_loss": 3.3236416538161087, + "tokens_seen": 2966428672 + }, + { + "epoch": 9.09, + "learning_rate": 5.1193580742226677e-05, + "loss": 2.5185, + "theoretical_loss": 3.323635995370843, + "tokens_seen": 2966494208 + }, + { + "epoch": 9.09, + "learning_rate": 5.1183550651955864e-05, + "loss": 2.4976, + "theoretical_loss": 3.323630337085584, + "tokens_seen": 2966559744 + }, + { + "epoch": 9.09, + "learning_rate": 5.117352056168505e-05, + "loss": 2.2594, + "theoretical_loss": 3.323624678960323, + "tokens_seen": 2966625280 + }, + { + "epoch": 9.09, + "learning_rate": 5.116349047141424e-05, + "loss": 2.5607, + "theoretical_loss": 3.3236190209950527, + "tokens_seen": 2966690816 + }, + { + "epoch": 9.09, + "learning_rate": 5.115346038114343e-05, + "loss": 2.2467, + "theoretical_loss": 3.3236133631897644, + "tokens_seen": 2966756352 + }, + { + "epoch": 9.09, + "learning_rate": 5.114343029087262e-05, + "loss": 2.4164, + "theoretical_loss": 3.3236077055444504, + "tokens_seen": 2966821888 + }, + { + "epoch": 9.09, + "learning_rate": 5.113340020060181e-05, + "loss": 2.2034, + "theoretical_loss": 3.3236020480591026, + "tokens_seen": 2966887424 + }, + { + "epoch": 9.09, + "learning_rate": 5.1123370110331e-05, + "loss": 2.2046, + "theoretical_loss": 3.3235963907337127, + "tokens_seen": 2966952960 + }, + { + "epoch": 9.09, + "learning_rate": 5.1113340020060187e-05, + "loss": 2.3258, + "theoretical_loss": 3.3235907335682726, + "tokens_seen": 2967018496 + }, + { + "epoch": 9.09, + "learning_rate": 5.1103309929789374e-05, + "loss": 2.3614, + "theoretical_loss": 3.323585076562775, + "tokens_seen": 2967084032 + }, + { + "epoch": 9.09, + "objective/train/docs_used": 3206060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.585063934326172, + "objective/train/theoretical_loss": 3.3235808339136086, + "objective/train/tokens_used": 2967603680, + "theoretical_loss": 3.3235808339136086, + "tokens_seen": 2967133184 + }, + { + "epoch": 9.09, + "learning_rate": 5.1093279839518556e-05, + "loss": 2.4278, + "theoretical_loss": 3.323579419717211, + "tokens_seen": 2967149568 + }, + { + "epoch": 9.09, + "learning_rate": 5.1083249749247743e-05, + "loss": 2.4994, + "theoretical_loss": 3.3235737630315727, + "tokens_seen": 2967215104 + }, + { + "epoch": 9.09, + "learning_rate": 5.107321965897693e-05, + "loss": 2.3799, + "theoretical_loss": 3.323568106505852, + "tokens_seen": 2967280640 + }, + { + "epoch": 9.09, + "learning_rate": 5.106318956870612e-05, + "loss": 2.4145, + "theoretical_loss": 3.3235624501400416, + "tokens_seen": 2967346176 + }, + { + "epoch": 9.09, + "learning_rate": 5.105315947843531e-05, + "loss": 2.149, + "theoretical_loss": 3.3235567939341326, + "tokens_seen": 2967411712 + }, + { + "epoch": 9.09, + "learning_rate": 5.1043129388164495e-05, + "loss": 2.3601, + "theoretical_loss": 3.323551137888117, + "tokens_seen": 2967477248 + }, + { + "epoch": 9.09, + "learning_rate": 5.103309929789368e-05, + "loss": 2.3398, + "theoretical_loss": 3.3235454820019874, + "tokens_seen": 2967542784 + }, + { + "epoch": 9.09, + "learning_rate": 5.102306920762287e-05, + "loss": 2.3464, + "theoretical_loss": 3.323539826275735, + "tokens_seen": 2967608320 + }, + { + "epoch": 10.0, + "learning_rate": 5.101303911735206e-05, + "loss": 2.9543, + "theoretical_loss": 3.3235328452091064, + "tokens_seen": 2967689216 + }, + { + "epoch": 10.0, + "learning_rate": 5.100300902708124e-05, + "loss": 2.4512, + "theoretical_loss": 3.323527189840051, + "tokens_seen": 2967754752 + }, + { + "epoch": 10.0, + "learning_rate": 5.099297893681043e-05, + "loss": 2.4758, + "theoretical_loss": 3.3235215346308475, + "tokens_seen": 2967820288 + }, + { + "epoch": 10.0, + "learning_rate": 5.0982948846539616e-05, + "loss": 2.4604, + "theoretical_loss": 3.3235158795814876, + "tokens_seen": 2967885824 + }, + { + "epoch": 10.0, + "learning_rate": 5.0972918756268804e-05, + "loss": 2.362, + "theoretical_loss": 3.323510224691963, + "tokens_seen": 2967951360 + }, + { + "epoch": 10.0, + "learning_rate": 5.096288866599799e-05, + "loss": 2.4179, + "theoretical_loss": 3.3235045699622656, + "tokens_seen": 2968016896 + }, + { + "epoch": 10.0, + "learning_rate": 5.0952858575727186e-05, + "loss": 2.4177, + "theoretical_loss": 3.323498915392388, + "tokens_seen": 2968082432 + }, + { + "epoch": 10.0, + "learning_rate": 5.0942828485456374e-05, + "loss": 2.4378, + "theoretical_loss": 3.323493260982321, + "tokens_seen": 2968147968 + }, + { + "epoch": 10.0, + "learning_rate": 5.093279839518556e-05, + "loss": 2.408, + "theoretical_loss": 3.3234876067320576, + "tokens_seen": 2968213504 + }, + { + "epoch": 10.0, + "learning_rate": 5.092276830491475e-05, + "loss": 2.4774, + "theoretical_loss": 3.3234819526415893, + "tokens_seen": 2968279040 + }, + { + "epoch": 10.0, + "learning_rate": 5.091273821464394e-05, + "loss": 2.5586, + "theoretical_loss": 3.323476298710908, + "tokens_seen": 2968344576 + }, + { + "epoch": 10.0, + "learning_rate": 5.090270812437312e-05, + "loss": 2.4774, + "theoretical_loss": 3.3234706449400058, + "tokens_seen": 2968410112 + }, + { + "epoch": 10.0, + "learning_rate": 5.089267803410231e-05, + "loss": 2.602, + "theoretical_loss": 3.323464991328875, + "tokens_seen": 2968475648 + }, + { + "epoch": 10.0, + "learning_rate": 5.0882647943831495e-05, + "loss": 2.5336, + "theoretical_loss": 3.323459337877507, + "tokens_seen": 2968541184 + }, + { + "epoch": 10.0, + "learning_rate": 5.087261785356068e-05, + "loss": 2.4556, + "theoretical_loss": 3.3234536845858935, + "tokens_seen": 2968606720 + }, + { + "epoch": 10.0, + "learning_rate": 5.086258776328987e-05, + "loss": 2.4732, + "theoretical_loss": 3.323448031454028, + "tokens_seen": 2968672256 + }, + { + "epoch": 10.0, + "learning_rate": 5.085255767301906e-05, + "loss": 2.5343, + "theoretical_loss": 3.3234423784819, + "tokens_seen": 2968737792 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 3273540, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.721041202545166, + "objective/train/theoretical_loss": 3.323439552055736, + "objective/train/tokens_used": 2989230560, + "theoretical_loss": 3.323439552055736, + "tokens_seen": 2968770560 + }, + { + "epoch": 10.0, + "learning_rate": 5.0842527582748246e-05, + "loss": 2.461, + "theoretical_loss": 3.3234367256695037, + "tokens_seen": 2968803328 + }, + { + "epoch": 10.0, + "learning_rate": 5.0832497492477434e-05, + "loss": 2.3224, + "theoretical_loss": 3.32343107301683, + "tokens_seen": 2968868864 + }, + { + "epoch": 10.0, + "learning_rate": 5.082246740220662e-05, + "loss": 2.5219, + "theoretical_loss": 3.323425420523871, + "tokens_seen": 2968934400 + }, + { + "epoch": 10.0, + "learning_rate": 5.081243731193581e-05, + "loss": 2.3914, + "theoretical_loss": 3.323419768190619, + "tokens_seen": 2968999936 + }, + { + "epoch": 10.0, + "learning_rate": 5.080240722166499e-05, + "loss": 2.3907, + "theoretical_loss": 3.3234141160170654, + "tokens_seen": 2969065472 + }, + { + "epoch": 10.0, + "learning_rate": 5.079237713139418e-05, + "loss": 2.5311, + "theoretical_loss": 3.3234084640032027, + "tokens_seen": 2969131008 + }, + { + "epoch": 10.0, + "learning_rate": 5.078234704112337e-05, + "loss": 2.4846, + "theoretical_loss": 3.3234028121490224, + "tokens_seen": 2969196544 + }, + { + "epoch": 10.0, + "learning_rate": 5.0772316950852555e-05, + "loss": 2.5469, + "theoretical_loss": 3.3233971604545167, + "tokens_seen": 2969262080 + }, + { + "epoch": 10.0, + "learning_rate": 5.076228686058175e-05, + "loss": 2.5121, + "theoretical_loss": 3.3233915089196775, + "tokens_seen": 2969327616 + }, + { + "epoch": 10.0, + "learning_rate": 5.075225677031094e-05, + "loss": 2.5922, + "theoretical_loss": 3.323385857544497, + "tokens_seen": 2969393152 + }, + { + "epoch": 10.0, + "learning_rate": 5.0742226680040125e-05, + "loss": 2.4745, + "theoretical_loss": 3.3233802063289666, + "tokens_seen": 2969458688 + }, + { + "epoch": 10.0, + "learning_rate": 5.073219658976931e-05, + "loss": 2.3228, + "theoretical_loss": 3.323374555273079, + "tokens_seen": 2969524224 + }, + { + "epoch": 10.0, + "learning_rate": 5.07221664994985e-05, + "loss": 2.5454, + "theoretical_loss": 3.3233689043768253, + "tokens_seen": 2969589760 + }, + { + "epoch": 10.0, + "learning_rate": 5.071213640922768e-05, + "loss": 2.5298, + "theoretical_loss": 3.3233632536401987, + "tokens_seen": 2969655296 + }, + { + "epoch": 10.0, + "learning_rate": 5.070210631895687e-05, + "loss": 2.4097, + "theoretical_loss": 3.32335760306319, + "tokens_seen": 2969720832 + }, + { + "epoch": 10.0, + "learning_rate": 5.069207622868606e-05, + "loss": 2.5065, + "theoretical_loss": 3.3233519526457918, + "tokens_seen": 2969786368 + }, + { + "epoch": 10.0, + "learning_rate": 5.0682046138415246e-05, + "loss": 2.2801, + "theoretical_loss": 3.323346302387996, + "tokens_seen": 2969851904 + }, + { + "epoch": 10.0, + "learning_rate": 5.0672016048144434e-05, + "loss": 2.5059, + "theoretical_loss": 3.3233406522897937, + "tokens_seen": 2969917440 + }, + { + "epoch": 10.0, + "learning_rate": 5.066198595787362e-05, + "loss": 2.4901, + "theoretical_loss": 3.3233350023511785, + "tokens_seen": 2969982976 + }, + { + "epoch": 10.0, + "learning_rate": 5.065195586760281e-05, + "loss": 2.4938, + "theoretical_loss": 3.323329352572141, + "tokens_seen": 2970048512 + }, + { + "epoch": 10.0, + "learning_rate": 5.0641925777332e-05, + "loss": 2.445, + "theoretical_loss": 3.3233237029526737, + "tokens_seen": 2970114048 + }, + { + "epoch": 10.0, + "learning_rate": 5.0631895687061185e-05, + "loss": 2.5582, + "theoretical_loss": 3.323318053492769, + "tokens_seen": 2970179584 + }, + { + "epoch": 10.0, + "learning_rate": 5.062186559679037e-05, + "loss": 2.2175, + "theoretical_loss": 3.323312404192418, + "tokens_seen": 2970245120 + }, + { + "epoch": 10.0, + "learning_rate": 5.0611835506519554e-05, + "loss": 2.4111, + "theoretical_loss": 3.323306755051613, + "tokens_seen": 2970310656 + }, + { + "epoch": 10.0, + "learning_rate": 5.060180541624874e-05, + "loss": 2.3925, + "theoretical_loss": 3.323301106070346, + "tokens_seen": 2970376192 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 3278793, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1870498657226562, + "objective/train/theoretical_loss": 3.323298281639537, + "objective/train/tokens_used": 2990868960, + "theoretical_loss": 3.323298281639537, + "tokens_seen": 2970408960 + }, + { + "epoch": 10.0, + "learning_rate": 5.059177532597793e-05, + "loss": 2.3907, + "theoretical_loss": 3.3232954572486095, + "tokens_seen": 2970441728 + }, + { + "epoch": 10.0, + "learning_rate": 5.0581745235707125e-05, + "loss": 2.5364, + "theoretical_loss": 3.3232898085863947, + "tokens_seen": 2970507264 + }, + { + "epoch": 10.0, + "learning_rate": 5.057171514543631e-05, + "loss": 2.5059, + "theoretical_loss": 3.3232841600836935, + "tokens_seen": 2970572800 + }, + { + "epoch": 10.0, + "learning_rate": 5.05616850551655e-05, + "loss": 2.4094, + "theoretical_loss": 3.3232785117404986, + "tokens_seen": 2970638336 + }, + { + "epoch": 10.0, + "learning_rate": 5.055165496489469e-05, + "loss": 2.5234, + "theoretical_loss": 3.323272863556802, + "tokens_seen": 2970703872 + }, + { + "epoch": 10.0, + "learning_rate": 5.0541624874623876e-05, + "loss": 2.4824, + "theoretical_loss": 3.323267215532595, + "tokens_seen": 2970769408 + }, + { + "epoch": 10.0, + "learning_rate": 5.0531594784353064e-05, + "loss": 2.5795, + "theoretical_loss": 3.3232615676678696, + "tokens_seen": 2970834944 + }, + { + "epoch": 10.0, + "learning_rate": 5.052156469408225e-05, + "loss": 2.5827, + "theoretical_loss": 3.3232559199626186, + "tokens_seen": 2970900480 + }, + { + "epoch": 10.0, + "learning_rate": 5.051153460381143e-05, + "loss": 2.4688, + "theoretical_loss": 3.323250272416833, + "tokens_seen": 2970966016 + }, + { + "epoch": 10.0, + "learning_rate": 5.050150451354062e-05, + "loss": 2.4594, + "theoretical_loss": 3.3232446250305054, + "tokens_seen": 2971031552 + }, + { + "epoch": 10.0, + "learning_rate": 5.049147442326981e-05, + "loss": 2.5862, + "theoretical_loss": 3.3232389778036278, + "tokens_seen": 2971097088 + }, + { + "epoch": 10.0, + "learning_rate": 5.0481444332999e-05, + "loss": 2.6349, + "theoretical_loss": 3.323233330736192, + "tokens_seen": 2971162624 + }, + { + "epoch": 10.0, + "learning_rate": 5.0471414242728185e-05, + "loss": 2.5686, + "theoretical_loss": 3.32322768382819, + "tokens_seen": 2971228160 + }, + { + "epoch": 10.0, + "learning_rate": 5.046138415245737e-05, + "loss": 2.3659, + "theoretical_loss": 3.3232220370796135, + "tokens_seen": 2971293696 + }, + { + "epoch": 10.0, + "learning_rate": 5.045135406218656e-05, + "loss": 2.4382, + "theoretical_loss": 3.3232163904904546, + "tokens_seen": 2971359232 + }, + { + "epoch": 10.0, + "learning_rate": 5.044132397191575e-05, + "loss": 2.4639, + "theoretical_loss": 3.3232107440607055, + "tokens_seen": 2971424768 + }, + { + "epoch": 10.0, + "learning_rate": 5.0431293881644936e-05, + "loss": 2.509, + "theoretical_loss": 3.323205097790358, + "tokens_seen": 2971490304 + }, + { + "epoch": 10.0, + "learning_rate": 5.0421263791374124e-05, + "loss": 2.5162, + "theoretical_loss": 3.323199451679405, + "tokens_seen": 2971555840 + }, + { + "epoch": 10.0, + "learning_rate": 5.0411233701103305e-05, + "loss": 2.2941, + "theoretical_loss": 3.3231938057278367, + "tokens_seen": 2971621376 + }, + { + "epoch": 10.0, + "learning_rate": 5.040120361083249e-05, + "loss": 2.5556, + "theoretical_loss": 3.3231881599356465, + "tokens_seen": 2971686912 + }, + { + "epoch": 10.0, + "learning_rate": 5.039117352056169e-05, + "loss": 2.4412, + "theoretical_loss": 3.3231825143028257, + "tokens_seen": 2971752448 + }, + { + "epoch": 10.0, + "learning_rate": 5.0381143430290876e-05, + "loss": 2.4605, + "theoretical_loss": 3.323176868829367, + "tokens_seen": 2971817984 + }, + { + "epoch": 10.0, + "learning_rate": 5.0371113340020064e-05, + "loss": 2.5935, + "theoretical_loss": 3.3231712235152613, + "tokens_seen": 2971883520 + }, + { + "epoch": 10.0, + "learning_rate": 5.036108324974925e-05, + "loss": 2.3753, + "theoretical_loss": 3.3231655783605016, + "tokens_seen": 2971949056 + }, + { + "epoch": 10.0, + "learning_rate": 5.035105315947844e-05, + "loss": 2.5403, + "theoretical_loss": 3.3231599333650794, + "tokens_seen": 2972014592 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 3283666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.623721122741699, + "objective/train/theoretical_loss": 3.3231571109271174, + "objective/train/tokens_used": 2992507360, + "theoretical_loss": 3.3231571109271174, + "tokens_seen": 2972047360 + }, + { + "epoch": 10.0, + "learning_rate": 5.034102306920763e-05, + "loss": 2.4514, + "theoretical_loss": 3.323154288528987, + "tokens_seen": 2972080128 + }, + { + "epoch": 10.0, + "learning_rate": 5.0330992978936815e-05, + "loss": 2.3242, + "theoretical_loss": 3.323148643852216, + "tokens_seen": 2972145664 + }, + { + "epoch": 10.0, + "learning_rate": 5.0320962888665996e-05, + "loss": 2.4767, + "theoretical_loss": 3.3231429993347588, + "tokens_seen": 2972211200 + }, + { + "epoch": 10.0, + "learning_rate": 5.0310932798395184e-05, + "loss": 2.4015, + "theoretical_loss": 3.323137354976607, + "tokens_seen": 2972276736 + }, + { + "epoch": 10.0, + "learning_rate": 5.030090270812437e-05, + "loss": 2.5362, + "theoretical_loss": 3.3231317107777523, + "tokens_seen": 2972342272 + }, + { + "epoch": 10.0, + "learning_rate": 5.029087261785356e-05, + "loss": 2.4842, + "theoretical_loss": 3.3231260667381877, + "tokens_seen": 2972407808 + }, + { + "epoch": 10.0, + "learning_rate": 5.028084252758275e-05, + "loss": 2.4895, + "theoretical_loss": 3.3231204228579045, + "tokens_seen": 2972473344 + }, + { + "epoch": 10.0, + "learning_rate": 5.0270812437311936e-05, + "loss": 2.3994, + "theoretical_loss": 3.323114779136895, + "tokens_seen": 2972538880 + }, + { + "epoch": 10.0, + "learning_rate": 5.0260782347041124e-05, + "loss": 2.5448, + "theoretical_loss": 3.3231091355751503, + "tokens_seen": 2972604416 + }, + { + "epoch": 10.0, + "learning_rate": 5.025075225677031e-05, + "loss": 2.5624, + "theoretical_loss": 3.3231034921726637, + "tokens_seen": 2972669952 + }, + { + "epoch": 10.0, + "learning_rate": 5.02407221664995e-05, + "loss": 2.3367, + "theoretical_loss": 3.3230978489294265, + "tokens_seen": 2972735488 + }, + { + "epoch": 10.0, + "learning_rate": 5.023069207622869e-05, + "loss": 2.595, + "theoretical_loss": 3.323092205845431, + "tokens_seen": 2972801024 + }, + { + "epoch": 10.0, + "learning_rate": 5.022066198595787e-05, + "loss": 2.5123, + "theoretical_loss": 3.3230865629206687, + "tokens_seen": 2972866560 + }, + { + "epoch": 10.0, + "learning_rate": 5.0210631895687057e-05, + "loss": 2.5646, + "theoretical_loss": 3.323080920155132, + "tokens_seen": 2972932096 + }, + { + "epoch": 10.0, + "learning_rate": 5.020060180541625e-05, + "loss": 2.5721, + "theoretical_loss": 3.323075277548813, + "tokens_seen": 2972997632 + }, + { + "epoch": 10.0, + "learning_rate": 5.019057171514544e-05, + "loss": 2.4464, + "theoretical_loss": 3.323069635101703, + "tokens_seen": 2973063168 + }, + { + "epoch": 10.0, + "learning_rate": 5.018054162487463e-05, + "loss": 2.3766, + "theoretical_loss": 3.323063992813795, + "tokens_seen": 2973128704 + }, + { + "epoch": 10.0, + "learning_rate": 5.0170511534603815e-05, + "loss": 2.4759, + "theoretical_loss": 3.3230583506850806, + "tokens_seen": 2973194240 + }, + { + "epoch": 10.0, + "learning_rate": 5.0160481444333e-05, + "loss": 2.4852, + "theoretical_loss": 3.3230527087155513, + "tokens_seen": 2973259776 + }, + { + "epoch": 10.0, + "learning_rate": 5.015045135406219e-05, + "loss": 2.4909, + "theoretical_loss": 3.3230470669051995, + "tokens_seen": 2973325312 + }, + { + "epoch": 10.0, + "learning_rate": 5.014042126379138e-05, + "loss": 2.2529, + "theoretical_loss": 3.3230414252540172, + "tokens_seen": 2973390848 + }, + { + "epoch": 10.0, + "learning_rate": 5.0130391173520567e-05, + "loss": 2.4142, + "theoretical_loss": 3.3230357837619966, + "tokens_seen": 2973456384 + }, + { + "epoch": 10.0, + "learning_rate": 5.012036108324975e-05, + "loss": 2.3895, + "theoretical_loss": 3.323030142429129, + "tokens_seen": 2973521920 + }, + { + "epoch": 10.0, + "learning_rate": 5.0110330992978936e-05, + "loss": 2.5595, + "theoretical_loss": 3.3230245012554076, + "tokens_seen": 2973587456 + }, + { + "epoch": 10.0, + "learning_rate": 5.0100300902708123e-05, + "loss": 2.4849, + "theoretical_loss": 3.3230188602408233, + "tokens_seen": 2973652992 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 3288573, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.494184732437134, + "objective/train/theoretical_loss": 3.323016039793205, + "objective/train/tokens_used": 2994145760, + "theoretical_loss": 3.323016039793205, + "tokens_seen": 2973685760 + }, + { + "epoch": 10.0, + "learning_rate": 5.009027081243731e-05, + "loss": 2.5706, + "theoretical_loss": 3.3230132193853685, + "tokens_seen": 2973718528 + }, + { + "epoch": 10.0, + "learning_rate": 5.00802407221665e-05, + "loss": 2.354, + "theoretical_loss": 3.3230075786890354, + "tokens_seen": 2973784064 + }, + { + "epoch": 10.0, + "learning_rate": 5.007021063189569e-05, + "loss": 2.4262, + "theoretical_loss": 3.3230019381518154, + "tokens_seen": 2973849600 + }, + { + "epoch": 10.0, + "learning_rate": 5.0060180541624875e-05, + "loss": 2.4449, + "theoretical_loss": 3.3229962977737015, + "tokens_seen": 2973915136 + }, + { + "epoch": 10.0, + "learning_rate": 5.005015045135406e-05, + "loss": 2.5859, + "theoretical_loss": 3.322990657554685, + "tokens_seen": 2973980672 + }, + { + "epoch": 10.0, + "learning_rate": 5.004012036108325e-05, + "loss": 2.4323, + "theoretical_loss": 3.3229850174947577, + "tokens_seen": 2974046208 + }, + { + "epoch": 10.0, + "learning_rate": 5.003009027081243e-05, + "loss": 2.4852, + "theoretical_loss": 3.322979377593912, + "tokens_seen": 2974111744 + }, + { + "epoch": 10.0, + "learning_rate": 5.002006018054162e-05, + "loss": 2.6351, + "theoretical_loss": 3.32297373785214, + "tokens_seen": 2974177280 + }, + { + "epoch": 10.0, + "learning_rate": 5.0010030090270814e-05, + "loss": 2.476, + "theoretical_loss": 3.3229680982694334, + "tokens_seen": 2974242816 + }, + { + "epoch": 10.0, + "learning_rate": 5e-05, + "loss": 2.4816, + "theoretical_loss": 3.3229624588457845, + "tokens_seen": 2974308352 + }, + { + "epoch": 10.0, + "learning_rate": 4.998996990972919e-05, + "loss": 2.5319, + "theoretical_loss": 3.322956819581185, + "tokens_seen": 2974373888 + }, + { + "epoch": 10.0, + "learning_rate": 4.997993981945838e-05, + "loss": 2.3834, + "theoretical_loss": 3.322951180475627, + "tokens_seen": 2974439424 + }, + { + "epoch": 10.0, + "learning_rate": 4.9969909729187566e-05, + "loss": 2.4298, + "theoretical_loss": 3.322945541529103, + "tokens_seen": 2974504960 + }, + { + "epoch": 10.0, + "learning_rate": 4.9959879638916754e-05, + "loss": 2.5976, + "theoretical_loss": 3.322939902741605, + "tokens_seen": 2974570496 + }, + { + "epoch": 10.0, + "learning_rate": 4.994984954864594e-05, + "loss": 2.4272, + "theoretical_loss": 3.3229342641131234, + "tokens_seen": 2974636032 + }, + { + "epoch": 10.0, + "learning_rate": 4.993981945837513e-05, + "loss": 2.5142, + "theoretical_loss": 3.3229286256436525, + "tokens_seen": 2974701568 + }, + { + "epoch": 10.0, + "learning_rate": 4.992978936810431e-05, + "loss": 2.3344, + "theoretical_loss": 3.3229229873331825, + "tokens_seen": 2974767104 + }, + { + "epoch": 10.0, + "learning_rate": 4.99197592778335e-05, + "loss": 2.4654, + "theoretical_loss": 3.3229173491817066, + "tokens_seen": 2974832640 + }, + { + "epoch": 10.0, + "learning_rate": 4.990972918756269e-05, + "loss": 2.3463, + "theoretical_loss": 3.3229117111892164, + "tokens_seen": 2974898176 + }, + { + "epoch": 10.0, + "learning_rate": 4.9899699097291875e-05, + "loss": 2.6101, + "theoretical_loss": 3.3229060733557034, + "tokens_seen": 2974963712 + }, + { + "epoch": 10.0, + "learning_rate": 4.988966900702106e-05, + "loss": 2.5806, + "theoretical_loss": 3.3229004356811602, + "tokens_seen": 2975029248 + }, + { + "epoch": 10.0, + "learning_rate": 4.987963891675025e-05, + "loss": 2.3946, + "theoretical_loss": 3.322894798165579, + "tokens_seen": 2975094784 + }, + { + "epoch": 10.0, + "learning_rate": 4.986960882647944e-05, + "loss": 2.5129, + "theoretical_loss": 3.322889160808952, + "tokens_seen": 2975160320 + }, + { + "epoch": 10.0, + "learning_rate": 4.9859578736208626e-05, + "loss": 2.5969, + "theoretical_loss": 3.32288352361127, + "tokens_seen": 2975225856 + }, + { + "epoch": 10.0, + "learning_rate": 4.9849548645937814e-05, + "loss": 2.4463, + "theoretical_loss": 3.3228778865725257, + "tokens_seen": 2975291392 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 3293627, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6588404178619385, + "objective/train/theoretical_loss": 3.322875068112753, + "objective/train/tokens_used": 2995784160, + "theoretical_loss": 3.322875068112753, + "tokens_seen": 2975324160 + }, + { + "epoch": 10.0, + "learning_rate": 4.9839518555667e-05, + "loss": 2.5306, + "theoretical_loss": 3.3228722496927117, + "tokens_seen": 2975356928 + }, + { + "epoch": 10.0, + "learning_rate": 4.982948846539619e-05, + "loss": 2.5304, + "theoretical_loss": 3.3228666129718194, + "tokens_seen": 2975422464 + }, + { + "epoch": 10.0, + "learning_rate": 4.981945837512538e-05, + "loss": 2.6981, + "theoretical_loss": 3.3228609764098405, + "tokens_seen": 2975488000 + }, + { + "epoch": 10.0, + "learning_rate": 4.9809428284854566e-05, + "loss": 2.4645, + "theoretical_loss": 3.3228553400067677, + "tokens_seen": 2975553536 + }, + { + "epoch": 10.0, + "learning_rate": 4.9799398194583754e-05, + "loss": 2.5819, + "theoretical_loss": 3.322849703762593, + "tokens_seen": 2975619072 + }, + { + "epoch": 10.0, + "learning_rate": 4.978936810431294e-05, + "loss": 2.4028, + "theoretical_loss": 3.3228440676773077, + "tokens_seen": 2975684608 + }, + { + "epoch": 10.0, + "learning_rate": 4.977933801404213e-05, + "loss": 2.7151, + "theoretical_loss": 3.3228384317509048, + "tokens_seen": 2975750144 + }, + { + "epoch": 10.0, + "learning_rate": 4.976930792377132e-05, + "loss": 2.4423, + "theoretical_loss": 3.322832795983375, + "tokens_seen": 2975815680 + }, + { + "epoch": 10.0, + "learning_rate": 4.9759277833500505e-05, + "loss": 2.287, + "theoretical_loss": 3.3228271603747124, + "tokens_seen": 2975881216 + }, + { + "epoch": 10.0, + "learning_rate": 4.974924774322969e-05, + "loss": 2.6101, + "theoretical_loss": 3.322821524924907, + "tokens_seen": 2975946752 + }, + { + "epoch": 10.0, + "learning_rate": 4.973921765295888e-05, + "loss": 2.3351, + "theoretical_loss": 3.3228158896339517, + "tokens_seen": 2976012288 + }, + { + "epoch": 10.0, + "learning_rate": 4.972918756268806e-05, + "loss": 2.4979, + "theoretical_loss": 3.3228102545018383, + "tokens_seen": 2976077824 + }, + { + "epoch": 10.0, + "learning_rate": 4.971915747241725e-05, + "loss": 2.4346, + "theoretical_loss": 3.322804619528559, + "tokens_seen": 2976143360 + }, + { + "epoch": 10.0, + "learning_rate": 4.970912738214644e-05, + "loss": 2.344, + "theoretical_loss": 3.322798984714106, + "tokens_seen": 2976208896 + }, + { + "epoch": 10.0, + "learning_rate": 4.9699097291875626e-05, + "loss": 2.5217, + "theoretical_loss": 3.322793350058471, + "tokens_seen": 2976274432 + }, + { + "epoch": 10.0, + "learning_rate": 4.9689067201604814e-05, + "loss": 2.5373, + "theoretical_loss": 3.322787715561646, + "tokens_seen": 2976339968 + }, + { + "epoch": 10.0, + "learning_rate": 4.9679037111334e-05, + "loss": 2.6131, + "theoretical_loss": 3.322782081223623, + "tokens_seen": 2976405504 + }, + { + "epoch": 10.0, + "learning_rate": 4.966900702106319e-05, + "loss": 2.5353, + "theoretical_loss": 3.3227764470443946, + "tokens_seen": 2976471040 + }, + { + "epoch": 10.0, + "learning_rate": 4.965897693079238e-05, + "loss": 2.5123, + "theoretical_loss": 3.3227708130239524, + "tokens_seen": 2976536576 + }, + { + "epoch": 10.0, + "learning_rate": 4.9648946840521565e-05, + "loss": 2.2791, + "theoretical_loss": 3.322765179162288, + "tokens_seen": 2976602112 + }, + { + "epoch": 10.0, + "learning_rate": 4.963891675025076e-05, + "loss": 2.2411, + "theoretical_loss": 3.322759545459394, + "tokens_seen": 2976667648 + }, + { + "epoch": 10.0, + "learning_rate": 4.962888665997994e-05, + "loss": 2.7067, + "theoretical_loss": 3.322753911915263, + "tokens_seen": 2976733184 + }, + { + "epoch": 10.0, + "learning_rate": 4.961885656970913e-05, + "loss": 2.536, + "theoretical_loss": 3.322748278529885, + "tokens_seen": 2976798720 + }, + { + "epoch": 10.0, + "learning_rate": 4.960882647943832e-05, + "loss": 2.653, + "theoretical_loss": 3.3227426453032542, + "tokens_seen": 2976864256 + }, + { + "epoch": 10.0, + "learning_rate": 4.9598796389167505e-05, + "loss": 2.5658, + "theoretical_loss": 3.322737012235362, + "tokens_seen": 2976929792 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 3295935, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6850674152374268, + "objective/train/theoretical_loss": 3.32273419576094, + "objective/train/tokens_used": 2997422560, + "theoretical_loss": 3.32273419576094, + "tokens_seen": 2976962560 + }, + { + "epoch": 10.0, + "learning_rate": 4.958876629889669e-05, + "loss": 2.4034, + "theoretical_loss": 3.3227313793262, + "tokens_seen": 2976995328 + }, + { + "epoch": 10.0, + "learning_rate": 4.957873620862588e-05, + "loss": 2.4911, + "theoretical_loss": 3.32272574657576, + "tokens_seen": 2977060864 + }, + { + "epoch": 10.0, + "learning_rate": 4.956870611835507e-05, + "loss": 2.4826, + "theoretical_loss": 3.322720113984035, + "tokens_seen": 2977126400 + }, + { + "epoch": 10.0, + "learning_rate": 4.9558676028084256e-05, + "loss": 2.5757, + "theoretical_loss": 3.3227144815510163, + "tokens_seen": 2977191936 + }, + { + "epoch": 10.0, + "learning_rate": 4.9548645937813444e-05, + "loss": 2.369, + "theoretical_loss": 3.3227088492766965, + "tokens_seen": 2977257472 + }, + { + "epoch": 10.0, + "learning_rate": 4.9538615847542625e-05, + "loss": 2.4177, + "theoretical_loss": 3.322703217161067, + "tokens_seen": 2977323008 + }, + { + "epoch": 10.0, + "learning_rate": 4.952858575727181e-05, + "loss": 2.577, + "theoretical_loss": 3.32269758520412, + "tokens_seen": 2977388544 + }, + { + "epoch": 10.0, + "learning_rate": 4.9518555667001e-05, + "loss": 2.4523, + "theoretical_loss": 3.322691953405848, + "tokens_seen": 2977454080 + }, + { + "epoch": 10.0, + "learning_rate": 4.950852557673019e-05, + "loss": 2.547, + "theoretical_loss": 3.3226863217662426, + "tokens_seen": 2977519616 + }, + { + "epoch": 10.0, + "learning_rate": 4.949849548645938e-05, + "loss": 2.4613, + "theoretical_loss": 3.3226806902852957, + "tokens_seen": 2977585152 + }, + { + "epoch": 10.0, + "learning_rate": 4.9488465396188565e-05, + "loss": 2.4656, + "theoretical_loss": 3.3226750589629996, + "tokens_seen": 2977650688 + }, + { + "epoch": 10.0, + "learning_rate": 4.947843530591775e-05, + "loss": 2.55, + "theoretical_loss": 3.3226694277993465, + "tokens_seen": 2977716224 + }, + { + "epoch": 10.0, + "learning_rate": 4.946840521564694e-05, + "loss": 2.4942, + "theoretical_loss": 3.3226637967943287, + "tokens_seen": 2977781760 + }, + { + "epoch": 10.0, + "learning_rate": 4.945837512537613e-05, + "loss": 2.4574, + "theoretical_loss": 3.322658165947937, + "tokens_seen": 2977847296 + }, + { + "epoch": 10.0, + "learning_rate": 4.944834503510532e-05, + "loss": 2.533, + "theoretical_loss": 3.322652535260165, + "tokens_seen": 2977912832 + }, + { + "epoch": 10.0, + "learning_rate": 4.9438314944834504e-05, + "loss": 2.569, + "theoretical_loss": 3.322646904731003, + "tokens_seen": 2977978368 + }, + { + "epoch": 10.0, + "learning_rate": 4.942828485456369e-05, + "loss": 2.5901, + "theoretical_loss": 3.3226412743604445, + "tokens_seen": 2978043904 + }, + { + "epoch": 10.0, + "learning_rate": 4.941825476429288e-05, + "loss": 2.3161, + "theoretical_loss": 3.3226356441484812, + "tokens_seen": 2978109440 + }, + { + "epoch": 10.0, + "learning_rate": 4.940822467402207e-05, + "loss": 2.4504, + "theoretical_loss": 3.3226300140951053, + "tokens_seen": 2978174976 + }, + { + "epoch": 10.0, + "learning_rate": 4.9398194583751256e-05, + "loss": 2.4466, + "theoretical_loss": 3.322624384200308, + "tokens_seen": 2978240512 + }, + { + "epoch": 10.0, + "learning_rate": 4.9388164493480444e-05, + "loss": 2.3737, + "theoretical_loss": 3.322618754464082, + "tokens_seen": 2978306048 + }, + { + "epoch": 10.0, + "learning_rate": 4.937813440320963e-05, + "loss": 2.4439, + "theoretical_loss": 3.3226131248864195, + "tokens_seen": 2978371584 + }, + { + "epoch": 10.0, + "learning_rate": 4.936810431293882e-05, + "loss": 2.356, + "theoretical_loss": 3.322607495467312, + "tokens_seen": 2978437120 + }, + { + "epoch": 10.0, + "learning_rate": 4.935807422266801e-05, + "loss": 2.4405, + "theoretical_loss": 3.3226018662067522, + "tokens_seen": 2978502656 + }, + { + "epoch": 10.0, + "learning_rate": 4.9348044132397195e-05, + "loss": 2.4821, + "theoretical_loss": 3.3225962371047313, + "tokens_seen": 2978568192 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 3300967, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5508203506469727, + "objective/train/theoretical_loss": 3.3225934226131706, + "objective/train/tokens_used": 2999060960, + "theoretical_loss": 3.3225934226131706, + "tokens_seen": 2978600960 + }, + { + "epoch": 10.0, + "learning_rate": 4.9338014042126376e-05, + "loss": 2.5107, + "theoretical_loss": 3.322590608161242, + "tokens_seen": 2978633728 + }, + { + "epoch": 10.0, + "learning_rate": 4.9327983951855564e-05, + "loss": 2.6338, + "theoretical_loss": 3.322584979376276, + "tokens_seen": 2978699264 + }, + { + "epoch": 10.0, + "learning_rate": 4.931795386158475e-05, + "loss": 2.6763, + "theoretical_loss": 3.322579350749826, + "tokens_seen": 2978764800 + }, + { + "epoch": 10.0, + "learning_rate": 4.930792377131394e-05, + "loss": 2.5324, + "theoretical_loss": 3.3225737222818834, + "tokens_seen": 2978830336 + }, + { + "epoch": 10.0, + "learning_rate": 4.929789368104313e-05, + "loss": 2.3968, + "theoretical_loss": 3.32256809397244, + "tokens_seen": 2978895872 + }, + { + "epoch": 10.0, + "learning_rate": 4.9287863590772316e-05, + "loss": 2.4797, + "theoretical_loss": 3.322562465821489, + "tokens_seen": 2978961408 + }, + { + "epoch": 10.0, + "learning_rate": 4.9277833500501504e-05, + "loss": 2.4096, + "theoretical_loss": 3.322556837829021, + "tokens_seen": 2979026944 + }, + { + "epoch": 10.0, + "learning_rate": 4.92678034102307e-05, + "loss": 2.568, + "theoretical_loss": 3.322551209995029, + "tokens_seen": 2979092480 + }, + { + "epoch": 10.0, + "learning_rate": 4.9257773319959886e-05, + "loss": 2.4111, + "theoretical_loss": 3.322545582319505, + "tokens_seen": 2979158016 + }, + { + "epoch": 10.0, + "learning_rate": 4.9247743229689074e-05, + "loss": 2.3166, + "theoretical_loss": 3.3225399548024406, + "tokens_seen": 2979223552 + }, + { + "epoch": 10.0, + "learning_rate": 4.9237713139418255e-05, + "loss": 2.3931, + "theoretical_loss": 3.3225343274438286, + "tokens_seen": 2979289088 + }, + { + "epoch": 10.0, + "learning_rate": 4.922768304914744e-05, + "loss": 2.5398, + "theoretical_loss": 3.32252870024366, + "tokens_seen": 2979354624 + }, + { + "epoch": 10.0, + "learning_rate": 4.921765295887663e-05, + "loss": 2.3462, + "theoretical_loss": 3.3225230732019275, + "tokens_seen": 2979420160 + }, + { + "epoch": 10.0, + "learning_rate": 4.920762286860582e-05, + "loss": 2.5135, + "theoretical_loss": 3.3225174463186233, + "tokens_seen": 2979485696 + }, + { + "epoch": 10.0, + "learning_rate": 4.919759277833501e-05, + "loss": 2.3982, + "theoretical_loss": 3.3225118195937395, + "tokens_seen": 2979551232 + }, + { + "epoch": 10.0, + "learning_rate": 4.9187562688064195e-05, + "loss": 2.5067, + "theoretical_loss": 3.3225061930272677, + "tokens_seen": 2979616768 + }, + { + "epoch": 10.0, + "learning_rate": 4.917753259779338e-05, + "loss": 2.4197, + "theoretical_loss": 3.3225005666192, + "tokens_seen": 2979682304 + }, + { + "epoch": 10.0, + "learning_rate": 4.916750250752257e-05, + "loss": 2.4403, + "theoretical_loss": 3.3224949403695287, + "tokens_seen": 2979747840 + }, + { + "epoch": 10.0, + "learning_rate": 4.915747241725176e-05, + "loss": 2.593, + "theoretical_loss": 3.3224893142782457, + "tokens_seen": 2979813376 + }, + { + "epoch": 10.0, + "learning_rate": 4.914744232698094e-05, + "loss": 2.5562, + "theoretical_loss": 3.322483688345343, + "tokens_seen": 2979878912 + }, + { + "epoch": 10.0, + "learning_rate": 4.913741223671013e-05, + "loss": 2.4755, + "theoretical_loss": 3.322478062570813, + "tokens_seen": 2979944448 + }, + { + "epoch": 10.0, + "learning_rate": 4.9127382146439316e-05, + "loss": 2.507, + "theoretical_loss": 3.3224724369546474, + "tokens_seen": 2980009984 + }, + { + "epoch": 10.0, + "learning_rate": 4.9117352056168503e-05, + "loss": 2.2669, + "theoretical_loss": 3.3224668114968385, + "tokens_seen": 2980075520 + }, + { + "epoch": 10.0, + "learning_rate": 4.910732196589769e-05, + "loss": 2.3293, + "theoretical_loss": 3.322461186197378, + "tokens_seen": 2980141056 + }, + { + "epoch": 10.0, + "learning_rate": 4.909729187562688e-05, + "loss": 2.395, + "theoretical_loss": 3.3224555610562585, + "tokens_seen": 2980206592 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 3305814, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.533071517944336, + "objective/train/theoretical_loss": 3.322452748545074, + "objective/train/tokens_used": 3000699360, + "theoretical_loss": 3.322452748545074, + "tokens_seen": 2980239360 + }, + { + "epoch": 10.0, + "learning_rate": 4.908726178535607e-05, + "loss": 2.5752, + "theoretical_loss": 3.322449936073472, + "tokens_seen": 2980272128 + }, + { + "epoch": 10.0, + "learning_rate": 4.907723169508526e-05, + "loss": 2.4318, + "theoretical_loss": 3.3224443112490096, + "tokens_seen": 2980337664 + }, + { + "epoch": 10.0, + "learning_rate": 4.906720160481445e-05, + "loss": 2.4782, + "theoretical_loss": 3.322438686582865, + "tokens_seen": 2980403200 + }, + { + "epoch": 10.0, + "learning_rate": 4.905717151454364e-05, + "loss": 2.4908, + "theoretical_loss": 3.322433062075029, + "tokens_seen": 2980468736 + }, + { + "epoch": 10.0, + "learning_rate": 4.904714142427282e-05, + "loss": 2.5191, + "theoretical_loss": 3.322427437725494, + "tokens_seen": 2980534272 + }, + { + "epoch": 10.0, + "learning_rate": 4.9037111334002007e-05, + "loss": 2.5553, + "theoretical_loss": 3.322421813534252, + "tokens_seen": 2980599808 + }, + { + "epoch": 10.0, + "learning_rate": 4.9027081243731194e-05, + "loss": 2.4846, + "theoretical_loss": 3.322416189501295, + "tokens_seen": 2980665344 + }, + { + "epoch": 10.0, + "learning_rate": 4.901705115346038e-05, + "loss": 2.4654, + "theoretical_loss": 3.3224105656266154, + "tokens_seen": 2980730880 + }, + { + "epoch": 10.0, + "learning_rate": 4.900702106318957e-05, + "loss": 2.4983, + "theoretical_loss": 3.322404941910205, + "tokens_seen": 2980796416 + }, + { + "epoch": 10.0, + "learning_rate": 4.899699097291876e-05, + "loss": 2.4141, + "theoretical_loss": 3.322399318352056, + "tokens_seen": 2980861952 + }, + { + "epoch": 10.0, + "learning_rate": 4.8986960882647946e-05, + "loss": 2.3798, + "theoretical_loss": 3.3223936949521606, + "tokens_seen": 2980927488 + }, + { + "epoch": 10.0, + "learning_rate": 4.8976930792377134e-05, + "loss": 2.4378, + "theoretical_loss": 3.3223880717105105, + "tokens_seen": 2980993024 + }, + { + "epoch": 10.0, + "learning_rate": 4.896690070210632e-05, + "loss": 2.5247, + "theoretical_loss": 3.3223824486270983, + "tokens_seen": 2981058560 + }, + { + "epoch": 10.0, + "learning_rate": 4.895687061183551e-05, + "loss": 2.2934, + "theoretical_loss": 3.3223768257019155, + "tokens_seen": 2981124096 + }, + { + "epoch": 10.0, + "learning_rate": 4.894684052156469e-05, + "loss": 2.5322, + "theoretical_loss": 3.3223712029349546, + "tokens_seen": 2981189632 + }, + { + "epoch": 10.0, + "learning_rate": 4.893681043129388e-05, + "loss": 2.4341, + "theoretical_loss": 3.322365580326207, + "tokens_seen": 2981255168 + }, + { + "epoch": 10.0, + "learning_rate": 4.892678034102307e-05, + "loss": 2.4832, + "theoretical_loss": 3.3223599578756655, + "tokens_seen": 2981320704 + }, + { + "epoch": 10.0, + "learning_rate": 4.8916750250752255e-05, + "loss": 2.5416, + "theoretical_loss": 3.322354335583322, + "tokens_seen": 2981386240 + }, + { + "epoch": 10.0, + "learning_rate": 4.890672016048144e-05, + "loss": 2.4585, + "theoretical_loss": 3.3223487134491685, + "tokens_seen": 2981451776 + }, + { + "epoch": 10.0, + "learning_rate": 4.889669007021063e-05, + "loss": 2.3418, + "theoretical_loss": 3.322343091473197, + "tokens_seen": 2981517312 + }, + { + "epoch": 10.0, + "learning_rate": 4.8886659979939825e-05, + "loss": 2.4744, + "theoretical_loss": 3.3223374696553996, + "tokens_seen": 2981582848 + }, + { + "epoch": 10.0, + "learning_rate": 4.887662988966901e-05, + "loss": 2.4803, + "theoretical_loss": 3.322331847995768, + "tokens_seen": 2981648384 + }, + { + "epoch": 10.0, + "learning_rate": 4.88665997993982e-05, + "loss": 2.6485, + "theoretical_loss": 3.322326226494295, + "tokens_seen": 2981713920 + }, + { + "epoch": 10.0, + "learning_rate": 4.885656970912738e-05, + "loss": 2.5378, + "theoretical_loss": 3.3223206051509724, + "tokens_seen": 2981779456 + }, + { + "epoch": 10.0, + "learning_rate": 4.884653961885657e-05, + "loss": 2.4227, + "theoretical_loss": 3.3223149839657924, + "tokens_seen": 2981844992 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 3310872, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.636472702026367, + "objective/train/theoretical_loss": 3.322312173432503, + "objective/train/tokens_used": 3002337760, + "theoretical_loss": 3.322312173432503, + "tokens_seen": 2981877760 + }, + { + "epoch": 10.0, + "learning_rate": 4.883650952858576e-05, + "loss": 2.5163, + "theoretical_loss": 3.322309362938747, + "tokens_seen": 2981910528 + }, + { + "epoch": 10.0, + "learning_rate": 4.8826479438314946e-05, + "loss": 2.4266, + "theoretical_loss": 3.3223037420698276, + "tokens_seen": 2981976064 + }, + { + "epoch": 10.0, + "learning_rate": 4.8816449348044134e-05, + "loss": 2.4003, + "theoretical_loss": 3.3222981213590272, + "tokens_seen": 2982041600 + }, + { + "epoch": 10.0, + "learning_rate": 4.880641925777332e-05, + "loss": 2.5673, + "theoretical_loss": 3.3222925008063373, + "tokens_seen": 2982107136 + }, + { + "epoch": 10.0, + "learning_rate": 4.879638916750251e-05, + "loss": 2.2479, + "theoretical_loss": 3.3222868804117507, + "tokens_seen": 2982172672 + }, + { + "epoch": 10.0, + "learning_rate": 4.87863590772317e-05, + "loss": 2.5187, + "theoretical_loss": 3.3222812601752585, + "tokens_seen": 2982238208 + }, + { + "epoch": 10.0, + "learning_rate": 4.8776328986960885e-05, + "loss": 2.6472, + "theoretical_loss": 3.322275640096853, + "tokens_seen": 2982303744 + }, + { + "epoch": 10.0, + "learning_rate": 4.876629889669007e-05, + "loss": 2.5534, + "theoretical_loss": 3.322270020176527, + "tokens_seen": 2982369280 + }, + { + "epoch": 10.0, + "learning_rate": 4.8756268806419254e-05, + "loss": 2.4131, + "theoretical_loss": 3.322264400414272, + "tokens_seen": 2982434816 + }, + { + "epoch": 10.0, + "learning_rate": 4.874623871614844e-05, + "loss": 2.407, + "theoretical_loss": 3.3222587808100803, + "tokens_seen": 2982500352 + }, + { + "epoch": 10.0, + "learning_rate": 4.873620862587763e-05, + "loss": 2.4585, + "theoretical_loss": 3.3222531613639443, + "tokens_seen": 2982565888 + }, + { + "epoch": 10.0, + "learning_rate": 4.872617853560682e-05, + "loss": 2.5796, + "theoretical_loss": 3.322247542075855, + "tokens_seen": 2982631424 + }, + { + "epoch": 10.0, + "learning_rate": 4.8716148445336006e-05, + "loss": 2.4005, + "theoretical_loss": 3.322241922945805, + "tokens_seen": 2982696960 + }, + { + "epoch": 10.0, + "learning_rate": 4.8706118355065194e-05, + "loss": 2.5163, + "theoretical_loss": 3.322236303973787, + "tokens_seen": 2982762496 + }, + { + "epoch": 10.0, + "learning_rate": 4.869608826479439e-05, + "loss": 2.5348, + "theoretical_loss": 3.3222306851597927, + "tokens_seen": 2982828032 + }, + { + "epoch": 10.0, + "learning_rate": 4.8686058174523576e-05, + "loss": 2.4509, + "theoretical_loss": 3.322225066503814, + "tokens_seen": 2982893568 + }, + { + "epoch": 10.0, + "learning_rate": 4.8676028084252764e-05, + "loss": 2.4854, + "theoretical_loss": 3.3222194480058427, + "tokens_seen": 2982959104 + }, + { + "epoch": 10.0, + "learning_rate": 4.866599799398195e-05, + "loss": 2.5807, + "theoretical_loss": 3.3222138296658716, + "tokens_seen": 2983024640 + }, + { + "epoch": 10.0, + "learning_rate": 4.865596790371113e-05, + "loss": 2.4504, + "theoretical_loss": 3.3222082114838924, + "tokens_seen": 2983090176 + }, + { + "epoch": 10.0, + "learning_rate": 4.864593781344032e-05, + "loss": 2.3301, + "theoretical_loss": 3.3222025934598975, + "tokens_seen": 2983155712 + }, + { + "epoch": 10.0, + "learning_rate": 4.863590772316951e-05, + "loss": 2.4154, + "theoretical_loss": 3.3221969755938785, + "tokens_seen": 2983221248 + }, + { + "epoch": 10.0, + "learning_rate": 4.86258776328987e-05, + "loss": 2.2047, + "theoretical_loss": 3.3221913578858278, + "tokens_seen": 2983286784 + }, + { + "epoch": 10.0, + "learning_rate": 4.8615847542627885e-05, + "loss": 2.4611, + "theoretical_loss": 3.322185740335737, + "tokens_seen": 2983352320 + }, + { + "epoch": 10.0, + "learning_rate": 4.860581745235707e-05, + "loss": 2.6863, + "theoretical_loss": 3.3221801229435988, + "tokens_seen": 2983417856 + }, + { + "epoch": 10.0, + "learning_rate": 4.859578736208626e-05, + "loss": 2.3381, + "theoretical_loss": 3.3221745057094054, + "tokens_seen": 2983483392 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 3315892, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7576262950897217, + "objective/train/theoretical_loss": 3.322171697151535, + "objective/train/tokens_used": 3003976160, + "theoretical_loss": 3.322171697151535, + "tokens_seen": 2983516160 + }, + { + "epoch": 10.0, + "learning_rate": 4.858575727181545e-05, + "loss": 2.6761, + "theoretical_loss": 3.3221688886331484, + "tokens_seen": 2983548928 + }, + { + "epoch": 10.0, + "learning_rate": 4.8575727181544636e-05, + "loss": 2.4836, + "theoretical_loss": 3.3221632717148197, + "tokens_seen": 2983614464 + }, + { + "epoch": 10.0, + "learning_rate": 4.8565697091273824e-05, + "loss": 2.3967, + "theoretical_loss": 3.3221576549544123, + "tokens_seen": 2983680000 + }, + { + "epoch": 10.0, + "learning_rate": 4.8555667001003005e-05, + "loss": 2.4362, + "theoretical_loss": 3.322152038351917, + "tokens_seen": 2983745536 + }, + { + "epoch": 10.0, + "learning_rate": 4.854563691073219e-05, + "loss": 2.4126, + "theoretical_loss": 3.3221464219073273, + "tokens_seen": 2983811072 + }, + { + "epoch": 10.0, + "learning_rate": 4.853560682046138e-05, + "loss": 2.56, + "theoretical_loss": 3.3221408056206343, + "tokens_seen": 2983876608 + }, + { + "epoch": 10.0, + "learning_rate": 4.852557673019057e-05, + "loss": 2.6754, + "theoretical_loss": 3.322135189491831, + "tokens_seen": 2983942144 + }, + { + "epoch": 10.0, + "learning_rate": 4.8515546639919764e-05, + "loss": 2.6243, + "theoretical_loss": 3.3221295735209084, + "tokens_seen": 2984007680 + }, + { + "epoch": 10.0, + "learning_rate": 4.850551654964895e-05, + "loss": 2.4151, + "theoretical_loss": 3.322123957707859, + "tokens_seen": 2984073216 + }, + { + "epoch": 10.01, + "learning_rate": 4.849548645937814e-05, + "loss": 2.3755, + "theoretical_loss": 3.322118342052675, + "tokens_seen": 2984138752 + }, + { + "epoch": 10.01, + "learning_rate": 4.848545636910733e-05, + "loss": 2.5082, + "theoretical_loss": 3.322112726555349, + "tokens_seen": 2984204288 + }, + { + "epoch": 10.01, + "learning_rate": 4.8475426278836515e-05, + "loss": 2.578, + "theoretical_loss": 3.3221071112158724, + "tokens_seen": 2984269824 + }, + { + "epoch": 10.01, + "learning_rate": 4.8465396188565696e-05, + "loss": 2.5215, + "theoretical_loss": 3.322101496034237, + "tokens_seen": 2984335360 + }, + { + "epoch": 10.01, + "learning_rate": 4.8455366098294884e-05, + "loss": 2.4392, + "theoretical_loss": 3.322095881010436, + "tokens_seen": 2984400896 + }, + { + "epoch": 10.01, + "learning_rate": 4.844533600802407e-05, + "loss": 2.6235, + "theoretical_loss": 3.3220902661444605, + "tokens_seen": 2984466432 + }, + { + "epoch": 10.01, + "learning_rate": 4.843530591775326e-05, + "loss": 2.3796, + "theoretical_loss": 3.3220846514363034, + "tokens_seen": 2984531968 + }, + { + "epoch": 10.01, + "learning_rate": 4.842527582748245e-05, + "loss": 2.3949, + "theoretical_loss": 3.322079036885956, + "tokens_seen": 2984597504 + }, + { + "epoch": 10.01, + "learning_rate": 4.8415245737211636e-05, + "loss": 2.5414, + "theoretical_loss": 3.322073422493411, + "tokens_seen": 2984663040 + }, + { + "epoch": 10.01, + "learning_rate": 4.8405215646940824e-05, + "loss": 2.5893, + "theoretical_loss": 3.32206780825866, + "tokens_seen": 2984728576 + }, + { + "epoch": 10.01, + "learning_rate": 4.839518555667001e-05, + "loss": 2.2747, + "theoretical_loss": 3.3220621941816955, + "tokens_seen": 2984794112 + }, + { + "epoch": 10.01, + "learning_rate": 4.83851554663992e-05, + "loss": 2.1063, + "theoretical_loss": 3.32205658026251, + "tokens_seen": 2984859648 + }, + { + "epoch": 10.01, + "learning_rate": 4.837512537612839e-05, + "loss": 2.6465, + "theoretical_loss": 3.322050966501094, + "tokens_seen": 2984925184 + }, + { + "epoch": 10.01, + "learning_rate": 4.836509528585757e-05, + "loss": 2.4733, + "theoretical_loss": 3.3220453528974416, + "tokens_seen": 2984990720 + }, + { + "epoch": 10.01, + "learning_rate": 4.8355065195586756e-05, + "loss": 2.4864, + "theoretical_loss": 3.3220397394515437, + "tokens_seen": 2985056256 + }, + { + "epoch": 10.01, + "learning_rate": 4.8345035105315944e-05, + "loss": 2.4911, + "theoretical_loss": 3.322034126163393, + "tokens_seen": 2985121792 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3320997, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.796234130859375, + "objective/train/theoretical_loss": 3.32203131957847, + "objective/train/tokens_used": 3005614560, + "theoretical_loss": 3.32203131957847, + "tokens_seen": 2985154560 + }, + { + "epoch": 10.01, + "learning_rate": 4.833500501504513e-05, + "loss": 2.5405, + "theoretical_loss": 3.322028513032981, + "tokens_seen": 2985187328 + }, + { + "epoch": 10.01, + "learning_rate": 4.832497492477433e-05, + "loss": 2.5512, + "theoretical_loss": 3.3220229000602997, + "tokens_seen": 2985252864 + }, + { + "epoch": 10.01, + "learning_rate": 4.8314944834503515e-05, + "loss": 2.477, + "theoretical_loss": 3.322017287245342, + "tokens_seen": 2985318400 + }, + { + "epoch": 10.01, + "learning_rate": 4.83049147442327e-05, + "loss": 2.5827, + "theoretical_loss": 3.3220116745881, + "tokens_seen": 2985383936 + }, + { + "epoch": 10.01, + "learning_rate": 4.829488465396189e-05, + "loss": 2.4168, + "theoretical_loss": 3.322006062088565, + "tokens_seen": 2985449472 + }, + { + "epoch": 10.01, + "learning_rate": 4.828485456369108e-05, + "loss": 2.3237, + "theoretical_loss": 3.3220004497467297, + "tokens_seen": 2985515008 + }, + { + "epoch": 10.01, + "learning_rate": 4.8274824473420266e-05, + "loss": 2.5502, + "theoretical_loss": 3.3219948375625856, + "tokens_seen": 2985580544 + }, + { + "epoch": 10.01, + "learning_rate": 4.826479438314945e-05, + "loss": 2.5413, + "theoretical_loss": 3.3219892255361256, + "tokens_seen": 2985646080 + }, + { + "epoch": 10.01, + "learning_rate": 4.8254764292878635e-05, + "loss": 2.3309, + "theoretical_loss": 3.3219836136673413, + "tokens_seen": 2985711616 + }, + { + "epoch": 10.01, + "learning_rate": 4.824473420260782e-05, + "loss": 2.5273, + "theoretical_loss": 3.321978001956225, + "tokens_seen": 2985777152 + }, + { + "epoch": 10.01, + "learning_rate": 4.823470411233701e-05, + "loss": 2.5457, + "theoretical_loss": 3.3219723904027694, + "tokens_seen": 2985842688 + }, + { + "epoch": 10.01, + "learning_rate": 4.82246740220662e-05, + "loss": 2.6161, + "theoretical_loss": 3.3219667790069654, + "tokens_seen": 2985908224 + }, + { + "epoch": 10.01, + "learning_rate": 4.821464393179539e-05, + "loss": 2.5732, + "theoretical_loss": 3.3219611677688055, + "tokens_seen": 2985973760 + }, + { + "epoch": 10.01, + "learning_rate": 4.8204613841524575e-05, + "loss": 2.5139, + "theoretical_loss": 3.321955556688282, + "tokens_seen": 2986039296 + }, + { + "epoch": 10.01, + "learning_rate": 4.819458375125376e-05, + "loss": 2.469, + "theoretical_loss": 3.3219499457653874, + "tokens_seen": 2986104832 + }, + { + "epoch": 10.01, + "learning_rate": 4.818455366098295e-05, + "loss": 2.3305, + "theoretical_loss": 3.321944335000113, + "tokens_seen": 2986170368 + }, + { + "epoch": 10.01, + "learning_rate": 4.817452357071213e-05, + "loss": 2.4089, + "theoretical_loss": 3.321938724392452, + "tokens_seen": 2986235904 + }, + { + "epoch": 10.01, + "learning_rate": 4.816449348044132e-05, + "loss": 2.3311, + "theoretical_loss": 3.321933113942395, + "tokens_seen": 2986301440 + }, + { + "epoch": 10.01, + "learning_rate": 4.815446339017051e-05, + "loss": 2.594, + "theoretical_loss": 3.3219275036499356, + "tokens_seen": 2986366976 + }, + { + "epoch": 10.01, + "learning_rate": 4.8144433299899696e-05, + "loss": 2.6041, + "theoretical_loss": 3.321921893515065, + "tokens_seen": 2986432512 + }, + { + "epoch": 10.01, + "learning_rate": 4.813440320962889e-05, + "loss": 2.3765, + "theoretical_loss": 3.321916283537776, + "tokens_seen": 2986498048 + }, + { + "epoch": 10.01, + "learning_rate": 4.812437311935808e-05, + "loss": 2.3768, + "theoretical_loss": 3.32191067371806, + "tokens_seen": 2986563584 + }, + { + "epoch": 10.01, + "learning_rate": 4.8114343029087266e-05, + "loss": 2.4623, + "theoretical_loss": 3.321905064055909, + "tokens_seen": 2986629120 + }, + { + "epoch": 10.01, + "learning_rate": 4.8104312938816454e-05, + "loss": 2.4, + "theoretical_loss": 3.3218994545513163, + "tokens_seen": 2986694656 + }, + { + "epoch": 10.01, + "learning_rate": 4.809428284854564e-05, + "loss": 2.459, + "theoretical_loss": 3.321893845204273, + "tokens_seen": 2986760192 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3326118, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2957518100738525, + "objective/train/theoretical_loss": 3.32189104058983, + "objective/train/tokens_used": 3007252960, + "theoretical_loss": 3.32189104058983, + "tokens_seen": 2986792960 + }, + { + "epoch": 10.01, + "learning_rate": 4.808425275827483e-05, + "loss": 2.3697, + "theoretical_loss": 3.3218882360147717, + "tokens_seen": 2986825728 + }, + { + "epoch": 10.01, + "learning_rate": 4.807422266800401e-05, + "loss": 2.4943, + "theoretical_loss": 3.321882626982804, + "tokens_seen": 2986891264 + }, + { + "epoch": 10.01, + "learning_rate": 4.80641925777332e-05, + "loss": 2.3699, + "theoretical_loss": 3.321877018108362, + "tokens_seen": 2986956800 + }, + { + "epoch": 10.01, + "learning_rate": 4.8054162487462387e-05, + "loss": 2.369, + "theoretical_loss": 3.3218714093914388, + "tokens_seen": 2987022336 + }, + { + "epoch": 10.01, + "learning_rate": 4.8044132397191574e-05, + "loss": 2.6438, + "theoretical_loss": 3.3218658008320254, + "tokens_seen": 2987087872 + }, + { + "epoch": 10.01, + "learning_rate": 4.803410230692076e-05, + "loss": 2.3252, + "theoretical_loss": 3.3218601924301145, + "tokens_seen": 2987153408 + }, + { + "epoch": 10.01, + "learning_rate": 4.802407221664995e-05, + "loss": 2.5989, + "theoretical_loss": 3.321854584185698, + "tokens_seen": 2987218944 + }, + { + "epoch": 10.01, + "learning_rate": 4.801404212637914e-05, + "loss": 2.3949, + "theoretical_loss": 3.3218489760987686, + "tokens_seen": 2987284480 + }, + { + "epoch": 10.01, + "learning_rate": 4.8004012036108326e-05, + "loss": 2.3983, + "theoretical_loss": 3.3218433681693176, + "tokens_seen": 2987350016 + }, + { + "epoch": 10.01, + "learning_rate": 4.7993981945837514e-05, + "loss": 2.4636, + "theoretical_loss": 3.3218377603973375, + "tokens_seen": 2987415552 + }, + { + "epoch": 10.01, + "learning_rate": 4.79839518555667e-05, + "loss": 2.5373, + "theoretical_loss": 3.3218321527828203, + "tokens_seen": 2987481088 + }, + { + "epoch": 10.01, + "learning_rate": 4.797392176529588e-05, + "loss": 2.5745, + "theoretical_loss": 3.3218265453257585, + "tokens_seen": 2987546624 + }, + { + "epoch": 10.01, + "learning_rate": 4.796389167502507e-05, + "loss": 2.4923, + "theoretical_loss": 3.3218209380261436, + "tokens_seen": 2987612160 + }, + { + "epoch": 10.01, + "learning_rate": 4.7953861584754266e-05, + "loss": 2.4669, + "theoretical_loss": 3.321815330883968, + "tokens_seen": 2987677696 + }, + { + "epoch": 10.01, + "learning_rate": 4.7943831494483453e-05, + "loss": 2.4172, + "theoretical_loss": 3.321809723899224, + "tokens_seen": 2987743232 + }, + { + "epoch": 10.01, + "learning_rate": 4.793380140421264e-05, + "loss": 2.4477, + "theoretical_loss": 3.321804117071904, + "tokens_seen": 2987808768 + }, + { + "epoch": 10.01, + "learning_rate": 4.792377131394183e-05, + "loss": 2.4156, + "theoretical_loss": 3.3217985104019996, + "tokens_seen": 2987874304 + }, + { + "epoch": 10.01, + "learning_rate": 4.791374122367102e-05, + "loss": 2.4265, + "theoretical_loss": 3.321792903889503, + "tokens_seen": 2987939840 + }, + { + "epoch": 10.01, + "learning_rate": 4.7903711133400205e-05, + "loss": 2.4751, + "theoretical_loss": 3.3217872975344065, + "tokens_seen": 2988005376 + }, + { + "epoch": 10.01, + "learning_rate": 4.789368104312939e-05, + "loss": 2.4491, + "theoretical_loss": 3.321781691336702, + "tokens_seen": 2988070912 + }, + { + "epoch": 10.01, + "learning_rate": 4.788365095285858e-05, + "loss": 2.6703, + "theoretical_loss": 3.321776085296382, + "tokens_seen": 2988136448 + }, + { + "epoch": 10.01, + "learning_rate": 4.787362086258776e-05, + "loss": 2.3286, + "theoretical_loss": 3.3217704794134377, + "tokens_seen": 2988201984 + }, + { + "epoch": 10.01, + "learning_rate": 4.786359077231695e-05, + "loss": 2.5922, + "theoretical_loss": 3.3217648736878624, + "tokens_seen": 2988267520 + }, + { + "epoch": 10.01, + "learning_rate": 4.785356068204614e-05, + "loss": 2.395, + "theoretical_loss": 3.321759268119648, + "tokens_seen": 2988333056 + }, + { + "epoch": 10.01, + "learning_rate": 4.7843530591775326e-05, + "loss": 2.2493, + "theoretical_loss": 3.3217536627087862, + "tokens_seen": 2988398592 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3331130, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.404291868209839, + "objective/train/theoretical_loss": 3.32175086006236, + "objective/train/tokens_used": 3008891360, + "theoretical_loss": 3.32175086006236, + "tokens_seen": 2988431360 + }, + { + "epoch": 10.01, + "learning_rate": 4.7833500501504514e-05, + "loss": 2.4228, + "theoretical_loss": 3.3217480574552694, + "tokens_seen": 2988464128 + }, + { + "epoch": 10.01, + "learning_rate": 4.78234704112337e-05, + "loss": 2.4488, + "theoretical_loss": 3.3217424523590893, + "tokens_seen": 2988529664 + }, + { + "epoch": 10.01, + "learning_rate": 4.781344032096289e-05, + "loss": 2.3853, + "theoretical_loss": 3.3217368474202384, + "tokens_seen": 2988595200 + }, + { + "epoch": 10.01, + "learning_rate": 4.780341023069208e-05, + "loss": 2.4726, + "theoretical_loss": 3.3217312426387093, + "tokens_seen": 2988660736 + }, + { + "epoch": 10.01, + "learning_rate": 4.7793380140421265e-05, + "loss": 2.5432, + "theoretical_loss": 3.3217256380144935, + "tokens_seen": 2988726272 + }, + { + "epoch": 10.01, + "learning_rate": 4.7783350050150446e-05, + "loss": 2.5571, + "theoretical_loss": 3.3217200335475834, + "tokens_seen": 2988791808 + }, + { + "epoch": 10.01, + "learning_rate": 4.7773319959879634e-05, + "loss": 2.5361, + "theoretical_loss": 3.3217144292379706, + "tokens_seen": 2988857344 + }, + { + "epoch": 10.01, + "learning_rate": 4.776328986960883e-05, + "loss": 2.3686, + "theoretical_loss": 3.321708825085648, + "tokens_seen": 2988922880 + }, + { + "epoch": 10.01, + "learning_rate": 4.775325977933802e-05, + "loss": 2.624, + "theoretical_loss": 3.3217032210906074, + "tokens_seen": 2988988416 + }, + { + "epoch": 10.01, + "learning_rate": 4.7743229689067205e-05, + "loss": 2.4832, + "theoretical_loss": 3.3216976172528407, + "tokens_seen": 2989053952 + }, + { + "epoch": 10.01, + "learning_rate": 4.773319959879639e-05, + "loss": 2.4764, + "theoretical_loss": 3.3216920135723402, + "tokens_seen": 2989119488 + }, + { + "epoch": 10.01, + "learning_rate": 4.772316950852558e-05, + "loss": 2.3072, + "theoretical_loss": 3.3216864100490984, + "tokens_seen": 2989185024 + }, + { + "epoch": 10.01, + "learning_rate": 4.771313941825477e-05, + "loss": 2.4691, + "theoretical_loss": 3.321680806683107, + "tokens_seen": 2989250560 + }, + { + "epoch": 10.01, + "learning_rate": 4.7703109327983956e-05, + "loss": 2.6218, + "theoretical_loss": 3.3216752034743586, + "tokens_seen": 2989316096 + }, + { + "epoch": 10.01, + "learning_rate": 4.7693079237713144e-05, + "loss": 2.5349, + "theoretical_loss": 3.3216696004228448, + "tokens_seen": 2989381632 + }, + { + "epoch": 10.01, + "learning_rate": 4.7683049147442325e-05, + "loss": 2.4277, + "theoretical_loss": 3.321663997528558, + "tokens_seen": 2989447168 + }, + { + "epoch": 10.01, + "learning_rate": 4.767301905717151e-05, + "loss": 2.4036, + "theoretical_loss": 3.3216583947914904, + "tokens_seen": 2989512704 + }, + { + "epoch": 10.01, + "learning_rate": 4.76629889669007e-05, + "loss": 2.5479, + "theoretical_loss": 3.321652792211634, + "tokens_seen": 2989578240 + }, + { + "epoch": 10.01, + "learning_rate": 4.765295887662989e-05, + "loss": 2.5262, + "theoretical_loss": 3.321647189788981, + "tokens_seen": 2989643776 + }, + { + "epoch": 10.01, + "learning_rate": 4.764292878635908e-05, + "loss": 2.4158, + "theoretical_loss": 3.3216415875235237, + "tokens_seen": 2989709312 + }, + { + "epoch": 10.01, + "learning_rate": 4.7632898696088265e-05, + "loss": 2.6333, + "theoretical_loss": 3.321635985415254, + "tokens_seen": 2989774848 + }, + { + "epoch": 10.01, + "learning_rate": 4.762286860581745e-05, + "loss": 2.4556, + "theoretical_loss": 3.321630383464164, + "tokens_seen": 2989840384 + }, + { + "epoch": 10.01, + "learning_rate": 4.761283851554664e-05, + "loss": 2.4846, + "theoretical_loss": 3.3216247816702458, + "tokens_seen": 2989905920 + }, + { + "epoch": 10.01, + "learning_rate": 4.760280842527583e-05, + "loss": 2.5359, + "theoretical_loss": 3.3216191800334918, + "tokens_seen": 2989971456 + }, + { + "epoch": 10.01, + "learning_rate": 4.7592778335005016e-05, + "loss": 2.4869, + "theoretical_loss": 3.3216135785538943, + "tokens_seen": 2990036992 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3336125, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0843727588653564, + "objective/train/theoretical_loss": 3.3216107778730266, + "objective/train/tokens_used": 3010529760, + "theoretical_loss": 3.3216107778730266, + "tokens_seen": 2990069760 + }, + { + "epoch": 10.01, + "learning_rate": 4.75827482447342e-05, + "loss": 2.2968, + "theoretical_loss": 3.3216079772314453, + "tokens_seen": 2990102528 + }, + { + "epoch": 10.01, + "learning_rate": 4.757271815446339e-05, + "loss": 2.5409, + "theoretical_loss": 3.3216023760661364, + "tokens_seen": 2990168064 + }, + { + "epoch": 10.01, + "learning_rate": 4.756268806419258e-05, + "loss": 2.5228, + "theoretical_loss": 3.3215967750579605, + "tokens_seen": 2990233600 + }, + { + "epoch": 10.01, + "learning_rate": 4.755265797392177e-05, + "loss": 2.5807, + "theoretical_loss": 3.3215911742069095, + "tokens_seen": 2990299136 + }, + { + "epoch": 10.01, + "learning_rate": 4.7542627883650956e-05, + "loss": 2.6288, + "theoretical_loss": 3.3215855735129756, + "tokens_seen": 2990364672 + }, + { + "epoch": 10.01, + "learning_rate": 4.7532597793380144e-05, + "loss": 2.4429, + "theoretical_loss": 3.3215799729761506, + "tokens_seen": 2990430208 + }, + { + "epoch": 10.01, + "learning_rate": 4.752256770310933e-05, + "loss": 2.3964, + "theoretical_loss": 3.321574372596427, + "tokens_seen": 2990495744 + }, + { + "epoch": 10.01, + "learning_rate": 4.751253761283852e-05, + "loss": 2.4901, + "theoretical_loss": 3.321568772373797, + "tokens_seen": 2990561280 + }, + { + "epoch": 10.01, + "learning_rate": 4.750250752256771e-05, + "loss": 2.4942, + "theoretical_loss": 3.3215631723082524, + "tokens_seen": 2990626816 + }, + { + "epoch": 10.01, + "learning_rate": 4.7492477432296895e-05, + "loss": 2.6053, + "theoretical_loss": 3.3215575723997857, + "tokens_seen": 2990692352 + }, + { + "epoch": 10.01, + "learning_rate": 4.7482447342026076e-05, + "loss": 2.3054, + "theoretical_loss": 3.321551972648389, + "tokens_seen": 2990757888 + }, + { + "epoch": 10.01, + "learning_rate": 4.7472417251755264e-05, + "loss": 2.3817, + "theoretical_loss": 3.321546373054054, + "tokens_seen": 2990823424 + }, + { + "epoch": 10.01, + "learning_rate": 4.746238716148445e-05, + "loss": 2.5647, + "theoretical_loss": 3.3215407736167735, + "tokens_seen": 2990888960 + }, + { + "epoch": 10.01, + "learning_rate": 4.745235707121364e-05, + "loss": 2.3784, + "theoretical_loss": 3.3215351743365393, + "tokens_seen": 2990954496 + }, + { + "epoch": 10.01, + "learning_rate": 4.744232698094283e-05, + "loss": 2.6375, + "theoretical_loss": 3.3215295752133436, + "tokens_seen": 2991020032 + }, + { + "epoch": 10.01, + "learning_rate": 4.7432296890672016e-05, + "loss": 2.518, + "theoretical_loss": 3.3215239762471787, + "tokens_seen": 2991085568 + }, + { + "epoch": 10.01, + "learning_rate": 4.7422266800401204e-05, + "loss": 2.5897, + "theoretical_loss": 3.3215183774380366, + "tokens_seen": 2991151104 + }, + { + "epoch": 10.01, + "learning_rate": 4.741223671013039e-05, + "loss": 2.4668, + "theoretical_loss": 3.3215127787859093, + "tokens_seen": 2991216640 + }, + { + "epoch": 10.01, + "learning_rate": 4.740220661985958e-05, + "loss": 2.4593, + "theoretical_loss": 3.3215071802907894, + "tokens_seen": 2991282176 + }, + { + "epoch": 10.01, + "learning_rate": 4.739217652958876e-05, + "loss": 2.3764, + "theoretical_loss": 3.3215015819526688, + "tokens_seen": 2991347712 + }, + { + "epoch": 10.01, + "learning_rate": 4.7382146439317955e-05, + "loss": 2.2629, + "theoretical_loss": 3.3214959837715394, + "tokens_seen": 2991413248 + }, + { + "epoch": 10.01, + "learning_rate": 4.737211634904714e-05, + "loss": 2.4208, + "theoretical_loss": 3.321490385747394, + "tokens_seen": 2991478784 + }, + { + "epoch": 10.01, + "learning_rate": 4.736208625877633e-05, + "loss": 2.4197, + "theoretical_loss": 3.321484787880224, + "tokens_seen": 2991544320 + }, + { + "epoch": 10.01, + "learning_rate": 4.735205616850552e-05, + "loss": 2.3679, + "theoretical_loss": 3.321479190170022, + "tokens_seen": 2991609856 + }, + { + "epoch": 10.01, + "learning_rate": 4.734202607823471e-05, + "loss": 2.3712, + "theoretical_loss": 3.32147359261678, + "tokens_seen": 2991675392 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3337582, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5961523056030273, + "objective/train/theoretical_loss": 3.321470793899017, + "objective/train/tokens_used": 3012168160, + "theoretical_loss": 3.321470793899017, + "tokens_seen": 2991708160 + }, + { + "epoch": 10.01, + "learning_rate": 4.7331995987963895e-05, + "loss": 2.3037, + "theoretical_loss": 3.3214679952204906, + "tokens_seen": 2991740928 + }, + { + "epoch": 10.01, + "learning_rate": 4.732196589769308e-05, + "loss": 2.4245, + "theoretical_loss": 3.3214623979811457, + "tokens_seen": 2991806464 + }, + { + "epoch": 10.01, + "learning_rate": 4.731193580742227e-05, + "loss": 2.5074, + "theoretical_loss": 3.321456800898737, + "tokens_seen": 2991872000 + }, + { + "epoch": 10.01, + "learning_rate": 4.730190571715146e-05, + "loss": 2.5236, + "theoretical_loss": 3.321451203973257, + "tokens_seen": 2991937536 + }, + { + "epoch": 10.01, + "learning_rate": 4.729187562688064e-05, + "loss": 2.5113, + "theoretical_loss": 3.3214456072046983, + "tokens_seen": 2992003072 + }, + { + "epoch": 10.01, + "learning_rate": 4.728184553660983e-05, + "loss": 2.3421, + "theoretical_loss": 3.3214400105930526, + "tokens_seen": 2992068608 + }, + { + "epoch": 10.01, + "learning_rate": 4.7271815446339015e-05, + "loss": 2.2856, + "theoretical_loss": 3.321434414138312, + "tokens_seen": 2992134144 + }, + { + "epoch": 10.01, + "learning_rate": 4.72617853560682e-05, + "loss": 2.5184, + "theoretical_loss": 3.3214288178404687, + "tokens_seen": 2992199680 + }, + { + "epoch": 10.01, + "learning_rate": 4.725175526579739e-05, + "loss": 2.4874, + "theoretical_loss": 3.321423221699515, + "tokens_seen": 2992265216 + }, + { + "epoch": 10.01, + "learning_rate": 4.724172517552658e-05, + "loss": 2.5826, + "theoretical_loss": 3.3214176257154433, + "tokens_seen": 2992330752 + }, + { + "epoch": 10.01, + "learning_rate": 4.723169508525577e-05, + "loss": 2.5904, + "theoretical_loss": 3.3214120298882452, + "tokens_seen": 2992396288 + }, + { + "epoch": 10.01, + "learning_rate": 4.7221664994984955e-05, + "loss": 2.4364, + "theoretical_loss": 3.3214064342179137, + "tokens_seen": 2992461824 + }, + { + "epoch": 10.01, + "learning_rate": 4.721163490471414e-05, + "loss": 2.409, + "theoretical_loss": 3.32140083870444, + "tokens_seen": 2992527360 + }, + { + "epoch": 10.01, + "learning_rate": 4.720160481444334e-05, + "loss": 2.422, + "theoretical_loss": 3.3213952433478164, + "tokens_seen": 2992592896 + }, + { + "epoch": 10.01, + "learning_rate": 4.719157472417252e-05, + "loss": 2.0997, + "theoretical_loss": 3.3213896481480356, + "tokens_seen": 2992658432 + }, + { + "epoch": 10.01, + "learning_rate": 4.7181544633901706e-05, + "loss": 2.6021, + "theoretical_loss": 3.3213840531050898, + "tokens_seen": 2992723968 + }, + { + "epoch": 10.01, + "learning_rate": 4.7171514543630894e-05, + "loss": 2.4579, + "theoretical_loss": 3.3213784582189705, + "tokens_seen": 2992789504 + }, + { + "epoch": 10.01, + "learning_rate": 4.716148445336008e-05, + "loss": 2.3134, + "theoretical_loss": 3.3213728634896706, + "tokens_seen": 2992855040 + }, + { + "epoch": 10.01, + "learning_rate": 4.715145436308927e-05, + "loss": 2.4224, + "theoretical_loss": 3.321367268917182, + "tokens_seen": 2992920576 + }, + { + "epoch": 10.01, + "learning_rate": 4.714142427281846e-05, + "loss": 2.6278, + "theoretical_loss": 3.321361674501497, + "tokens_seen": 2992986112 + }, + { + "epoch": 10.01, + "learning_rate": 4.7131394182547646e-05, + "loss": 2.4071, + "theoretical_loss": 3.321356080242607, + "tokens_seen": 2993051648 + }, + { + "epoch": 10.01, + "learning_rate": 4.7121364092276834e-05, + "loss": 2.4544, + "theoretical_loss": 3.321350486140505, + "tokens_seen": 2993117184 + }, + { + "epoch": 10.01, + "learning_rate": 4.711133400200602e-05, + "loss": 2.4751, + "theoretical_loss": 3.321344892195183, + "tokens_seen": 2993182720 + }, + { + "epoch": 10.01, + "learning_rate": 4.710130391173521e-05, + "loss": 2.4678, + "theoretical_loss": 3.321339298406633, + "tokens_seen": 2993248256 + }, + { + "epoch": 10.01, + "learning_rate": 4.709127382146439e-05, + "loss": 2.3453, + "theoretical_loss": 3.3213337047748475, + "tokens_seen": 2993313792 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3337918, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.297234058380127, + "objective/train/theoretical_loss": 3.321330908017739, + "objective/train/tokens_used": 3013806560, + "theoretical_loss": 3.321330908017739, + "tokens_seen": 2993346560 + }, + { + "epoch": 10.01, + "learning_rate": 4.708124373119358e-05, + "loss": 2.3643, + "theoretical_loss": 3.321328111299818, + "tokens_seen": 2993379328 + }, + { + "epoch": 10.01, + "learning_rate": 4.7071213640922767e-05, + "loss": 2.3578, + "theoretical_loss": 3.3213225179815375, + "tokens_seen": 2993444864 + }, + { + "epoch": 10.01, + "learning_rate": 4.7061183550651954e-05, + "loss": 2.4893, + "theoretical_loss": 3.321316924819998, + "tokens_seen": 2993510400 + }, + { + "epoch": 10.01, + "learning_rate": 4.705115346038114e-05, + "loss": 2.6273, + "theoretical_loss": 3.321311331815191, + "tokens_seen": 2993575936 + }, + { + "epoch": 10.01, + "learning_rate": 4.704112337011033e-05, + "loss": 2.5675, + "theoretical_loss": 3.3213057389671095, + "tokens_seen": 2993641472 + }, + { + "epoch": 10.01, + "learning_rate": 4.703109327983952e-05, + "loss": 2.4235, + "theoretical_loss": 3.321300146275745, + "tokens_seen": 2993707008 + }, + { + "epoch": 10.01, + "learning_rate": 4.7021063189568706e-05, + "loss": 2.4227, + "theoretical_loss": 3.3212945537410903, + "tokens_seen": 2993772544 + }, + { + "epoch": 10.01, + "learning_rate": 4.70110330992979e-05, + "loss": 2.5043, + "theoretical_loss": 3.3212889613631376, + "tokens_seen": 2993838080 + }, + { + "epoch": 10.01, + "learning_rate": 4.700100300902708e-05, + "loss": 2.5712, + "theoretical_loss": 3.321283369141878, + "tokens_seen": 2993903616 + }, + { + "epoch": 10.01, + "learning_rate": 4.699097291875627e-05, + "loss": 2.3459, + "theoretical_loss": 3.321277777077305, + "tokens_seen": 2993969152 + }, + { + "epoch": 10.01, + "learning_rate": 4.698094282848546e-05, + "loss": 2.4701, + "theoretical_loss": 3.3212721851694105, + "tokens_seen": 2994034688 + }, + { + "epoch": 10.01, + "learning_rate": 4.6970912738214646e-05, + "loss": 2.3864, + "theoretical_loss": 3.3212665934181858, + "tokens_seen": 2994100224 + }, + { + "epoch": 10.01, + "learning_rate": 4.6960882647943833e-05, + "loss": 2.5747, + "theoretical_loss": 3.3212610018236237, + "tokens_seen": 2994165760 + }, + { + "epoch": 10.01, + "learning_rate": 4.695085255767302e-05, + "loss": 2.5666, + "theoretical_loss": 3.321255410385717, + "tokens_seen": 2994231296 + }, + { + "epoch": 10.01, + "learning_rate": 4.694082246740221e-05, + "loss": 2.4707, + "theoretical_loss": 3.321249819104457, + "tokens_seen": 2994296832 + }, + { + "epoch": 10.01, + "learning_rate": 4.69307923771314e-05, + "loss": 2.5341, + "theoretical_loss": 3.321244227979836, + "tokens_seen": 2994362368 + }, + { + "epoch": 10.01, + "learning_rate": 4.6920762286860585e-05, + "loss": 2.381, + "theoretical_loss": 3.321238637011846, + "tokens_seen": 2994427904 + }, + { + "epoch": 10.01, + "learning_rate": 4.691073219658977e-05, + "loss": 2.3067, + "theoretical_loss": 3.32123304620048, + "tokens_seen": 2994493440 + }, + { + "epoch": 10.01, + "learning_rate": 4.6900702106318954e-05, + "loss": 2.5874, + "theoretical_loss": 3.3212274555457295, + "tokens_seen": 2994558976 + }, + { + "epoch": 10.01, + "learning_rate": 4.689067201604814e-05, + "loss": 2.4377, + "theoretical_loss": 3.321221865047587, + "tokens_seen": 2994624512 + }, + { + "epoch": 10.01, + "learning_rate": 4.688064192577733e-05, + "loss": 2.3538, + "theoretical_loss": 3.3212162747060447, + "tokens_seen": 2994690048 + }, + { + "epoch": 10.01, + "learning_rate": 4.687061183550652e-05, + "loss": 2.3044, + "theoretical_loss": 3.3212106845210947, + "tokens_seen": 2994755584 + }, + { + "epoch": 10.01, + "learning_rate": 4.6860581745235706e-05, + "loss": 2.51, + "theoretical_loss": 3.3212050944927287, + "tokens_seen": 2994821120 + }, + { + "epoch": 10.01, + "learning_rate": 4.6850551654964894e-05, + "loss": 2.5192, + "theoretical_loss": 3.3211995046209397, + "tokens_seen": 2994886656 + }, + { + "epoch": 10.01, + "learning_rate": 4.684052156469408e-05, + "loss": 2.3477, + "theoretical_loss": 3.3211939149057192, + "tokens_seen": 2994952192 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3339287, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7042019367218018, + "objective/train/theoretical_loss": 3.32119112010682, + "objective/train/tokens_used": 3015444960, + "theoretical_loss": 3.32119112010682, + "tokens_seen": 2994984960 + }, + { + "epoch": 10.01, + "learning_rate": 4.683049147442327e-05, + "loss": 2.5106, + "theoretical_loss": 3.3211883253470598, + "tokens_seen": 2995017728 + }, + { + "epoch": 10.01, + "learning_rate": 4.6820461384152464e-05, + "loss": 2.4721, + "theoretical_loss": 3.3211827359449537, + "tokens_seen": 2995083264 + }, + { + "epoch": 10.01, + "learning_rate": 4.681043129388165e-05, + "loss": 2.5656, + "theoretical_loss": 3.3211771466993927, + "tokens_seen": 2995148800 + }, + { + "epoch": 10.01, + "learning_rate": 4.680040120361083e-05, + "loss": 2.4678, + "theoretical_loss": 3.32117155761037, + "tokens_seen": 2995214336 + }, + { + "epoch": 10.01, + "learning_rate": 4.679037111334002e-05, + "loss": 2.5884, + "theoretical_loss": 3.3211659686778763, + "tokens_seen": 2995279872 + }, + { + "epoch": 10.01, + "learning_rate": 4.678034102306921e-05, + "loss": 2.5487, + "theoretical_loss": 3.321160379901905, + "tokens_seen": 2995345408 + }, + { + "epoch": 10.01, + "learning_rate": 4.67703109327984e-05, + "loss": 2.3855, + "theoretical_loss": 3.3211547912824475, + "tokens_seen": 2995410944 + }, + { + "epoch": 10.01, + "learning_rate": 4.6760280842527585e-05, + "loss": 2.611, + "theoretical_loss": 3.3211492028194964, + "tokens_seen": 2995476480 + }, + { + "epoch": 10.01, + "learning_rate": 4.675025075225677e-05, + "loss": 2.5038, + "theoretical_loss": 3.321143614513044, + "tokens_seen": 2995542016 + }, + { + "epoch": 10.01, + "learning_rate": 4.674022066198596e-05, + "loss": 2.5109, + "theoretical_loss": 3.3211380263630828, + "tokens_seen": 2995607552 + }, + { + "epoch": 10.01, + "learning_rate": 4.673019057171515e-05, + "loss": 2.5336, + "theoretical_loss": 3.321132438369604, + "tokens_seen": 2995673088 + }, + { + "epoch": 10.01, + "learning_rate": 4.6720160481444336e-05, + "loss": 2.5644, + "theoretical_loss": 3.3211268505326004, + "tokens_seen": 2995738624 + }, + { + "epoch": 10.01, + "learning_rate": 4.671013039117352e-05, + "loss": 2.3312, + "theoretical_loss": 3.321121262852064, + "tokens_seen": 2995804160 + }, + { + "epoch": 10.01, + "learning_rate": 4.6700100300902705e-05, + "loss": 2.5556, + "theoretical_loss": 3.321115675327987, + "tokens_seen": 2995869696 + }, + { + "epoch": 10.01, + "learning_rate": 4.669007021063189e-05, + "loss": 2.5474, + "theoretical_loss": 3.321110087960362, + "tokens_seen": 2995935232 + }, + { + "epoch": 10.01, + "learning_rate": 4.668004012036108e-05, + "loss": 2.6028, + "theoretical_loss": 3.321104500749181, + "tokens_seen": 2996000768 + }, + { + "epoch": 10.01, + "learning_rate": 4.667001003009027e-05, + "loss": 2.4489, + "theoretical_loss": 3.3210989136944358, + "tokens_seen": 2996066304 + }, + { + "epoch": 10.01, + "learning_rate": 4.665997993981946e-05, + "loss": 2.4756, + "theoretical_loss": 3.321093326796119, + "tokens_seen": 2996131840 + }, + { + "epoch": 10.01, + "learning_rate": 4.6649949849548645e-05, + "loss": 2.5245, + "theoretical_loss": 3.3210877400542227, + "tokens_seen": 2996197376 + }, + { + "epoch": 10.01, + "learning_rate": 4.663991975927784e-05, + "loss": 2.4708, + "theoretical_loss": 3.321082153468739, + "tokens_seen": 2996262912 + }, + { + "epoch": 10.01, + "learning_rate": 4.662988966900703e-05, + "loss": 2.5273, + "theoretical_loss": 3.3210765670396603, + "tokens_seen": 2996328448 + }, + { + "epoch": 10.01, + "learning_rate": 4.6619859578736215e-05, + "loss": 2.5872, + "theoretical_loss": 3.321070980766979, + "tokens_seen": 2996393984 + }, + { + "epoch": 10.01, + "learning_rate": 4.6609829488465396e-05, + "loss": 2.4004, + "theoretical_loss": 3.3210653946506867, + "tokens_seen": 2996459520 + }, + { + "epoch": 10.01, + "learning_rate": 4.6599799398194584e-05, + "loss": 2.3618, + "theoretical_loss": 3.3210598086907757, + "tokens_seen": 2996525056 + }, + { + "epoch": 10.01, + "learning_rate": 4.658976930792377e-05, + "loss": 2.4872, + "theoretical_loss": 3.3210542228872386, + "tokens_seen": 2996590592 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3339868, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9447882175445557, + "objective/train/theoretical_loss": 3.3210514300441076, + "objective/train/tokens_used": 3017083360, + "theoretical_loss": 3.3210514300441076, + "tokens_seen": 2996623360 + }, + { + "epoch": 10.01, + "learning_rate": 4.657973921765296e-05, + "loss": 2.5027, + "theoretical_loss": 3.3210486372400676, + "tokens_seen": 2996656128 + }, + { + "epoch": 10.01, + "learning_rate": 4.656970912738215e-05, + "loss": 2.3375, + "theoretical_loss": 3.321043051749254, + "tokens_seen": 2996721664 + }, + { + "epoch": 10.01, + "learning_rate": 4.6559679037111336e-05, + "loss": 2.4422, + "theoretical_loss": 3.3210374664147917, + "tokens_seen": 2996787200 + }, + { + "epoch": 10.01, + "learning_rate": 4.6549648946840524e-05, + "loss": 2.6226, + "theoretical_loss": 3.3210318812366717, + "tokens_seen": 2996852736 + }, + { + "epoch": 10.01, + "learning_rate": 4.653961885656971e-05, + "loss": 2.4202, + "theoretical_loss": 3.3210262962148858, + "tokens_seen": 2996918272 + }, + { + "epoch": 10.01, + "learning_rate": 4.65295887662989e-05, + "loss": 2.3598, + "theoretical_loss": 3.3210207113494272, + "tokens_seen": 2996983808 + }, + { + "epoch": 10.01, + "learning_rate": 4.651955867602809e-05, + "loss": 2.6059, + "theoretical_loss": 3.3210151266402876, + "tokens_seen": 2997049344 + }, + { + "epoch": 10.01, + "learning_rate": 4.650952858575727e-05, + "loss": 2.544, + "theoretical_loss": 3.3210095420874595, + "tokens_seen": 2997114880 + }, + { + "epoch": 10.01, + "learning_rate": 4.6499498495486456e-05, + "loss": 2.4173, + "theoretical_loss": 3.321003957690935, + "tokens_seen": 2997180416 + }, + { + "epoch": 10.01, + "learning_rate": 4.6489468405215644e-05, + "loss": 2.5924, + "theoretical_loss": 3.3209983734507063, + "tokens_seen": 2997245952 + }, + { + "epoch": 10.01, + "learning_rate": 4.647943831494483e-05, + "loss": 2.421, + "theoretical_loss": 3.3209927893667657, + "tokens_seen": 2997311488 + }, + { + "epoch": 10.01, + "learning_rate": 4.646940822467402e-05, + "loss": 2.5255, + "theoretical_loss": 3.320987205439105, + "tokens_seen": 2997377024 + }, + { + "epoch": 10.01, + "learning_rate": 4.645937813440321e-05, + "loss": 2.4513, + "theoretical_loss": 3.3209816216677166, + "tokens_seen": 2997442560 + }, + { + "epoch": 10.01, + "learning_rate": 4.64493480441324e-05, + "loss": 2.6042, + "theoretical_loss": 3.320976038052593, + "tokens_seen": 2997508096 + }, + { + "epoch": 10.01, + "learning_rate": 4.643931795386159e-05, + "loss": 2.5114, + "theoretical_loss": 3.320970454593726, + "tokens_seen": 2997573632 + }, + { + "epoch": 10.01, + "learning_rate": 4.642928786359078e-05, + "loss": 2.4823, + "theoretical_loss": 3.320964871291108, + "tokens_seen": 2997639168 + }, + { + "epoch": 10.01, + "learning_rate": 4.6419257773319966e-05, + "loss": 2.3371, + "theoretical_loss": 3.3209592881447314, + "tokens_seen": 2997704704 + }, + { + "epoch": 10.01, + "learning_rate": 4.640922768304915e-05, + "loss": 2.4132, + "theoretical_loss": 3.3209537051545883, + "tokens_seen": 2997770240 + }, + { + "epoch": 10.01, + "learning_rate": 4.6399197592778335e-05, + "loss": 2.2286, + "theoretical_loss": 3.3209481223206705, + "tokens_seen": 2997835776 + }, + { + "epoch": 10.01, + "learning_rate": 4.638916750250752e-05, + "loss": 2.5208, + "theoretical_loss": 3.320942539642971, + "tokens_seen": 2997901312 + }, + { + "epoch": 10.01, + "learning_rate": 4.637913741223671e-05, + "loss": 2.3998, + "theoretical_loss": 3.3209369571214813, + "tokens_seen": 2997966848 + }, + { + "epoch": 10.01, + "learning_rate": 4.63691073219659e-05, + "loss": 2.4945, + "theoretical_loss": 3.320931374756194, + "tokens_seen": 2998032384 + }, + { + "epoch": 10.01, + "learning_rate": 4.635907723169509e-05, + "loss": 2.3755, + "theoretical_loss": 3.320925792547101, + "tokens_seen": 2998097920 + }, + { + "epoch": 10.01, + "learning_rate": 4.6349047141424275e-05, + "loss": 2.4222, + "theoretical_loss": 3.320920210494195, + "tokens_seen": 2998163456 + }, + { + "epoch": 10.01, + "learning_rate": 4.633901705115346e-05, + "loss": 2.5703, + "theoretical_loss": 3.320914628597468, + "tokens_seen": 2998228992 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3341284, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6037189960479736, + "objective/train/theoretical_loss": 3.320911837707669, + "objective/train/tokens_used": 3018721760, + "theoretical_loss": 3.320911837707669, + "tokens_seen": 2998261760 + }, + { + "epoch": 10.01, + "learning_rate": 4.632898696088265e-05, + "loss": 2.3745, + "theoretical_loss": 3.3209090468569116, + "tokens_seen": 2998294528 + }, + { + "epoch": 10.01, + "learning_rate": 4.631895687061183e-05, + "loss": 2.338, + "theoretical_loss": 3.320903465272519, + "tokens_seen": 2998360064 + }, + { + "epoch": 10.01, + "learning_rate": 4.630892678034102e-05, + "loss": 2.379, + "theoretical_loss": 3.320897883844282, + "tokens_seen": 2998425600 + }, + { + "epoch": 10.01, + "learning_rate": 4.629889669007021e-05, + "loss": 2.4971, + "theoretical_loss": 3.320892302572193, + "tokens_seen": 2998491136 + }, + { + "epoch": 10.01, + "learning_rate": 4.6288866599799395e-05, + "loss": 2.4168, + "theoretical_loss": 3.320886721456244, + "tokens_seen": 2998556672 + }, + { + "epoch": 10.01, + "learning_rate": 4.627883650952858e-05, + "loss": 2.5304, + "theoretical_loss": 3.320881140496427, + "tokens_seen": 2998622208 + }, + { + "epoch": 10.01, + "learning_rate": 4.626880641925777e-05, + "loss": 2.4869, + "theoretical_loss": 3.320875559692734, + "tokens_seen": 2998687744 + }, + { + "epoch": 10.01, + "learning_rate": 4.6258776328986966e-05, + "loss": 2.5345, + "theoretical_loss": 3.3208699790451583, + "tokens_seen": 2998753280 + }, + { + "epoch": 10.01, + "learning_rate": 4.6248746238716154e-05, + "loss": 2.5558, + "theoretical_loss": 3.3208643985536916, + "tokens_seen": 2998818816 + }, + { + "epoch": 10.01, + "learning_rate": 4.623871614844534e-05, + "loss": 2.4019, + "theoretical_loss": 3.320858818218326, + "tokens_seen": 2998884352 + }, + { + "epoch": 10.01, + "learning_rate": 4.622868605817453e-05, + "loss": 2.5623, + "theoretical_loss": 3.3208532380390534, + "tokens_seen": 2998949888 + }, + { + "epoch": 10.01, + "learning_rate": 4.621865596790371e-05, + "loss": 2.5287, + "theoretical_loss": 3.3208476580158663, + "tokens_seen": 2999015424 + }, + { + "epoch": 10.01, + "learning_rate": 4.62086258776329e-05, + "loss": 2.5947, + "theoretical_loss": 3.3208420781487575, + "tokens_seen": 2999080960 + }, + { + "epoch": 10.01, + "learning_rate": 4.6198595787362086e-05, + "loss": 2.4228, + "theoretical_loss": 3.3208364984377186, + "tokens_seen": 2999146496 + }, + { + "epoch": 10.01, + "learning_rate": 4.6188565697091274e-05, + "loss": 2.5944, + "theoretical_loss": 3.3208309188827414, + "tokens_seen": 2999212032 + }, + { + "epoch": 10.01, + "learning_rate": 4.617853560682046e-05, + "loss": 2.4365, + "theoretical_loss": 3.3208253394838194, + "tokens_seen": 2999277568 + }, + { + "epoch": 10.01, + "learning_rate": 4.616850551654965e-05, + "loss": 2.4796, + "theoretical_loss": 3.3208197602409437, + "tokens_seen": 2999343104 + }, + { + "epoch": 10.01, + "learning_rate": 4.615847542627884e-05, + "loss": 2.2992, + "theoretical_loss": 3.3208141811541068, + "tokens_seen": 2999408640 + }, + { + "epoch": 10.01, + "learning_rate": 4.6148445336008026e-05, + "loss": 2.4573, + "theoretical_loss": 3.3208086022233014, + "tokens_seen": 2999474176 + }, + { + "epoch": 10.01, + "learning_rate": 4.6138415245737214e-05, + "loss": 2.4403, + "theoretical_loss": 3.3208030234485193, + "tokens_seen": 2999539712 + }, + { + "epoch": 10.01, + "learning_rate": 4.61283851554664e-05, + "loss": 2.288, + "theoretical_loss": 3.320797444829753, + "tokens_seen": 2999605248 + }, + { + "epoch": 10.01, + "learning_rate": 4.611835506519558e-05, + "loss": 2.3109, + "theoretical_loss": 3.320791866366994, + "tokens_seen": 2999670784 + }, + { + "epoch": 10.01, + "learning_rate": 4.610832497492477e-05, + "loss": 2.1786, + "theoretical_loss": 3.3207862880602352, + "tokens_seen": 2999736320 + }, + { + "epoch": 10.01, + "learning_rate": 4.609829488465396e-05, + "loss": 2.4082, + "theoretical_loss": 3.320780709909469, + "tokens_seen": 2999801856 + }, + { + "epoch": 10.01, + "learning_rate": 4.6088264794383147e-05, + "loss": 2.4456, + "theoretical_loss": 3.320775131914687, + "tokens_seen": 2999867392 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3342108, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.288560628890991, + "objective/train/theoretical_loss": 3.320772342975788, + "objective/train/tokens_used": 3020360160, + "theoretical_loss": 3.320772342975788, + "tokens_seen": 2999900160 + }, + { + "epoch": 10.01, + "learning_rate": 4.6078234704112334e-05, + "loss": 2.4083, + "theoretical_loss": 3.3207695540758824, + "tokens_seen": 2999932928 + }, + { + "epoch": 10.01, + "learning_rate": 4.606820461384153e-05, + "loss": 2.408, + "theoretical_loss": 3.320763976393046, + "tokens_seen": 2999998464 + }, + { + "epoch": 10.01, + "learning_rate": 4.605817452357072e-05, + "loss": 2.439, + "theoretical_loss": 3.320758398866171, + "tokens_seen": 3000064000 + }, + { + "epoch": 10.01, + "learning_rate": 4.6048144433299905e-05, + "loss": 2.4455, + "theoretical_loss": 3.3207528214952493, + "tokens_seen": 3000129536 + }, + { + "epoch": 10.01, + "learning_rate": 4.603811434302909e-05, + "loss": 2.57, + "theoretical_loss": 3.3207472442802737, + "tokens_seen": 3000195072 + }, + { + "epoch": 10.01, + "learning_rate": 4.602808425275828e-05, + "loss": 2.3776, + "theoretical_loss": 3.320741667221236, + "tokens_seen": 3000260608 + }, + { + "epoch": 10.01, + "learning_rate": 4.601805416248746e-05, + "loss": 2.4367, + "theoretical_loss": 3.320736090318128, + "tokens_seen": 3000326144 + }, + { + "epoch": 10.01, + "learning_rate": 4.600802407221665e-05, + "loss": 2.4022, + "theoretical_loss": 3.3207305135709433, + "tokens_seen": 3000391680 + }, + { + "epoch": 10.01, + "learning_rate": 4.599799398194584e-05, + "loss": 2.5695, + "theoretical_loss": 3.3207249369796723, + "tokens_seen": 3000457216 + }, + { + "epoch": 10.01, + "learning_rate": 4.5987963891675026e-05, + "loss": 2.4508, + "theoretical_loss": 3.3207193605443086, + "tokens_seen": 3000522752 + }, + { + "epoch": 10.01, + "learning_rate": 4.5977933801404213e-05, + "loss": 2.5896, + "theoretical_loss": 3.3207137842648438, + "tokens_seen": 3000588288 + }, + { + "epoch": 10.01, + "learning_rate": 4.59679037111334e-05, + "loss": 2.5398, + "theoretical_loss": 3.32070820814127, + "tokens_seen": 3000653824 + }, + { + "epoch": 10.01, + "learning_rate": 4.595787362086259e-05, + "loss": 2.5873, + "theoretical_loss": 3.3207026321735804, + "tokens_seen": 3000719360 + }, + { + "epoch": 10.01, + "learning_rate": 4.594784353059178e-05, + "loss": 2.2936, + "theoretical_loss": 3.3206970563617664, + "tokens_seen": 3000784896 + }, + { + "epoch": 10.01, + "learning_rate": 4.5937813440320965e-05, + "loss": 2.3952, + "theoretical_loss": 3.32069148070582, + "tokens_seen": 3000850432 + }, + { + "epoch": 10.01, + "learning_rate": 4.5927783350050146e-05, + "loss": 2.4102, + "theoretical_loss": 3.3206859052057345, + "tokens_seen": 3000915968 + }, + { + "epoch": 10.01, + "learning_rate": 4.5917753259779334e-05, + "loss": 2.3987, + "theoretical_loss": 3.320680329861501, + "tokens_seen": 3000981504 + }, + { + "epoch": 10.01, + "learning_rate": 4.590772316950852e-05, + "loss": 2.3956, + "theoretical_loss": 3.320674754673113, + "tokens_seen": 3001047040 + }, + { + "epoch": 10.01, + "learning_rate": 4.589769307923771e-05, + "loss": 2.5127, + "theoretical_loss": 3.320669179640561, + "tokens_seen": 3001112576 + }, + { + "epoch": 10.01, + "learning_rate": 4.5887662988966904e-05, + "loss": 2.4214, + "theoretical_loss": 3.320663604763839, + "tokens_seen": 3001178112 + }, + { + "epoch": 10.01, + "learning_rate": 4.587763289869609e-05, + "loss": 2.4531, + "theoretical_loss": 3.3206580300429382, + "tokens_seen": 3001243648 + }, + { + "epoch": 10.01, + "learning_rate": 4.586760280842528e-05, + "loss": 2.5282, + "theoretical_loss": 3.3206524554778514, + "tokens_seen": 3001309184 + }, + { + "epoch": 10.01, + "learning_rate": 4.585757271815447e-05, + "loss": 2.3431, + "theoretical_loss": 3.32064688106857, + "tokens_seen": 3001374720 + }, + { + "epoch": 10.01, + "learning_rate": 4.5847542627883656e-05, + "loss": 2.3774, + "theoretical_loss": 3.3206413068150873, + "tokens_seen": 3001440256 + }, + { + "epoch": 10.01, + "learning_rate": 4.5837512537612844e-05, + "loss": 2.558, + "theoretical_loss": 3.320635732717395, + "tokens_seen": 3001505792 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3343615, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.624105930328369, + "objective/train/theoretical_loss": 3.3206329457269677, + "objective/train/tokens_used": 3021998560, + "theoretical_loss": 3.3206329457269677, + "tokens_seen": 3001538560 + }, + { + "epoch": 10.01, + "learning_rate": 4.5827482447342025e-05, + "loss": 2.5783, + "theoretical_loss": 3.320630158775485, + "tokens_seen": 3001571328 + }, + { + "epoch": 10.01, + "learning_rate": 4.581745235707121e-05, + "loss": 2.4209, + "theoretical_loss": 3.3206245849893508, + "tokens_seen": 3001636864 + }, + { + "epoch": 10.01, + "learning_rate": 4.58074222668004e-05, + "loss": 2.5538, + "theoretical_loss": 3.320619011358983, + "tokens_seen": 3001702400 + }, + { + "epoch": 10.01, + "learning_rate": 4.579739217652959e-05, + "loss": 2.3516, + "theoretical_loss": 3.320613437884375, + "tokens_seen": 3001767936 + }, + { + "epoch": 10.01, + "learning_rate": 4.578736208625878e-05, + "loss": 2.3156, + "theoretical_loss": 3.3206078645655186, + "tokens_seen": 3001833472 + }, + { + "epoch": 10.01, + "learning_rate": 4.5777331995987965e-05, + "loss": 2.5434, + "theoretical_loss": 3.320602291402406, + "tokens_seen": 3001899008 + }, + { + "epoch": 10.01, + "learning_rate": 4.576730190571715e-05, + "loss": 2.3954, + "theoretical_loss": 3.3205967183950293, + "tokens_seen": 3001964544 + }, + { + "epoch": 10.01, + "learning_rate": 4.575727181544634e-05, + "loss": 2.4636, + "theoretical_loss": 3.3205911455433816, + "tokens_seen": 3002030080 + }, + { + "epoch": 10.01, + "learning_rate": 4.574724172517553e-05, + "loss": 2.359, + "theoretical_loss": 3.3205855728474543, + "tokens_seen": 3002095616 + }, + { + "epoch": 10.01, + "learning_rate": 4.5737211634904716e-05, + "loss": 2.7051, + "theoretical_loss": 3.32058000030724, + "tokens_seen": 3002161152 + }, + { + "epoch": 10.01, + "learning_rate": 4.57271815446339e-05, + "loss": 2.4406, + "theoretical_loss": 3.3205744279227307, + "tokens_seen": 3002226688 + }, + { + "epoch": 10.01, + "learning_rate": 4.5717151454363085e-05, + "loss": 2.242, + "theoretical_loss": 3.320568855693919, + "tokens_seen": 3002292224 + }, + { + "epoch": 10.01, + "learning_rate": 4.570712136409227e-05, + "loss": 2.3732, + "theoretical_loss": 3.320563283620797, + "tokens_seen": 3002357760 + }, + { + "epoch": 10.01, + "learning_rate": 4.569709127382147e-05, + "loss": 2.2248, + "theoretical_loss": 3.320557711703357, + "tokens_seen": 3002423296 + }, + { + "epoch": 10.01, + "learning_rate": 4.5687061183550656e-05, + "loss": 2.5689, + "theoretical_loss": 3.320552139941591, + "tokens_seen": 3002488832 + }, + { + "epoch": 10.01, + "learning_rate": 4.5677031093279844e-05, + "loss": 2.4084, + "theoretical_loss": 3.3205465683354913, + "tokens_seen": 3002554368 + }, + { + "epoch": 10.01, + "learning_rate": 4.566700100300903e-05, + "loss": 2.2009, + "theoretical_loss": 3.3205409968850508, + "tokens_seen": 3002619904 + }, + { + "epoch": 10.01, + "learning_rate": 4.565697091273822e-05, + "loss": 2.5364, + "theoretical_loss": 3.320535425590261, + "tokens_seen": 3002685440 + }, + { + "epoch": 10.01, + "learning_rate": 4.564694082246741e-05, + "loss": 2.6209, + "theoretical_loss": 3.320529854451114, + "tokens_seen": 3002750976 + }, + { + "epoch": 10.01, + "learning_rate": 4.5636910732196595e-05, + "loss": 2.5568, + "theoretical_loss": 3.320524283467603, + "tokens_seen": 3002816512 + }, + { + "epoch": 10.01, + "learning_rate": 4.5626880641925776e-05, + "loss": 2.6524, + "theoretical_loss": 3.3205187126397195, + "tokens_seen": 3002882048 + }, + { + "epoch": 10.01, + "learning_rate": 4.5616850551654964e-05, + "loss": 2.2142, + "theoretical_loss": 3.3205131419674556, + "tokens_seen": 3002947584 + }, + { + "epoch": 10.01, + "learning_rate": 4.560682046138415e-05, + "loss": 2.4827, + "theoretical_loss": 3.320507571450804, + "tokens_seen": 3003013120 + }, + { + "epoch": 10.01, + "learning_rate": 4.559679037111334e-05, + "loss": 2.463, + "theoretical_loss": 3.3205020010897575, + "tokens_seen": 3003078656 + }, + { + "epoch": 10.01, + "learning_rate": 4.558676028084253e-05, + "loss": 2.595, + "theoretical_loss": 3.3204964308843072, + "tokens_seen": 3003144192 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3344442, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5074257850646973, + "objective/train/theoretical_loss": 3.3204936458399286, + "objective/train/tokens_used": 3023636960, + "theoretical_loss": 3.3204936458399286, + "tokens_seen": 3003176960 + }, + { + "epoch": 10.01, + "learning_rate": 4.5576730190571716e-05, + "loss": 2.5981, + "theoretical_loss": 3.320490860834446, + "tokens_seen": 3003209728 + }, + { + "epoch": 10.01, + "learning_rate": 4.5566700100300904e-05, + "loss": 2.5182, + "theoretical_loss": 3.3204852909401663, + "tokens_seen": 3003275264 + }, + { + "epoch": 10.01, + "learning_rate": 4.555667001003009e-05, + "loss": 2.4685, + "theoretical_loss": 3.3204797212014596, + "tokens_seen": 3003340800 + }, + { + "epoch": 10.01, + "learning_rate": 4.554663991975928e-05, + "loss": 2.5343, + "theoretical_loss": 3.320474151618319, + "tokens_seen": 3003406336 + }, + { + "epoch": 10.01, + "learning_rate": 4.553660982948846e-05, + "loss": 2.5666, + "theoretical_loss": 3.3204685821907365, + "tokens_seen": 3003471872 + }, + { + "epoch": 10.01, + "learning_rate": 4.552657973921765e-05, + "loss": 2.5553, + "theoretical_loss": 3.3204630129187045, + "tokens_seen": 3003537408 + }, + { + "epoch": 10.01, + "learning_rate": 4.5516549648946836e-05, + "loss": 2.4432, + "theoretical_loss": 3.3204574438022143, + "tokens_seen": 3003602944 + }, + { + "epoch": 10.01, + "learning_rate": 4.550651955867603e-05, + "loss": 2.3943, + "theoretical_loss": 3.3204518748412593, + "tokens_seen": 3003668480 + }, + { + "epoch": 10.01, + "learning_rate": 4.549648946840522e-05, + "loss": 2.2664, + "theoretical_loss": 3.320446306035832, + "tokens_seen": 3003734016 + }, + { + "epoch": 10.01, + "learning_rate": 4.548645937813441e-05, + "loss": 2.3806, + "theoretical_loss": 3.320440737385923, + "tokens_seen": 3003799552 + }, + { + "epoch": 10.01, + "learning_rate": 4.5476429287863595e-05, + "loss": 2.3535, + "theoretical_loss": 3.3204351688915263, + "tokens_seen": 3003865088 + }, + { + "epoch": 10.01, + "learning_rate": 4.546639919759278e-05, + "loss": 2.4854, + "theoretical_loss": 3.3204296005526333, + "tokens_seen": 3003930624 + }, + { + "epoch": 10.01, + "learning_rate": 4.545636910732197e-05, + "loss": 2.4884, + "theoretical_loss": 3.3204240323692362, + "tokens_seen": 3003996160 + }, + { + "epoch": 10.01, + "learning_rate": 4.544633901705116e-05, + "loss": 2.4849, + "theoretical_loss": 3.3204184643413277, + "tokens_seen": 3004061696 + }, + { + "epoch": 10.01, + "learning_rate": 4.543630892678034e-05, + "loss": 2.3087, + "theoretical_loss": 3.3204128964688997, + "tokens_seen": 3004127232 + }, + { + "epoch": 10.01, + "learning_rate": 4.542627883650953e-05, + "loss": 2.5294, + "theoretical_loss": 3.320407328751945, + "tokens_seen": 3004192768 + }, + { + "epoch": 10.01, + "learning_rate": 4.5416248746238715e-05, + "loss": 2.357, + "theoretical_loss": 3.320401761190455, + "tokens_seen": 3004258304 + }, + { + "epoch": 10.01, + "learning_rate": 4.54062186559679e-05, + "loss": 2.331, + "theoretical_loss": 3.320396193784423, + "tokens_seen": 3004323840 + }, + { + "epoch": 10.01, + "learning_rate": 4.539618856569709e-05, + "loss": 2.5582, + "theoretical_loss": 3.3203906265338405, + "tokens_seen": 3004389376 + }, + { + "epoch": 10.01, + "learning_rate": 4.538615847542628e-05, + "loss": 2.2838, + "theoretical_loss": 3.3203850594387, + "tokens_seen": 3004454912 + }, + { + "epoch": 10.01, + "learning_rate": 4.537612838515547e-05, + "loss": 2.5055, + "theoretical_loss": 3.3203794924989936, + "tokens_seen": 3004520448 + }, + { + "epoch": 10.01, + "learning_rate": 4.5366098294884655e-05, + "loss": 2.3478, + "theoretical_loss": 3.320373925714714, + "tokens_seen": 3004585984 + }, + { + "epoch": 10.01, + "learning_rate": 4.535606820461384e-05, + "loss": 2.4703, + "theoretical_loss": 3.320368359085853, + "tokens_seen": 3004651520 + }, + { + "epoch": 10.01, + "learning_rate": 4.534603811434303e-05, + "loss": 2.158, + "theoretical_loss": 3.320362792612403, + "tokens_seen": 3004717056 + }, + { + "epoch": 10.01, + "learning_rate": 4.533600802407221e-05, + "loss": 2.3183, + "theoretical_loss": 3.320357226294357, + "tokens_seen": 3004782592 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3345613, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4166290760040283, + "objective/train/theoretical_loss": 3.3203544431936076, + "objective/train/tokens_used": 3025275360, + "theoretical_loss": 3.3203544431936076, + "tokens_seen": 3004815360 + }, + { + "epoch": 10.01, + "learning_rate": 4.5325977933801406e-05, + "loss": 2.3261, + "theoretical_loss": 3.3203516601317062, + "tokens_seen": 3004848128 + }, + { + "epoch": 10.01, + "learning_rate": 4.5315947843530594e-05, + "loss": 2.2686, + "theoretical_loss": 3.320346094124443, + "tokens_seen": 3004913664 + }, + { + "epoch": 10.01, + "learning_rate": 4.530591775325978e-05, + "loss": 2.3701, + "theoretical_loss": 3.3203405282725607, + "tokens_seen": 3004979200 + }, + { + "epoch": 10.01, + "learning_rate": 4.529588766298897e-05, + "loss": 2.4359, + "theoretical_loss": 3.32033496257605, + "tokens_seen": 3005044736 + }, + { + "epoch": 10.01, + "learning_rate": 4.528585757271816e-05, + "loss": 2.6259, + "theoretical_loss": 3.3203293970349046, + "tokens_seen": 3005110272 + }, + { + "epoch": 10.01, + "learning_rate": 4.5275827482447346e-05, + "loss": 2.356, + "theoretical_loss": 3.320323831649116, + "tokens_seen": 3005175808 + }, + { + "epoch": 10.01, + "learning_rate": 4.5265797392176534e-05, + "loss": 2.5645, + "theoretical_loss": 3.320318266418677, + "tokens_seen": 3005241344 + }, + { + "epoch": 10.01, + "learning_rate": 4.525576730190572e-05, + "loss": 2.582, + "theoretical_loss": 3.320312701343579, + "tokens_seen": 3005306880 + }, + { + "epoch": 10.01, + "learning_rate": 4.52457372116349e-05, + "loss": 2.3911, + "theoretical_loss": 3.320307136423815, + "tokens_seen": 3005372416 + }, + { + "epoch": 10.01, + "learning_rate": 4.523570712136409e-05, + "loss": 2.6672, + "theoretical_loss": 3.3203015716593773, + "tokens_seen": 3005437952 + }, + { + "epoch": 10.01, + "learning_rate": 4.522567703109328e-05, + "loss": 2.5841, + "theoretical_loss": 3.320296007050258, + "tokens_seen": 3005503488 + }, + { + "epoch": 10.01, + "learning_rate": 4.5215646940822466e-05, + "loss": 2.2084, + "theoretical_loss": 3.3202904425964492, + "tokens_seen": 3005569024 + }, + { + "epoch": 10.01, + "learning_rate": 4.5205616850551654e-05, + "loss": 2.514, + "theoretical_loss": 3.3202848782979433, + "tokens_seen": 3005634560 + }, + { + "epoch": 10.01, + "learning_rate": 4.519558676028084e-05, + "loss": 2.3794, + "theoretical_loss": 3.3202793141547327, + "tokens_seen": 3005700096 + }, + { + "epoch": 10.01, + "learning_rate": 4.518555667001003e-05, + "loss": 2.6195, + "theoretical_loss": 3.32027375016681, + "tokens_seen": 3005765632 + }, + { + "epoch": 10.01, + "learning_rate": 4.517552657973922e-05, + "loss": 2.4838, + "theoretical_loss": 3.3202681863341663, + "tokens_seen": 3005831168 + }, + { + "epoch": 10.01, + "learning_rate": 4.5165496489468406e-05, + "loss": 2.6258, + "theoretical_loss": 3.320262622656795, + "tokens_seen": 3005896704 + }, + { + "epoch": 10.01, + "learning_rate": 4.5155466399197594e-05, + "loss": 2.3747, + "theoretical_loss": 3.320257059134688, + "tokens_seen": 3005962240 + }, + { + "epoch": 10.01, + "learning_rate": 4.5145436308926775e-05, + "loss": 2.2743, + "theoretical_loss": 3.3202514957678377, + "tokens_seen": 3006027776 + }, + { + "epoch": 10.01, + "learning_rate": 4.513540621865597e-05, + "loss": 2.4881, + "theoretical_loss": 3.3202459325562366, + "tokens_seen": 3006093312 + }, + { + "epoch": 10.01, + "learning_rate": 4.512537612838516e-05, + "loss": 2.3945, + "theoretical_loss": 3.320240369499876, + "tokens_seen": 3006158848 + }, + { + "epoch": 10.01, + "learning_rate": 4.5115346038114345e-05, + "loss": 2.4994, + "theoretical_loss": 3.3202348065987497, + "tokens_seen": 3006224384 + }, + { + "epoch": 10.01, + "learning_rate": 4.510531594784353e-05, + "loss": 2.4737, + "theoretical_loss": 3.3202292438528485, + "tokens_seen": 3006289920 + }, + { + "epoch": 10.01, + "learning_rate": 4.509528585757272e-05, + "loss": 2.6566, + "theoretical_loss": 3.3202236812621657, + "tokens_seen": 3006355456 + }, + { + "epoch": 10.01, + "learning_rate": 4.508525576730191e-05, + "loss": 2.4772, + "theoretical_loss": 3.320218118826693, + "tokens_seen": 3006420992 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3346399, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.586618185043335, + "objective/train/theoretical_loss": 3.320215337667158, + "objective/train/tokens_used": 3026913760, + "theoretical_loss": 3.320215337667158, + "tokens_seen": 3006453760 + }, + { + "epoch": 10.01, + "learning_rate": 4.50752256770311e-05, + "loss": 2.4832, + "theoretical_loss": 3.320212556546423, + "tokens_seen": 3006486528 + }, + { + "epoch": 10.01, + "learning_rate": 4.5065195586760285e-05, + "loss": 2.402, + "theoretical_loss": 3.3202069944213477, + "tokens_seen": 3006552064 + }, + { + "epoch": 10.01, + "learning_rate": 4.505516549648947e-05, + "loss": 2.3641, + "theoretical_loss": 3.32020143245146, + "tokens_seen": 3006617600 + }, + { + "epoch": 10.01, + "learning_rate": 4.5045135406218654e-05, + "loss": 2.4091, + "theoretical_loss": 3.3201958706367516, + "tokens_seen": 3006683136 + }, + { + "epoch": 10.01, + "learning_rate": 4.503510531594784e-05, + "loss": 2.2925, + "theoretical_loss": 3.320190308977215, + "tokens_seen": 3006748672 + }, + { + "epoch": 10.01, + "learning_rate": 4.502507522567703e-05, + "loss": 2.4094, + "theoretical_loss": 3.3201847474728425, + "tokens_seen": 3006814208 + }, + { + "epoch": 10.01, + "learning_rate": 4.501504513540622e-05, + "loss": 2.4526, + "theoretical_loss": 3.3201791861236263, + "tokens_seen": 3006879744 + }, + { + "epoch": 10.01, + "learning_rate": 4.5005015045135405e-05, + "loss": 2.6782, + "theoretical_loss": 3.3201736249295584, + "tokens_seen": 3006945280 + }, + { + "epoch": 10.01, + "learning_rate": 4.4994984954864593e-05, + "loss": 2.3221, + "theoretical_loss": 3.3201680638906317, + "tokens_seen": 3007010816 + }, + { + "epoch": 10.01, + "learning_rate": 4.498495486459378e-05, + "loss": 2.4692, + "theoretical_loss": 3.3201625030068382, + "tokens_seen": 3007076352 + }, + { + "epoch": 10.01, + "learning_rate": 4.497492477432297e-05, + "loss": 2.4358, + "theoretical_loss": 3.32015694227817, + "tokens_seen": 3007141888 + }, + { + "epoch": 10.01, + "learning_rate": 4.496489468405216e-05, + "loss": 2.4533, + "theoretical_loss": 3.32015138170462, + "tokens_seen": 3007207424 + }, + { + "epoch": 10.01, + "learning_rate": 4.4954864593781345e-05, + "loss": 2.3101, + "theoretical_loss": 3.32014582128618, + "tokens_seen": 3007272960 + }, + { + "epoch": 10.01, + "learning_rate": 4.494483450351053e-05, + "loss": 2.594, + "theoretical_loss": 3.3201402610228423, + "tokens_seen": 3007338496 + }, + { + "epoch": 10.01, + "learning_rate": 4.493480441323972e-05, + "loss": 2.2573, + "theoretical_loss": 3.320134700914599, + "tokens_seen": 3007404032 + }, + { + "epoch": 10.01, + "learning_rate": 4.492477432296891e-05, + "loss": 2.409, + "theoretical_loss": 3.320129140961443, + "tokens_seen": 3007469568 + }, + { + "epoch": 10.01, + "learning_rate": 4.4914744232698097e-05, + "loss": 2.3691, + "theoretical_loss": 3.320123581163366, + "tokens_seen": 3007535104 + }, + { + "epoch": 10.01, + "learning_rate": 4.4904714142427284e-05, + "loss": 2.4983, + "theoretical_loss": 3.320118021520361, + "tokens_seen": 3007600640 + }, + { + "epoch": 10.01, + "learning_rate": 4.489468405215647e-05, + "loss": 2.391, + "theoretical_loss": 3.3201124620324194, + "tokens_seen": 3007666176 + }, + { + "epoch": 10.01, + "learning_rate": 4.488465396188566e-05, + "loss": 2.3275, + "theoretical_loss": 3.3201069026995342, + "tokens_seen": 3007731712 + }, + { + "epoch": 10.01, + "learning_rate": 4.487462387161485e-05, + "loss": 2.3249, + "theoretical_loss": 3.3201013435216975, + "tokens_seen": 3007797248 + }, + { + "epoch": 10.01, + "learning_rate": 4.4864593781344036e-05, + "loss": 2.4391, + "theoretical_loss": 3.3200957844989016, + "tokens_seen": 3007862784 + }, + { + "epoch": 10.01, + "learning_rate": 4.485456369107322e-05, + "loss": 2.4346, + "theoretical_loss": 3.3200902256311386, + "tokens_seen": 3007928320 + }, + { + "epoch": 10.01, + "learning_rate": 4.4844533600802405e-05, + "loss": 2.3098, + "theoretical_loss": 3.320084666918401, + "tokens_seen": 3007993856 + }, + { + "epoch": 10.01, + "learning_rate": 4.483450351053159e-05, + "loss": 2.4713, + "theoretical_loss": 3.320079108360681, + "tokens_seen": 3008059392 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3347884, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4817585945129395, + "objective/train/theoretical_loss": 3.32007632913995, + "objective/train/tokens_used": 3028552160, + "theoretical_loss": 3.32007632913995, + "tokens_seen": 3008092160 + }, + { + "epoch": 10.01, + "learning_rate": 4.482447342026078e-05, + "loss": 2.4473, + "theoretical_loss": 3.3200735499579705, + "tokens_seen": 3008124928 + }, + { + "epoch": 10.01, + "learning_rate": 4.481444332998997e-05, + "loss": 2.4321, + "theoretical_loss": 3.320067991710263, + "tokens_seen": 3008190464 + }, + { + "epoch": 10.01, + "learning_rate": 4.480441323971916e-05, + "loss": 2.5332, + "theoretical_loss": 3.32006243361755, + "tokens_seen": 3008256000 + }, + { + "epoch": 10.01, + "learning_rate": 4.4794383149448345e-05, + "loss": 2.5849, + "theoretical_loss": 3.320056875679823, + "tokens_seen": 3008321536 + }, + { + "epoch": 10.01, + "learning_rate": 4.478435305917753e-05, + "loss": 2.4871, + "theoretical_loss": 3.320051317897076, + "tokens_seen": 3008387072 + }, + { + "epoch": 10.01, + "learning_rate": 4.477432296890672e-05, + "loss": 2.2781, + "theoretical_loss": 3.3200457602693003, + "tokens_seen": 3008452608 + }, + { + "epoch": 10.01, + "learning_rate": 4.476429287863591e-05, + "loss": 2.5439, + "theoretical_loss": 3.320040202796488, + "tokens_seen": 3008518144 + }, + { + "epoch": 10.01, + "learning_rate": 4.4754262788365096e-05, + "loss": 2.5053, + "theoretical_loss": 3.320034645478632, + "tokens_seen": 3008583680 + }, + { + "epoch": 10.01, + "learning_rate": 4.4744232698094284e-05, + "loss": 2.3847, + "theoretical_loss": 3.3200290883157244, + "tokens_seen": 3008649216 + }, + { + "epoch": 10.01, + "learning_rate": 4.473420260782347e-05, + "loss": 2.2945, + "theoretical_loss": 3.3200235313077577, + "tokens_seen": 3008714752 + }, + { + "epoch": 10.01, + "learning_rate": 4.472417251755266e-05, + "loss": 2.2359, + "theoretical_loss": 3.3200179744547236, + "tokens_seen": 3008780288 + }, + { + "epoch": 10.01, + "learning_rate": 4.471414242728185e-05, + "loss": 2.3717, + "theoretical_loss": 3.320012417756615, + "tokens_seen": 3008845824 + }, + { + "epoch": 10.01, + "learning_rate": 4.4704112337011036e-05, + "loss": 2.492, + "theoretical_loss": 3.3200068612134235, + "tokens_seen": 3008911360 + }, + { + "epoch": 10.01, + "learning_rate": 4.4694082246740224e-05, + "loss": 2.3922, + "theoretical_loss": 3.3200013048251424, + "tokens_seen": 3008976896 + }, + { + "epoch": 10.01, + "learning_rate": 4.468405215646941e-05, + "loss": 2.4235, + "theoretical_loss": 3.3199957485917633, + "tokens_seen": 3009042432 + }, + { + "epoch": 10.01, + "learning_rate": 4.46740220661986e-05, + "loss": 2.6253, + "theoretical_loss": 3.319990192513279, + "tokens_seen": 3009107968 + }, + { + "epoch": 10.01, + "learning_rate": 4.466399197592779e-05, + "loss": 2.3953, + "theoretical_loss": 3.319984636589681, + "tokens_seen": 3009173504 + }, + { + "epoch": 10.01, + "learning_rate": 4.465396188565697e-05, + "loss": 2.2887, + "theoretical_loss": 3.3199790808209624, + "tokens_seen": 3009239040 + }, + { + "epoch": 10.01, + "learning_rate": 4.4643931795386156e-05, + "loss": 2.4419, + "theoretical_loss": 3.319973525207115, + "tokens_seen": 3009304576 + }, + { + "epoch": 10.01, + "learning_rate": 4.4633901705115344e-05, + "loss": 2.2972, + "theoretical_loss": 3.3199679697481317, + "tokens_seen": 3009370112 + }, + { + "epoch": 10.01, + "learning_rate": 4.462387161484453e-05, + "loss": 2.3117, + "theoretical_loss": 3.3199624144440043, + "tokens_seen": 3009435648 + }, + { + "epoch": 10.01, + "learning_rate": 4.461384152457372e-05, + "loss": 2.6453, + "theoretical_loss": 3.319956859294725, + "tokens_seen": 3009501184 + }, + { + "epoch": 10.01, + "learning_rate": 4.460381143430291e-05, + "loss": 2.2663, + "theoretical_loss": 3.319951304300287, + "tokens_seen": 3009566720 + }, + { + "epoch": 10.01, + "learning_rate": 4.4593781344032096e-05, + "loss": 2.4, + "theoretical_loss": 3.319945749460681, + "tokens_seen": 3009632256 + }, + { + "epoch": 10.01, + "learning_rate": 4.4583751253761284e-05, + "loss": 2.4734, + "theoretical_loss": 3.3199401947759015, + "tokens_seen": 3009697792 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3348705, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0612847805023193, + "objective/train/theoretical_loss": 3.3199374174915683, + "objective/train/tokens_used": 3030190560, + "theoretical_loss": 3.3199374174915683, + "tokens_seen": 3009730560 + }, + { + "epoch": 10.01, + "learning_rate": 4.457372116349048e-05, + "loss": 2.2857, + "theoretical_loss": 3.319934640245939, + "tokens_seen": 3009763328 + }, + { + "epoch": 10.01, + "learning_rate": 4.4563691073219666e-05, + "loss": 2.3589, + "theoretical_loss": 3.319929085870786, + "tokens_seen": 3009828864 + }, + { + "epoch": 10.01, + "learning_rate": 4.455366098294885e-05, + "loss": 2.4089, + "theoretical_loss": 3.3199235316504363, + "tokens_seen": 3009894400 + }, + { + "epoch": 10.01, + "learning_rate": 4.4543630892678035e-05, + "loss": 2.4984, + "theoretical_loss": 3.3199179775848804, + "tokens_seen": 3009959936 + }, + { + "epoch": 10.01, + "learning_rate": 4.453360080240722e-05, + "loss": 2.375, + "theoretical_loss": 3.3199124236741113, + "tokens_seen": 3010025472 + }, + { + "epoch": 10.01, + "learning_rate": 4.452357071213641e-05, + "loss": 2.3144, + "theoretical_loss": 3.319906869918122, + "tokens_seen": 3010091008 + }, + { + "epoch": 10.01, + "learning_rate": 4.45135406218656e-05, + "loss": 2.4061, + "theoretical_loss": 3.319901316316904, + "tokens_seen": 3010156544 + }, + { + "epoch": 10.01, + "learning_rate": 4.450351053159479e-05, + "loss": 2.4149, + "theoretical_loss": 3.3198957628704493, + "tokens_seen": 3010222080 + }, + { + "epoch": 10.01, + "learning_rate": 4.4493480441323975e-05, + "loss": 2.2548, + "theoretical_loss": 3.3198902095787513, + "tokens_seen": 3010287616 + }, + { + "epoch": 10.01, + "learning_rate": 4.448345035105316e-05, + "loss": 2.472, + "theoretical_loss": 3.3198846564418014, + "tokens_seen": 3010353152 + }, + { + "epoch": 10.01, + "learning_rate": 4.447342026078235e-05, + "loss": 2.3549, + "theoretical_loss": 3.3198791034595927, + "tokens_seen": 3010418688 + }, + { + "epoch": 10.01, + "learning_rate": 4.446339017051153e-05, + "loss": 2.5528, + "theoretical_loss": 3.319873550632117, + "tokens_seen": 3010484224 + }, + { + "epoch": 10.01, + "learning_rate": 4.445336008024072e-05, + "loss": 2.5231, + "theoretical_loss": 3.3198679979593666, + "tokens_seen": 3010549760 + }, + { + "epoch": 10.01, + "learning_rate": 4.444332998996991e-05, + "loss": 2.492, + "theoretical_loss": 3.319862445441334, + "tokens_seen": 3010615296 + }, + { + "epoch": 10.01, + "learning_rate": 4.4433299899699095e-05, + "loss": 2.3041, + "theoretical_loss": 3.3198568930780112, + "tokens_seen": 3010680832 + }, + { + "epoch": 10.01, + "learning_rate": 4.442326980942828e-05, + "loss": 2.4811, + "theoretical_loss": 3.3198513408693913, + "tokens_seen": 3010746368 + }, + { + "epoch": 10.01, + "learning_rate": 4.441323971915747e-05, + "loss": 2.3842, + "theoretical_loss": 3.3198457888154653, + "tokens_seen": 3010811904 + }, + { + "epoch": 10.01, + "learning_rate": 4.440320962888666e-05, + "loss": 2.6494, + "theoretical_loss": 3.319840236916227, + "tokens_seen": 3010877440 + }, + { + "epoch": 10.01, + "learning_rate": 4.439317953861585e-05, + "loss": 2.5128, + "theoretical_loss": 3.319834685171668, + "tokens_seen": 3010942976 + }, + { + "epoch": 10.01, + "learning_rate": 4.438314944834504e-05, + "loss": 2.5174, + "theoretical_loss": 3.3198291335817807, + "tokens_seen": 3011008512 + }, + { + "epoch": 10.01, + "learning_rate": 4.437311935807423e-05, + "loss": 2.6203, + "theoretical_loss": 3.319823582146557, + "tokens_seen": 3011074048 + }, + { + "epoch": 10.01, + "learning_rate": 4.436308926780341e-05, + "loss": 2.3924, + "theoretical_loss": 3.31981803086599, + "tokens_seen": 3011139584 + }, + { + "epoch": 10.01, + "learning_rate": 4.43530591775326e-05, + "loss": 2.4272, + "theoretical_loss": 3.3198124797400714, + "tokens_seen": 3011205120 + }, + { + "epoch": 10.01, + "learning_rate": 4.4343029087261786e-05, + "loss": 2.4389, + "theoretical_loss": 3.319806928768794, + "tokens_seen": 3011270656 + }, + { + "epoch": 10.01, + "learning_rate": 4.4332998996990974e-05, + "loss": 2.413, + "theoretical_loss": 3.31980137795215, + "tokens_seen": 3011336192 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3350207, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5655815601348877, + "objective/train/theoretical_loss": 3.319798602601813, + "objective/train/tokens_used": 3031828960, + "theoretical_loss": 3.319798602601813, + "tokens_seen": 3011368960 + }, + { + "epoch": 10.01, + "learning_rate": 4.432296890672016e-05, + "loss": 2.3703, + "theoretical_loss": 3.3197958272901316, + "tokens_seen": 3011401728 + }, + { + "epoch": 10.01, + "learning_rate": 4.431293881644935e-05, + "loss": 2.3409, + "theoretical_loss": 3.3197902767827308, + "tokens_seen": 3011467264 + }, + { + "epoch": 10.01, + "learning_rate": 4.430290872617854e-05, + "loss": 2.4186, + "theoretical_loss": 3.3197847264299405, + "tokens_seen": 3011532800 + }, + { + "epoch": 10.01, + "learning_rate": 4.4292878635907726e-05, + "loss": 2.3836, + "theoretical_loss": 3.319779176231753, + "tokens_seen": 3011598336 + }, + { + "epoch": 10.01, + "learning_rate": 4.4282848545636914e-05, + "loss": 2.5129, + "theoretical_loss": 3.31977362618816, + "tokens_seen": 3011663872 + }, + { + "epoch": 10.01, + "learning_rate": 4.42728184553661e-05, + "loss": 2.54, + "theoretical_loss": 3.319768076299155, + "tokens_seen": 3011729408 + }, + { + "epoch": 10.01, + "learning_rate": 4.426278836509528e-05, + "loss": 2.4922, + "theoretical_loss": 3.3197625265647286, + "tokens_seen": 3011794944 + }, + { + "epoch": 10.01, + "learning_rate": 4.425275827482447e-05, + "loss": 2.3068, + "theoretical_loss": 3.319756976984875, + "tokens_seen": 3011860480 + }, + { + "epoch": 10.01, + "learning_rate": 4.424272818455366e-05, + "loss": 2.2963, + "theoretical_loss": 3.319751427559585, + "tokens_seen": 3011926016 + }, + { + "epoch": 10.01, + "learning_rate": 4.4232698094282846e-05, + "loss": 2.2628, + "theoretical_loss": 3.3197458782888525, + "tokens_seen": 3011991552 + }, + { + "epoch": 10.01, + "learning_rate": 4.4222668004012034e-05, + "loss": 2.5539, + "theoretical_loss": 3.319740329172668, + "tokens_seen": 3012057088 + }, + { + "epoch": 10.01, + "learning_rate": 4.421263791374122e-05, + "loss": 2.464, + "theoretical_loss": 3.319734780211025, + "tokens_seen": 3012122624 + }, + { + "epoch": 10.01, + "learning_rate": 4.420260782347041e-05, + "loss": 2.3365, + "theoretical_loss": 3.319729231403916, + "tokens_seen": 3012188160 + }, + { + "epoch": 10.01, + "learning_rate": 4.4192577733199605e-05, + "loss": 2.5654, + "theoretical_loss": 3.3197236827513326, + "tokens_seen": 3012253696 + }, + { + "epoch": 10.01, + "learning_rate": 4.418254764292879e-05, + "loss": 2.4027, + "theoretical_loss": 3.319718134253268, + "tokens_seen": 3012319232 + }, + { + "epoch": 10.01, + "learning_rate": 4.417251755265798e-05, + "loss": 2.2035, + "theoretical_loss": 3.3197125859097136, + "tokens_seen": 3012384768 + }, + { + "epoch": 10.01, + "learning_rate": 4.416248746238716e-05, + "loss": 2.5756, + "theoretical_loss": 3.319707037720662, + "tokens_seen": 3012450304 + }, + { + "epoch": 10.01, + "learning_rate": 4.415245737211635e-05, + "loss": 2.49, + "theoretical_loss": 3.3197014896861057, + "tokens_seen": 3012515840 + }, + { + "epoch": 10.01, + "learning_rate": 4.414242728184554e-05, + "loss": 2.4629, + "theoretical_loss": 3.319695941806037, + "tokens_seen": 3012581376 + }, + { + "epoch": 10.01, + "learning_rate": 4.4132397191574725e-05, + "loss": 2.4075, + "theoretical_loss": 3.3196903940804487, + "tokens_seen": 3012646912 + }, + { + "epoch": 10.01, + "learning_rate": 4.412236710130391e-05, + "loss": 2.5093, + "theoretical_loss": 3.3196848465093325, + "tokens_seen": 3012712448 + }, + { + "epoch": 10.01, + "learning_rate": 4.41123370110331e-05, + "loss": 2.2983, + "theoretical_loss": 3.3196792990926802, + "tokens_seen": 3012777984 + }, + { + "epoch": 10.01, + "learning_rate": 4.410230692076229e-05, + "loss": 2.378, + "theoretical_loss": 3.3196737518304857, + "tokens_seen": 3012843520 + }, + { + "epoch": 10.01, + "learning_rate": 4.409227683049148e-05, + "loss": 2.5713, + "theoretical_loss": 3.31966820472274, + "tokens_seen": 3012909056 + }, + { + "epoch": 10.01, + "learning_rate": 4.4082246740220665e-05, + "loss": 2.5669, + "theoretical_loss": 3.319662657769437, + "tokens_seen": 3012974592 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3350902, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4056711196899414, + "objective/train/theoretical_loss": 3.319659884350698, + "objective/train/tokens_used": 3033467360, + "theoretical_loss": 3.319659884350698, + "tokens_seen": 3013007360 + }, + { + "epoch": 10.01, + "learning_rate": 4.4072216649949846e-05, + "loss": 2.7399, + "theoretical_loss": 3.319657110970567, + "tokens_seen": 3013040128 + }, + { + "epoch": 10.01, + "learning_rate": 4.4062186559679034e-05, + "loss": 2.5694, + "theoretical_loss": 3.3196515643261235, + "tokens_seen": 3013105664 + }, + { + "epoch": 10.01, + "learning_rate": 4.405215646940822e-05, + "loss": 2.2931, + "theoretical_loss": 3.3196460178360985, + "tokens_seen": 3013171200 + }, + { + "epoch": 10.01, + "learning_rate": 4.404212637913741e-05, + "loss": 2.4267, + "theoretical_loss": 3.319640471500485, + "tokens_seen": 3013236736 + }, + { + "epoch": 10.01, + "learning_rate": 4.40320962888666e-05, + "loss": 2.6285, + "theoretical_loss": 3.3196349253192743, + "tokens_seen": 3013302272 + }, + { + "epoch": 10.01, + "learning_rate": 4.4022066198595785e-05, + "loss": 2.4226, + "theoretical_loss": 3.3196293792924596, + "tokens_seen": 3013367808 + }, + { + "epoch": 10.01, + "learning_rate": 4.401203610832498e-05, + "loss": 2.3757, + "theoretical_loss": 3.319623833420033, + "tokens_seen": 3013433344 + }, + { + "epoch": 10.01, + "learning_rate": 4.400200601805417e-05, + "loss": 2.5108, + "theoretical_loss": 3.319618287701987, + "tokens_seen": 3013498880 + }, + { + "epoch": 10.01, + "learning_rate": 4.3991975927783356e-05, + "loss": 2.2973, + "theoretical_loss": 3.3196127421383137, + "tokens_seen": 3013564416 + }, + { + "epoch": 10.01, + "learning_rate": 4.3981945837512544e-05, + "loss": 2.2584, + "theoretical_loss": 3.319607196729005, + "tokens_seen": 3013629952 + }, + { + "epoch": 10.01, + "learning_rate": 4.3971915747241725e-05, + "loss": 2.4381, + "theoretical_loss": 3.319601651474054, + "tokens_seen": 3013695488 + }, + { + "epoch": 10.01, + "learning_rate": 4.396188565697091e-05, + "loss": 2.6016, + "theoretical_loss": 3.319596106373453, + "tokens_seen": 3013761024 + }, + { + "epoch": 10.01, + "learning_rate": 4.39518555667001e-05, + "loss": 2.3339, + "theoretical_loss": 3.3195905614271934, + "tokens_seen": 3013826560 + }, + { + "epoch": 10.01, + "learning_rate": 4.394182547642929e-05, + "loss": 2.4769, + "theoretical_loss": 3.319585016635269, + "tokens_seen": 3013892096 + }, + { + "epoch": 10.01, + "learning_rate": 4.3931795386158477e-05, + "loss": 2.452, + "theoretical_loss": 3.3195794719976712, + "tokens_seen": 3013957632 + }, + { + "epoch": 10.01, + "learning_rate": 4.3921765295887664e-05, + "loss": 2.3437, + "theoretical_loss": 3.3195739275143925, + "tokens_seen": 3014023168 + }, + { + "epoch": 10.01, + "learning_rate": 4.391173520561685e-05, + "loss": 2.4661, + "theoretical_loss": 3.3195683831854255, + "tokens_seen": 3014088704 + }, + { + "epoch": 10.01, + "learning_rate": 4.390170511534604e-05, + "loss": 2.6747, + "theoretical_loss": 3.319562839010762, + "tokens_seen": 3014154240 + }, + { + "epoch": 10.01, + "learning_rate": 4.389167502507523e-05, + "loss": 2.6964, + "theoretical_loss": 3.3195572949903953, + "tokens_seen": 3014219776 + }, + { + "epoch": 10.01, + "learning_rate": 4.3881644934804416e-05, + "loss": 2.5108, + "theoretical_loss": 3.3195517511243167, + "tokens_seen": 3014285312 + }, + { + "epoch": 10.01, + "learning_rate": 4.38716148445336e-05, + "loss": 2.3399, + "theoretical_loss": 3.3195462074125195, + "tokens_seen": 3014350848 + }, + { + "epoch": 10.01, + "learning_rate": 4.3861584754262785e-05, + "loss": 2.305, + "theoretical_loss": 3.319540663854995, + "tokens_seen": 3014416384 + }, + { + "epoch": 10.01, + "learning_rate": 4.385155466399197e-05, + "loss": 2.0813, + "theoretical_loss": 3.3195351204517363, + "tokens_seen": 3014481920 + }, + { + "epoch": 10.01, + "learning_rate": 4.384152457372116e-05, + "loss": 2.3309, + "theoretical_loss": 3.3195295772027356, + "tokens_seen": 3014547456 + }, + { + "epoch": 10.01, + "learning_rate": 4.383149448345035e-05, + "loss": 2.4172, + "theoretical_loss": 3.3195240341079852, + "tokens_seen": 3014612992 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3351496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2588233947753906, + "objective/train/theoretical_loss": 3.319521262618452, + "objective/train/tokens_used": 3035105760, + "theoretical_loss": 3.319521262618452, + "tokens_seen": 3014645760 + }, + { + "epoch": 10.01, + "learning_rate": 4.3821464393179543e-05, + "loss": 2.3174, + "theoretical_loss": 3.3195184911674778, + "tokens_seen": 3014678528 + }, + { + "epoch": 10.01, + "learning_rate": 4.381143430290873e-05, + "loss": 2.3934, + "theoretical_loss": 3.319512948381205, + "tokens_seen": 3014744064 + }, + { + "epoch": 10.01, + "learning_rate": 4.380140421263792e-05, + "loss": 2.4523, + "theoretical_loss": 3.3195074057491603, + "tokens_seen": 3014809600 + }, + { + "epoch": 10.01, + "learning_rate": 4.379137412236711e-05, + "loss": 2.247, + "theoretical_loss": 3.3195018632713347, + "tokens_seen": 3014875136 + }, + { + "epoch": 10.01, + "learning_rate": 4.3781344032096295e-05, + "loss": 2.2077, + "theoretical_loss": 3.3194963209477213, + "tokens_seen": 3014940672 + }, + { + "epoch": 10.01, + "learning_rate": 4.3771313941825476e-05, + "loss": 2.4406, + "theoretical_loss": 3.3194907787783126, + "tokens_seen": 3015006208 + }, + { + "epoch": 10.01, + "learning_rate": 4.3761283851554664e-05, + "loss": 2.532, + "theoretical_loss": 3.319485236763101, + "tokens_seen": 3015071744 + }, + { + "epoch": 10.01, + "learning_rate": 4.375125376128385e-05, + "loss": 2.4141, + "theoretical_loss": 3.319479694902078, + "tokens_seen": 3015137280 + }, + { + "epoch": 10.01, + "learning_rate": 4.374122367101304e-05, + "loss": 2.4469, + "theoretical_loss": 3.3194741531952365, + "tokens_seen": 3015202816 + }, + { + "epoch": 10.01, + "learning_rate": 4.373119358074223e-05, + "loss": 2.3861, + "theoretical_loss": 3.319468611642569, + "tokens_seen": 3015268352 + }, + { + "epoch": 10.01, + "learning_rate": 4.3721163490471416e-05, + "loss": 2.2659, + "theoretical_loss": 3.319463070244068, + "tokens_seen": 3015333888 + }, + { + "epoch": 10.01, + "learning_rate": 4.3711133400200604e-05, + "loss": 2.4645, + "theoretical_loss": 3.3194575289997257, + "tokens_seen": 3015399424 + }, + { + "epoch": 10.01, + "learning_rate": 4.370110330992979e-05, + "loss": 2.2441, + "theoretical_loss": 3.319451987909534, + "tokens_seen": 3015464960 + }, + { + "epoch": 10.01, + "learning_rate": 4.369107321965898e-05, + "loss": 2.2332, + "theoretical_loss": 3.319446446973486, + "tokens_seen": 3015530496 + }, + { + "epoch": 10.01, + "learning_rate": 4.368104312938816e-05, + "loss": 2.4623, + "theoretical_loss": 3.319440906191574, + "tokens_seen": 3015596032 + }, + { + "epoch": 10.01, + "learning_rate": 4.367101303911735e-05, + "loss": 2.3332, + "theoretical_loss": 3.3194353655637894, + "tokens_seen": 3015661568 + }, + { + "epoch": 10.01, + "learning_rate": 4.3660982948846536e-05, + "loss": 2.519, + "theoretical_loss": 3.3194298250901255, + "tokens_seen": 3015727104 + }, + { + "epoch": 10.01, + "learning_rate": 4.3650952858575724e-05, + "loss": 2.3013, + "theoretical_loss": 3.3194242847705744, + "tokens_seen": 3015792640 + }, + { + "epoch": 10.01, + "learning_rate": 4.364092276830491e-05, + "loss": 2.4477, + "theoretical_loss": 3.3194187446051284, + "tokens_seen": 3015858176 + }, + { + "epoch": 10.01, + "learning_rate": 4.363089267803411e-05, + "loss": 2.5312, + "theoretical_loss": 3.31941320459378, + "tokens_seen": 3015923712 + }, + { + "epoch": 10.01, + "learning_rate": 4.3620862587763295e-05, + "loss": 2.4692, + "theoretical_loss": 3.319407664736522, + "tokens_seen": 3015989248 + }, + { + "epoch": 10.01, + "learning_rate": 4.361083249749248e-05, + "loss": 2.5292, + "theoretical_loss": 3.3194021250333456, + "tokens_seen": 3016054784 + }, + { + "epoch": 10.01, + "learning_rate": 4.360080240722167e-05, + "loss": 2.4023, + "theoretical_loss": 3.319396585484244, + "tokens_seen": 3016120320 + }, + { + "epoch": 10.01, + "learning_rate": 4.359077231695086e-05, + "loss": 2.4031, + "theoretical_loss": 3.3193910460892093, + "tokens_seen": 3016185856 + }, + { + "epoch": 10.01, + "learning_rate": 4.358074222668004e-05, + "loss": 2.1493, + "theoretical_loss": 3.319385506848234, + "tokens_seen": 3016251392 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 3352891, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0213687419891357, + "objective/train/theoretical_loss": 3.3193827372855162, + "objective/train/tokens_used": 3036744160, + "theoretical_loss": 3.3193827372855162, + "tokens_seen": 3016284160 + }, + { + "epoch": 10.01, + "learning_rate": 4.357071213640923e-05, + "loss": 2.2746, + "theoretical_loss": 3.3193799677613107, + "tokens_seen": 3016316928 + }, + { + "epoch": 10.01, + "learning_rate": 4.3560682046138415e-05, + "loss": 2.3548, + "theoretical_loss": 3.3193744288284313, + "tokens_seen": 3016382464 + }, + { + "epoch": 10.01, + "learning_rate": 4.35506519558676e-05, + "loss": 2.5465, + "theoretical_loss": 3.319368890049588, + "tokens_seen": 3016448000 + }, + { + "epoch": 10.01, + "learning_rate": 4.354062186559679e-05, + "loss": 2.3341, + "theoretical_loss": 3.3193633514247742, + "tokens_seen": 3016513536 + }, + { + "epoch": 10.01, + "learning_rate": 4.353059177532598e-05, + "loss": 2.3624, + "theoretical_loss": 3.319357812953981, + "tokens_seen": 3016579072 + }, + { + "epoch": 10.01, + "learning_rate": 4.352056168505517e-05, + "loss": 2.3946, + "theoretical_loss": 3.3193522746372017, + "tokens_seen": 3016644608 + }, + { + "epoch": 10.01, + "learning_rate": 4.3510531594784355e-05, + "loss": 2.366, + "theoretical_loss": 3.3193467364744285, + "tokens_seen": 3016710144 + }, + { + "epoch": 10.01, + "learning_rate": 4.350050150451354e-05, + "loss": 2.4123, + "theoretical_loss": 3.3193411984656533, + "tokens_seen": 3016775680 + }, + { + "epoch": 10.01, + "learning_rate": 4.349047141424273e-05, + "loss": 2.4338, + "theoretical_loss": 3.3193356606108693, + "tokens_seen": 3016841216 + }, + { + "epoch": 10.01, + "learning_rate": 4.348044132397191e-05, + "loss": 2.4094, + "theoretical_loss": 3.319330122910068, + "tokens_seen": 3016906752 + }, + { + "epoch": 10.01, + "learning_rate": 4.34704112337011e-05, + "loss": 2.1735, + "theoretical_loss": 3.3193245853632423, + "tokens_seen": 3016972288 + }, + { + "epoch": 10.01, + "learning_rate": 4.346038114343029e-05, + "loss": 2.518, + "theoretical_loss": 3.3193190479703842, + "tokens_seen": 3017037824 + }, + { + "epoch": 10.01, + "learning_rate": 4.3450351053159475e-05, + "loss": 2.5697, + "theoretical_loss": 3.319313510731486, + "tokens_seen": 3017103360 + }, + { + "epoch": 10.02, + "learning_rate": 4.344032096288867e-05, + "loss": 2.4075, + "theoretical_loss": 3.319307973646541, + "tokens_seen": 3017168896 + }, + { + "epoch": 10.02, + "learning_rate": 4.343029087261786e-05, + "loss": 2.4417, + "theoretical_loss": 3.3193024367155406, + "tokens_seen": 3017234432 + }, + { + "epoch": 10.02, + "learning_rate": 4.3420260782347046e-05, + "loss": 2.3222, + "theoretical_loss": 3.3192968999384775, + "tokens_seen": 3017299968 + }, + { + "epoch": 10.02, + "learning_rate": 4.3410230692076234e-05, + "loss": 2.3439, + "theoretical_loss": 3.3192913633153442, + "tokens_seen": 3017365504 + }, + { + "epoch": 10.02, + "learning_rate": 4.340020060180542e-05, + "loss": 2.2698, + "theoretical_loss": 3.319285826846133, + "tokens_seen": 3017431040 + }, + { + "epoch": 10.02, + "learning_rate": 4.33901705115346e-05, + "loss": 2.6125, + "theoretical_loss": 3.3192802905308363, + "tokens_seen": 3017496576 + }, + { + "epoch": 10.02, + "learning_rate": 4.338014042126379e-05, + "loss": 2.6083, + "theoretical_loss": 3.3192747543694465, + "tokens_seen": 3017562112 + }, + { + "epoch": 10.02, + "learning_rate": 4.337011033099298e-05, + "loss": 2.3944, + "theoretical_loss": 3.319269218361956, + "tokens_seen": 3017627648 + }, + { + "epoch": 10.02, + "learning_rate": 4.3360080240722166e-05, + "loss": 2.2879, + "theoretical_loss": 3.319263682508357, + "tokens_seen": 3017693184 + }, + { + "epoch": 10.02, + "learning_rate": 4.3350050150451354e-05, + "loss": 2.3698, + "theoretical_loss": 3.3192581468086413, + "tokens_seen": 3017758720 + }, + { + "epoch": 10.02, + "learning_rate": 4.334002006018054e-05, + "loss": 2.4949, + "theoretical_loss": 3.3192526112628027, + "tokens_seen": 3017824256 + }, + { + "epoch": 10.02, + "learning_rate": 4.332998996990973e-05, + "loss": 2.5034, + "theoretical_loss": 3.3192470758708326, + "tokens_seen": 3017889792 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3353739, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1773462295532227, + "objective/train/theoretical_loss": 3.3192443082325456, + "objective/train/tokens_used": 3038382560, + "theoretical_loss": 3.3192443082325456, + "tokens_seen": 3017922560 + }, + { + "epoch": 10.02, + "learning_rate": 4.331995987963892e-05, + "loss": 2.4052, + "theoretical_loss": 3.3192415406327234, + "tokens_seen": 3017955328 + }, + { + "epoch": 10.02, + "learning_rate": 4.3309929789368106e-05, + "loss": 2.342, + "theoretical_loss": 3.319236005548468, + "tokens_seen": 3018020864 + }, + { + "epoch": 10.02, + "learning_rate": 4.3299899699097294e-05, + "loss": 2.3701, + "theoretical_loss": 3.3192304706180584, + "tokens_seen": 3018086400 + }, + { + "epoch": 10.02, + "learning_rate": 4.3289869608826475e-05, + "loss": 2.3036, + "theoretical_loss": 3.319224935841487, + "tokens_seen": 3018151936 + }, + { + "epoch": 10.02, + "learning_rate": 4.327983951855566e-05, + "loss": 2.3901, + "theoretical_loss": 3.3192194012187466, + "tokens_seen": 3018217472 + }, + { + "epoch": 10.02, + "learning_rate": 4.326980942828485e-05, + "loss": 2.6407, + "theoretical_loss": 3.3192138667498288, + "tokens_seen": 3018283008 + }, + { + "epoch": 10.02, + "learning_rate": 4.3259779338014045e-05, + "loss": 2.3788, + "theoretical_loss": 3.3192083324347266, + "tokens_seen": 3018348544 + }, + { + "epoch": 10.02, + "learning_rate": 4.324974924774323e-05, + "loss": 2.1826, + "theoretical_loss": 3.319202798273432, + "tokens_seen": 3018414080 + }, + { + "epoch": 10.02, + "learning_rate": 4.323971915747242e-05, + "loss": 2.2153, + "theoretical_loss": 3.319197264265938, + "tokens_seen": 3018479616 + }, + { + "epoch": 10.02, + "learning_rate": 4.322968906720161e-05, + "loss": 2.665, + "theoretical_loss": 3.319191730412236, + "tokens_seen": 3018545152 + }, + { + "epoch": 10.02, + "learning_rate": 4.32196589769308e-05, + "loss": 2.4382, + "theoretical_loss": 3.3191861967123195, + "tokens_seen": 3018610688 + }, + { + "epoch": 10.02, + "learning_rate": 4.3209628886659985e-05, + "loss": 2.3695, + "theoretical_loss": 3.31918066316618, + "tokens_seen": 3018676224 + }, + { + "epoch": 10.02, + "learning_rate": 4.319959879638917e-05, + "loss": 2.3868, + "theoretical_loss": 3.3191751297738104, + "tokens_seen": 3018741760 + }, + { + "epoch": 10.02, + "learning_rate": 4.3189568706118354e-05, + "loss": 2.425, + "theoretical_loss": 3.3191695965352026, + "tokens_seen": 3018807296 + }, + { + "epoch": 10.02, + "learning_rate": 4.317953861584754e-05, + "loss": 2.2554, + "theoretical_loss": 3.3191640634503496, + "tokens_seen": 3018872832 + }, + { + "epoch": 10.02, + "learning_rate": 4.316950852557673e-05, + "loss": 2.5973, + "theoretical_loss": 3.3191585305192435, + "tokens_seen": 3018938368 + }, + { + "epoch": 10.02, + "learning_rate": 4.315947843530592e-05, + "loss": 2.4561, + "theoretical_loss": 3.3191529977418766, + "tokens_seen": 3019003904 + }, + { + "epoch": 10.02, + "learning_rate": 4.3149448345035105e-05, + "loss": 2.399, + "theoretical_loss": 3.3191474651182418, + "tokens_seen": 3019069440 + }, + { + "epoch": 10.02, + "learning_rate": 4.313941825476429e-05, + "loss": 2.3233, + "theoretical_loss": 3.3191419326483307, + "tokens_seen": 3019134976 + }, + { + "epoch": 10.02, + "learning_rate": 4.312938816449348e-05, + "loss": 2.2019, + "theoretical_loss": 3.319136400332136, + "tokens_seen": 3019200512 + }, + { + "epoch": 10.02, + "learning_rate": 4.311935807422267e-05, + "loss": 2.552, + "theoretical_loss": 3.3191308681696503, + "tokens_seen": 3019266048 + }, + { + "epoch": 10.02, + "learning_rate": 4.310932798395186e-05, + "loss": 2.347, + "theoretical_loss": 3.319125336160866, + "tokens_seen": 3019331584 + }, + { + "epoch": 10.02, + "learning_rate": 4.3099297893681045e-05, + "loss": 2.3135, + "theoretical_loss": 3.319119804305775, + "tokens_seen": 3019397120 + }, + { + "epoch": 10.02, + "learning_rate": 4.3089267803410226e-05, + "loss": 2.2338, + "theoretical_loss": 3.31911427260437, + "tokens_seen": 3019462656 + }, + { + "epoch": 10.02, + "learning_rate": 4.3079237713139414e-05, + "loss": 2.4311, + "theoretical_loss": 3.3191087410566436, + "tokens_seen": 3019528192 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3355232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2641420364379883, + "objective/train/theoretical_loss": 3.3191059753404075, + "objective/train/tokens_used": 3040020960, + "theoretical_loss": 3.3191059753404075, + "tokens_seen": 3019560960 + }, + { + "epoch": 10.02, + "learning_rate": 4.306920762286861e-05, + "loss": 2.4149, + "theoretical_loss": 3.3191032096625883, + "tokens_seen": 3019593728 + }, + { + "epoch": 10.02, + "learning_rate": 4.3059177532597796e-05, + "loss": 2.497, + "theoretical_loss": 3.319097678422196, + "tokens_seen": 3019659264 + }, + { + "epoch": 10.02, + "learning_rate": 4.3049147442326984e-05, + "loss": 2.373, + "theoretical_loss": 3.3190921473354593, + "tokens_seen": 3019724800 + }, + { + "epoch": 10.02, + "learning_rate": 4.303911735205617e-05, + "loss": 2.4101, + "theoretical_loss": 3.319086616402371, + "tokens_seen": 3019790336 + }, + { + "epoch": 10.02, + "learning_rate": 4.302908726178536e-05, + "loss": 2.5057, + "theoretical_loss": 3.3190810856229227, + "tokens_seen": 3019855872 + }, + { + "epoch": 10.02, + "learning_rate": 4.301905717151455e-05, + "loss": 2.3973, + "theoretical_loss": 3.319075554997107, + "tokens_seen": 3019921408 + }, + { + "epoch": 10.02, + "learning_rate": 4.3009027081243736e-05, + "loss": 2.2836, + "theoretical_loss": 3.319070024524917, + "tokens_seen": 3019986944 + }, + { + "epoch": 10.02, + "learning_rate": 4.299899699097292e-05, + "loss": 2.3121, + "theoretical_loss": 3.3190644942063443, + "tokens_seen": 3020052480 + }, + { + "epoch": 10.02, + "learning_rate": 4.2988966900702105e-05, + "loss": 2.2471, + "theoretical_loss": 3.319058964041382, + "tokens_seen": 3020118016 + }, + { + "epoch": 10.02, + "learning_rate": 4.297893681043129e-05, + "loss": 2.4635, + "theoretical_loss": 3.3190534340300215, + "tokens_seen": 3020183552 + }, + { + "epoch": 10.02, + "learning_rate": 4.296890672016048e-05, + "loss": 2.2506, + "theoretical_loss": 3.3190479041722565, + "tokens_seen": 3020249088 + }, + { + "epoch": 10.02, + "learning_rate": 4.295887662988967e-05, + "loss": 2.307, + "theoretical_loss": 3.3190423744680784, + "tokens_seen": 3020314624 + }, + { + "epoch": 10.02, + "learning_rate": 4.2948846539618857e-05, + "loss": 2.483, + "theoretical_loss": 3.3190368449174796, + "tokens_seen": 3020380160 + }, + { + "epoch": 10.02, + "learning_rate": 4.2938816449348044e-05, + "loss": 2.3119, + "theoretical_loss": 3.319031315520453, + "tokens_seen": 3020445696 + }, + { + "epoch": 10.02, + "learning_rate": 4.292878635907723e-05, + "loss": 2.548, + "theoretical_loss": 3.319025786276991, + "tokens_seen": 3020511232 + }, + { + "epoch": 10.02, + "learning_rate": 4.291875626880642e-05, + "loss": 2.375, + "theoretical_loss": 3.319020257187086, + "tokens_seen": 3020576768 + }, + { + "epoch": 10.02, + "learning_rate": 4.290872617853561e-05, + "loss": 2.5021, + "theoretical_loss": 3.31901472825073, + "tokens_seen": 3020642304 + }, + { + "epoch": 10.02, + "learning_rate": 4.289869608826479e-05, + "loss": 2.3037, + "theoretical_loss": 3.3190091994679154, + "tokens_seen": 3020707840 + }, + { + "epoch": 10.02, + "learning_rate": 4.288866599799398e-05, + "loss": 2.2976, + "theoretical_loss": 3.3190036708386352, + "tokens_seen": 3020773376 + }, + { + "epoch": 10.02, + "learning_rate": 4.287863590772317e-05, + "loss": 2.5763, + "theoretical_loss": 3.318998142362881, + "tokens_seen": 3020838912 + }, + { + "epoch": 10.02, + "learning_rate": 4.286860581745236e-05, + "loss": 2.5329, + "theoretical_loss": 3.3189926140406465, + "tokens_seen": 3020904448 + }, + { + "epoch": 10.02, + "learning_rate": 4.285857572718155e-05, + "loss": 2.5292, + "theoretical_loss": 3.3189870858719224, + "tokens_seen": 3020969984 + }, + { + "epoch": 10.02, + "learning_rate": 4.2848545636910736e-05, + "loss": 2.3459, + "theoretical_loss": 3.318981557856702, + "tokens_seen": 3021035520 + }, + { + "epoch": 10.02, + "learning_rate": 4.2838515546639923e-05, + "loss": 2.5188, + "theoretical_loss": 3.318976029994978, + "tokens_seen": 3021101056 + }, + { + "epoch": 10.02, + "learning_rate": 4.282848545636911e-05, + "loss": 2.5167, + "theoretical_loss": 3.318970502286743, + "tokens_seen": 3021166592 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3355931, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.580738067626953, + "objective/train/theoretical_loss": 3.318967738490181, + "objective/train/tokens_used": 3041659360, + "theoretical_loss": 3.318967738490181, + "tokens_seen": 3021199360 + }, + { + "epoch": 10.02, + "learning_rate": 4.28184553660983e-05, + "loss": 2.5958, + "theoretical_loss": 3.3189649747319883, + "tokens_seen": 3021232128 + }, + { + "epoch": 10.02, + "learning_rate": 4.280842527582749e-05, + "loss": 2.4731, + "theoretical_loss": 3.3189594473307067, + "tokens_seen": 3021297664 + }, + { + "epoch": 10.02, + "learning_rate": 4.279839518555667e-05, + "loss": 2.4209, + "theoretical_loss": 3.3189539200828913, + "tokens_seen": 3021363200 + }, + { + "epoch": 10.02, + "learning_rate": 4.2788365095285856e-05, + "loss": 2.4278, + "theoretical_loss": 3.3189483929885335, + "tokens_seen": 3021428736 + }, + { + "epoch": 10.02, + "learning_rate": 4.2778335005015044e-05, + "loss": 2.5123, + "theoretical_loss": 3.318942866047627, + "tokens_seen": 3021494272 + }, + { + "epoch": 10.02, + "learning_rate": 4.276830491474423e-05, + "loss": 2.3531, + "theoretical_loss": 3.3189373392601627, + "tokens_seen": 3021559808 + }, + { + "epoch": 10.02, + "learning_rate": 4.275827482447342e-05, + "loss": 2.3241, + "theoretical_loss": 3.3189318126261336, + "tokens_seen": 3021625344 + }, + { + "epoch": 10.02, + "learning_rate": 4.274824473420261e-05, + "loss": 2.2979, + "theoretical_loss": 3.318926286145533, + "tokens_seen": 3021690880 + }, + { + "epoch": 10.02, + "learning_rate": 4.2738214643931796e-05, + "loss": 2.638, + "theoretical_loss": 3.318920759818352, + "tokens_seen": 3021756416 + }, + { + "epoch": 10.02, + "learning_rate": 4.2728184553660983e-05, + "loss": 2.387, + "theoretical_loss": 3.318915233644584, + "tokens_seen": 3021821952 + }, + { + "epoch": 10.02, + "learning_rate": 4.271815446339017e-05, + "loss": 2.3323, + "theoretical_loss": 3.3189097076242207, + "tokens_seen": 3021887488 + }, + { + "epoch": 10.02, + "learning_rate": 4.270812437311935e-05, + "loss": 2.3696, + "theoretical_loss": 3.3189041817572553, + "tokens_seen": 3021953024 + }, + { + "epoch": 10.02, + "learning_rate": 4.269809428284855e-05, + "loss": 2.3935, + "theoretical_loss": 3.3188986560436793, + "tokens_seen": 3022018560 + }, + { + "epoch": 10.02, + "learning_rate": 4.2688064192577735e-05, + "loss": 2.2092, + "theoretical_loss": 3.3188931304834854, + "tokens_seen": 3022084096 + }, + { + "epoch": 10.02, + "learning_rate": 4.267803410230692e-05, + "loss": 2.5115, + "theoretical_loss": 3.3188876050766667, + "tokens_seen": 3022149632 + }, + { + "epoch": 10.02, + "learning_rate": 4.266800401203611e-05, + "loss": 2.4143, + "theoretical_loss": 3.3188820798232146, + "tokens_seen": 3022215168 + }, + { + "epoch": 10.02, + "learning_rate": 4.26579739217653e-05, + "loss": 2.2609, + "theoretical_loss": 3.3188765547231225, + "tokens_seen": 3022280704 + }, + { + "epoch": 10.02, + "learning_rate": 4.264794383149449e-05, + "loss": 2.3525, + "theoretical_loss": 3.318871029776382, + "tokens_seen": 3022346240 + }, + { + "epoch": 10.02, + "learning_rate": 4.2637913741223675e-05, + "loss": 2.4884, + "theoretical_loss": 3.318865504982986, + "tokens_seen": 3022411776 + }, + { + "epoch": 10.02, + "learning_rate": 4.262788365095286e-05, + "loss": 2.1673, + "theoretical_loss": 3.3188599803429266, + "tokens_seen": 3022477312 + }, + { + "epoch": 10.02, + "learning_rate": 4.261785356068205e-05, + "loss": 2.4121, + "theoretical_loss": 3.3188544558561963, + "tokens_seen": 3022542848 + }, + { + "epoch": 10.02, + "learning_rate": 4.260782347041123e-05, + "loss": 2.4351, + "theoretical_loss": 3.318848931522788, + "tokens_seen": 3022608384 + }, + { + "epoch": 10.02, + "learning_rate": 4.259779338014042e-05, + "loss": 2.2217, + "theoretical_loss": 3.318843407342693, + "tokens_seen": 3022673920 + }, + { + "epoch": 10.02, + "learning_rate": 4.258776328986961e-05, + "loss": 2.4914, + "theoretical_loss": 3.318837883315905, + "tokens_seen": 3022739456 + }, + { + "epoch": 10.02, + "learning_rate": 4.2577733199598795e-05, + "loss": 2.36, + "theoretical_loss": 3.318832359442416, + "tokens_seen": 3022804992 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3357112, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.548478603363037, + "objective/train/theoretical_loss": 3.318829597563156, + "objective/train/tokens_used": 3043297760, + "theoretical_loss": 3.318829597563156, + "tokens_seen": 3022837760 + }, + { + "epoch": 10.02, + "learning_rate": 4.256770310932798e-05, + "loss": 2.5434, + "theoretical_loss": 3.3188268357222177, + "tokens_seen": 3022870528 + }, + { + "epoch": 10.02, + "learning_rate": 4.255767301905717e-05, + "loss": 2.5229, + "theoretical_loss": 3.318821312155303, + "tokens_seen": 3022936064 + }, + { + "epoch": 10.02, + "learning_rate": 4.254764292878636e-05, + "loss": 2.4627, + "theoretical_loss": 3.3188157887416656, + "tokens_seen": 3023001600 + }, + { + "epoch": 10.02, + "learning_rate": 4.253761283851555e-05, + "loss": 2.5552, + "theoretical_loss": 3.3188102654812957, + "tokens_seen": 3023067136 + }, + { + "epoch": 10.02, + "learning_rate": 4.2527582748244735e-05, + "loss": 2.6057, + "theoretical_loss": 3.318804742374187, + "tokens_seen": 3023132672 + }, + { + "epoch": 10.02, + "learning_rate": 4.251755265797392e-05, + "loss": 2.4306, + "theoretical_loss": 3.318799219420332, + "tokens_seen": 3023198208 + }, + { + "epoch": 10.02, + "learning_rate": 4.250752256770311e-05, + "loss": 2.2691, + "theoretical_loss": 3.3187936966197227, + "tokens_seen": 3023263744 + }, + { + "epoch": 10.02, + "learning_rate": 4.24974924774323e-05, + "loss": 2.496, + "theoretical_loss": 3.3187881739723517, + "tokens_seen": 3023329280 + }, + { + "epoch": 10.02, + "learning_rate": 4.2487462387161486e-05, + "loss": 2.5147, + "theoretical_loss": 3.318782651478211, + "tokens_seen": 3023394816 + }, + { + "epoch": 10.02, + "learning_rate": 4.2477432296890674e-05, + "loss": 2.405, + "theoretical_loss": 3.3187771291372936, + "tokens_seen": 3023460352 + }, + { + "epoch": 10.02, + "learning_rate": 4.246740220661986e-05, + "loss": 2.2766, + "theoretical_loss": 3.318771606949592, + "tokens_seen": 3023525888 + }, + { + "epoch": 10.02, + "learning_rate": 4.245737211634905e-05, + "loss": 2.4777, + "theoretical_loss": 3.318766084915098, + "tokens_seen": 3023591424 + }, + { + "epoch": 10.02, + "learning_rate": 4.244734202607824e-05, + "loss": 2.1763, + "theoretical_loss": 3.3187605630338046, + "tokens_seen": 3023656960 + }, + { + "epoch": 10.02, + "learning_rate": 4.2437311935807426e-05, + "loss": 2.539, + "theoretical_loss": 3.318755041305704, + "tokens_seen": 3023722496 + }, + { + "epoch": 10.02, + "learning_rate": 4.2427281845536614e-05, + "loss": 2.4303, + "theoretical_loss": 3.318749519730789, + "tokens_seen": 3023788032 + }, + { + "epoch": 10.02, + "learning_rate": 4.24172517552658e-05, + "loss": 2.3525, + "theoretical_loss": 3.3187439983090514, + "tokens_seen": 3023853568 + }, + { + "epoch": 10.02, + "learning_rate": 4.240722166499498e-05, + "loss": 2.5336, + "theoretical_loss": 3.3187384770404837, + "tokens_seen": 3023919104 + }, + { + "epoch": 10.02, + "learning_rate": 4.239719157472417e-05, + "loss": 2.4778, + "theoretical_loss": 3.318732955925079, + "tokens_seen": 3023984640 + }, + { + "epoch": 10.02, + "learning_rate": 4.238716148445336e-05, + "loss": 2.4649, + "theoretical_loss": 3.318727434962829, + "tokens_seen": 3024050176 + }, + { + "epoch": 10.02, + "learning_rate": 4.2377131394182546e-05, + "loss": 2.3108, + "theoretical_loss": 3.318721914153726, + "tokens_seen": 3024115712 + }, + { + "epoch": 10.02, + "learning_rate": 4.2367101303911734e-05, + "loss": 2.3709, + "theoretical_loss": 3.3187163934977635, + "tokens_seen": 3024181248 + }, + { + "epoch": 10.02, + "learning_rate": 4.235707121364092e-05, + "loss": 2.165, + "theoretical_loss": 3.318710872994933, + "tokens_seen": 3024246784 + }, + { + "epoch": 10.02, + "learning_rate": 4.234704112337011e-05, + "loss": 2.4792, + "theoretical_loss": 3.3187053526452273, + "tokens_seen": 3024312320 + }, + { + "epoch": 10.02, + "learning_rate": 4.23370110330993e-05, + "loss": 2.4581, + "theoretical_loss": 3.3186998324486385, + "tokens_seen": 3024377856 + }, + { + "epoch": 10.02, + "learning_rate": 4.2326980942828486e-05, + "loss": 2.3379, + "theoretical_loss": 3.3186943124051598, + "tokens_seen": 3024443392 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3357880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2698519229888916, + "objective/train/theoretical_loss": 3.318691552440834, + "objective/train/tokens_used": 3044936160, + "theoretical_loss": 3.318691552440834, + "tokens_seen": 3024476160 + }, + { + "epoch": 10.02, + "learning_rate": 4.231695085255768e-05, + "loss": 2.2717, + "theoretical_loss": 3.3186887925147825, + "tokens_seen": 3024508928 + }, + { + "epoch": 10.02, + "learning_rate": 4.230692076228686e-05, + "loss": 2.5117, + "theoretical_loss": 3.3186832727775, + "tokens_seen": 3024574464 + }, + { + "epoch": 10.02, + "learning_rate": 4.229689067201605e-05, + "loss": 2.4585, + "theoretical_loss": 3.3186777531933043, + "tokens_seen": 3024640000 + }, + { + "epoch": 10.02, + "learning_rate": 4.228686058174524e-05, + "loss": 2.3493, + "theoretical_loss": 3.318672233762188, + "tokens_seen": 3024705536 + }, + { + "epoch": 10.02, + "learning_rate": 4.2276830491474425e-05, + "loss": 2.5127, + "theoretical_loss": 3.3186667144841437, + "tokens_seen": 3024771072 + }, + { + "epoch": 10.02, + "learning_rate": 4.226680040120361e-05, + "loss": 2.397, + "theoretical_loss": 3.318661195359163, + "tokens_seen": 3024836608 + }, + { + "epoch": 10.02, + "learning_rate": 4.22567703109328e-05, + "loss": 2.4927, + "theoretical_loss": 3.3186556763872392, + "tokens_seen": 3024902144 + }, + { + "epoch": 10.02, + "learning_rate": 4.224674022066199e-05, + "loss": 2.6648, + "theoretical_loss": 3.318650157568365, + "tokens_seen": 3024967680 + }, + { + "epoch": 10.02, + "learning_rate": 4.223671013039118e-05, + "loss": 2.3975, + "theoretical_loss": 3.3186446389025317, + "tokens_seen": 3025033216 + }, + { + "epoch": 10.02, + "learning_rate": 4.2226680040120365e-05, + "loss": 2.5302, + "theoretical_loss": 3.318639120389733, + "tokens_seen": 3025098752 + }, + { + "epoch": 10.02, + "learning_rate": 4.2216649949849546e-05, + "loss": 2.2247, + "theoretical_loss": 3.31863360202996, + "tokens_seen": 3025164288 + }, + { + "epoch": 10.02, + "learning_rate": 4.2206619859578734e-05, + "loss": 2.5324, + "theoretical_loss": 3.318628083823206, + "tokens_seen": 3025229824 + }, + { + "epoch": 10.02, + "learning_rate": 4.219658976930792e-05, + "loss": 2.4477, + "theoretical_loss": 3.3186225657694637, + "tokens_seen": 3025295360 + }, + { + "epoch": 10.02, + "learning_rate": 4.218655967903711e-05, + "loss": 2.5703, + "theoretical_loss": 3.3186170478687247, + "tokens_seen": 3025360896 + }, + { + "epoch": 10.02, + "learning_rate": 4.21765295887663e-05, + "loss": 2.5457, + "theoretical_loss": 3.318611530120982, + "tokens_seen": 3025426432 + }, + { + "epoch": 10.02, + "learning_rate": 4.2166499498495485e-05, + "loss": 2.4837, + "theoretical_loss": 3.318606012526228, + "tokens_seen": 3025491968 + }, + { + "epoch": 10.02, + "learning_rate": 4.215646940822467e-05, + "loss": 2.4399, + "theoretical_loss": 3.3186004950844548, + "tokens_seen": 3025557504 + }, + { + "epoch": 10.02, + "learning_rate": 4.214643931795386e-05, + "loss": 2.4262, + "theoretical_loss": 3.318594977795655, + "tokens_seen": 3025623040 + }, + { + "epoch": 10.02, + "learning_rate": 4.213640922768305e-05, + "loss": 2.3096, + "theoretical_loss": 3.3185894606598216, + "tokens_seen": 3025688576 + }, + { + "epoch": 10.02, + "learning_rate": 4.2126379137412244e-05, + "loss": 2.3906, + "theoretical_loss": 3.3185839436769466, + "tokens_seen": 3025754112 + }, + { + "epoch": 10.02, + "learning_rate": 4.2116349047141425e-05, + "loss": 2.6176, + "theoretical_loss": 3.318578426847022, + "tokens_seen": 3025819648 + }, + { + "epoch": 10.02, + "learning_rate": 4.210631895687061e-05, + "loss": 2.3749, + "theoretical_loss": 3.318572910170041, + "tokens_seen": 3025885184 + }, + { + "epoch": 10.02, + "learning_rate": 4.20962888665998e-05, + "loss": 2.3686, + "theoretical_loss": 3.318567393645996, + "tokens_seen": 3025950720 + }, + { + "epoch": 10.02, + "learning_rate": 4.208625877632899e-05, + "loss": 2.5433, + "theoretical_loss": 3.3185618772748793, + "tokens_seen": 3026016256 + }, + { + "epoch": 10.02, + "learning_rate": 4.2076228686058176e-05, + "loss": 2.2931, + "theoretical_loss": 3.3185563610566824, + "tokens_seen": 3026081792 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3359305, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4721286296844482, + "objective/train/theoretical_loss": 3.3185536030049274, + "objective/train/tokens_used": 3046574560, + "theoretical_loss": 3.3185536030049274, + "tokens_seen": 3026114560 + }, + { + "epoch": 10.02, + "learning_rate": 4.2066198595787364e-05, + "loss": 2.5451, + "theoretical_loss": 3.318550844991399, + "tokens_seen": 3026147328 + }, + { + "epoch": 10.02, + "learning_rate": 4.205616850551655e-05, + "loss": 2.5342, + "theoretical_loss": 3.3185453290790212, + "tokens_seen": 3026212864 + }, + { + "epoch": 10.02, + "learning_rate": 4.204613841524574e-05, + "loss": 2.5028, + "theoretical_loss": 3.318539813319542, + "tokens_seen": 3026278400 + }, + { + "epoch": 10.02, + "learning_rate": 4.203610832497493e-05, + "loss": 2.3598, + "theoretical_loss": 3.318534297712952, + "tokens_seen": 3026343936 + }, + { + "epoch": 10.02, + "learning_rate": 4.2026078234704116e-05, + "loss": 2.3945, + "theoretical_loss": 3.3185287822592455, + "tokens_seen": 3026409472 + }, + { + "epoch": 10.02, + "learning_rate": 4.20160481444333e-05, + "loss": 2.4908, + "theoretical_loss": 3.3185232669584144, + "tokens_seen": 3026475008 + }, + { + "epoch": 10.02, + "learning_rate": 4.2006018054162485e-05, + "loss": 2.3004, + "theoretical_loss": 3.318517751810451, + "tokens_seen": 3026540544 + }, + { + "epoch": 10.02, + "learning_rate": 4.199598796389167e-05, + "loss": 2.5313, + "theoretical_loss": 3.318512236815348, + "tokens_seen": 3026606080 + }, + { + "epoch": 10.02, + "learning_rate": 4.198595787362086e-05, + "loss": 2.2399, + "theoretical_loss": 3.3185067219730975, + "tokens_seen": 3026671616 + }, + { + "epoch": 10.02, + "learning_rate": 4.197592778335005e-05, + "loss": 2.5503, + "theoretical_loss": 3.3185012072836924, + "tokens_seen": 3026737152 + }, + { + "epoch": 10.02, + "learning_rate": 4.1965897693079237e-05, + "loss": 2.2163, + "theoretical_loss": 3.3184956927471245, + "tokens_seen": 3026802688 + }, + { + "epoch": 10.02, + "learning_rate": 4.1955867602808424e-05, + "loss": 2.4495, + "theoretical_loss": 3.318490178363387, + "tokens_seen": 3026868224 + }, + { + "epoch": 10.02, + "learning_rate": 4.194583751253762e-05, + "loss": 2.2071, + "theoretical_loss": 3.318484664132472, + "tokens_seen": 3026933760 + }, + { + "epoch": 10.02, + "learning_rate": 4.193580742226681e-05, + "loss": 2.4508, + "theoretical_loss": 3.318479150054372, + "tokens_seen": 3026999296 + }, + { + "epoch": 10.02, + "learning_rate": 4.192577733199599e-05, + "loss": 2.4634, + "theoretical_loss": 3.318473636129079, + "tokens_seen": 3027064832 + }, + { + "epoch": 10.02, + "learning_rate": 4.1915747241725176e-05, + "loss": 2.421, + "theoretical_loss": 3.3184681223565864, + "tokens_seen": 3027130368 + }, + { + "epoch": 10.02, + "learning_rate": 4.1905717151454364e-05, + "loss": 2.3462, + "theoretical_loss": 3.318462608736886, + "tokens_seen": 3027195904 + }, + { + "epoch": 10.02, + "learning_rate": 4.189568706118355e-05, + "loss": 2.2201, + "theoretical_loss": 3.31845709526997, + "tokens_seen": 3027261440 + }, + { + "epoch": 10.02, + "learning_rate": 4.188565697091274e-05, + "loss": 2.5935, + "theoretical_loss": 3.318451581955832, + "tokens_seen": 3027326976 + }, + { + "epoch": 10.02, + "learning_rate": 4.187562688064193e-05, + "loss": 2.1363, + "theoretical_loss": 3.3184460687944632, + "tokens_seen": 3027392512 + }, + { + "epoch": 10.02, + "learning_rate": 4.1865596790371115e-05, + "loss": 2.5017, + "theoretical_loss": 3.318440555785857, + "tokens_seen": 3027458048 + }, + { + "epoch": 10.02, + "learning_rate": 4.18555667001003e-05, + "loss": 2.4846, + "theoretical_loss": 3.3184350429300054, + "tokens_seen": 3027523584 + }, + { + "epoch": 10.02, + "learning_rate": 4.184553660982949e-05, + "loss": 2.3869, + "theoretical_loss": 3.3184295302269002, + "tokens_seen": 3027589120 + }, + { + "epoch": 10.02, + "learning_rate": 4.183550651955868e-05, + "loss": 2.3569, + "theoretical_loss": 3.318424017676535, + "tokens_seen": 3027654656 + }, + { + "epoch": 10.02, + "learning_rate": 4.182547642928786e-05, + "loss": 2.3655, + "theoretical_loss": 3.318418505278902, + "tokens_seen": 3027720192 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3360101, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.01686954498291, + "objective/train/theoretical_loss": 3.3184157491373574, + "objective/train/tokens_used": 3048212960, + "theoretical_loss": 3.3184157491373574, + "tokens_seen": 3027752960 + }, + { + "epoch": 10.02, + "learning_rate": 4.181544633901705e-05, + "loss": 2.4071, + "theoretical_loss": 3.318412993033993, + "tokens_seen": 3027785728 + }, + { + "epoch": 10.02, + "learning_rate": 4.1805416248746236e-05, + "loss": 2.4607, + "theoretical_loss": 3.3184074809418016, + "tokens_seen": 3027851264 + }, + { + "epoch": 10.02, + "learning_rate": 4.1795386158475424e-05, + "loss": 2.5899, + "theoretical_loss": 3.3184019690023194, + "tokens_seen": 3027916800 + }, + { + "epoch": 10.02, + "learning_rate": 4.178535606820461e-05, + "loss": 2.3455, + "theoretical_loss": 3.318396457215539, + "tokens_seen": 3027982336 + }, + { + "epoch": 10.02, + "learning_rate": 4.17753259779338e-05, + "loss": 2.4102, + "theoretical_loss": 3.318390945581453, + "tokens_seen": 3028047872 + }, + { + "epoch": 10.02, + "learning_rate": 4.176529588766299e-05, + "loss": 2.4065, + "theoretical_loss": 3.318385434100054, + "tokens_seen": 3028113408 + }, + { + "epoch": 10.02, + "learning_rate": 4.175526579739218e-05, + "loss": 2.3559, + "theoretical_loss": 3.3183799227713338, + "tokens_seen": 3028178944 + }, + { + "epoch": 10.02, + "learning_rate": 4.174523570712137e-05, + "loss": 2.4097, + "theoretical_loss": 3.318374411595286, + "tokens_seen": 3028244480 + }, + { + "epoch": 10.02, + "learning_rate": 4.173520561685056e-05, + "loss": 2.3106, + "theoretical_loss": 3.318368900571902, + "tokens_seen": 3028310016 + }, + { + "epoch": 10.02, + "learning_rate": 4.172517552657974e-05, + "loss": 2.1105, + "theoretical_loss": 3.318363389701174, + "tokens_seen": 3028375552 + }, + { + "epoch": 10.02, + "learning_rate": 4.171514543630893e-05, + "loss": 2.4978, + "theoretical_loss": 3.318357878983096, + "tokens_seen": 3028441088 + }, + { + "epoch": 10.02, + "learning_rate": 4.1705115346038115e-05, + "loss": 2.3196, + "theoretical_loss": 3.3183523684176595, + "tokens_seen": 3028506624 + }, + { + "epoch": 10.02, + "learning_rate": 4.16950852557673e-05, + "loss": 2.3792, + "theoretical_loss": 3.3183468580048574, + "tokens_seen": 3028572160 + }, + { + "epoch": 10.02, + "learning_rate": 4.168505516549649e-05, + "loss": 2.3593, + "theoretical_loss": 3.3183413477446813, + "tokens_seen": 3028637696 + }, + { + "epoch": 10.02, + "learning_rate": 4.167502507522568e-05, + "loss": 2.5223, + "theoretical_loss": 3.3183358376371244, + "tokens_seen": 3028703232 + }, + { + "epoch": 10.02, + "learning_rate": 4.166499498495487e-05, + "loss": 2.2576, + "theoretical_loss": 3.3183303276821787, + "tokens_seen": 3028768768 + }, + { + "epoch": 10.02, + "learning_rate": 4.1654964894684055e-05, + "loss": 2.466, + "theoretical_loss": 3.3183248178798372, + "tokens_seen": 3028834304 + }, + { + "epoch": 10.02, + "learning_rate": 4.164493480441324e-05, + "loss": 2.6546, + "theoretical_loss": 3.3183193082300924, + "tokens_seen": 3028899840 + }, + { + "epoch": 10.02, + "learning_rate": 4.163490471414243e-05, + "loss": 2.5264, + "theoretical_loss": 3.3183137987329365, + "tokens_seen": 3028965376 + }, + { + "epoch": 10.02, + "learning_rate": 4.162487462387161e-05, + "loss": 2.6211, + "theoretical_loss": 3.3183082893883618, + "tokens_seen": 3029030912 + }, + { + "epoch": 10.02, + "learning_rate": 4.16148445336008e-05, + "loss": 2.4047, + "theoretical_loss": 3.318302780196361, + "tokens_seen": 3029096448 + }, + { + "epoch": 10.02, + "learning_rate": 4.160481444332999e-05, + "loss": 2.14, + "theoretical_loss": 3.318297271156926, + "tokens_seen": 3029161984 + }, + { + "epoch": 10.02, + "learning_rate": 4.1594784353059175e-05, + "loss": 2.2587, + "theoretical_loss": 3.3182917622700505, + "tokens_seen": 3029227520 + }, + { + "epoch": 10.02, + "learning_rate": 4.158475426278836e-05, + "loss": 2.4983, + "theoretical_loss": 3.3182862535357263, + "tokens_seen": 3029293056 + }, + { + "epoch": 10.02, + "learning_rate": 4.157472417251755e-05, + "loss": 2.3514, + "theoretical_loss": 3.3182807449539453, + "tokens_seen": 3029358592 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3361225, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.8699959516525269, + "objective/train/theoretical_loss": 3.3182779907202566, + "objective/train/tokens_used": 3049851360, + "theoretical_loss": 3.3182779907202566, + "tokens_seen": 3029391360 + }, + { + "epoch": 10.02, + "learning_rate": 4.1564694082246746e-05, + "loss": 2.0509, + "theoretical_loss": 3.318275236524701, + "tokens_seen": 3029424128 + }, + { + "epoch": 10.02, + "learning_rate": 4.1554663991975934e-05, + "loss": 2.5302, + "theoretical_loss": 3.3182697282479854, + "tokens_seen": 3029489664 + }, + { + "epoch": 10.02, + "learning_rate": 4.154463390170512e-05, + "loss": 2.4454, + "theoretical_loss": 3.3182642201237904, + "tokens_seen": 3029555200 + }, + { + "epoch": 10.02, + "learning_rate": 4.15346038114343e-05, + "loss": 2.3542, + "theoretical_loss": 3.31825871215211, + "tokens_seen": 3029620736 + }, + { + "epoch": 10.02, + "learning_rate": 4.152457372116349e-05, + "loss": 2.5645, + "theoretical_loss": 3.318253204332935, + "tokens_seen": 3029686272 + }, + { + "epoch": 10.02, + "learning_rate": 4.151454363089268e-05, + "loss": 2.5295, + "theoretical_loss": 3.3182476966662584, + "tokens_seen": 3029751808 + }, + { + "epoch": 10.02, + "learning_rate": 4.1504513540621866e-05, + "loss": 2.4367, + "theoretical_loss": 3.3182421891520733, + "tokens_seen": 3029817344 + }, + { + "epoch": 10.02, + "learning_rate": 4.1494483450351054e-05, + "loss": 2.4877, + "theoretical_loss": 3.318236681790372, + "tokens_seen": 3029882880 + }, + { + "epoch": 10.02, + "learning_rate": 4.148445336008024e-05, + "loss": 2.2169, + "theoretical_loss": 3.318231174581147, + "tokens_seen": 3029948416 + }, + { + "epoch": 10.02, + "learning_rate": 4.147442326980943e-05, + "loss": 2.3748, + "theoretical_loss": 3.3182256675243895, + "tokens_seen": 3030013952 + }, + { + "epoch": 10.02, + "learning_rate": 4.146439317953862e-05, + "loss": 2.5648, + "theoretical_loss": 3.318220160620094, + "tokens_seen": 3030079488 + }, + { + "epoch": 10.02, + "learning_rate": 4.1454363089267806e-05, + "loss": 2.4653, + "theoretical_loss": 3.3182146538682513, + "tokens_seen": 3030145024 + }, + { + "epoch": 10.02, + "learning_rate": 4.1444332998996994e-05, + "loss": 2.4338, + "theoretical_loss": 3.318209147268855, + "tokens_seen": 3030210560 + }, + { + "epoch": 10.02, + "learning_rate": 4.1434302908726175e-05, + "loss": 2.2608, + "theoretical_loss": 3.318203640821897, + "tokens_seen": 3030276096 + }, + { + "epoch": 10.02, + "learning_rate": 4.142427281845536e-05, + "loss": 2.6816, + "theoretical_loss": 3.31819813452737, + "tokens_seen": 3030341632 + }, + { + "epoch": 10.02, + "learning_rate": 4.141424272818455e-05, + "loss": 2.361, + "theoretical_loss": 3.3181926283852663, + "tokens_seen": 3030407168 + }, + { + "epoch": 10.02, + "learning_rate": 4.140421263791374e-05, + "loss": 2.4251, + "theoretical_loss": 3.318187122395579, + "tokens_seen": 3030472704 + }, + { + "epoch": 10.02, + "learning_rate": 4.1394182547642926e-05, + "loss": 2.5049, + "theoretical_loss": 3.3181816165582996, + "tokens_seen": 3030538240 + }, + { + "epoch": 10.02, + "learning_rate": 4.138415245737212e-05, + "loss": 2.3265, + "theoretical_loss": 3.318176110873421, + "tokens_seen": 3030603776 + }, + { + "epoch": 10.02, + "learning_rate": 4.137412236710131e-05, + "loss": 2.3519, + "theoretical_loss": 3.318170605340936, + "tokens_seen": 3030669312 + }, + { + "epoch": 10.02, + "learning_rate": 4.13640922768305e-05, + "loss": 2.4705, + "theoretical_loss": 3.318165099960837, + "tokens_seen": 3030734848 + }, + { + "epoch": 10.02, + "learning_rate": 4.1354062186559685e-05, + "loss": 2.3117, + "theoretical_loss": 3.318159594733116, + "tokens_seen": 3030800384 + }, + { + "epoch": 10.02, + "learning_rate": 4.134403209628887e-05, + "loss": 2.3881, + "theoretical_loss": 3.3181540896577664, + "tokens_seen": 3030865920 + }, + { + "epoch": 10.02, + "learning_rate": 4.1334002006018054e-05, + "loss": 2.409, + "theoretical_loss": 3.3181485847347796, + "tokens_seen": 3030931456 + }, + { + "epoch": 10.02, + "learning_rate": 4.132397191574724e-05, + "loss": 2.4261, + "theoretical_loss": 3.3181430799641487, + "tokens_seen": 3030996992 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3361969, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5628256797790527, + "objective/train/theoretical_loss": 3.3181403276359642, + "objective/train/tokens_used": 3051489760, + "theoretical_loss": 3.3181403276359642, + "tokens_seen": 3031029760 + }, + { + "epoch": 10.02, + "learning_rate": 4.131394182547643e-05, + "loss": 2.5502, + "theoretical_loss": 3.318137575345866, + "tokens_seen": 3031062528 + }, + { + "epoch": 10.02, + "learning_rate": 4.130391173520562e-05, + "loss": 2.5055, + "theoretical_loss": 3.3181320708799245, + "tokens_seen": 3031128064 + }, + { + "epoch": 10.02, + "learning_rate": 4.1293881644934805e-05, + "loss": 2.4035, + "theoretical_loss": 3.318126566566316, + "tokens_seen": 3031193600 + }, + { + "epoch": 10.02, + "learning_rate": 4.128385155466399e-05, + "loss": 2.3734, + "theoretical_loss": 3.3181210624050332, + "tokens_seen": 3031259136 + }, + { + "epoch": 10.02, + "learning_rate": 4.127382146439318e-05, + "loss": 2.363, + "theoretical_loss": 3.318115558396069, + "tokens_seen": 3031324672 + }, + { + "epoch": 10.02, + "learning_rate": 4.126379137412237e-05, + "loss": 2.3801, + "theoretical_loss": 3.318110054539415, + "tokens_seen": 3031390208 + }, + { + "epoch": 10.02, + "learning_rate": 4.125376128385156e-05, + "loss": 2.5266, + "theoretical_loss": 3.3181045508350646, + "tokens_seen": 3031455744 + }, + { + "epoch": 10.02, + "learning_rate": 4.124373119358074e-05, + "loss": 2.4606, + "theoretical_loss": 3.31809904728301, + "tokens_seen": 3031521280 + }, + { + "epoch": 10.02, + "learning_rate": 4.1233701103309926e-05, + "loss": 2.4866, + "theoretical_loss": 3.3180935438832435, + "tokens_seen": 3031586816 + }, + { + "epoch": 10.02, + "learning_rate": 4.1223671013039114e-05, + "loss": 2.4484, + "theoretical_loss": 3.3180880406357582, + "tokens_seen": 3031652352 + }, + { + "epoch": 10.02, + "learning_rate": 4.12136409227683e-05, + "loss": 2.2991, + "theoretical_loss": 3.3180825375405454, + "tokens_seen": 3031717888 + }, + { + "epoch": 10.02, + "learning_rate": 4.120361083249749e-05, + "loss": 2.2782, + "theoretical_loss": 3.3180770345975987, + "tokens_seen": 3031783424 + }, + { + "epoch": 10.02, + "learning_rate": 4.1193580742226684e-05, + "loss": 2.5828, + "theoretical_loss": 3.3180715318069103, + "tokens_seen": 3031848960 + }, + { + "epoch": 10.02, + "learning_rate": 4.118355065195587e-05, + "loss": 2.4231, + "theoretical_loss": 3.318066029168472, + "tokens_seen": 3031914496 + }, + { + "epoch": 10.02, + "learning_rate": 4.117352056168506e-05, + "loss": 2.458, + "theoretical_loss": 3.318060526682278, + "tokens_seen": 3031980032 + }, + { + "epoch": 10.02, + "learning_rate": 4.116349047141425e-05, + "loss": 2.5625, + "theoretical_loss": 3.3180550243483187, + "tokens_seen": 3032045568 + }, + { + "epoch": 10.02, + "learning_rate": 4.1153460381143436e-05, + "loss": 2.6542, + "theoretical_loss": 3.318049522166588, + "tokens_seen": 3032111104 + }, + { + "epoch": 10.02, + "learning_rate": 4.114343029087262e-05, + "loss": 2.5455, + "theoretical_loss": 3.3180440201370778, + "tokens_seen": 3032176640 + }, + { + "epoch": 10.02, + "learning_rate": 4.1133400200601805e-05, + "loss": 2.5541, + "theoretical_loss": 3.318038518259781, + "tokens_seen": 3032242176 + }, + { + "epoch": 10.02, + "learning_rate": 4.112337011033099e-05, + "loss": 2.3999, + "theoretical_loss": 3.31803301653469, + "tokens_seen": 3032307712 + }, + { + "epoch": 10.02, + "learning_rate": 4.111334002006018e-05, + "loss": 2.3743, + "theoretical_loss": 3.318027514961797, + "tokens_seen": 3032373248 + }, + { + "epoch": 10.02, + "learning_rate": 4.110330992978937e-05, + "loss": 2.3805, + "theoretical_loss": 3.3180220135410945, + "tokens_seen": 3032438784 + }, + { + "epoch": 10.02, + "learning_rate": 4.1093279839518556e-05, + "loss": 2.4054, + "theoretical_loss": 3.3180165122725755, + "tokens_seen": 3032504320 + }, + { + "epoch": 10.02, + "learning_rate": 4.1083249749247744e-05, + "loss": 2.5174, + "theoretical_loss": 3.318011011156232, + "tokens_seen": 3032569856 + }, + { + "epoch": 10.02, + "learning_rate": 4.107321965897693e-05, + "loss": 2.4458, + "theoretical_loss": 3.318005510192057, + "tokens_seen": 3032635392 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3363549, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4236531257629395, + "objective/train/theoretical_loss": 3.3180027597670305, + "objective/train/tokens_used": 3053128160, + "theoretical_loss": 3.3180027597670305, + "tokens_seen": 3032668160 + }, + { + "epoch": 10.02, + "learning_rate": 4.106318956870612e-05, + "loss": 2.2225, + "theoretical_loss": 3.3180000093800426, + "tokens_seen": 3032700928 + }, + { + "epoch": 10.02, + "learning_rate": 4.105315947843531e-05, + "loss": 2.3354, + "theoretical_loss": 3.317994508720181, + "tokens_seen": 3032766464 + }, + { + "epoch": 10.02, + "learning_rate": 4.104312938816449e-05, + "loss": 2.3904, + "theoretical_loss": 3.317989008212466, + "tokens_seen": 3032832000 + }, + { + "epoch": 10.02, + "learning_rate": 4.103309929789368e-05, + "loss": 2.5608, + "theoretical_loss": 3.3179835078568884, + "tokens_seen": 3032897536 + }, + { + "epoch": 10.02, + "learning_rate": 4.1023069207622865e-05, + "loss": 2.5684, + "theoretical_loss": 3.317978007653442, + "tokens_seen": 3032963072 + }, + { + "epoch": 10.02, + "learning_rate": 4.101303911735205e-05, + "loss": 2.411, + "theoretical_loss": 3.3179725076021187, + "tokens_seen": 3033028608 + }, + { + "epoch": 10.02, + "learning_rate": 4.100300902708125e-05, + "loss": 2.4255, + "theoretical_loss": 3.317967007702911, + "tokens_seen": 3033094144 + }, + { + "epoch": 10.02, + "learning_rate": 4.0992978936810435e-05, + "loss": 2.343, + "theoretical_loss": 3.3179615079558116, + "tokens_seen": 3033159680 + }, + { + "epoch": 10.02, + "learning_rate": 4.098294884653962e-05, + "loss": 2.422, + "theoretical_loss": 3.317956008360813, + "tokens_seen": 3033225216 + }, + { + "epoch": 10.02, + "learning_rate": 4.097291875626881e-05, + "loss": 2.4574, + "theoretical_loss": 3.3179505089179075, + "tokens_seen": 3033290752 + }, + { + "epoch": 10.02, + "learning_rate": 4.0962888665998e-05, + "loss": 2.4196, + "theoretical_loss": 3.317945009627088, + "tokens_seen": 3033356288 + }, + { + "epoch": 10.02, + "learning_rate": 4.095285857572719e-05, + "loss": 2.2965, + "theoretical_loss": 3.3179395104883467, + "tokens_seen": 3033421824 + }, + { + "epoch": 10.02, + "learning_rate": 4.094282848545637e-05, + "loss": 2.4032, + "theoretical_loss": 3.3179340115016758, + "tokens_seen": 3033487360 + }, + { + "epoch": 10.02, + "learning_rate": 4.0932798395185556e-05, + "loss": 2.4741, + "theoretical_loss": 3.3179285126670686, + "tokens_seen": 3033552896 + }, + { + "epoch": 10.02, + "learning_rate": 4.0922768304914744e-05, + "loss": 2.3861, + "theoretical_loss": 3.317923013984517, + "tokens_seen": 3033618432 + }, + { + "epoch": 10.02, + "learning_rate": 4.091273821464393e-05, + "loss": 2.5122, + "theoretical_loss": 3.317917515454014, + "tokens_seen": 3033683968 + }, + { + "epoch": 10.02, + "learning_rate": 4.090270812437312e-05, + "loss": 2.5899, + "theoretical_loss": 3.3179120170755514, + "tokens_seen": 3033749504 + }, + { + "epoch": 10.02, + "learning_rate": 4.089267803410231e-05, + "loss": 2.5588, + "theoretical_loss": 3.3179065188491226, + "tokens_seen": 3033815040 + }, + { + "epoch": 10.02, + "learning_rate": 4.0882647943831495e-05, + "loss": 2.6651, + "theoretical_loss": 3.3179010207747193, + "tokens_seen": 3033880576 + }, + { + "epoch": 10.02, + "learning_rate": 4.087261785356068e-05, + "loss": 2.3324, + "theoretical_loss": 3.3178955228523344, + "tokens_seen": 3033946112 + }, + { + "epoch": 10.02, + "learning_rate": 4.086258776328987e-05, + "loss": 2.4746, + "theoretical_loss": 3.3178900250819603, + "tokens_seen": 3034011648 + }, + { + "epoch": 10.02, + "learning_rate": 4.085255767301905e-05, + "loss": 2.2423, + "theoretical_loss": 3.3178845274635895, + "tokens_seen": 3034077184 + }, + { + "epoch": 10.02, + "learning_rate": 4.084252758274824e-05, + "loss": 2.3608, + "theoretical_loss": 3.317879029997215, + "tokens_seen": 3034142720 + }, + { + "epoch": 10.02, + "learning_rate": 4.083249749247743e-05, + "loss": 2.3236, + "theoretical_loss": 3.3178735326828286, + "tokens_seen": 3034208256 + }, + { + "epoch": 10.02, + "learning_rate": 4.0822467402206616e-05, + "loss": 2.5051, + "theoretical_loss": 3.3178680355204233, + "tokens_seen": 3034273792 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3364302, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.573617935180664, + "objective/train/theoretical_loss": 3.317865286996211, + "objective/train/tokens_used": 3054766560, + "theoretical_loss": 3.317865286996211, + "tokens_seen": 3034306560 + }, + { + "epoch": 10.02, + "learning_rate": 4.081243731193581e-05, + "loss": 2.3826, + "theoretical_loss": 3.317862538509991, + "tokens_seen": 3034339328 + }, + { + "epoch": 10.02, + "learning_rate": 4.0802407221665e-05, + "loss": 2.3461, + "theoretical_loss": 3.3178570416515254, + "tokens_seen": 3034404864 + }, + { + "epoch": 10.02, + "learning_rate": 4.0792377131394187e-05, + "loss": 2.2466, + "theoretical_loss": 3.3178515449450177, + "tokens_seen": 3034470400 + }, + { + "epoch": 10.02, + "learning_rate": 4.0782347041123374e-05, + "loss": 2.3988, + "theoretical_loss": 3.3178460483904613, + "tokens_seen": 3034535936 + }, + { + "epoch": 10.02, + "learning_rate": 4.077231695085256e-05, + "loss": 2.3932, + "theoretical_loss": 3.317840551987848, + "tokens_seen": 3034601472 + }, + { + "epoch": 10.02, + "learning_rate": 4.076228686058175e-05, + "loss": 2.4237, + "theoretical_loss": 3.3178350557371714, + "tokens_seen": 3034667008 + }, + { + "epoch": 10.02, + "learning_rate": 4.075225677031093e-05, + "loss": 2.3315, + "theoretical_loss": 3.317829559638423, + "tokens_seen": 3034732544 + }, + { + "epoch": 10.02, + "learning_rate": 4.074222668004012e-05, + "loss": 2.3407, + "theoretical_loss": 3.3178240636915954, + "tokens_seen": 3034798080 + }, + { + "epoch": 10.02, + "learning_rate": 4.073219658976931e-05, + "loss": 2.2807, + "theoretical_loss": 3.3178185678966816, + "tokens_seen": 3034863616 + }, + { + "epoch": 10.02, + "learning_rate": 4.0722166499498495e-05, + "loss": 2.2834, + "theoretical_loss": 3.3178130722536743, + "tokens_seen": 3034929152 + }, + { + "epoch": 10.02, + "learning_rate": 4.071213640922768e-05, + "loss": 2.2572, + "theoretical_loss": 3.317807576762565, + "tokens_seen": 3034994688 + }, + { + "epoch": 10.02, + "learning_rate": 4.070210631895687e-05, + "loss": 2.546, + "theoretical_loss": 3.317802081423347, + "tokens_seen": 3035060224 + }, + { + "epoch": 10.02, + "learning_rate": 4.069207622868606e-05, + "loss": 2.5218, + "theoretical_loss": 3.317796586236013, + "tokens_seen": 3035125760 + }, + { + "epoch": 10.02, + "learning_rate": 4.068204613841525e-05, + "loss": 2.5264, + "theoretical_loss": 3.317791091200555, + "tokens_seen": 3035191296 + }, + { + "epoch": 10.02, + "learning_rate": 4.0672016048144435e-05, + "loss": 2.2571, + "theoretical_loss": 3.3177855963169653, + "tokens_seen": 3035256832 + }, + { + "epoch": 10.02, + "learning_rate": 4.066198595787362e-05, + "loss": 2.4961, + "theoretical_loss": 3.3177801015852375, + "tokens_seen": 3035322368 + }, + { + "epoch": 10.02, + "learning_rate": 4.0651955867602804e-05, + "loss": 2.3841, + "theoretical_loss": 3.3177746070053633, + "tokens_seen": 3035387904 + }, + { + "epoch": 10.02, + "learning_rate": 4.064192577733199e-05, + "loss": 2.3207, + "theoretical_loss": 3.317769112577335, + "tokens_seen": 3035453440 + }, + { + "epoch": 10.02, + "learning_rate": 4.0631895687061186e-05, + "loss": 2.4366, + "theoretical_loss": 3.317763618301146, + "tokens_seen": 3035518976 + }, + { + "epoch": 10.02, + "learning_rate": 4.0621865596790374e-05, + "loss": 2.4414, + "theoretical_loss": 3.3177581241767884, + "tokens_seen": 3035584512 + }, + { + "epoch": 10.02, + "learning_rate": 4.061183550651956e-05, + "loss": 2.2249, + "theoretical_loss": 3.3177526302042546, + "tokens_seen": 3035650048 + }, + { + "epoch": 10.02, + "learning_rate": 4.060180541624875e-05, + "loss": 2.4677, + "theoretical_loss": 3.3177471363835367, + "tokens_seen": 3035715584 + }, + { + "epoch": 10.02, + "learning_rate": 4.059177532597794e-05, + "loss": 2.6469, + "theoretical_loss": 3.317741642714628, + "tokens_seen": 3035781120 + }, + { + "epoch": 10.02, + "learning_rate": 4.0581745235707126e-05, + "loss": 2.1526, + "theoretical_loss": 3.317736149197521, + "tokens_seen": 3035846656 + }, + { + "epoch": 10.02, + "learning_rate": 4.0571715145436313e-05, + "loss": 2.2444, + "theoretical_loss": 3.3177306558322077, + "tokens_seen": 3035912192 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3365526, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.259972333908081, + "objective/train/theoretical_loss": 3.3177279092064715, + "objective/train/tokens_used": 3056404960, + "theoretical_loss": 3.3177279092064715, + "tokens_seen": 3035944960 + }, + { + "epoch": 10.02, + "learning_rate": 4.05616850551655e-05, + "loss": 2.4407, + "theoretical_loss": 3.317725162618681, + "tokens_seen": 3035977728 + }, + { + "epoch": 10.02, + "learning_rate": 4.055165496489468e-05, + "loss": 2.597, + "theoretical_loss": 3.3177196695569333, + "tokens_seen": 3036043264 + }, + { + "epoch": 10.02, + "learning_rate": 4.054162487462387e-05, + "loss": 2.245, + "theoretical_loss": 3.317714176646957, + "tokens_seen": 3036108800 + }, + { + "epoch": 10.02, + "learning_rate": 4.053159478435306e-05, + "loss": 2.3753, + "theoretical_loss": 3.3177086838887453, + "tokens_seen": 3036174336 + }, + { + "epoch": 10.02, + "learning_rate": 4.0521564694082246e-05, + "loss": 2.576, + "theoretical_loss": 3.31770319128229, + "tokens_seen": 3036239872 + }, + { + "epoch": 10.02, + "learning_rate": 4.0511534603811434e-05, + "loss": 2.1787, + "theoretical_loss": 3.3176976988275837, + "tokens_seen": 3036305408 + }, + { + "epoch": 10.02, + "learning_rate": 4.050150451354062e-05, + "loss": 2.5502, + "theoretical_loss": 3.3176922065246193, + "tokens_seen": 3036370944 + }, + { + "epoch": 10.02, + "learning_rate": 4.049147442326981e-05, + "loss": 2.6664, + "theoretical_loss": 3.317686714373389, + "tokens_seen": 3036436480 + }, + { + "epoch": 10.02, + "learning_rate": 4.0481444332999e-05, + "loss": 2.5156, + "theoretical_loss": 3.3176812223738854, + "tokens_seen": 3036502016 + }, + { + "epoch": 10.02, + "learning_rate": 4.0471414242728186e-05, + "loss": 2.5464, + "theoretical_loss": 3.3176757305261013, + "tokens_seen": 3036567552 + }, + { + "epoch": 10.02, + "learning_rate": 4.046138415245737e-05, + "loss": 2.5251, + "theoretical_loss": 3.3176702388300288, + "tokens_seen": 3036633088 + }, + { + "epoch": 10.02, + "learning_rate": 4.0451354062186555e-05, + "loss": 2.2153, + "theoretical_loss": 3.3176647472856606, + "tokens_seen": 3036698624 + }, + { + "epoch": 10.02, + "learning_rate": 4.044132397191575e-05, + "loss": 2.405, + "theoretical_loss": 3.3176592558929894, + "tokens_seen": 3036764160 + }, + { + "epoch": 10.02, + "learning_rate": 4.043129388164494e-05, + "loss": 2.5839, + "theoretical_loss": 3.317653764652008, + "tokens_seen": 3036829696 + }, + { + "epoch": 10.02, + "learning_rate": 4.0421263791374125e-05, + "loss": 2.5127, + "theoretical_loss": 3.317648273562708, + "tokens_seen": 3036895232 + }, + { + "epoch": 10.02, + "learning_rate": 4.041123370110331e-05, + "loss": 2.3085, + "theoretical_loss": 3.317642782625083, + "tokens_seen": 3036960768 + }, + { + "epoch": 10.02, + "learning_rate": 4.04012036108325e-05, + "loss": 2.3931, + "theoretical_loss": 3.317637291839125, + "tokens_seen": 3037026304 + }, + { + "epoch": 10.02, + "learning_rate": 4.039117352056169e-05, + "loss": 2.5536, + "theoretical_loss": 3.3176318012048265, + "tokens_seen": 3037091840 + }, + { + "epoch": 10.02, + "learning_rate": 4.038114343029088e-05, + "loss": 2.3879, + "theoretical_loss": 3.3176263107221797, + "tokens_seen": 3037157376 + }, + { + "epoch": 10.02, + "learning_rate": 4.0371113340020065e-05, + "loss": 2.5822, + "theoretical_loss": 3.317620820391178, + "tokens_seen": 3037222912 + }, + { + "epoch": 10.02, + "learning_rate": 4.0361083249749246e-05, + "loss": 2.3813, + "theoretical_loss": 3.3176153302118134, + "tokens_seen": 3037288448 + }, + { + "epoch": 10.02, + "learning_rate": 4.0351053159478434e-05, + "loss": 2.4806, + "theoretical_loss": 3.317609840184079, + "tokens_seen": 3037353984 + }, + { + "epoch": 10.02, + "learning_rate": 4.034102306920762e-05, + "loss": 2.101, + "theoretical_loss": 3.317604350307966, + "tokens_seen": 3037419520 + }, + { + "epoch": 10.02, + "learning_rate": 4.033099297893681e-05, + "loss": 2.221, + "theoretical_loss": 3.3175988605834683, + "tokens_seen": 3037485056 + }, + { + "epoch": 10.02, + "learning_rate": 4.0320962888666e-05, + "loss": 2.6334, + "theoretical_loss": 3.317593371010578, + "tokens_seen": 3037550592 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3366230, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5377156734466553, + "objective/train/theoretical_loss": 3.317590626280983, + "objective/train/tokens_used": 3058043360, + "theoretical_loss": 3.317590626280983, + "tokens_seen": 3037583360 + }, + { + "epoch": 10.02, + "learning_rate": 4.0310932798395185e-05, + "loss": 2.5132, + "theoretical_loss": 3.3175878815892874, + "tokens_seen": 3037616128 + }, + { + "epoch": 10.02, + "learning_rate": 4.030090270812437e-05, + "loss": 2.3808, + "theoretical_loss": 3.3175823923195895, + "tokens_seen": 3037681664 + }, + { + "epoch": 10.02, + "learning_rate": 4.029087261785356e-05, + "loss": 2.2024, + "theoretical_loss": 3.3175769032014766, + "tokens_seen": 3037747200 + }, + { + "epoch": 10.02, + "learning_rate": 4.028084252758275e-05, + "loss": 2.3919, + "theoretical_loss": 3.317571414234941, + "tokens_seen": 3037812736 + }, + { + "epoch": 10.02, + "learning_rate": 4.027081243731194e-05, + "loss": 2.3231, + "theoretical_loss": 3.3175659254199754, + "tokens_seen": 3037878272 + }, + { + "epoch": 10.02, + "learning_rate": 4.026078234704112e-05, + "loss": 2.4648, + "theoretical_loss": 3.317560436756573, + "tokens_seen": 3037943808 + }, + { + "epoch": 10.02, + "learning_rate": 4.025075225677031e-05, + "loss": 2.6545, + "theoretical_loss": 3.3175549482447253, + "tokens_seen": 3038009344 + }, + { + "epoch": 10.02, + "learning_rate": 4.02407221664995e-05, + "loss": 2.3648, + "theoretical_loss": 3.3175494598844253, + "tokens_seen": 3038074880 + }, + { + "epoch": 10.02, + "learning_rate": 4.023069207622869e-05, + "loss": 2.3527, + "theoretical_loss": 3.317543971675666, + "tokens_seen": 3038140416 + }, + { + "epoch": 10.02, + "learning_rate": 4.0220661985957876e-05, + "loss": 2.4531, + "theoretical_loss": 3.317538483618439, + "tokens_seen": 3038205952 + }, + { + "epoch": 10.02, + "learning_rate": 4.0210631895687064e-05, + "loss": 2.241, + "theoretical_loss": 3.3175329957127375, + "tokens_seen": 3038271488 + }, + { + "epoch": 10.02, + "learning_rate": 4.020060180541625e-05, + "loss": 2.4467, + "theoretical_loss": 3.317527507958554, + "tokens_seen": 3038337024 + }, + { + "epoch": 10.02, + "learning_rate": 4.019057171514544e-05, + "loss": 2.3547, + "theoretical_loss": 3.317522020355881, + "tokens_seen": 3038402560 + }, + { + "epoch": 10.02, + "learning_rate": 4.018054162487463e-05, + "loss": 2.6258, + "theoretical_loss": 3.317516532904711, + "tokens_seen": 3038468096 + }, + { + "epoch": 10.02, + "learning_rate": 4.0170511534603816e-05, + "loss": 2.5605, + "theoretical_loss": 3.3175110456050363, + "tokens_seen": 3038533632 + }, + { + "epoch": 10.02, + "learning_rate": 4.0160481444333e-05, + "loss": 2.4939, + "theoretical_loss": 3.3175055584568502, + "tokens_seen": 3038599168 + }, + { + "epoch": 10.02, + "learning_rate": 4.0150451354062185e-05, + "loss": 2.5146, + "theoretical_loss": 3.3175000714601444, + "tokens_seen": 3038664704 + }, + { + "epoch": 10.02, + "learning_rate": 4.014042126379137e-05, + "loss": 2.4316, + "theoretical_loss": 3.3174945846149115, + "tokens_seen": 3038730240 + }, + { + "epoch": 10.02, + "learning_rate": 4.013039117352056e-05, + "loss": 2.2875, + "theoretical_loss": 3.317489097921145, + "tokens_seen": 3038795776 + }, + { + "epoch": 10.02, + "learning_rate": 4.012036108324975e-05, + "loss": 2.6603, + "theoretical_loss": 3.3174836113788366, + "tokens_seen": 3038861312 + }, + { + "epoch": 10.02, + "learning_rate": 4.0110330992978936e-05, + "loss": 2.3559, + "theoretical_loss": 3.317478124987979, + "tokens_seen": 3038926848 + }, + { + "epoch": 10.02, + "learning_rate": 4.0100300902708124e-05, + "loss": 2.5243, + "theoretical_loss": 3.3174726387485647, + "tokens_seen": 3038992384 + }, + { + "epoch": 10.02, + "learning_rate": 4.009027081243731e-05, + "loss": 2.2736, + "theoretical_loss": 3.3174671526605866, + "tokens_seen": 3039057920 + }, + { + "epoch": 10.02, + "learning_rate": 4.00802407221665e-05, + "loss": 2.4091, + "theoretical_loss": 3.3174616667240366, + "tokens_seen": 3039123456 + }, + { + "epoch": 10.02, + "learning_rate": 4.007021063189569e-05, + "loss": 2.4157, + "theoretical_loss": 3.3174561809389083, + "tokens_seen": 3039188992 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3367562, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2761642932891846, + "objective/train/theoretical_loss": 3.3174534381031244, + "objective/train/tokens_used": 3059681760, + "theoretical_loss": 3.3174534381031244, + "tokens_seen": 3039221760 + }, + { + "epoch": 10.02, + "learning_rate": 4.0060180541624876e-05, + "loss": 2.1712, + "theoretical_loss": 3.3174506953051934, + "tokens_seen": 3039254528 + }, + { + "epoch": 10.02, + "learning_rate": 4.0050150451354064e-05, + "loss": 2.6319, + "theoretical_loss": 3.3174452098228846, + "tokens_seen": 3039320064 + }, + { + "epoch": 10.02, + "learning_rate": 4.004012036108325e-05, + "loss": 2.5108, + "theoretical_loss": 3.3174397244919747, + "tokens_seen": 3039385600 + }, + { + "epoch": 10.02, + "learning_rate": 4.003009027081244e-05, + "loss": 2.5332, + "theoretical_loss": 3.3174342393124556, + "tokens_seen": 3039451136 + }, + { + "epoch": 10.02, + "learning_rate": 4.002006018054163e-05, + "loss": 2.4351, + "theoretical_loss": 3.317428754284321, + "tokens_seen": 3039516672 + }, + { + "epoch": 10.02, + "learning_rate": 4.0010030090270815e-05, + "loss": 2.553, + "theoretical_loss": 3.3174232694075627, + "tokens_seen": 3039582208 + }, + { + "epoch": 10.02, + "learning_rate": 4e-05, + "loss": 2.432, + "theoretical_loss": 3.3174177846821733, + "tokens_seen": 3039647744 + }, + { + "epoch": 10.02, + "learning_rate": 3.998996990972919e-05, + "loss": 2.4695, + "theoretical_loss": 3.3174123001081455, + "tokens_seen": 3039713280 + }, + { + "epoch": 10.02, + "learning_rate": 3.997993981945838e-05, + "loss": 2.5899, + "theoretical_loss": 3.3174068156854712, + "tokens_seen": 3039778816 + }, + { + "epoch": 10.02, + "learning_rate": 3.996990972918756e-05, + "loss": 2.3428, + "theoretical_loss": 3.3174013314141444, + "tokens_seen": 3039844352 + }, + { + "epoch": 10.02, + "learning_rate": 3.995987963891675e-05, + "loss": 2.6299, + "theoretical_loss": 3.317395847294156, + "tokens_seen": 3039909888 + }, + { + "epoch": 10.02, + "learning_rate": 3.9949849548645936e-05, + "loss": 2.3546, + "theoretical_loss": 3.3173903633255, + "tokens_seen": 3039975424 + }, + { + "epoch": 10.02, + "learning_rate": 3.9939819458375124e-05, + "loss": 2.2591, + "theoretical_loss": 3.317384879508168, + "tokens_seen": 3040040960 + }, + { + "epoch": 10.02, + "learning_rate": 3.992978936810431e-05, + "loss": 2.2598, + "theoretical_loss": 3.3173793958421536, + "tokens_seen": 3040106496 + }, + { + "epoch": 10.02, + "learning_rate": 3.99197592778335e-05, + "loss": 2.3342, + "theoretical_loss": 3.317373912327448, + "tokens_seen": 3040172032 + }, + { + "epoch": 10.02, + "learning_rate": 3.990972918756269e-05, + "loss": 2.4629, + "theoretical_loss": 3.317368428964045, + "tokens_seen": 3040237568 + }, + { + "epoch": 10.02, + "learning_rate": 3.9899699097291875e-05, + "loss": 2.3475, + "theoretical_loss": 3.317362945751936, + "tokens_seen": 3040303104 + }, + { + "epoch": 10.02, + "learning_rate": 3.988966900702106e-05, + "loss": 2.3214, + "theoretical_loss": 3.317357462691114, + "tokens_seen": 3040368640 + }, + { + "epoch": 10.02, + "learning_rate": 3.987963891675026e-05, + "loss": 2.5371, + "theoretical_loss": 3.3173519797815723, + "tokens_seen": 3040434176 + }, + { + "epoch": 10.02, + "learning_rate": 3.986960882647944e-05, + "loss": 2.3113, + "theoretical_loss": 3.317346497023303, + "tokens_seen": 3040499712 + }, + { + "epoch": 10.02, + "learning_rate": 3.985957873620863e-05, + "loss": 2.4032, + "theoretical_loss": 3.3173410144162983, + "tokens_seen": 3040565248 + }, + { + "epoch": 10.02, + "learning_rate": 3.9849548645937815e-05, + "loss": 2.3893, + "theoretical_loss": 3.3173355319605506, + "tokens_seen": 3040630784 + }, + { + "epoch": 10.02, + "learning_rate": 3.9839518555667e-05, + "loss": 2.5181, + "theoretical_loss": 3.3173300496560536, + "tokens_seen": 3040696320 + }, + { + "epoch": 10.02, + "learning_rate": 3.982948846539619e-05, + "loss": 2.6086, + "theoretical_loss": 3.3173245675027987, + "tokens_seen": 3040761856 + }, + { + "epoch": 10.02, + "learning_rate": 3.981945837512538e-05, + "loss": 2.2447, + "theoretical_loss": 3.317319085500779, + "tokens_seen": 3040827392 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3368140, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.363330125808716, + "objective/train/theoretical_loss": 3.31731634455648, + "objective/train/tokens_used": 3061320160, + "theoretical_loss": 3.31731634455648, + "tokens_seen": 3040860160 + }, + { + "epoch": 10.02, + "learning_rate": 3.9809428284854567e-05, + "loss": 2.24, + "theoretical_loss": 3.317313603649987, + "tokens_seen": 3040892928 + }, + { + "epoch": 10.02, + "learning_rate": 3.9799398194583754e-05, + "loss": 2.5727, + "theoretical_loss": 3.3173081219504152, + "tokens_seen": 3040958464 + }, + { + "epoch": 10.02, + "learning_rate": 3.978936810431294e-05, + "loss": 2.5282, + "theoretical_loss": 3.3173026404020565, + "tokens_seen": 3041024000 + }, + { + "epoch": 10.02, + "learning_rate": 3.9779338014042123e-05, + "loss": 2.4511, + "theoretical_loss": 3.317297159004903, + "tokens_seen": 3041089536 + }, + { + "epoch": 10.02, + "learning_rate": 3.976930792377131e-05, + "loss": 2.419, + "theoretical_loss": 3.3172916777589476, + "tokens_seen": 3041155072 + }, + { + "epoch": 10.02, + "learning_rate": 3.97592778335005e-05, + "loss": 2.4845, + "theoretical_loss": 3.3172861966641825, + "tokens_seen": 3041220608 + }, + { + "epoch": 10.02, + "learning_rate": 3.974924774322969e-05, + "loss": 2.2991, + "theoretical_loss": 3.317280715720601, + "tokens_seen": 3041286144 + }, + { + "epoch": 10.02, + "learning_rate": 3.9739217652958875e-05, + "loss": 2.4892, + "theoretical_loss": 3.3172752349281946, + "tokens_seen": 3041351680 + }, + { + "epoch": 10.02, + "learning_rate": 3.972918756268806e-05, + "loss": 2.2995, + "theoretical_loss": 3.317269754286957, + "tokens_seen": 3041417216 + }, + { + "epoch": 10.02, + "learning_rate": 3.971915747241725e-05, + "loss": 2.609, + "theoretical_loss": 3.3172642737968796, + "tokens_seen": 3041482752 + }, + { + "epoch": 10.02, + "learning_rate": 3.970912738214644e-05, + "loss": 2.6184, + "theoretical_loss": 3.317258793457956, + "tokens_seen": 3041548288 + }, + { + "epoch": 10.02, + "learning_rate": 3.969909729187563e-05, + "loss": 2.397, + "theoretical_loss": 3.3172533132701782, + "tokens_seen": 3041613824 + }, + { + "epoch": 10.02, + "learning_rate": 3.968906720160482e-05, + "loss": 2.2502, + "theoretical_loss": 3.317247833233539, + "tokens_seen": 3041679360 + }, + { + "epoch": 10.02, + "learning_rate": 3.9679037111334e-05, + "loss": 2.316, + "theoretical_loss": 3.317242353348031, + "tokens_seen": 3041744896 + }, + { + "epoch": 10.02, + "learning_rate": 3.966900702106319e-05, + "loss": 2.4943, + "theoretical_loss": 3.317236873613647, + "tokens_seen": 3041810432 + }, + { + "epoch": 10.02, + "learning_rate": 3.965897693079238e-05, + "loss": 2.3822, + "theoretical_loss": 3.3172313940303795, + "tokens_seen": 3041875968 + }, + { + "epoch": 10.02, + "learning_rate": 3.9648946840521566e-05, + "loss": 2.207, + "theoretical_loss": 3.31722591459822, + "tokens_seen": 3041941504 + }, + { + "epoch": 10.02, + "learning_rate": 3.9638916750250754e-05, + "loss": 2.5526, + "theoretical_loss": 3.3172204353171626, + "tokens_seen": 3042007040 + }, + { + "epoch": 10.02, + "learning_rate": 3.962888665997994e-05, + "loss": 2.311, + "theoretical_loss": 3.3172149561871986, + "tokens_seen": 3042072576 + }, + { + "epoch": 10.02, + "learning_rate": 3.961885656970913e-05, + "loss": 2.5766, + "theoretical_loss": 3.317209477208322, + "tokens_seen": 3042138112 + }, + { + "epoch": 10.02, + "learning_rate": 3.960882647943832e-05, + "loss": 2.4756, + "theoretical_loss": 3.317203998380524, + "tokens_seen": 3042203648 + }, + { + "epoch": 10.02, + "learning_rate": 3.9598796389167506e-05, + "loss": 2.3958, + "theoretical_loss": 3.3171985197037976, + "tokens_seen": 3042269184 + }, + { + "epoch": 10.02, + "learning_rate": 3.9588766298896693e-05, + "loss": 2.1588, + "theoretical_loss": 3.317193041178136, + "tokens_seen": 3042334720 + }, + { + "epoch": 10.02, + "learning_rate": 3.9578736208625875e-05, + "loss": 2.4996, + "theoretical_loss": 3.3171875628035314, + "tokens_seen": 3042400256 + }, + { + "epoch": 10.02, + "learning_rate": 3.956870611835506e-05, + "loss": 2.392, + "theoretical_loss": 3.3171820845799758, + "tokens_seen": 3042465792 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3369135, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.311540365219116, + "objective/train/theoretical_loss": 3.3171793455248393, + "objective/train/tokens_used": 3062958560, + "theoretical_loss": 3.3171793455248393, + "tokens_seen": 3042498560 + }, + { + "epoch": 10.02, + "learning_rate": 3.955867602808425e-05, + "loss": 2.3845, + "theoretical_loss": 3.317176606507463, + "tokens_seen": 3042531328 + }, + { + "epoch": 10.02, + "learning_rate": 3.954864593781344e-05, + "loss": 2.3926, + "theoretical_loss": 3.317171128585984, + "tokens_seen": 3042596864 + }, + { + "epoch": 10.02, + "learning_rate": 3.9538615847542626e-05, + "loss": 2.3935, + "theoretical_loss": 3.3171656508155327, + "tokens_seen": 3042662400 + }, + { + "epoch": 10.02, + "learning_rate": 3.9528585757271814e-05, + "loss": 2.4101, + "theoretical_loss": 3.3171601731961013, + "tokens_seen": 3042727936 + }, + { + "epoch": 10.02, + "learning_rate": 3.9518555667001e-05, + "loss": 2.3905, + "theoretical_loss": 3.317154695727682, + "tokens_seen": 3042793472 + }, + { + "epoch": 10.02, + "learning_rate": 3.950852557673019e-05, + "loss": 2.5689, + "theoretical_loss": 3.317149218410268, + "tokens_seen": 3042859008 + }, + { + "epoch": 10.02, + "learning_rate": 3.9498495486459385e-05, + "loss": 2.4463, + "theoretical_loss": 3.3171437412438514, + "tokens_seen": 3042924544 + }, + { + "epoch": 10.02, + "learning_rate": 3.948846539618857e-05, + "loss": 2.3389, + "theoretical_loss": 3.3171382642284253, + "tokens_seen": 3042990080 + }, + { + "epoch": 10.02, + "learning_rate": 3.9478435305917754e-05, + "loss": 2.4745, + "theoretical_loss": 3.3171327873639815, + "tokens_seen": 3043055616 + }, + { + "epoch": 10.02, + "learning_rate": 3.946840521564694e-05, + "loss": 2.5847, + "theoretical_loss": 3.3171273106505135, + "tokens_seen": 3043121152 + }, + { + "epoch": 10.02, + "learning_rate": 3.945837512537613e-05, + "loss": 2.3622, + "theoretical_loss": 3.317121834088013, + "tokens_seen": 3043186688 + }, + { + "epoch": 10.02, + "learning_rate": 3.944834503510532e-05, + "loss": 2.4939, + "theoretical_loss": 3.317116357676473, + "tokens_seen": 3043252224 + }, + { + "epoch": 10.02, + "learning_rate": 3.9438314944834505e-05, + "loss": 2.5632, + "theoretical_loss": 3.3171108814158865, + "tokens_seen": 3043317760 + }, + { + "epoch": 10.02, + "learning_rate": 3.942828485456369e-05, + "loss": 2.3669, + "theoretical_loss": 3.317105405306245, + "tokens_seen": 3043383296 + }, + { + "epoch": 10.02, + "learning_rate": 3.941825476429288e-05, + "loss": 2.4417, + "theoretical_loss": 3.3170999293475427, + "tokens_seen": 3043448832 + }, + { + "epoch": 10.02, + "learning_rate": 3.940822467402207e-05, + "loss": 2.4264, + "theoretical_loss": 3.3170944535397706, + "tokens_seen": 3043514368 + }, + { + "epoch": 10.02, + "learning_rate": 3.939819458375126e-05, + "loss": 2.5141, + "theoretical_loss": 3.3170889778829222, + "tokens_seen": 3043579904 + }, + { + "epoch": 10.02, + "learning_rate": 3.938816449348044e-05, + "loss": 2.452, + "theoretical_loss": 3.31708350237699, + "tokens_seen": 3043645440 + }, + { + "epoch": 10.02, + "learning_rate": 3.9378134403209626e-05, + "loss": 2.425, + "theoretical_loss": 3.3170780270219664, + "tokens_seen": 3043710976 + }, + { + "epoch": 10.02, + "learning_rate": 3.9368104312938814e-05, + "loss": 2.4084, + "theoretical_loss": 3.3170725518178434, + "tokens_seen": 3043776512 + }, + { + "epoch": 10.02, + "learning_rate": 3.9358074222668e-05, + "loss": 2.5042, + "theoretical_loss": 3.317067076764615, + "tokens_seen": 3043842048 + }, + { + "epoch": 10.02, + "learning_rate": 3.934804413239719e-05, + "loss": 2.32, + "theoretical_loss": 3.3170616018622727, + "tokens_seen": 3043907584 + }, + { + "epoch": 10.02, + "learning_rate": 3.933801404212638e-05, + "loss": 2.4868, + "theoretical_loss": 3.3170561271108094, + "tokens_seen": 3043973120 + }, + { + "epoch": 10.02, + "learning_rate": 3.9327983951855565e-05, + "loss": 2.5669, + "theoretical_loss": 3.317050652510218, + "tokens_seen": 3044038656 + }, + { + "epoch": 10.02, + "learning_rate": 3.931795386158476e-05, + "loss": 2.4915, + "theoretical_loss": 3.3170451780604906, + "tokens_seen": 3044104192 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3370502, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6761364936828613, + "objective/train/theoretical_loss": 3.3170424408921986, + "objective/train/tokens_used": 3064596960, + "theoretical_loss": 3.3170424408921986, + "tokens_seen": 3044136960 + }, + { + "epoch": 10.02, + "learning_rate": 3.930792377131395e-05, + "loss": 2.6097, + "theoretical_loss": 3.31703970376162, + "tokens_seen": 3044169728 + }, + { + "epoch": 10.02, + "learning_rate": 3.9297893681043136e-05, + "loss": 2.6913, + "theoretical_loss": 3.317034229613599, + "tokens_seen": 3044235264 + }, + { + "epoch": 10.02, + "learning_rate": 3.928786359077232e-05, + "loss": 2.5678, + "theoretical_loss": 3.3170287556164193, + "tokens_seen": 3044300800 + }, + { + "epoch": 10.02, + "learning_rate": 3.9277833500501505e-05, + "loss": 2.4964, + "theoretical_loss": 3.3170232817700747, + "tokens_seen": 3044366336 + }, + { + "epoch": 10.02, + "learning_rate": 3.926780341023069e-05, + "loss": 2.4099, + "theoretical_loss": 3.3170178080745574, + "tokens_seen": 3044431872 + }, + { + "epoch": 10.02, + "learning_rate": 3.925777331995988e-05, + "loss": 2.6133, + "theoretical_loss": 3.3170123345298594, + "tokens_seen": 3044497408 + }, + { + "epoch": 10.02, + "learning_rate": 3.924774322968907e-05, + "loss": 2.4156, + "theoretical_loss": 3.317006861135974, + "tokens_seen": 3044562944 + }, + { + "epoch": 10.02, + "learning_rate": 3.9237713139418256e-05, + "loss": 2.4754, + "theoretical_loss": 3.3170013878928937, + "tokens_seen": 3044628480 + }, + { + "epoch": 10.02, + "learning_rate": 3.9227683049147444e-05, + "loss": 2.4084, + "theoretical_loss": 3.316995914800611, + "tokens_seen": 3044694016 + }, + { + "epoch": 10.02, + "learning_rate": 3.921765295887663e-05, + "loss": 2.4329, + "theoretical_loss": 3.3169904418591183, + "tokens_seen": 3044759552 + }, + { + "epoch": 10.02, + "learning_rate": 3.920762286860582e-05, + "loss": 2.4551, + "theoretical_loss": 3.3169849690684083, + "tokens_seen": 3044825088 + }, + { + "epoch": 10.02, + "learning_rate": 3.919759277833501e-05, + "loss": 2.3525, + "theoretical_loss": 3.3169794964284742, + "tokens_seen": 3044890624 + }, + { + "epoch": 10.02, + "learning_rate": 3.918756268806419e-05, + "loss": 2.4804, + "theoretical_loss": 3.3169740239393075, + "tokens_seen": 3044956160 + }, + { + "epoch": 10.02, + "learning_rate": 3.917753259779338e-05, + "loss": 2.4107, + "theoretical_loss": 3.3169685516009015, + "tokens_seen": 3045021696 + }, + { + "epoch": 10.02, + "learning_rate": 3.9167502507522565e-05, + "loss": 2.324, + "theoretical_loss": 3.3169630794132487, + "tokens_seen": 3045087232 + }, + { + "epoch": 10.02, + "learning_rate": 3.915747241725175e-05, + "loss": 2.5017, + "theoretical_loss": 3.3169576073763416, + "tokens_seen": 3045152768 + }, + { + "epoch": 10.02, + "learning_rate": 3.914744232698094e-05, + "loss": 2.5179, + "theoretical_loss": 3.3169521354901734, + "tokens_seen": 3045218304 + }, + { + "epoch": 10.02, + "learning_rate": 3.913741223671013e-05, + "loss": 2.4147, + "theoretical_loss": 3.3169466637547353, + "tokens_seen": 3045283840 + }, + { + "epoch": 10.02, + "learning_rate": 3.912738214643932e-05, + "loss": 2.5173, + "theoretical_loss": 3.3169411921700216, + "tokens_seen": 3045349376 + }, + { + "epoch": 10.02, + "learning_rate": 3.911735205616851e-05, + "loss": 2.5132, + "theoretical_loss": 3.3169357207360237, + "tokens_seen": 3045414912 + }, + { + "epoch": 10.02, + "learning_rate": 3.91073219658977e-05, + "loss": 2.2536, + "theoretical_loss": 3.3169302494527346, + "tokens_seen": 3045480448 + }, + { + "epoch": 10.02, + "learning_rate": 3.909729187562689e-05, + "loss": 2.5896, + "theoretical_loss": 3.316924778320147, + "tokens_seen": 3045545984 + }, + { + "epoch": 10.02, + "learning_rate": 3.908726178535607e-05, + "loss": 2.5147, + "theoretical_loss": 3.3169193073382535, + "tokens_seen": 3045611520 + }, + { + "epoch": 10.02, + "learning_rate": 3.9077231695085256e-05, + "loss": 2.2862, + "theoretical_loss": 3.3169138365070463, + "tokens_seen": 3045677056 + }, + { + "epoch": 10.02, + "learning_rate": 3.9067201604814444e-05, + "loss": 2.5429, + "theoretical_loss": 3.3169083658265186, + "tokens_seen": 3045742592 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3370502, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.301603078842163, + "objective/train/theoretical_loss": 3.3169056305427573, + "objective/train/tokens_used": 3066235360, + "theoretical_loss": 3.3169056305427573, + "tokens_seen": 3045775360 + }, + { + "epoch": 10.02, + "learning_rate": 3.905717151454363e-05, + "loss": 2.3552, + "theoretical_loss": 3.316902895296663, + "tokens_seen": 3045808128 + }, + { + "epoch": 10.02, + "learning_rate": 3.904714142427282e-05, + "loss": 2.541, + "theoretical_loss": 3.3168974249174714, + "tokens_seen": 3045873664 + }, + { + "epoch": 10.02, + "learning_rate": 3.903711133400201e-05, + "loss": 2.3813, + "theoretical_loss": 3.316891954688937, + "tokens_seen": 3045939200 + }, + { + "epoch": 10.02, + "learning_rate": 3.9027081243731195e-05, + "loss": 2.3341, + "theoretical_loss": 3.316886484611052, + "tokens_seen": 3046004736 + }, + { + "epoch": 10.02, + "learning_rate": 3.901705115346038e-05, + "loss": 2.5717, + "theoretical_loss": 3.3168810146838097, + "tokens_seen": 3046070272 + }, + { + "epoch": 10.02, + "learning_rate": 3.900702106318957e-05, + "loss": 2.5953, + "theoretical_loss": 3.316875544907202, + "tokens_seen": 3046135808 + }, + { + "epoch": 10.02, + "learning_rate": 3.899699097291875e-05, + "loss": 2.414, + "theoretical_loss": 3.316870075281222, + "tokens_seen": 3046201344 + }, + { + "epoch": 10.02, + "learning_rate": 3.898696088264794e-05, + "loss": 2.4743, + "theoretical_loss": 3.3168646058058617, + "tokens_seen": 3046266880 + }, + { + "epoch": 10.02, + "learning_rate": 3.897693079237713e-05, + "loss": 2.5263, + "theoretical_loss": 3.3168591364811144, + "tokens_seen": 3046332416 + }, + { + "epoch": 10.02, + "learning_rate": 3.8966900702106316e-05, + "loss": 2.4184, + "theoretical_loss": 3.3168536673069724, + "tokens_seen": 3046397952 + }, + { + "epoch": 10.02, + "learning_rate": 3.8956870611835504e-05, + "loss": 2.4441, + "theoretical_loss": 3.3168481982834286, + "tokens_seen": 3046463488 + }, + { + "epoch": 10.02, + "learning_rate": 3.894684052156469e-05, + "loss": 2.6697, + "theoretical_loss": 3.316842729410475, + "tokens_seen": 3046529024 + }, + { + "epoch": 10.02, + "learning_rate": 3.8936810431293886e-05, + "loss": 2.4501, + "theoretical_loss": 3.3168372606881045, + "tokens_seen": 3046594560 + }, + { + "epoch": 10.02, + "learning_rate": 3.8926780341023074e-05, + "loss": 2.4585, + "theoretical_loss": 3.3168317921163104, + "tokens_seen": 3046660096 + }, + { + "epoch": 10.02, + "learning_rate": 3.891675025075226e-05, + "loss": 2.2837, + "theoretical_loss": 3.316826323695084, + "tokens_seen": 3046725632 + }, + { + "epoch": 10.02, + "learning_rate": 3.890672016048145e-05, + "loss": 2.4257, + "theoretical_loss": 3.316820855424419, + "tokens_seen": 3046791168 + }, + { + "epoch": 10.02, + "learning_rate": 3.889669007021063e-05, + "loss": 2.5198, + "theoretical_loss": 3.316815387304308, + "tokens_seen": 3046856704 + }, + { + "epoch": 10.02, + "learning_rate": 3.888665997993982e-05, + "loss": 2.3122, + "theoretical_loss": 3.3168099193347422, + "tokens_seen": 3046922240 + }, + { + "epoch": 10.02, + "learning_rate": 3.887662988966901e-05, + "loss": 2.479, + "theoretical_loss": 3.316804451515716, + "tokens_seen": 3046987776 + }, + { + "epoch": 10.02, + "learning_rate": 3.8866599799398195e-05, + "loss": 2.645, + "theoretical_loss": 3.316798983847221, + "tokens_seen": 3047053312 + }, + { + "epoch": 10.02, + "learning_rate": 3.885656970912738e-05, + "loss": 2.291, + "theoretical_loss": 3.31679351632925, + "tokens_seen": 3047118848 + }, + { + "epoch": 10.02, + "learning_rate": 3.884653961885657e-05, + "loss": 2.3186, + "theoretical_loss": 3.3167880489617962, + "tokens_seen": 3047184384 + }, + { + "epoch": 10.02, + "learning_rate": 3.883650952858576e-05, + "loss": 2.6593, + "theoretical_loss": 3.3167825817448513, + "tokens_seen": 3047249920 + }, + { + "epoch": 10.02, + "learning_rate": 3.8826479438314947e-05, + "loss": 2.5323, + "theoretical_loss": 3.3167771146784086, + "tokens_seen": 3047315456 + }, + { + "epoch": 10.02, + "learning_rate": 3.8816449348044134e-05, + "loss": 2.4654, + "theoretical_loss": 3.3167716477624607, + "tokens_seen": 3047380992 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3372055, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3921737670898438, + "objective/train/theoretical_loss": 3.3167689143609196, + "objective/train/tokens_used": 3067873760, + "theoretical_loss": 3.3167689143609196, + "tokens_seen": 3047413760 + }, + { + "epoch": 10.02, + "learning_rate": 3.880641925777332e-05, + "loss": 2.5116, + "theoretical_loss": 3.3167661809969995, + "tokens_seen": 3047446528 + }, + { + "epoch": 10.02, + "learning_rate": 3.8796389167502503e-05, + "loss": 2.4825, + "theoretical_loss": 3.3167607143820184, + "tokens_seen": 3047512064 + }, + { + "epoch": 10.02, + "learning_rate": 3.878635907723169e-05, + "loss": 2.5428, + "theoretical_loss": 3.3167552479175098, + "tokens_seen": 3047577600 + }, + { + "epoch": 10.02, + "learning_rate": 3.877632898696088e-05, + "loss": 2.4966, + "theoretical_loss": 3.316749781603466, + "tokens_seen": 3047643136 + }, + { + "epoch": 10.02, + "learning_rate": 3.876629889669007e-05, + "loss": 2.5104, + "theoretical_loss": 3.3167443154398804, + "tokens_seen": 3047708672 + }, + { + "epoch": 10.02, + "learning_rate": 3.875626880641926e-05, + "loss": 2.5127, + "theoretical_loss": 3.3167388494267445, + "tokens_seen": 3047774208 + }, + { + "epoch": 10.02, + "learning_rate": 3.874623871614845e-05, + "loss": 2.3337, + "theoretical_loss": 3.316733383564052, + "tokens_seen": 3047839744 + }, + { + "epoch": 10.02, + "learning_rate": 3.873620862587764e-05, + "loss": 2.462, + "theoretical_loss": 3.3167279178517948, + "tokens_seen": 3047905280 + }, + { + "epoch": 10.02, + "learning_rate": 3.8726178535606825e-05, + "loss": 2.4594, + "theoretical_loss": 3.316722452289966, + "tokens_seen": 3047970816 + }, + { + "epoch": 10.02, + "learning_rate": 3.871614844533601e-05, + "loss": 2.5582, + "theoretical_loss": 3.3167169868785575, + "tokens_seen": 3048036352 + }, + { + "epoch": 10.02, + "learning_rate": 3.87061183550652e-05, + "loss": 2.3212, + "theoretical_loss": 3.316711521617563, + "tokens_seen": 3048101888 + }, + { + "epoch": 10.02, + "learning_rate": 3.869608826479438e-05, + "loss": 2.5609, + "theoretical_loss": 3.316706056506974, + "tokens_seen": 3048167424 + }, + { + "epoch": 10.02, + "learning_rate": 3.868605817452357e-05, + "loss": 2.3327, + "theoretical_loss": 3.3167005915467844, + "tokens_seen": 3048232960 + }, + { + "epoch": 10.02, + "learning_rate": 3.867602808425276e-05, + "loss": 2.4519, + "theoretical_loss": 3.316695126736986, + "tokens_seen": 3048298496 + }, + { + "epoch": 10.02, + "learning_rate": 3.8665997993981946e-05, + "loss": 2.4536, + "theoretical_loss": 3.316689662077571, + "tokens_seen": 3048364032 + }, + { + "epoch": 10.02, + "learning_rate": 3.8655967903711134e-05, + "loss": 2.4738, + "theoretical_loss": 3.316684197568533, + "tokens_seen": 3048429568 + }, + { + "epoch": 10.02, + "learning_rate": 3.864593781344032e-05, + "loss": 2.6533, + "theoretical_loss": 3.316678733209864, + "tokens_seen": 3048495104 + }, + { + "epoch": 10.02, + "learning_rate": 3.863590772316951e-05, + "loss": 2.5134, + "theoretical_loss": 3.3166732690015572, + "tokens_seen": 3048560640 + }, + { + "epoch": 10.02, + "learning_rate": 3.86258776328987e-05, + "loss": 2.4696, + "theoretical_loss": 3.3166678049436045, + "tokens_seen": 3048626176 + }, + { + "epoch": 10.02, + "learning_rate": 3.8615847542627886e-05, + "loss": 2.429, + "theoretical_loss": 3.316662341035999, + "tokens_seen": 3048691712 + }, + { + "epoch": 10.02, + "learning_rate": 3.860581745235707e-05, + "loss": 2.5209, + "theoretical_loss": 3.3166568772787333, + "tokens_seen": 3048757248 + }, + { + "epoch": 10.02, + "learning_rate": 3.8595787362086255e-05, + "loss": 2.5509, + "theoretical_loss": 3.3166514136718, + "tokens_seen": 3048822784 + }, + { + "epoch": 10.02, + "learning_rate": 3.858575727181544e-05, + "loss": 2.4885, + "theoretical_loss": 3.3166459502151913, + "tokens_seen": 3048888320 + }, + { + "epoch": 10.02, + "learning_rate": 3.857572718154463e-05, + "loss": 2.6277, + "theoretical_loss": 3.316640486908901, + "tokens_seen": 3048953856 + }, + { + "epoch": 10.02, + "learning_rate": 3.8565697091273825e-05, + "loss": 2.2859, + "theoretical_loss": 3.3166350237529203, + "tokens_seen": 3049019392 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 3372798, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.377603530883789, + "objective/train/theoretical_loss": 3.316632292231294, + "objective/train/tokens_used": 3069512160, + "theoretical_loss": 3.316632292231294, + "tokens_seen": 3049052160 + }, + { + "epoch": 10.02, + "learning_rate": 3.855566700100301e-05, + "loss": 2.4384, + "theoretical_loss": 3.3166295607472427, + "tokens_seen": 3049084928 + }, + { + "epoch": 10.02, + "learning_rate": 3.85456369107322e-05, + "loss": 2.3234, + "theoretical_loss": 3.3166240978918604, + "tokens_seen": 3049150464 + }, + { + "epoch": 10.02, + "learning_rate": 3.853560682046139e-05, + "loss": 2.2624, + "theoretical_loss": 3.3166186351867664, + "tokens_seen": 3049216000 + }, + { + "epoch": 10.02, + "learning_rate": 3.852557673019058e-05, + "loss": 2.4328, + "theoretical_loss": 3.3166131726319534, + "tokens_seen": 3049281536 + }, + { + "epoch": 10.02, + "learning_rate": 3.8515546639919765e-05, + "loss": 2.3397, + "theoretical_loss": 3.3166077102274136, + "tokens_seen": 3049347072 + }, + { + "epoch": 10.02, + "learning_rate": 3.8505516549648946e-05, + "loss": 2.3253, + "theoretical_loss": 3.3166022479731403, + "tokens_seen": 3049412608 + }, + { + "epoch": 10.02, + "learning_rate": 3.8495486459378134e-05, + "loss": 2.6457, + "theoretical_loss": 3.316596785869125, + "tokens_seen": 3049478144 + }, + { + "epoch": 10.02, + "learning_rate": 3.848545636910732e-05, + "loss": 2.4197, + "theoretical_loss": 3.3165913239153615, + "tokens_seen": 3049543680 + }, + { + "epoch": 10.02, + "learning_rate": 3.847542627883651e-05, + "loss": 2.2015, + "theoretical_loss": 3.3165858621118423, + "tokens_seen": 3049609216 + }, + { + "epoch": 10.02, + "learning_rate": 3.84653961885657e-05, + "loss": 2.5105, + "theoretical_loss": 3.316580400458559, + "tokens_seen": 3049674752 + }, + { + "epoch": 10.02, + "learning_rate": 3.8455366098294885e-05, + "loss": 2.4765, + "theoretical_loss": 3.3165749389555055, + "tokens_seen": 3049740288 + }, + { + "epoch": 10.02, + "learning_rate": 3.844533600802407e-05, + "loss": 2.4694, + "theoretical_loss": 3.3165694776026733, + "tokens_seen": 3049805824 + }, + { + "epoch": 10.02, + "learning_rate": 3.843530591775326e-05, + "loss": 2.4438, + "theoretical_loss": 3.316564016400056, + "tokens_seen": 3049871360 + }, + { + "epoch": 10.02, + "learning_rate": 3.842527582748245e-05, + "loss": 2.4199, + "theoretical_loss": 3.316558555347646, + "tokens_seen": 3049936896 + }, + { + "epoch": 10.02, + "learning_rate": 3.841524573721164e-05, + "loss": 2.5716, + "theoretical_loss": 3.316553094445436, + "tokens_seen": 3050002432 + }, + { + "epoch": 10.02, + "learning_rate": 3.840521564694082e-05, + "loss": 2.5172, + "theoretical_loss": 3.316547633693418, + "tokens_seen": 3050067968 + }, + { + "epoch": 10.03, + "learning_rate": 3.8395185556670006e-05, + "loss": 2.3139, + "theoretical_loss": 3.3165421730915856, + "tokens_seen": 3050133504 + }, + { + "epoch": 10.03, + "learning_rate": 3.8385155466399194e-05, + "loss": 2.4021, + "theoretical_loss": 3.3165367126399303, + "tokens_seen": 3050199040 + }, + { + "epoch": 10.03, + "learning_rate": 3.837512537612839e-05, + "loss": 2.2804, + "theoretical_loss": 3.316531252338446, + "tokens_seen": 3050264576 + }, + { + "epoch": 10.03, + "learning_rate": 3.8365095285857576e-05, + "loss": 2.2546, + "theoretical_loss": 3.3165257921871243, + "tokens_seen": 3050330112 + }, + { + "epoch": 10.03, + "learning_rate": 3.8355065195586764e-05, + "loss": 2.415, + "theoretical_loss": 3.3165203321859584, + "tokens_seen": 3050395648 + }, + { + "epoch": 10.03, + "learning_rate": 3.834503510531595e-05, + "loss": 2.4113, + "theoretical_loss": 3.316514872334941, + "tokens_seen": 3050461184 + }, + { + "epoch": 10.03, + "learning_rate": 3.833500501504514e-05, + "loss": 2.4235, + "theoretical_loss": 3.3165094126340646, + "tokens_seen": 3050526720 + }, + { + "epoch": 10.03, + "learning_rate": 3.832497492477433e-05, + "loss": 2.3611, + "theoretical_loss": 3.3165039530833216, + "tokens_seen": 3050592256 + }, + { + "epoch": 10.03, + "learning_rate": 3.8314944834503516e-05, + "loss": 2.399, + "theoretical_loss": 3.3164984936827047, + "tokens_seen": 3050657792 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3373905, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.536484479904175, + "objective/train/theoretical_loss": 3.3164957640386916, + "objective/train/tokens_used": 3071150560, + "theoretical_loss": 3.3164957640386916, + "tokens_seen": 3050690560 + }, + { + "epoch": 10.03, + "learning_rate": 3.83049147442327e-05, + "loss": 2.4156, + "theoretical_loss": 3.316493034432207, + "tokens_seen": 3050723328 + }, + { + "epoch": 10.03, + "learning_rate": 3.8294884653961885e-05, + "loss": 2.4698, + "theoretical_loss": 3.316487575331821, + "tokens_seen": 3050788864 + }, + { + "epoch": 10.03, + "learning_rate": 3.828485456369107e-05, + "loss": 2.4658, + "theoretical_loss": 3.316482116381539, + "tokens_seen": 3050854400 + }, + { + "epoch": 10.03, + "learning_rate": 3.827482447342026e-05, + "loss": 2.6792, + "theoretical_loss": 3.3164766575813536, + "tokens_seen": 3050919936 + }, + { + "epoch": 10.03, + "learning_rate": 3.826479438314945e-05, + "loss": 2.2799, + "theoretical_loss": 3.316471198931258, + "tokens_seen": 3050985472 + }, + { + "epoch": 10.03, + "learning_rate": 3.8254764292878636e-05, + "loss": 2.3602, + "theoretical_loss": 3.3164657404312448, + "tokens_seen": 3051051008 + }, + { + "epoch": 10.03, + "learning_rate": 3.8244734202607824e-05, + "loss": 2.4048, + "theoretical_loss": 3.316460282081306, + "tokens_seen": 3051116544 + }, + { + "epoch": 10.03, + "learning_rate": 3.823470411233701e-05, + "loss": 2.4003, + "theoretical_loss": 3.316454823881435, + "tokens_seen": 3051182080 + }, + { + "epoch": 10.03, + "learning_rate": 3.82246740220662e-05, + "loss": 2.4835, + "theoretical_loss": 3.3164493658316236, + "tokens_seen": 3051247616 + }, + { + "epoch": 10.03, + "learning_rate": 3.821464393179538e-05, + "loss": 2.4958, + "theoretical_loss": 3.3164439079318653, + "tokens_seen": 3051313152 + }, + { + "epoch": 10.03, + "learning_rate": 3.820461384152457e-05, + "loss": 2.6651, + "theoretical_loss": 3.316438450182152, + "tokens_seen": 3051378688 + }, + { + "epoch": 10.03, + "learning_rate": 3.819458375125376e-05, + "loss": 2.47, + "theoretical_loss": 3.3164329925824774, + "tokens_seen": 3051444224 + }, + { + "epoch": 10.03, + "learning_rate": 3.818455366098295e-05, + "loss": 2.5319, + "theoretical_loss": 3.316427535132833, + "tokens_seen": 3051509760 + }, + { + "epoch": 10.03, + "learning_rate": 3.817452357071214e-05, + "loss": 2.442, + "theoretical_loss": 3.316422077833212, + "tokens_seen": 3051575296 + }, + { + "epoch": 10.03, + "learning_rate": 3.816449348044133e-05, + "loss": 2.1902, + "theoretical_loss": 3.3164166206836074, + "tokens_seen": 3051640832 + }, + { + "epoch": 10.03, + "learning_rate": 3.8154463390170515e-05, + "loss": 2.4185, + "theoretical_loss": 3.3164111636840112, + "tokens_seen": 3051706368 + }, + { + "epoch": 10.03, + "learning_rate": 3.81444332998997e-05, + "loss": 2.5392, + "theoretical_loss": 3.3164057068344164, + "tokens_seen": 3051771904 + }, + { + "epoch": 10.03, + "learning_rate": 3.813440320962889e-05, + "loss": 2.4672, + "theoretical_loss": 3.3164002501348158, + "tokens_seen": 3051837440 + }, + { + "epoch": 10.03, + "learning_rate": 3.812437311935808e-05, + "loss": 2.1595, + "theoretical_loss": 3.3163947935852014, + "tokens_seen": 3051902976 + }, + { + "epoch": 10.03, + "learning_rate": 3.811434302908726e-05, + "loss": 2.3988, + "theoretical_loss": 3.3163893371855666, + "tokens_seen": 3051968512 + }, + { + "epoch": 10.03, + "learning_rate": 3.810431293881645e-05, + "loss": 2.4809, + "theoretical_loss": 3.316383880935904, + "tokens_seen": 3052034048 + }, + { + "epoch": 10.03, + "learning_rate": 3.8094282848545636e-05, + "loss": 2.4645, + "theoretical_loss": 3.3163784248362056, + "tokens_seen": 3052099584 + }, + { + "epoch": 10.03, + "learning_rate": 3.8084252758274824e-05, + "loss": 2.5508, + "theoretical_loss": 3.3163729688864643, + "tokens_seen": 3052165120 + }, + { + "epoch": 10.03, + "learning_rate": 3.807422266800401e-05, + "loss": 2.4967, + "theoretical_loss": 3.316367513086673, + "tokens_seen": 3052230656 + }, + { + "epoch": 10.03, + "learning_rate": 3.80641925777332e-05, + "loss": 2.533, + "theoretical_loss": 3.3163620574368244, + "tokens_seen": 3052296192 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3374492, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4025440216064453, + "objective/train/theoretical_loss": 3.3163593296681264, + "objective/train/tokens_used": 3072788960, + "theoretical_loss": 3.3163593296681264, + "tokens_seen": 3052328960 + }, + { + "epoch": 10.03, + "learning_rate": 3.805416248746239e-05, + "loss": 2.5161, + "theoretical_loss": 3.3163566019369113, + "tokens_seen": 3052361728 + }, + { + "epoch": 10.03, + "learning_rate": 3.8044132397191575e-05, + "loss": 2.4692, + "theoretical_loss": 3.3163511465869258, + "tokens_seen": 3052427264 + }, + { + "epoch": 10.03, + "learning_rate": 3.803410230692076e-05, + "loss": 2.4258, + "theoretical_loss": 3.316345691386861, + "tokens_seen": 3052492800 + }, + { + "epoch": 10.03, + "learning_rate": 3.802407221664995e-05, + "loss": 2.4418, + "theoretical_loss": 3.3163402363367096, + "tokens_seen": 3052558336 + }, + { + "epoch": 10.03, + "learning_rate": 3.801404212637913e-05, + "loss": 2.2098, + "theoretical_loss": 3.316334781436464, + "tokens_seen": 3052623872 + }, + { + "epoch": 10.03, + "learning_rate": 3.800401203610833e-05, + "loss": 2.6142, + "theoretical_loss": 3.316329326686117, + "tokens_seen": 3052689408 + }, + { + "epoch": 10.03, + "learning_rate": 3.7993981945837515e-05, + "loss": 2.4542, + "theoretical_loss": 3.316323872085661, + "tokens_seen": 3052754944 + }, + { + "epoch": 10.03, + "learning_rate": 3.79839518555667e-05, + "loss": 2.6561, + "theoretical_loss": 3.316318417635089, + "tokens_seen": 3052820480 + }, + { + "epoch": 10.03, + "learning_rate": 3.797392176529589e-05, + "loss": 2.4947, + "theoretical_loss": 3.3163129633343935, + "tokens_seen": 3052886016 + }, + { + "epoch": 10.03, + "learning_rate": 3.796389167502508e-05, + "loss": 2.5, + "theoretical_loss": 3.3163075091835674, + "tokens_seen": 3052951552 + }, + { + "epoch": 10.03, + "learning_rate": 3.7953861584754266e-05, + "loss": 2.6043, + "theoretical_loss": 3.316302055182603, + "tokens_seen": 3053017088 + }, + { + "epoch": 10.03, + "learning_rate": 3.7943831494483454e-05, + "loss": 2.3215, + "theoretical_loss": 3.316296601331493, + "tokens_seen": 3053082624 + }, + { + "epoch": 10.03, + "learning_rate": 3.793380140421264e-05, + "loss": 2.5402, + "theoretical_loss": 3.3162911476302304, + "tokens_seen": 3053148160 + }, + { + "epoch": 10.03, + "learning_rate": 3.792377131394182e-05, + "loss": 2.2189, + "theoretical_loss": 3.3162856940788075, + "tokens_seen": 3053213696 + }, + { + "epoch": 10.03, + "learning_rate": 3.791374122367101e-05, + "loss": 2.3737, + "theoretical_loss": 3.3162802406772176, + "tokens_seen": 3053279232 + }, + { + "epoch": 10.03, + "learning_rate": 3.79037111334002e-05, + "loss": 2.6182, + "theoretical_loss": 3.3162747874254523, + "tokens_seen": 3053344768 + }, + { + "epoch": 10.03, + "learning_rate": 3.789368104312939e-05, + "loss": 2.4292, + "theoretical_loss": 3.316269334323505, + "tokens_seen": 3053410304 + }, + { + "epoch": 10.03, + "learning_rate": 3.7883650952858575e-05, + "loss": 2.2298, + "theoretical_loss": 3.316263881371369, + "tokens_seen": 3053475840 + }, + { + "epoch": 10.03, + "learning_rate": 3.787362086258776e-05, + "loss": 2.6856, + "theoretical_loss": 3.3162584285690353, + "tokens_seen": 3053541376 + }, + { + "epoch": 10.03, + "learning_rate": 3.786359077231695e-05, + "loss": 2.455, + "theoretical_loss": 3.316252975916498, + "tokens_seen": 3053606912 + }, + { + "epoch": 10.03, + "learning_rate": 3.785356068204614e-05, + "loss": 2.4906, + "theoretical_loss": 3.316247523413749, + "tokens_seen": 3053672448 + }, + { + "epoch": 10.03, + "learning_rate": 3.7843530591775327e-05, + "loss": 2.4957, + "theoretical_loss": 3.316242071060781, + "tokens_seen": 3053737984 + }, + { + "epoch": 10.03, + "learning_rate": 3.7833500501504514e-05, + "loss": 2.527, + "theoretical_loss": 3.3162366188575874, + "tokens_seen": 3053803520 + }, + { + "epoch": 10.03, + "learning_rate": 3.7823470411233696e-05, + "loss": 2.3077, + "theoretical_loss": 3.31623116680416, + "tokens_seen": 3053869056 + }, + { + "epoch": 10.03, + "learning_rate": 3.781344032096289e-05, + "loss": 2.3756, + "theoretical_loss": 3.3162257149004923, + "tokens_seen": 3053934592 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3375959, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.278730630874634, + "objective/train/theoretical_loss": 3.3162229890048156, + "objective/train/tokens_used": 3074427360, + "theoretical_loss": 3.3162229890048156, + "tokens_seen": 3053967360 + }, + { + "epoch": 10.03, + "learning_rate": 3.780341023069208e-05, + "loss": 2.4851, + "theoretical_loss": 3.316220263146576, + "tokens_seen": 3054000128 + }, + { + "epoch": 10.03, + "learning_rate": 3.7793380140421266e-05, + "loss": 2.5561, + "theoretical_loss": 3.3162148115424044, + "tokens_seen": 3054065664 + }, + { + "epoch": 10.03, + "learning_rate": 3.7783350050150454e-05, + "loss": 2.3831, + "theoretical_loss": 3.3162093600879703, + "tokens_seen": 3054131200 + }, + { + "epoch": 10.03, + "learning_rate": 3.777331995987964e-05, + "loss": 2.4267, + "theoretical_loss": 3.316203908783266, + "tokens_seen": 3054196736 + }, + { + "epoch": 10.03, + "learning_rate": 3.776328986960883e-05, + "loss": 2.4068, + "theoretical_loss": 3.3161984576282846, + "tokens_seen": 3054262272 + }, + { + "epoch": 10.03, + "learning_rate": 3.775325977933802e-05, + "loss": 2.494, + "theoretical_loss": 3.316193006623018, + "tokens_seen": 3054327808 + }, + { + "epoch": 10.03, + "learning_rate": 3.7743229689067205e-05, + "loss": 2.5863, + "theoretical_loss": 3.3161875557674594, + "tokens_seen": 3054393344 + }, + { + "epoch": 10.03, + "learning_rate": 3.773319959879639e-05, + "loss": 2.4279, + "theoretical_loss": 3.3161821050616016, + "tokens_seen": 3054458880 + }, + { + "epoch": 10.03, + "learning_rate": 3.7723169508525574e-05, + "loss": 2.4772, + "theoretical_loss": 3.316176654505437, + "tokens_seen": 3054524416 + }, + { + "epoch": 10.03, + "learning_rate": 3.771313941825476e-05, + "loss": 2.3511, + "theoretical_loss": 3.3161712040989584, + "tokens_seen": 3054589952 + }, + { + "epoch": 10.03, + "learning_rate": 3.770310932798395e-05, + "loss": 2.4043, + "theoretical_loss": 3.3161657538421587, + "tokens_seen": 3054655488 + }, + { + "epoch": 10.03, + "learning_rate": 3.769307923771314e-05, + "loss": 2.267, + "theoretical_loss": 3.31616030373503, + "tokens_seen": 3054721024 + }, + { + "epoch": 10.03, + "learning_rate": 3.7683049147442326e-05, + "loss": 2.6526, + "theoretical_loss": 3.3161548537775656, + "tokens_seen": 3054786560 + }, + { + "epoch": 10.03, + "learning_rate": 3.7673019057171514e-05, + "loss": 2.3593, + "theoretical_loss": 3.316149403969758, + "tokens_seen": 3054852096 + }, + { + "epoch": 10.03, + "learning_rate": 3.76629889669007e-05, + "loss": 2.2968, + "theoretical_loss": 3.3161439543116, + "tokens_seen": 3054917632 + }, + { + "epoch": 10.03, + "learning_rate": 3.765295887662989e-05, + "loss": 2.3917, + "theoretical_loss": 3.3161385048030834, + "tokens_seen": 3054983168 + }, + { + "epoch": 10.03, + "learning_rate": 3.764292878635908e-05, + "loss": 2.4524, + "theoretical_loss": 3.316133055444202, + "tokens_seen": 3055048704 + }, + { + "epoch": 10.03, + "learning_rate": 3.763289869608826e-05, + "loss": 2.6232, + "theoretical_loss": 3.3161276062349474, + "tokens_seen": 3055114240 + }, + { + "epoch": 10.03, + "learning_rate": 3.7622868605817453e-05, + "loss": 2.52, + "theoretical_loss": 3.3161221571753137, + "tokens_seen": 3055179776 + }, + { + "epoch": 10.03, + "learning_rate": 3.761283851554664e-05, + "loss": 2.6837, + "theoretical_loss": 3.3161167082652923, + "tokens_seen": 3055245312 + }, + { + "epoch": 10.03, + "learning_rate": 3.760280842527583e-05, + "loss": 2.5275, + "theoretical_loss": 3.3161112595048765, + "tokens_seen": 3055310848 + }, + { + "epoch": 10.03, + "learning_rate": 3.759277833500502e-05, + "loss": 2.5265, + "theoretical_loss": 3.316105810894059, + "tokens_seen": 3055376384 + }, + { + "epoch": 10.03, + "learning_rate": 3.7582748244734205e-05, + "loss": 2.5098, + "theoretical_loss": 3.3161003624328322, + "tokens_seen": 3055441920 + }, + { + "epoch": 10.03, + "learning_rate": 3.757271815446339e-05, + "loss": 2.6622, + "theoretical_loss": 3.316094914121189, + "tokens_seen": 3055507456 + }, + { + "epoch": 10.03, + "learning_rate": 3.756268806419258e-05, + "loss": 2.5169, + "theoretical_loss": 3.316089465959122, + "tokens_seen": 3055572992 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3376357, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7386646270751953, + "objective/train/theoretical_loss": 3.316086741934177, + "objective/train/tokens_used": 3076065760, + "theoretical_loss": 3.316086741934177, + "tokens_seen": 3055605760 + }, + { + "epoch": 10.03, + "learning_rate": 3.755265797392177e-05, + "loss": 2.506, + "theoretical_loss": 3.3160840179466238, + "tokens_seen": 3055638528 + }, + { + "epoch": 10.03, + "learning_rate": 3.754262788365096e-05, + "loss": 2.4373, + "theoretical_loss": 3.316078570083687, + "tokens_seen": 3055704064 + }, + { + "epoch": 10.03, + "learning_rate": 3.753259779338014e-05, + "loss": 2.5189, + "theoretical_loss": 3.316073122370305, + "tokens_seen": 3055769600 + }, + { + "epoch": 10.03, + "learning_rate": 3.7522567703109326e-05, + "loss": 2.39, + "theoretical_loss": 3.3160676748064697, + "tokens_seen": 3055835136 + }, + { + "epoch": 10.03, + "learning_rate": 3.7512537612838514e-05, + "loss": 2.3554, + "theoretical_loss": 3.316062227392174, + "tokens_seen": 3055900672 + }, + { + "epoch": 10.03, + "learning_rate": 3.75025075225677e-05, + "loss": 2.5174, + "theoretical_loss": 3.3160567801274103, + "tokens_seen": 3055966208 + }, + { + "epoch": 10.03, + "learning_rate": 3.749247743229689e-05, + "loss": 2.297, + "theoretical_loss": 3.3160513330121724, + "tokens_seen": 3056031744 + }, + { + "epoch": 10.03, + "learning_rate": 3.748244734202608e-05, + "loss": 2.5378, + "theoretical_loss": 3.3160458860464517, + "tokens_seen": 3056097280 + }, + { + "epoch": 10.03, + "learning_rate": 3.7472417251755265e-05, + "loss": 2.4475, + "theoretical_loss": 3.3160404392302416, + "tokens_seen": 3056162816 + }, + { + "epoch": 10.03, + "learning_rate": 3.746238716148445e-05, + "loss": 2.3754, + "theoretical_loss": 3.316034992563534, + "tokens_seen": 3056228352 + }, + { + "epoch": 10.03, + "learning_rate": 3.745235707121364e-05, + "loss": 2.561, + "theoretical_loss": 3.316029546046323, + "tokens_seen": 3056293888 + }, + { + "epoch": 10.03, + "learning_rate": 3.7442326980942836e-05, + "loss": 2.438, + "theoretical_loss": 3.3160240996786, + "tokens_seen": 3056359424 + }, + { + "epoch": 10.03, + "learning_rate": 3.743229689067202e-05, + "loss": 2.6571, + "theoretical_loss": 3.3160186534603584, + "tokens_seen": 3056424960 + }, + { + "epoch": 10.03, + "learning_rate": 3.7422266800401205e-05, + "loss": 2.4224, + "theoretical_loss": 3.3160132073915904, + "tokens_seen": 3056490496 + }, + { + "epoch": 10.03, + "learning_rate": 3.741223671013039e-05, + "loss": 2.4356, + "theoretical_loss": 3.3160077614722896, + "tokens_seen": 3056556032 + }, + { + "epoch": 10.03, + "learning_rate": 3.740220661985958e-05, + "loss": 2.4638, + "theoretical_loss": 3.3160023157024474, + "tokens_seen": 3056621568 + }, + { + "epoch": 10.03, + "learning_rate": 3.739217652958877e-05, + "loss": 2.4857, + "theoretical_loss": 3.3159968700820572, + "tokens_seen": 3056687104 + }, + { + "epoch": 10.03, + "learning_rate": 3.7382146439317956e-05, + "loss": 2.6016, + "theoretical_loss": 3.315991424611112, + "tokens_seen": 3056752640 + }, + { + "epoch": 10.03, + "learning_rate": 3.7372116349047144e-05, + "loss": 2.5043, + "theoretical_loss": 3.3159859792896036, + "tokens_seen": 3056818176 + }, + { + "epoch": 10.03, + "learning_rate": 3.736208625877633e-05, + "loss": 2.5208, + "theoretical_loss": 3.3159805341175255, + "tokens_seen": 3056883712 + }, + { + "epoch": 10.03, + "learning_rate": 3.735205616850552e-05, + "loss": 2.4315, + "theoretical_loss": 3.31597508909487, + "tokens_seen": 3056949248 + }, + { + "epoch": 10.03, + "learning_rate": 3.734202607823471e-05, + "loss": 2.4288, + "theoretical_loss": 3.31596964422163, + "tokens_seen": 3057014784 + }, + { + "epoch": 10.03, + "learning_rate": 3.733199598796389e-05, + "loss": 2.6542, + "theoretical_loss": 3.315964199497798, + "tokens_seen": 3057080320 + }, + { + "epoch": 10.03, + "learning_rate": 3.732196589769308e-05, + "loss": 2.5898, + "theoretical_loss": 3.315958754923367, + "tokens_seen": 3057145856 + }, + { + "epoch": 10.03, + "learning_rate": 3.7311935807422265e-05, + "loss": 2.6453, + "theoretical_loss": 3.3159533104983296, + "tokens_seen": 3057211392 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3377495, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5324506759643555, + "objective/train/theoretical_loss": 3.315950588341831, + "objective/train/tokens_used": 3077704160, + "theoretical_loss": 3.315950588341831, + "tokens_seen": 3057244160 + }, + { + "epoch": 10.03, + "learning_rate": 3.730190571715145e-05, + "loss": 2.4372, + "theoretical_loss": 3.315947866222678, + "tokens_seen": 3057276928 + }, + { + "epoch": 10.03, + "learning_rate": 3.729187562688064e-05, + "loss": 2.3823, + "theoretical_loss": 3.3159424220964056, + "tokens_seen": 3057342464 + }, + { + "epoch": 10.03, + "learning_rate": 3.728184553660983e-05, + "loss": 2.7403, + "theoretical_loss": 3.315936978119505, + "tokens_seen": 3057408000 + }, + { + "epoch": 10.03, + "learning_rate": 3.7271815446339016e-05, + "loss": 2.4968, + "theoretical_loss": 3.3159315342919684, + "tokens_seen": 3057473536 + }, + { + "epoch": 10.03, + "learning_rate": 3.7261785356068204e-05, + "loss": 2.3858, + "theoretical_loss": 3.3159260906137886, + "tokens_seen": 3057539072 + }, + { + "epoch": 10.03, + "learning_rate": 3.72517552657974e-05, + "loss": 2.5664, + "theoretical_loss": 3.315920647084959, + "tokens_seen": 3057604608 + }, + { + "epoch": 10.03, + "learning_rate": 3.724172517552659e-05, + "loss": 2.4329, + "theoretical_loss": 3.3159152037054715, + "tokens_seen": 3057670144 + }, + { + "epoch": 10.03, + "learning_rate": 3.723169508525577e-05, + "loss": 2.5402, + "theoretical_loss": 3.315909760475319, + "tokens_seen": 3057735680 + }, + { + "epoch": 10.03, + "learning_rate": 3.7221664994984956e-05, + "loss": 2.5426, + "theoretical_loss": 3.315904317394495, + "tokens_seen": 3057801216 + }, + { + "epoch": 10.03, + "learning_rate": 3.7211634904714144e-05, + "loss": 2.6283, + "theoretical_loss": 3.3158988744629907, + "tokens_seen": 3057866752 + }, + { + "epoch": 10.03, + "learning_rate": 3.720160481444333e-05, + "loss": 2.6577, + "theoretical_loss": 3.3158934316808, + "tokens_seen": 3057932288 + }, + { + "epoch": 10.03, + "learning_rate": 3.719157472417252e-05, + "loss": 2.3909, + "theoretical_loss": 3.315887989047915, + "tokens_seen": 3057997824 + }, + { + "epoch": 10.03, + "learning_rate": 3.718154463390171e-05, + "loss": 2.6479, + "theoretical_loss": 3.315882546564329, + "tokens_seen": 3058063360 + }, + { + "epoch": 10.03, + "learning_rate": 3.7171514543630895e-05, + "loss": 2.5263, + "theoretical_loss": 3.315877104230034, + "tokens_seen": 3058128896 + }, + { + "epoch": 10.03, + "learning_rate": 3.716148445336008e-05, + "loss": 2.3281, + "theoretical_loss": 3.3158716620450233, + "tokens_seen": 3058194432 + }, + { + "epoch": 10.03, + "learning_rate": 3.715145436308927e-05, + "loss": 2.4636, + "theoretical_loss": 3.315866220009289, + "tokens_seen": 3058259968 + }, + { + "epoch": 10.03, + "learning_rate": 3.714142427281845e-05, + "loss": 2.4543, + "theoretical_loss": 3.3158607781228246, + "tokens_seen": 3058325504 + }, + { + "epoch": 10.03, + "learning_rate": 3.713139418254764e-05, + "loss": 2.5199, + "theoretical_loss": 3.315855336385622, + "tokens_seen": 3058391040 + }, + { + "epoch": 10.03, + "learning_rate": 3.712136409227683e-05, + "loss": 2.5405, + "theoretical_loss": 3.3158498947976742, + "tokens_seen": 3058456576 + }, + { + "epoch": 10.03, + "learning_rate": 3.7111334002006016e-05, + "loss": 2.4335, + "theoretical_loss": 3.315844453358974, + "tokens_seen": 3058522112 + }, + { + "epoch": 10.03, + "learning_rate": 3.7101303911735204e-05, + "loss": 2.6448, + "theoretical_loss": 3.3158390120695143, + "tokens_seen": 3058587648 + }, + { + "epoch": 10.03, + "learning_rate": 3.709127382146439e-05, + "loss": 2.3941, + "theoretical_loss": 3.3158335709292874, + "tokens_seen": 3058653184 + }, + { + "epoch": 10.03, + "learning_rate": 3.708124373119358e-05, + "loss": 2.6742, + "theoretical_loss": 3.3158281299382866, + "tokens_seen": 3058718720 + }, + { + "epoch": 10.03, + "learning_rate": 3.707121364092277e-05, + "loss": 2.5393, + "theoretical_loss": 3.3158226890965037, + "tokens_seen": 3058784256 + }, + { + "epoch": 10.03, + "learning_rate": 3.706118355065196e-05, + "loss": 2.4334, + "theoretical_loss": 3.315817248403932, + "tokens_seen": 3058849792 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3377800, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2981436252593994, + "objective/train/theoretical_loss": 3.3158145281135987, + "objective/train/tokens_used": 3079342560, + "theoretical_loss": 3.3158145281135987, + "tokens_seen": 3058882560 + }, + { + "epoch": 10.03, + "learning_rate": 3.705115346038115e-05, + "loss": 2.5691, + "theoretical_loss": 3.3158118078605647, + "tokens_seen": 3058915328 + }, + { + "epoch": 10.03, + "learning_rate": 3.704112337011033e-05, + "loss": 2.3603, + "theoretical_loss": 3.3158063674663936, + "tokens_seen": 3058980864 + }, + { + "epoch": 10.03, + "learning_rate": 3.703109327983952e-05, + "loss": 2.633, + "theoretical_loss": 3.3158009272214115, + "tokens_seen": 3059046400 + }, + { + "epoch": 10.03, + "learning_rate": 3.702106318956871e-05, + "loss": 2.3585, + "theoretical_loss": 3.3157954871256115, + "tokens_seen": 3059111936 + }, + { + "epoch": 10.03, + "learning_rate": 3.7011033099297895e-05, + "loss": 2.6062, + "theoretical_loss": 3.3157900471789863, + "tokens_seen": 3059177472 + }, + { + "epoch": 10.03, + "learning_rate": 3.700100300902708e-05, + "loss": 2.3617, + "theoretical_loss": 3.3157846073815285, + "tokens_seen": 3059243008 + }, + { + "epoch": 10.03, + "learning_rate": 3.699097291875627e-05, + "loss": 2.7919, + "theoretical_loss": 3.3157791677332304, + "tokens_seen": 3059308544 + }, + { + "epoch": 10.03, + "learning_rate": 3.698094282848546e-05, + "loss": 2.3963, + "theoretical_loss": 3.3157737282340856, + "tokens_seen": 3059374080 + }, + { + "epoch": 10.03, + "learning_rate": 3.6970912738214646e-05, + "loss": 2.3485, + "theoretical_loss": 3.3157682888840863, + "tokens_seen": 3059439616 + }, + { + "epoch": 10.03, + "learning_rate": 3.6960882647943834e-05, + "loss": 2.4439, + "theoretical_loss": 3.315762849683225, + "tokens_seen": 3059505152 + }, + { + "epoch": 10.03, + "learning_rate": 3.695085255767302e-05, + "loss": 2.5093, + "theoretical_loss": 3.315757410631495, + "tokens_seen": 3059570688 + }, + { + "epoch": 10.03, + "learning_rate": 3.69408224674022e-05, + "loss": 2.4353, + "theoretical_loss": 3.3157519717288886, + "tokens_seen": 3059636224 + }, + { + "epoch": 10.03, + "learning_rate": 3.693079237713139e-05, + "loss": 2.6127, + "theoretical_loss": 3.3157465329753983, + "tokens_seen": 3059701760 + }, + { + "epoch": 10.03, + "learning_rate": 3.692076228686058e-05, + "loss": 2.4975, + "theoretical_loss": 3.3157410943710177, + "tokens_seen": 3059767296 + }, + { + "epoch": 10.03, + "learning_rate": 3.691073219658977e-05, + "loss": 2.6005, + "theoretical_loss": 3.3157356559157383, + "tokens_seen": 3059832832 + }, + { + "epoch": 10.03, + "learning_rate": 3.6900702106318955e-05, + "loss": 2.5477, + "theoretical_loss": 3.3157302176095538, + "tokens_seen": 3059898368 + }, + { + "epoch": 10.03, + "learning_rate": 3.689067201604814e-05, + "loss": 2.395, + "theoretical_loss": 3.3157247794524567, + "tokens_seen": 3059963904 + }, + { + "epoch": 10.03, + "learning_rate": 3.688064192577733e-05, + "loss": 2.4733, + "theoretical_loss": 3.3157193414444395, + "tokens_seen": 3060029440 + }, + { + "epoch": 10.03, + "learning_rate": 3.6870611835506525e-05, + "loss": 2.4886, + "theoretical_loss": 3.315713903585495, + "tokens_seen": 3060094976 + }, + { + "epoch": 10.03, + "learning_rate": 3.686058174523571e-05, + "loss": 2.5438, + "theoretical_loss": 3.3157084658756157, + "tokens_seen": 3060160512 + }, + { + "epoch": 10.03, + "learning_rate": 3.68505516549649e-05, + "loss": 2.509, + "theoretical_loss": 3.315703028314795, + "tokens_seen": 3060226048 + }, + { + "epoch": 10.03, + "learning_rate": 3.684052156469408e-05, + "loss": 2.6852, + "theoretical_loss": 3.315697590903025, + "tokens_seen": 3060291584 + }, + { + "epoch": 10.03, + "learning_rate": 3.683049147442327e-05, + "loss": 2.3428, + "theoretical_loss": 3.3156921536402986, + "tokens_seen": 3060357120 + }, + { + "epoch": 10.03, + "learning_rate": 3.682046138415246e-05, + "loss": 2.2845, + "theoretical_loss": 3.315686716526608, + "tokens_seen": 3060422656 + }, + { + "epoch": 10.03, + "learning_rate": 3.6810431293881646e-05, + "loss": 2.6342, + "theoretical_loss": 3.3156812795619475, + "tokens_seen": 3060488192 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3377800, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4405148029327393, + "objective/train/theoretical_loss": 3.3156785611355004, + "objective/train/tokens_used": 3080980960, + "theoretical_loss": 3.3156785611355004, + "tokens_seen": 3060520960 + }, + { + "epoch": 10.03, + "learning_rate": 3.6800401203610834e-05, + "loss": 2.5316, + "theoretical_loss": 3.315675842746308, + "tokens_seen": 3060553728 + }, + { + "epoch": 10.03, + "learning_rate": 3.679037111334002e-05, + "loss": 2.6979, + "theoretical_loss": 3.3156704060796836, + "tokens_seen": 3060619264 + }, + { + "epoch": 10.03, + "learning_rate": 3.678034102306921e-05, + "loss": 2.685, + "theoretical_loss": 3.315664969562066, + "tokens_seen": 3060684800 + }, + { + "epoch": 10.03, + "learning_rate": 3.67703109327984e-05, + "loss": 2.7151, + "theoretical_loss": 3.3156595331934486, + "tokens_seen": 3060750336 + }, + { + "epoch": 10.03, + "learning_rate": 3.6760280842527585e-05, + "loss": 2.4055, + "theoretical_loss": 3.3156540969738235, + "tokens_seen": 3060815872 + }, + { + "epoch": 10.03, + "learning_rate": 3.6750250752256767e-05, + "loss": 2.4807, + "theoretical_loss": 3.315648660903184, + "tokens_seen": 3060881408 + }, + { + "epoch": 10.03, + "learning_rate": 3.6740220661985954e-05, + "loss": 2.2792, + "theoretical_loss": 3.315643224981523, + "tokens_seen": 3060946944 + }, + { + "epoch": 10.03, + "learning_rate": 3.673019057171514e-05, + "loss": 2.5139, + "theoretical_loss": 3.3156377892088322, + "tokens_seen": 3061012480 + }, + { + "epoch": 10.03, + "learning_rate": 3.672016048144433e-05, + "loss": 2.4498, + "theoretical_loss": 3.3156323535851056, + "tokens_seen": 3061078016 + }, + { + "epoch": 10.03, + "learning_rate": 3.671013039117352e-05, + "loss": 2.3728, + "theoretical_loss": 3.315626918110335, + "tokens_seen": 3061143552 + }, + { + "epoch": 10.03, + "learning_rate": 3.6700100300902706e-05, + "loss": 2.614, + "theoretical_loss": 3.3156214827845134, + "tokens_seen": 3061209088 + }, + { + "epoch": 10.03, + "learning_rate": 3.66900702106319e-05, + "loss": 2.4669, + "theoretical_loss": 3.3156160476076337, + "tokens_seen": 3061274624 + }, + { + "epoch": 10.03, + "learning_rate": 3.668004012036109e-05, + "loss": 2.6148, + "theoretical_loss": 3.3156106125796887, + "tokens_seen": 3061340160 + }, + { + "epoch": 10.03, + "learning_rate": 3.6670010030090277e-05, + "loss": 2.7849, + "theoretical_loss": 3.3156051777006708, + "tokens_seen": 3061405696 + }, + { + "epoch": 10.03, + "learning_rate": 3.6659979939819464e-05, + "loss": 2.7067, + "theoretical_loss": 3.315599742970573, + "tokens_seen": 3061471232 + }, + { + "epoch": 10.03, + "learning_rate": 3.6649949849548646e-05, + "loss": 2.6259, + "theoretical_loss": 3.3155943083893873, + "tokens_seen": 3061536768 + }, + { + "epoch": 10.03, + "learning_rate": 3.6639919759277833e-05, + "loss": 2.9174, + "theoretical_loss": 3.3155888739571076, + "tokens_seen": 3061602304 + }, + { + "epoch": 10.03, + "learning_rate": 3.662988966900702e-05, + "loss": 2.5141, + "theoretical_loss": 3.315583439673726, + "tokens_seen": 3061667840 + }, + { + "epoch": 10.03, + "learning_rate": 3.661985957873621e-05, + "loss": 2.4914, + "theoretical_loss": 3.315578005539235, + "tokens_seen": 3061733376 + }, + { + "epoch": 10.03, + "learning_rate": 3.66098294884654e-05, + "loss": 2.7271, + "theoretical_loss": 3.315572571553628, + "tokens_seen": 3061798912 + }, + { + "epoch": 10.03, + "learning_rate": 3.6599799398194585e-05, + "loss": 2.6303, + "theoretical_loss": 3.315567137716897, + "tokens_seen": 3061864448 + }, + { + "epoch": 10.03, + "learning_rate": 3.658976930792377e-05, + "loss": 2.5892, + "theoretical_loss": 3.3155617040290357, + "tokens_seen": 3061929984 + }, + { + "epoch": 10.03, + "learning_rate": 3.657973921765296e-05, + "loss": 2.6583, + "theoretical_loss": 3.3155562704900357, + "tokens_seen": 3061995520 + }, + { + "epoch": 10.03, + "learning_rate": 3.656970912738215e-05, + "loss": 2.3906, + "theoretical_loss": 3.3155508370998903, + "tokens_seen": 3062061056 + }, + { + "epoch": 10.03, + "learning_rate": 3.6559679037111337e-05, + "loss": 2.6916, + "theoretical_loss": 3.3155454038585925, + "tokens_seen": 3062126592 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3378556, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8506011962890625, + "objective/train/theoretical_loss": 3.315542687293759, + "objective/train/tokens_used": 3082619360, + "theoretical_loss": 3.315542687293759, + "tokens_seen": 3062159360 + }, + { + "epoch": 10.03, + "learning_rate": 3.654964894684052e-05, + "loss": 2.5859, + "theoretical_loss": 3.3155399707661344, + "tokens_seen": 3062192128 + }, + { + "epoch": 10.03, + "learning_rate": 3.6539618856569706e-05, + "loss": 2.629, + "theoretical_loss": 3.3155345378225096, + "tokens_seen": 3062257664 + }, + { + "epoch": 10.03, + "learning_rate": 3.6529588766298894e-05, + "loss": 2.5997, + "theoretical_loss": 3.31552910502771, + "tokens_seen": 3062323200 + }, + { + "epoch": 10.03, + "learning_rate": 3.651955867602808e-05, + "loss": 2.492, + "theoretical_loss": 3.3155236723817287, + "tokens_seen": 3062388736 + }, + { + "epoch": 10.03, + "learning_rate": 3.650952858575727e-05, + "loss": 2.6519, + "theoretical_loss": 3.3155182398845584, + "tokens_seen": 3062454272 + }, + { + "epoch": 10.03, + "learning_rate": 3.6499498495486464e-05, + "loss": 2.6952, + "theoretical_loss": 3.3155128075361917, + "tokens_seen": 3062519808 + }, + { + "epoch": 10.03, + "learning_rate": 3.648946840521565e-05, + "loss": 2.5659, + "theoretical_loss": 3.3155073753366215, + "tokens_seen": 3062585344 + }, + { + "epoch": 10.03, + "learning_rate": 3.647943831494484e-05, + "loss": 2.4512, + "theoretical_loss": 3.315501943285841, + "tokens_seen": 3062650880 + }, + { + "epoch": 10.03, + "learning_rate": 3.646940822467403e-05, + "loss": 2.5122, + "theoretical_loss": 3.315496511383842, + "tokens_seen": 3062716416 + }, + { + "epoch": 10.03, + "learning_rate": 3.645937813440321e-05, + "loss": 2.557, + "theoretical_loss": 3.315491079630618, + "tokens_seen": 3062781952 + }, + { + "epoch": 10.03, + "learning_rate": 3.64493480441324e-05, + "loss": 2.6726, + "theoretical_loss": 3.315485648026161, + "tokens_seen": 3062847488 + }, + { + "epoch": 10.03, + "learning_rate": 3.6439317953861585e-05, + "loss": 2.5919, + "theoretical_loss": 3.315480216570464, + "tokens_seen": 3062913024 + }, + { + "epoch": 10.03, + "learning_rate": 3.642928786359077e-05, + "loss": 2.5126, + "theoretical_loss": 3.3154747852635205, + "tokens_seen": 3062978560 + }, + { + "epoch": 10.03, + "learning_rate": 3.641925777331996e-05, + "loss": 2.7222, + "theoretical_loss": 3.3154693541053226, + "tokens_seen": 3063044096 + }, + { + "epoch": 10.03, + "learning_rate": 3.640922768304915e-05, + "loss": 2.7014, + "theoretical_loss": 3.315463923095863, + "tokens_seen": 3063109632 + }, + { + "epoch": 10.03, + "learning_rate": 3.6399197592778336e-05, + "loss": 2.6869, + "theoretical_loss": 3.315458492235135, + "tokens_seen": 3063175168 + }, + { + "epoch": 10.03, + "learning_rate": 3.6389167502507524e-05, + "loss": 2.7725, + "theoretical_loss": 3.3154530615231304, + "tokens_seen": 3063240704 + }, + { + "epoch": 10.03, + "learning_rate": 3.637913741223671e-05, + "loss": 2.4896, + "theoretical_loss": 3.3154476309598424, + "tokens_seen": 3063306240 + }, + { + "epoch": 10.03, + "learning_rate": 3.63691073219659e-05, + "loss": 2.4767, + "theoretical_loss": 3.315442200545264, + "tokens_seen": 3063371776 + }, + { + "epoch": 10.03, + "learning_rate": 3.635907723169508e-05, + "loss": 2.6047, + "theoretical_loss": 3.315436770279388, + "tokens_seen": 3063437312 + }, + { + "epoch": 10.03, + "learning_rate": 3.634904714142427e-05, + "loss": 2.7765, + "theoretical_loss": 3.315431340162206, + "tokens_seen": 3063502848 + }, + { + "epoch": 10.03, + "learning_rate": 3.633901705115346e-05, + "loss": 2.6545, + "theoretical_loss": 3.3154259101937127, + "tokens_seen": 3063568384 + }, + { + "epoch": 10.03, + "learning_rate": 3.6328986960882645e-05, + "loss": 2.3558, + "theoretical_loss": 3.3154204803738994, + "tokens_seen": 3063633920 + }, + { + "epoch": 10.03, + "learning_rate": 3.631895687061183e-05, + "loss": 2.3862, + "theoretical_loss": 3.315415050702759, + "tokens_seen": 3063699456 + }, + { + "epoch": 10.03, + "learning_rate": 3.630892678034103e-05, + "loss": 2.5978, + "theoretical_loss": 3.315409621180285, + "tokens_seen": 3063764992 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3379996, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.734600782394409, + "objective/train/theoretical_loss": 3.315406906474795, + "objective/train/tokens_used": 3084257760, + "theoretical_loss": 3.315406906474795, + "tokens_seen": 3063797760 + }, + { + "epoch": 10.03, + "learning_rate": 3.6298896690070215e-05, + "loss": 2.6012, + "theoretical_loss": 3.315404191806469, + "tokens_seen": 3063830528 + }, + { + "epoch": 10.03, + "learning_rate": 3.62888665997994e-05, + "loss": 2.5595, + "theoretical_loss": 3.315398762581305, + "tokens_seen": 3063896064 + }, + { + "epoch": 10.03, + "learning_rate": 3.627883650952859e-05, + "loss": 2.3738, + "theoretical_loss": 3.3153933335047845, + "tokens_seen": 3063961600 + }, + { + "epoch": 10.03, + "learning_rate": 3.626880641925778e-05, + "loss": 2.6397, + "theoretical_loss": 3.315387904576901, + "tokens_seen": 3064027136 + }, + { + "epoch": 10.03, + "learning_rate": 3.625877632898696e-05, + "loss": 2.7675, + "theoretical_loss": 3.3153824757976476, + "tokens_seen": 3064092672 + }, + { + "epoch": 10.03, + "learning_rate": 3.624874623871615e-05, + "loss": 2.6943, + "theoretical_loss": 3.3153770471670163, + "tokens_seen": 3064158208 + }, + { + "epoch": 10.03, + "learning_rate": 3.6238716148445336e-05, + "loss": 2.702, + "theoretical_loss": 3.315371618685, + "tokens_seen": 3064223744 + }, + { + "epoch": 10.03, + "learning_rate": 3.6228686058174524e-05, + "loss": 2.5974, + "theoretical_loss": 3.315366190351592, + "tokens_seen": 3064289280 + }, + { + "epoch": 10.03, + "learning_rate": 3.621865596790371e-05, + "loss": 2.6614, + "theoretical_loss": 3.315360762166784, + "tokens_seen": 3064354816 + }, + { + "epoch": 10.03, + "learning_rate": 3.62086258776329e-05, + "loss": 2.4234, + "theoretical_loss": 3.31535533413057, + "tokens_seen": 3064420352 + }, + { + "epoch": 10.03, + "learning_rate": 3.619859578736209e-05, + "loss": 2.2487, + "theoretical_loss": 3.315349906242942, + "tokens_seen": 3064485888 + }, + { + "epoch": 10.03, + "learning_rate": 3.6188565697091275e-05, + "loss": 2.5851, + "theoretical_loss": 3.315344478503893, + "tokens_seen": 3064551424 + }, + { + "epoch": 10.03, + "learning_rate": 3.617853560682046e-05, + "loss": 2.4548, + "theoretical_loss": 3.3153390509134155, + "tokens_seen": 3064616960 + }, + { + "epoch": 10.03, + "learning_rate": 3.616850551654965e-05, + "loss": 2.6976, + "theoretical_loss": 3.3153336234715027, + "tokens_seen": 3064682496 + }, + { + "epoch": 10.03, + "learning_rate": 3.615847542627883e-05, + "loss": 2.5044, + "theoretical_loss": 3.3153281961781467, + "tokens_seen": 3064748032 + }, + { + "epoch": 10.03, + "learning_rate": 3.614844533600802e-05, + "loss": 2.7738, + "theoretical_loss": 3.3153227690333407, + "tokens_seen": 3064813568 + }, + { + "epoch": 10.03, + "learning_rate": 3.613841524573721e-05, + "loss": 2.5932, + "theoretical_loss": 3.3153173420370776, + "tokens_seen": 3064879104 + }, + { + "epoch": 10.03, + "learning_rate": 3.61283851554664e-05, + "loss": 2.7003, + "theoretical_loss": 3.3153119151893495, + "tokens_seen": 3064944640 + }, + { + "epoch": 10.03, + "learning_rate": 3.611835506519559e-05, + "loss": 2.4409, + "theoretical_loss": 3.31530648849015, + "tokens_seen": 3065010176 + }, + { + "epoch": 10.03, + "learning_rate": 3.610832497492478e-05, + "loss": 2.6821, + "theoretical_loss": 3.3153010619394716, + "tokens_seen": 3065075712 + }, + { + "epoch": 10.03, + "learning_rate": 3.6098294884653966e-05, + "loss": 2.7932, + "theoretical_loss": 3.315295635537307, + "tokens_seen": 3065141248 + }, + { + "epoch": 10.03, + "learning_rate": 3.6088264794383154e-05, + "loss": 2.6387, + "theoretical_loss": 3.3152902092836483, + "tokens_seen": 3065206784 + }, + { + "epoch": 10.03, + "learning_rate": 3.607823470411234e-05, + "loss": 2.6, + "theoretical_loss": 3.3152847831784893, + "tokens_seen": 3065272320 + }, + { + "epoch": 10.03, + "learning_rate": 3.606820461384152e-05, + "loss": 2.5381, + "theoretical_loss": 3.3152793572218227, + "tokens_seen": 3065337856 + }, + { + "epoch": 10.03, + "learning_rate": 3.605817452357071e-05, + "loss": 2.6331, + "theoretical_loss": 3.31527393141364, + "tokens_seen": 3065403392 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3380629, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.809659719467163, + "objective/train/theoretical_loss": 3.3152712185652287, + "objective/train/tokens_used": 3085896160, + "theoretical_loss": 3.3152712185652287, + "tokens_seen": 3065436160 + }, + { + "epoch": 10.03, + "learning_rate": 3.60481444332999e-05, + "loss": 2.6314, + "theoretical_loss": 3.3152685057539353, + "tokens_seen": 3065468928 + }, + { + "epoch": 10.03, + "learning_rate": 3.603811434302909e-05, + "loss": 2.7368, + "theoretical_loss": 3.315263080242701, + "tokens_seen": 3065534464 + }, + { + "epoch": 10.03, + "learning_rate": 3.6028084252758275e-05, + "loss": 2.4871, + "theoretical_loss": 3.3152576548799297, + "tokens_seen": 3065600000 + }, + { + "epoch": 10.03, + "learning_rate": 3.601805416248746e-05, + "loss": 2.6151, + "theoretical_loss": 3.315252229665614, + "tokens_seen": 3065665536 + }, + { + "epoch": 10.03, + "learning_rate": 3.600802407221665e-05, + "loss": 2.5075, + "theoretical_loss": 3.315246804599747, + "tokens_seen": 3065731072 + }, + { + "epoch": 10.03, + "learning_rate": 3.599799398194584e-05, + "loss": 2.7038, + "theoretical_loss": 3.3152413796823215, + "tokens_seen": 3065796608 + }, + { + "epoch": 10.03, + "learning_rate": 3.5987963891675026e-05, + "loss": 2.4341, + "theoretical_loss": 3.31523595491333, + "tokens_seen": 3065862144 + }, + { + "epoch": 10.03, + "learning_rate": 3.5977933801404214e-05, + "loss": 2.4312, + "theoretical_loss": 3.3152305302927654, + "tokens_seen": 3065927680 + }, + { + "epoch": 10.03, + "learning_rate": 3.5967903711133395e-05, + "loss": 2.4835, + "theoretical_loss": 3.3152251058206206, + "tokens_seen": 3065993216 + }, + { + "epoch": 10.03, + "learning_rate": 3.595787362086258e-05, + "loss": 2.4927, + "theoretical_loss": 3.315219681496888, + "tokens_seen": 3066058752 + }, + { + "epoch": 10.03, + "learning_rate": 3.594784353059177e-05, + "loss": 2.6708, + "theoretical_loss": 3.3152142573215606, + "tokens_seen": 3066124288 + }, + { + "epoch": 10.03, + "learning_rate": 3.5937813440320966e-05, + "loss": 2.462, + "theoretical_loss": 3.3152088332946312, + "tokens_seen": 3066189824 + }, + { + "epoch": 10.03, + "learning_rate": 3.5927783350050154e-05, + "loss": 2.7397, + "theoretical_loss": 3.3152034094160925, + "tokens_seen": 3066255360 + }, + { + "epoch": 10.03, + "learning_rate": 3.591775325977934e-05, + "loss": 2.7767, + "theoretical_loss": 3.3151979856859377, + "tokens_seen": 3066320896 + }, + { + "epoch": 10.03, + "learning_rate": 3.590772316950853e-05, + "loss": 2.6374, + "theoretical_loss": 3.3151925621041585, + "tokens_seen": 3066386432 + }, + { + "epoch": 10.03, + "learning_rate": 3.589769307923772e-05, + "loss": 2.4367, + "theoretical_loss": 3.315187138670749, + "tokens_seen": 3066451968 + }, + { + "epoch": 10.03, + "learning_rate": 3.5887662988966905e-05, + "loss": 2.4306, + "theoretical_loss": 3.315181715385701, + "tokens_seen": 3066517504 + }, + { + "epoch": 10.03, + "learning_rate": 3.587763289869609e-05, + "loss": 2.6201, + "theoretical_loss": 3.3151762922490073, + "tokens_seen": 3066583040 + }, + { + "epoch": 10.03, + "learning_rate": 3.5867602808425274e-05, + "loss": 2.5016, + "theoretical_loss": 3.315170869260661, + "tokens_seen": 3066648576 + }, + { + "epoch": 10.03, + "learning_rate": 3.585757271815446e-05, + "loss": 2.6476, + "theoretical_loss": 3.3151654464206555, + "tokens_seen": 3066714112 + }, + { + "epoch": 10.03, + "learning_rate": 3.584754262788365e-05, + "loss": 2.6513, + "theoretical_loss": 3.315160023728982, + "tokens_seen": 3066779648 + }, + { + "epoch": 10.03, + "learning_rate": 3.583751253761284e-05, + "loss": 2.5329, + "theoretical_loss": 3.3151546011856348, + "tokens_seen": 3066845184 + }, + { + "epoch": 10.03, + "learning_rate": 3.5827482447342026e-05, + "loss": 2.5861, + "theoretical_loss": 3.3151491787906058, + "tokens_seen": 3066910720 + }, + { + "epoch": 10.03, + "learning_rate": 3.5817452357071214e-05, + "loss": 2.7437, + "theoretical_loss": 3.315143756543888, + "tokens_seen": 3066976256 + }, + { + "epoch": 10.03, + "learning_rate": 3.58074222668004e-05, + "loss": 2.4047, + "theoretical_loss": 3.315138334445474, + "tokens_seen": 3067041792 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3381857, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.701077461242676, + "objective/train/theoretical_loss": 3.3151356234518787, + "objective/train/tokens_used": 3087534560, + "theoretical_loss": 3.3151356234518787, + "tokens_seen": 3067074560 + }, + { + "epoch": 10.03, + "learning_rate": 3.579739217652959e-05, + "loss": 2.452, + "theoretical_loss": 3.315132912495357, + "tokens_seen": 3067107328 + }, + { + "epoch": 10.03, + "learning_rate": 3.578736208625878e-05, + "loss": 2.5153, + "theoretical_loss": 3.3151274906935297, + "tokens_seen": 3067172864 + }, + { + "epoch": 10.03, + "learning_rate": 3.577733199598796e-05, + "loss": 2.4243, + "theoretical_loss": 3.315122069039984, + "tokens_seen": 3067238400 + }, + { + "epoch": 10.03, + "learning_rate": 3.5767301905717147e-05, + "loss": 2.6503, + "theoretical_loss": 3.315116647534714, + "tokens_seen": 3067303936 + }, + { + "epoch": 10.03, + "learning_rate": 3.5757271815446334e-05, + "loss": 2.6533, + "theoretical_loss": 3.315111226177712, + "tokens_seen": 3067369472 + }, + { + "epoch": 10.03, + "learning_rate": 3.574724172517553e-05, + "loss": 2.6061, + "theoretical_loss": 3.3151058049689706, + "tokens_seen": 3067435008 + }, + { + "epoch": 10.03, + "learning_rate": 3.573721163490472e-05, + "loss": 2.7146, + "theoretical_loss": 3.315100383908482, + "tokens_seen": 3067500544 + }, + { + "epoch": 10.03, + "learning_rate": 3.5727181544633905e-05, + "loss": 2.5853, + "theoretical_loss": 3.3150949629962403, + "tokens_seen": 3067566080 + }, + { + "epoch": 10.03, + "learning_rate": 3.571715145436309e-05, + "loss": 2.7125, + "theoretical_loss": 3.3150895422322373, + "tokens_seen": 3067631616 + }, + { + "epoch": 10.03, + "learning_rate": 3.570712136409228e-05, + "loss": 2.6813, + "theoretical_loss": 3.315084121616466, + "tokens_seen": 3067697152 + }, + { + "epoch": 10.03, + "learning_rate": 3.569709127382147e-05, + "loss": 2.7678, + "theoretical_loss": 3.315078701148919, + "tokens_seen": 3067762688 + }, + { + "epoch": 10.03, + "learning_rate": 3.5687061183550657e-05, + "loss": 2.6093, + "theoretical_loss": 3.3150732808295897, + "tokens_seen": 3067828224 + }, + { + "epoch": 10.03, + "learning_rate": 3.567703109327984e-05, + "loss": 2.7784, + "theoretical_loss": 3.3150678606584707, + "tokens_seen": 3067893760 + }, + { + "epoch": 10.03, + "learning_rate": 3.5667001003009026e-05, + "loss": 2.7297, + "theoretical_loss": 3.315062440635554, + "tokens_seen": 3067959296 + }, + { + "epoch": 10.03, + "learning_rate": 3.5656970912738213e-05, + "loss": 2.5424, + "theoretical_loss": 3.3150570207608334, + "tokens_seen": 3068024832 + }, + { + "epoch": 10.03, + "learning_rate": 3.56469408224674e-05, + "loss": 2.6874, + "theoretical_loss": 3.315051601034301, + "tokens_seen": 3068090368 + }, + { + "epoch": 10.03, + "learning_rate": 3.563691073219659e-05, + "loss": 2.6363, + "theoretical_loss": 3.31504618145595, + "tokens_seen": 3068155904 + }, + { + "epoch": 10.03, + "learning_rate": 3.562688064192578e-05, + "loss": 2.4527, + "theoretical_loss": 3.315040762025773, + "tokens_seen": 3068221440 + }, + { + "epoch": 10.03, + "learning_rate": 3.5616850551654965e-05, + "loss": 2.4938, + "theoretical_loss": 3.315035342743762, + "tokens_seen": 3068286976 + }, + { + "epoch": 10.03, + "learning_rate": 3.560682046138415e-05, + "loss": 2.7912, + "theoretical_loss": 3.3150299236099117, + "tokens_seen": 3068352512 + }, + { + "epoch": 10.03, + "learning_rate": 3.559679037111334e-05, + "loss": 2.5837, + "theoretical_loss": 3.3150245046242137, + "tokens_seen": 3068418048 + }, + { + "epoch": 10.03, + "learning_rate": 3.558676028084253e-05, + "loss": 2.6532, + "theoretical_loss": 3.3150190857866604, + "tokens_seen": 3068483584 + }, + { + "epoch": 10.03, + "learning_rate": 3.557673019057171e-05, + "loss": 2.707, + "theoretical_loss": 3.315013667097245, + "tokens_seen": 3068549120 + }, + { + "epoch": 10.03, + "learning_rate": 3.5566700100300904e-05, + "loss": 2.486, + "theoretical_loss": 3.3150082485559604, + "tokens_seen": 3068614656 + }, + { + "epoch": 10.03, + "learning_rate": 3.555667001003009e-05, + "loss": 2.3435, + "theoretical_loss": 3.3150028301627996, + "tokens_seen": 3068680192 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3382648, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.387377977371216, + "objective/train/theoretical_loss": 3.3150001210217632, + "objective/train/tokens_used": 3089172960, + "theoretical_loss": 3.3150001210217632, + "tokens_seen": 3068712960 + }, + { + "epoch": 10.03, + "learning_rate": 3.554663991975928e-05, + "loss": 2.6551, + "theoretical_loss": 3.314997411917755, + "tokens_seen": 3068745728 + }, + { + "epoch": 10.03, + "learning_rate": 3.553660982948847e-05, + "loss": 2.6055, + "theoretical_loss": 3.3149919938208194, + "tokens_seen": 3068811264 + }, + { + "epoch": 10.03, + "learning_rate": 3.5526579739217656e-05, + "loss": 2.6261, + "theoretical_loss": 3.3149865758719854, + "tokens_seen": 3068876800 + }, + { + "epoch": 10.03, + "learning_rate": 3.5516549648946844e-05, + "loss": 2.3841, + "theoretical_loss": 3.3149811580712463, + "tokens_seen": 3068942336 + }, + { + "epoch": 10.03, + "learning_rate": 3.550651955867603e-05, + "loss": 2.5355, + "theoretical_loss": 3.314975740418595, + "tokens_seen": 3069007872 + }, + { + "epoch": 10.03, + "learning_rate": 3.549648946840522e-05, + "loss": 2.6366, + "theoretical_loss": 3.3149703229140233, + "tokens_seen": 3069073408 + }, + { + "epoch": 10.03, + "learning_rate": 3.548645937813441e-05, + "loss": 2.3894, + "theoretical_loss": 3.3149649055575248, + "tokens_seen": 3069138944 + }, + { + "epoch": 10.03, + "learning_rate": 3.547642928786359e-05, + "loss": 2.6157, + "theoretical_loss": 3.3149594883490927, + "tokens_seen": 3069204480 + }, + { + "epoch": 10.03, + "learning_rate": 3.546639919759278e-05, + "loss": 2.4312, + "theoretical_loss": 3.3149540712887187, + "tokens_seen": 3069270016 + }, + { + "epoch": 10.03, + "learning_rate": 3.5456369107321965e-05, + "loss": 2.3954, + "theoretical_loss": 3.314948654376396, + "tokens_seen": 3069335552 + }, + { + "epoch": 10.03, + "learning_rate": 3.544633901705115e-05, + "loss": 2.7088, + "theoretical_loss": 3.314943237612118, + "tokens_seen": 3069401088 + }, + { + "epoch": 10.03, + "learning_rate": 3.543630892678034e-05, + "loss": 2.3965, + "theoretical_loss": 3.3149378209958766, + "tokens_seen": 3069466624 + }, + { + "epoch": 10.03, + "learning_rate": 3.542627883650953e-05, + "loss": 2.5094, + "theoretical_loss": 3.314932404527665, + "tokens_seen": 3069532160 + }, + { + "epoch": 10.03, + "learning_rate": 3.5416248746238716e-05, + "loss": 2.7375, + "theoretical_loss": 3.3149269882074757, + "tokens_seen": 3069597696 + }, + { + "epoch": 10.03, + "learning_rate": 3.5406218655967904e-05, + "loss": 2.6446, + "theoretical_loss": 3.3149215720353022, + "tokens_seen": 3069663232 + }, + { + "epoch": 10.03, + "learning_rate": 3.539618856569709e-05, + "loss": 2.705, + "theoretical_loss": 3.314916156011137, + "tokens_seen": 3069728768 + }, + { + "epoch": 10.03, + "learning_rate": 3.538615847542627e-05, + "loss": 2.5706, + "theoretical_loss": 3.3149107401349727, + "tokens_seen": 3069794304 + }, + { + "epoch": 10.03, + "learning_rate": 3.537612838515547e-05, + "loss": 2.5271, + "theoretical_loss": 3.314905324406802, + "tokens_seen": 3069859840 + }, + { + "epoch": 10.03, + "learning_rate": 3.5366098294884656e-05, + "loss": 2.6281, + "theoretical_loss": 3.314899908826618, + "tokens_seen": 3069925376 + }, + { + "epoch": 10.03, + "learning_rate": 3.5356068204613844e-05, + "loss": 2.4478, + "theoretical_loss": 3.314894493394413, + "tokens_seen": 3069990912 + }, + { + "epoch": 10.03, + "learning_rate": 3.534603811434303e-05, + "loss": 2.5047, + "theoretical_loss": 3.3148890781101805, + "tokens_seen": 3070056448 + }, + { + "epoch": 10.03, + "learning_rate": 3.533600802407222e-05, + "loss": 2.6028, + "theoretical_loss": 3.314883662973913, + "tokens_seen": 3070121984 + }, + { + "epoch": 10.03, + "learning_rate": 3.532597793380141e-05, + "loss": 2.4237, + "theoretical_loss": 3.314878247985603, + "tokens_seen": 3070187520 + }, + { + "epoch": 10.03, + "learning_rate": 3.5315947843530595e-05, + "loss": 2.4604, + "theoretical_loss": 3.3148728331452437, + "tokens_seen": 3070253056 + }, + { + "epoch": 10.03, + "learning_rate": 3.530591775325978e-05, + "loss": 2.3103, + "theoretical_loss": 3.3148674184528275, + "tokens_seen": 3070318592 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3387908, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.244450807571411, + "objective/train/theoretical_loss": 3.3148647111620964, + "objective/train/tokens_used": 3090811360, + "theoretical_loss": 3.3148647111620964, + "tokens_seen": 3070351360 + }, + { + "epoch": 10.03, + "learning_rate": 3.529588766298897e-05, + "loss": 2.4981, + "theoretical_loss": 3.314862003908348, + "tokens_seen": 3070384128 + }, + { + "epoch": 10.03, + "learning_rate": 3.528585757271815e-05, + "loss": 2.5398, + "theoretical_loss": 3.314856589511797, + "tokens_seen": 3070449664 + }, + { + "epoch": 10.03, + "learning_rate": 3.527582748244734e-05, + "loss": 2.5453, + "theoretical_loss": 3.314851175263168, + "tokens_seen": 3070515200 + }, + { + "epoch": 10.03, + "learning_rate": 3.526579739217653e-05, + "loss": 2.6529, + "theoretical_loss": 3.3148457611624536, + "tokens_seen": 3070580736 + }, + { + "epoch": 10.03, + "learning_rate": 3.5255767301905716e-05, + "loss": 2.4909, + "theoretical_loss": 3.314840347209646, + "tokens_seen": 3070646272 + }, + { + "epoch": 10.03, + "learning_rate": 3.5245737211634904e-05, + "loss": 2.4957, + "theoretical_loss": 3.314834933404739, + "tokens_seen": 3070711808 + }, + { + "epoch": 10.03, + "learning_rate": 3.523570712136409e-05, + "loss": 2.3944, + "theoretical_loss": 3.314829519747725, + "tokens_seen": 3070777344 + }, + { + "epoch": 10.03, + "learning_rate": 3.522567703109328e-05, + "loss": 2.4768, + "theoretical_loss": 3.314824106238597, + "tokens_seen": 3070842880 + }, + { + "epoch": 10.03, + "learning_rate": 3.521564694082247e-05, + "loss": 2.4237, + "theoretical_loss": 3.314818692877347, + "tokens_seen": 3070908416 + }, + { + "epoch": 10.03, + "learning_rate": 3.5205616850551655e-05, + "loss": 2.5986, + "theoretical_loss": 3.3148132796639684, + "tokens_seen": 3070973952 + }, + { + "epoch": 10.03, + "learning_rate": 3.519558676028084e-05, + "loss": 2.4005, + "theoretical_loss": 3.3148078665984544, + "tokens_seen": 3071039488 + }, + { + "epoch": 10.03, + "learning_rate": 3.518555667001003e-05, + "loss": 2.4117, + "theoretical_loss": 3.314802453680797, + "tokens_seen": 3071105024 + }, + { + "epoch": 10.03, + "learning_rate": 3.517552657973922e-05, + "loss": 2.4797, + "theoretical_loss": 3.3147970409109897, + "tokens_seen": 3071170560 + }, + { + "epoch": 10.03, + "learning_rate": 3.516549648946841e-05, + "loss": 2.4822, + "theoretical_loss": 3.314791628289025, + "tokens_seen": 3071236096 + }, + { + "epoch": 10.03, + "learning_rate": 3.5155466399197595e-05, + "loss": 2.4047, + "theoretical_loss": 3.3147862158148955, + "tokens_seen": 3071301632 + }, + { + "epoch": 10.03, + "learning_rate": 3.514543630892678e-05, + "loss": 2.4122, + "theoretical_loss": 3.3147808034885946, + "tokens_seen": 3071367168 + }, + { + "epoch": 10.03, + "learning_rate": 3.513540621865597e-05, + "loss": 2.308, + "theoretical_loss": 3.314775391310114, + "tokens_seen": 3071432704 + }, + { + "epoch": 10.03, + "learning_rate": 3.512537612838516e-05, + "loss": 2.4423, + "theoretical_loss": 3.314769979279448, + "tokens_seen": 3071498240 + }, + { + "epoch": 10.03, + "learning_rate": 3.5115346038114346e-05, + "loss": 2.5696, + "theoretical_loss": 3.314764567396588, + "tokens_seen": 3071563776 + }, + { + "epoch": 10.03, + "learning_rate": 3.5105315947843534e-05, + "loss": 2.5292, + "theoretical_loss": 3.314759155661528, + "tokens_seen": 3071629312 + }, + { + "epoch": 10.03, + "learning_rate": 3.509528585757272e-05, + "loss": 2.6493, + "theoretical_loss": 3.31475374407426, + "tokens_seen": 3071694848 + }, + { + "epoch": 10.03, + "learning_rate": 3.50852557673019e-05, + "loss": 2.6475, + "theoretical_loss": 3.3147483326347773, + "tokens_seen": 3071760384 + }, + { + "epoch": 10.03, + "learning_rate": 3.507522567703109e-05, + "loss": 2.4483, + "theoretical_loss": 3.314742921343072, + "tokens_seen": 3071825920 + }, + { + "epoch": 10.03, + "learning_rate": 3.506519558676028e-05, + "loss": 2.5932, + "theoretical_loss": 3.3147375101991376, + "tokens_seen": 3071891456 + }, + { + "epoch": 10.03, + "learning_rate": 3.505516549648947e-05, + "loss": 2.5302, + "theoretical_loss": 3.314732099202967, + "tokens_seen": 3071956992 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3393093, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4069104194641113, + "objective/train/theoretical_loss": 3.314729393760291, + "objective/train/tokens_used": 3092449760, + "theoretical_loss": 3.314729393760291, + "tokens_seen": 3071989760 + }, + { + "epoch": 10.03, + "learning_rate": 3.5045135406218655e-05, + "loss": 2.6174, + "theoretical_loss": 3.3147266883545523, + "tokens_seen": 3072022528 + }, + { + "epoch": 10.03, + "learning_rate": 3.503510531594784e-05, + "loss": 2.3325, + "theoretical_loss": 3.314721277653887, + "tokens_seen": 3072088064 + }, + { + "epoch": 10.03, + "learning_rate": 3.502507522567703e-05, + "loss": 2.4701, + "theoretical_loss": 3.3147158671009636, + "tokens_seen": 3072153600 + }, + { + "epoch": 10.03, + "learning_rate": 3.501504513540622e-05, + "loss": 2.7299, + "theoretical_loss": 3.3147104566957752, + "tokens_seen": 3072219136 + }, + { + "epoch": 10.03, + "learning_rate": 3.5005015045135406e-05, + "loss": 2.7447, + "theoretical_loss": 3.3147050464383137, + "tokens_seen": 3072284672 + }, + { + "epoch": 10.03, + "learning_rate": 3.4994984954864594e-05, + "loss": 2.5333, + "theoretical_loss": 3.314699636328573, + "tokens_seen": 3072350208 + }, + { + "epoch": 10.03, + "learning_rate": 3.498495486459378e-05, + "loss": 2.4456, + "theoretical_loss": 3.3146942263665458, + "tokens_seen": 3072415744 + }, + { + "epoch": 10.03, + "learning_rate": 3.497492477432297e-05, + "loss": 2.4964, + "theoretical_loss": 3.3146888165522244, + "tokens_seen": 3072481280 + }, + { + "epoch": 10.03, + "learning_rate": 3.496489468405216e-05, + "loss": 2.5736, + "theoretical_loss": 3.3146834068856017, + "tokens_seen": 3072546816 + }, + { + "epoch": 10.03, + "learning_rate": 3.4954864593781346e-05, + "loss": 2.6902, + "theoretical_loss": 3.3146779973666707, + "tokens_seen": 3072612352 + }, + { + "epoch": 10.03, + "learning_rate": 3.4944834503510534e-05, + "loss": 2.6744, + "theoretical_loss": 3.314672587995424, + "tokens_seen": 3072677888 + }, + { + "epoch": 10.03, + "learning_rate": 3.493480441323972e-05, + "loss": 2.5913, + "theoretical_loss": 3.314667178771855, + "tokens_seen": 3072743424 + }, + { + "epoch": 10.03, + "learning_rate": 3.492477432296891e-05, + "loss": 2.4138, + "theoretical_loss": 3.314661769695956, + "tokens_seen": 3072808960 + }, + { + "epoch": 10.03, + "learning_rate": 3.49147442326981e-05, + "loss": 2.4802, + "theoretical_loss": 3.31465636076772, + "tokens_seen": 3072874496 + }, + { + "epoch": 10.03, + "learning_rate": 3.4904714142427285e-05, + "loss": 2.5388, + "theoretical_loss": 3.314650951987139, + "tokens_seen": 3072940032 + }, + { + "epoch": 10.03, + "learning_rate": 3.4894684052156466e-05, + "loss": 2.6439, + "theoretical_loss": 3.3146455433542075, + "tokens_seen": 3073005568 + }, + { + "epoch": 10.03, + "learning_rate": 3.4884653961885654e-05, + "loss": 2.3478, + "theoretical_loss": 3.314640134868917, + "tokens_seen": 3073071104 + }, + { + "epoch": 10.03, + "learning_rate": 3.487462387161484e-05, + "loss": 2.4324, + "theoretical_loss": 3.314634726531261, + "tokens_seen": 3073136640 + }, + { + "epoch": 10.03, + "learning_rate": 3.486459378134403e-05, + "loss": 2.6459, + "theoretical_loss": 3.3146293183412316, + "tokens_seen": 3073202176 + }, + { + "epoch": 10.03, + "learning_rate": 3.485456369107322e-05, + "loss": 2.3593, + "theoretical_loss": 3.3146239102988226, + "tokens_seen": 3073267712 + }, + { + "epoch": 10.03, + "learning_rate": 3.4844533600802406e-05, + "loss": 2.6698, + "theoretical_loss": 3.314618502404026, + "tokens_seen": 3073333248 + }, + { + "epoch": 10.03, + "learning_rate": 3.4834503510531594e-05, + "loss": 2.519, + "theoretical_loss": 3.314613094656835, + "tokens_seen": 3073398784 + }, + { + "epoch": 10.03, + "learning_rate": 3.482447342026078e-05, + "loss": 2.6926, + "theoretical_loss": 3.314607687057242, + "tokens_seen": 3073464320 + }, + { + "epoch": 10.03, + "learning_rate": 3.4814443329989976e-05, + "loss": 2.4796, + "theoretical_loss": 3.31460227960524, + "tokens_seen": 3073529856 + }, + { + "epoch": 10.03, + "learning_rate": 3.4804413239719164e-05, + "loss": 2.4027, + "theoretical_loss": 3.3145968723008226, + "tokens_seen": 3073595392 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3398116, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5023627281188965, + "objective/train/theoretical_loss": 3.3145941687039553, + "objective/train/tokens_used": 3094088160, + "theoretical_loss": 3.3145941687039553, + "tokens_seen": 3073628160 + }, + { + "epoch": 10.03, + "learning_rate": 3.4794383149448345e-05, + "loss": 2.5184, + "theoretical_loss": 3.3145914651439816, + "tokens_seen": 3073660928 + }, + { + "epoch": 10.03, + "learning_rate": 3.478435305917753e-05, + "loss": 2.5258, + "theoretical_loss": 3.3145860581347106, + "tokens_seen": 3073726464 + }, + { + "epoch": 10.03, + "learning_rate": 3.477432296890672e-05, + "loss": 2.2672, + "theoretical_loss": 3.3145806512730016, + "tokens_seen": 3073792000 + }, + { + "epoch": 10.03, + "learning_rate": 3.476429287863591e-05, + "loss": 2.5573, + "theoretical_loss": 3.3145752445588483, + "tokens_seen": 3073857536 + }, + { + "epoch": 10.03, + "learning_rate": 3.47542627883651e-05, + "loss": 2.6749, + "theoretical_loss": 3.314569837992243, + "tokens_seen": 3073923072 + }, + { + "epoch": 10.03, + "learning_rate": 3.4744232698094285e-05, + "loss": 2.4526, + "theoretical_loss": 3.3145644315731784, + "tokens_seen": 3073988608 + }, + { + "epoch": 10.03, + "learning_rate": 3.473420260782347e-05, + "loss": 2.5849, + "theoretical_loss": 3.3145590253016475, + "tokens_seen": 3074054144 + }, + { + "epoch": 10.03, + "learning_rate": 3.472417251755266e-05, + "loss": 2.4337, + "theoretical_loss": 3.3145536191776435, + "tokens_seen": 3074119680 + }, + { + "epoch": 10.03, + "learning_rate": 3.471414242728185e-05, + "loss": 2.4739, + "theoretical_loss": 3.3145482132011588, + "tokens_seen": 3074185216 + }, + { + "epoch": 10.03, + "learning_rate": 3.4704112337011036e-05, + "loss": 2.501, + "theoretical_loss": 3.3145428073721863, + "tokens_seen": 3074250752 + }, + { + "epoch": 10.03, + "learning_rate": 3.469408224674022e-05, + "loss": 2.4435, + "theoretical_loss": 3.314537401690719, + "tokens_seen": 3074316288 + }, + { + "epoch": 10.03, + "learning_rate": 3.4684052156469406e-05, + "loss": 2.6181, + "theoretical_loss": 3.3145319961567496, + "tokens_seen": 3074381824 + }, + { + "epoch": 10.03, + "learning_rate": 3.4674022066198593e-05, + "loss": 2.5262, + "theoretical_loss": 3.314526590770271, + "tokens_seen": 3074447360 + }, + { + "epoch": 10.03, + "learning_rate": 3.466399197592778e-05, + "loss": 2.6675, + "theoretical_loss": 3.3145211855312757, + "tokens_seen": 3074512896 + }, + { + "epoch": 10.03, + "learning_rate": 3.465396188565697e-05, + "loss": 2.5861, + "theoretical_loss": 3.314515780439757, + "tokens_seen": 3074578432 + }, + { + "epoch": 10.03, + "learning_rate": 3.464393179538616e-05, + "loss": 2.5968, + "theoretical_loss": 3.3145103754957077, + "tokens_seen": 3074643968 + }, + { + "epoch": 10.03, + "learning_rate": 3.4633901705115345e-05, + "loss": 2.2867, + "theoretical_loss": 3.31450497069912, + "tokens_seen": 3074709504 + }, + { + "epoch": 10.03, + "learning_rate": 3.462387161484454e-05, + "loss": 2.5636, + "theoretical_loss": 3.314499566049988, + "tokens_seen": 3074775040 + }, + { + "epoch": 10.03, + "learning_rate": 3.461384152457373e-05, + "loss": 2.6366, + "theoretical_loss": 3.314494161548303, + "tokens_seen": 3074840576 + }, + { + "epoch": 10.03, + "learning_rate": 3.460381143430291e-05, + "loss": 2.4862, + "theoretical_loss": 3.314488757194059, + "tokens_seen": 3074906112 + }, + { + "epoch": 10.03, + "learning_rate": 3.4593781344032097e-05, + "loss": 2.7482, + "theoretical_loss": 3.3144833529872484, + "tokens_seen": 3074971648 + }, + { + "epoch": 10.03, + "learning_rate": 3.4583751253761284e-05, + "loss": 2.706, + "theoretical_loss": 3.3144779489278635, + "tokens_seen": 3075037184 + }, + { + "epoch": 10.03, + "learning_rate": 3.457372116349047e-05, + "loss": 2.6484, + "theoretical_loss": 3.3144725450158985, + "tokens_seen": 3075102720 + }, + { + "epoch": 10.03, + "learning_rate": 3.456369107321966e-05, + "loss": 2.5069, + "theoretical_loss": 3.314467141251345, + "tokens_seen": 3075168256 + }, + { + "epoch": 10.03, + "learning_rate": 3.455366098294885e-05, + "loss": 2.479, + "theoretical_loss": 3.314461737634196, + "tokens_seen": 3075233792 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3403291, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6900172233581543, + "objective/train/theoretical_loss": 3.3144590358808963, + "objective/train/tokens_used": 3095726560, + "theoretical_loss": 3.3144590358808963, + "tokens_seen": 3075266560 + }, + { + "epoch": 10.03, + "learning_rate": 3.4543630892678036e-05, + "loss": 2.5528, + "theoretical_loss": 3.3144563341644453, + "tokens_seen": 3075299328 + }, + { + "epoch": 10.03, + "learning_rate": 3.4533600802407224e-05, + "loss": 2.6468, + "theoretical_loss": 3.314450930842084, + "tokens_seen": 3075364864 + }, + { + "epoch": 10.03, + "learning_rate": 3.452357071213641e-05, + "loss": 2.4874, + "theoretical_loss": 3.314445527667107, + "tokens_seen": 3075430400 + }, + { + "epoch": 10.03, + "learning_rate": 3.45135406218656e-05, + "loss": 2.6366, + "theoretical_loss": 3.3144401246395057, + "tokens_seen": 3075495936 + }, + { + "epoch": 10.03, + "learning_rate": 3.450351053159478e-05, + "loss": 2.7822, + "theoretical_loss": 3.314434721759273, + "tokens_seen": 3075561472 + }, + { + "epoch": 10.03, + "learning_rate": 3.449348044132397e-05, + "loss": 2.6491, + "theoretical_loss": 3.314429319026403, + "tokens_seen": 3075627008 + }, + { + "epoch": 10.03, + "learning_rate": 3.448345035105316e-05, + "loss": 2.4767, + "theoretical_loss": 3.3144239164408873, + "tokens_seen": 3075692544 + }, + { + "epoch": 10.03, + "learning_rate": 3.4473420260782345e-05, + "loss": 2.501, + "theoretical_loss": 3.3144185140027185, + "tokens_seen": 3075758080 + }, + { + "epoch": 10.03, + "learning_rate": 3.446339017051153e-05, + "loss": 2.4015, + "theoretical_loss": 3.3144131117118905, + "tokens_seen": 3075823616 + }, + { + "epoch": 10.03, + "learning_rate": 3.445336008024072e-05, + "loss": 2.604, + "theoretical_loss": 3.3144077095683957, + "tokens_seen": 3075889152 + }, + { + "epoch": 10.03, + "learning_rate": 3.444332998996991e-05, + "loss": 2.5378, + "theoretical_loss": 3.3144023075722266, + "tokens_seen": 3075954688 + }, + { + "epoch": 10.03, + "learning_rate": 3.44332998996991e-05, + "loss": 2.6261, + "theoretical_loss": 3.3143969057233766, + "tokens_seen": 3076020224 + }, + { + "epoch": 10.03, + "learning_rate": 3.442326980942829e-05, + "loss": 2.5945, + "theoretical_loss": 3.314391504021838, + "tokens_seen": 3076085760 + }, + { + "epoch": 10.03, + "learning_rate": 3.441323971915748e-05, + "loss": 2.3871, + "theoretical_loss": 3.3143861024676045, + "tokens_seen": 3076151296 + }, + { + "epoch": 10.03, + "learning_rate": 3.440320962888666e-05, + "loss": 2.386, + "theoretical_loss": 3.314380701060668, + "tokens_seen": 3076216832 + }, + { + "epoch": 10.03, + "learning_rate": 3.439317953861585e-05, + "loss": 2.4465, + "theoretical_loss": 3.314375299801022, + "tokens_seen": 3076282368 + }, + { + "epoch": 10.03, + "learning_rate": 3.4383149448345036e-05, + "loss": 2.5179, + "theoretical_loss": 3.3143698986886587, + "tokens_seen": 3076347904 + }, + { + "epoch": 10.03, + "learning_rate": 3.4373119358074224e-05, + "loss": 2.7186, + "theoretical_loss": 3.314364497723571, + "tokens_seen": 3076413440 + }, + { + "epoch": 10.03, + "learning_rate": 3.436308926780341e-05, + "loss": 2.58, + "theoretical_loss": 3.314359096905753, + "tokens_seen": 3076478976 + }, + { + "epoch": 10.03, + "learning_rate": 3.43530591775326e-05, + "loss": 2.5231, + "theoretical_loss": 3.314353696235196, + "tokens_seen": 3076544512 + }, + { + "epoch": 10.03, + "learning_rate": 3.434302908726179e-05, + "loss": 2.466, + "theoretical_loss": 3.3143482957118935, + "tokens_seen": 3076610048 + }, + { + "epoch": 10.03, + "learning_rate": 3.4332998996990975e-05, + "loss": 2.3573, + "theoretical_loss": 3.3143428953358387, + "tokens_seen": 3076675584 + }, + { + "epoch": 10.03, + "learning_rate": 3.432296890672016e-05, + "loss": 2.2213, + "theoretical_loss": 3.3143374951070235, + "tokens_seen": 3076741120 + }, + { + "epoch": 10.03, + "learning_rate": 3.4312938816449344e-05, + "loss": 2.3592, + "theoretical_loss": 3.3143320950254416, + "tokens_seen": 3076806656 + }, + { + "epoch": 10.03, + "learning_rate": 3.430290872617853e-05, + "loss": 2.5602, + "theoretical_loss": 3.3143266950910855, + "tokens_seen": 3076872192 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3403903, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.456493854522705, + "objective/train/theoretical_loss": 3.314323995179115, + "objective/train/tokens_used": 3097364960, + "theoretical_loss": 3.314323995179115, + "tokens_seen": 3076904960 + }, + { + "epoch": 10.03, + "learning_rate": 3.429287863590772e-05, + "loss": 2.5761, + "theoretical_loss": 3.314321295303948, + "tokens_seen": 3076937728 + }, + { + "epoch": 10.03, + "learning_rate": 3.428284854563691e-05, + "loss": 2.4452, + "theoretical_loss": 3.3143158956640226, + "tokens_seen": 3077003264 + }, + { + "epoch": 10.03, + "learning_rate": 3.4272818455366096e-05, + "loss": 2.6582, + "theoretical_loss": 3.3143104961713012, + "tokens_seen": 3077068800 + }, + { + "epoch": 10.03, + "learning_rate": 3.4262788365095284e-05, + "loss": 2.4086, + "theoretical_loss": 3.314305096825777, + "tokens_seen": 3077134336 + }, + { + "epoch": 10.03, + "learning_rate": 3.425275827482448e-05, + "loss": 2.4682, + "theoretical_loss": 3.314299697627443, + "tokens_seen": 3077199872 + }, + { + "epoch": 10.03, + "learning_rate": 3.4242728184553666e-05, + "loss": 2.5005, + "theoretical_loss": 3.314294298576292, + "tokens_seen": 3077265408 + }, + { + "epoch": 10.03, + "learning_rate": 3.4232698094282854e-05, + "loss": 2.2422, + "theoretical_loss": 3.3142888996723165, + "tokens_seen": 3077330944 + }, + { + "epoch": 10.03, + "learning_rate": 3.422266800401204e-05, + "loss": 2.5714, + "theoretical_loss": 3.3142835009155105, + "tokens_seen": 3077396480 + }, + { + "epoch": 10.03, + "learning_rate": 3.421263791374122e-05, + "loss": 2.4267, + "theoretical_loss": 3.314278102305865, + "tokens_seen": 3077462016 + }, + { + "epoch": 10.03, + "learning_rate": 3.420260782347041e-05, + "loss": 2.5003, + "theoretical_loss": 3.3142727038433746, + "tokens_seen": 3077527552 + }, + { + "epoch": 10.03, + "learning_rate": 3.41925777331996e-05, + "loss": 2.5301, + "theoretical_loss": 3.3142673055280314, + "tokens_seen": 3077593088 + }, + { + "epoch": 10.03, + "learning_rate": 3.418254764292879e-05, + "loss": 2.47, + "theoretical_loss": 3.314261907359828, + "tokens_seen": 3077658624 + }, + { + "epoch": 10.03, + "learning_rate": 3.4172517552657975e-05, + "loss": 2.5289, + "theoretical_loss": 3.3142565093387573, + "tokens_seen": 3077724160 + }, + { + "epoch": 10.03, + "learning_rate": 3.416248746238716e-05, + "loss": 2.4474, + "theoretical_loss": 3.314251111464813, + "tokens_seen": 3077789696 + }, + { + "epoch": 10.03, + "learning_rate": 3.415245737211635e-05, + "loss": 2.5713, + "theoretical_loss": 3.314245713737987, + "tokens_seen": 3077855232 + }, + { + "epoch": 10.03, + "learning_rate": 3.414242728184554e-05, + "loss": 2.5448, + "theoretical_loss": 3.3142403161582727, + "tokens_seen": 3077920768 + }, + { + "epoch": 10.03, + "learning_rate": 3.4132397191574726e-05, + "loss": 2.6777, + "theoretical_loss": 3.314234918725663, + "tokens_seen": 3077986304 + }, + { + "epoch": 10.03, + "learning_rate": 3.4122367101303914e-05, + "loss": 2.501, + "theoretical_loss": 3.3142295214401503, + "tokens_seen": 3078051840 + }, + { + "epoch": 10.03, + "learning_rate": 3.4112337011033095e-05, + "loss": 2.4188, + "theoretical_loss": 3.3142241243017274, + "tokens_seen": 3078117376 + }, + { + "epoch": 10.03, + "learning_rate": 3.410230692076228e-05, + "loss": 2.551, + "theoretical_loss": 3.314218727310388, + "tokens_seen": 3078182912 + }, + { + "epoch": 10.03, + "learning_rate": 3.409227683049147e-05, + "loss": 2.5251, + "theoretical_loss": 3.3142133304661243, + "tokens_seen": 3078248448 + }, + { + "epoch": 10.03, + "learning_rate": 3.408224674022066e-05, + "loss": 2.4787, + "theoretical_loss": 3.314207933768929, + "tokens_seen": 3078313984 + }, + { + "epoch": 10.03, + "learning_rate": 3.407221664994985e-05, + "loss": 2.6089, + "theoretical_loss": 3.3142025372187955, + "tokens_seen": 3078379520 + }, + { + "epoch": 10.03, + "learning_rate": 3.406218655967904e-05, + "loss": 2.4804, + "theoretical_loss": 3.314197140815716, + "tokens_seen": 3078445056 + }, + { + "epoch": 10.03, + "learning_rate": 3.405215646940823e-05, + "loss": 2.6093, + "theoretical_loss": 3.3141917445596842, + "tokens_seen": 3078510592 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3405078, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.204277515411377, + "objective/train/theoretical_loss": 3.314189046486809, + "objective/train/tokens_used": 3099003360, + "theoretical_loss": 3.314189046486809, + "tokens_seen": 3078543360 + }, + { + "epoch": 10.03, + "learning_rate": 3.404212637913742e-05, + "loss": 2.3753, + "theoretical_loss": 3.3141863484506926, + "tokens_seen": 3078576128 + }, + { + "epoch": 10.03, + "learning_rate": 3.4032096288866605e-05, + "loss": 2.4487, + "theoretical_loss": 3.3141809524887336, + "tokens_seen": 3078641664 + }, + { + "epoch": 10.03, + "learning_rate": 3.402206619859579e-05, + "loss": 2.5525, + "theoretical_loss": 3.3141755566738005, + "tokens_seen": 3078707200 + }, + { + "epoch": 10.03, + "learning_rate": 3.4012036108324974e-05, + "loss": 2.5171, + "theoretical_loss": 3.3141701610058862, + "tokens_seen": 3078772736 + }, + { + "epoch": 10.03, + "learning_rate": 3.400200601805416e-05, + "loss": 2.4661, + "theoretical_loss": 3.3141647654849837, + "tokens_seen": 3078838272 + }, + { + "epoch": 10.03, + "learning_rate": 3.399197592778335e-05, + "loss": 2.5854, + "theoretical_loss": 3.3141593701110854, + "tokens_seen": 3078903808 + }, + { + "epoch": 10.03, + "learning_rate": 3.398194583751254e-05, + "loss": 2.2723, + "theoretical_loss": 3.3141539748841846, + "tokens_seen": 3078969344 + }, + { + "epoch": 10.03, + "learning_rate": 3.3971915747241726e-05, + "loss": 2.6958, + "theoretical_loss": 3.3141485798042742, + "tokens_seen": 3079034880 + }, + { + "epoch": 10.03, + "learning_rate": 3.3961885656970914e-05, + "loss": 2.5381, + "theoretical_loss": 3.3141431848713463, + "tokens_seen": 3079100416 + }, + { + "epoch": 10.03, + "learning_rate": 3.39518555667001e-05, + "loss": 2.5022, + "theoretical_loss": 3.3141377900853946, + "tokens_seen": 3079165952 + }, + { + "epoch": 10.03, + "learning_rate": 3.394182547642929e-05, + "loss": 2.5248, + "theoretical_loss": 3.3141323954464115, + "tokens_seen": 3079231488 + }, + { + "epoch": 10.03, + "learning_rate": 3.393179538615848e-05, + "loss": 2.5601, + "theoretical_loss": 3.3141270009543904, + "tokens_seen": 3079297024 + }, + { + "epoch": 10.03, + "learning_rate": 3.392176529588766e-05, + "loss": 2.5644, + "theoretical_loss": 3.3141216066093238, + "tokens_seen": 3079362560 + }, + { + "epoch": 10.03, + "learning_rate": 3.3911735205616846e-05, + "loss": 2.5601, + "theoretical_loss": 3.3141162124112045, + "tokens_seen": 3079428096 + }, + { + "epoch": 10.03, + "learning_rate": 3.3901705115346034e-05, + "loss": 2.525, + "theoretical_loss": 3.3141108183600254, + "tokens_seen": 3079493632 + }, + { + "epoch": 10.03, + "learning_rate": 3.389167502507522e-05, + "loss": 2.5402, + "theoretical_loss": 3.3141054244557795, + "tokens_seen": 3079559168 + }, + { + "epoch": 10.03, + "learning_rate": 3.388164493480441e-05, + "loss": 2.6515, + "theoretical_loss": 3.314100030698459, + "tokens_seen": 3079624704 + }, + { + "epoch": 10.03, + "learning_rate": 3.3871614844533605e-05, + "loss": 2.5928, + "theoretical_loss": 3.314094637088058, + "tokens_seen": 3079690240 + }, + { + "epoch": 10.03, + "learning_rate": 3.386158475426279e-05, + "loss": 2.2896, + "theoretical_loss": 3.3140892436245686, + "tokens_seen": 3079755776 + }, + { + "epoch": 10.03, + "learning_rate": 3.385155466399198e-05, + "loss": 2.6436, + "theoretical_loss": 3.314083850307984, + "tokens_seen": 3079821312 + }, + { + "epoch": 10.03, + "learning_rate": 3.384152457372117e-05, + "loss": 2.6118, + "theoretical_loss": 3.314078457138297, + "tokens_seen": 3079886848 + }, + { + "epoch": 10.03, + "learning_rate": 3.3831494483450356e-05, + "loss": 2.4563, + "theoretical_loss": 3.3140730641155, + "tokens_seen": 3079952384 + }, + { + "epoch": 10.03, + "learning_rate": 3.382146439317954e-05, + "loss": 2.6622, + "theoretical_loss": 3.3140676712395862, + "tokens_seen": 3080017920 + }, + { + "epoch": 10.03, + "learning_rate": 3.3811434302908725e-05, + "loss": 2.5537, + "theoretical_loss": 3.3140622785105487, + "tokens_seen": 3080083456 + }, + { + "epoch": 10.03, + "learning_rate": 3.380140421263791e-05, + "loss": 2.3885, + "theoretical_loss": 3.3140568859283803, + "tokens_seen": 3080148992 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3405666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.825260877609253, + "objective/train/theoretical_loss": 3.31405418969237, + "objective/train/tokens_used": 3100641760, + "theoretical_loss": 3.31405418969237, + "tokens_seen": 3080181760 + }, + { + "epoch": 10.03, + "learning_rate": 3.37913741223671e-05, + "loss": 2.703, + "theoretical_loss": 3.3140514934930736, + "tokens_seen": 3080214528 + }, + { + "epoch": 10.03, + "learning_rate": 3.378134403209629e-05, + "loss": 2.5434, + "theoretical_loss": 3.314046101204622, + "tokens_seen": 3080280064 + }, + { + "epoch": 10.03, + "learning_rate": 3.377131394182548e-05, + "loss": 2.4571, + "theoretical_loss": 3.3140407090630175, + "tokens_seen": 3080345600 + }, + { + "epoch": 10.03, + "learning_rate": 3.3761283851554665e-05, + "loss": 2.4904, + "theoretical_loss": 3.3140353170682535, + "tokens_seen": 3080411136 + }, + { + "epoch": 10.03, + "learning_rate": 3.375125376128385e-05, + "loss": 2.5285, + "theoretical_loss": 3.314029925220323, + "tokens_seen": 3080476672 + }, + { + "epoch": 10.03, + "learning_rate": 3.374122367101304e-05, + "loss": 2.465, + "theoretical_loss": 3.314024533519219, + "tokens_seen": 3080542208 + }, + { + "epoch": 10.03, + "learning_rate": 3.373119358074223e-05, + "loss": 2.4499, + "theoretical_loss": 3.3140191419649336, + "tokens_seen": 3080607744 + }, + { + "epoch": 10.03, + "learning_rate": 3.372116349047141e-05, + "loss": 2.4769, + "theoretical_loss": 3.3140137505574607, + "tokens_seen": 3080673280 + }, + { + "epoch": 10.03, + "learning_rate": 3.37111334002006e-05, + "loss": 2.5468, + "theoretical_loss": 3.3140083592967926, + "tokens_seen": 3080738816 + }, + { + "epoch": 10.03, + "learning_rate": 3.3701103309929785e-05, + "loss": 2.2909, + "theoretical_loss": 3.314002968182922, + "tokens_seen": 3080804352 + }, + { + "epoch": 10.03, + "learning_rate": 3.3691073219658973e-05, + "loss": 2.4892, + "theoretical_loss": 3.3139975772158423, + "tokens_seen": 3080869888 + }, + { + "epoch": 10.03, + "learning_rate": 3.368104312938817e-05, + "loss": 2.3884, + "theoretical_loss": 3.313992186395546, + "tokens_seen": 3080935424 + }, + { + "epoch": 10.03, + "learning_rate": 3.3671013039117356e-05, + "loss": 2.8732, + "theoretical_loss": 3.313986795722026, + "tokens_seen": 3081000960 + }, + { + "epoch": 10.03, + "learning_rate": 3.3660982948846544e-05, + "loss": 2.3395, + "theoretical_loss": 3.3139814051952756, + "tokens_seen": 3081066496 + }, + { + "epoch": 10.03, + "learning_rate": 3.365095285857573e-05, + "loss": 2.428, + "theoretical_loss": 3.313976014815287, + "tokens_seen": 3081132032 + }, + { + "epoch": 10.03, + "learning_rate": 3.364092276830492e-05, + "loss": 2.4043, + "theoretical_loss": 3.313970624582054, + "tokens_seen": 3081197568 + }, + { + "epoch": 10.03, + "learning_rate": 3.363089267803411e-05, + "loss": 2.3467, + "theoretical_loss": 3.3139652344955683, + "tokens_seen": 3081263104 + }, + { + "epoch": 10.03, + "learning_rate": 3.362086258776329e-05, + "loss": 2.8115, + "theoretical_loss": 3.313959844555824, + "tokens_seen": 3081328640 + }, + { + "epoch": 10.03, + "learning_rate": 3.3610832497492477e-05, + "loss": 2.7133, + "theoretical_loss": 3.313954454762813, + "tokens_seen": 3081394176 + }, + { + "epoch": 10.03, + "learning_rate": 3.3600802407221664e-05, + "loss": 2.4116, + "theoretical_loss": 3.3139490651165286, + "tokens_seen": 3081459712 + }, + { + "epoch": 10.03, + "learning_rate": 3.359077231695085e-05, + "loss": 2.4103, + "theoretical_loss": 3.313943675616964, + "tokens_seen": 3081525248 + }, + { + "epoch": 10.03, + "learning_rate": 3.358074222668004e-05, + "loss": 2.6593, + "theoretical_loss": 3.3139382862641114, + "tokens_seen": 3081590784 + }, + { + "epoch": 10.03, + "learning_rate": 3.357071213640923e-05, + "loss": 2.682, + "theoretical_loss": 3.3139328970579642, + "tokens_seen": 3081656320 + }, + { + "epoch": 10.03, + "learning_rate": 3.3560682046138416e-05, + "loss": 2.5272, + "theoretical_loss": 3.313927507998515, + "tokens_seen": 3081721856 + }, + { + "epoch": 10.03, + "learning_rate": 3.3550651955867604e-05, + "loss": 2.4505, + "theoretical_loss": 3.313922119085757, + "tokens_seen": 3081787392 + }, + { + "epoch": 10.03, + "objective/train/docs_used": 3407019, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5957448482513428, + "objective/train/theoretical_loss": 3.313919424684385, + "objective/train/tokens_used": 3102280160, + "theoretical_loss": 3.313919424684385, + "tokens_seen": 3081820160 + }, + { + "epoch": 10.03, + "learning_rate": 3.354062186559679e-05, + "loss": 2.6095, + "theoretical_loss": 3.313916730319683, + "tokens_seen": 3081852928 + }, + { + "epoch": 10.03, + "learning_rate": 3.353059177532597e-05, + "loss": 2.5251, + "theoretical_loss": 3.3139113417002855, + "tokens_seen": 3081918464 + }, + { + "epoch": 10.03, + "learning_rate": 3.352056168505516e-05, + "loss": 2.6553, + "theoretical_loss": 3.3139059532275574, + "tokens_seen": 3081984000 + }, + { + "epoch": 10.03, + "learning_rate": 3.351053159478435e-05, + "loss": 2.6245, + "theoretical_loss": 3.3139005649014925, + "tokens_seen": 3082049536 + }, + { + "epoch": 10.03, + "learning_rate": 3.3500501504513543e-05, + "loss": 2.3715, + "theoretical_loss": 3.313895176722083, + "tokens_seen": 3082115072 + }, + { + "epoch": 10.03, + "learning_rate": 3.349047141424273e-05, + "loss": 2.7145, + "theoretical_loss": 3.3138897886893215, + "tokens_seen": 3082180608 + }, + { + "epoch": 10.03, + "learning_rate": 3.348044132397192e-05, + "loss": 2.4685, + "theoretical_loss": 3.3138844008032016, + "tokens_seen": 3082246144 + }, + { + "epoch": 10.03, + "learning_rate": 3.347041123370111e-05, + "loss": 2.5761, + "theoretical_loss": 3.3138790130637155, + "tokens_seen": 3082311680 + }, + { + "epoch": 10.03, + "learning_rate": 3.3460381143430295e-05, + "loss": 2.5807, + "theoretical_loss": 3.3138736254708565, + "tokens_seen": 3082377216 + }, + { + "epoch": 10.03, + "learning_rate": 3.345035105315948e-05, + "loss": 2.7034, + "theoretical_loss": 3.3138682380246176, + "tokens_seen": 3082442752 + }, + { + "epoch": 10.03, + "learning_rate": 3.344032096288867e-05, + "loss": 2.695, + "theoretical_loss": 3.3138628507249916, + "tokens_seen": 3082508288 + }, + { + "epoch": 10.03, + "learning_rate": 3.343029087261785e-05, + "loss": 2.4232, + "theoretical_loss": 3.313857463571971, + "tokens_seen": 3082573824 + }, + { + "epoch": 10.03, + "learning_rate": 3.342026078234704e-05, + "loss": 2.4575, + "theoretical_loss": 3.3138520765655493, + "tokens_seen": 3082639360 + }, + { + "epoch": 10.03, + "learning_rate": 3.341023069207623e-05, + "loss": 2.672, + "theoretical_loss": 3.3138466897057186, + "tokens_seen": 3082704896 + }, + { + "epoch": 10.03, + "learning_rate": 3.3400200601805416e-05, + "loss": 2.4002, + "theoretical_loss": 3.313841302992473, + "tokens_seen": 3082770432 + }, + { + "epoch": 10.03, + "learning_rate": 3.3390170511534604e-05, + "loss": 2.6077, + "theoretical_loss": 3.313835916425804, + "tokens_seen": 3082835968 + }, + { + "epoch": 10.03, + "learning_rate": 3.338014042126379e-05, + "loss": 2.2909, + "theoretical_loss": 3.3138305300057054, + "tokens_seen": 3082901504 + }, + { + "epoch": 10.03, + "learning_rate": 3.337011033099298e-05, + "loss": 2.42, + "theoretical_loss": 3.31382514373217, + "tokens_seen": 3082967040 + }, + { + "epoch": 10.03, + "learning_rate": 3.336008024072217e-05, + "loss": 2.6829, + "theoretical_loss": 3.3138197576051907, + "tokens_seen": 3083032576 + }, + { + "epoch": 10.03, + "learning_rate": 3.3350050150451355e-05, + "loss": 2.3971, + "theoretical_loss": 3.31381437162476, + "tokens_seen": 3083098112 + }, + { + "epoch": 10.04, + "learning_rate": 3.334002006018054e-05, + "loss": 2.5027, + "theoretical_loss": 3.313808985790871, + "tokens_seen": 3083163648 + }, + { + "epoch": 10.04, + "learning_rate": 3.3329989969909724e-05, + "loss": 2.3703, + "theoretical_loss": 3.313803600103517, + "tokens_seen": 3083229184 + }, + { + "epoch": 10.04, + "learning_rate": 3.331995987963891e-05, + "loss": 2.4618, + "theoretical_loss": 3.3137982145626905, + "tokens_seen": 3083294720 + }, + { + "epoch": 10.04, + "learning_rate": 3.330992978936811e-05, + "loss": 2.193, + "theoretical_loss": 3.313792829168384, + "tokens_seen": 3083360256 + }, + { + "epoch": 10.04, + "learning_rate": 3.3299899699097295e-05, + "loss": 2.6018, + "theoretical_loss": 3.3137874439205914, + "tokens_seen": 3083425792 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3407615, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7507855892181396, + "objective/train/theoretical_loss": 3.313784751351635, + "objective/train/tokens_used": 3103918560, + "theoretical_loss": 3.313784751351635, + "tokens_seen": 3083458560 + }, + { + "epoch": 10.04, + "learning_rate": 3.328986960882648e-05, + "loss": 2.3286, + "theoretical_loss": 3.3137820588193048, + "tokens_seen": 3083491328 + }, + { + "epoch": 10.04, + "learning_rate": 3.327983951855567e-05, + "loss": 2.6049, + "theoretical_loss": 3.3137766738645174, + "tokens_seen": 3083556864 + }, + { + "epoch": 10.04, + "learning_rate": 3.326980942828486e-05, + "loss": 2.5233, + "theoretical_loss": 3.313771289056222, + "tokens_seen": 3083622400 + }, + { + "epoch": 10.04, + "learning_rate": 3.3259779338014046e-05, + "loss": 2.5219, + "theoretical_loss": 3.313765904394412, + "tokens_seen": 3083687936 + }, + { + "epoch": 10.04, + "learning_rate": 3.3249749247743234e-05, + "loss": 2.3387, + "theoretical_loss": 3.313760519879079, + "tokens_seen": 3083753472 + }, + { + "epoch": 10.04, + "learning_rate": 3.323971915747242e-05, + "loss": 2.677, + "theoretical_loss": 3.3137551355102177, + "tokens_seen": 3083819008 + }, + { + "epoch": 10.04, + "learning_rate": 3.32296890672016e-05, + "loss": 2.3158, + "theoretical_loss": 3.3137497512878196, + "tokens_seen": 3083884544 + }, + { + "epoch": 10.04, + "learning_rate": 3.321965897693079e-05, + "loss": 2.4417, + "theoretical_loss": 3.313744367211878, + "tokens_seen": 3083950080 + }, + { + "epoch": 10.04, + "learning_rate": 3.320962888665998e-05, + "loss": 2.5125, + "theoretical_loss": 3.313738983282386, + "tokens_seen": 3084015616 + }, + { + "epoch": 10.04, + "learning_rate": 3.319959879638917e-05, + "loss": 2.544, + "theoretical_loss": 3.3137335994993364, + "tokens_seen": 3084081152 + }, + { + "epoch": 10.04, + "learning_rate": 3.3189568706118355e-05, + "loss": 2.584, + "theoretical_loss": 3.3137282158627217, + "tokens_seen": 3084146688 + }, + { + "epoch": 10.04, + "learning_rate": 3.317953861584754e-05, + "loss": 2.5091, + "theoretical_loss": 3.3137228323725356, + "tokens_seen": 3084212224 + }, + { + "epoch": 10.04, + "learning_rate": 3.316950852557673e-05, + "loss": 2.3787, + "theoretical_loss": 3.313717449028771, + "tokens_seen": 3084277760 + }, + { + "epoch": 10.04, + "learning_rate": 3.315947843530592e-05, + "loss": 2.3829, + "theoretical_loss": 3.31371206583142, + "tokens_seen": 3084343296 + }, + { + "epoch": 10.04, + "learning_rate": 3.3149448345035106e-05, + "loss": 2.3321, + "theoretical_loss": 3.3137066827804755, + "tokens_seen": 3084408832 + }, + { + "epoch": 10.04, + "learning_rate": 3.313941825476429e-05, + "loss": 2.5377, + "theoretical_loss": 3.313701299875931, + "tokens_seen": 3084474368 + }, + { + "epoch": 10.04, + "learning_rate": 3.3129388164493475e-05, + "loss": 2.5319, + "theoretical_loss": 3.31369591711778, + "tokens_seen": 3084539904 + }, + { + "epoch": 10.04, + "learning_rate": 3.311935807422267e-05, + "loss": 2.6633, + "theoretical_loss": 3.313690534506014, + "tokens_seen": 3084605440 + }, + { + "epoch": 10.04, + "learning_rate": 3.310932798395186e-05, + "loss": 2.3756, + "theoretical_loss": 3.3136851520406267, + "tokens_seen": 3084670976 + }, + { + "epoch": 10.04, + "learning_rate": 3.3099297893681046e-05, + "loss": 2.437, + "theoretical_loss": 3.3136797697216105, + "tokens_seen": 3084736512 + }, + { + "epoch": 10.04, + "learning_rate": 3.3089267803410234e-05, + "loss": 2.6211, + "theoretical_loss": 3.313674387548959, + "tokens_seen": 3084802048 + }, + { + "epoch": 10.04, + "learning_rate": 3.307923771313942e-05, + "loss": 2.592, + "theoretical_loss": 3.313669005522665, + "tokens_seen": 3084867584 + }, + { + "epoch": 10.04, + "learning_rate": 3.306920762286861e-05, + "loss": 2.4381, + "theoretical_loss": 3.3136636236427206, + "tokens_seen": 3084933120 + }, + { + "epoch": 10.04, + "learning_rate": 3.30591775325978e-05, + "loss": 2.3915, + "theoretical_loss": 3.3136582419091196, + "tokens_seen": 3084998656 + }, + { + "epoch": 10.04, + "learning_rate": 3.3049147442326985e-05, + "loss": 2.6184, + "theoretical_loss": 3.313652860321855, + "tokens_seen": 3085064192 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3409148, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3011372089385986, + "objective/train/theoretical_loss": 3.313650169583096, + "objective/train/tokens_used": 3105556960, + "theoretical_loss": 3.313650169583096, + "tokens_seen": 3085096960 + }, + { + "epoch": 10.04, + "learning_rate": 3.3039117352056166e-05, + "loss": 2.4403, + "theoretical_loss": 3.3136474788809185, + "tokens_seen": 3085129728 + }, + { + "epoch": 10.04, + "learning_rate": 3.3029087261785354e-05, + "loss": 2.5248, + "theoretical_loss": 3.3136420975863046, + "tokens_seen": 3085195264 + }, + { + "epoch": 10.04, + "learning_rate": 3.301905717151454e-05, + "loss": 2.7167, + "theoretical_loss": 3.3136367164380047, + "tokens_seen": 3085260800 + }, + { + "epoch": 10.04, + "learning_rate": 3.300902708124373e-05, + "loss": 2.5616, + "theoretical_loss": 3.313631335436013, + "tokens_seen": 3085326336 + }, + { + "epoch": 10.04, + "learning_rate": 3.299899699097292e-05, + "loss": 2.5478, + "theoretical_loss": 3.313625954580322, + "tokens_seen": 3085391872 + }, + { + "epoch": 10.04, + "learning_rate": 3.2988966900702106e-05, + "loss": 2.4244, + "theoretical_loss": 3.3136205738709243, + "tokens_seen": 3085457408 + }, + { + "epoch": 10.04, + "learning_rate": 3.2978936810431294e-05, + "loss": 2.6525, + "theoretical_loss": 3.313615193307813, + "tokens_seen": 3085522944 + }, + { + "epoch": 10.04, + "learning_rate": 3.296890672016048e-05, + "loss": 2.5943, + "theoretical_loss": 3.3136098128909808, + "tokens_seen": 3085588480 + }, + { + "epoch": 10.04, + "learning_rate": 3.295887662988967e-05, + "loss": 2.5988, + "theoretical_loss": 3.313604432620421, + "tokens_seen": 3085654016 + }, + { + "epoch": 10.04, + "learning_rate": 3.294884653961886e-05, + "loss": 2.3763, + "theoretical_loss": 3.3135990524961265, + "tokens_seen": 3085719552 + }, + { + "epoch": 10.04, + "learning_rate": 3.2938816449348045e-05, + "loss": 2.6538, + "theoretical_loss": 3.31359367251809, + "tokens_seen": 3085785088 + }, + { + "epoch": 10.04, + "learning_rate": 3.292878635907723e-05, + "loss": 2.2777, + "theoretical_loss": 3.3135882926863047, + "tokens_seen": 3085850624 + }, + { + "epoch": 10.04, + "learning_rate": 3.291875626880642e-05, + "loss": 2.548, + "theoretical_loss": 3.313582913000763, + "tokens_seen": 3085916160 + }, + { + "epoch": 10.04, + "learning_rate": 3.290872617853561e-05, + "loss": 2.497, + "theoretical_loss": 3.313577533461458, + "tokens_seen": 3085981696 + }, + { + "epoch": 10.04, + "learning_rate": 3.28986960882648e-05, + "loss": 2.6479, + "theoretical_loss": 3.3135721540683827, + "tokens_seen": 3086047232 + }, + { + "epoch": 10.04, + "learning_rate": 3.2888665997993985e-05, + "loss": 2.6904, + "theoretical_loss": 3.3135667748215303, + "tokens_seen": 3086112768 + }, + { + "epoch": 10.04, + "learning_rate": 3.287863590772317e-05, + "loss": 2.4756, + "theoretical_loss": 3.313561395720894, + "tokens_seen": 3086178304 + }, + { + "epoch": 10.04, + "learning_rate": 3.286860581745236e-05, + "loss": 2.5745, + "theoretical_loss": 3.3135560167664657, + "tokens_seen": 3086243840 + }, + { + "epoch": 10.04, + "learning_rate": 3.285857572718155e-05, + "loss": 2.2829, + "theoretical_loss": 3.3135506379582385, + "tokens_seen": 3086309376 + }, + { + "epoch": 10.04, + "learning_rate": 3.2848545636910736e-05, + "loss": 2.4551, + "theoretical_loss": 3.3135452592962062, + "tokens_seen": 3086374912 + }, + { + "epoch": 10.04, + "learning_rate": 3.283851554663992e-05, + "loss": 2.3005, + "theoretical_loss": 3.313539880780361, + "tokens_seen": 3086440448 + }, + { + "epoch": 10.04, + "learning_rate": 3.2828485456369105e-05, + "loss": 2.7192, + "theoretical_loss": 3.313534502410696, + "tokens_seen": 3086505984 + }, + { + "epoch": 10.04, + "learning_rate": 3.281845536609829e-05, + "loss": 2.5284, + "theoretical_loss": 3.313529124187204, + "tokens_seen": 3086571520 + }, + { + "epoch": 10.04, + "learning_rate": 3.280842527582748e-05, + "loss": 2.4687, + "theoretical_loss": 3.3135237461098783, + "tokens_seen": 3086637056 + }, + { + "epoch": 10.04, + "learning_rate": 3.279839518555667e-05, + "loss": 2.4714, + "theoretical_loss": 3.3135183681787113, + "tokens_seen": 3086702592 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3409516, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5285611152648926, + "objective/train/theoretical_loss": 3.3135156792679354, + "objective/train/tokens_used": 3107195360, + "theoretical_loss": 3.3135156792679354, + "tokens_seen": 3086735360 + }, + { + "epoch": 10.04, + "learning_rate": 3.278836509528586e-05, + "loss": 2.5656, + "theoretical_loss": 3.313512990393696, + "tokens_seen": 3086768128 + }, + { + "epoch": 10.04, + "learning_rate": 3.2778335005015045e-05, + "loss": 2.5416, + "theoretical_loss": 3.313507612754826, + "tokens_seen": 3086833664 + }, + { + "epoch": 10.04, + "learning_rate": 3.276830491474423e-05, + "loss": 2.3474, + "theoretical_loss": 3.313502235262094, + "tokens_seen": 3086899200 + }, + { + "epoch": 10.04, + "learning_rate": 3.275827482447342e-05, + "loss": 2.6552, + "theoretical_loss": 3.313496857915492, + "tokens_seen": 3086964736 + }, + { + "epoch": 10.04, + "learning_rate": 3.274824473420261e-05, + "loss": 2.4087, + "theoretical_loss": 3.313491480715014, + "tokens_seen": 3087030272 + }, + { + "epoch": 10.04, + "learning_rate": 3.2738214643931796e-05, + "loss": 2.6638, + "theoretical_loss": 3.313486103660652, + "tokens_seen": 3087095808 + }, + { + "epoch": 10.04, + "learning_rate": 3.2728184553660984e-05, + "loss": 2.4771, + "theoretical_loss": 3.3134807267524002, + "tokens_seen": 3087161344 + }, + { + "epoch": 10.04, + "learning_rate": 3.271815446339017e-05, + "loss": 2.5512, + "theoretical_loss": 3.3134753499902505, + "tokens_seen": 3087226880 + }, + { + "epoch": 10.04, + "learning_rate": 3.270812437311936e-05, + "loss": 2.7266, + "theoretical_loss": 3.313469973374196, + "tokens_seen": 3087292416 + }, + { + "epoch": 10.04, + "learning_rate": 3.269809428284855e-05, + "loss": 2.4104, + "theoretical_loss": 3.31346459690423, + "tokens_seen": 3087357952 + }, + { + "epoch": 10.04, + "learning_rate": 3.2688064192577736e-05, + "loss": 2.4053, + "theoretical_loss": 3.3134592205803446, + "tokens_seen": 3087423488 + }, + { + "epoch": 10.04, + "learning_rate": 3.2678034102306924e-05, + "loss": 2.5431, + "theoretical_loss": 3.3134538444025337, + "tokens_seen": 3087489024 + }, + { + "epoch": 10.04, + "learning_rate": 3.266800401203611e-05, + "loss": 2.3256, + "theoretical_loss": 3.3134484683707894, + "tokens_seen": 3087554560 + }, + { + "epoch": 10.04, + "learning_rate": 3.26579739217653e-05, + "loss": 2.3139, + "theoretical_loss": 3.3134430924851057, + "tokens_seen": 3087620096 + }, + { + "epoch": 10.04, + "learning_rate": 3.264794383149448e-05, + "loss": 2.7821, + "theoretical_loss": 3.313437716745475, + "tokens_seen": 3087685632 + }, + { + "epoch": 10.04, + "learning_rate": 3.263791374122367e-05, + "loss": 2.4149, + "theoretical_loss": 3.3134323411518896, + "tokens_seen": 3087751168 + }, + { + "epoch": 10.04, + "learning_rate": 3.2627883650952857e-05, + "loss": 2.5999, + "theoretical_loss": 3.313426965704343, + "tokens_seen": 3087816704 + }, + { + "epoch": 10.04, + "learning_rate": 3.2617853560682044e-05, + "loss": 2.5547, + "theoretical_loss": 3.3134215904028284, + "tokens_seen": 3087882240 + }, + { + "epoch": 10.04, + "learning_rate": 3.260782347041123e-05, + "loss": 2.4164, + "theoretical_loss": 3.3134162152473383, + "tokens_seen": 3087947776 + }, + { + "epoch": 10.04, + "learning_rate": 3.259779338014042e-05, + "loss": 2.7024, + "theoretical_loss": 3.3134108402378653, + "tokens_seen": 3088013312 + }, + { + "epoch": 10.04, + "learning_rate": 3.258776328986961e-05, + "loss": 2.5763, + "theoretical_loss": 3.3134054653744034, + "tokens_seen": 3088078848 + }, + { + "epoch": 10.04, + "learning_rate": 3.2577733199598796e-05, + "loss": 2.3546, + "theoretical_loss": 3.3134000906569447, + "tokens_seen": 3088144384 + }, + { + "epoch": 10.04, + "learning_rate": 3.2567703109327984e-05, + "loss": 2.4638, + "theoretical_loss": 3.313394716085482, + "tokens_seen": 3088209920 + }, + { + "epoch": 10.04, + "learning_rate": 3.255767301905718e-05, + "loss": 2.3628, + "theoretical_loss": 3.3133893416600095, + "tokens_seen": 3088275456 + }, + { + "epoch": 10.04, + "learning_rate": 3.254764292878636e-05, + "loss": 2.428, + "theoretical_loss": 3.3133839673805188, + "tokens_seen": 3088340992 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3410962, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6350255012512207, + "objective/train/theoretical_loss": 3.3133812802955145, + "objective/train/tokens_used": 3108833760, + "theoretical_loss": 3.3133812802955145, + "tokens_seen": 3088373760 + }, + { + "epoch": 10.04, + "learning_rate": 3.253761283851555e-05, + "loss": 2.5067, + "theoretical_loss": 3.313378593247003, + "tokens_seen": 3088406528 + }, + { + "epoch": 10.04, + "learning_rate": 3.2527582748244736e-05, + "loss": 2.4016, + "theoretical_loss": 3.3133732192594554, + "tokens_seen": 3088472064 + }, + { + "epoch": 10.04, + "learning_rate": 3.2517552657973923e-05, + "loss": 2.4582, + "theoretical_loss": 3.313367845417869, + "tokens_seen": 3088537600 + }, + { + "epoch": 10.04, + "learning_rate": 3.250752256770311e-05, + "loss": 2.5826, + "theoretical_loss": 3.3133624717222365, + "tokens_seen": 3088603136 + }, + { + "epoch": 10.04, + "learning_rate": 3.24974924774323e-05, + "loss": 2.41, + "theoretical_loss": 3.313357098172551, + "tokens_seen": 3088668672 + }, + { + "epoch": 10.04, + "learning_rate": 3.248746238716149e-05, + "loss": 2.3795, + "theoretical_loss": 3.3133517247688054, + "tokens_seen": 3088734208 + }, + { + "epoch": 10.04, + "learning_rate": 3.2477432296890675e-05, + "loss": 2.4779, + "theoretical_loss": 3.3133463515109924, + "tokens_seen": 3088799744 + }, + { + "epoch": 10.04, + "learning_rate": 3.246740220661986e-05, + "loss": 2.6255, + "theoretical_loss": 3.3133409783991055, + "tokens_seen": 3088865280 + }, + { + "epoch": 10.04, + "learning_rate": 3.2457372116349044e-05, + "loss": 2.4608, + "theoretical_loss": 3.313335605433137, + "tokens_seen": 3088930816 + }, + { + "epoch": 10.04, + "learning_rate": 3.244734202607823e-05, + "loss": 2.4781, + "theoretical_loss": 3.3133302326130805, + "tokens_seen": 3088996352 + }, + { + "epoch": 10.04, + "learning_rate": 3.243731193580742e-05, + "loss": 2.35, + "theoretical_loss": 3.3133248599389282, + "tokens_seen": 3089061888 + }, + { + "epoch": 10.04, + "learning_rate": 3.242728184553661e-05, + "loss": 2.5733, + "theoretical_loss": 3.3133194874106735, + "tokens_seen": 3089127424 + }, + { + "epoch": 10.04, + "learning_rate": 3.2417251755265796e-05, + "loss": 2.3896, + "theoretical_loss": 3.313314115028309, + "tokens_seen": 3089192960 + }, + { + "epoch": 10.04, + "learning_rate": 3.2407221664994984e-05, + "loss": 2.4923, + "theoretical_loss": 3.313308742791828, + "tokens_seen": 3089258496 + }, + { + "epoch": 10.04, + "learning_rate": 3.239719157472417e-05, + "loss": 2.7117, + "theoretical_loss": 3.313303370701224, + "tokens_seen": 3089324032 + }, + { + "epoch": 10.04, + "learning_rate": 3.238716148445336e-05, + "loss": 2.3381, + "theoretical_loss": 3.3132979987564886, + "tokens_seen": 3089389568 + }, + { + "epoch": 10.04, + "learning_rate": 3.237713139418255e-05, + "loss": 2.4746, + "theoretical_loss": 3.3132926269576157, + "tokens_seen": 3089455104 + }, + { + "epoch": 10.04, + "learning_rate": 3.236710130391174e-05, + "loss": 2.3633, + "theoretical_loss": 3.313287255304598, + "tokens_seen": 3089520640 + }, + { + "epoch": 10.04, + "learning_rate": 3.235707121364092e-05, + "loss": 2.3936, + "theoretical_loss": 3.313281883797428, + "tokens_seen": 3089586176 + }, + { + "epoch": 10.04, + "learning_rate": 3.234704112337011e-05, + "loss": 2.439, + "theoretical_loss": 3.3132765124361, + "tokens_seen": 3089651712 + }, + { + "epoch": 10.04, + "learning_rate": 3.23370110330993e-05, + "loss": 2.4188, + "theoretical_loss": 3.3132711412206053, + "tokens_seen": 3089717248 + }, + { + "epoch": 10.04, + "learning_rate": 3.232698094282849e-05, + "loss": 2.6202, + "theoretical_loss": 3.3132657701509376, + "tokens_seen": 3089782784 + }, + { + "epoch": 10.04, + "learning_rate": 3.2316950852557675e-05, + "loss": 2.6391, + "theoretical_loss": 3.31326039922709, + "tokens_seen": 3089848320 + }, + { + "epoch": 10.04, + "learning_rate": 3.230692076228686e-05, + "loss": 2.5374, + "theoretical_loss": 3.3132550284490554, + "tokens_seen": 3089913856 + }, + { + "epoch": 10.04, + "learning_rate": 3.229689067201605e-05, + "loss": 2.3946, + "theoretical_loss": 3.3132496578168262, + "tokens_seen": 3089979392 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3411728, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.828634262084961, + "objective/train/theoretical_loss": 3.313246972555387, + "objective/train/tokens_used": 3110472160, + "theoretical_loss": 3.313246972555387, + "tokens_seen": 3090012160 + }, + { + "epoch": 10.04, + "learning_rate": 3.228686058174524e-05, + "loss": 2.5297, + "theoretical_loss": 3.313244287330396, + "tokens_seen": 3090044928 + }, + { + "epoch": 10.04, + "learning_rate": 3.2276830491474426e-05, + "loss": 2.467, + "theoretical_loss": 3.313238916989757, + "tokens_seen": 3090110464 + }, + { + "epoch": 10.04, + "learning_rate": 3.2266800401203614e-05, + "loss": 2.3477, + "theoretical_loss": 3.3132335467949035, + "tokens_seen": 3090176000 + }, + { + "epoch": 10.04, + "learning_rate": 3.2256770310932795e-05, + "loss": 2.396, + "theoretical_loss": 3.3132281767458274, + "tokens_seen": 3090241536 + }, + { + "epoch": 10.04, + "learning_rate": 3.224674022066198e-05, + "loss": 2.4575, + "theoretical_loss": 3.3132228068425214, + "tokens_seen": 3090307072 + }, + { + "epoch": 10.04, + "learning_rate": 3.223671013039117e-05, + "loss": 2.4444, + "theoretical_loss": 3.3132174370849796, + "tokens_seen": 3090372608 + }, + { + "epoch": 10.04, + "learning_rate": 3.222668004012036e-05, + "loss": 2.5073, + "theoretical_loss": 3.3132120674731937, + "tokens_seen": 3090438144 + }, + { + "epoch": 10.04, + "learning_rate": 3.221664994984955e-05, + "loss": 2.6107, + "theoretical_loss": 3.3132066980071575, + "tokens_seen": 3090503680 + }, + { + "epoch": 10.04, + "learning_rate": 3.2206619859578735e-05, + "loss": 2.6038, + "theoretical_loss": 3.313201328686864, + "tokens_seen": 3090569216 + }, + { + "epoch": 10.04, + "learning_rate": 3.219658976930792e-05, + "loss": 2.459, + "theoretical_loss": 3.313195959512305, + "tokens_seen": 3090634752 + }, + { + "epoch": 10.04, + "learning_rate": 3.218655967903712e-05, + "loss": 2.7413, + "theoretical_loss": 3.3131905904834746, + "tokens_seen": 3090700288 + }, + { + "epoch": 10.04, + "learning_rate": 3.2176529588766305e-05, + "loss": 2.6914, + "theoretical_loss": 3.3131852216003654, + "tokens_seen": 3090765824 + }, + { + "epoch": 10.04, + "learning_rate": 3.216649949849549e-05, + "loss": 2.4507, + "theoretical_loss": 3.313179852862971, + "tokens_seen": 3090831360 + }, + { + "epoch": 10.04, + "learning_rate": 3.2156469408224674e-05, + "loss": 2.3903, + "theoretical_loss": 3.313174484271283, + "tokens_seen": 3090896896 + }, + { + "epoch": 10.04, + "learning_rate": 3.214643931795386e-05, + "loss": 2.4905, + "theoretical_loss": 3.3131691158252954, + "tokens_seen": 3090962432 + }, + { + "epoch": 10.04, + "learning_rate": 3.213640922768305e-05, + "loss": 2.6036, + "theoretical_loss": 3.313163747525001, + "tokens_seen": 3091027968 + }, + { + "epoch": 10.04, + "learning_rate": 3.212637913741224e-05, + "loss": 2.5212, + "theoretical_loss": 3.3131583793703925, + "tokens_seen": 3091093504 + }, + { + "epoch": 10.04, + "learning_rate": 3.2116349047141426e-05, + "loss": 2.4644, + "theoretical_loss": 3.3131530113614627, + "tokens_seen": 3091159040 + }, + { + "epoch": 10.04, + "learning_rate": 3.2106318956870614e-05, + "loss": 2.4074, + "theoretical_loss": 3.3131476434982052, + "tokens_seen": 3091224576 + }, + { + "epoch": 10.04, + "learning_rate": 3.20962888665998e-05, + "loss": 2.5025, + "theoretical_loss": 3.3131422757806126, + "tokens_seen": 3091290112 + }, + { + "epoch": 10.04, + "learning_rate": 3.208625877632899e-05, + "loss": 2.3648, + "theoretical_loss": 3.3131369082086777, + "tokens_seen": 3091355648 + }, + { + "epoch": 10.04, + "learning_rate": 3.207622868605818e-05, + "loss": 2.2902, + "theoretical_loss": 3.3131315407823934, + "tokens_seen": 3091421184 + }, + { + "epoch": 10.04, + "learning_rate": 3.206619859578736e-05, + "loss": 2.48, + "theoretical_loss": 3.313126173501753, + "tokens_seen": 3091486720 + }, + { + "epoch": 10.04, + "learning_rate": 3.2056168505516546e-05, + "loss": 2.4445, + "theoretical_loss": 3.3131208063667494, + "tokens_seen": 3091552256 + }, + { + "epoch": 10.04, + "learning_rate": 3.2046138415245734e-05, + "loss": 2.4209, + "theoretical_loss": 3.313115439377376, + "tokens_seen": 3091617792 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3413229, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.439804792404175, + "objective/train/theoretical_loss": 3.313112755937298, + "objective/train/tokens_used": 3112110560, + "theoretical_loss": 3.313112755937298, + "tokens_seen": 3091650560 + }, + { + "epoch": 10.04, + "learning_rate": 3.203610832497492e-05, + "loss": 2.3515, + "theoretical_loss": 3.3131100725336244, + "tokens_seen": 3091683328 + }, + { + "epoch": 10.04, + "learning_rate": 3.202607823470411e-05, + "loss": 2.3486, + "theoretical_loss": 3.3131047058354888, + "tokens_seen": 3091748864 + }, + { + "epoch": 10.04, + "learning_rate": 3.20160481444333e-05, + "loss": 2.4284, + "theoretical_loss": 3.313099339282962, + "tokens_seen": 3091814400 + }, + { + "epoch": 10.04, + "learning_rate": 3.2006018054162486e-05, + "loss": 2.5972, + "theoretical_loss": 3.3130939728760365, + "tokens_seen": 3091879936 + }, + { + "epoch": 10.04, + "learning_rate": 3.199598796389168e-05, + "loss": 2.5079, + "theoretical_loss": 3.3130886066147056, + "tokens_seen": 3091945472 + }, + { + "epoch": 10.04, + "learning_rate": 3.198595787362087e-05, + "loss": 2.3814, + "theoretical_loss": 3.313083240498962, + "tokens_seen": 3092011008 + }, + { + "epoch": 10.04, + "learning_rate": 3.1975927783350056e-05, + "loss": 2.3499, + "theoretical_loss": 3.313077874528799, + "tokens_seen": 3092076544 + }, + { + "epoch": 10.04, + "learning_rate": 3.196589769307924e-05, + "loss": 2.4452, + "theoretical_loss": 3.3130725087042094, + "tokens_seen": 3092142080 + }, + { + "epoch": 10.04, + "learning_rate": 3.1955867602808425e-05, + "loss": 2.4702, + "theoretical_loss": 3.313067143025186, + "tokens_seen": 3092207616 + }, + { + "epoch": 10.04, + "learning_rate": 3.194583751253761e-05, + "loss": 2.4256, + "theoretical_loss": 3.313061777491722, + "tokens_seen": 3092273152 + }, + { + "epoch": 10.04, + "learning_rate": 3.19358074222668e-05, + "loss": 2.6478, + "theoretical_loss": 3.31305641210381, + "tokens_seen": 3092338688 + }, + { + "epoch": 10.04, + "learning_rate": 3.192577733199599e-05, + "loss": 2.6056, + "theoretical_loss": 3.3130510468614442, + "tokens_seen": 3092404224 + }, + { + "epoch": 10.04, + "learning_rate": 3.191574724172518e-05, + "loss": 2.385, + "theoretical_loss": 3.313045681764616, + "tokens_seen": 3092469760 + }, + { + "epoch": 10.04, + "learning_rate": 3.1905717151454365e-05, + "loss": 2.2677, + "theoretical_loss": 3.3130403168133187, + "tokens_seen": 3092535296 + }, + { + "epoch": 10.04, + "learning_rate": 3.189568706118355e-05, + "loss": 2.5674, + "theoretical_loss": 3.313034952007546, + "tokens_seen": 3092600832 + }, + { + "epoch": 10.04, + "learning_rate": 3.188565697091274e-05, + "loss": 2.4673, + "theoretical_loss": 3.3130295873472906, + "tokens_seen": 3092666368 + }, + { + "epoch": 10.04, + "learning_rate": 3.187562688064193e-05, + "loss": 2.3417, + "theoretical_loss": 3.313024222832545, + "tokens_seen": 3092731904 + }, + { + "epoch": 10.04, + "learning_rate": 3.186559679037111e-05, + "loss": 2.4645, + "theoretical_loss": 3.3130188584633027, + "tokens_seen": 3092797440 + }, + { + "epoch": 10.04, + "learning_rate": 3.18555667001003e-05, + "loss": 2.4233, + "theoretical_loss": 3.313013494239556, + "tokens_seen": 3092862976 + }, + { + "epoch": 10.04, + "learning_rate": 3.1845536609829485e-05, + "loss": 2.5296, + "theoretical_loss": 3.3130081301612986, + "tokens_seen": 3092928512 + }, + { + "epoch": 10.04, + "learning_rate": 3.183550651955867e-05, + "loss": 2.3615, + "theoretical_loss": 3.3130027662285233, + "tokens_seen": 3092994048 + }, + { + "epoch": 10.04, + "learning_rate": 3.182547642928786e-05, + "loss": 2.7046, + "theoretical_loss": 3.3129974024412228, + "tokens_seen": 3093059584 + }, + { + "epoch": 10.04, + "learning_rate": 3.181544633901705e-05, + "loss": 2.4116, + "theoretical_loss": 3.3129920387993903, + "tokens_seen": 3093125120 + }, + { + "epoch": 10.04, + "learning_rate": 3.1805416248746244e-05, + "loss": 2.4395, + "theoretical_loss": 3.312986675303019, + "tokens_seen": 3093190656 + }, + { + "epoch": 10.04, + "learning_rate": 3.179538615847543e-05, + "loss": 2.3102, + "theoretical_loss": 3.3129813119521008, + "tokens_seen": 3093256192 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3413860, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.696775197982788, + "objective/train/theoretical_loss": 3.3129786303311852, + "objective/train/tokens_used": 3113748960, + "theoretical_loss": 3.3129786303311852, + "tokens_seen": 3093288960 + }, + { + "epoch": 10.04, + "learning_rate": 3.178535606820462e-05, + "loss": 2.3651, + "theoretical_loss": 3.31297594874663, + "tokens_seen": 3093321728 + }, + { + "epoch": 10.04, + "learning_rate": 3.177532597793381e-05, + "loss": 2.5875, + "theoretical_loss": 3.312970585686599, + "tokens_seen": 3093387264 + }, + { + "epoch": 10.04, + "learning_rate": 3.176529588766299e-05, + "loss": 2.3845, + "theoretical_loss": 3.312965222772001, + "tokens_seen": 3093452800 + }, + { + "epoch": 10.04, + "learning_rate": 3.1755265797392176e-05, + "loss": 2.339, + "theoretical_loss": 3.3129598600028283, + "tokens_seen": 3093518336 + }, + { + "epoch": 10.04, + "learning_rate": 3.1745235707121364e-05, + "loss": 2.4841, + "theoretical_loss": 3.3129544973790743, + "tokens_seen": 3093583872 + }, + { + "epoch": 10.04, + "learning_rate": 3.173520561685055e-05, + "loss": 2.2535, + "theoretical_loss": 3.3129491349007325, + "tokens_seen": 3093649408 + }, + { + "epoch": 10.04, + "learning_rate": 3.172517552657974e-05, + "loss": 2.4633, + "theoretical_loss": 3.3129437725677953, + "tokens_seen": 3093714944 + }, + { + "epoch": 10.04, + "learning_rate": 3.171514543630893e-05, + "loss": 2.4035, + "theoretical_loss": 3.312938410380256, + "tokens_seen": 3093780480 + }, + { + "epoch": 10.04, + "learning_rate": 3.1705115346038116e-05, + "loss": 2.5714, + "theoretical_loss": 3.312933048338107, + "tokens_seen": 3093846016 + }, + { + "epoch": 10.04, + "learning_rate": 3.1695085255767304e-05, + "loss": 2.3001, + "theoretical_loss": 3.3129276864413417, + "tokens_seen": 3093911552 + }, + { + "epoch": 10.04, + "learning_rate": 3.168505516549649e-05, + "loss": 2.3791, + "theoretical_loss": 3.312922324689953, + "tokens_seen": 3093977088 + }, + { + "epoch": 10.04, + "learning_rate": 3.167502507522567e-05, + "loss": 2.4713, + "theoretical_loss": 3.312916963083934, + "tokens_seen": 3094042624 + }, + { + "epoch": 10.04, + "learning_rate": 3.166499498495486e-05, + "loss": 2.4791, + "theoretical_loss": 3.3129116016232776, + "tokens_seen": 3094108160 + }, + { + "epoch": 10.04, + "learning_rate": 3.165496489468405e-05, + "loss": 2.4707, + "theoretical_loss": 3.3129062403079765, + "tokens_seen": 3094173696 + }, + { + "epoch": 10.04, + "learning_rate": 3.1644934804413237e-05, + "loss": 2.4266, + "theoretical_loss": 3.312900879138024, + "tokens_seen": 3094239232 + }, + { + "epoch": 10.04, + "learning_rate": 3.1634904714142424e-05, + "loss": 2.342, + "theoretical_loss": 3.312895518113413, + "tokens_seen": 3094304768 + }, + { + "epoch": 10.04, + "learning_rate": 3.162487462387162e-05, + "loss": 2.2488, + "theoretical_loss": 3.312890157234137, + "tokens_seen": 3094370304 + }, + { + "epoch": 10.04, + "learning_rate": 3.161484453360081e-05, + "loss": 2.4221, + "theoretical_loss": 3.312884796500188, + "tokens_seen": 3094435840 + }, + { + "epoch": 10.04, + "learning_rate": 3.1604814443329995e-05, + "loss": 2.2332, + "theoretical_loss": 3.3128794359115594, + "tokens_seen": 3094501376 + }, + { + "epoch": 10.04, + "learning_rate": 3.159478435305918e-05, + "loss": 2.4604, + "theoretical_loss": 3.312874075468245, + "tokens_seen": 3094566912 + }, + { + "epoch": 10.04, + "learning_rate": 3.158475426278837e-05, + "loss": 2.4682, + "theoretical_loss": 3.3128687151702363, + "tokens_seen": 3094632448 + }, + { + "epoch": 10.04, + "learning_rate": 3.157472417251755e-05, + "loss": 2.4154, + "theoretical_loss": 3.312863355017527, + "tokens_seen": 3094697984 + }, + { + "epoch": 10.04, + "learning_rate": 3.156469408224674e-05, + "loss": 2.6283, + "theoretical_loss": 3.3128579950101105, + "tokens_seen": 3094763520 + }, + { + "epoch": 10.04, + "learning_rate": 3.155466399197593e-05, + "loss": 2.5237, + "theoretical_loss": 3.312852635147979, + "tokens_seen": 3094829056 + }, + { + "epoch": 10.04, + "learning_rate": 3.1544633901705116e-05, + "loss": 2.5844, + "theoretical_loss": 3.312847275431126, + "tokens_seen": 3094894592 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3415506, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.349890947341919, + "objective/train/theoretical_loss": 3.312844595627177, + "objective/train/tokens_used": 3115387360, + "theoretical_loss": 3.312844595627177, + "tokens_seen": 3094927360 + }, + { + "epoch": 10.04, + "learning_rate": 3.1534603811434303e-05, + "loss": 2.4138, + "theoretical_loss": 3.3128419158595444, + "tokens_seen": 3094960128 + }, + { + "epoch": 10.04, + "learning_rate": 3.152457372116349e-05, + "loss": 2.3956, + "theoretical_loss": 3.3128365564332274, + "tokens_seen": 3095025664 + }, + { + "epoch": 10.04, + "learning_rate": 3.151454363089268e-05, + "loss": 2.2559, + "theoretical_loss": 3.3128311971521676, + "tokens_seen": 3095091200 + }, + { + "epoch": 10.04, + "learning_rate": 3.150451354062187e-05, + "loss": 2.3157, + "theoretical_loss": 3.3128258380163578, + "tokens_seen": 3095156736 + }, + { + "epoch": 10.04, + "learning_rate": 3.1494483450351055e-05, + "loss": 2.442, + "theoretical_loss": 3.3128204790257914, + "tokens_seen": 3095222272 + }, + { + "epoch": 10.04, + "learning_rate": 3.148445336008024e-05, + "loss": 2.5221, + "theoretical_loss": 3.3128151201804616, + "tokens_seen": 3095287808 + }, + { + "epoch": 10.04, + "learning_rate": 3.1474423269809424e-05, + "loss": 2.5105, + "theoretical_loss": 3.312809761480361, + "tokens_seen": 3095353344 + }, + { + "epoch": 10.04, + "learning_rate": 3.146439317953861e-05, + "loss": 2.3617, + "theoretical_loss": 3.312804402925482, + "tokens_seen": 3095418880 + }, + { + "epoch": 10.04, + "learning_rate": 3.14543630892678e-05, + "loss": 2.6471, + "theoretical_loss": 3.3127990445158195, + "tokens_seen": 3095484416 + }, + { + "epoch": 10.04, + "learning_rate": 3.144433299899699e-05, + "loss": 2.5026, + "theoretical_loss": 3.3127936862513643, + "tokens_seen": 3095549952 + }, + { + "epoch": 10.04, + "learning_rate": 3.143430290872618e-05, + "loss": 2.5164, + "theoretical_loss": 3.3127883281321107, + "tokens_seen": 3095615488 + }, + { + "epoch": 10.04, + "learning_rate": 3.142427281845537e-05, + "loss": 2.4717, + "theoretical_loss": 3.312782970158051, + "tokens_seen": 3095681024 + }, + { + "epoch": 10.04, + "learning_rate": 3.141424272818456e-05, + "loss": 2.5332, + "theoretical_loss": 3.312777612329179, + "tokens_seen": 3095746560 + }, + { + "epoch": 10.04, + "learning_rate": 3.1404212637913746e-05, + "loss": 2.7525, + "theoretical_loss": 3.312772254645487, + "tokens_seen": 3095812096 + }, + { + "epoch": 10.04, + "learning_rate": 3.1394182547642934e-05, + "loss": 2.4624, + "theoretical_loss": 3.312766897106968, + "tokens_seen": 3095877632 + }, + { + "epoch": 10.04, + "learning_rate": 3.138415245737212e-05, + "loss": 2.484, + "theoretical_loss": 3.312761539713616, + "tokens_seen": 3095943168 + }, + { + "epoch": 10.04, + "learning_rate": 3.13741223671013e-05, + "loss": 2.3582, + "theoretical_loss": 3.3127561824654226, + "tokens_seen": 3096008704 + }, + { + "epoch": 10.04, + "learning_rate": 3.136409227683049e-05, + "loss": 2.3348, + "theoretical_loss": 3.3127508253623814, + "tokens_seen": 3096074240 + }, + { + "epoch": 10.04, + "learning_rate": 3.135406218655968e-05, + "loss": 2.5681, + "theoretical_loss": 3.3127454684044855, + "tokens_seen": 3096139776 + }, + { + "epoch": 10.04, + "learning_rate": 3.134403209628887e-05, + "loss": 2.6399, + "theoretical_loss": 3.3127401115917277, + "tokens_seen": 3096205312 + }, + { + "epoch": 10.04, + "learning_rate": 3.1334002006018055e-05, + "loss": 2.5287, + "theoretical_loss": 3.312734754924101, + "tokens_seen": 3096270848 + }, + { + "epoch": 10.04, + "learning_rate": 3.132397191574724e-05, + "loss": 2.4524, + "theoretical_loss": 3.312729398401599, + "tokens_seen": 3096336384 + }, + { + "epoch": 10.04, + "learning_rate": 3.131394182547643e-05, + "loss": 2.6562, + "theoretical_loss": 3.3127240420242137, + "tokens_seen": 3096401920 + }, + { + "epoch": 10.04, + "learning_rate": 3.130391173520562e-05, + "loss": 2.6076, + "theoretical_loss": 3.312718685791938, + "tokens_seen": 3096467456 + }, + { + "epoch": 10.04, + "learning_rate": 3.1293881644934806e-05, + "loss": 2.4496, + "theoretical_loss": 3.3127133297047666, + "tokens_seen": 3096532992 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3416213, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1190850734710693, + "objective/train/theoretical_loss": 3.312710651715592, + "objective/train/tokens_used": 3117025760, + "theoretical_loss": 3.312710651715592, + "tokens_seen": 3096565760 + }, + { + "epoch": 10.04, + "learning_rate": 3.128385155466399e-05, + "loss": 2.3823, + "theoretical_loss": 3.3127079737626905, + "tokens_seen": 3096598528 + }, + { + "epoch": 10.04, + "learning_rate": 3.1273821464393175e-05, + "loss": 2.4318, + "theoretical_loss": 3.312702617965704, + "tokens_seen": 3096664064 + }, + { + "epoch": 10.04, + "learning_rate": 3.126379137412236e-05, + "loss": 2.4752, + "theoretical_loss": 3.3126972623138, + "tokens_seen": 3096729600 + }, + { + "epoch": 10.04, + "learning_rate": 3.125376128385155e-05, + "loss": 2.4794, + "theoretical_loss": 3.312691906806971, + "tokens_seen": 3096795136 + }, + { + "epoch": 10.04, + "learning_rate": 3.1243731193580746e-05, + "loss": 2.5256, + "theoretical_loss": 3.3126865514452097, + "tokens_seen": 3096860672 + }, + { + "epoch": 10.04, + "learning_rate": 3.123370110330993e-05, + "loss": 2.3237, + "theoretical_loss": 3.3126811962285094, + "tokens_seen": 3096926208 + }, + { + "epoch": 10.04, + "learning_rate": 3.1223671013039115e-05, + "loss": 2.3792, + "theoretical_loss": 3.3126758411568638, + "tokens_seen": 3096991744 + }, + { + "epoch": 10.04, + "learning_rate": 3.12136409227683e-05, + "loss": 2.3838, + "theoretical_loss": 3.3126704862302656, + "tokens_seen": 3097057280 + }, + { + "epoch": 10.04, + "learning_rate": 3.120361083249749e-05, + "loss": 2.5841, + "theoretical_loss": 3.312665131448707, + "tokens_seen": 3097122816 + }, + { + "epoch": 10.04, + "learning_rate": 3.1193580742226685e-05, + "loss": 2.5121, + "theoretical_loss": 3.3126597768121817, + "tokens_seen": 3097188352 + }, + { + "epoch": 10.04, + "learning_rate": 3.1183550651955866e-05, + "loss": 2.1914, + "theoretical_loss": 3.3126544223206826, + "tokens_seen": 3097253888 + }, + { + "epoch": 10.04, + "learning_rate": 3.1173520561685054e-05, + "loss": 2.5276, + "theoretical_loss": 3.3126490679742027, + "tokens_seen": 3097319424 + }, + { + "epoch": 10.04, + "learning_rate": 3.116349047141424e-05, + "loss": 2.358, + "theoretical_loss": 3.3126437137727356, + "tokens_seen": 3097384960 + }, + { + "epoch": 10.04, + "learning_rate": 3.115346038114343e-05, + "loss": 2.5591, + "theoretical_loss": 3.312638359716273, + "tokens_seen": 3097450496 + }, + { + "epoch": 10.04, + "learning_rate": 3.114343029087262e-05, + "loss": 2.4552, + "theoretical_loss": 3.3126330058048086, + "tokens_seen": 3097516032 + }, + { + "epoch": 10.04, + "learning_rate": 3.1133400200601806e-05, + "loss": 2.4734, + "theoretical_loss": 3.3126276520383353, + "tokens_seen": 3097581568 + }, + { + "epoch": 10.04, + "learning_rate": 3.1123370110330994e-05, + "loss": 2.5505, + "theoretical_loss": 3.3126222984168465, + "tokens_seen": 3097647104 + }, + { + "epoch": 10.04, + "learning_rate": 3.111334002006018e-05, + "loss": 2.5862, + "theoretical_loss": 3.312616944940335, + "tokens_seen": 3097712640 + }, + { + "epoch": 10.04, + "learning_rate": 3.110330992978937e-05, + "loss": 2.4057, + "theoretical_loss": 3.3126115916087935, + "tokens_seen": 3097778176 + }, + { + "epoch": 10.04, + "learning_rate": 3.109327983951856e-05, + "loss": 2.3359, + "theoretical_loss": 3.312606238422215, + "tokens_seen": 3097843712 + }, + { + "epoch": 10.04, + "learning_rate": 3.1083249749247745e-05, + "loss": 2.4622, + "theoretical_loss": 3.312600885380593, + "tokens_seen": 3097909248 + }, + { + "epoch": 10.04, + "learning_rate": 3.107321965897693e-05, + "loss": 2.2161, + "theoretical_loss": 3.3125955324839205, + "tokens_seen": 3097974784 + }, + { + "epoch": 10.04, + "learning_rate": 3.106318956870612e-05, + "loss": 2.2061, + "theoretical_loss": 3.31259017973219, + "tokens_seen": 3098040320 + }, + { + "epoch": 10.04, + "learning_rate": 3.105315947843531e-05, + "loss": 2.2929, + "theoretical_loss": 3.3125848271253946, + "tokens_seen": 3098105856 + }, + { + "epoch": 10.04, + "learning_rate": 3.10431293881645e-05, + "loss": 2.3317, + "theoretical_loss": 3.312579474663528, + "tokens_seen": 3098171392 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3417091, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4227871894836426, + "objective/train/theoretical_loss": 3.31257679848694, + "objective/train/tokens_used": 3118664160, + "theoretical_loss": 3.31257679848694, + "tokens_seen": 3098204160 + }, + { + "epoch": 10.04, + "learning_rate": 3.103309929789368e-05, + "loss": 2.4553, + "theoretical_loss": 3.3125741223465823, + "tokens_seen": 3098236928 + }, + { + "epoch": 10.04, + "learning_rate": 3.1023069207622866e-05, + "loss": 2.5381, + "theoretical_loss": 3.3125687701745505, + "tokens_seen": 3098302464 + }, + { + "epoch": 10.04, + "learning_rate": 3.101303911735206e-05, + "loss": 2.4688, + "theoretical_loss": 3.312563418147427, + "tokens_seen": 3098368000 + }, + { + "epoch": 10.04, + "learning_rate": 3.100300902708125e-05, + "loss": 2.4201, + "theoretical_loss": 3.3125580662652028, + "tokens_seen": 3098433536 + }, + { + "epoch": 10.04, + "learning_rate": 3.099297893681043e-05, + "loss": 2.4919, + "theoretical_loss": 3.3125527145278726, + "tokens_seen": 3098499072 + }, + { + "epoch": 10.04, + "learning_rate": 3.098294884653962e-05, + "loss": 2.6477, + "theoretical_loss": 3.3125473629354283, + "tokens_seen": 3098564608 + }, + { + "epoch": 10.04, + "learning_rate": 3.0972918756268805e-05, + "loss": 2.5506, + "theoretical_loss": 3.3125420114878636, + "tokens_seen": 3098630144 + }, + { + "epoch": 10.04, + "learning_rate": 3.096288866599799e-05, + "loss": 2.5042, + "theoretical_loss": 3.312536660185171, + "tokens_seen": 3098695680 + }, + { + "epoch": 10.04, + "learning_rate": 3.095285857572718e-05, + "loss": 2.5289, + "theoretical_loss": 3.312531309027344, + "tokens_seen": 3098761216 + }, + { + "epoch": 10.04, + "learning_rate": 3.094282848545637e-05, + "loss": 2.3948, + "theoretical_loss": 3.3125259580143753, + "tokens_seen": 3098826752 + }, + { + "epoch": 10.04, + "learning_rate": 3.093279839518556e-05, + "loss": 2.4213, + "theoretical_loss": 3.3125206071462583, + "tokens_seen": 3098892288 + }, + { + "epoch": 10.04, + "learning_rate": 3.0922768304914745e-05, + "loss": 2.5545, + "theoretical_loss": 3.3125152564229854, + "tokens_seen": 3098957824 + }, + { + "epoch": 10.04, + "learning_rate": 3.091273821464393e-05, + "loss": 2.2965, + "theoretical_loss": 3.31250990584455, + "tokens_seen": 3099023360 + }, + { + "epoch": 10.04, + "learning_rate": 3.090270812437312e-05, + "loss": 2.4226, + "theoretical_loss": 3.3125045554109454, + "tokens_seen": 3099088896 + }, + { + "epoch": 10.04, + "learning_rate": 3.089267803410231e-05, + "loss": 2.3421, + "theoretical_loss": 3.3124992051221636, + "tokens_seen": 3099154432 + }, + { + "epoch": 10.04, + "learning_rate": 3.0882647943831496e-05, + "loss": 2.3601, + "theoretical_loss": 3.312493854978199, + "tokens_seen": 3099219968 + }, + { + "epoch": 10.04, + "learning_rate": 3.0872617853560684e-05, + "loss": 2.5493, + "theoretical_loss": 3.3124885049790436, + "tokens_seen": 3099285504 + }, + { + "epoch": 10.04, + "learning_rate": 3.086258776328987e-05, + "loss": 2.3656, + "theoretical_loss": 3.3124831551246907, + "tokens_seen": 3099351040 + }, + { + "epoch": 10.04, + "learning_rate": 3.085255767301906e-05, + "loss": 2.4347, + "theoretical_loss": 3.312477805415133, + "tokens_seen": 3099416576 + }, + { + "epoch": 10.04, + "learning_rate": 3.084252758274824e-05, + "loss": 2.3979, + "theoretical_loss": 3.3124724558503646, + "tokens_seen": 3099482112 + }, + { + "epoch": 10.04, + "learning_rate": 3.083249749247743e-05, + "loss": 2.4757, + "theoretical_loss": 3.3124671064303772, + "tokens_seen": 3099547648 + }, + { + "epoch": 10.04, + "learning_rate": 3.0822467402206624e-05, + "loss": 2.3624, + "theoretical_loss": 3.312461757155165, + "tokens_seen": 3099613184 + }, + { + "epoch": 10.04, + "learning_rate": 3.081243731193581e-05, + "loss": 2.4091, + "theoretical_loss": 3.31245640802472, + "tokens_seen": 3099678720 + }, + { + "epoch": 10.04, + "learning_rate": 3.0802407221665e-05, + "loss": 2.3447, + "theoretical_loss": 3.3124510590390357, + "tokens_seen": 3099744256 + }, + { + "epoch": 10.04, + "learning_rate": 3.079237713139418e-05, + "loss": 2.4282, + "theoretical_loss": 3.312445710198105, + "tokens_seen": 3099809792 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3418461, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5260910987854004, + "objective/train/theoretical_loss": 3.3124430358319206, + "objective/train/tokens_used": 3120302560, + "theoretical_loss": 3.3124430358319206, + "tokens_seen": 3099842560 + }, + { + "epoch": 10.04, + "learning_rate": 3.078234704112337e-05, + "loss": 2.435, + "theoretical_loss": 3.3124403615019213, + "tokens_seen": 3099875328 + }, + { + "epoch": 10.04, + "learning_rate": 3.0772316950852556e-05, + "loss": 2.2558, + "theoretical_loss": 3.312435012950477, + "tokens_seen": 3099940864 + }, + { + "epoch": 10.04, + "learning_rate": 3.0762286860581744e-05, + "loss": 2.7155, + "theoretical_loss": 3.3124296645437656, + "tokens_seen": 3100006400 + }, + { + "epoch": 10.04, + "learning_rate": 3.075225677031093e-05, + "loss": 2.3403, + "theoretical_loss": 3.31242431628178, + "tokens_seen": 3100071936 + }, + { + "epoch": 10.04, + "learning_rate": 3.074222668004012e-05, + "loss": 2.2553, + "theoretical_loss": 3.312418968164513, + "tokens_seen": 3100137472 + }, + { + "epoch": 10.04, + "learning_rate": 3.073219658976931e-05, + "loss": 2.3789, + "theoretical_loss": 3.312413620191958, + "tokens_seen": 3100203008 + }, + { + "epoch": 10.04, + "learning_rate": 3.0722166499498496e-05, + "loss": 2.5119, + "theoretical_loss": 3.3124082723641077, + "tokens_seen": 3100268544 + }, + { + "epoch": 10.04, + "learning_rate": 3.0712136409227684e-05, + "loss": 2.4024, + "theoretical_loss": 3.3124029246809554, + "tokens_seen": 3100334080 + }, + { + "epoch": 10.04, + "learning_rate": 3.070210631895687e-05, + "loss": 2.3751, + "theoretical_loss": 3.312397577142494, + "tokens_seen": 3100399616 + }, + { + "epoch": 10.04, + "learning_rate": 3.069207622868606e-05, + "loss": 2.5359, + "theoretical_loss": 3.3123922297487165, + "tokens_seen": 3100465152 + }, + { + "epoch": 10.04, + "learning_rate": 3.068204613841525e-05, + "loss": 2.396, + "theoretical_loss": 3.3123868824996157, + "tokens_seen": 3100530688 + }, + { + "epoch": 10.04, + "learning_rate": 3.0672016048144435e-05, + "loss": 2.4007, + "theoretical_loss": 3.3123815353951853, + "tokens_seen": 3100596224 + }, + { + "epoch": 10.04, + "learning_rate": 3.066198595787362e-05, + "loss": 2.3571, + "theoretical_loss": 3.312376188435418, + "tokens_seen": 3100661760 + }, + { + "epoch": 10.04, + "learning_rate": 3.0651955867602804e-05, + "loss": 2.4219, + "theoretical_loss": 3.3123708416203064, + "tokens_seen": 3100727296 + }, + { + "epoch": 10.04, + "learning_rate": 3.064192577733199e-05, + "loss": 2.538, + "theoretical_loss": 3.3123654949498436, + "tokens_seen": 3100792832 + }, + { + "epoch": 10.04, + "learning_rate": 3.063189568706119e-05, + "loss": 2.5585, + "theoretical_loss": 3.3123601484240233, + "tokens_seen": 3100858368 + }, + { + "epoch": 10.04, + "learning_rate": 3.0621865596790375e-05, + "loss": 2.0188, + "theoretical_loss": 3.312354802042838, + "tokens_seen": 3100923904 + }, + { + "epoch": 10.04, + "learning_rate": 3.061183550651956e-05, + "loss": 2.4028, + "theoretical_loss": 3.312349455806281, + "tokens_seen": 3100989440 + }, + { + "epoch": 10.04, + "learning_rate": 3.0601805416248744e-05, + "loss": 2.5524, + "theoretical_loss": 3.3123441097143447, + "tokens_seen": 3101054976 + }, + { + "epoch": 10.04, + "learning_rate": 3.059177532597793e-05, + "loss": 2.1697, + "theoretical_loss": 3.312338763767023, + "tokens_seen": 3101120512 + }, + { + "epoch": 10.04, + "learning_rate": 3.058174523570712e-05, + "loss": 2.3091, + "theoretical_loss": 3.3123334179643087, + "tokens_seen": 3101186048 + }, + { + "epoch": 10.04, + "learning_rate": 3.057171514543631e-05, + "loss": 2.4595, + "theoretical_loss": 3.3123280723061943, + "tokens_seen": 3101251584 + }, + { + "epoch": 10.04, + "learning_rate": 3.0561685055165495e-05, + "loss": 2.0685, + "theoretical_loss": 3.312322726792673, + "tokens_seen": 3101317120 + }, + { + "epoch": 10.04, + "learning_rate": 3.055165496489468e-05, + "loss": 2.4286, + "theoretical_loss": 3.312317381423739, + "tokens_seen": 3101382656 + }, + { + "epoch": 10.04, + "learning_rate": 3.054162487462387e-05, + "loss": 2.4138, + "theoretical_loss": 3.3123120361993834, + "tokens_seen": 3101448192 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3419081, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8503341674804688, + "objective/train/theoretical_loss": 3.312309363641421, + "objective/train/tokens_used": 3121940960, + "theoretical_loss": 3.312309363641421, + "tokens_seen": 3101480960 + }, + { + "epoch": 10.04, + "learning_rate": 3.053159478435306e-05, + "loss": 2.5211, + "theoretical_loss": 3.312306691119601, + "tokens_seen": 3101513728 + }, + { + "epoch": 10.04, + "learning_rate": 3.052156469408225e-05, + "loss": 2.4615, + "theoretical_loss": 3.3123013461843835, + "tokens_seen": 3101579264 + }, + { + "epoch": 10.04, + "learning_rate": 3.0511534603811435e-05, + "loss": 2.3851, + "theoretical_loss": 3.3122960013937246, + "tokens_seen": 3101644800 + }, + { + "epoch": 10.04, + "learning_rate": 3.0501504513540623e-05, + "loss": 2.1795, + "theoretical_loss": 3.312290656747617, + "tokens_seen": 3101710336 + }, + { + "epoch": 10.04, + "learning_rate": 3.049147442326981e-05, + "loss": 2.4857, + "theoretical_loss": 3.312285312246055, + "tokens_seen": 3101775872 + }, + { + "epoch": 10.04, + "learning_rate": 3.0481444332998995e-05, + "loss": 2.4923, + "theoretical_loss": 3.3122799678890296, + "tokens_seen": 3101841408 + }, + { + "epoch": 10.04, + "learning_rate": 3.0471414242728183e-05, + "loss": 2.3771, + "theoretical_loss": 3.312274623676535, + "tokens_seen": 3101906944 + }, + { + "epoch": 10.04, + "learning_rate": 3.046138415245737e-05, + "loss": 2.5609, + "theoretical_loss": 3.312269279608564, + "tokens_seen": 3101972480 + }, + { + "epoch": 10.04, + "learning_rate": 3.0451354062186562e-05, + "loss": 2.4496, + "theoretical_loss": 3.31226393568511, + "tokens_seen": 3102038016 + }, + { + "epoch": 10.04, + "learning_rate": 3.044132397191575e-05, + "loss": 2.4774, + "theoretical_loss": 3.312258591906166, + "tokens_seen": 3102103552 + }, + { + "epoch": 10.04, + "learning_rate": 3.0431293881644935e-05, + "loss": 2.4831, + "theoretical_loss": 3.3122532482717237, + "tokens_seen": 3102169088 + }, + { + "epoch": 10.04, + "learning_rate": 3.0421263791374123e-05, + "loss": 2.5025, + "theoretical_loss": 3.312247904781778, + "tokens_seen": 3102234624 + }, + { + "epoch": 10.04, + "learning_rate": 3.041123370110331e-05, + "loss": 2.261, + "theoretical_loss": 3.312242561436321, + "tokens_seen": 3102300160 + }, + { + "epoch": 10.04, + "learning_rate": 3.04012036108325e-05, + "loss": 2.5285, + "theoretical_loss": 3.312237218235346, + "tokens_seen": 3102365696 + }, + { + "epoch": 10.04, + "learning_rate": 3.0391173520561686e-05, + "loss": 2.4162, + "theoretical_loss": 3.312231875178846, + "tokens_seen": 3102431232 + }, + { + "epoch": 10.04, + "learning_rate": 3.038114343029087e-05, + "loss": 2.6302, + "theoretical_loss": 3.312226532266814, + "tokens_seen": 3102496768 + }, + { + "epoch": 10.04, + "learning_rate": 3.037111334002006e-05, + "loss": 2.1939, + "theoretical_loss": 3.312221189499243, + "tokens_seen": 3102562304 + }, + { + "epoch": 10.04, + "learning_rate": 3.0361083249749247e-05, + "loss": 2.4979, + "theoretical_loss": 3.312215846876126, + "tokens_seen": 3102627840 + }, + { + "epoch": 10.04, + "learning_rate": 3.0351053159478438e-05, + "loss": 2.4967, + "theoretical_loss": 3.312210504397456, + "tokens_seen": 3102693376 + }, + { + "epoch": 10.04, + "learning_rate": 3.0341023069207626e-05, + "loss": 2.5304, + "theoretical_loss": 3.3122051620632265, + "tokens_seen": 3102758912 + }, + { + "epoch": 10.04, + "learning_rate": 3.033099297893681e-05, + "loss": 2.2712, + "theoretical_loss": 3.31219981987343, + "tokens_seen": 3102824448 + }, + { + "epoch": 10.04, + "learning_rate": 3.0320962888665998e-05, + "loss": 2.3356, + "theoretical_loss": 3.3121944778280596, + "tokens_seen": 3102889984 + }, + { + "epoch": 10.04, + "learning_rate": 3.0310932798395186e-05, + "loss": 2.4988, + "theoretical_loss": 3.3121891359271087, + "tokens_seen": 3102955520 + }, + { + "epoch": 10.04, + "learning_rate": 3.0300902708124374e-05, + "loss": 2.6705, + "theoretical_loss": 3.3121837941705703, + "tokens_seen": 3103021056 + }, + { + "epoch": 10.04, + "learning_rate": 3.0290872617853562e-05, + "loss": 2.3689, + "theoretical_loss": 3.3121784525584372, + "tokens_seen": 3103086592 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3420574, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.42258358001709, + "objective/train/theoretical_loss": 3.3121757818065203, + "objective/train/tokens_used": 3123579360, + "theoretical_loss": 3.3121757818065203, + "tokens_seen": 3103119360 + }, + { + "epoch": 10.04, + "learning_rate": 3.0280842527582746e-05, + "loss": 2.4429, + "theoretical_loss": 3.3121731110907024, + "tokens_seen": 3103152128 + }, + { + "epoch": 10.04, + "learning_rate": 3.0270812437311934e-05, + "loss": 2.3221, + "theoretical_loss": 3.312167769767359, + "tokens_seen": 3103217664 + }, + { + "epoch": 10.04, + "learning_rate": 3.0260782347041126e-05, + "loss": 2.4804, + "theoretical_loss": 3.3121624285884, + "tokens_seen": 3103283200 + }, + { + "epoch": 10.04, + "learning_rate": 3.0250752256770314e-05, + "loss": 2.546, + "theoretical_loss": 3.312157087553819, + "tokens_seen": 3103348736 + }, + { + "epoch": 10.04, + "learning_rate": 3.02407221664995e-05, + "loss": 2.5379, + "theoretical_loss": 3.3121517466636083, + "tokens_seen": 3103414272 + }, + { + "epoch": 10.04, + "learning_rate": 3.0230692076228686e-05, + "loss": 2.4021, + "theoretical_loss": 3.3121464059177614, + "tokens_seen": 3103479808 + }, + { + "epoch": 10.04, + "learning_rate": 3.0220661985957874e-05, + "loss": 2.1929, + "theoretical_loss": 3.312141065316271, + "tokens_seen": 3103545344 + }, + { + "epoch": 10.04, + "learning_rate": 3.0210631895687062e-05, + "loss": 2.587, + "theoretical_loss": 3.312135724859131, + "tokens_seen": 3103610880 + }, + { + "epoch": 10.04, + "learning_rate": 3.020060180541625e-05, + "loss": 2.2096, + "theoretical_loss": 3.3121303845463332, + "tokens_seen": 3103676416 + }, + { + "epoch": 10.04, + "learning_rate": 3.0190571715145438e-05, + "loss": 2.5672, + "theoretical_loss": 3.3121250443778716, + "tokens_seen": 3103741952 + }, + { + "epoch": 10.04, + "learning_rate": 3.0180541624874622e-05, + "loss": 2.5142, + "theoretical_loss": 3.312119704353739, + "tokens_seen": 3103807488 + }, + { + "epoch": 10.04, + "learning_rate": 3.0170511534603813e-05, + "loss": 2.4766, + "theoretical_loss": 3.312114364473928, + "tokens_seen": 3103873024 + }, + { + "epoch": 10.04, + "learning_rate": 3.0160481444333e-05, + "loss": 2.5733, + "theoretical_loss": 3.3121090247384326, + "tokens_seen": 3103938560 + }, + { + "epoch": 10.04, + "learning_rate": 3.015045135406219e-05, + "loss": 2.1872, + "theoretical_loss": 3.3121036851472447, + "tokens_seen": 3104004096 + }, + { + "epoch": 10.04, + "learning_rate": 3.0140421263791377e-05, + "loss": 2.4178, + "theoretical_loss": 3.3120983457003583, + "tokens_seen": 3104069632 + }, + { + "epoch": 10.04, + "learning_rate": 3.013039117352056e-05, + "loss": 2.1264, + "theoretical_loss": 3.312093006397766, + "tokens_seen": 3104135168 + }, + { + "epoch": 10.04, + "learning_rate": 3.012036108324975e-05, + "loss": 2.4058, + "theoretical_loss": 3.312087667239461, + "tokens_seen": 3104200704 + }, + { + "epoch": 10.04, + "learning_rate": 3.0110330992978937e-05, + "loss": 2.4999, + "theoretical_loss": 3.312082328225436, + "tokens_seen": 3104266240 + }, + { + "epoch": 10.04, + "learning_rate": 3.0100300902708125e-05, + "loss": 2.274, + "theoretical_loss": 3.3120769893556847, + "tokens_seen": 3104331776 + }, + { + "epoch": 10.04, + "learning_rate": 3.009027081243731e-05, + "loss": 2.3568, + "theoretical_loss": 3.3120716506301995, + "tokens_seen": 3104397312 + }, + { + "epoch": 10.04, + "learning_rate": 3.0080240722166498e-05, + "loss": 2.5152, + "theoretical_loss": 3.312066312048974, + "tokens_seen": 3104462848 + }, + { + "epoch": 10.04, + "learning_rate": 3.007021063189569e-05, + "loss": 2.5661, + "theoretical_loss": 3.312060973612001, + "tokens_seen": 3104528384 + }, + { + "epoch": 10.04, + "learning_rate": 3.0060180541624877e-05, + "loss": 2.4392, + "theoretical_loss": 3.3120556353192736, + "tokens_seen": 3104593920 + }, + { + "epoch": 10.04, + "learning_rate": 3.0050150451354065e-05, + "loss": 2.5594, + "theoretical_loss": 3.3120502971707846, + "tokens_seen": 3104659456 + }, + { + "epoch": 10.04, + "learning_rate": 3.004012036108325e-05, + "loss": 2.4397, + "theoretical_loss": 3.3120449591665277, + "tokens_seen": 3104724992 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3421029, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3807170391082764, + "objective/train/theoretical_loss": 3.312042290218484, + "objective/train/tokens_used": 3125217760, + "theoretical_loss": 3.312042290218484, + "tokens_seen": 3104757760 + }, + { + "epoch": 10.04, + "learning_rate": 3.0030090270812437e-05, + "loss": 2.5072, + "theoretical_loss": 3.312039621306495, + "tokens_seen": 3104790528 + }, + { + "epoch": 10.04, + "learning_rate": 3.0020060180541625e-05, + "loss": 2.4592, + "theoretical_loss": 3.3120342835906804, + "tokens_seen": 3104856064 + }, + { + "epoch": 10.04, + "learning_rate": 3.0010030090270813e-05, + "loss": 2.6162, + "theoretical_loss": 3.312028946019077, + "tokens_seen": 3104921600 + }, + { + "epoch": 10.04, + "learning_rate": 3e-05, + "loss": 2.3166, + "theoretical_loss": 3.3120236085916774, + "tokens_seen": 3104987136 + }, + { + "epoch": 10.04, + "learning_rate": 2.9989969909729185e-05, + "loss": 2.5289, + "theoretical_loss": 3.3120182713084745, + "tokens_seen": 3105052672 + }, + { + "epoch": 10.04, + "learning_rate": 2.9979939819458377e-05, + "loss": 2.4598, + "theoretical_loss": 3.312012934169462, + "tokens_seen": 3105118208 + }, + { + "epoch": 10.04, + "learning_rate": 2.9969909729187564e-05, + "loss": 2.3613, + "theoretical_loss": 3.3120075971746323, + "tokens_seen": 3105183744 + }, + { + "epoch": 10.04, + "learning_rate": 2.9959879638916752e-05, + "loss": 2.3265, + "theoretical_loss": 3.312002260323979, + "tokens_seen": 3105249280 + }, + { + "epoch": 10.04, + "learning_rate": 2.994984954864594e-05, + "loss": 2.5764, + "theoretical_loss": 3.3119969236174946, + "tokens_seen": 3105314816 + }, + { + "epoch": 10.04, + "learning_rate": 2.994984954864594e-05, + "loss": 2.2804, + "theoretical_loss": 3.311991587055173, + "tokens_seen": 3105380352 + }, + { + "epoch": 10.04, + "learning_rate": 2.9939819458375125e-05, + "loss": 2.2122, + "theoretical_loss": 3.3119862506370064, + "tokens_seen": 3105445888 + }, + { + "epoch": 10.04, + "learning_rate": 2.9929789368104313e-05, + "loss": 2.4062, + "theoretical_loss": 3.311980914362988, + "tokens_seen": 3105511424 + }, + { + "epoch": 10.04, + "learning_rate": 2.99197592778335e-05, + "loss": 2.4796, + "theoretical_loss": 3.3119755782331115, + "tokens_seen": 3105576960 + }, + { + "epoch": 10.04, + "learning_rate": 2.990972918756269e-05, + "loss": 2.453, + "theoretical_loss": 3.31197024224737, + "tokens_seen": 3105642496 + }, + { + "epoch": 10.04, + "learning_rate": 2.9899699097291876e-05, + "loss": 2.4816, + "theoretical_loss": 3.3119649064057555, + "tokens_seen": 3105708032 + }, + { + "epoch": 10.04, + "learning_rate": 2.988966900702106e-05, + "loss": 2.3253, + "theoretical_loss": 3.311959570708262, + "tokens_seen": 3105773568 + }, + { + "epoch": 10.04, + "learning_rate": 2.9879638916750252e-05, + "loss": 2.4397, + "theoretical_loss": 3.311954235154882, + "tokens_seen": 3105839104 + }, + { + "epoch": 10.04, + "learning_rate": 2.986960882647944e-05, + "loss": 2.6029, + "theoretical_loss": 3.311948899745609, + "tokens_seen": 3105904640 + }, + { + "epoch": 10.04, + "learning_rate": 2.9859578736208628e-05, + "loss": 2.3439, + "theoretical_loss": 3.3119435644804356, + "tokens_seen": 3105970176 + }, + { + "epoch": 10.04, + "learning_rate": 2.9849548645937816e-05, + "loss": 2.5882, + "theoretical_loss": 3.311938229359355, + "tokens_seen": 3106035712 + }, + { + "epoch": 10.04, + "learning_rate": 2.9839518555667e-05, + "loss": 2.3994, + "theoretical_loss": 3.311932894382361, + "tokens_seen": 3106101248 + }, + { + "epoch": 10.04, + "learning_rate": 2.9829488465396188e-05, + "loss": 2.5601, + "theoretical_loss": 3.311927559549446, + "tokens_seen": 3106166784 + }, + { + "epoch": 10.04, + "learning_rate": 2.9819458375125376e-05, + "loss": 2.4752, + "theoretical_loss": 3.311922224860603, + "tokens_seen": 3106232320 + }, + { + "epoch": 10.04, + "learning_rate": 2.9809428284854564e-05, + "loss": 2.4227, + "theoretical_loss": 3.3119168903158256, + "tokens_seen": 3106297856 + }, + { + "epoch": 10.04, + "learning_rate": 2.9799398194583752e-05, + "loss": 2.1711, + "theoretical_loss": 3.311911555915106, + "tokens_seen": 3106363392 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3422603, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9425172805786133, + "objective/train/theoretical_loss": 3.311908888768766, + "objective/train/tokens_used": 3126856160, + "theoretical_loss": 3.311908888768766, + "tokens_seen": 3106396160 + }, + { + "epoch": 10.04, + "learning_rate": 2.978936810431294e-05, + "loss": 2.5878, + "theoretical_loss": 3.311906221658438, + "tokens_seen": 3106428928 + }, + { + "epoch": 10.04, + "learning_rate": 2.9779338014042128e-05, + "loss": 2.368, + "theoretical_loss": 3.3119008875458142, + "tokens_seen": 3106494464 + }, + { + "epoch": 10.04, + "learning_rate": 2.9769307923771316e-05, + "loss": 2.5454, + "theoretical_loss": 3.3118955535772283, + "tokens_seen": 3106560000 + }, + { + "epoch": 10.04, + "learning_rate": 2.9759277833500504e-05, + "loss": 2.5229, + "theoretical_loss": 3.311890219752673, + "tokens_seen": 3106625536 + }, + { + "epoch": 10.04, + "learning_rate": 2.974924774322969e-05, + "loss": 2.3856, + "theoretical_loss": 3.311884886072141, + "tokens_seen": 3106691072 + }, + { + "epoch": 10.04, + "learning_rate": 2.9739217652958876e-05, + "loss": 2.496, + "theoretical_loss": 3.311879552535626, + "tokens_seen": 3106756608 + }, + { + "epoch": 10.04, + "learning_rate": 2.9729187562688064e-05, + "loss": 2.0749, + "theoretical_loss": 3.311874219143121, + "tokens_seen": 3106822144 + }, + { + "epoch": 10.04, + "learning_rate": 2.9719157472417252e-05, + "loss": 2.5068, + "theoretical_loss": 3.3118688858946186, + "tokens_seen": 3106887680 + }, + { + "epoch": 10.04, + "learning_rate": 2.970912738214644e-05, + "loss": 2.4708, + "theoretical_loss": 3.311863552790112, + "tokens_seen": 3106953216 + }, + { + "epoch": 10.04, + "learning_rate": 2.9699097291875627e-05, + "loss": 2.4425, + "theoretical_loss": 3.3118582198295945, + "tokens_seen": 3107018752 + }, + { + "epoch": 10.04, + "learning_rate": 2.9689067201604815e-05, + "loss": 2.5026, + "theoretical_loss": 3.3118528870130595, + "tokens_seen": 3107084288 + }, + { + "epoch": 10.04, + "learning_rate": 2.9679037111334003e-05, + "loss": 2.2808, + "theoretical_loss": 3.311847554340499, + "tokens_seen": 3107149824 + }, + { + "epoch": 10.04, + "learning_rate": 2.966900702106319e-05, + "loss": 2.4051, + "theoretical_loss": 3.3118422218119075, + "tokens_seen": 3107215360 + }, + { + "epoch": 10.04, + "learning_rate": 2.965897693079238e-05, + "loss": 2.2314, + "theoretical_loss": 3.311836889427277, + "tokens_seen": 3107280896 + }, + { + "epoch": 10.04, + "learning_rate": 2.9648946840521564e-05, + "loss": 2.4999, + "theoretical_loss": 3.3118315571866006, + "tokens_seen": 3107346432 + }, + { + "epoch": 10.04, + "learning_rate": 2.963891675025075e-05, + "loss": 2.45, + "theoretical_loss": 3.3118262250898725, + "tokens_seen": 3107411968 + }, + { + "epoch": 10.04, + "learning_rate": 2.962888665997994e-05, + "loss": 2.443, + "theoretical_loss": 3.3118208931370843, + "tokens_seen": 3107477504 + }, + { + "epoch": 10.04, + "learning_rate": 2.9618856569709127e-05, + "loss": 2.5216, + "theoretical_loss": 3.31181556132823, + "tokens_seen": 3107543040 + }, + { + "epoch": 10.04, + "learning_rate": 2.9608826479438315e-05, + "loss": 2.5696, + "theoretical_loss": 3.311810229663302, + "tokens_seen": 3107608576 + }, + { + "epoch": 10.04, + "learning_rate": 2.9598796389167503e-05, + "loss": 2.5441, + "theoretical_loss": 3.3118048981422943, + "tokens_seen": 3107674112 + }, + { + "epoch": 10.04, + "learning_rate": 2.958876629889669e-05, + "loss": 2.6383, + "theoretical_loss": 3.3117995667652, + "tokens_seen": 3107739648 + }, + { + "epoch": 10.04, + "learning_rate": 2.957873620862588e-05, + "loss": 2.4137, + "theoretical_loss": 3.3117942355320107, + "tokens_seen": 3107805184 + }, + { + "epoch": 10.04, + "learning_rate": 2.9568706118355067e-05, + "loss": 2.47, + "theoretical_loss": 3.3117889044427207, + "tokens_seen": 3107870720 + }, + { + "epoch": 10.04, + "learning_rate": 2.9558676028084255e-05, + "loss": 2.3895, + "theoretical_loss": 3.311783573497323, + "tokens_seen": 3107936256 + }, + { + "epoch": 10.04, + "learning_rate": 2.954864593781344e-05, + "loss": 2.3479, + "theoretical_loss": 3.3117782426958104, + "tokens_seen": 3108001792 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3423330, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.746774911880493, + "objective/train/theoretical_loss": 3.311775577349009, + "objective/train/tokens_used": 3128494560, + "theoretical_loss": 3.311775577349009, + "tokens_seen": 3108034560 + }, + { + "epoch": 10.04, + "learning_rate": 2.9538615847542627e-05, + "loss": 2.5478, + "theoretical_loss": 3.311772912038176, + "tokens_seen": 3108067328 + }, + { + "epoch": 10.04, + "learning_rate": 2.9528585757271815e-05, + "loss": 2.6253, + "theoretical_loss": 3.311767581524413, + "tokens_seen": 3108132864 + }, + { + "epoch": 10.04, + "learning_rate": 2.9518555667001003e-05, + "loss": 2.3932, + "theoretical_loss": 3.311762251154515, + "tokens_seen": 3108198400 + }, + { + "epoch": 10.04, + "learning_rate": 2.9508525576730194e-05, + "loss": 2.3184, + "theoretical_loss": 3.311756920928474, + "tokens_seen": 3108263936 + }, + { + "epoch": 10.04, + "learning_rate": 2.949849548645938e-05, + "loss": 2.4488, + "theoretical_loss": 3.311751590846284, + "tokens_seen": 3108329472 + }, + { + "epoch": 10.04, + "learning_rate": 2.9488465396188567e-05, + "loss": 2.4283, + "theoretical_loss": 3.3117462609079373, + "tokens_seen": 3108395008 + }, + { + "epoch": 10.04, + "learning_rate": 2.9478435305917754e-05, + "loss": 2.3396, + "theoretical_loss": 3.3117409311134276, + "tokens_seen": 3108460544 + }, + { + "epoch": 10.04, + "learning_rate": 2.9468405215646942e-05, + "loss": 2.3704, + "theoretical_loss": 3.3117356014627477, + "tokens_seen": 3108526080 + }, + { + "epoch": 10.04, + "learning_rate": 2.945837512537613e-05, + "loss": 2.3475, + "theoretical_loss": 3.311730271955891, + "tokens_seen": 3108591616 + }, + { + "epoch": 10.04, + "learning_rate": 2.9448345035105315e-05, + "loss": 2.503, + "theoretical_loss": 3.3117249425928503, + "tokens_seen": 3108657152 + }, + { + "epoch": 10.04, + "learning_rate": 2.9438314944834503e-05, + "loss": 2.5426, + "theoretical_loss": 3.3117196133736186, + "tokens_seen": 3108722688 + }, + { + "epoch": 10.04, + "learning_rate": 2.942828485456369e-05, + "loss": 2.3644, + "theoretical_loss": 3.3117142842981893, + "tokens_seen": 3108788224 + }, + { + "epoch": 10.04, + "learning_rate": 2.9418254764292882e-05, + "loss": 2.5639, + "theoretical_loss": 3.311708955366555, + "tokens_seen": 3108853760 + }, + { + "epoch": 10.04, + "learning_rate": 2.940822467402207e-05, + "loss": 2.4622, + "theoretical_loss": 3.3117036265787094, + "tokens_seen": 3108919296 + }, + { + "epoch": 10.04, + "learning_rate": 2.9398194583751254e-05, + "loss": 2.3738, + "theoretical_loss": 3.311698297934645, + "tokens_seen": 3108984832 + }, + { + "epoch": 10.04, + "learning_rate": 2.9388164493480442e-05, + "loss": 2.3649, + "theoretical_loss": 3.3116929694343558, + "tokens_seen": 3109050368 + }, + { + "epoch": 10.04, + "learning_rate": 2.937813440320963e-05, + "loss": 2.5263, + "theoretical_loss": 3.311687641077834, + "tokens_seen": 3109115904 + }, + { + "epoch": 10.04, + "learning_rate": 2.9368104312938818e-05, + "loss": 2.4045, + "theoretical_loss": 3.3116823128650728, + "tokens_seen": 3109181440 + }, + { + "epoch": 10.04, + "learning_rate": 2.9358074222668002e-05, + "loss": 2.5598, + "theoretical_loss": 3.3116769847960654, + "tokens_seen": 3109246976 + }, + { + "epoch": 10.04, + "learning_rate": 2.934804413239719e-05, + "loss": 2.6152, + "theoretical_loss": 3.3116716568708053, + "tokens_seen": 3109312512 + }, + { + "epoch": 10.04, + "learning_rate": 2.9338014042126378e-05, + "loss": 2.2535, + "theoretical_loss": 3.3116663290892854, + "tokens_seen": 3109378048 + }, + { + "epoch": 10.04, + "learning_rate": 2.9327983951855566e-05, + "loss": 2.3055, + "theoretical_loss": 3.311661001451498, + "tokens_seen": 3109443584 + }, + { + "epoch": 10.04, + "learning_rate": 2.9317953861584757e-05, + "loss": 2.4243, + "theoretical_loss": 3.3116556739574374, + "tokens_seen": 3109509120 + }, + { + "epoch": 10.04, + "learning_rate": 2.9307923771313942e-05, + "loss": 2.3431, + "theoretical_loss": 3.3116503466070957, + "tokens_seen": 3109574656 + }, + { + "epoch": 10.04, + "learning_rate": 2.929789368104313e-05, + "loss": 2.6822, + "theoretical_loss": 3.3116450194004665, + "tokens_seen": 3109640192 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3424632, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.626723527908325, + "objective/train/theoretical_loss": 3.311642355851042, + "objective/train/tokens_used": 3130132960, + "theoretical_loss": 3.311642355851042, + "tokens_seen": 3109672960 + }, + { + "epoch": 10.04, + "learning_rate": 2.9287863590772318e-05, + "loss": 2.4549, + "theoretical_loss": 3.3116396923375433, + "tokens_seen": 3109705728 + }, + { + "epoch": 10.04, + "learning_rate": 2.9277833500501506e-05, + "loss": 2.438, + "theoretical_loss": 3.311634365418318, + "tokens_seen": 3109771264 + }, + { + "epoch": 10.04, + "learning_rate": 2.9267803410230693e-05, + "loss": 2.3913, + "theoretical_loss": 3.311629038642785, + "tokens_seen": 3109836800 + }, + { + "epoch": 10.04, + "learning_rate": 2.9257773319959878e-05, + "loss": 2.3752, + "theoretical_loss": 3.3116237120109364, + "tokens_seen": 3109902336 + }, + { + "epoch": 10.04, + "learning_rate": 2.9247743229689066e-05, + "loss": 2.3391, + "theoretical_loss": 3.311618385522766, + "tokens_seen": 3109967872 + }, + { + "epoch": 10.04, + "learning_rate": 2.9237713139418254e-05, + "loss": 2.5325, + "theoretical_loss": 3.311613059178266, + "tokens_seen": 3110033408 + }, + { + "epoch": 10.04, + "learning_rate": 2.9227683049147445e-05, + "loss": 2.3852, + "theoretical_loss": 3.311607732977431, + "tokens_seen": 3110098944 + }, + { + "epoch": 10.04, + "learning_rate": 2.9217652958876633e-05, + "loss": 2.3059, + "theoretical_loss": 3.3116024069202528, + "tokens_seen": 3110164480 + }, + { + "epoch": 10.04, + "learning_rate": 2.9207622868605817e-05, + "loss": 2.4896, + "theoretical_loss": 3.311597081006725, + "tokens_seen": 3110230016 + }, + { + "epoch": 10.04, + "learning_rate": 2.9197592778335005e-05, + "loss": 2.1022, + "theoretical_loss": 3.31159175523684, + "tokens_seen": 3110295552 + }, + { + "epoch": 10.04, + "learning_rate": 2.9187562688064193e-05, + "loss": 2.3706, + "theoretical_loss": 3.311586429610592, + "tokens_seen": 3110361088 + }, + { + "epoch": 10.04, + "learning_rate": 2.917753259779338e-05, + "loss": 2.4351, + "theoretical_loss": 3.3115811041279737, + "tokens_seen": 3110426624 + }, + { + "epoch": 10.04, + "learning_rate": 2.916750250752257e-05, + "loss": 2.528, + "theoretical_loss": 3.3115757787889777, + "tokens_seen": 3110492160 + }, + { + "epoch": 10.04, + "learning_rate": 2.9157472417251754e-05, + "loss": 2.2245, + "theoretical_loss": 3.311570453593598, + "tokens_seen": 3110557696 + }, + { + "epoch": 10.04, + "learning_rate": 2.914744232698094e-05, + "loss": 2.6069, + "theoretical_loss": 3.311565128541827, + "tokens_seen": 3110623232 + }, + { + "epoch": 10.04, + "learning_rate": 2.9137412236710133e-05, + "loss": 2.4212, + "theoretical_loss": 3.3115598036336573, + "tokens_seen": 3110688768 + }, + { + "epoch": 10.04, + "learning_rate": 2.912738214643932e-05, + "loss": 2.4506, + "theoretical_loss": 3.3115544788690836, + "tokens_seen": 3110754304 + }, + { + "epoch": 10.04, + "learning_rate": 2.911735205616851e-05, + "loss": 2.3259, + "theoretical_loss": 3.3115491542480977, + "tokens_seen": 3110819840 + }, + { + "epoch": 10.04, + "learning_rate": 2.9107321965897693e-05, + "loss": 2.552, + "theoretical_loss": 3.311543829770693, + "tokens_seen": 3110885376 + }, + { + "epoch": 10.04, + "learning_rate": 2.909729187562688e-05, + "loss": 2.6241, + "theoretical_loss": 3.311538505436863, + "tokens_seen": 3110950912 + }, + { + "epoch": 10.04, + "learning_rate": 2.908726178535607e-05, + "loss": 2.4626, + "theoretical_loss": 3.3115331812466002, + "tokens_seen": 3111016448 + }, + { + "epoch": 10.04, + "learning_rate": 2.9077231695085257e-05, + "loss": 2.5521, + "theoretical_loss": 3.3115278571998985, + "tokens_seen": 3111081984 + }, + { + "epoch": 10.04, + "learning_rate": 2.9067201604814445e-05, + "loss": 2.389, + "theoretical_loss": 3.31152253329675, + "tokens_seen": 3111147520 + }, + { + "epoch": 10.04, + "learning_rate": 2.905717151454363e-05, + "loss": 2.3162, + "theoretical_loss": 3.311517209537149, + "tokens_seen": 3111213056 + }, + { + "epoch": 10.04, + "learning_rate": 2.9047141424272817e-05, + "loss": 2.5647, + "theoretical_loss": 3.311511885921087, + "tokens_seen": 3111278592 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3425335, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.799713134765625, + "objective/train/theoretical_loss": 3.3115092241668815, + "objective/train/tokens_used": 3131771360, + "theoretical_loss": 3.3115092241668815, + "tokens_seen": 3111311360 + }, + { + "epoch": 10.04, + "learning_rate": 2.903711133400201e-05, + "loss": 2.5716, + "theoretical_loss": 3.3115065624485585, + "tokens_seen": 3111344128 + }, + { + "epoch": 10.04, + "learning_rate": 2.9027081243731196e-05, + "loss": 2.3056, + "theoretical_loss": 3.311501239119556, + "tokens_seen": 3111409664 + }, + { + "epoch": 10.04, + "learning_rate": 2.9017051153460384e-05, + "loss": 2.58, + "theoretical_loss": 3.3114959159340733, + "tokens_seen": 3111475200 + }, + { + "epoch": 10.04, + "learning_rate": 2.900702106318957e-05, + "loss": 2.4223, + "theoretical_loss": 3.3114905928921026, + "tokens_seen": 3111540736 + }, + { + "epoch": 10.04, + "learning_rate": 2.8996990972918757e-05, + "loss": 2.4622, + "theoretical_loss": 3.311485269993637, + "tokens_seen": 3111606272 + }, + { + "epoch": 10.04, + "learning_rate": 2.8986960882647944e-05, + "loss": 2.4818, + "theoretical_loss": 3.31147994723867, + "tokens_seen": 3111671808 + }, + { + "epoch": 10.04, + "learning_rate": 2.8976930792377132e-05, + "loss": 2.482, + "theoretical_loss": 3.3114746246271953, + "tokens_seen": 3111737344 + }, + { + "epoch": 10.04, + "learning_rate": 2.8966900702106317e-05, + "loss": 2.5483, + "theoretical_loss": 3.311469302159205, + "tokens_seen": 3111802880 + }, + { + "epoch": 10.04, + "learning_rate": 2.8956870611835505e-05, + "loss": 2.5272, + "theoretical_loss": 3.3114639798346928, + "tokens_seen": 3111868416 + }, + { + "epoch": 10.04, + "learning_rate": 2.8946840521564696e-05, + "loss": 2.4041, + "theoretical_loss": 3.3114586576536515, + "tokens_seen": 3111933952 + }, + { + "epoch": 10.04, + "learning_rate": 2.8936810431293884e-05, + "loss": 2.4184, + "theoretical_loss": 3.3114533356160742, + "tokens_seen": 3111999488 + }, + { + "epoch": 10.04, + "learning_rate": 2.8926780341023072e-05, + "loss": 2.2874, + "theoretical_loss": 3.311448013721954, + "tokens_seen": 3112065024 + }, + { + "epoch": 10.04, + "learning_rate": 2.8916750250752256e-05, + "loss": 2.5807, + "theoretical_loss": 3.3114426919712847, + "tokens_seen": 3112130560 + }, + { + "epoch": 10.04, + "learning_rate": 2.8906720160481444e-05, + "loss": 2.1262, + "theoretical_loss": 3.3114373703640587, + "tokens_seen": 3112196096 + }, + { + "epoch": 10.04, + "learning_rate": 2.8896690070210632e-05, + "loss": 2.3422, + "theoretical_loss": 3.311432048900269, + "tokens_seen": 3112261632 + }, + { + "epoch": 10.04, + "learning_rate": 2.888665997993982e-05, + "loss": 2.4825, + "theoretical_loss": 3.311426727579909, + "tokens_seen": 3112327168 + }, + { + "epoch": 10.04, + "learning_rate": 2.8876629889669008e-05, + "loss": 2.5147, + "theoretical_loss": 3.311421406402972, + "tokens_seen": 3112392704 + }, + { + "epoch": 10.04, + "learning_rate": 2.8866599799398192e-05, + "loss": 2.2756, + "theoretical_loss": 3.311416085369451, + "tokens_seen": 3112458240 + }, + { + "epoch": 10.04, + "learning_rate": 2.8856569709127384e-05, + "loss": 2.3815, + "theoretical_loss": 3.311410764479339, + "tokens_seen": 3112523776 + }, + { + "epoch": 10.04, + "learning_rate": 2.884653961885657e-05, + "loss": 2.326, + "theoretical_loss": 3.311405443732629, + "tokens_seen": 3112589312 + }, + { + "epoch": 10.04, + "learning_rate": 2.883650952858576e-05, + "loss": 2.5237, + "theoretical_loss": 3.311400123129314, + "tokens_seen": 3112654848 + }, + { + "epoch": 10.04, + "learning_rate": 2.8826479438314947e-05, + "loss": 2.5457, + "theoretical_loss": 3.311394802669388, + "tokens_seen": 3112720384 + }, + { + "epoch": 10.04, + "learning_rate": 2.8816449348044132e-05, + "loss": 2.4079, + "theoretical_loss": 3.311389482352843, + "tokens_seen": 3112785920 + }, + { + "epoch": 10.04, + "learning_rate": 2.880641925777332e-05, + "loss": 2.3996, + "theoretical_loss": 3.3113841621796727, + "tokens_seen": 3112851456 + }, + { + "epoch": 10.04, + "learning_rate": 2.8796389167502508e-05, + "loss": 2.3798, + "theoretical_loss": 3.31137884214987, + "tokens_seen": 3112916992 + }, + { + "debugging/Self-BLEU-5": 0.47984579323109716, + "debugging/distinct-1-grams": 0.7888668622339603, + "debugging/distinct-2-grams": 0.9642621582434204, + "debugging/entropy-1-grams": 6.070768265651113, + "debugging/entropy-2-grams": 7.080874440702655, + "debugging/length": 472.88235294117646, + "debugging/num_segments": 17, + "debugging/score": 0.005389080755428918, + "debugging/score_std": 0.0038289633201055314, + "epoch": 10.04, + "objective/train/docs_used": 3426769, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7713265419006348, + "objective/train/theoretical_loss": 3.31137618218873, + "objective/train/tokens_used": 3133409760, + "theoretical_loss": 3.31137618218873, + "tokens_seen": 3112949760 + }, + { + "epoch": 10.04, + "learning_rate": 2.8786359077231696e-05, + "loss": 2.5141, + "theoretical_loss": 3.3113735222634286, + "tokens_seen": 3112982528 + }, + { + "epoch": 10.04, + "learning_rate": 2.8776328986960883e-05, + "loss": 2.2362, + "theoretical_loss": 3.3113682025203413, + "tokens_seen": 3113048064 + }, + { + "epoch": 10.04, + "learning_rate": 2.8766298896690068e-05, + "loss": 2.5746, + "theoretical_loss": 3.3113628829206005, + "tokens_seen": 3113113600 + }, + { + "epoch": 10.04, + "learning_rate": 2.875626880641926e-05, + "loss": 2.5746, + "theoretical_loss": 3.3113575634642003, + "tokens_seen": 3113179136 + }, + { + "epoch": 10.04, + "learning_rate": 2.8746238716148447e-05, + "loss": 2.4021, + "theoretical_loss": 3.3113522441511334, + "tokens_seen": 3113244672 + }, + { + "epoch": 10.04, + "learning_rate": 2.8736208625877635e-05, + "loss": 2.5343, + "theoretical_loss": 3.3113469249813927, + "tokens_seen": 3113310208 + }, + { + "epoch": 10.04, + "learning_rate": 2.8726178535606823e-05, + "loss": 2.4428, + "theoretical_loss": 3.311341605954972, + "tokens_seen": 3113375744 + }, + { + "epoch": 10.04, + "learning_rate": 2.8716148445336007e-05, + "loss": 2.3714, + "theoretical_loss": 3.3113362870718634, + "tokens_seen": 3113441280 + }, + { + "epoch": 10.04, + "learning_rate": 2.8706118355065195e-05, + "loss": 2.2169, + "theoretical_loss": 3.311330968332061, + "tokens_seen": 3113506816 + }, + { + "epoch": 10.04, + "learning_rate": 2.8696088264794383e-05, + "loss": 2.4593, + "theoretical_loss": 3.3113256497355574, + "tokens_seen": 3113572352 + }, + { + "epoch": 10.04, + "learning_rate": 2.868605817452357e-05, + "loss": 2.6356, + "theoretical_loss": 3.311320331282346, + "tokens_seen": 3113637888 + }, + { + "epoch": 10.04, + "learning_rate": 2.867602808425276e-05, + "loss": 2.459, + "theoretical_loss": 3.3113150129724196, + "tokens_seen": 3113703424 + }, + { + "epoch": 10.04, + "learning_rate": 2.8665997993981947e-05, + "loss": 2.5559, + "theoretical_loss": 3.3113096948057716, + "tokens_seen": 3113768960 + }, + { + "epoch": 10.04, + "learning_rate": 2.8655967903711135e-05, + "loss": 2.5258, + "theoretical_loss": 3.3113043767823953, + "tokens_seen": 3113834496 + }, + { + "epoch": 10.04, + "learning_rate": 2.8645937813440323e-05, + "loss": 2.6469, + "theoretical_loss": 3.311299058902283, + "tokens_seen": 3113900032 + }, + { + "epoch": 10.04, + "learning_rate": 2.863590772316951e-05, + "loss": 2.5, + "theoretical_loss": 3.311293741165429, + "tokens_seen": 3113965568 + }, + { + "epoch": 10.04, + "learning_rate": 2.8625877632898695e-05, + "loss": 2.406, + "theoretical_loss": 3.3112884235718254, + "tokens_seen": 3114031104 + }, + { + "epoch": 10.04, + "learning_rate": 2.8615847542627883e-05, + "loss": 2.3198, + "theoretical_loss": 3.311283106121466, + "tokens_seen": 3114096640 + }, + { + "epoch": 10.04, + "learning_rate": 2.860581745235707e-05, + "loss": 2.3487, + "theoretical_loss": 3.311277788814343, + "tokens_seen": 3114162176 + }, + { + "epoch": 10.04, + "learning_rate": 2.859578736208626e-05, + "loss": 2.4453, + "theoretical_loss": 3.311272471650451, + "tokens_seen": 3114227712 + }, + { + "epoch": 10.04, + "learning_rate": 2.8585757271815447e-05, + "loss": 2.5327, + "theoretical_loss": 3.311267154629782, + "tokens_seen": 3114293248 + }, + { + "epoch": 10.04, + "learning_rate": 2.857572718154463e-05, + "loss": 2.4162, + "theoretical_loss": 3.3112618377523297, + "tokens_seen": 3114358784 + }, + { + "epoch": 10.04, + "learning_rate": 2.8565697091273823e-05, + "loss": 2.4963, + "theoretical_loss": 3.3112565210180867, + "tokens_seen": 3114424320 + }, + { + "epoch": 10.04, + "learning_rate": 2.855566700100301e-05, + "loss": 2.5558, + "theoretical_loss": 3.3112512044270463, + "tokens_seen": 3114489856 + }, + { + "epoch": 10.04, + "learning_rate": 2.85456369107322e-05, + "loss": 2.3255, + "theoretical_loss": 3.3112458879792017, + "tokens_seen": 3114555392 + }, + { + "epoch": 10.04, + "objective/train/docs_used": 3427609, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.226684093475342, + "objective/train/theoretical_loss": 3.311243229808976, + "objective/train/tokens_used": 3135048160, + "theoretical_loss": 3.311243229808976, + "tokens_seen": 3114588160 + }, + { + "epoch": 10.04, + "learning_rate": 2.8535606820461386e-05, + "loss": 2.0259, + "theoretical_loss": 3.3112405716745466, + "tokens_seen": 3114620928 + }, + { + "epoch": 10.04, + "learning_rate": 2.852557673019057e-05, + "loss": 2.4342, + "theoretical_loss": 3.3112352555130733, + "tokens_seen": 3114686464 + }, + { + "epoch": 10.04, + "learning_rate": 2.851554663991976e-05, + "loss": 2.3487, + "theoretical_loss": 3.311229939494775, + "tokens_seen": 3114752000 + }, + { + "epoch": 10.04, + "learning_rate": 2.8505516549648947e-05, + "loss": 2.468, + "theoretical_loss": 3.311224623619645, + "tokens_seen": 3114817536 + }, + { + "epoch": 10.04, + "learning_rate": 2.8495486459378134e-05, + "loss": 2.2766, + "theoretical_loss": 3.311219307887677, + "tokens_seen": 3114883072 + }, + { + "epoch": 10.04, + "learning_rate": 2.8485456369107322e-05, + "loss": 2.5703, + "theoretical_loss": 3.3112139922988635, + "tokens_seen": 3114948608 + }, + { + "epoch": 10.04, + "learning_rate": 2.847542627883651e-05, + "loss": 2.393, + "theoretical_loss": 3.311208676853197, + "tokens_seen": 3115014144 + }, + { + "epoch": 10.04, + "learning_rate": 2.8465396188565698e-05, + "loss": 2.4015, + "theoretical_loss": 3.3112033615506724, + "tokens_seen": 3115079680 + }, + { + "epoch": 10.04, + "learning_rate": 2.8455366098294886e-05, + "loss": 2.3735, + "theoretical_loss": 3.3111980463912816, + "tokens_seen": 3115145216 + }, + { + "epoch": 10.04, + "learning_rate": 2.8445336008024074e-05, + "loss": 2.5463, + "theoretical_loss": 3.3111927313750176, + "tokens_seen": 3115210752 + }, + { + "epoch": 10.04, + "learning_rate": 2.8435305917753262e-05, + "loss": 2.7243, + "theoretical_loss": 3.311187416501874, + "tokens_seen": 3115276288 + }, + { + "epoch": 10.04, + "learning_rate": 2.8425275827482446e-05, + "loss": 2.4343, + "theoretical_loss": 3.311182101771844, + "tokens_seen": 3115341824 + }, + { + "epoch": 10.04, + "learning_rate": 2.8415245737211634e-05, + "loss": 2.3333, + "theoretical_loss": 3.3111767871849205, + "tokens_seen": 3115407360 + }, + { + "epoch": 10.04, + "learning_rate": 2.8405215646940822e-05, + "loss": 2.3903, + "theoretical_loss": 3.311171472741097, + "tokens_seen": 3115472896 + }, + { + "epoch": 10.04, + "learning_rate": 2.839518555667001e-05, + "loss": 2.1823, + "theoretical_loss": 3.3111661584403658, + "tokens_seen": 3115538432 + }, + { + "epoch": 10.04, + "learning_rate": 2.83851554663992e-05, + "loss": 2.3823, + "theoretical_loss": 3.3111608442827203, + "tokens_seen": 3115603968 + }, + { + "epoch": 10.04, + "learning_rate": 2.8375125376128386e-05, + "loss": 2.367, + "theoretical_loss": 3.3111555302681546, + "tokens_seen": 3115669504 + }, + { + "epoch": 10.04, + "learning_rate": 2.8365095285857574e-05, + "loss": 2.3761, + "theoretical_loss": 3.311150216396661, + "tokens_seen": 3115735040 + }, + { + "epoch": 10.04, + "learning_rate": 2.835506519558676e-05, + "loss": 2.5754, + "theoretical_loss": 3.3111449026682327, + "tokens_seen": 3115800576 + }, + { + "epoch": 10.04, + "learning_rate": 2.834503510531595e-05, + "loss": 2.4385, + "theoretical_loss": 3.3111395890828628, + "tokens_seen": 3115866112 + }, + { + "epoch": 10.04, + "learning_rate": 2.8335005015045137e-05, + "loss": 2.6575, + "theoretical_loss": 3.311134275640545, + "tokens_seen": 3115931648 + }, + { + "epoch": 10.04, + "learning_rate": 2.8324974924774322e-05, + "loss": 2.3424, + "theoretical_loss": 3.3111289623412716, + "tokens_seen": 3115997184 + }, + { + "epoch": 10.04, + "learning_rate": 2.831494483450351e-05, + "loss": 2.3698, + "theoretical_loss": 3.311123649185036, + "tokens_seen": 3116062720 + }, + { + "epoch": 10.05, + "learning_rate": 2.8304914744232698e-05, + "loss": 2.5363, + "theoretical_loss": 3.311118336171832, + "tokens_seen": 3116128256 + }, + { + "epoch": 10.05, + "learning_rate": 2.8294884653961886e-05, + "loss": 2.6515, + "theoretical_loss": 3.311113023301652, + "tokens_seen": 3116193792 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3429033, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0541279315948486, + "objective/train/theoretical_loss": 3.311110366920194, + "objective/train/tokens_used": 3136686560, + "theoretical_loss": 3.311110366920194, + "tokens_seen": 3116226560 + }, + { + "epoch": 10.05, + "learning_rate": 2.8284854563691077e-05, + "loss": 2.4431, + "theoretical_loss": 3.3111077105744893, + "tokens_seen": 3116259328 + }, + { + "epoch": 10.05, + "learning_rate": 2.827482447342026e-05, + "loss": 2.5919, + "theoretical_loss": 3.3111023979903376, + "tokens_seen": 3116324864 + }, + { + "epoch": 10.05, + "learning_rate": 2.826479438314945e-05, + "loss": 2.2878, + "theoretical_loss": 3.311097085549189, + "tokens_seen": 3116390400 + }, + { + "epoch": 10.05, + "learning_rate": 2.8254764292878637e-05, + "loss": 2.4581, + "theoretical_loss": 3.311091773251037, + "tokens_seen": 3116455936 + }, + { + "epoch": 10.05, + "learning_rate": 2.8244734202607825e-05, + "loss": 2.1951, + "theoretical_loss": 3.3110864610958757, + "tokens_seen": 3116521472 + }, + { + "epoch": 10.05, + "learning_rate": 2.823470411233701e-05, + "loss": 2.3887, + "theoretical_loss": 3.311081149083697, + "tokens_seen": 3116587008 + }, + { + "epoch": 10.05, + "learning_rate": 2.8224674022066197e-05, + "loss": 2.4564, + "theoretical_loss": 3.311075837214495, + "tokens_seen": 3116652544 + }, + { + "epoch": 10.05, + "learning_rate": 2.8214643931795385e-05, + "loss": 2.1855, + "theoretical_loss": 3.3110705254882618, + "tokens_seen": 3116718080 + }, + { + "epoch": 10.05, + "learning_rate": 2.8204613841524573e-05, + "loss": 2.2774, + "theoretical_loss": 3.311065213904991, + "tokens_seen": 3116783616 + }, + { + "epoch": 10.05, + "learning_rate": 2.8194583751253765e-05, + "loss": 2.4414, + "theoretical_loss": 3.3110599024646765, + "tokens_seen": 3116849152 + }, + { + "epoch": 10.05, + "learning_rate": 2.818455366098295e-05, + "loss": 2.3934, + "theoretical_loss": 3.31105459116731, + "tokens_seen": 3116914688 + }, + { + "epoch": 10.05, + "learning_rate": 2.8174523570712137e-05, + "loss": 2.3608, + "theoretical_loss": 3.3110492800128863, + "tokens_seen": 3116980224 + }, + { + "epoch": 10.05, + "learning_rate": 2.8164493480441325e-05, + "loss": 2.4618, + "theoretical_loss": 3.3110439690013975, + "tokens_seen": 3117045760 + }, + { + "epoch": 10.05, + "learning_rate": 2.8154463390170513e-05, + "loss": 2.5326, + "theoretical_loss": 3.311038658132837, + "tokens_seen": 3117111296 + }, + { + "epoch": 10.05, + "learning_rate": 2.81444332998997e-05, + "loss": 2.4009, + "theoretical_loss": 3.311033347407198, + "tokens_seen": 3117176832 + }, + { + "epoch": 10.05, + "learning_rate": 2.8134403209628885e-05, + "loss": 2.3392, + "theoretical_loss": 3.311028036824473, + "tokens_seen": 3117242368 + }, + { + "epoch": 10.05, + "learning_rate": 2.8124373119358073e-05, + "loss": 2.4701, + "theoretical_loss": 3.311022726384656, + "tokens_seen": 3117307904 + }, + { + "epoch": 10.05, + "learning_rate": 2.811434302908726e-05, + "loss": 2.1681, + "theoretical_loss": 3.31101741608774, + "tokens_seen": 3117373440 + }, + { + "epoch": 10.05, + "learning_rate": 2.8104312938816452e-05, + "loss": 2.6492, + "theoretical_loss": 3.311012105933718, + "tokens_seen": 3117438976 + }, + { + "epoch": 10.05, + "learning_rate": 2.809428284854564e-05, + "loss": 2.3792, + "theoretical_loss": 3.3110067959225833, + "tokens_seen": 3117504512 + }, + { + "epoch": 10.05, + "learning_rate": 2.8084252758274825e-05, + "loss": 2.5371, + "theoretical_loss": 3.3110014860543284, + "tokens_seen": 3117570048 + }, + { + "epoch": 10.05, + "learning_rate": 2.8074222668004013e-05, + "loss": 2.0911, + "theoretical_loss": 3.3109961763289477, + "tokens_seen": 3117635584 + }, + { + "epoch": 10.05, + "learning_rate": 2.80641925777332e-05, + "loss": 2.3263, + "theoretical_loss": 3.310990866746433, + "tokens_seen": 3117701120 + }, + { + "epoch": 10.05, + "learning_rate": 2.805416248746239e-05, + "loss": 2.307, + "theoretical_loss": 3.310985557306778, + "tokens_seen": 3117766656 + }, + { + "epoch": 10.05, + "learning_rate": 2.8044132397191576e-05, + "loss": 2.3844, + "theoretical_loss": 3.3109802480099764, + "tokens_seen": 3117832192 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3429764, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3558356761932373, + "objective/train/theoretical_loss": 3.310977593415143, + "objective/train/tokens_used": 3138324960, + "theoretical_loss": 3.310977593415143, + "tokens_seen": 3117864960 + }, + { + "epoch": 10.05, + "learning_rate": 2.803410230692076e-05, + "loss": 2.5409, + "theoretical_loss": 3.310974938856021, + "tokens_seen": 3117897728 + }, + { + "epoch": 10.05, + "learning_rate": 2.802407221664995e-05, + "loss": 2.5826, + "theoretical_loss": 3.3109696298449043, + "tokens_seen": 3117963264 + }, + { + "epoch": 10.05, + "learning_rate": 2.8014042126379137e-05, + "loss": 2.5307, + "theoretical_loss": 3.31096432097662, + "tokens_seen": 3118028800 + }, + { + "epoch": 10.05, + "learning_rate": 2.8004012036108328e-05, + "loss": 2.4461, + "theoretical_loss": 3.310959012251162, + "tokens_seen": 3118094336 + }, + { + "epoch": 10.05, + "learning_rate": 2.7993981945837516e-05, + "loss": 2.4836, + "theoretical_loss": 3.310953703668522, + "tokens_seen": 3118159872 + }, + { + "epoch": 10.05, + "learning_rate": 2.79839518555667e-05, + "loss": 2.4528, + "theoretical_loss": 3.310948395228694, + "tokens_seen": 3118225408 + }, + { + "epoch": 10.05, + "learning_rate": 2.7973921765295888e-05, + "loss": 2.4084, + "theoretical_loss": 3.3109430869316707, + "tokens_seen": 3118290944 + }, + { + "epoch": 10.05, + "learning_rate": 2.7963891675025076e-05, + "loss": 2.3722, + "theoretical_loss": 3.310937778777446, + "tokens_seen": 3118356480 + }, + { + "epoch": 10.05, + "learning_rate": 2.7953861584754264e-05, + "loss": 2.2927, + "theoretical_loss": 3.3109324707660126, + "tokens_seen": 3118422016 + }, + { + "epoch": 10.05, + "learning_rate": 2.7943831494483452e-05, + "loss": 2.3729, + "theoretical_loss": 3.3109271628973636, + "tokens_seen": 3118487552 + }, + { + "epoch": 10.05, + "learning_rate": 2.7933801404212636e-05, + "loss": 2.4257, + "theoretical_loss": 3.3109218551714923, + "tokens_seen": 3118553088 + }, + { + "epoch": 10.05, + "learning_rate": 2.7923771313941824e-05, + "loss": 2.5575, + "theoretical_loss": 3.3109165475883917, + "tokens_seen": 3118618624 + }, + { + "epoch": 10.05, + "learning_rate": 2.7913741223671015e-05, + "loss": 2.2293, + "theoretical_loss": 3.310911240148055, + "tokens_seen": 3118684160 + }, + { + "epoch": 10.05, + "learning_rate": 2.7903711133400203e-05, + "loss": 2.24, + "theoretical_loss": 3.310905932850476, + "tokens_seen": 3118749696 + }, + { + "epoch": 10.05, + "learning_rate": 2.7893681043129388e-05, + "loss": 2.6993, + "theoretical_loss": 3.3109006256956466, + "tokens_seen": 3118815232 + }, + { + "epoch": 10.05, + "learning_rate": 2.7883650952858576e-05, + "loss": 2.3442, + "theoretical_loss": 3.310895318683561, + "tokens_seen": 3118880768 + }, + { + "epoch": 10.05, + "learning_rate": 2.7873620862587764e-05, + "loss": 2.5364, + "theoretical_loss": 3.3108900118142115, + "tokens_seen": 3118946304 + }, + { + "epoch": 10.05, + "learning_rate": 2.786359077231695e-05, + "loss": 2.5614, + "theoretical_loss": 3.3108847050875925, + "tokens_seen": 3119011840 + }, + { + "epoch": 10.05, + "learning_rate": 2.785356068204614e-05, + "loss": 2.4339, + "theoretical_loss": 3.3108793985036957, + "tokens_seen": 3119077376 + }, + { + "epoch": 10.05, + "learning_rate": 2.7843530591775324e-05, + "loss": 2.4222, + "theoretical_loss": 3.3108740920625155, + "tokens_seen": 3119142912 + }, + { + "epoch": 10.05, + "learning_rate": 2.7833500501504512e-05, + "loss": 2.3907, + "theoretical_loss": 3.310868785764044, + "tokens_seen": 3119208448 + }, + { + "epoch": 10.05, + "learning_rate": 2.7823470411233703e-05, + "loss": 2.3737, + "theoretical_loss": 3.3108634796082757, + "tokens_seen": 3119273984 + }, + { + "epoch": 10.05, + "learning_rate": 2.781344032096289e-05, + "loss": 2.4992, + "theoretical_loss": 3.3108581735952027, + "tokens_seen": 3119339520 + }, + { + "epoch": 10.05, + "learning_rate": 2.780341023069208e-05, + "loss": 2.5633, + "theoretical_loss": 3.3108528677248183, + "tokens_seen": 3119405056 + }, + { + "epoch": 10.05, + "learning_rate": 2.7793380140421263e-05, + "loss": 2.4666, + "theoretical_loss": 3.310847561997116, + "tokens_seen": 3119470592 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3430318, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.727386713027954, + "objective/train/theoretical_loss": 3.3108449091867684, + "objective/train/tokens_used": 3139963360, + "theoretical_loss": 3.3108449091867684, + "tokens_seen": 3119503360 + }, + { + "epoch": 10.05, + "learning_rate": 2.778335005015045e-05, + "loss": 2.467, + "theoretical_loss": 3.3108422564120885, + "tokens_seen": 3119536128 + }, + { + "epoch": 10.05, + "learning_rate": 2.777331995987964e-05, + "loss": 2.5689, + "theoretical_loss": 3.3108369509697297, + "tokens_seen": 3119601664 + }, + { + "epoch": 10.05, + "learning_rate": 2.7763289869608827e-05, + "loss": 2.4004, + "theoretical_loss": 3.310831645670032, + "tokens_seen": 3119667200 + }, + { + "epoch": 10.05, + "learning_rate": 2.7753259779338015e-05, + "loss": 2.2517, + "theoretical_loss": 3.310826340512989, + "tokens_seen": 3119732736 + }, + { + "epoch": 10.05, + "learning_rate": 2.77432296890672e-05, + "loss": 2.4573, + "theoretical_loss": 3.3108210354985936, + "tokens_seen": 3119798272 + }, + { + "epoch": 10.05, + "learning_rate": 2.7733199598796387e-05, + "loss": 2.4066, + "theoretical_loss": 3.310815730626839, + "tokens_seen": 3119863808 + }, + { + "epoch": 10.05, + "learning_rate": 2.772316950852558e-05, + "loss": 2.444, + "theoretical_loss": 3.3108104258977185, + "tokens_seen": 3119929344 + }, + { + "epoch": 10.05, + "learning_rate": 2.7713139418254767e-05, + "loss": 2.3031, + "theoretical_loss": 3.3108051213112253, + "tokens_seen": 3119994880 + }, + { + "epoch": 10.05, + "learning_rate": 2.7703109327983955e-05, + "loss": 2.55, + "theoretical_loss": 3.3107998168673527, + "tokens_seen": 3120060416 + }, + { + "epoch": 10.05, + "learning_rate": 2.769307923771314e-05, + "loss": 2.3202, + "theoretical_loss": 3.3107945125660936, + "tokens_seen": 3120125952 + }, + { + "epoch": 10.05, + "learning_rate": 2.7683049147442327e-05, + "loss": 2.6022, + "theoretical_loss": 3.3107892084074413, + "tokens_seen": 3120191488 + }, + { + "epoch": 10.05, + "learning_rate": 2.7673019057171515e-05, + "loss": 2.4436, + "theoretical_loss": 3.310783904391389, + "tokens_seen": 3120257024 + }, + { + "epoch": 10.05, + "learning_rate": 2.7662988966900703e-05, + "loss": 2.47, + "theoretical_loss": 3.3107786005179296, + "tokens_seen": 3120322560 + }, + { + "epoch": 10.05, + "learning_rate": 2.765295887662989e-05, + "loss": 2.4958, + "theoretical_loss": 3.310773296787057, + "tokens_seen": 3120388096 + }, + { + "epoch": 10.05, + "learning_rate": 2.7642928786359075e-05, + "loss": 2.4231, + "theoretical_loss": 3.3107679931987635, + "tokens_seen": 3120453632 + }, + { + "epoch": 10.05, + "learning_rate": 2.7632898696088266e-05, + "loss": 2.5099, + "theoretical_loss": 3.310762689753042, + "tokens_seen": 3120519168 + }, + { + "epoch": 10.05, + "learning_rate": 2.7622868605817454e-05, + "loss": 2.3859, + "theoretical_loss": 3.310757386449887, + "tokens_seen": 3120584704 + }, + { + "epoch": 10.05, + "learning_rate": 2.7612838515546642e-05, + "loss": 2.6081, + "theoretical_loss": 3.310752083289291, + "tokens_seen": 3120650240 + }, + { + "epoch": 10.05, + "learning_rate": 2.760280842527583e-05, + "loss": 2.468, + "theoretical_loss": 3.3107467802712467, + "tokens_seen": 3120715776 + }, + { + "epoch": 10.05, + "learning_rate": 2.7592778335005015e-05, + "loss": 2.4381, + "theoretical_loss": 3.310741477395748, + "tokens_seen": 3120781312 + }, + { + "epoch": 10.05, + "learning_rate": 2.7582748244734203e-05, + "loss": 2.4675, + "theoretical_loss": 3.3107361746627877, + "tokens_seen": 3120846848 + }, + { + "epoch": 10.05, + "learning_rate": 2.757271815446339e-05, + "loss": 2.4952, + "theoretical_loss": 3.310730872072359, + "tokens_seen": 3120912384 + }, + { + "epoch": 10.05, + "learning_rate": 2.756268806419258e-05, + "loss": 2.4707, + "theoretical_loss": 3.3107255696244557, + "tokens_seen": 3120977920 + }, + { + "epoch": 10.05, + "learning_rate": 2.7552657973921763e-05, + "loss": 2.4111, + "theoretical_loss": 3.31072026731907, + "tokens_seen": 3121043456 + }, + { + "epoch": 10.05, + "learning_rate": 2.7542627883650954e-05, + "loss": 2.5217, + "theoretical_loss": 3.3107149651561953, + "tokens_seen": 3121108992 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3430318, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.64953875541687, + "objective/train/theoretical_loss": 3.3107123141281978, + "objective/train/tokens_used": 3141601760, + "theoretical_loss": 3.3107123141281978, + "tokens_seen": 3121141760 + }, + { + "epoch": 10.05, + "learning_rate": 2.7532597793380142e-05, + "loss": 2.2556, + "theoretical_loss": 3.3107096631358255, + "tokens_seen": 3121174528 + }, + { + "epoch": 10.05, + "learning_rate": 2.752256770310933e-05, + "loss": 2.207, + "theoretical_loss": 3.3107043612579528, + "tokens_seen": 3121240064 + }, + { + "epoch": 10.05, + "learning_rate": 2.7512537612838518e-05, + "loss": 2.6128, + "theoretical_loss": 3.310699059522571, + "tokens_seen": 3121305600 + }, + { + "epoch": 10.05, + "learning_rate": 2.7502507522567702e-05, + "loss": 2.4041, + "theoretical_loss": 3.310693757929673, + "tokens_seen": 3121371136 + }, + { + "epoch": 10.05, + "learning_rate": 2.749247743229689e-05, + "loss": 2.4478, + "theoretical_loss": 3.310688456479252, + "tokens_seen": 3121436672 + }, + { + "epoch": 10.05, + "learning_rate": 2.7482447342026078e-05, + "loss": 2.4909, + "theoretical_loss": 3.3106831551713016, + "tokens_seen": 3121502208 + }, + { + "epoch": 10.05, + "learning_rate": 2.7472417251755266e-05, + "loss": 2.3637, + "theoretical_loss": 3.310677854005814, + "tokens_seen": 3121567744 + }, + { + "epoch": 10.05, + "learning_rate": 2.7462387161484454e-05, + "loss": 2.705, + "theoretical_loss": 3.310672552982784, + "tokens_seen": 3121633280 + }, + { + "epoch": 10.05, + "learning_rate": 2.745235707121364e-05, + "loss": 2.3706, + "theoretical_loss": 3.310667252102203, + "tokens_seen": 3121698816 + }, + { + "epoch": 10.05, + "learning_rate": 2.744232698094283e-05, + "loss": 2.5003, + "theoretical_loss": 3.3106619513640654, + "tokens_seen": 3121764352 + }, + { + "epoch": 10.05, + "learning_rate": 2.7432296890672018e-05, + "loss": 2.3137, + "theoretical_loss": 3.310656650768364, + "tokens_seen": 3121829888 + }, + { + "epoch": 10.05, + "learning_rate": 2.7422266800401205e-05, + "loss": 2.499, + "theoretical_loss": 3.3106513503150916, + "tokens_seen": 3121895424 + }, + { + "epoch": 10.05, + "learning_rate": 2.7412236710130393e-05, + "loss": 2.3682, + "theoretical_loss": 3.310646050004242, + "tokens_seen": 3121960960 + }, + { + "epoch": 10.05, + "learning_rate": 2.7402206619859578e-05, + "loss": 2.3137, + "theoretical_loss": 3.310640749835808, + "tokens_seen": 3122026496 + }, + { + "epoch": 10.05, + "learning_rate": 2.7392176529588766e-05, + "loss": 2.4756, + "theoretical_loss": 3.3106354498097827, + "tokens_seen": 3122092032 + }, + { + "epoch": 10.05, + "learning_rate": 2.7382146439317954e-05, + "loss": 2.6357, + "theoretical_loss": 3.31063014992616, + "tokens_seen": 3122157568 + }, + { + "epoch": 10.05, + "learning_rate": 2.737211634904714e-05, + "loss": 2.6469, + "theoretical_loss": 3.3106248501849325, + "tokens_seen": 3122223104 + }, + { + "epoch": 10.05, + "learning_rate": 2.736208625877633e-05, + "loss": 2.5674, + "theoretical_loss": 3.310619550586093, + "tokens_seen": 3122288640 + }, + { + "epoch": 10.05, + "learning_rate": 2.7352056168505517e-05, + "loss": 2.5805, + "theoretical_loss": 3.3106142511296355, + "tokens_seen": 3122354176 + }, + { + "epoch": 10.05, + "learning_rate": 2.7342026078234705e-05, + "loss": 2.5627, + "theoretical_loss": 3.310608951815553, + "tokens_seen": 3122419712 + }, + { + "epoch": 10.05, + "learning_rate": 2.7331995987963893e-05, + "loss": 2.2854, + "theoretical_loss": 3.3106036526438385, + "tokens_seen": 3122485248 + }, + { + "epoch": 10.05, + "learning_rate": 2.732196589769308e-05, + "loss": 2.388, + "theoretical_loss": 3.310598353614485, + "tokens_seen": 3122550784 + }, + { + "epoch": 10.05, + "learning_rate": 2.731193580742227e-05, + "loss": 2.4393, + "theoretical_loss": 3.3105930547274856, + "tokens_seen": 3122616320 + }, + { + "epoch": 10.05, + "learning_rate": 2.7301905717151453e-05, + "loss": 2.2944, + "theoretical_loss": 3.3105877559828345, + "tokens_seen": 3122681856 + }, + { + "epoch": 10.05, + "learning_rate": 2.729187562688064e-05, + "loss": 2.4025, + "theoretical_loss": 3.310582457380524, + "tokens_seen": 3122747392 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3431082, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6108298301696777, + "objective/train/theoretical_loss": 3.310579808132744, + "objective/train/tokens_used": 3143240160, + "theoretical_loss": 3.310579808132744, + "tokens_seen": 3122780160 + }, + { + "epoch": 10.05, + "learning_rate": 2.728184553660983e-05, + "loss": 2.5381, + "theoretical_loss": 3.310577158920547, + "tokens_seen": 3122812928 + }, + { + "epoch": 10.05, + "learning_rate": 2.7271815446339017e-05, + "loss": 2.4538, + "theoretical_loss": 3.3105718606028973, + "tokens_seen": 3122878464 + }, + { + "epoch": 10.05, + "learning_rate": 2.7261785356068205e-05, + "loss": 2.6026, + "theoretical_loss": 3.3105665624275677, + "tokens_seen": 3122944000 + }, + { + "epoch": 10.05, + "learning_rate": 2.7251755265797393e-05, + "loss": 2.3742, + "theoretical_loss": 3.3105612643945523, + "tokens_seen": 3123009536 + }, + { + "epoch": 10.05, + "learning_rate": 2.724172517552658e-05, + "loss": 2.2595, + "theoretical_loss": 3.310555966503843, + "tokens_seen": 3123075072 + }, + { + "epoch": 10.05, + "learning_rate": 2.723169508525577e-05, + "loss": 2.4174, + "theoretical_loss": 3.3105506687554342, + "tokens_seen": 3123140608 + }, + { + "epoch": 10.05, + "learning_rate": 2.7221664994984957e-05, + "loss": 2.4267, + "theoretical_loss": 3.3105453711493182, + "tokens_seen": 3123206144 + }, + { + "epoch": 10.05, + "learning_rate": 2.7211634904714145e-05, + "loss": 2.4668, + "theoretical_loss": 3.3105400736854884, + "tokens_seen": 3123271680 + }, + { + "epoch": 10.05, + "learning_rate": 2.720160481444333e-05, + "loss": 2.4096, + "theoretical_loss": 3.310534776363938, + "tokens_seen": 3123337216 + }, + { + "epoch": 10.05, + "learning_rate": 2.7191574724172517e-05, + "loss": 2.551, + "theoretical_loss": 3.3105294791846607, + "tokens_seen": 3123402752 + }, + { + "epoch": 10.05, + "learning_rate": 2.7181544633901705e-05, + "loss": 2.3537, + "theoretical_loss": 3.310524182147649, + "tokens_seen": 3123468288 + }, + { + "epoch": 10.05, + "learning_rate": 2.7171514543630893e-05, + "loss": 2.4189, + "theoretical_loss": 3.3105188852528964, + "tokens_seen": 3123533824 + }, + { + "epoch": 10.05, + "learning_rate": 2.716148445336008e-05, + "loss": 2.5211, + "theoretical_loss": 3.3105135885003962, + "tokens_seen": 3123599360 + }, + { + "epoch": 10.05, + "learning_rate": 2.715145436308927e-05, + "loss": 2.5114, + "theoretical_loss": 3.3105082918901414, + "tokens_seen": 3123664896 + }, + { + "epoch": 10.05, + "learning_rate": 2.7141424272818456e-05, + "loss": 2.4564, + "theoretical_loss": 3.3105029954221252, + "tokens_seen": 3123730432 + }, + { + "epoch": 10.05, + "learning_rate": 2.7131394182547644e-05, + "loss": 2.5763, + "theoretical_loss": 3.3104976990963406, + "tokens_seen": 3123795968 + }, + { + "epoch": 10.05, + "learning_rate": 2.7121364092276832e-05, + "loss": 2.485, + "theoretical_loss": 3.3104924029127814, + "tokens_seen": 3123861504 + }, + { + "epoch": 10.05, + "learning_rate": 2.7111334002006017e-05, + "loss": 2.3553, + "theoretical_loss": 3.3104871068714408, + "tokens_seen": 3123927040 + }, + { + "epoch": 10.05, + "learning_rate": 2.7101303911735205e-05, + "loss": 2.443, + "theoretical_loss": 3.310481810972311, + "tokens_seen": 3123992576 + }, + { + "epoch": 10.05, + "learning_rate": 2.7091273821464393e-05, + "loss": 2.6506, + "theoretical_loss": 3.310476515215386, + "tokens_seen": 3124058112 + }, + { + "epoch": 10.05, + "learning_rate": 2.708124373119358e-05, + "loss": 2.5574, + "theoretical_loss": 3.310471219600659, + "tokens_seen": 3124123648 + }, + { + "epoch": 10.05, + "learning_rate": 2.7071213640922772e-05, + "loss": 2.3534, + "theoretical_loss": 3.310465924128123, + "tokens_seen": 3124189184 + }, + { + "epoch": 10.05, + "learning_rate": 2.7061183550651956e-05, + "loss": 2.5049, + "theoretical_loss": 3.310460628797771, + "tokens_seen": 3124254720 + }, + { + "epoch": 10.05, + "learning_rate": 2.7051153460381144e-05, + "loss": 2.6133, + "theoretical_loss": 3.310455333609597, + "tokens_seen": 3124320256 + }, + { + "epoch": 10.05, + "learning_rate": 2.7041123370110332e-05, + "loss": 2.4977, + "theoretical_loss": 3.310450038563593, + "tokens_seen": 3124385792 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3432377, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1842663288116455, + "objective/train/theoretical_loss": 3.310447391093903, + "objective/train/tokens_used": 3144878560, + "theoretical_loss": 3.310447391093903, + "tokens_seen": 3124418560 + }, + { + "epoch": 10.05, + "learning_rate": 2.703109327983952e-05, + "loss": 2.52, + "theoretical_loss": 3.3104447436597533, + "tokens_seen": 3124451328 + }, + { + "epoch": 10.05, + "learning_rate": 2.7021063189568708e-05, + "loss": 2.43, + "theoretical_loss": 3.3104394488980704, + "tokens_seen": 3124516864 + }, + { + "epoch": 10.05, + "learning_rate": 2.7011033099297892e-05, + "loss": 2.4156, + "theoretical_loss": 3.310434154278538, + "tokens_seen": 3124582400 + }, + { + "epoch": 10.05, + "learning_rate": 2.700100300902708e-05, + "loss": 2.5735, + "theoretical_loss": 3.3104288598011484, + "tokens_seen": 3124647936 + }, + { + "epoch": 10.05, + "learning_rate": 2.6990972918756268e-05, + "loss": 2.4112, + "theoretical_loss": 3.310423565465896, + "tokens_seen": 3124713472 + }, + { + "epoch": 10.05, + "learning_rate": 2.6980942828485456e-05, + "loss": 2.4154, + "theoretical_loss": 3.3104182712727734, + "tokens_seen": 3124779008 + }, + { + "epoch": 10.05, + "learning_rate": 2.6970912738214647e-05, + "loss": 2.4013, + "theoretical_loss": 3.3104129772217736, + "tokens_seen": 3124844544 + }, + { + "epoch": 10.05, + "learning_rate": 2.6960882647943832e-05, + "loss": 2.4012, + "theoretical_loss": 3.3104076833128904, + "tokens_seen": 3124910080 + }, + { + "epoch": 10.05, + "learning_rate": 2.695085255767302e-05, + "loss": 2.6111, + "theoretical_loss": 3.3104023895461165, + "tokens_seen": 3124975616 + }, + { + "epoch": 10.05, + "learning_rate": 2.6940822467402208e-05, + "loss": 2.5765, + "theoretical_loss": 3.3103970959214455, + "tokens_seen": 3125041152 + }, + { + "epoch": 10.05, + "learning_rate": 2.6930792377131395e-05, + "loss": 2.4278, + "theoretical_loss": 3.31039180243887, + "tokens_seen": 3125106688 + }, + { + "epoch": 10.05, + "learning_rate": 2.6920762286860583e-05, + "loss": 2.3092, + "theoretical_loss": 3.310386509098384, + "tokens_seen": 3125172224 + }, + { + "epoch": 10.05, + "learning_rate": 2.6910732196589768e-05, + "loss": 2.4189, + "theoretical_loss": 3.31038121589998, + "tokens_seen": 3125237760 + }, + { + "epoch": 10.05, + "learning_rate": 2.6900702106318956e-05, + "loss": 2.4397, + "theoretical_loss": 3.310375922843652, + "tokens_seen": 3125303296 + }, + { + "epoch": 10.05, + "learning_rate": 2.6890672016048144e-05, + "loss": 2.2794, + "theoretical_loss": 3.310370629929392, + "tokens_seen": 3125368832 + }, + { + "epoch": 10.05, + "learning_rate": 2.6880641925777335e-05, + "loss": 2.3892, + "theoretical_loss": 3.310365337157194, + "tokens_seen": 3125434368 + }, + { + "epoch": 10.05, + "learning_rate": 2.6870611835506523e-05, + "loss": 2.3266, + "theoretical_loss": 3.3103600445270516, + "tokens_seen": 3125499904 + }, + { + "epoch": 10.05, + "learning_rate": 2.6860581745235707e-05, + "loss": 2.3426, + "theoretical_loss": 3.310354752038957, + "tokens_seen": 3125565440 + }, + { + "epoch": 10.05, + "learning_rate": 2.6850551654964895e-05, + "loss": 2.3351, + "theoretical_loss": 3.310349459692904, + "tokens_seen": 3125630976 + }, + { + "epoch": 10.05, + "learning_rate": 2.6840521564694083e-05, + "loss": 2.3588, + "theoretical_loss": 3.310344167488886, + "tokens_seen": 3125696512 + }, + { + "epoch": 10.05, + "learning_rate": 2.683049147442327e-05, + "loss": 2.53, + "theoretical_loss": 3.310338875426896, + "tokens_seen": 3125762048 + }, + { + "epoch": 10.05, + "learning_rate": 2.6820461384152456e-05, + "loss": 2.4249, + "theoretical_loss": 3.310333583506927, + "tokens_seen": 3125827584 + }, + { + "epoch": 10.05, + "learning_rate": 2.6810431293881643e-05, + "loss": 2.3246, + "theoretical_loss": 3.3103282917289722, + "tokens_seen": 3125893120 + }, + { + "epoch": 10.05, + "learning_rate": 2.680040120361083e-05, + "loss": 2.6204, + "theoretical_loss": 3.3103230000930255, + "tokens_seen": 3125958656 + }, + { + "epoch": 10.05, + "learning_rate": 2.6790371113340023e-05, + "loss": 2.4397, + "theoretical_loss": 3.3103177085990794, + "tokens_seen": 3126024192 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3433062, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6262576580047607, + "objective/train/theoretical_loss": 3.3103150629053544, + "objective/train/tokens_used": 3146516960, + "theoretical_loss": 3.3103150629053544, + "tokens_seen": 3126056960 + }, + { + "epoch": 10.05, + "learning_rate": 2.678034102306921e-05, + "loss": 2.7356, + "theoretical_loss": 3.310312417247127, + "tokens_seen": 3126089728 + }, + { + "epoch": 10.05, + "learning_rate": 2.6770310932798395e-05, + "loss": 2.4088, + "theoretical_loss": 3.310307126037162, + "tokens_seen": 3126155264 + }, + { + "epoch": 10.05, + "learning_rate": 2.6760280842527583e-05, + "loss": 2.3966, + "theoretical_loss": 3.3103018349691777, + "tokens_seen": 3126220800 + }, + { + "epoch": 10.05, + "learning_rate": 2.675025075225677e-05, + "loss": 2.4848, + "theoretical_loss": 3.3102965440431666, + "tokens_seen": 3126286336 + }, + { + "epoch": 10.05, + "learning_rate": 2.674022066198596e-05, + "loss": 2.3083, + "theoretical_loss": 3.310291253259123, + "tokens_seen": 3126351872 + }, + { + "epoch": 10.05, + "learning_rate": 2.6730190571715147e-05, + "loss": 2.3627, + "theoretical_loss": 3.310285962617039, + "tokens_seen": 3126417408 + }, + { + "epoch": 10.05, + "learning_rate": 2.672016048144433e-05, + "loss": 2.4221, + "theoretical_loss": 3.3102806721169085, + "tokens_seen": 3126482944 + }, + { + "epoch": 10.05, + "learning_rate": 2.671013039117352e-05, + "loss": 2.3042, + "theoretical_loss": 3.3102753817587245, + "tokens_seen": 3126548480 + }, + { + "epoch": 10.05, + "learning_rate": 2.6700100300902707e-05, + "loss": 2.4403, + "theoretical_loss": 3.31027009154248, + "tokens_seen": 3126614016 + }, + { + "epoch": 10.05, + "learning_rate": 2.6690070210631898e-05, + "loss": 2.4606, + "theoretical_loss": 3.3102648014681684, + "tokens_seen": 3126679552 + }, + { + "epoch": 10.05, + "learning_rate": 2.6680040120361086e-05, + "loss": 2.2928, + "theoretical_loss": 3.3102595115357833, + "tokens_seen": 3126745088 + }, + { + "epoch": 10.05, + "learning_rate": 2.667001003009027e-05, + "loss": 2.4196, + "theoretical_loss": 3.3102542217453177, + "tokens_seen": 3126810624 + }, + { + "epoch": 10.05, + "learning_rate": 2.665997993981946e-05, + "loss": 2.4621, + "theoretical_loss": 3.3102489320967643, + "tokens_seen": 3126876160 + }, + { + "epoch": 10.05, + "learning_rate": 2.6649949849548646e-05, + "loss": 2.5494, + "theoretical_loss": 3.310243642590117, + "tokens_seen": 3126941696 + }, + { + "epoch": 10.05, + "learning_rate": 2.6639919759277834e-05, + "loss": 2.5194, + "theoretical_loss": 3.3102383532253685, + "tokens_seen": 3127007232 + }, + { + "epoch": 10.05, + "learning_rate": 2.6629889669007022e-05, + "loss": 2.4065, + "theoretical_loss": 3.3102330640025124, + "tokens_seen": 3127072768 + }, + { + "epoch": 10.05, + "learning_rate": 2.6619859578736207e-05, + "loss": 2.4273, + "theoretical_loss": 3.310227774921542, + "tokens_seen": 3127138304 + }, + { + "epoch": 10.05, + "learning_rate": 2.6609829488465395e-05, + "loss": 2.4342, + "theoretical_loss": 3.31022248598245, + "tokens_seen": 3127203840 + }, + { + "epoch": 10.05, + "learning_rate": 2.6599799398194586e-05, + "loss": 2.6207, + "theoretical_loss": 3.3102171971852297, + "tokens_seen": 3127269376 + }, + { + "epoch": 10.05, + "learning_rate": 2.6589769307923774e-05, + "loss": 2.5796, + "theoretical_loss": 3.310211908529875, + "tokens_seen": 3127334912 + }, + { + "epoch": 10.05, + "learning_rate": 2.657973921765296e-05, + "loss": 2.2842, + "theoretical_loss": 3.3102066200163787, + "tokens_seen": 3127400448 + }, + { + "epoch": 10.05, + "learning_rate": 2.6569709127382146e-05, + "loss": 2.463, + "theoretical_loss": 3.3102013316447336, + "tokens_seen": 3127465984 + }, + { + "epoch": 10.05, + "learning_rate": 2.6559679037111334e-05, + "loss": 2.43, + "theoretical_loss": 3.3101960434149333, + "tokens_seen": 3127531520 + }, + { + "epoch": 10.05, + "learning_rate": 2.6549648946840522e-05, + "loss": 2.3467, + "theoretical_loss": 3.310190755326971, + "tokens_seen": 3127597056 + }, + { + "epoch": 10.05, + "learning_rate": 2.653961885656971e-05, + "loss": 2.4268, + "theoretical_loss": 3.3101854673808404, + "tokens_seen": 3127662592 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3434404, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6789207458496094, + "objective/train/theoretical_loss": 3.31018282346096, + "objective/train/tokens_used": 3148155360, + "theoretical_loss": 3.31018282346096, + "tokens_seen": 3127695360 + }, + { + "epoch": 10.05, + "learning_rate": 2.6529588766298898e-05, + "loss": 2.4647, + "theoretical_loss": 3.3101801795765344, + "tokens_seen": 3127728128 + }, + { + "epoch": 10.05, + "learning_rate": 2.6519558676028082e-05, + "loss": 2.4626, + "theoretical_loss": 3.3101748919140457, + "tokens_seen": 3127793664 + }, + { + "epoch": 10.05, + "learning_rate": 2.6509528585757274e-05, + "loss": 2.1782, + "theoretical_loss": 3.310169604393368, + "tokens_seen": 3127859200 + }, + { + "epoch": 10.05, + "learning_rate": 2.649949849548646e-05, + "loss": 2.493, + "theoretical_loss": 3.3101643170144945, + "tokens_seen": 3127924736 + }, + { + "epoch": 10.05, + "learning_rate": 2.648946840521565e-05, + "loss": 2.5067, + "theoretical_loss": 3.3101590297774184, + "tokens_seen": 3127990272 + }, + { + "epoch": 10.05, + "learning_rate": 2.6479438314944837e-05, + "loss": 2.6081, + "theoretical_loss": 3.310153742682133, + "tokens_seen": 3128055808 + }, + { + "epoch": 10.05, + "learning_rate": 2.6469408224674022e-05, + "loss": 2.3416, + "theoretical_loss": 3.310148455728631, + "tokens_seen": 3128121344 + }, + { + "epoch": 10.05, + "learning_rate": 2.645937813440321e-05, + "loss": 2.1941, + "theoretical_loss": 3.3101431689169063, + "tokens_seen": 3128186880 + }, + { + "epoch": 10.05, + "learning_rate": 2.6449348044132398e-05, + "loss": 2.5385, + "theoretical_loss": 3.310137882246952, + "tokens_seen": 3128252416 + }, + { + "epoch": 10.05, + "learning_rate": 2.6439317953861585e-05, + "loss": 2.4161, + "theoretical_loss": 3.3101325957187613, + "tokens_seen": 3128317952 + }, + { + "epoch": 10.05, + "learning_rate": 2.642928786359077e-05, + "loss": 2.5409, + "theoretical_loss": 3.310127309332327, + "tokens_seen": 3128383488 + }, + { + "epoch": 10.05, + "learning_rate": 2.6419257773319958e-05, + "loss": 2.4819, + "theoretical_loss": 3.310122023087643, + "tokens_seen": 3128449024 + }, + { + "epoch": 10.05, + "learning_rate": 2.640922768304915e-05, + "loss": 2.5534, + "theoretical_loss": 3.310116736984702, + "tokens_seen": 3128514560 + }, + { + "epoch": 10.05, + "learning_rate": 2.6399197592778337e-05, + "loss": 2.3965, + "theoretical_loss": 3.3101114510234977, + "tokens_seen": 3128580096 + }, + { + "epoch": 10.05, + "learning_rate": 2.6389167502507525e-05, + "loss": 2.4316, + "theoretical_loss": 3.3101061652040227, + "tokens_seen": 3128645632 + }, + { + "epoch": 10.05, + "learning_rate": 2.637913741223671e-05, + "loss": 2.5448, + "theoretical_loss": 3.3101008795262707, + "tokens_seen": 3128711168 + }, + { + "epoch": 10.05, + "learning_rate": 2.6369107321965897e-05, + "loss": 2.5967, + "theoretical_loss": 3.3100955939902352, + "tokens_seen": 3128776704 + }, + { + "epoch": 10.05, + "learning_rate": 2.6359077231695085e-05, + "loss": 2.4637, + "theoretical_loss": 3.3100903085959086, + "tokens_seen": 3128842240 + }, + { + "epoch": 10.05, + "learning_rate": 2.6349047141424273e-05, + "loss": 2.5118, + "theoretical_loss": 3.310085023343285, + "tokens_seen": 3128907776 + }, + { + "epoch": 10.05, + "learning_rate": 2.633901705115346e-05, + "loss": 2.5493, + "theoretical_loss": 3.310079738232357, + "tokens_seen": 3128973312 + }, + { + "epoch": 10.05, + "learning_rate": 2.6328986960882646e-05, + "loss": 2.6495, + "theoretical_loss": 3.310074453263118, + "tokens_seen": 3129038848 + }, + { + "epoch": 10.05, + "learning_rate": 2.6318956870611837e-05, + "loss": 2.3975, + "theoretical_loss": 3.3100691684355614, + "tokens_seen": 3129104384 + }, + { + "epoch": 10.05, + "learning_rate": 2.6308926780341025e-05, + "loss": 2.3681, + "theoretical_loss": 3.3100638837496805, + "tokens_seen": 3129169920 + }, + { + "epoch": 10.05, + "learning_rate": 2.6298896690070213e-05, + "loss": 2.3055, + "theoretical_loss": 3.310058599205468, + "tokens_seen": 3129235456 + }, + { + "epoch": 10.05, + "learning_rate": 2.62888665997994e-05, + "loss": 2.4433, + "theoretical_loss": 3.310053314802918, + "tokens_seen": 3129300992 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3434865, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.286196708679199, + "objective/train/theoretical_loss": 3.3100506726547634, + "objective/train/tokens_used": 3149793760, + "theoretical_loss": 3.3100506726547634, + "tokens_seen": 3129333760 + }, + { + "epoch": 10.05, + "learning_rate": 2.6278836509528585e-05, + "loss": 2.3768, + "theoretical_loss": 3.310048030542023, + "tokens_seen": 3129366528 + }, + { + "epoch": 10.05, + "learning_rate": 2.6268806419257773e-05, + "loss": 2.46, + "theoretical_loss": 3.310042746422776, + "tokens_seen": 3129432064 + }, + { + "epoch": 10.05, + "learning_rate": 2.625877632898696e-05, + "loss": 2.4381, + "theoretical_loss": 3.310037462445171, + "tokens_seen": 3129497600 + }, + { + "epoch": 10.05, + "learning_rate": 2.624874623871615e-05, + "loss": 2.2453, + "theoretical_loss": 3.310032178609201, + "tokens_seen": 3129563136 + }, + { + "epoch": 10.05, + "learning_rate": 2.6238716148445337e-05, + "loss": 2.3317, + "theoretical_loss": 3.3100268949148592, + "tokens_seen": 3129628672 + }, + { + "epoch": 10.05, + "learning_rate": 2.6228686058174525e-05, + "loss": 2.3097, + "theoretical_loss": 3.310021611362139, + "tokens_seen": 3129694208 + }, + { + "epoch": 10.05, + "learning_rate": 2.6218655967903712e-05, + "loss": 2.3551, + "theoretical_loss": 3.3100163279510335, + "tokens_seen": 3129759744 + }, + { + "epoch": 10.05, + "learning_rate": 2.62086258776329e-05, + "loss": 2.3359, + "theoretical_loss": 3.3100110446815356, + "tokens_seen": 3129825280 + }, + { + "epoch": 10.05, + "learning_rate": 2.6198595787362088e-05, + "loss": 2.3353, + "theoretical_loss": 3.310005761553639, + "tokens_seen": 3129890816 + }, + { + "epoch": 10.05, + "learning_rate": 2.6188565697091276e-05, + "loss": 2.4932, + "theoretical_loss": 3.3100004785673365, + "tokens_seen": 3129956352 + }, + { + "epoch": 10.05, + "learning_rate": 2.617853560682046e-05, + "loss": 2.4383, + "theoretical_loss": 3.309995195722622, + "tokens_seen": 3130021888 + }, + { + "epoch": 10.05, + "learning_rate": 2.616850551654965e-05, + "loss": 2.5794, + "theoretical_loss": 3.3099899130194883, + "tokens_seen": 3130087424 + }, + { + "epoch": 10.05, + "learning_rate": 2.6158475426278836e-05, + "loss": 2.5634, + "theoretical_loss": 3.3099846304579286, + "tokens_seen": 3130152960 + }, + { + "epoch": 10.05, + "learning_rate": 2.6148445336008024e-05, + "loss": 2.6181, + "theoretical_loss": 3.3099793480379365, + "tokens_seen": 3130218496 + }, + { + "epoch": 10.05, + "learning_rate": 2.6138415245737212e-05, + "loss": 2.3322, + "theoretical_loss": 3.3099740657595045, + "tokens_seen": 3130284032 + }, + { + "epoch": 10.05, + "learning_rate": 2.61283851554664e-05, + "loss": 2.4545, + "theoretical_loss": 3.309968783622627, + "tokens_seen": 3130349568 + }, + { + "epoch": 10.05, + "learning_rate": 2.6118355065195588e-05, + "loss": 2.3718, + "theoretical_loss": 3.309963501627296, + "tokens_seen": 3130415104 + }, + { + "epoch": 10.05, + "learning_rate": 2.6108324974924776e-05, + "loss": 2.5823, + "theoretical_loss": 3.309958219773505, + "tokens_seen": 3130480640 + }, + { + "epoch": 10.05, + "learning_rate": 2.6098294884653964e-05, + "loss": 2.3288, + "theoretical_loss": 3.3099529380612482, + "tokens_seen": 3130546176 + }, + { + "epoch": 10.05, + "learning_rate": 2.6088264794383148e-05, + "loss": 2.3988, + "theoretical_loss": 3.309947656490518, + "tokens_seen": 3130611712 + }, + { + "epoch": 10.05, + "learning_rate": 2.6078234704112336e-05, + "loss": 2.5658, + "theoretical_loss": 3.3099423750613077, + "tokens_seen": 3130677248 + }, + { + "epoch": 10.05, + "learning_rate": 2.6068204613841524e-05, + "loss": 2.4114, + "theoretical_loss": 3.3099370937736112, + "tokens_seen": 3130742784 + }, + { + "epoch": 10.05, + "learning_rate": 2.6058174523570712e-05, + "loss": 2.3993, + "theoretical_loss": 3.309931812627421, + "tokens_seen": 3130808320 + }, + { + "epoch": 10.05, + "learning_rate": 2.60481444332999e-05, + "loss": 2.3995, + "theoretical_loss": 3.3099265316227306, + "tokens_seen": 3130873856 + }, + { + "epoch": 10.05, + "learning_rate": 2.6038114343029088e-05, + "loss": 2.5648, + "theoretical_loss": 3.309921250759533, + "tokens_seen": 3130939392 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3436364, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6327931880950928, + "objective/train/theoretical_loss": 3.309918610380992, + "objective/train/tokens_used": 3151432160, + "theoretical_loss": 3.309918610380992, + "tokens_seen": 3130972160 + }, + { + "epoch": 10.05, + "learning_rate": 2.6028084252758276e-05, + "loss": 2.5708, + "theoretical_loss": 3.309915970037822, + "tokens_seen": 3131004928 + }, + { + "epoch": 10.05, + "learning_rate": 2.6018054162487464e-05, + "loss": 2.3145, + "theoretical_loss": 3.30991068945759, + "tokens_seen": 3131070464 + }, + { + "epoch": 10.05, + "learning_rate": 2.600802407221665e-05, + "loss": 2.6231, + "theoretical_loss": 3.3099054090188313, + "tokens_seen": 3131136000 + }, + { + "epoch": 10.05, + "learning_rate": 2.599799398194584e-05, + "loss": 2.4315, + "theoretical_loss": 3.3099001287215386, + "tokens_seen": 3131201536 + }, + { + "epoch": 10.05, + "learning_rate": 2.5987963891675024e-05, + "loss": 2.4644, + "theoretical_loss": 3.3098948485657047, + "tokens_seen": 3131267072 + }, + { + "epoch": 10.05, + "learning_rate": 2.5977933801404212e-05, + "loss": 2.2966, + "theoretical_loss": 3.309889568551324, + "tokens_seen": 3131332608 + }, + { + "epoch": 10.05, + "learning_rate": 2.59679037111334e-05, + "loss": 2.4501, + "theoretical_loss": 3.309884288678389, + "tokens_seen": 3131398144 + }, + { + "epoch": 10.05, + "learning_rate": 2.5957873620862588e-05, + "loss": 2.3612, + "theoretical_loss": 3.3098790089468926, + "tokens_seen": 3131463680 + }, + { + "epoch": 10.05, + "learning_rate": 2.5947843530591775e-05, + "loss": 2.3265, + "theoretical_loss": 3.3098737293568288, + "tokens_seen": 3131529216 + }, + { + "epoch": 10.05, + "learning_rate": 2.5937813440320963e-05, + "loss": 2.4658, + "theoretical_loss": 3.30986844990819, + "tokens_seen": 3131594752 + }, + { + "epoch": 10.05, + "learning_rate": 2.592778335005015e-05, + "loss": 2.5996, + "theoretical_loss": 3.3098631706009707, + "tokens_seen": 3131660288 + }, + { + "epoch": 10.05, + "learning_rate": 2.591775325977934e-05, + "loss": 2.4658, + "theoretical_loss": 3.309857891435163, + "tokens_seen": 3131725824 + }, + { + "epoch": 10.05, + "learning_rate": 2.5907723169508527e-05, + "loss": 2.4086, + "theoretical_loss": 3.3098526124107606, + "tokens_seen": 3131791360 + }, + { + "epoch": 10.05, + "learning_rate": 2.5897693079237715e-05, + "loss": 2.6411, + "theoretical_loss": 3.309847333527757, + "tokens_seen": 3131856896 + }, + { + "epoch": 10.05, + "learning_rate": 2.58876629889669e-05, + "loss": 2.6504, + "theoretical_loss": 3.309842054786145, + "tokens_seen": 3131922432 + }, + { + "epoch": 10.05, + "learning_rate": 2.5877632898696087e-05, + "loss": 2.3651, + "theoretical_loss": 3.309836776185918, + "tokens_seen": 3131987968 + }, + { + "epoch": 10.05, + "learning_rate": 2.5867602808425275e-05, + "loss": 2.2845, + "theoretical_loss": 3.3098314977270697, + "tokens_seen": 3132053504 + }, + { + "epoch": 10.05, + "learning_rate": 2.5857572718154463e-05, + "loss": 2.4088, + "theoretical_loss": 3.3098262194095924, + "tokens_seen": 3132119040 + }, + { + "epoch": 10.05, + "learning_rate": 2.5847542627883654e-05, + "loss": 2.0381, + "theoretical_loss": 3.3098209412334803, + "tokens_seen": 3132184576 + }, + { + "epoch": 10.05, + "learning_rate": 2.583751253761284e-05, + "loss": 2.6207, + "theoretical_loss": 3.309815663198726, + "tokens_seen": 3132250112 + }, + { + "epoch": 10.05, + "learning_rate": 2.5827482447342027e-05, + "loss": 2.4424, + "theoretical_loss": 3.3098103853053233, + "tokens_seen": 3132315648 + }, + { + "epoch": 10.05, + "learning_rate": 2.5817452357071215e-05, + "loss": 2.4159, + "theoretical_loss": 3.3098051075532653, + "tokens_seen": 3132381184 + }, + { + "epoch": 10.05, + "learning_rate": 2.5807422266800403e-05, + "loss": 2.3702, + "theoretical_loss": 3.309799829942545, + "tokens_seen": 3132446720 + }, + { + "epoch": 10.05, + "learning_rate": 2.579739217652959e-05, + "loss": 2.5332, + "theoretical_loss": 3.3097945524731553, + "tokens_seen": 3132512256 + }, + { + "epoch": 10.05, + "learning_rate": 2.5787362086258775e-05, + "loss": 2.4237, + "theoretical_loss": 3.3097892751450906, + "tokens_seen": 3132577792 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3437059, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.249847173690796, + "objective/train/theoretical_loss": 3.3097866365340525, + "objective/train/tokens_used": 3153070560, + "theoretical_loss": 3.3097866365340525, + "tokens_seen": 3132610560 + }, + { + "epoch": 10.05, + "learning_rate": 2.5777331995987963e-05, + "loss": 2.4146, + "theoretical_loss": 3.309783997958343, + "tokens_seen": 3132643328 + }, + { + "epoch": 10.05, + "learning_rate": 2.576730190571715e-05, + "loss": 2.4998, + "theoretical_loss": 3.309778720912907, + "tokens_seen": 3132708864 + }, + { + "epoch": 10.05, + "learning_rate": 2.5757271815446342e-05, + "loss": 2.5227, + "theoretical_loss": 3.3097734440087745, + "tokens_seen": 3132774400 + }, + { + "epoch": 10.05, + "learning_rate": 2.574724172517553e-05, + "loss": 2.273, + "theoretical_loss": 3.30976816724594, + "tokens_seen": 3132839936 + }, + { + "epoch": 10.05, + "learning_rate": 2.5737211634904715e-05, + "loss": 2.5662, + "theoretical_loss": 3.3097628906243957, + "tokens_seen": 3132905472 + }, + { + "epoch": 10.05, + "learning_rate": 2.5727181544633902e-05, + "loss": 2.3449, + "theoretical_loss": 3.309757614144135, + "tokens_seen": 3132971008 + }, + { + "epoch": 10.05, + "learning_rate": 2.571715145436309e-05, + "loss": 2.4232, + "theoretical_loss": 3.309752337805152, + "tokens_seen": 3133036544 + }, + { + "epoch": 10.05, + "learning_rate": 2.5707121364092278e-05, + "loss": 2.598, + "theoretical_loss": 3.3097470616074394, + "tokens_seen": 3133102080 + }, + { + "epoch": 10.05, + "learning_rate": 2.5697091273821463e-05, + "loss": 2.343, + "theoretical_loss": 3.3097417855509903, + "tokens_seen": 3133167616 + }, + { + "epoch": 10.05, + "learning_rate": 2.568706118355065e-05, + "loss": 2.392, + "theoretical_loss": 3.3097365096357985, + "tokens_seen": 3133233152 + }, + { + "epoch": 10.05, + "learning_rate": 2.567703109327984e-05, + "loss": 2.3217, + "theoretical_loss": 3.3097312338618567, + "tokens_seen": 3133298688 + }, + { + "epoch": 10.05, + "learning_rate": 2.5667001003009026e-05, + "loss": 2.2881, + "theoretical_loss": 3.3097259582291585, + "tokens_seen": 3133364224 + }, + { + "epoch": 10.05, + "learning_rate": 2.5656970912738218e-05, + "loss": 2.3303, + "theoretical_loss": 3.309720682737697, + "tokens_seen": 3133429760 + }, + { + "epoch": 10.05, + "learning_rate": 2.5646940822467402e-05, + "loss": 2.6219, + "theoretical_loss": 3.3097154073874653, + "tokens_seen": 3133495296 + }, + { + "epoch": 10.05, + "learning_rate": 2.563691073219659e-05, + "loss": 2.4648, + "theoretical_loss": 3.3097101321784574, + "tokens_seen": 3133560832 + }, + { + "epoch": 10.05, + "learning_rate": 2.5626880641925778e-05, + "loss": 2.5344, + "theoretical_loss": 3.3097048571106655, + "tokens_seen": 3133626368 + }, + { + "epoch": 10.05, + "learning_rate": 2.5616850551654966e-05, + "loss": 2.5054, + "theoretical_loss": 3.3096995821840838, + "tokens_seen": 3133691904 + }, + { + "epoch": 10.05, + "learning_rate": 2.5606820461384154e-05, + "loss": 2.4947, + "theoretical_loss": 3.309694307398705, + "tokens_seen": 3133757440 + }, + { + "epoch": 10.05, + "learning_rate": 2.5596790371113338e-05, + "loss": 2.4509, + "theoretical_loss": 3.3096890327545228, + "tokens_seen": 3133822976 + }, + { + "epoch": 10.05, + "learning_rate": 2.5586760280842526e-05, + "loss": 2.4371, + "theoretical_loss": 3.30968375825153, + "tokens_seen": 3133888512 + }, + { + "epoch": 10.05, + "learning_rate": 2.5576730190571714e-05, + "loss": 2.3577, + "theoretical_loss": 3.30967848388972, + "tokens_seen": 3133954048 + }, + { + "epoch": 10.05, + "learning_rate": 2.5566700100300905e-05, + "loss": 2.5827, + "theoretical_loss": 3.3096732096690866, + "tokens_seen": 3134019584 + }, + { + "epoch": 10.05, + "learning_rate": 2.5556670010030093e-05, + "loss": 2.5003, + "theoretical_loss": 3.3096679355896224, + "tokens_seen": 3134085120 + }, + { + "epoch": 10.05, + "learning_rate": 2.5546639919759278e-05, + "loss": 2.4378, + "theoretical_loss": 3.3096626616513207, + "tokens_seen": 3134150656 + }, + { + "epoch": 10.05, + "learning_rate": 2.5536609829488466e-05, + "loss": 2.4473, + "theoretical_loss": 3.3096573878541755, + "tokens_seen": 3134216192 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3438299, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.585047483444214, + "objective/train/theoretical_loss": 3.309654751008534, + "objective/train/tokens_used": 3154708960, + "theoretical_loss": 3.309654751008534, + "tokens_seen": 3134248960 + }, + { + "epoch": 10.05, + "learning_rate": 2.5526579739217654e-05, + "loss": 2.6247, + "theoretical_loss": 3.309652114198179, + "tokens_seen": 3134281728 + }, + { + "epoch": 10.05, + "learning_rate": 2.551654964894684e-05, + "loss": 2.4515, + "theoretical_loss": 3.3096468406833255, + "tokens_seen": 3134347264 + }, + { + "epoch": 10.05, + "learning_rate": 2.550651955867603e-05, + "loss": 2.3701, + "theoretical_loss": 3.3096415673096073, + "tokens_seen": 3134412800 + }, + { + "epoch": 10.05, + "learning_rate": 2.5496489468405214e-05, + "loss": 2.3576, + "theoretical_loss": 3.3096362940770185, + "tokens_seen": 3134478336 + }, + { + "epoch": 10.05, + "learning_rate": 2.5486459378134402e-05, + "loss": 2.623, + "theoretical_loss": 3.309631020985552, + "tokens_seen": 3134543872 + }, + { + "epoch": 10.05, + "learning_rate": 2.5476429287863593e-05, + "loss": 2.3276, + "theoretical_loss": 3.309625748035201, + "tokens_seen": 3134609408 + }, + { + "epoch": 10.05, + "learning_rate": 2.546639919759278e-05, + "loss": 2.6818, + "theoretical_loss": 3.309620475225959, + "tokens_seen": 3134674944 + }, + { + "epoch": 10.05, + "learning_rate": 2.545636910732197e-05, + "loss": 2.4186, + "theoretical_loss": 3.309615202557819, + "tokens_seen": 3134740480 + }, + { + "epoch": 10.05, + "learning_rate": 2.5446339017051153e-05, + "loss": 2.3313, + "theoretical_loss": 3.309609930030774, + "tokens_seen": 3134806016 + }, + { + "epoch": 10.05, + "learning_rate": 2.543630892678034e-05, + "loss": 2.4842, + "theoretical_loss": 3.3096046576448184, + "tokens_seen": 3134871552 + }, + { + "epoch": 10.05, + "learning_rate": 2.542627883650953e-05, + "loss": 2.3713, + "theoretical_loss": 3.3095993853999444, + "tokens_seen": 3134937088 + }, + { + "epoch": 10.05, + "learning_rate": 2.5416248746238717e-05, + "loss": 2.6067, + "theoretical_loss": 3.309594113296146, + "tokens_seen": 3135002624 + }, + { + "epoch": 10.05, + "learning_rate": 2.5406218655967905e-05, + "loss": 2.4438, + "theoretical_loss": 3.309588841333416, + "tokens_seen": 3135068160 + }, + { + "epoch": 10.05, + "learning_rate": 2.539618856569709e-05, + "loss": 2.6272, + "theoretical_loss": 3.309583569511748, + "tokens_seen": 3135133696 + }, + { + "epoch": 10.05, + "learning_rate": 2.5386158475426277e-05, + "loss": 2.4278, + "theoretical_loss": 3.3095782978311346, + "tokens_seen": 3135199232 + }, + { + "epoch": 10.05, + "learning_rate": 2.537612838515547e-05, + "loss": 2.4693, + "theoretical_loss": 3.3095730262915697, + "tokens_seen": 3135264768 + }, + { + "epoch": 10.05, + "learning_rate": 2.5366098294884657e-05, + "loss": 2.4855, + "theoretical_loss": 3.309567754893046, + "tokens_seen": 3135330304 + }, + { + "epoch": 10.05, + "learning_rate": 2.535606820461384e-05, + "loss": 2.4051, + "theoretical_loss": 3.309562483635558, + "tokens_seen": 3135395840 + }, + { + "epoch": 10.05, + "learning_rate": 2.534603811434303e-05, + "loss": 2.3436, + "theoretical_loss": 3.309557212519098, + "tokens_seen": 3135461376 + }, + { + "epoch": 10.05, + "learning_rate": 2.5336008024072217e-05, + "loss": 2.5707, + "theoretical_loss": 3.3095519415436594, + "tokens_seen": 3135526912 + }, + { + "epoch": 10.05, + "learning_rate": 2.5325977933801405e-05, + "loss": 2.3162, + "theoretical_loss": 3.309546670709236, + "tokens_seen": 3135592448 + }, + { + "epoch": 10.05, + "learning_rate": 2.5315947843530593e-05, + "loss": 2.6372, + "theoretical_loss": 3.30954140001582, + "tokens_seen": 3135657984 + }, + { + "epoch": 10.05, + "learning_rate": 2.5305917753259777e-05, + "loss": 2.4992, + "theoretical_loss": 3.3095361294634054, + "tokens_seen": 3135723520 + }, + { + "epoch": 10.05, + "learning_rate": 2.5295887662988965e-05, + "loss": 2.451, + "theoretical_loss": 3.3095308590519856, + "tokens_seen": 3135789056 + }, + { + "epoch": 10.05, + "learning_rate": 2.5285857572718156e-05, + "loss": 2.6285, + "theoretical_loss": 3.309525588781553, + "tokens_seen": 3135854592 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3438967, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5031416416168213, + "objective/train/theoretical_loss": 3.3095229536992057, + "objective/train/tokens_used": 3156347360, + "theoretical_loss": 3.3095229536992057, + "tokens_seen": 3135887360 + }, + { + "epoch": 10.05, + "learning_rate": 2.5275827482447344e-05, + "loss": 2.4518, + "theoretical_loss": 3.3095203186521025, + "tokens_seen": 3135920128 + }, + { + "epoch": 10.05, + "learning_rate": 2.5265797392176532e-05, + "loss": 2.4132, + "theoretical_loss": 3.3095150486636262, + "tokens_seen": 3135985664 + }, + { + "epoch": 10.05, + "learning_rate": 2.5255767301905717e-05, + "loss": 2.5413, + "theoretical_loss": 3.309509778816117, + "tokens_seen": 3136051200 + }, + { + "epoch": 10.05, + "learning_rate": 2.5245737211634905e-05, + "loss": 2.6668, + "theoretical_loss": 3.3095045091095696, + "tokens_seen": 3136116736 + }, + { + "epoch": 10.05, + "learning_rate": 2.5235707121364092e-05, + "loss": 2.4697, + "theoretical_loss": 3.3094992395439764, + "tokens_seen": 3136182272 + }, + { + "epoch": 10.05, + "learning_rate": 2.522567703109328e-05, + "loss": 2.5568, + "theoretical_loss": 3.3094939701193304, + "tokens_seen": 3136247808 + }, + { + "epoch": 10.05, + "learning_rate": 2.5215646940822468e-05, + "loss": 2.425, + "theoretical_loss": 3.309488700835625, + "tokens_seen": 3136313344 + }, + { + "epoch": 10.05, + "learning_rate": 2.5205616850551653e-05, + "loss": 2.376, + "theoretical_loss": 3.3094834316928545, + "tokens_seen": 3136378880 + }, + { + "epoch": 10.05, + "learning_rate": 2.5195586760280844e-05, + "loss": 2.3393, + "theoretical_loss": 3.3094781626910112, + "tokens_seen": 3136444416 + }, + { + "epoch": 10.05, + "learning_rate": 2.5185556670010032e-05, + "loss": 2.3197, + "theoretical_loss": 3.3094728938300886, + "tokens_seen": 3136509952 + }, + { + "epoch": 10.05, + "learning_rate": 2.517552657973922e-05, + "loss": 2.4801, + "theoretical_loss": 3.30946762511008, + "tokens_seen": 3136575488 + }, + { + "epoch": 10.05, + "learning_rate": 2.5165496489468408e-05, + "loss": 2.7165, + "theoretical_loss": 3.3094623565309784, + "tokens_seen": 3136641024 + }, + { + "epoch": 10.05, + "learning_rate": 2.5155466399197592e-05, + "loss": 2.6784, + "theoretical_loss": 3.3094570880927776, + "tokens_seen": 3136706560 + }, + { + "epoch": 10.05, + "learning_rate": 2.514543630892678e-05, + "loss": 2.6016, + "theoretical_loss": 3.3094518197954708, + "tokens_seen": 3136772096 + }, + { + "epoch": 10.05, + "learning_rate": 2.5135406218655968e-05, + "loss": 2.4615, + "theoretical_loss": 3.3094465516390508, + "tokens_seen": 3136837632 + }, + { + "epoch": 10.05, + "learning_rate": 2.5125376128385156e-05, + "loss": 2.4107, + "theoretical_loss": 3.3094412836235114, + "tokens_seen": 3136903168 + }, + { + "epoch": 10.05, + "learning_rate": 2.5115346038114344e-05, + "loss": 2.4594, + "theoretical_loss": 3.309436015748846, + "tokens_seen": 3136968704 + }, + { + "epoch": 10.05, + "learning_rate": 2.5105315947843528e-05, + "loss": 2.5742, + "theoretical_loss": 3.3094307480150476, + "tokens_seen": 3137034240 + }, + { + "epoch": 10.05, + "learning_rate": 2.509528585757272e-05, + "loss": 2.463, + "theoretical_loss": 3.3094254804221093, + "tokens_seen": 3137099776 + }, + { + "epoch": 10.05, + "learning_rate": 2.5085255767301907e-05, + "loss": 2.4071, + "theoretical_loss": 3.3094202129700245, + "tokens_seen": 3137165312 + }, + { + "epoch": 10.05, + "learning_rate": 2.5075225677031095e-05, + "loss": 2.6545, + "theoretical_loss": 3.3094149456587867, + "tokens_seen": 3137230848 + }, + { + "epoch": 10.05, + "learning_rate": 2.5065195586760283e-05, + "loss": 2.6905, + "theoretical_loss": 3.309409678488389, + "tokens_seen": 3137296384 + }, + { + "epoch": 10.05, + "learning_rate": 2.5055165496489468e-05, + "loss": 2.4521, + "theoretical_loss": 3.309404411458825, + "tokens_seen": 3137361920 + }, + { + "epoch": 10.05, + "learning_rate": 2.5045135406218656e-05, + "loss": 2.423, + "theoretical_loss": 3.309399144570088, + "tokens_seen": 3137427456 + }, + { + "epoch": 10.05, + "learning_rate": 2.5035105315947844e-05, + "loss": 2.4891, + "theoretical_loss": 3.3093938778221705, + "tokens_seen": 3137492992 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3440468, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.401765823364258, + "objective/train/theoretical_loss": 3.3093912445010174, + "objective/train/tokens_used": 3157985760, + "theoretical_loss": 3.3093912445010174, + "tokens_seen": 3137525760 + }, + { + "epoch": 10.05, + "learning_rate": 2.502507522567703e-05, + "loss": 2.4705, + "theoretical_loss": 3.3093886112150663, + "tokens_seen": 3137558528 + }, + { + "epoch": 10.05, + "learning_rate": 2.5015045135406216e-05, + "loss": 2.46, + "theoretical_loss": 3.3093833447487695, + "tokens_seen": 3137624064 + }, + { + "epoch": 10.05, + "learning_rate": 2.5005015045135407e-05, + "loss": 2.623, + "theoretical_loss": 3.309378078423272, + "tokens_seen": 3137689600 + }, + { + "epoch": 10.05, + "learning_rate": 2.4994984954864595e-05, + "loss": 2.6275, + "theoretical_loss": 3.309372812238568, + "tokens_seen": 3137755136 + }, + { + "epoch": 10.05, + "learning_rate": 2.4984954864593783e-05, + "loss": 2.6217, + "theoretical_loss": 3.3093675461946503, + "tokens_seen": 3137820672 + }, + { + "epoch": 10.05, + "learning_rate": 2.497492477432297e-05, + "loss": 2.4212, + "theoretical_loss": 3.309362280291513, + "tokens_seen": 3137886208 + }, + { + "epoch": 10.05, + "learning_rate": 2.4964894684052155e-05, + "loss": 2.4015, + "theoretical_loss": 3.3093570145291484, + "tokens_seen": 3137951744 + }, + { + "epoch": 10.05, + "learning_rate": 2.4954864593781343e-05, + "loss": 2.4635, + "theoretical_loss": 3.30935174890755, + "tokens_seen": 3138017280 + }, + { + "epoch": 10.05, + "learning_rate": 2.494483450351053e-05, + "loss": 2.3696, + "theoretical_loss": 3.3093464834267117, + "tokens_seen": 3138082816 + }, + { + "epoch": 10.05, + "learning_rate": 2.493480441323972e-05, + "loss": 2.3077, + "theoretical_loss": 3.3093412180866264, + "tokens_seen": 3138148352 + }, + { + "epoch": 10.05, + "learning_rate": 2.4924774322968907e-05, + "loss": 2.5315, + "theoretical_loss": 3.309335952887287, + "tokens_seen": 3138213888 + }, + { + "epoch": 10.05, + "learning_rate": 2.4914744232698095e-05, + "loss": 2.5276, + "theoretical_loss": 3.309330687828688, + "tokens_seen": 3138279424 + }, + { + "epoch": 10.05, + "learning_rate": 2.4904714142427283e-05, + "loss": 2.5382, + "theoretical_loss": 3.3093254229108213, + "tokens_seen": 3138344960 + }, + { + "epoch": 10.05, + "learning_rate": 2.489468405215647e-05, + "loss": 2.5868, + "theoretical_loss": 3.3093201581336813, + "tokens_seen": 3138410496 + }, + { + "epoch": 10.05, + "learning_rate": 2.488465396188566e-05, + "loss": 2.5214, + "theoretical_loss": 3.3093148934972607, + "tokens_seen": 3138476032 + }, + { + "epoch": 10.05, + "learning_rate": 2.4874623871614847e-05, + "loss": 2.4361, + "theoretical_loss": 3.3093096290015525, + "tokens_seen": 3138541568 + }, + { + "epoch": 10.05, + "learning_rate": 2.486459378134403e-05, + "loss": 2.521, + "theoretical_loss": 3.3093043646465508, + "tokens_seen": 3138607104 + }, + { + "epoch": 10.05, + "learning_rate": 2.485456369107322e-05, + "loss": 2.5923, + "theoretical_loss": 3.3092991004322485, + "tokens_seen": 3138672640 + }, + { + "epoch": 10.05, + "learning_rate": 2.4844533600802407e-05, + "loss": 2.4375, + "theoretical_loss": 3.3092938363586386, + "tokens_seen": 3138738176 + }, + { + "epoch": 10.05, + "learning_rate": 2.4834503510531595e-05, + "loss": 2.3467, + "theoretical_loss": 3.309288572425715, + "tokens_seen": 3138803712 + }, + { + "epoch": 10.05, + "learning_rate": 2.4824473420260783e-05, + "loss": 2.3056, + "theoretical_loss": 3.3092833086334705, + "tokens_seen": 3138869248 + }, + { + "epoch": 10.05, + "learning_rate": 2.481444332998997e-05, + "loss": 2.4062, + "theoretical_loss": 3.309278044981899, + "tokens_seen": 3138934784 + }, + { + "epoch": 10.05, + "learning_rate": 2.480441323971916e-05, + "loss": 2.4946, + "theoretical_loss": 3.3092727814709932, + "tokens_seen": 3139000320 + }, + { + "epoch": 10.05, + "learning_rate": 2.4794383149448346e-05, + "loss": 2.3945, + "theoretical_loss": 3.3092675181007465, + "tokens_seen": 3139065856 + }, + { + "epoch": 10.05, + "learning_rate": 2.4784353059177534e-05, + "loss": 2.5272, + "theoretical_loss": 3.3092622548711526, + "tokens_seen": 3139131392 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3441149, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.666487455368042, + "objective/train/theoretical_loss": 3.3092596233090985, + "objective/train/tokens_used": 3159624160, + "theoretical_loss": 3.3092596233090985, + "tokens_seen": 3139164160 + }, + { + "epoch": 10.05, + "learning_rate": 2.4774322968906722e-05, + "loss": 2.5758, + "theoretical_loss": 3.3092569917822043, + "tokens_seen": 3139196928 + }, + { + "epoch": 10.05, + "learning_rate": 2.4764292878635907e-05, + "loss": 2.4079, + "theoretical_loss": 3.3092517288338956, + "tokens_seen": 3139262464 + }, + { + "epoch": 10.05, + "learning_rate": 2.4754262788365095e-05, + "loss": 2.4059, + "theoretical_loss": 3.3092464660262193, + "tokens_seen": 3139328000 + }, + { + "epoch": 10.05, + "learning_rate": 2.4744232698094282e-05, + "loss": 2.3818, + "theoretical_loss": 3.3092412033591683, + "tokens_seen": 3139393536 + }, + { + "epoch": 10.05, + "learning_rate": 2.473420260782347e-05, + "loss": 2.3051, + "theoretical_loss": 3.3092359408327368, + "tokens_seen": 3139459072 + }, + { + "epoch": 10.05, + "learning_rate": 2.472417251755266e-05, + "loss": 2.5324, + "theoretical_loss": 3.3092306784469177, + "tokens_seen": 3139524608 + }, + { + "epoch": 10.05, + "learning_rate": 2.4714142427281846e-05, + "loss": 2.3645, + "theoretical_loss": 3.309225416201704, + "tokens_seen": 3139590144 + }, + { + "epoch": 10.05, + "learning_rate": 2.4704112337011034e-05, + "loss": 2.5066, + "theoretical_loss": 3.309220154097089, + "tokens_seen": 3139655680 + }, + { + "epoch": 10.05, + "learning_rate": 2.4694082246740222e-05, + "loss": 2.422, + "theoretical_loss": 3.309214892133067, + "tokens_seen": 3139721216 + }, + { + "epoch": 10.05, + "learning_rate": 2.468405215646941e-05, + "loss": 2.2512, + "theoretical_loss": 3.3092096303096303, + "tokens_seen": 3139786752 + }, + { + "epoch": 10.05, + "learning_rate": 2.4674022066198598e-05, + "loss": 2.3361, + "theoretical_loss": 3.3092043686267725, + "tokens_seen": 3139852288 + }, + { + "epoch": 10.05, + "learning_rate": 2.4663991975927782e-05, + "loss": 2.4289, + "theoretical_loss": 3.309199107084487, + "tokens_seen": 3139917824 + }, + { + "epoch": 10.05, + "learning_rate": 2.465396188565697e-05, + "loss": 2.4657, + "theoretical_loss": 3.309193845682767, + "tokens_seen": 3139983360 + }, + { + "epoch": 10.05, + "learning_rate": 2.4643931795386158e-05, + "loss": 2.4874, + "theoretical_loss": 3.309188584421606, + "tokens_seen": 3140048896 + }, + { + "epoch": 10.05, + "learning_rate": 2.463390170511535e-05, + "loss": 2.6626, + "theoretical_loss": 3.3091833233009966, + "tokens_seen": 3140114432 + }, + { + "epoch": 10.05, + "learning_rate": 2.4623871614844537e-05, + "loss": 2.4418, + "theoretical_loss": 3.3091780623209335, + "tokens_seen": 3140179968 + }, + { + "epoch": 10.05, + "learning_rate": 2.461384152457372e-05, + "loss": 2.2799, + "theoretical_loss": 3.3091728014814086, + "tokens_seen": 3140245504 + }, + { + "epoch": 10.05, + "learning_rate": 2.460381143430291e-05, + "loss": 2.4982, + "theoretical_loss": 3.309167540782416, + "tokens_seen": 3140311040 + }, + { + "epoch": 10.05, + "learning_rate": 2.4593781344032097e-05, + "loss": 2.2829, + "theoretical_loss": 3.309162280223949, + "tokens_seen": 3140376576 + }, + { + "epoch": 10.05, + "learning_rate": 2.4583751253761285e-05, + "loss": 2.6448, + "theoretical_loss": 3.3091570198060003, + "tokens_seen": 3140442112 + }, + { + "epoch": 10.05, + "learning_rate": 2.457372116349047e-05, + "loss": 2.4099, + "theoretical_loss": 3.3091517595285644, + "tokens_seen": 3140507648 + }, + { + "epoch": 10.05, + "learning_rate": 2.4563691073219658e-05, + "loss": 2.3331, + "theoretical_loss": 3.309146499391633, + "tokens_seen": 3140573184 + }, + { + "epoch": 10.05, + "learning_rate": 2.4553660982948846e-05, + "loss": 2.4074, + "theoretical_loss": 3.309141239395201, + "tokens_seen": 3140638720 + }, + { + "epoch": 10.05, + "learning_rate": 2.4543630892678034e-05, + "loss": 2.4874, + "theoretical_loss": 3.3091359795392608, + "tokens_seen": 3140704256 + }, + { + "epoch": 10.05, + "learning_rate": 2.4533600802407225e-05, + "loss": 2.6415, + "theoretical_loss": 3.3091307198238056, + "tokens_seen": 3140769792 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3442200, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5560142993927, + "objective/train/theoretical_loss": 3.3091280900187576, + "objective/train/tokens_used": 3161262560, + "theoretical_loss": 3.3091280900187576, + "tokens_seen": 3140802560 + }, + { + "epoch": 10.05, + "learning_rate": 2.452357071213641e-05, + "loss": 2.6087, + "theoretical_loss": 3.309125460248829, + "tokens_seen": 3140835328 + }, + { + "epoch": 10.05, + "learning_rate": 2.4513540621865597e-05, + "loss": 2.5001, + "theoretical_loss": 3.3091202008143243, + "tokens_seen": 3140900864 + }, + { + "epoch": 10.05, + "learning_rate": 2.4503510531594785e-05, + "loss": 2.324, + "theoretical_loss": 3.3091149415202854, + "tokens_seen": 3140966400 + }, + { + "epoch": 10.05, + "learning_rate": 2.4493480441323973e-05, + "loss": 2.2876, + "theoretical_loss": 3.3091096823667048, + "tokens_seen": 3141031936 + }, + { + "epoch": 10.05, + "learning_rate": 2.448345035105316e-05, + "loss": 2.4271, + "theoretical_loss": 3.309104423353576, + "tokens_seen": 3141097472 + }, + { + "epoch": 10.05, + "learning_rate": 2.4473420260782345e-05, + "loss": 2.2126, + "theoretical_loss": 3.3090991644808923, + "tokens_seen": 3141163008 + }, + { + "epoch": 10.05, + "learning_rate": 2.4463390170511533e-05, + "loss": 2.6007, + "theoretical_loss": 3.3090939057486475, + "tokens_seen": 3141228544 + }, + { + "epoch": 10.05, + "learning_rate": 2.445336008024072e-05, + "loss": 2.6254, + "theoretical_loss": 3.3090886471568344, + "tokens_seen": 3141294080 + }, + { + "epoch": 10.05, + "learning_rate": 2.4443329989969913e-05, + "loss": 2.4826, + "theoretical_loss": 3.3090833887054463, + "tokens_seen": 3141359616 + }, + { + "epoch": 10.05, + "learning_rate": 2.44332998996991e-05, + "loss": 2.3788, + "theoretical_loss": 3.309078130394477, + "tokens_seen": 3141425152 + }, + { + "epoch": 10.05, + "learning_rate": 2.4423269809428285e-05, + "loss": 2.4144, + "theoretical_loss": 3.3090728722239193, + "tokens_seen": 3141490688 + }, + { + "epoch": 10.05, + "learning_rate": 2.4413239719157473e-05, + "loss": 2.4339, + "theoretical_loss": 3.3090676141937667, + "tokens_seen": 3141556224 + }, + { + "epoch": 10.05, + "learning_rate": 2.440320962888666e-05, + "loss": 2.398, + "theoretical_loss": 3.3090623563040125, + "tokens_seen": 3141621760 + }, + { + "epoch": 10.05, + "learning_rate": 2.439317953861585e-05, + "loss": 2.6628, + "theoretical_loss": 3.3090570985546504, + "tokens_seen": 3141687296 + }, + { + "epoch": 10.05, + "learning_rate": 2.4383149448345037e-05, + "loss": 2.2534, + "theoretical_loss": 3.3090518409456733, + "tokens_seen": 3141752832 + }, + { + "epoch": 10.05, + "learning_rate": 2.437311935807422e-05, + "loss": 2.4849, + "theoretical_loss": 3.309046583477074, + "tokens_seen": 3141818368 + }, + { + "epoch": 10.05, + "learning_rate": 2.436308926780341e-05, + "loss": 2.4516, + "theoretical_loss": 3.309041326148847, + "tokens_seen": 3141883904 + }, + { + "epoch": 10.05, + "learning_rate": 2.4353059177532597e-05, + "loss": 2.4432, + "theoretical_loss": 3.3090360689609852, + "tokens_seen": 3141949440 + }, + { + "epoch": 10.05, + "learning_rate": 2.4343029087261788e-05, + "loss": 2.4134, + "theoretical_loss": 3.3090308119134817, + "tokens_seen": 3142014976 + }, + { + "epoch": 10.05, + "learning_rate": 2.4332998996990976e-05, + "loss": 2.4908, + "theoretical_loss": 3.30902555500633, + "tokens_seen": 3142080512 + }, + { + "epoch": 10.05, + "learning_rate": 2.432296890672016e-05, + "loss": 2.5409, + "theoretical_loss": 3.309020298239523, + "tokens_seen": 3142146048 + }, + { + "epoch": 10.05, + "learning_rate": 2.431293881644935e-05, + "loss": 2.4407, + "theoretical_loss": 3.309015041613055, + "tokens_seen": 3142211584 + }, + { + "epoch": 10.05, + "learning_rate": 2.4302908726178536e-05, + "loss": 2.5405, + "theoretical_loss": 3.3090097851269182, + "tokens_seen": 3142277120 + }, + { + "epoch": 10.05, + "learning_rate": 2.4292878635907724e-05, + "loss": 2.4295, + "theoretical_loss": 3.3090045287811063, + "tokens_seen": 3142342656 + }, + { + "epoch": 10.05, + "learning_rate": 2.4282848545636912e-05, + "loss": 2.6183, + "theoretical_loss": 3.308999272575613, + "tokens_seen": 3142408192 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3442945, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.567739725112915, + "objective/train/theoretical_loss": 3.3089966445254837, + "objective/train/tokens_used": 3162900960, + "theoretical_loss": 3.3089966445254837, + "tokens_seen": 3142440960 + }, + { + "epoch": 10.05, + "learning_rate": 2.4272818455366097e-05, + "loss": 2.6112, + "theoretical_loss": 3.3089940165104315, + "tokens_seen": 3142473728 + }, + { + "epoch": 10.05, + "learning_rate": 2.4262788365095284e-05, + "loss": 2.2815, + "theoretical_loss": 3.308988760585555, + "tokens_seen": 3142539264 + }, + { + "epoch": 10.05, + "learning_rate": 2.4252758274824476e-05, + "loss": 2.5329, + "theoretical_loss": 3.3089835048009766, + "tokens_seen": 3142604800 + }, + { + "epoch": 10.05, + "learning_rate": 2.4242728184553664e-05, + "loss": 2.6119, + "theoretical_loss": 3.30897824915669, + "tokens_seen": 3142670336 + }, + { + "epoch": 10.05, + "learning_rate": 2.4232698094282848e-05, + "loss": 2.4278, + "theoretical_loss": 3.3089729936526884, + "tokens_seen": 3142735872 + }, + { + "epoch": 10.05, + "learning_rate": 2.4222668004012036e-05, + "loss": 2.6125, + "theoretical_loss": 3.308967738288965, + "tokens_seen": 3142801408 + }, + { + "epoch": 10.05, + "learning_rate": 2.4212637913741224e-05, + "loss": 2.4916, + "theoretical_loss": 3.3089624830655135, + "tokens_seen": 3142866944 + }, + { + "epoch": 10.05, + "learning_rate": 2.4202607823470412e-05, + "loss": 2.4892, + "theoretical_loss": 3.308957227982327, + "tokens_seen": 3142932480 + }, + { + "epoch": 10.05, + "learning_rate": 2.41925777331996e-05, + "loss": 2.4678, + "theoretical_loss": 3.308951973039399, + "tokens_seen": 3142998016 + }, + { + "epoch": 10.05, + "learning_rate": 2.4182547642928784e-05, + "loss": 2.597, + "theoretical_loss": 3.3089467182367223, + "tokens_seen": 3143063552 + }, + { + "epoch": 10.05, + "learning_rate": 2.4172517552657972e-05, + "loss": 2.4296, + "theoretical_loss": 3.3089414635742904, + "tokens_seen": 3143129088 + }, + { + "epoch": 10.05, + "learning_rate": 2.4162487462387163e-05, + "loss": 2.3231, + "theoretical_loss": 3.3089362090520975, + "tokens_seen": 3143194624 + }, + { + "epoch": 10.05, + "learning_rate": 2.415245737211635e-05, + "loss": 2.4468, + "theoretical_loss": 3.3089309546701355, + "tokens_seen": 3143260160 + }, + { + "epoch": 10.05, + "learning_rate": 2.414242728184554e-05, + "loss": 2.2584, + "theoretical_loss": 3.308925700428399, + "tokens_seen": 3143325696 + }, + { + "epoch": 10.05, + "learning_rate": 2.4132397191574724e-05, + "loss": 2.4895, + "theoretical_loss": 3.308920446326881, + "tokens_seen": 3143391232 + }, + { + "epoch": 10.05, + "learning_rate": 2.412236710130391e-05, + "loss": 2.4678, + "theoretical_loss": 3.308915192365574, + "tokens_seen": 3143456768 + }, + { + "epoch": 10.05, + "learning_rate": 2.41123370110331e-05, + "loss": 2.3415, + "theoretical_loss": 3.3089099385444722, + "tokens_seen": 3143522304 + }, + { + "epoch": 10.05, + "learning_rate": 2.4102306920762287e-05, + "loss": 2.5072, + "theoretical_loss": 3.308904684863569, + "tokens_seen": 3143587840 + }, + { + "epoch": 10.05, + "learning_rate": 2.4092276830491475e-05, + "loss": 2.5189, + "theoretical_loss": 3.3088994313228572, + "tokens_seen": 3143653376 + }, + { + "epoch": 10.05, + "learning_rate": 2.408224674022066e-05, + "loss": 2.6268, + "theoretical_loss": 3.3088941779223306, + "tokens_seen": 3143718912 + }, + { + "epoch": 10.05, + "learning_rate": 2.4072216649949848e-05, + "loss": 2.3328, + "theoretical_loss": 3.308888924661982, + "tokens_seen": 3143784448 + }, + { + "epoch": 10.05, + "learning_rate": 2.406218655967904e-05, + "loss": 2.6218, + "theoretical_loss": 3.3088836715418055, + "tokens_seen": 3143849984 + }, + { + "epoch": 10.05, + "learning_rate": 2.4052156469408227e-05, + "loss": 2.4198, + "theoretical_loss": 3.308878418561794, + "tokens_seen": 3143915520 + }, + { + "epoch": 10.05, + "learning_rate": 2.4042126379137415e-05, + "loss": 2.6456, + "theoretical_loss": 3.308873165721941, + "tokens_seen": 3143981056 + }, + { + "epoch": 10.05, + "learning_rate": 2.40320962888666e-05, + "loss": 2.3811, + "theoretical_loss": 3.308867913022239, + "tokens_seen": 3144046592 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3444206, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.253753185272217, + "objective/train/theoretical_loss": 3.3088652867249433, + "objective/train/tokens_used": 3164539360, + "theoretical_loss": 3.3088652867249433, + "tokens_seen": 3144079360 + }, + { + "epoch": 10.05, + "learning_rate": 2.4022066198595787e-05, + "loss": 2.2282, + "theoretical_loss": 3.3088626604626823, + "tokens_seen": 3144112128 + }, + { + "epoch": 10.05, + "learning_rate": 2.4012036108324975e-05, + "loss": 2.5577, + "theoretical_loss": 3.3088574080432642, + "tokens_seen": 3144177664 + }, + { + "epoch": 10.05, + "learning_rate": 2.4002006018054163e-05, + "loss": 2.4512, + "theoretical_loss": 3.3088521557639776, + "tokens_seen": 3144243200 + }, + { + "epoch": 10.05, + "learning_rate": 2.399197592778335e-05, + "loss": 2.564, + "theoretical_loss": 3.3088469036248163, + "tokens_seen": 3144308736 + }, + { + "epoch": 10.05, + "learning_rate": 2.3981945837512535e-05, + "loss": 2.5533, + "theoretical_loss": 3.3088416516257735, + "tokens_seen": 3144374272 + }, + { + "epoch": 10.05, + "learning_rate": 2.3971915747241727e-05, + "loss": 2.5073, + "theoretical_loss": 3.308836399766842, + "tokens_seen": 3144439808 + }, + { + "epoch": 10.05, + "learning_rate": 2.3961885656970915e-05, + "loss": 2.654, + "theoretical_loss": 3.3088311480480157, + "tokens_seen": 3144505344 + }, + { + "epoch": 10.05, + "learning_rate": 2.3951855566700103e-05, + "loss": 2.4559, + "theoretical_loss": 3.308825896469288, + "tokens_seen": 3144570880 + }, + { + "epoch": 10.05, + "learning_rate": 2.394182547642929e-05, + "loss": 2.4626, + "theoretical_loss": 3.308820645030652, + "tokens_seen": 3144636416 + }, + { + "epoch": 10.05, + "learning_rate": 2.3931795386158475e-05, + "loss": 2.4696, + "theoretical_loss": 3.308815393732101, + "tokens_seen": 3144701952 + }, + { + "epoch": 10.05, + "learning_rate": 2.3921765295887663e-05, + "loss": 2.7049, + "theoretical_loss": 3.3088101425736287, + "tokens_seen": 3144767488 + }, + { + "epoch": 10.05, + "learning_rate": 2.391173520561685e-05, + "loss": 2.5877, + "theoretical_loss": 3.3088048915552277, + "tokens_seen": 3144833024 + }, + { + "epoch": 10.05, + "learning_rate": 2.390170511534604e-05, + "loss": 2.5578, + "theoretical_loss": 3.3087996406768925, + "tokens_seen": 3144898560 + }, + { + "epoch": 10.05, + "learning_rate": 2.3891675025075223e-05, + "loss": 2.4902, + "theoretical_loss": 3.3087943899386154, + "tokens_seen": 3144964096 + }, + { + "epoch": 10.05, + "learning_rate": 2.3881644934804414e-05, + "loss": 2.4196, + "theoretical_loss": 3.30878913934039, + "tokens_seen": 3145029632 + }, + { + "epoch": 10.05, + "learning_rate": 2.3871614844533602e-05, + "loss": 2.5281, + "theoretical_loss": 3.3087838888822096, + "tokens_seen": 3145095168 + }, + { + "epoch": 10.05, + "learning_rate": 2.386158475426279e-05, + "loss": 2.4302, + "theoretical_loss": 3.3087786385640685, + "tokens_seen": 3145160704 + }, + { + "epoch": 10.05, + "learning_rate": 2.3851554663991978e-05, + "loss": 2.2708, + "theoretical_loss": 3.3087733883859585, + "tokens_seen": 3145226240 + }, + { + "epoch": 10.05, + "learning_rate": 2.3841524573721163e-05, + "loss": 2.3082, + "theoretical_loss": 3.308768138347874, + "tokens_seen": 3145291776 + }, + { + "epoch": 10.05, + "learning_rate": 2.383149448345035e-05, + "loss": 2.465, + "theoretical_loss": 3.3087628884498086, + "tokens_seen": 3145357312 + }, + { + "epoch": 10.05, + "learning_rate": 2.382146439317954e-05, + "loss": 2.5056, + "theoretical_loss": 3.3087576386917545, + "tokens_seen": 3145422848 + }, + { + "epoch": 10.05, + "learning_rate": 2.3811434302908726e-05, + "loss": 2.5524, + "theoretical_loss": 3.308752389073706, + "tokens_seen": 3145488384 + }, + { + "epoch": 10.05, + "learning_rate": 2.3801404212637914e-05, + "loss": 2.3963, + "theoretical_loss": 3.3087471395956554, + "tokens_seen": 3145553920 + }, + { + "epoch": 10.05, + "learning_rate": 2.37913741223671e-05, + "loss": 2.5255, + "theoretical_loss": 3.3087418902575974, + "tokens_seen": 3145619456 + }, + { + "epoch": 10.05, + "learning_rate": 2.378134403209629e-05, + "loss": 2.2963, + "theoretical_loss": 3.3087366410595247, + "tokens_seen": 3145684992 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3444941, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5931429862976074, + "objective/train/theoretical_loss": 3.3087340165129806, + "objective/train/tokens_used": 3166177760, + "theoretical_loss": 3.3087340165129806, + "tokens_seen": 3145717760 + }, + { + "epoch": 10.05, + "learning_rate": 2.3771313941825478e-05, + "loss": 2.6379, + "theoretical_loss": 3.3087313920014303, + "tokens_seen": 3145750528 + }, + { + "epoch": 10.05, + "learning_rate": 2.3761283851554666e-05, + "loss": 2.547, + "theoretical_loss": 3.3087261430833084, + "tokens_seen": 3145816064 + }, + { + "epoch": 10.05, + "learning_rate": 2.3751253761283854e-05, + "loss": 2.6406, + "theoretical_loss": 3.3087208943051514, + "tokens_seen": 3145881600 + }, + { + "epoch": 10.05, + "learning_rate": 2.3741223671013038e-05, + "loss": 2.4889, + "theoretical_loss": 3.3087156456669535, + "tokens_seen": 3145947136 + }, + { + "epoch": 10.05, + "learning_rate": 2.3731193580742226e-05, + "loss": 2.3713, + "theoretical_loss": 3.3087103971687073, + "tokens_seen": 3146012672 + }, + { + "epoch": 10.05, + "learning_rate": 2.3721163490471414e-05, + "loss": 2.5751, + "theoretical_loss": 3.308705148810407, + "tokens_seen": 3146078208 + }, + { + "epoch": 10.05, + "learning_rate": 2.3711133400200602e-05, + "loss": 2.7274, + "theoretical_loss": 3.308699900592045, + "tokens_seen": 3146143744 + }, + { + "epoch": 10.05, + "learning_rate": 2.370110330992979e-05, + "loss": 2.6884, + "theoretical_loss": 3.3086946525136156, + "tokens_seen": 3146209280 + }, + { + "epoch": 10.05, + "learning_rate": 2.3691073219658978e-05, + "loss": 2.5309, + "theoretical_loss": 3.3086894045751114, + "tokens_seen": 3146274816 + }, + { + "epoch": 10.05, + "learning_rate": 2.3681043129388166e-05, + "loss": 2.5204, + "theoretical_loss": 3.308684156776526, + "tokens_seen": 3146340352 + }, + { + "epoch": 10.05, + "learning_rate": 2.3671013039117353e-05, + "loss": 2.4253, + "theoretical_loss": 3.3086789091178526, + "tokens_seen": 3146405888 + }, + { + "epoch": 10.05, + "learning_rate": 2.366098294884654e-05, + "loss": 2.4608, + "theoretical_loss": 3.308673661599085, + "tokens_seen": 3146471424 + }, + { + "epoch": 10.05, + "learning_rate": 2.365095285857573e-05, + "loss": 2.4831, + "theoretical_loss": 3.3086684142202163, + "tokens_seen": 3146536960 + }, + { + "epoch": 10.05, + "learning_rate": 2.3640922768304914e-05, + "loss": 2.4788, + "theoretical_loss": 3.30866316698124, + "tokens_seen": 3146602496 + }, + { + "epoch": 10.05, + "learning_rate": 2.36308926780341e-05, + "loss": 2.5, + "theoretical_loss": 3.3086579198821493, + "tokens_seen": 3146668032 + }, + { + "epoch": 10.05, + "learning_rate": 2.362086258776329e-05, + "loss": 2.4434, + "theoretical_loss": 3.3086526729229373, + "tokens_seen": 3146733568 + }, + { + "epoch": 10.05, + "learning_rate": 2.3610832497492477e-05, + "loss": 2.6876, + "theoretical_loss": 3.308647426103598, + "tokens_seen": 3146799104 + }, + { + "epoch": 10.05, + "learning_rate": 2.360080240722167e-05, + "loss": 2.375, + "theoretical_loss": 3.308642179424124, + "tokens_seen": 3146864640 + }, + { + "epoch": 10.05, + "learning_rate": 2.3590772316950853e-05, + "loss": 2.6794, + "theoretical_loss": 3.308636932884509, + "tokens_seen": 3146930176 + }, + { + "epoch": 10.05, + "learning_rate": 2.358074222668004e-05, + "loss": 2.6501, + "theoretical_loss": 3.308631686484747, + "tokens_seen": 3146995712 + }, + { + "epoch": 10.05, + "learning_rate": 2.357071213640923e-05, + "loss": 2.662, + "theoretical_loss": 3.30862644022483, + "tokens_seen": 3147061248 + }, + { + "epoch": 10.05, + "learning_rate": 2.3560682046138417e-05, + "loss": 2.6088, + "theoretical_loss": 3.308621194104753, + "tokens_seen": 3147126784 + }, + { + "epoch": 10.05, + "learning_rate": 2.3550651955867605e-05, + "loss": 2.4546, + "theoretical_loss": 3.308615948124508, + "tokens_seen": 3147192320 + }, + { + "epoch": 10.05, + "learning_rate": 2.354062186559679e-05, + "loss": 2.332, + "theoretical_loss": 3.308610702284089, + "tokens_seen": 3147257856 + }, + { + "epoch": 10.05, + "learning_rate": 2.3530591775325977e-05, + "loss": 2.5178, + "theoretical_loss": 3.3086054565834893, + "tokens_seen": 3147323392 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3446161, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4707751274108887, + "objective/train/theoretical_loss": 3.3086028337856193, + "objective/train/tokens_used": 3167816160, + "theoretical_loss": 3.3086028337856193, + "tokens_seen": 3147356160 + }, + { + "epoch": 10.05, + "learning_rate": 2.3520561685055165e-05, + "loss": 2.4357, + "theoretical_loss": 3.3086002110227017, + "tokens_seen": 3147388928 + }, + { + "epoch": 10.05, + "learning_rate": 2.3510531594784353e-05, + "loss": 2.4904, + "theoretical_loss": 3.30859496560172, + "tokens_seen": 3147454464 + }, + { + "epoch": 10.05, + "learning_rate": 2.350050150451354e-05, + "loss": 2.3258, + "theoretical_loss": 3.3085897203205383, + "tokens_seen": 3147520000 + }, + { + "epoch": 10.05, + "learning_rate": 2.349047141424273e-05, + "loss": 2.4085, + "theoretical_loss": 3.308584475179149, + "tokens_seen": 3147585536 + }, + { + "epoch": 10.05, + "learning_rate": 2.3480441323971917e-05, + "loss": 2.5047, + "theoretical_loss": 3.3085792301775454, + "tokens_seen": 3147651072 + }, + { + "epoch": 10.05, + "learning_rate": 2.3470411233701105e-05, + "loss": 2.5823, + "theoretical_loss": 3.3085739853157214, + "tokens_seen": 3147716608 + }, + { + "epoch": 10.05, + "learning_rate": 2.3460381143430293e-05, + "loss": 2.3648, + "theoretical_loss": 3.3085687405936706, + "tokens_seen": 3147782144 + }, + { + "epoch": 10.05, + "learning_rate": 2.3450351053159477e-05, + "loss": 2.5064, + "theoretical_loss": 3.308563496011385, + "tokens_seen": 3147847680 + }, + { + "epoch": 10.05, + "learning_rate": 2.3440320962888665e-05, + "loss": 2.4222, + "theoretical_loss": 3.3085582515688596, + "tokens_seen": 3147913216 + }, + { + "epoch": 10.05, + "learning_rate": 2.3430290872617853e-05, + "loss": 2.4532, + "theoretical_loss": 3.308553007266087, + "tokens_seen": 3147978752 + }, + { + "epoch": 10.05, + "learning_rate": 2.342026078234704e-05, + "loss": 2.4016, + "theoretical_loss": 3.3085477631030606, + "tokens_seen": 3148044288 + }, + { + "epoch": 10.05, + "learning_rate": 2.3410230692076232e-05, + "loss": 2.3867, + "theoretical_loss": 3.3085425190797735, + "tokens_seen": 3148109824 + }, + { + "epoch": 10.05, + "learning_rate": 2.3400200601805416e-05, + "loss": 2.549, + "theoretical_loss": 3.30853727519622, + "tokens_seen": 3148175360 + }, + { + "epoch": 10.05, + "learning_rate": 2.3390170511534604e-05, + "loss": 2.3613, + "theoretical_loss": 3.308532031452392, + "tokens_seen": 3148240896 + }, + { + "epoch": 10.05, + "learning_rate": 2.3380140421263792e-05, + "loss": 2.1872, + "theoretical_loss": 3.308526787848284, + "tokens_seen": 3148306432 + }, + { + "epoch": 10.05, + "learning_rate": 2.337011033099298e-05, + "loss": 2.5081, + "theoretical_loss": 3.3085215443838893, + "tokens_seen": 3148371968 + }, + { + "epoch": 10.05, + "learning_rate": 2.3360080240722168e-05, + "loss": 2.611, + "theoretical_loss": 3.3085163010592007, + "tokens_seen": 3148437504 + }, + { + "epoch": 10.05, + "learning_rate": 2.3350050150451353e-05, + "loss": 2.4868, + "theoretical_loss": 3.3085110578742123, + "tokens_seen": 3148503040 + }, + { + "epoch": 10.05, + "learning_rate": 2.334002006018054e-05, + "loss": 2.5076, + "theoretical_loss": 3.3085058148289166, + "tokens_seen": 3148568576 + }, + { + "epoch": 10.05, + "learning_rate": 2.332998996990973e-05, + "loss": 2.5016, + "theoretical_loss": 3.308500571923308, + "tokens_seen": 3148634112 + }, + { + "epoch": 10.05, + "learning_rate": 2.331995987963892e-05, + "loss": 2.2573, + "theoretical_loss": 3.3084953291573784, + "tokens_seen": 3148699648 + }, + { + "epoch": 10.05, + "learning_rate": 2.3309929789368108e-05, + "loss": 2.4576, + "theoretical_loss": 3.308490086531123, + "tokens_seen": 3148765184 + }, + { + "epoch": 10.05, + "learning_rate": 2.3299899699097292e-05, + "loss": 2.6163, + "theoretical_loss": 3.308484844044534, + "tokens_seen": 3148830720 + }, + { + "epoch": 10.05, + "learning_rate": 2.328986960882648e-05, + "loss": 2.4635, + "theoretical_loss": 3.308479601697605, + "tokens_seen": 3148896256 + }, + { + "epoch": 10.05, + "learning_rate": 2.3279839518555668e-05, + "loss": 2.4885, + "theoretical_loss": 3.3084743594903294, + "tokens_seen": 3148961792 + }, + { + "epoch": 10.05, + "objective/train/docs_used": 3446876, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9056057929992676, + "objective/train/theoretical_loss": 3.308471738439059, + "objective/train/tokens_used": 3169454560, + "theoretical_loss": 3.308471738439059, + "tokens_seen": 3148994560 + }, + { + "epoch": 10.05, + "learning_rate": 2.3269809428284856e-05, + "loss": 2.5845, + "theoretical_loss": 3.3084691174227006, + "tokens_seen": 3149027328 + }, + { + "epoch": 10.05, + "learning_rate": 2.3259779338014044e-05, + "loss": 2.3471, + "theoretical_loss": 3.3084638754947115, + "tokens_seen": 3149092864 + }, + { + "epoch": 10.06, + "learning_rate": 2.3249749247743228e-05, + "loss": 2.5355, + "theoretical_loss": 3.3084586337063566, + "tokens_seen": 3149158400 + }, + { + "epoch": 10.06, + "learning_rate": 2.3239719157472416e-05, + "loss": 2.5922, + "theoretical_loss": 3.3084533920576282, + "tokens_seen": 3149223936 + }, + { + "epoch": 10.06, + "learning_rate": 2.3229689067201604e-05, + "loss": 2.4992, + "theoretical_loss": 3.30844815054852, + "tokens_seen": 3149289472 + }, + { + "epoch": 10.06, + "learning_rate": 2.3219658976930795e-05, + "loss": 2.5055, + "theoretical_loss": 3.3084429091790257, + "tokens_seen": 3149355008 + }, + { + "epoch": 10.06, + "learning_rate": 2.3209628886659983e-05, + "loss": 2.3951, + "theoretical_loss": 3.3084376679491383, + "tokens_seen": 3149420544 + }, + { + "epoch": 10.06, + "learning_rate": 2.3199598796389168e-05, + "loss": 2.5854, + "theoretical_loss": 3.308432426858851, + "tokens_seen": 3149486080 + }, + { + "epoch": 10.06, + "learning_rate": 2.3189568706118356e-05, + "loss": 2.4708, + "theoretical_loss": 3.308427185908158, + "tokens_seen": 3149551616 + }, + { + "epoch": 10.06, + "learning_rate": 2.3179538615847543e-05, + "loss": 2.5062, + "theoretical_loss": 3.3084219450970522, + "tokens_seen": 3149617152 + }, + { + "epoch": 10.06, + "learning_rate": 2.316950852557673e-05, + "loss": 2.4441, + "theoretical_loss": 3.308416704425527, + "tokens_seen": 3149682688 + }, + { + "epoch": 10.06, + "learning_rate": 2.3159478435305916e-05, + "loss": 2.5648, + "theoretical_loss": 3.3084114638935755, + "tokens_seen": 3149748224 + }, + { + "epoch": 10.06, + "learning_rate": 2.3149448345035104e-05, + "loss": 2.4558, + "theoretical_loss": 3.308406223501191, + "tokens_seen": 3149813760 + }, + { + "epoch": 10.06, + "learning_rate": 2.313941825476429e-05, + "loss": 2.3272, + "theoretical_loss": 3.3084009832483674, + "tokens_seen": 3149879296 + }, + { + "epoch": 10.06, + "learning_rate": 2.3129388164493483e-05, + "loss": 2.4521, + "theoretical_loss": 3.308395743135098, + "tokens_seen": 3149944832 + }, + { + "epoch": 10.06, + "learning_rate": 2.311935807422267e-05, + "loss": 2.292, + "theoretical_loss": 3.308390503161376, + "tokens_seen": 3150010368 + }, + { + "epoch": 10.06, + "learning_rate": 2.3109327983951855e-05, + "loss": 2.3611, + "theoretical_loss": 3.3083852633271946, + "tokens_seen": 3150075904 + }, + { + "epoch": 10.06, + "learning_rate": 2.3099297893681043e-05, + "loss": 2.4689, + "theoretical_loss": 3.308380023632547, + "tokens_seen": 3150141440 + }, + { + "epoch": 10.06, + "learning_rate": 2.308926780341023e-05, + "loss": 2.3108, + "theoretical_loss": 3.3083747840774276, + "tokens_seen": 3150206976 + }, + { + "epoch": 10.06, + "learning_rate": 2.307923771313942e-05, + "loss": 2.4166, + "theoretical_loss": 3.3083695446618293, + "tokens_seen": 3150272512 + }, + { + "epoch": 10.06, + "learning_rate": 2.3069207622868607e-05, + "loss": 2.6537, + "theoretical_loss": 3.308364305385745, + "tokens_seen": 3150338048 + }, + { + "epoch": 10.06, + "learning_rate": 2.305917753259779e-05, + "loss": 2.536, + "theoretical_loss": 3.3083590662491686, + "tokens_seen": 3150403584 + }, + { + "epoch": 10.06, + "learning_rate": 2.304914744232698e-05, + "loss": 2.6599, + "theoretical_loss": 3.308353827252093, + "tokens_seen": 3150469120 + }, + { + "epoch": 10.06, + "learning_rate": 2.3039117352056167e-05, + "loss": 2.4118, + "theoretical_loss": 3.3083485883945123, + "tokens_seen": 3150534656 + }, + { + "epoch": 10.06, + "learning_rate": 2.302908726178536e-05, + "loss": 2.5152, + "theoretical_loss": 3.308343349676419, + "tokens_seen": 3150600192 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3447961, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3561718463897705, + "objective/train/theoretical_loss": 3.3083407303696784, + "objective/train/tokens_used": 3171092960, + "theoretical_loss": 3.3083407303696784, + "tokens_seen": 3150632960 + }, + { + "epoch": 10.06, + "learning_rate": 2.3019057171514546e-05, + "loss": 2.4846, + "theoretical_loss": 3.3083381110978074, + "tokens_seen": 3150665728 + }, + { + "epoch": 10.06, + "learning_rate": 2.300902708124373e-05, + "loss": 2.4261, + "theoretical_loss": 3.30833287265867, + "tokens_seen": 3150731264 + }, + { + "epoch": 10.06, + "learning_rate": 2.299899699097292e-05, + "loss": 2.4208, + "theoretical_loss": 3.308327634359001, + "tokens_seen": 3150796800 + }, + { + "epoch": 10.06, + "learning_rate": 2.2988966900702107e-05, + "loss": 2.5033, + "theoretical_loss": 3.3083223961987933, + "tokens_seen": 3150862336 + }, + { + "epoch": 10.06, + "learning_rate": 2.2978936810431295e-05, + "loss": 2.2822, + "theoretical_loss": 3.30831715817804, + "tokens_seen": 3150927872 + }, + { + "epoch": 10.06, + "learning_rate": 2.2968906720160482e-05, + "loss": 2.4816, + "theoretical_loss": 3.3083119202967355, + "tokens_seen": 3150993408 + }, + { + "epoch": 10.06, + "learning_rate": 2.2958876629889667e-05, + "loss": 2.5923, + "theoretical_loss": 3.308306682554872, + "tokens_seen": 3151058944 + }, + { + "epoch": 10.06, + "learning_rate": 2.2948846539618855e-05, + "loss": 2.5692, + "theoretical_loss": 3.3083014449524444, + "tokens_seen": 3151124480 + }, + { + "epoch": 10.06, + "learning_rate": 2.2938816449348046e-05, + "loss": 2.389, + "theoretical_loss": 3.308296207489444, + "tokens_seen": 3151190016 + }, + { + "epoch": 10.06, + "learning_rate": 2.2928786359077234e-05, + "loss": 2.6074, + "theoretical_loss": 3.3082909701658663, + "tokens_seen": 3151255552 + }, + { + "epoch": 10.06, + "learning_rate": 2.2918756268806422e-05, + "loss": 2.3258, + "theoretical_loss": 3.3082857329817035, + "tokens_seen": 3151321088 + }, + { + "epoch": 10.06, + "learning_rate": 2.2908726178535606e-05, + "loss": 2.5976, + "theoretical_loss": 3.3082804959369487, + "tokens_seen": 3151386624 + }, + { + "epoch": 10.06, + "learning_rate": 2.2898696088264794e-05, + "loss": 2.6393, + "theoretical_loss": 3.3082752590315962, + "tokens_seen": 3151452160 + }, + { + "epoch": 10.06, + "learning_rate": 2.2888665997993982e-05, + "loss": 2.4066, + "theoretical_loss": 3.308270022265639, + "tokens_seen": 3151517696 + }, + { + "epoch": 10.06, + "learning_rate": 2.287863590772317e-05, + "loss": 2.1858, + "theoretical_loss": 3.3082647856390706, + "tokens_seen": 3151583232 + }, + { + "epoch": 10.06, + "learning_rate": 2.2868605817452358e-05, + "loss": 2.5769, + "theoretical_loss": 3.308259549151884, + "tokens_seen": 3151648768 + }, + { + "epoch": 10.06, + "learning_rate": 2.2858575727181543e-05, + "loss": 2.3478, + "theoretical_loss": 3.308254312804073, + "tokens_seen": 3151714304 + }, + { + "epoch": 10.06, + "learning_rate": 2.2848545636910734e-05, + "loss": 2.4977, + "theoretical_loss": 3.3082490765956307, + "tokens_seen": 3151779840 + }, + { + "epoch": 10.06, + "learning_rate": 2.2838515546639922e-05, + "loss": 2.5327, + "theoretical_loss": 3.308243840526551, + "tokens_seen": 3151845376 + }, + { + "epoch": 10.06, + "learning_rate": 2.282848545636911e-05, + "loss": 2.4098, + "theoretical_loss": 3.3082386045968266, + "tokens_seen": 3151910912 + }, + { + "epoch": 10.06, + "learning_rate": 2.2818455366098298e-05, + "loss": 2.4208, + "theoretical_loss": 3.3082333688064516, + "tokens_seen": 3151976448 + }, + { + "epoch": 10.06, + "learning_rate": 2.2808425275827482e-05, + "loss": 2.5164, + "theoretical_loss": 3.308228133155419, + "tokens_seen": 3152041984 + }, + { + "epoch": 10.06, + "learning_rate": 2.279839518555667e-05, + "loss": 2.4958, + "theoretical_loss": 3.308222897643722, + "tokens_seen": 3152107520 + }, + { + "epoch": 10.06, + "learning_rate": 2.2788365095285858e-05, + "loss": 2.3908, + "theoretical_loss": 3.308217662271354, + "tokens_seen": 3152173056 + }, + { + "epoch": 10.06, + "learning_rate": 2.2778335005015046e-05, + "loss": 2.4337, + "theoretical_loss": 3.308212427038309, + "tokens_seen": 3152238592 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3448725, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.485743522644043, + "objective/train/theoretical_loss": 3.3082098094740306, + "objective/train/tokens_used": 3172731360, + "theoretical_loss": 3.3082098094740306, + "tokens_seen": 3152271360 + }, + { + "epoch": 10.06, + "learning_rate": 2.276830491474423e-05, + "loss": 2.4117, + "theoretical_loss": 3.30820719194458, + "tokens_seen": 3152304128 + }, + { + "epoch": 10.06, + "learning_rate": 2.2758274824473418e-05, + "loss": 2.6182, + "theoretical_loss": 3.3082019569901604, + "tokens_seen": 3152369664 + }, + { + "epoch": 10.06, + "learning_rate": 2.274824473420261e-05, + "loss": 2.3636, + "theoretical_loss": 3.3081967221750435, + "tokens_seen": 3152435200 + }, + { + "epoch": 10.06, + "learning_rate": 2.2738214643931797e-05, + "loss": 2.3904, + "theoretical_loss": 3.308191487499223, + "tokens_seen": 3152500736 + }, + { + "epoch": 10.06, + "learning_rate": 2.2728184553660985e-05, + "loss": 2.3765, + "theoretical_loss": 3.3081862529626918, + "tokens_seen": 3152566272 + }, + { + "epoch": 10.06, + "learning_rate": 2.271815446339017e-05, + "loss": 2.4705, + "theoretical_loss": 3.308181018565444, + "tokens_seen": 3152631808 + }, + { + "epoch": 10.06, + "learning_rate": 2.2708124373119358e-05, + "loss": 2.411, + "theoretical_loss": 3.3081757843074726, + "tokens_seen": 3152697344 + }, + { + "epoch": 10.06, + "learning_rate": 2.2698094282848546e-05, + "loss": 2.5776, + "theoretical_loss": 3.3081705501887706, + "tokens_seen": 3152762880 + }, + { + "epoch": 10.06, + "learning_rate": 2.2688064192577733e-05, + "loss": 2.4034, + "theoretical_loss": 3.3081653162093323, + "tokens_seen": 3152828416 + }, + { + "epoch": 10.06, + "learning_rate": 2.267803410230692e-05, + "loss": 2.6034, + "theoretical_loss": 3.30816008236915, + "tokens_seen": 3152893952 + }, + { + "epoch": 10.06, + "learning_rate": 2.2668004012036106e-05, + "loss": 2.3616, + "theoretical_loss": 3.3081548486682184, + "tokens_seen": 3152959488 + }, + { + "epoch": 10.06, + "learning_rate": 2.2657973921765297e-05, + "loss": 2.5342, + "theoretical_loss": 3.3081496151065295, + "tokens_seen": 3153025024 + }, + { + "epoch": 10.06, + "learning_rate": 2.2647943831494485e-05, + "loss": 2.6883, + "theoretical_loss": 3.3081443816840777, + "tokens_seen": 3153090560 + }, + { + "epoch": 10.06, + "learning_rate": 2.2637913741223673e-05, + "loss": 2.2046, + "theoretical_loss": 3.308139148400856, + "tokens_seen": 3153156096 + }, + { + "epoch": 10.06, + "learning_rate": 2.262788365095286e-05, + "loss": 2.5592, + "theoretical_loss": 3.3081339152568585, + "tokens_seen": 3153221632 + }, + { + "epoch": 10.06, + "learning_rate": 2.2617853560682045e-05, + "loss": 2.6253, + "theoretical_loss": 3.3081286822520775, + "tokens_seen": 3153287168 + }, + { + "epoch": 10.06, + "learning_rate": 2.2607823470411233e-05, + "loss": 2.4147, + "theoretical_loss": 3.308123449386507, + "tokens_seen": 3153352704 + }, + { + "epoch": 10.06, + "learning_rate": 2.259779338014042e-05, + "loss": 2.5067, + "theoretical_loss": 3.3081182166601404, + "tokens_seen": 3153418240 + }, + { + "epoch": 10.06, + "learning_rate": 2.258776328986961e-05, + "loss": 2.4359, + "theoretical_loss": 3.3081129840729706, + "tokens_seen": 3153483776 + }, + { + "epoch": 10.06, + "learning_rate": 2.2577733199598797e-05, + "loss": 2.4756, + "theoretical_loss": 3.308107751624992, + "tokens_seen": 3153549312 + }, + { + "epoch": 10.06, + "learning_rate": 2.2567703109327985e-05, + "loss": 2.4924, + "theoretical_loss": 3.308102519316197, + "tokens_seen": 3153614848 + }, + { + "epoch": 10.06, + "learning_rate": 2.2557673019057173e-05, + "loss": 2.2686, + "theoretical_loss": 3.30809728714658, + "tokens_seen": 3153680384 + }, + { + "epoch": 10.06, + "learning_rate": 2.254764292878636e-05, + "loss": 2.5645, + "theoretical_loss": 3.3080920551161332, + "tokens_seen": 3153745920 + }, + { + "epoch": 10.06, + "learning_rate": 2.253761283851555e-05, + "loss": 2.5083, + "theoretical_loss": 3.308086823224851, + "tokens_seen": 3153811456 + }, + { + "epoch": 10.06, + "learning_rate": 2.2527582748244736e-05, + "loss": 2.5255, + "theoretical_loss": 3.3080815914727264, + "tokens_seen": 3153876992 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3449889, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.633913040161133, + "objective/train/theoretical_loss": 3.3080789756488462, + "objective/train/tokens_used": 3174369760, + "theoretical_loss": 3.3080789756488462, + "tokens_seen": 3153909760 + }, + { + "epoch": 10.06, + "learning_rate": 2.251755265797392e-05, + "loss": 2.7677, + "theoretical_loss": 3.308076359859753, + "tokens_seen": 3153942528 + }, + { + "epoch": 10.06, + "learning_rate": 2.250752256770311e-05, + "loss": 2.6084, + "theoretical_loss": 3.3080711283859237, + "tokens_seen": 3154008064 + }, + { + "epoch": 10.06, + "learning_rate": 2.2497492477432297e-05, + "loss": 2.3169, + "theoretical_loss": 3.3080658970512324, + "tokens_seen": 3154073600 + }, + { + "epoch": 10.06, + "learning_rate": 2.2487462387161485e-05, + "loss": 2.6202, + "theoretical_loss": 3.3080606658556726, + "tokens_seen": 3154139136 + }, + { + "epoch": 10.06, + "learning_rate": 2.2477432296890672e-05, + "loss": 2.6445, + "theoretical_loss": 3.3080554347992375, + "tokens_seen": 3154204672 + }, + { + "epoch": 10.06, + "learning_rate": 2.246740220661986e-05, + "loss": 2.4673, + "theoretical_loss": 3.3080502038819204, + "tokens_seen": 3154270208 + }, + { + "epoch": 10.06, + "learning_rate": 2.2457372116349048e-05, + "loss": 2.3858, + "theoretical_loss": 3.3080449731037147, + "tokens_seen": 3154335744 + }, + { + "epoch": 10.06, + "learning_rate": 2.2447342026078236e-05, + "loss": 2.5384, + "theoretical_loss": 3.308039742464614, + "tokens_seen": 3154401280 + }, + { + "epoch": 10.06, + "learning_rate": 2.2437311935807424e-05, + "loss": 2.3637, + "theoretical_loss": 3.3080345119646117, + "tokens_seen": 3154466816 + }, + { + "epoch": 10.06, + "learning_rate": 2.242728184553661e-05, + "loss": 2.3666, + "theoretical_loss": 3.308029281603701, + "tokens_seen": 3154532352 + }, + { + "epoch": 10.06, + "learning_rate": 2.2417251755265796e-05, + "loss": 2.6556, + "theoretical_loss": 3.3080240513818757, + "tokens_seen": 3154597888 + }, + { + "epoch": 10.06, + "learning_rate": 2.2407221664994984e-05, + "loss": 2.5709, + "theoretical_loss": 3.308018821299129, + "tokens_seen": 3154663424 + }, + { + "epoch": 10.06, + "learning_rate": 2.2397191574724172e-05, + "loss": 2.5558, + "theoretical_loss": 3.3080135913554543, + "tokens_seen": 3154728960 + }, + { + "epoch": 10.06, + "learning_rate": 2.238716148445336e-05, + "loss": 2.2902, + "theoretical_loss": 3.3080083615508444, + "tokens_seen": 3154794496 + }, + { + "epoch": 10.06, + "learning_rate": 2.2377131394182548e-05, + "loss": 2.3873, + "theoretical_loss": 3.308003131885294, + "tokens_seen": 3154860032 + }, + { + "epoch": 10.06, + "learning_rate": 2.2367101303911736e-05, + "loss": 2.3384, + "theoretical_loss": 3.3079979023587955, + "tokens_seen": 3154925568 + }, + { + "epoch": 10.06, + "learning_rate": 2.2357071213640924e-05, + "loss": 2.3313, + "theoretical_loss": 3.3079926729713427, + "tokens_seen": 3154991104 + }, + { + "epoch": 10.06, + "learning_rate": 2.2347041123370112e-05, + "loss": 2.4978, + "theoretical_loss": 3.3079874437229293, + "tokens_seen": 3155056640 + }, + { + "epoch": 10.06, + "learning_rate": 2.23370110330993e-05, + "loss": 2.2759, + "theoretical_loss": 3.307982214613548, + "tokens_seen": 3155122176 + }, + { + "epoch": 10.06, + "learning_rate": 2.2326980942828484e-05, + "loss": 2.3367, + "theoretical_loss": 3.3079769856431924, + "tokens_seen": 3155187712 + }, + { + "epoch": 10.06, + "learning_rate": 2.2316950852557672e-05, + "loss": 2.509, + "theoretical_loss": 3.307971756811856, + "tokens_seen": 3155253248 + }, + { + "epoch": 10.06, + "learning_rate": 2.230692076228686e-05, + "loss": 2.3909, + "theoretical_loss": 3.307966528119533, + "tokens_seen": 3155318784 + }, + { + "epoch": 10.06, + "learning_rate": 2.2296890672016048e-05, + "loss": 2.2684, + "theoretical_loss": 3.307961299566216, + "tokens_seen": 3155384320 + }, + { + "epoch": 10.06, + "learning_rate": 2.228686058174524e-05, + "loss": 2.4102, + "theoretical_loss": 3.3079560711518985, + "tokens_seen": 3155449856 + }, + { + "epoch": 10.06, + "learning_rate": 2.2276830491474424e-05, + "loss": 2.4176, + "theoretical_loss": 3.3079508428765734, + "tokens_seen": 3155515392 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3454852, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4433863162994385, + "objective/train/theoretical_loss": 3.3079482287910316, + "objective/train/tokens_used": 3176008160, + "theoretical_loss": 3.3079482287910316, + "tokens_seen": 3155548160 + }, + { + "epoch": 10.06, + "learning_rate": 2.226680040120361e-05, + "loss": 2.5062, + "theoretical_loss": 3.3079456147402353, + "tokens_seen": 3155580928 + }, + { + "epoch": 10.06, + "learning_rate": 2.22567703109328e-05, + "loss": 2.5597, + "theoretical_loss": 3.3079403867428767, + "tokens_seen": 3155646464 + }, + { + "epoch": 10.06, + "learning_rate": 2.2246740220661987e-05, + "loss": 2.3627, + "theoretical_loss": 3.3079351588844914, + "tokens_seen": 3155712000 + }, + { + "epoch": 10.06, + "learning_rate": 2.2236710130391175e-05, + "loss": 2.7206, + "theoretical_loss": 3.3079299311650727, + "tokens_seen": 3155777536 + }, + { + "epoch": 10.06, + "learning_rate": 2.222668004012036e-05, + "loss": 2.5366, + "theoretical_loss": 3.3079247035846144, + "tokens_seen": 3155843072 + }, + { + "epoch": 10.06, + "learning_rate": 2.2216649949849548e-05, + "loss": 2.5406, + "theoretical_loss": 3.3079194761431094, + "tokens_seen": 3155908608 + }, + { + "epoch": 10.06, + "learning_rate": 2.2206619859578736e-05, + "loss": 2.5817, + "theoretical_loss": 3.307914248840551, + "tokens_seen": 3155974144 + }, + { + "epoch": 10.06, + "learning_rate": 2.2196589769307923e-05, + "loss": 2.5035, + "theoretical_loss": 3.3079090216769336, + "tokens_seen": 3156039680 + }, + { + "epoch": 10.06, + "learning_rate": 2.2186559679037115e-05, + "loss": 2.5307, + "theoretical_loss": 3.3079037946522494, + "tokens_seen": 3156105216 + }, + { + "epoch": 10.06, + "learning_rate": 2.21765295887663e-05, + "loss": 2.4208, + "theoretical_loss": 3.3078985677664923, + "tokens_seen": 3156170752 + }, + { + "epoch": 10.06, + "learning_rate": 2.2166499498495487e-05, + "loss": 2.5449, + "theoretical_loss": 3.307893341019656, + "tokens_seen": 3156236288 + }, + { + "epoch": 10.06, + "learning_rate": 2.2156469408224675e-05, + "loss": 2.423, + "theoretical_loss": 3.3078881144117336, + "tokens_seen": 3156301824 + }, + { + "epoch": 10.06, + "learning_rate": 2.2146439317953863e-05, + "loss": 2.6555, + "theoretical_loss": 3.307882887942719, + "tokens_seen": 3156367360 + }, + { + "epoch": 10.06, + "learning_rate": 2.213640922768305e-05, + "loss": 2.5079, + "theoretical_loss": 3.307877661612605, + "tokens_seen": 3156432896 + }, + { + "epoch": 10.06, + "learning_rate": 2.2126379137412235e-05, + "loss": 2.5248, + "theoretical_loss": 3.3078724354213853, + "tokens_seen": 3156498432 + }, + { + "epoch": 10.06, + "learning_rate": 2.2116349047141423e-05, + "loss": 2.5177, + "theoretical_loss": 3.3078672093690535, + "tokens_seen": 3156563968 + }, + { + "epoch": 10.06, + "learning_rate": 2.210631895687061e-05, + "loss": 2.4651, + "theoretical_loss": 3.3078619834556027, + "tokens_seen": 3156629504 + }, + { + "epoch": 10.06, + "learning_rate": 2.2096288866599802e-05, + "loss": 2.5399, + "theoretical_loss": 3.3078567576810265, + "tokens_seen": 3156695040 + }, + { + "epoch": 10.06, + "learning_rate": 2.208625877632899e-05, + "loss": 2.37, + "theoretical_loss": 3.3078515320453183, + "tokens_seen": 3156760576 + }, + { + "epoch": 10.06, + "learning_rate": 2.2076228686058175e-05, + "loss": 2.6783, + "theoretical_loss": 3.3078463065484716, + "tokens_seen": 3156826112 + }, + { + "epoch": 10.06, + "learning_rate": 2.2066198595787363e-05, + "loss": 2.4403, + "theoretical_loss": 3.3078410811904795, + "tokens_seen": 3156891648 + }, + { + "epoch": 10.06, + "learning_rate": 2.205616850551655e-05, + "loss": 2.5725, + "theoretical_loss": 3.307835855971336, + "tokens_seen": 3156957184 + }, + { + "epoch": 10.06, + "learning_rate": 2.204613841524574e-05, + "loss": 2.5905, + "theoretical_loss": 3.307830630891034, + "tokens_seen": 3157022720 + }, + { + "epoch": 10.06, + "learning_rate": 2.2036108324974923e-05, + "loss": 2.393, + "theoretical_loss": 3.307825405949567, + "tokens_seen": 3157088256 + }, + { + "epoch": 10.06, + "learning_rate": 2.202607823470411e-05, + "loss": 2.5203, + "theoretical_loss": 3.307820181146929, + "tokens_seen": 3157153792 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3459829, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4088733196258545, + "objective/train/theoretical_loss": 3.3078175687976685, + "objective/train/tokens_used": 3177646560, + "theoretical_loss": 3.3078175687976685, + "tokens_seen": 3157186560 + }, + { + "epoch": 10.06, + "learning_rate": 2.20160481444333e-05, + "loss": 2.473, + "theoretical_loss": 3.3078149564831127, + "tokens_seen": 3157219328 + }, + { + "epoch": 10.06, + "learning_rate": 2.200601805416249e-05, + "loss": 2.5336, + "theoretical_loss": 3.3078097319581117, + "tokens_seen": 3157284864 + }, + { + "epoch": 10.06, + "learning_rate": 2.1995987963891678e-05, + "loss": 2.3604, + "theoretical_loss": 3.30780450757192, + "tokens_seen": 3157350400 + }, + { + "epoch": 10.06, + "learning_rate": 2.1985957873620862e-05, + "loss": 2.6009, + "theoretical_loss": 3.30779928332453, + "tokens_seen": 3157415936 + }, + { + "epoch": 10.06, + "learning_rate": 2.197592778335005e-05, + "loss": 2.3091, + "theoretical_loss": 3.3077940592159365, + "tokens_seen": 3157481472 + }, + { + "epoch": 10.06, + "learning_rate": 2.1965897693079238e-05, + "loss": 2.3954, + "theoretical_loss": 3.3077888352461313, + "tokens_seen": 3157547008 + }, + { + "epoch": 10.06, + "learning_rate": 2.1955867602808426e-05, + "loss": 2.5693, + "theoretical_loss": 3.3077836114151093, + "tokens_seen": 3157612544 + }, + { + "epoch": 10.06, + "learning_rate": 2.1945837512537614e-05, + "loss": 2.6905, + "theoretical_loss": 3.307778387722863, + "tokens_seen": 3157678080 + }, + { + "epoch": 10.06, + "learning_rate": 2.19358074222668e-05, + "loss": 2.5553, + "theoretical_loss": 3.307773164169386, + "tokens_seen": 3157743616 + }, + { + "epoch": 10.06, + "learning_rate": 2.1925777331995986e-05, + "loss": 2.4121, + "theoretical_loss": 3.3077679407546725, + "tokens_seen": 3157809152 + }, + { + "epoch": 10.06, + "learning_rate": 2.1915747241725174e-05, + "loss": 2.2951, + "theoretical_loss": 3.307762717478715, + "tokens_seen": 3157874688 + }, + { + "epoch": 10.06, + "learning_rate": 2.1905717151454366e-05, + "loss": 2.4947, + "theoretical_loss": 3.307757494341507, + "tokens_seen": 3157940224 + }, + { + "epoch": 10.06, + "learning_rate": 2.1895687061183554e-05, + "loss": 2.5092, + "theoretical_loss": 3.307752271343042, + "tokens_seen": 3158005760 + }, + { + "epoch": 10.06, + "learning_rate": 2.1885656970912738e-05, + "loss": 2.4617, + "theoretical_loss": 3.3077470484833142, + "tokens_seen": 3158071296 + }, + { + "epoch": 10.06, + "learning_rate": 2.1875626880641926e-05, + "loss": 2.4422, + "theoretical_loss": 3.3077418257623163, + "tokens_seen": 3158136832 + }, + { + "epoch": 10.06, + "learning_rate": 2.1865596790371114e-05, + "loss": 2.2589, + "theoretical_loss": 3.3077366031800417, + "tokens_seen": 3158202368 + }, + { + "epoch": 10.06, + "learning_rate": 2.1855566700100302e-05, + "loss": 2.4351, + "theoretical_loss": 3.307731380736484, + "tokens_seen": 3158267904 + }, + { + "epoch": 10.06, + "learning_rate": 2.184553660982949e-05, + "loss": 2.5035, + "theoretical_loss": 3.3077261584316373, + "tokens_seen": 3158333440 + }, + { + "epoch": 10.06, + "learning_rate": 2.1835506519558674e-05, + "loss": 2.5327, + "theoretical_loss": 3.307720936265494, + "tokens_seen": 3158398976 + }, + { + "epoch": 10.06, + "learning_rate": 2.1825476429287862e-05, + "loss": 2.6083, + "theoretical_loss": 3.3077157142380478, + "tokens_seen": 3158464512 + }, + { + "epoch": 10.06, + "learning_rate": 2.1815446339017053e-05, + "loss": 2.5066, + "theoretical_loss": 3.307710492349292, + "tokens_seen": 3158530048 + }, + { + "epoch": 10.06, + "learning_rate": 2.180541624874624e-05, + "loss": 2.5568, + "theoretical_loss": 3.3077052705992207, + "tokens_seen": 3158595584 + }, + { + "epoch": 10.06, + "learning_rate": 2.179538615847543e-05, + "loss": 2.4398, + "theoretical_loss": 3.307700048987827, + "tokens_seen": 3158661120 + }, + { + "epoch": 10.06, + "learning_rate": 2.1785356068204614e-05, + "loss": 2.55, + "theoretical_loss": 3.307694827515104, + "tokens_seen": 3158726656 + }, + { + "epoch": 10.06, + "learning_rate": 2.17753259779338e-05, + "loss": 2.5053, + "theoretical_loss": 3.307689606181046, + "tokens_seen": 3158792192 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3464921, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.543109655380249, + "objective/train/theoretical_loss": 3.307686995566014, + "objective/train/tokens_used": 3179284960, + "theoretical_loss": 3.307686995566014, + "tokens_seen": 3158824960 + }, + { + "epoch": 10.06, + "learning_rate": 2.176529588766299e-05, + "loss": 2.3333, + "theoretical_loss": 3.3076843849856457, + "tokens_seen": 3158857728 + }, + { + "epoch": 10.06, + "learning_rate": 2.1755265797392177e-05, + "loss": 2.2218, + "theoretical_loss": 3.3076791639288965, + "tokens_seen": 3158923264 + }, + { + "epoch": 10.06, + "learning_rate": 2.1745235707121365e-05, + "loss": 2.4132, + "theoretical_loss": 3.3076739430107924, + "tokens_seen": 3158988800 + }, + { + "epoch": 10.06, + "learning_rate": 2.173520561685055e-05, + "loss": 2.3925, + "theoretical_loss": 3.307668722231326, + "tokens_seen": 3159054336 + }, + { + "epoch": 10.06, + "learning_rate": 2.1725175526579738e-05, + "loss": 2.333, + "theoretical_loss": 3.3076635015904916, + "tokens_seen": 3159119872 + }, + { + "epoch": 10.06, + "learning_rate": 2.171514543630893e-05, + "loss": 2.1917, + "theoretical_loss": 3.307658281088282, + "tokens_seen": 3159185408 + }, + { + "epoch": 10.06, + "learning_rate": 2.1705115346038117e-05, + "loss": 2.666, + "theoretical_loss": 3.3076530607246912, + "tokens_seen": 3159250944 + }, + { + "epoch": 10.06, + "learning_rate": 2.16950852557673e-05, + "loss": 2.5462, + "theoretical_loss": 3.3076478404997123, + "tokens_seen": 3159316480 + }, + { + "epoch": 10.06, + "learning_rate": 2.168505516549649e-05, + "loss": 2.6005, + "theoretical_loss": 3.307642620413339, + "tokens_seen": 3159382016 + }, + { + "epoch": 10.06, + "learning_rate": 2.1675025075225677e-05, + "loss": 2.3414, + "theoretical_loss": 3.3076374004655644, + "tokens_seen": 3159447552 + }, + { + "epoch": 10.06, + "learning_rate": 2.1664994984954865e-05, + "loss": 2.4654, + "theoretical_loss": 3.307632180656382, + "tokens_seen": 3159513088 + }, + { + "epoch": 10.06, + "learning_rate": 2.1654964894684053e-05, + "loss": 2.5859, + "theoretical_loss": 3.3076269609857856, + "tokens_seen": 3159578624 + }, + { + "epoch": 10.06, + "learning_rate": 2.1644934804413237e-05, + "loss": 2.3229, + "theoretical_loss": 3.3076217414537683, + "tokens_seen": 3159644160 + }, + { + "epoch": 10.06, + "learning_rate": 2.1634904714142425e-05, + "loss": 2.4854, + "theoretical_loss": 3.3076165220603233, + "tokens_seen": 3159709696 + }, + { + "epoch": 10.06, + "learning_rate": 2.1624874623871617e-05, + "loss": 2.5743, + "theoretical_loss": 3.307611302805445, + "tokens_seen": 3159775232 + }, + { + "epoch": 10.06, + "learning_rate": 2.1614844533600804e-05, + "loss": 2.4513, + "theoretical_loss": 3.3076060836891257, + "tokens_seen": 3159840768 + }, + { + "epoch": 10.06, + "learning_rate": 2.1604814443329992e-05, + "loss": 2.5037, + "theoretical_loss": 3.3076008647113597, + "tokens_seen": 3159906304 + }, + { + "epoch": 10.06, + "learning_rate": 2.1594784353059177e-05, + "loss": 2.6026, + "theoretical_loss": 3.3075956458721403, + "tokens_seen": 3159971840 + }, + { + "epoch": 10.06, + "learning_rate": 2.1584754262788365e-05, + "loss": 2.3901, + "theoretical_loss": 3.3075904271714602, + "tokens_seen": 3160037376 + }, + { + "epoch": 10.06, + "learning_rate": 2.1574724172517553e-05, + "loss": 2.4502, + "theoretical_loss": 3.3075852086093143, + "tokens_seen": 3160102912 + }, + { + "epoch": 10.06, + "learning_rate": 2.156469408224674e-05, + "loss": 2.3414, + "theoretical_loss": 3.3075799901856944, + "tokens_seen": 3160168448 + }, + { + "epoch": 10.06, + "learning_rate": 2.155466399197593e-05, + "loss": 2.4773, + "theoretical_loss": 3.3075747719005952, + "tokens_seen": 3160233984 + }, + { + "epoch": 10.06, + "learning_rate": 2.1544633901705113e-05, + "loss": 2.3837, + "theoretical_loss": 3.3075695537540097, + "tokens_seen": 3160299520 + }, + { + "epoch": 10.06, + "learning_rate": 2.1534603811434304e-05, + "loss": 2.4633, + "theoretical_loss": 3.3075643357459312, + "tokens_seen": 3160365056 + }, + { + "epoch": 10.06, + "learning_rate": 2.1524573721163492e-05, + "loss": 2.3822, + "theoretical_loss": 3.307559117876353, + "tokens_seen": 3160430592 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3469998, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.391409158706665, + "objective/train/theoretical_loss": 3.3075565089934997, + "objective/train/tokens_used": 3180923360, + "theoretical_loss": 3.3075565089934997, + "tokens_seen": 3160463360 + }, + { + "epoch": 10.06, + "learning_rate": 2.151454363089268e-05, + "loss": 2.4189, + "theoretical_loss": 3.3075539001452694, + "tokens_seen": 3160496128 + }, + { + "epoch": 10.06, + "learning_rate": 2.1504513540621868e-05, + "loss": 2.6798, + "theoretical_loss": 3.3075486825526728, + "tokens_seen": 3160561664 + }, + { + "epoch": 10.06, + "learning_rate": 2.1494483450351052e-05, + "loss": 2.5103, + "theoretical_loss": 3.3075434650985573, + "tokens_seen": 3160627200 + }, + { + "epoch": 10.06, + "learning_rate": 2.148445336008024e-05, + "loss": 2.4747, + "theoretical_loss": 3.3075382477829165, + "tokens_seen": 3160692736 + }, + { + "epoch": 10.06, + "learning_rate": 2.1474423269809428e-05, + "loss": 2.3861, + "theoretical_loss": 3.307533030605743, + "tokens_seen": 3160758272 + }, + { + "epoch": 10.06, + "learning_rate": 2.1464393179538616e-05, + "loss": 2.4285, + "theoretical_loss": 3.3075278135670314, + "tokens_seen": 3160823808 + }, + { + "epoch": 10.06, + "learning_rate": 2.1454363089267804e-05, + "loss": 2.4031, + "theoretical_loss": 3.3075225966667743, + "tokens_seen": 3160889344 + }, + { + "epoch": 10.06, + "learning_rate": 2.144433299899699e-05, + "loss": 2.3179, + "theoretical_loss": 3.3075173799049655, + "tokens_seen": 3160954880 + }, + { + "epoch": 10.06, + "learning_rate": 2.143430290872618e-05, + "loss": 2.4925, + "theoretical_loss": 3.3075121632815985, + "tokens_seen": 3161020416 + }, + { + "epoch": 10.06, + "learning_rate": 2.1424272818455368e-05, + "loss": 2.3415, + "theoretical_loss": 3.307506946796666, + "tokens_seen": 3161085952 + }, + { + "epoch": 10.06, + "learning_rate": 2.1414242728184556e-05, + "loss": 2.5482, + "theoretical_loss": 3.307501730450163, + "tokens_seen": 3161151488 + }, + { + "epoch": 10.06, + "learning_rate": 2.1404212637913744e-05, + "loss": 2.4444, + "theoretical_loss": 3.3074965142420814, + "tokens_seen": 3161217024 + }, + { + "epoch": 10.06, + "learning_rate": 2.1394182547642928e-05, + "loss": 2.3675, + "theoretical_loss": 3.3074912981724154, + "tokens_seen": 3161282560 + }, + { + "epoch": 10.06, + "learning_rate": 2.1384152457372116e-05, + "loss": 2.4289, + "theoretical_loss": 3.3074860822411587, + "tokens_seen": 3161348096 + }, + { + "epoch": 10.06, + "learning_rate": 2.1374122367101304e-05, + "loss": 2.3863, + "theoretical_loss": 3.307480866448304, + "tokens_seen": 3161413632 + }, + { + "epoch": 10.06, + "learning_rate": 2.1364092276830492e-05, + "loss": 2.2985, + "theoretical_loss": 3.307475650793845, + "tokens_seen": 3161479168 + }, + { + "epoch": 10.06, + "learning_rate": 2.1354062186559676e-05, + "loss": 2.3353, + "theoretical_loss": 3.307470435277776, + "tokens_seen": 3161544704 + }, + { + "epoch": 10.06, + "learning_rate": 2.1344032096288868e-05, + "loss": 2.4991, + "theoretical_loss": 3.3074652199000893, + "tokens_seen": 3161610240 + }, + { + "epoch": 10.06, + "learning_rate": 2.1334002006018055e-05, + "loss": 2.3928, + "theoretical_loss": 3.307460004660779, + "tokens_seen": 3161675776 + }, + { + "epoch": 10.06, + "learning_rate": 2.1323971915747243e-05, + "loss": 2.4569, + "theoretical_loss": 3.3074547895598387, + "tokens_seen": 3161741312 + }, + { + "epoch": 10.06, + "learning_rate": 2.131394182547643e-05, + "loss": 2.4056, + "theoretical_loss": 3.3074495745972614, + "tokens_seen": 3161806848 + }, + { + "epoch": 10.06, + "learning_rate": 2.1303911735205616e-05, + "loss": 2.4745, + "theoretical_loss": 3.3074443597730405, + "tokens_seen": 3161872384 + }, + { + "epoch": 10.06, + "learning_rate": 2.1293881644934804e-05, + "loss": 2.5934, + "theoretical_loss": 3.30743914508717, + "tokens_seen": 3161937920 + }, + { + "epoch": 10.06, + "learning_rate": 2.128385155466399e-05, + "loss": 2.5607, + "theoretical_loss": 3.3074339305396427, + "tokens_seen": 3162003456 + }, + { + "epoch": 10.06, + "learning_rate": 2.127382146439318e-05, + "loss": 2.288, + "theoretical_loss": 3.3074287161304525, + "tokens_seen": 3162068992 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3475090, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.344153881072998, + "objective/train/theoretical_loss": 3.307426108977732, + "objective/train/tokens_used": 3182561760, + "theoretical_loss": 3.307426108977732, + "tokens_seen": 3162101760 + }, + { + "epoch": 10.06, + "learning_rate": 2.1263791374122367e-05, + "loss": 2.3617, + "theoretical_loss": 3.307423501859593, + "tokens_seen": 3162134528 + }, + { + "epoch": 10.06, + "learning_rate": 2.1253761283851555e-05, + "loss": 2.4813, + "theoretical_loss": 3.3074182877270575, + "tokens_seen": 3162200064 + }, + { + "epoch": 10.06, + "learning_rate": 2.1243731193580743e-05, + "loss": 2.5392, + "theoretical_loss": 3.3074130737328393, + "tokens_seen": 3162265600 + }, + { + "epoch": 10.06, + "learning_rate": 2.123370110330993e-05, + "loss": 2.305, + "theoretical_loss": 3.307407859876932, + "tokens_seen": 3162331136 + }, + { + "epoch": 10.06, + "learning_rate": 2.122367101303912e-05, + "loss": 2.2864, + "theoretical_loss": 3.307402646159329, + "tokens_seen": 3162396672 + }, + { + "epoch": 10.06, + "learning_rate": 2.1213640922768307e-05, + "loss": 2.3075, + "theoretical_loss": 3.3073974325800237, + "tokens_seen": 3162462208 + }, + { + "epoch": 10.06, + "learning_rate": 2.120361083249749e-05, + "loss": 2.5167, + "theoretical_loss": 3.30739221913901, + "tokens_seen": 3162527744 + }, + { + "epoch": 10.06, + "learning_rate": 2.119358074222668e-05, + "loss": 2.4662, + "theoretical_loss": 3.3073870058362806, + "tokens_seen": 3162593280 + }, + { + "epoch": 10.06, + "learning_rate": 2.1183550651955867e-05, + "loss": 2.51, + "theoretical_loss": 3.30738179267183, + "tokens_seen": 3162658816 + }, + { + "epoch": 10.06, + "learning_rate": 2.1173520561685055e-05, + "loss": 2.5281, + "theoretical_loss": 3.3073765796456502, + "tokens_seen": 3162724352 + }, + { + "epoch": 10.06, + "learning_rate": 2.1163490471414243e-05, + "loss": 2.4587, + "theoretical_loss": 3.3073713667577365, + "tokens_seen": 3162789888 + }, + { + "epoch": 10.06, + "learning_rate": 2.115346038114343e-05, + "loss": 2.4066, + "theoretical_loss": 3.307366154008081, + "tokens_seen": 3162855424 + }, + { + "epoch": 10.06, + "learning_rate": 2.114343029087262e-05, + "loss": 2.3348, + "theoretical_loss": 3.3073609413966776, + "tokens_seen": 3162920960 + }, + { + "epoch": 10.06, + "learning_rate": 2.1133400200601807e-05, + "loss": 2.4688, + "theoretical_loss": 3.3073557289235196, + "tokens_seen": 3162986496 + }, + { + "epoch": 10.06, + "learning_rate": 2.1123370110330994e-05, + "loss": 2.4862, + "theoretical_loss": 3.307350516588601, + "tokens_seen": 3163052032 + }, + { + "epoch": 10.06, + "learning_rate": 2.1113340020060182e-05, + "loss": 2.296, + "theoretical_loss": 3.307345304391914, + "tokens_seen": 3163117568 + }, + { + "epoch": 10.06, + "learning_rate": 2.1103309929789367e-05, + "loss": 2.2185, + "theoretical_loss": 3.3073400923334537, + "tokens_seen": 3163183104 + }, + { + "epoch": 10.06, + "learning_rate": 2.1093279839518555e-05, + "loss": 2.3724, + "theoretical_loss": 3.3073348804132126, + "tokens_seen": 3163248640 + }, + { + "epoch": 10.06, + "learning_rate": 2.1083249749247743e-05, + "loss": 2.5715, + "theoretical_loss": 3.3073296686311844, + "tokens_seen": 3163314176 + }, + { + "epoch": 10.06, + "learning_rate": 2.107321965897693e-05, + "loss": 2.583, + "theoretical_loss": 3.307324456987363, + "tokens_seen": 3163379712 + }, + { + "epoch": 10.06, + "learning_rate": 2.1063189568706122e-05, + "loss": 2.5574, + "theoretical_loss": 3.307319245481741, + "tokens_seen": 3163445248 + }, + { + "epoch": 10.06, + "learning_rate": 2.1053159478435306e-05, + "loss": 2.3733, + "theoretical_loss": 3.307314034114312, + "tokens_seen": 3163510784 + }, + { + "epoch": 10.06, + "learning_rate": 2.1043129388164494e-05, + "loss": 2.3941, + "theoretical_loss": 3.3073088228850707, + "tokens_seen": 3163576320 + }, + { + "epoch": 10.06, + "learning_rate": 2.1033099297893682e-05, + "loss": 2.5402, + "theoretical_loss": 3.307303611794009, + "tokens_seen": 3163641856 + }, + { + "epoch": 10.06, + "learning_rate": 2.102306920762287e-05, + "loss": 2.4338, + "theoretical_loss": 3.3072984008411215, + "tokens_seen": 3163707392 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3476371, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.771028518676758, + "objective/train/theoretical_loss": 3.3072957954164903, + "objective/train/tokens_used": 3184200160, + "theoretical_loss": 3.3072957954164903, + "tokens_seen": 3163740160 + }, + { + "epoch": 10.06, + "learning_rate": 2.1013039117352058e-05, + "loss": 2.5635, + "theoretical_loss": 3.3072931900264004, + "tokens_seen": 3163772928 + }, + { + "epoch": 10.06, + "learning_rate": 2.1003009027081242e-05, + "loss": 2.4991, + "theoretical_loss": 3.307287979349841, + "tokens_seen": 3163838464 + }, + { + "epoch": 10.06, + "learning_rate": 2.099297893681043e-05, + "loss": 2.4027, + "theoretical_loss": 3.307282768811435, + "tokens_seen": 3163904000 + }, + { + "epoch": 10.06, + "learning_rate": 2.0982948846539618e-05, + "loss": 2.4663, + "theoretical_loss": 3.307277558411177, + "tokens_seen": 3163969536 + }, + { + "epoch": 10.06, + "learning_rate": 2.097291875626881e-05, + "loss": 2.474, + "theoretical_loss": 3.30727234814906, + "tokens_seen": 3164035072 + }, + { + "epoch": 10.06, + "learning_rate": 2.0962888665997994e-05, + "loss": 2.662, + "theoretical_loss": 3.307267138025077, + "tokens_seen": 3164100608 + }, + { + "epoch": 10.06, + "learning_rate": 2.0952858575727182e-05, + "loss": 2.4534, + "theoretical_loss": 3.307261928039223, + "tokens_seen": 3164166144 + }, + { + "epoch": 10.06, + "learning_rate": 2.094282848545637e-05, + "loss": 2.5326, + "theoretical_loss": 3.30725671819149, + "tokens_seen": 3164231680 + }, + { + "epoch": 10.06, + "learning_rate": 2.0932798395185558e-05, + "loss": 2.6487, + "theoretical_loss": 3.3072515084818725, + "tokens_seen": 3164297216 + }, + { + "epoch": 10.06, + "learning_rate": 2.0922768304914746e-05, + "loss": 2.3618, + "theoretical_loss": 3.3072462989103633, + "tokens_seen": 3164362752 + }, + { + "epoch": 10.06, + "learning_rate": 2.091273821464393e-05, + "loss": 2.5271, + "theoretical_loss": 3.3072410894769555, + "tokens_seen": 3164428288 + }, + { + "epoch": 10.06, + "learning_rate": 2.0902708124373118e-05, + "loss": 2.5319, + "theoretical_loss": 3.3072358801816435, + "tokens_seen": 3164493824 + }, + { + "epoch": 10.06, + "learning_rate": 2.0892678034102306e-05, + "loss": 2.4747, + "theoretical_loss": 3.3072306710244206, + "tokens_seen": 3164559360 + }, + { + "epoch": 10.06, + "learning_rate": 2.0882647943831494e-05, + "loss": 2.5544, + "theoretical_loss": 3.30722546200528, + "tokens_seen": 3164624896 + }, + { + "epoch": 10.06, + "learning_rate": 2.0872617853560685e-05, + "loss": 2.171, + "theoretical_loss": 3.3072202531242154, + "tokens_seen": 3164690432 + }, + { + "epoch": 10.06, + "learning_rate": 2.086258776328987e-05, + "loss": 2.4272, + "theoretical_loss": 3.3072150443812203, + "tokens_seen": 3164755968 + }, + { + "epoch": 10.06, + "learning_rate": 2.0852557673019058e-05, + "loss": 2.5415, + "theoretical_loss": 3.3072098357762876, + "tokens_seen": 3164821504 + }, + { + "epoch": 10.06, + "learning_rate": 2.0842527582748245e-05, + "loss": 2.5594, + "theoretical_loss": 3.3072046273094116, + "tokens_seen": 3164887040 + }, + { + "epoch": 10.06, + "learning_rate": 2.0832497492477433e-05, + "loss": 2.4582, + "theoretical_loss": 3.307199418980585, + "tokens_seen": 3164952576 + }, + { + "epoch": 10.06, + "learning_rate": 2.082246740220662e-05, + "loss": 2.6083, + "theoretical_loss": 3.3071942107898025, + "tokens_seen": 3165018112 + }, + { + "epoch": 10.06, + "learning_rate": 2.0812437311935806e-05, + "loss": 2.3261, + "theoretical_loss": 3.307189002737056, + "tokens_seen": 3165083648 + }, + { + "epoch": 10.06, + "learning_rate": 2.0802407221664994e-05, + "loss": 2.373, + "theoretical_loss": 3.30718379482234, + "tokens_seen": 3165149184 + }, + { + "epoch": 10.06, + "learning_rate": 2.079237713139418e-05, + "loss": 2.524, + "theoretical_loss": 3.307178587045648, + "tokens_seen": 3165214720 + }, + { + "epoch": 10.06, + "learning_rate": 2.0782347041123373e-05, + "loss": 2.5273, + "theoretical_loss": 3.307173379406973, + "tokens_seen": 3165280256 + }, + { + "epoch": 10.06, + "learning_rate": 2.077231695085256e-05, + "loss": 2.3858, + "theoretical_loss": 3.3071681719063086, + "tokens_seen": 3165345792 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3477024, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4376564025878906, + "objective/train/theoretical_loss": 3.307165568207729, + "objective/train/tokens_used": 3185838560, + "theoretical_loss": 3.307165568207729, + "tokens_seen": 3165378560 + }, + { + "epoch": 10.06, + "learning_rate": 2.0762286860581745e-05, + "loss": 2.4784, + "theoretical_loss": 3.307162964543649, + "tokens_seen": 3165411328 + }, + { + "epoch": 10.06, + "learning_rate": 2.0752256770310933e-05, + "loss": 2.4405, + "theoretical_loss": 3.3071577573189863, + "tokens_seen": 3165476864 + }, + { + "epoch": 10.06, + "learning_rate": 2.074222668004012e-05, + "loss": 2.5844, + "theoretical_loss": 3.307152550232315, + "tokens_seen": 3165542400 + }, + { + "epoch": 10.06, + "learning_rate": 2.073219658976931e-05, + "loss": 2.39, + "theoretical_loss": 3.3071473432836287, + "tokens_seen": 3165607936 + }, + { + "epoch": 10.06, + "learning_rate": 2.0722166499498497e-05, + "loss": 2.5373, + "theoretical_loss": 3.3071421364729203, + "tokens_seen": 3165673472 + }, + { + "epoch": 10.06, + "learning_rate": 2.071213640922768e-05, + "loss": 2.5483, + "theoretical_loss": 3.307136929800184, + "tokens_seen": 3165739008 + }, + { + "epoch": 10.06, + "learning_rate": 2.070210631895687e-05, + "loss": 2.5403, + "theoretical_loss": 3.307131723265412, + "tokens_seen": 3165804544 + }, + { + "epoch": 10.06, + "learning_rate": 2.069207622868606e-05, + "loss": 2.4486, + "theoretical_loss": 3.3071265168685993, + "tokens_seen": 3165870080 + }, + { + "epoch": 10.06, + "learning_rate": 2.068204613841525e-05, + "loss": 2.5242, + "theoretical_loss": 3.3071213106097384, + "tokens_seen": 3165935616 + }, + { + "epoch": 10.06, + "learning_rate": 2.0672016048144436e-05, + "loss": 2.3799, + "theoretical_loss": 3.307116104488823, + "tokens_seen": 3166001152 + }, + { + "epoch": 10.06, + "learning_rate": 2.066198595787362e-05, + "loss": 2.6575, + "theoretical_loss": 3.307110898505847, + "tokens_seen": 3166066688 + }, + { + "epoch": 10.06, + "learning_rate": 2.065195586760281e-05, + "loss": 2.5328, + "theoretical_loss": 3.3071056926608033, + "tokens_seen": 3166132224 + }, + { + "epoch": 10.06, + "learning_rate": 2.0641925777331997e-05, + "loss": 2.4769, + "theoretical_loss": 3.3071004869536855, + "tokens_seen": 3166197760 + }, + { + "epoch": 10.06, + "learning_rate": 2.0631895687061184e-05, + "loss": 2.2934, + "theoretical_loss": 3.3070952813844876, + "tokens_seen": 3166263296 + }, + { + "epoch": 10.06, + "learning_rate": 2.062186559679037e-05, + "loss": 2.5468, + "theoretical_loss": 3.307090075953203, + "tokens_seen": 3166328832 + }, + { + "epoch": 10.06, + "learning_rate": 2.0611835506519557e-05, + "loss": 2.4467, + "theoretical_loss": 3.3070848706598244, + "tokens_seen": 3166394368 + }, + { + "epoch": 10.06, + "learning_rate": 2.0601805416248745e-05, + "loss": 2.3552, + "theoretical_loss": 3.3070796655043457, + "tokens_seen": 3166459904 + }, + { + "epoch": 10.06, + "learning_rate": 2.0591775325977936e-05, + "loss": 2.6414, + "theoretical_loss": 3.307074460486761, + "tokens_seen": 3166525440 + }, + { + "epoch": 10.06, + "learning_rate": 2.0581745235707124e-05, + "loss": 2.563, + "theoretical_loss": 3.3070692556070633, + "tokens_seen": 3166590976 + }, + { + "epoch": 10.06, + "learning_rate": 2.057171514543631e-05, + "loss": 2.3786, + "theoretical_loss": 3.307064050865246, + "tokens_seen": 3166656512 + }, + { + "epoch": 10.06, + "learning_rate": 2.0561685055165496e-05, + "loss": 2.6311, + "theoretical_loss": 3.3070588462613024, + "tokens_seen": 3166722048 + }, + { + "epoch": 10.06, + "learning_rate": 2.0551654964894684e-05, + "loss": 2.5345, + "theoretical_loss": 3.3070536417952265, + "tokens_seen": 3166787584 + }, + { + "epoch": 10.06, + "learning_rate": 2.0541624874623872e-05, + "loss": 2.5073, + "theoretical_loss": 3.307048437467012, + "tokens_seen": 3166853120 + }, + { + "epoch": 10.06, + "learning_rate": 2.053159478435306e-05, + "loss": 2.4849, + "theoretical_loss": 3.3070432332766515, + "tokens_seen": 3166918656 + }, + { + "epoch": 10.06, + "learning_rate": 2.0521564694082245e-05, + "loss": 2.2932, + "theoretical_loss": 3.307038029224139, + "tokens_seen": 3166984192 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3478614, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3299360275268555, + "objective/train/theoretical_loss": 3.3070354272495734, + "objective/train/tokens_used": 3187476960, + "theoretical_loss": 3.3070354272495734, + "tokens_seen": 3167016960 + }, + { + "epoch": 10.06, + "learning_rate": 2.0511534603811432e-05, + "loss": 2.3095, + "theoretical_loss": 3.3070328253094683, + "tokens_seen": 3167049728 + }, + { + "epoch": 10.06, + "learning_rate": 2.0501504513540624e-05, + "loss": 2.3842, + "theoretical_loss": 3.307027621532632, + "tokens_seen": 3167115264 + }, + { + "epoch": 10.06, + "learning_rate": 2.049147442326981e-05, + "loss": 2.3783, + "theoretical_loss": 3.3070224178936245, + "tokens_seen": 3167180800 + }, + { + "epoch": 10.06, + "learning_rate": 2.0481444332999e-05, + "loss": 2.6956, + "theoretical_loss": 3.307017214392439, + "tokens_seen": 3167246336 + }, + { + "epoch": 10.06, + "learning_rate": 2.0471414242728184e-05, + "loss": 2.4427, + "theoretical_loss": 3.307012011029069, + "tokens_seen": 3167311872 + }, + { + "epoch": 10.06, + "learning_rate": 2.0461384152457372e-05, + "loss": 2.345, + "theoretical_loss": 3.307006807803508, + "tokens_seen": 3167377408 + }, + { + "epoch": 10.06, + "learning_rate": 2.045135406218656e-05, + "loss": 2.589, + "theoretical_loss": 3.307001604715749, + "tokens_seen": 3167442944 + }, + { + "epoch": 10.06, + "learning_rate": 2.0441323971915748e-05, + "loss": 2.406, + "theoretical_loss": 3.3069964017657867, + "tokens_seen": 3167508480 + }, + { + "epoch": 10.06, + "learning_rate": 2.0431293881644936e-05, + "loss": 2.5937, + "theoretical_loss": 3.3069911989536136, + "tokens_seen": 3167574016 + }, + { + "epoch": 10.06, + "learning_rate": 2.042126379137412e-05, + "loss": 2.2958, + "theoretical_loss": 3.3069859962792236, + "tokens_seen": 3167639552 + }, + { + "epoch": 10.06, + "learning_rate": 2.0411233701103308e-05, + "loss": 2.3969, + "theoretical_loss": 3.3069807937426097, + "tokens_seen": 3167705088 + }, + { + "epoch": 10.06, + "learning_rate": 2.04012036108325e-05, + "loss": 2.4126, + "theoretical_loss": 3.306975591343766, + "tokens_seen": 3167770624 + }, + { + "epoch": 10.06, + "learning_rate": 2.0391173520561687e-05, + "loss": 2.5945, + "theoretical_loss": 3.3069703890826854, + "tokens_seen": 3167836160 + }, + { + "epoch": 10.06, + "learning_rate": 2.0381143430290875e-05, + "loss": 2.5026, + "theoretical_loss": 3.306965186959362, + "tokens_seen": 3167901696 + }, + { + "epoch": 10.06, + "learning_rate": 2.037111334002006e-05, + "loss": 2.3167, + "theoretical_loss": 3.306959984973789, + "tokens_seen": 3167967232 + }, + { + "epoch": 10.06, + "learning_rate": 2.0361083249749248e-05, + "loss": 2.4035, + "theoretical_loss": 3.3069547831259607, + "tokens_seen": 3168032768 + }, + { + "epoch": 10.06, + "learning_rate": 2.0351053159478435e-05, + "loss": 2.2836, + "theoretical_loss": 3.3069495814158687, + "tokens_seen": 3168098304 + }, + { + "epoch": 10.06, + "learning_rate": 2.0341023069207623e-05, + "loss": 2.4773, + "theoretical_loss": 3.3069443798435083, + "tokens_seen": 3168163840 + }, + { + "epoch": 10.06, + "learning_rate": 2.033099297893681e-05, + "loss": 2.6237, + "theoretical_loss": 3.3069391784088724, + "tokens_seen": 3168229376 + }, + { + "epoch": 10.06, + "learning_rate": 2.0320962888665996e-05, + "loss": 2.4357, + "theoretical_loss": 3.306933977111955, + "tokens_seen": 3168294912 + }, + { + "epoch": 10.06, + "learning_rate": 2.0310932798395187e-05, + "loss": 2.4402, + "theoretical_loss": 3.3069287759527484, + "tokens_seen": 3168360448 + }, + { + "epoch": 10.06, + "learning_rate": 2.0300902708124375e-05, + "loss": 2.5185, + "theoretical_loss": 3.306923574931247, + "tokens_seen": 3168425984 + }, + { + "epoch": 10.06, + "learning_rate": 2.0290872617853563e-05, + "loss": 2.3453, + "theoretical_loss": 3.3069183740474437, + "tokens_seen": 3168491520 + }, + { + "epoch": 10.06, + "learning_rate": 2.028084252758275e-05, + "loss": 2.3976, + "theoretical_loss": 3.306913173301333, + "tokens_seen": 3168557056 + }, + { + "epoch": 10.06, + "learning_rate": 2.0270812437311935e-05, + "loss": 2.6933, + "theoretical_loss": 3.3069079726929074, + "tokens_seen": 3168622592 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3479361, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6280713081359863, + "objective/train/theoretical_loss": 3.306905372440325, + "objective/train/tokens_used": 3189115360, + "theoretical_loss": 3.306905372440325, + "tokens_seen": 3168655360 + }, + { + "epoch": 10.06, + "learning_rate": 2.0260782347041123e-05, + "loss": 2.338, + "theoretical_loss": 3.306902772222161, + "tokens_seen": 3168688128 + }, + { + "epoch": 10.06, + "learning_rate": 2.025075225677031e-05, + "loss": 2.3721, + "theoretical_loss": 3.3068975718890874, + "tokens_seen": 3168753664 + }, + { + "epoch": 10.06, + "learning_rate": 2.02407221664995e-05, + "loss": 2.4192, + "theoretical_loss": 3.3068923716936793, + "tokens_seen": 3168819200 + }, + { + "epoch": 10.06, + "learning_rate": 2.0230692076228683e-05, + "loss": 2.4703, + "theoretical_loss": 3.306887171635931, + "tokens_seen": 3168884736 + }, + { + "epoch": 10.06, + "learning_rate": 2.0220661985957875e-05, + "loss": 2.4881, + "theoretical_loss": 3.306881971715836, + "tokens_seen": 3168950272 + }, + { + "epoch": 10.06, + "learning_rate": 2.0210631895687063e-05, + "loss": 2.4812, + "theoretical_loss": 3.3068767719333874, + "tokens_seen": 3169015808 + }, + { + "epoch": 10.06, + "learning_rate": 2.020060180541625e-05, + "loss": 2.4705, + "theoretical_loss": 3.306871572288579, + "tokens_seen": 3169081344 + }, + { + "epoch": 10.06, + "learning_rate": 2.019057171514544e-05, + "loss": 2.5075, + "theoretical_loss": 3.306866372781404, + "tokens_seen": 3169146880 + }, + { + "epoch": 10.06, + "learning_rate": 2.0180541624874623e-05, + "loss": 2.3825, + "theoretical_loss": 3.306861173411856, + "tokens_seen": 3169212416 + }, + { + "epoch": 10.06, + "learning_rate": 2.017051153460381e-05, + "loss": 2.4676, + "theoretical_loss": 3.306855974179929, + "tokens_seen": 3169277952 + }, + { + "epoch": 10.06, + "learning_rate": 2.0160481444333e-05, + "loss": 2.3565, + "theoretical_loss": 3.306850775085616, + "tokens_seen": 3169343488 + }, + { + "epoch": 10.06, + "learning_rate": 2.0150451354062187e-05, + "loss": 2.3816, + "theoretical_loss": 3.3068455761289104, + "tokens_seen": 3169409024 + }, + { + "epoch": 10.06, + "learning_rate": 2.0140421263791374e-05, + "loss": 2.523, + "theoretical_loss": 3.306840377309806, + "tokens_seen": 3169474560 + }, + { + "epoch": 10.06, + "learning_rate": 2.013039117352056e-05, + "loss": 2.3025, + "theoretical_loss": 3.3068351786282966, + "tokens_seen": 3169540096 + }, + { + "epoch": 10.06, + "learning_rate": 2.012036108324975e-05, + "loss": 2.3292, + "theoretical_loss": 3.306829980084375, + "tokens_seen": 3169605632 + }, + { + "epoch": 10.06, + "learning_rate": 2.0110330992978938e-05, + "loss": 2.3514, + "theoretical_loss": 3.306824781678035, + "tokens_seen": 3169671168 + }, + { + "epoch": 10.06, + "learning_rate": 2.0100300902708126e-05, + "loss": 2.5131, + "theoretical_loss": 3.3068195834092706, + "tokens_seen": 3169736704 + }, + { + "epoch": 10.06, + "learning_rate": 2.0090270812437314e-05, + "loss": 2.5239, + "theoretical_loss": 3.3068143852780745, + "tokens_seen": 3169802240 + }, + { + "epoch": 10.06, + "learning_rate": 2.00802407221665e-05, + "loss": 2.2016, + "theoretical_loss": 3.3068091872844407, + "tokens_seen": 3169867776 + }, + { + "epoch": 10.06, + "learning_rate": 2.0070210631895686e-05, + "loss": 2.3087, + "theoretical_loss": 3.306803989428363, + "tokens_seen": 3169933312 + }, + { + "epoch": 10.06, + "learning_rate": 2.0060180541624874e-05, + "loss": 2.5405, + "theoretical_loss": 3.3067987917098343, + "tokens_seen": 3169998848 + }, + { + "epoch": 10.06, + "learning_rate": 2.0050150451354062e-05, + "loss": 2.4821, + "theoretical_loss": 3.3067935941288487, + "tokens_seen": 3170064384 + }, + { + "epoch": 10.06, + "learning_rate": 2.004012036108325e-05, + "loss": 2.4352, + "theoretical_loss": 3.3067883966853993, + "tokens_seen": 3170129920 + }, + { + "epoch": 10.06, + "learning_rate": 2.0030090270812438e-05, + "loss": 2.4625, + "theoretical_loss": 3.3067831993794794, + "tokens_seen": 3170195456 + }, + { + "epoch": 10.06, + "learning_rate": 2.0020060180541626e-05, + "loss": 2.5055, + "theoretical_loss": 3.3067780022110833, + "tokens_seen": 3170260992 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3480744, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.456031560897827, + "objective/train/theoretical_loss": 3.3067754036784542, + "objective/train/tokens_used": 3190753760, + "theoretical_loss": 3.3067754036784542, + "tokens_seen": 3170293760 + }, + { + "epoch": 10.06, + "learning_rate": 2.0010030090270814e-05, + "loss": 2.3639, + "theoretical_loss": 3.3067728051802034, + "tokens_seen": 3170326528 + }, + { + "epoch": 10.06, + "learning_rate": 2e-05, + "loss": 2.3817, + "theoretical_loss": 3.3067676082868345, + "tokens_seen": 3170392064 + }, + { + "epoch": 10.06, + "learning_rate": 1.998996990972919e-05, + "loss": 2.4456, + "theoretical_loss": 3.3067624115309693, + "tokens_seen": 3170457600 + }, + { + "epoch": 10.06, + "learning_rate": 1.9979939819458374e-05, + "loss": 2.4113, + "theoretical_loss": 3.306757214912601, + "tokens_seen": 3170523136 + }, + { + "epoch": 10.06, + "learning_rate": 1.9969909729187562e-05, + "loss": 2.3766, + "theoretical_loss": 3.3067520184317245, + "tokens_seen": 3170588672 + }, + { + "epoch": 10.06, + "learning_rate": 1.995987963891675e-05, + "loss": 2.4986, + "theoretical_loss": 3.306746822088332, + "tokens_seen": 3170654208 + }, + { + "epoch": 10.06, + "learning_rate": 1.9949849548645938e-05, + "loss": 2.6378, + "theoretical_loss": 3.3067416258824176, + "tokens_seen": 3170719744 + }, + { + "epoch": 10.06, + "learning_rate": 1.993981945837513e-05, + "loss": 2.2618, + "theoretical_loss": 3.3067364298139745, + "tokens_seen": 3170785280 + }, + { + "epoch": 10.06, + "learning_rate": 1.9929789368104314e-05, + "loss": 2.3273, + "theoretical_loss": 3.3067312338829966, + "tokens_seen": 3170850816 + }, + { + "epoch": 10.06, + "learning_rate": 1.99197592778335e-05, + "loss": 2.4683, + "theoretical_loss": 3.3067260380894767, + "tokens_seen": 3170916352 + }, + { + "epoch": 10.06, + "learning_rate": 1.990972918756269e-05, + "loss": 2.3577, + "theoretical_loss": 3.3067208424334096, + "tokens_seen": 3170981888 + }, + { + "epoch": 10.06, + "learning_rate": 1.9899699097291877e-05, + "loss": 2.5095, + "theoretical_loss": 3.306715646914788, + "tokens_seen": 3171047424 + }, + { + "epoch": 10.06, + "learning_rate": 1.9889669007021062e-05, + "loss": 2.4437, + "theoretical_loss": 3.306710451533605, + "tokens_seen": 3171112960 + }, + { + "epoch": 10.06, + "learning_rate": 1.987963891675025e-05, + "loss": 2.4689, + "theoretical_loss": 3.306705256289855, + "tokens_seen": 3171178496 + }, + { + "epoch": 10.06, + "learning_rate": 1.9869608826479438e-05, + "loss": 2.6726, + "theoretical_loss": 3.3067000611835313, + "tokens_seen": 3171244032 + }, + { + "epoch": 10.06, + "learning_rate": 1.9859578736208625e-05, + "loss": 2.4607, + "theoretical_loss": 3.306694866214627, + "tokens_seen": 3171309568 + }, + { + "epoch": 10.06, + "learning_rate": 1.9849548645937813e-05, + "loss": 2.3755, + "theoretical_loss": 3.306689671383136, + "tokens_seen": 3171375104 + }, + { + "epoch": 10.06, + "learning_rate": 1.9839518555667e-05, + "loss": 2.2422, + "theoretical_loss": 3.3066844766890515, + "tokens_seen": 3171440640 + }, + { + "epoch": 10.06, + "learning_rate": 1.982948846539619e-05, + "loss": 2.567, + "theoretical_loss": 3.3066792821323676, + "tokens_seen": 3171506176 + }, + { + "epoch": 10.06, + "learning_rate": 1.9819458375125377e-05, + "loss": 2.4341, + "theoretical_loss": 3.3066740877130774, + "tokens_seen": 3171571712 + }, + { + "epoch": 10.06, + "learning_rate": 1.9809428284854565e-05, + "loss": 2.5548, + "theoretical_loss": 3.3066688934311745, + "tokens_seen": 3171637248 + }, + { + "epoch": 10.06, + "learning_rate": 1.9799398194583753e-05, + "loss": 2.4094, + "theoretical_loss": 3.306663699286652, + "tokens_seen": 3171702784 + }, + { + "epoch": 10.06, + "learning_rate": 1.9789368104312937e-05, + "loss": 2.3014, + "theoretical_loss": 3.3066585052795046, + "tokens_seen": 3171768320 + }, + { + "epoch": 10.06, + "learning_rate": 1.9779338014042125e-05, + "loss": 2.4894, + "theoretical_loss": 3.3066533114097245, + "tokens_seen": 3171833856 + }, + { + "epoch": 10.06, + "learning_rate": 1.9769307923771313e-05, + "loss": 2.4606, + "theoretical_loss": 3.306648117677306, + "tokens_seen": 3171899392 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3481266, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.341278076171875, + "objective/train/theoretical_loss": 3.3066455208626055, + "objective/train/tokens_used": 3192392160, + "theoretical_loss": 3.3066455208626055, + "tokens_seen": 3171932160 + }, + { + "epoch": 10.06, + "learning_rate": 1.97592778335005e-05, + "loss": 2.6419, + "theoretical_loss": 3.306642924082243, + "tokens_seen": 3171964928 + }, + { + "epoch": 10.06, + "learning_rate": 1.9749247743229692e-05, + "loss": 2.4621, + "theoretical_loss": 3.306637730624528, + "tokens_seen": 3172030464 + }, + { + "epoch": 10.06, + "learning_rate": 1.9739217652958877e-05, + "loss": 2.3827, + "theoretical_loss": 3.306632537304155, + "tokens_seen": 3172096000 + }, + { + "epoch": 10.06, + "learning_rate": 1.9729187562688065e-05, + "loss": 2.4731, + "theoretical_loss": 3.3066273441211176, + "tokens_seen": 3172161536 + }, + { + "epoch": 10.06, + "learning_rate": 1.9719157472417253e-05, + "loss": 2.6043, + "theoretical_loss": 3.306622151075409, + "tokens_seen": 3172227072 + }, + { + "epoch": 10.06, + "learning_rate": 1.970912738214644e-05, + "loss": 2.4431, + "theoretical_loss": 3.3066169581670235, + "tokens_seen": 3172292608 + }, + { + "epoch": 10.06, + "learning_rate": 1.969909729187563e-05, + "loss": 2.5019, + "theoretical_loss": 3.306611765395954, + "tokens_seen": 3172358144 + }, + { + "epoch": 10.06, + "learning_rate": 1.9689067201604813e-05, + "loss": 2.5117, + "theoretical_loss": 3.3066065727621936, + "tokens_seen": 3172423680 + }, + { + "epoch": 10.06, + "learning_rate": 1.9679037111334e-05, + "loss": 2.5637, + "theoretical_loss": 3.3066013802657372, + "tokens_seen": 3172489216 + }, + { + "epoch": 10.06, + "learning_rate": 1.966900702106319e-05, + "loss": 2.5527, + "theoretical_loss": 3.3065961879065773, + "tokens_seen": 3172554752 + }, + { + "epoch": 10.06, + "learning_rate": 1.965897693079238e-05, + "loss": 2.4419, + "theoretical_loss": 3.3065909956847075, + "tokens_seen": 3172620288 + }, + { + "epoch": 10.06, + "learning_rate": 1.9648946840521568e-05, + "loss": 2.4635, + "theoretical_loss": 3.3065858036001217, + "tokens_seen": 3172685824 + }, + { + "epoch": 10.06, + "learning_rate": 1.9638916750250752e-05, + "loss": 2.4404, + "theoretical_loss": 3.306580611652813, + "tokens_seen": 3172751360 + }, + { + "epoch": 10.06, + "learning_rate": 1.962888665997994e-05, + "loss": 2.6059, + "theoretical_loss": 3.3065754198427753, + "tokens_seen": 3172816896 + }, + { + "epoch": 10.06, + "learning_rate": 1.9618856569709128e-05, + "loss": 2.4127, + "theoretical_loss": 3.306570228170002, + "tokens_seen": 3172882432 + }, + { + "epoch": 10.06, + "learning_rate": 1.9608826479438316e-05, + "loss": 2.6107, + "theoretical_loss": 3.3065650366344865, + "tokens_seen": 3172947968 + }, + { + "epoch": 10.06, + "learning_rate": 1.9598796389167504e-05, + "loss": 2.3836, + "theoretical_loss": 3.306559845236223, + "tokens_seen": 3173013504 + }, + { + "epoch": 10.06, + "learning_rate": 1.958876629889669e-05, + "loss": 2.2373, + "theoretical_loss": 3.306554653975204, + "tokens_seen": 3173079040 + }, + { + "epoch": 10.06, + "learning_rate": 1.9578736208625876e-05, + "loss": 2.261, + "theoretical_loss": 3.3065494628514234, + "tokens_seen": 3173144576 + }, + { + "epoch": 10.06, + "learning_rate": 1.9568706118355064e-05, + "loss": 2.5838, + "theoretical_loss": 3.306544271864875, + "tokens_seen": 3173210112 + }, + { + "epoch": 10.06, + "learning_rate": 1.9558676028084256e-05, + "loss": 2.5773, + "theoretical_loss": 3.306539081015553, + "tokens_seen": 3173275648 + }, + { + "epoch": 10.06, + "learning_rate": 1.9548645937813443e-05, + "loss": 2.6803, + "theoretical_loss": 3.306533890303449, + "tokens_seen": 3173341184 + }, + { + "epoch": 10.06, + "learning_rate": 1.9538615847542628e-05, + "loss": 2.3349, + "theoretical_loss": 3.306528699728559, + "tokens_seen": 3173406720 + }, + { + "epoch": 10.06, + "learning_rate": 1.9528585757271816e-05, + "loss": 2.4657, + "theoretical_loss": 3.3065235092908742, + "tokens_seen": 3173472256 + }, + { + "epoch": 10.06, + "learning_rate": 1.9518555667001004e-05, + "loss": 2.2617, + "theoretical_loss": 3.3065183189903893, + "tokens_seen": 3173537792 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3481919, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.522944211959839, + "objective/train/theoretical_loss": 3.3065157238915948, + "objective/train/tokens_used": 3194030560, + "theoretical_loss": 3.3065157238915948, + "tokens_seen": 3173570560 + }, + { + "epoch": 10.06, + "learning_rate": 1.950852557673019e-05, + "loss": 2.2966, + "theoretical_loss": 3.306513128827098, + "tokens_seen": 3173603328 + }, + { + "epoch": 10.06, + "learning_rate": 1.9498495486459376e-05, + "loss": 2.3289, + "theoretical_loss": 3.3065079388009937, + "tokens_seen": 3173668864 + }, + { + "epoch": 10.06, + "learning_rate": 1.9488465396188564e-05, + "loss": 2.4387, + "theoretical_loss": 3.3065027489120697, + "tokens_seen": 3173734400 + }, + { + "epoch": 10.06, + "learning_rate": 1.9478435305917752e-05, + "loss": 2.2595, + "theoretical_loss": 3.3064975591603196, + "tokens_seen": 3173799936 + }, + { + "epoch": 10.06, + "learning_rate": 1.9468405215646943e-05, + "loss": 2.5694, + "theoretical_loss": 3.3064923695457367, + "tokens_seen": 3173865472 + }, + { + "epoch": 10.06, + "learning_rate": 1.945837512537613e-05, + "loss": 2.4235, + "theoretical_loss": 3.3064871800683155, + "tokens_seen": 3173931008 + }, + { + "epoch": 10.06, + "learning_rate": 1.9448345035105316e-05, + "loss": 2.7503, + "theoretical_loss": 3.3064819907280483, + "tokens_seen": 3173996544 + }, + { + "epoch": 10.06, + "learning_rate": 1.9438314944834504e-05, + "loss": 2.3043, + "theoretical_loss": 3.306476801524929, + "tokens_seen": 3174062080 + }, + { + "epoch": 10.06, + "learning_rate": 1.942828485456369e-05, + "loss": 2.2893, + "theoretical_loss": 3.306471612458952, + "tokens_seen": 3174127616 + }, + { + "epoch": 10.06, + "learning_rate": 1.941825476429288e-05, + "loss": 2.5659, + "theoretical_loss": 3.30646642353011, + "tokens_seen": 3174193152 + }, + { + "epoch": 10.06, + "learning_rate": 1.9408224674022067e-05, + "loss": 2.4194, + "theoretical_loss": 3.306461234738397, + "tokens_seen": 3174258688 + }, + { + "epoch": 10.06, + "learning_rate": 1.9398194583751252e-05, + "loss": 2.5188, + "theoretical_loss": 3.306456046083806, + "tokens_seen": 3174324224 + }, + { + "epoch": 10.06, + "learning_rate": 1.938816449348044e-05, + "loss": 2.5068, + "theoretical_loss": 3.3064508575663307, + "tokens_seen": 3174389760 + }, + { + "epoch": 10.06, + "learning_rate": 1.937813440320963e-05, + "loss": 2.501, + "theoretical_loss": 3.306445669185965, + "tokens_seen": 3174455296 + }, + { + "epoch": 10.06, + "learning_rate": 1.936810431293882e-05, + "loss": 2.6457, + "theoretical_loss": 3.3064404809427024, + "tokens_seen": 3174520832 + }, + { + "epoch": 10.06, + "learning_rate": 1.9358074222668007e-05, + "loss": 2.5617, + "theoretical_loss": 3.306435292836536, + "tokens_seen": 3174586368 + }, + { + "epoch": 10.06, + "learning_rate": 1.934804413239719e-05, + "loss": 2.4253, + "theoretical_loss": 3.3064301048674594, + "tokens_seen": 3174651904 + }, + { + "epoch": 10.06, + "learning_rate": 1.933801404212638e-05, + "loss": 2.3715, + "theoretical_loss": 3.306424917035467, + "tokens_seen": 3174717440 + }, + { + "epoch": 10.06, + "learning_rate": 1.9327983951855567e-05, + "loss": 2.6437, + "theoretical_loss": 3.3064197293405515, + "tokens_seen": 3174782976 + }, + { + "epoch": 10.06, + "learning_rate": 1.9317953861584755e-05, + "loss": 2.4519, + "theoretical_loss": 3.3064145417827064, + "tokens_seen": 3174848512 + }, + { + "epoch": 10.06, + "learning_rate": 1.9307923771313943e-05, + "loss": 2.38, + "theoretical_loss": 3.306409354361926, + "tokens_seen": 3174914048 + }, + { + "epoch": 10.06, + "learning_rate": 1.9297893681043127e-05, + "loss": 2.3301, + "theoretical_loss": 3.3064041670782025, + "tokens_seen": 3174979584 + }, + { + "epoch": 10.06, + "learning_rate": 1.9287863590772315e-05, + "loss": 2.3565, + "theoretical_loss": 3.306398979931531, + "tokens_seen": 3175045120 + }, + { + "epoch": 10.06, + "learning_rate": 1.9277833500501506e-05, + "loss": 2.6274, + "theoretical_loss": 3.3063937929219045, + "tokens_seen": 3175110656 + }, + { + "epoch": 10.06, + "learning_rate": 1.9267803410230694e-05, + "loss": 2.4097, + "theoretical_loss": 3.306388606049316, + "tokens_seen": 3175176192 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3483238, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4159152507781982, + "objective/train/theoretical_loss": 3.3063860126644093, + "objective/train/tokens_used": 3195668960, + "theoretical_loss": 3.3063860126644093, + "tokens_seen": 3175208960 + }, + { + "epoch": 10.06, + "learning_rate": 1.9257773319959882e-05, + "loss": 2.4149, + "theoretical_loss": 3.30638341931376, + "tokens_seen": 3175241728 + }, + { + "epoch": 10.06, + "learning_rate": 1.9247743229689067e-05, + "loss": 2.4923, + "theoretical_loss": 3.306378232715229, + "tokens_seen": 3175307264 + }, + { + "epoch": 10.06, + "learning_rate": 1.9237713139418255e-05, + "loss": 2.3561, + "theoretical_loss": 3.306373046253717, + "tokens_seen": 3175372800 + }, + { + "epoch": 10.06, + "learning_rate": 1.9227683049147443e-05, + "loss": 2.2018, + "theoretical_loss": 3.3063678599292183, + "tokens_seen": 3175438336 + }, + { + "epoch": 10.06, + "learning_rate": 1.921765295887663e-05, + "loss": 2.5324, + "theoretical_loss": 3.306362673741725, + "tokens_seen": 3175503872 + }, + { + "epoch": 10.06, + "learning_rate": 1.920762286860582e-05, + "loss": 2.4901, + "theoretical_loss": 3.306357487691232, + "tokens_seen": 3175569408 + }, + { + "epoch": 10.06, + "learning_rate": 1.9197592778335003e-05, + "loss": 2.4206, + "theoretical_loss": 3.306352301777732, + "tokens_seen": 3175634944 + }, + { + "epoch": 10.06, + "learning_rate": 1.9187562688064194e-05, + "loss": 2.5609, + "theoretical_loss": 3.306347116001219, + "tokens_seen": 3175700480 + }, + { + "epoch": 10.06, + "learning_rate": 1.9177532597793382e-05, + "loss": 2.5024, + "theoretical_loss": 3.3063419303616866, + "tokens_seen": 3175766016 + }, + { + "epoch": 10.06, + "learning_rate": 1.916750250752257e-05, + "loss": 2.3054, + "theoretical_loss": 3.3063367448591277, + "tokens_seen": 3175831552 + }, + { + "epoch": 10.06, + "learning_rate": 1.9157472417251758e-05, + "loss": 2.3981, + "theoretical_loss": 3.3063315594935365, + "tokens_seen": 3175897088 + }, + { + "epoch": 10.06, + "learning_rate": 1.9147442326980942e-05, + "loss": 2.3889, + "theoretical_loss": 3.3063263742649065, + "tokens_seen": 3175962624 + }, + { + "epoch": 10.06, + "learning_rate": 1.913741223671013e-05, + "loss": 2.4548, + "theoretical_loss": 3.306321189173231, + "tokens_seen": 3176028160 + }, + { + "epoch": 10.06, + "learning_rate": 1.9127382146439318e-05, + "loss": 2.5747, + "theoretical_loss": 3.3063160042185036, + "tokens_seen": 3176093696 + }, + { + "epoch": 10.06, + "learning_rate": 1.9117352056168506e-05, + "loss": 2.3625, + "theoretical_loss": 3.306310819400718, + "tokens_seen": 3176159232 + }, + { + "epoch": 10.06, + "learning_rate": 1.910732196589769e-05, + "loss": 2.5319, + "theoretical_loss": 3.306305634719868, + "tokens_seen": 3176224768 + }, + { + "epoch": 10.06, + "learning_rate": 1.909729187562688e-05, + "loss": 2.3626, + "theoretical_loss": 3.3063004501759465, + "tokens_seen": 3176290304 + }, + { + "epoch": 10.06, + "learning_rate": 1.908726178535607e-05, + "loss": 2.3802, + "theoretical_loss": 3.3062952657689473, + "tokens_seen": 3176355840 + }, + { + "epoch": 10.06, + "learning_rate": 1.9077231695085258e-05, + "loss": 2.5714, + "theoretical_loss": 3.3062900814988643, + "tokens_seen": 3176421376 + }, + { + "epoch": 10.06, + "learning_rate": 1.9067201604814446e-05, + "loss": 2.4303, + "theoretical_loss": 3.306284897365691, + "tokens_seen": 3176486912 + }, + { + "epoch": 10.06, + "learning_rate": 1.905717151454363e-05, + "loss": 2.416, + "theoretical_loss": 3.3062797133694204, + "tokens_seen": 3176552448 + }, + { + "epoch": 10.06, + "learning_rate": 1.9047141424272818e-05, + "loss": 2.4513, + "theoretical_loss": 3.3062745295100466, + "tokens_seen": 3176617984 + }, + { + "epoch": 10.06, + "learning_rate": 1.9037111334002006e-05, + "loss": 2.4468, + "theoretical_loss": 3.306269345787563, + "tokens_seen": 3176683520 + }, + { + "epoch": 10.06, + "learning_rate": 1.9027081243731194e-05, + "loss": 2.4344, + "theoretical_loss": 3.306264162201963, + "tokens_seen": 3176749056 + }, + { + "epoch": 10.06, + "learning_rate": 1.901705115346038e-05, + "loss": 2.3009, + "theoretical_loss": 3.306258978753241, + "tokens_seen": 3176814592 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3484000, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.129225969314575, + "objective/train/theoretical_loss": 3.3062563870802064, + "objective/train/tokens_used": 3197307360, + "theoretical_loss": 3.3062563870802064, + "tokens_seen": 3176847360 + }, + { + "epoch": 10.06, + "learning_rate": 1.9007021063189566e-05, + "loss": 2.3505, + "theoretical_loss": 3.306253795441389, + "tokens_seen": 3176880128 + }, + { + "epoch": 10.06, + "learning_rate": 1.8996990972918757e-05, + "loss": 2.4374, + "theoretical_loss": 3.306248612266402, + "tokens_seen": 3176945664 + }, + { + "epoch": 10.06, + "learning_rate": 1.8986960882647945e-05, + "loss": 2.3951, + "theoretical_loss": 3.3062434292282727, + "tokens_seen": 3177011200 + }, + { + "epoch": 10.06, + "learning_rate": 1.8976930792377133e-05, + "loss": 2.558, + "theoretical_loss": 3.3062382463269953, + "tokens_seen": 3177076736 + }, + { + "epoch": 10.06, + "learning_rate": 1.896690070210632e-05, + "loss": 2.5036, + "theoretical_loss": 3.306233063562563, + "tokens_seen": 3177142272 + }, + { + "epoch": 10.06, + "learning_rate": 1.8956870611835506e-05, + "loss": 2.4004, + "theoretical_loss": 3.306227880934969, + "tokens_seen": 3177207808 + }, + { + "epoch": 10.06, + "learning_rate": 1.8946840521564694e-05, + "loss": 2.3869, + "theoretical_loss": 3.306222698444208, + "tokens_seen": 3177273344 + }, + { + "epoch": 10.06, + "learning_rate": 1.893681043129388e-05, + "loss": 2.601, + "theoretical_loss": 3.306217516090272, + "tokens_seen": 3177338880 + }, + { + "epoch": 10.06, + "learning_rate": 1.892678034102307e-05, + "loss": 2.2395, + "theoretical_loss": 3.306212333873156, + "tokens_seen": 3177404416 + }, + { + "epoch": 10.06, + "learning_rate": 1.8916750250752257e-05, + "loss": 2.2643, + "theoretical_loss": 3.306207151792853, + "tokens_seen": 3177469952 + }, + { + "epoch": 10.06, + "learning_rate": 1.8906720160481445e-05, + "loss": 2.4005, + "theoretical_loss": 3.3062019698493565, + "tokens_seen": 3177535488 + }, + { + "epoch": 10.06, + "learning_rate": 1.8896690070210633e-05, + "loss": 2.4188, + "theoretical_loss": 3.30619678804266, + "tokens_seen": 3177601024 + }, + { + "epoch": 10.06, + "learning_rate": 1.888665997993982e-05, + "loss": 2.5227, + "theoretical_loss": 3.306191606372757, + "tokens_seen": 3177666560 + }, + { + "epoch": 10.06, + "learning_rate": 1.887662988966901e-05, + "loss": 2.3601, + "theoretical_loss": 3.3061864248396415, + "tokens_seen": 3177732096 + }, + { + "epoch": 10.06, + "learning_rate": 1.8866599799398197e-05, + "loss": 2.1405, + "theoretical_loss": 3.3061812434433064, + "tokens_seen": 3177797632 + }, + { + "epoch": 10.06, + "learning_rate": 1.885656970912738e-05, + "loss": 2.4175, + "theoretical_loss": 3.306176062183746, + "tokens_seen": 3177863168 + }, + { + "epoch": 10.06, + "learning_rate": 1.884653961885657e-05, + "loss": 2.6218, + "theoretical_loss": 3.306170881060954, + "tokens_seen": 3177928704 + }, + { + "epoch": 10.06, + "learning_rate": 1.8836509528585757e-05, + "loss": 2.3887, + "theoretical_loss": 3.3061657000749225, + "tokens_seen": 3177994240 + }, + { + "epoch": 10.06, + "learning_rate": 1.8826479438314945e-05, + "loss": 2.3948, + "theoretical_loss": 3.3061605192256467, + "tokens_seen": 3178059776 + }, + { + "epoch": 10.06, + "learning_rate": 1.881644934804413e-05, + "loss": 2.4543, + "theoretical_loss": 3.3061553385131193, + "tokens_seen": 3178125312 + }, + { + "epoch": 10.06, + "learning_rate": 1.880641925777332e-05, + "loss": 2.5416, + "theoretical_loss": 3.3061501579373345, + "tokens_seen": 3178190848 + }, + { + "epoch": 10.06, + "learning_rate": 1.879638916750251e-05, + "loss": 2.5428, + "theoretical_loss": 3.306144977498285, + "tokens_seen": 3178256384 + }, + { + "epoch": 10.06, + "learning_rate": 1.8786359077231696e-05, + "loss": 2.1963, + "theoretical_loss": 3.3061397971959656, + "tokens_seen": 3178321920 + }, + { + "epoch": 10.06, + "learning_rate": 1.8776328986960884e-05, + "loss": 2.2851, + "theoretical_loss": 3.3061346170303683, + "tokens_seen": 3178387456 + }, + { + "epoch": 10.06, + "learning_rate": 1.876629889669007e-05, + "loss": 2.6108, + "theoretical_loss": 3.306129437001488, + "tokens_seen": 3178452992 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3485379, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.521840810775757, + "objective/train/theoretical_loss": 3.3061268470383145, + "objective/train/tokens_used": 3198945760, + "theoretical_loss": 3.3061268470383145, + "tokens_seen": 3178485760 + }, + { + "epoch": 10.06, + "learning_rate": 1.8756268806419257e-05, + "loss": 2.4747, + "theoretical_loss": 3.306124257109318, + "tokens_seen": 3178518528 + }, + { + "epoch": 10.06, + "learning_rate": 1.8746238716148445e-05, + "loss": 2.5294, + "theoretical_loss": 3.306119077353851, + "tokens_seen": 3178584064 + }, + { + "epoch": 10.06, + "learning_rate": 1.8736208625877633e-05, + "loss": 2.4318, + "theoretical_loss": 3.3061138977350817, + "tokens_seen": 3178649600 + }, + { + "epoch": 10.06, + "learning_rate": 1.872617853560682e-05, + "loss": 2.3207, + "theoretical_loss": 3.3061087182530033, + "tokens_seen": 3178715136 + }, + { + "epoch": 10.06, + "learning_rate": 1.871614844533601e-05, + "loss": 2.4844, + "theoretical_loss": 3.306103538907609, + "tokens_seen": 3178780672 + }, + { + "epoch": 10.06, + "learning_rate": 1.8706118355065196e-05, + "loss": 2.2786, + "theoretical_loss": 3.306098359698893, + "tokens_seen": 3178846208 + }, + { + "epoch": 10.06, + "learning_rate": 1.8696088264794384e-05, + "loss": 2.4263, + "theoretical_loss": 3.306093180626848, + "tokens_seen": 3178911744 + }, + { + "epoch": 10.06, + "learning_rate": 1.8686058174523572e-05, + "loss": 2.2991, + "theoretical_loss": 3.306088001691468, + "tokens_seen": 3178977280 + }, + { + "epoch": 10.06, + "learning_rate": 1.867602808425276e-05, + "loss": 2.358, + "theoretical_loss": 3.3060828228927477, + "tokens_seen": 3179042816 + }, + { + "epoch": 10.06, + "learning_rate": 1.8665997993981944e-05, + "loss": 2.3167, + "theoretical_loss": 3.3060776442306787, + "tokens_seen": 3179108352 + }, + { + "epoch": 10.06, + "learning_rate": 1.8655967903711132e-05, + "loss": 2.2273, + "theoretical_loss": 3.306072465705256, + "tokens_seen": 3179173888 + }, + { + "epoch": 10.06, + "learning_rate": 1.864593781344032e-05, + "loss": 2.5076, + "theoretical_loss": 3.3060672873164725, + "tokens_seen": 3179239424 + }, + { + "epoch": 10.06, + "learning_rate": 1.8635907723169508e-05, + "loss": 2.4417, + "theoretical_loss": 3.3060621090643223, + "tokens_seen": 3179304960 + }, + { + "epoch": 10.06, + "learning_rate": 1.86258776328987e-05, + "loss": 2.1291, + "theoretical_loss": 3.3060569309487984, + "tokens_seen": 3179370496 + }, + { + "epoch": 10.06, + "learning_rate": 1.8615847542627884e-05, + "loss": 2.2651, + "theoretical_loss": 3.3060517529698945, + "tokens_seen": 3179436032 + }, + { + "epoch": 10.06, + "learning_rate": 1.8605817452357072e-05, + "loss": 2.3654, + "theoretical_loss": 3.3060465751276045, + "tokens_seen": 3179501568 + }, + { + "epoch": 10.06, + "learning_rate": 1.859578736208626e-05, + "loss": 2.2718, + "theoretical_loss": 3.306041397421922, + "tokens_seen": 3179567104 + }, + { + "epoch": 10.06, + "learning_rate": 1.8585757271815448e-05, + "loss": 2.4208, + "theoretical_loss": 3.30603621985284, + "tokens_seen": 3179632640 + }, + { + "epoch": 10.06, + "learning_rate": 1.8575727181544636e-05, + "loss": 2.4085, + "theoretical_loss": 3.306031042420353, + "tokens_seen": 3179698176 + }, + { + "epoch": 10.06, + "learning_rate": 1.856569709127382e-05, + "loss": 1.9906, + "theoretical_loss": 3.306025865124454, + "tokens_seen": 3179763712 + }, + { + "epoch": 10.06, + "learning_rate": 1.8555667001003008e-05, + "loss": 2.6995, + "theoretical_loss": 3.306020687965136, + "tokens_seen": 3179829248 + }, + { + "epoch": 10.06, + "learning_rate": 1.8545636910732196e-05, + "loss": 2.4801, + "theoretical_loss": 3.3060155109423937, + "tokens_seen": 3179894784 + }, + { + "epoch": 10.06, + "learning_rate": 1.8535606820461384e-05, + "loss": 2.4944, + "theoretical_loss": 3.30601033405622, + "tokens_seen": 3179960320 + }, + { + "epoch": 10.06, + "learning_rate": 1.8525576730190575e-05, + "loss": 2.2834, + "theoretical_loss": 3.306005157306609, + "tokens_seen": 3180025856 + }, + { + "epoch": 10.06, + "learning_rate": 1.851554663991976e-05, + "loss": 2.3672, + "theoretical_loss": 3.305999980693554, + "tokens_seen": 3180091392 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3486032, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.495100259780884, + "objective/train/theoretical_loss": 3.3059973924382327, + "objective/train/tokens_used": 3200584160, + "theoretical_loss": 3.3059973924382327, + "tokens_seen": 3180124160 + }, + { + "epoch": 10.06, + "learning_rate": 1.8505516549648947e-05, + "loss": 2.457, + "theoretical_loss": 3.305994804217048, + "tokens_seen": 3180156928 + }, + { + "epoch": 10.06, + "learning_rate": 1.8495486459378135e-05, + "loss": 2.6376, + "theoretical_loss": 3.3059896278770857, + "tokens_seen": 3180222464 + }, + { + "epoch": 10.06, + "learning_rate": 1.8485456369107323e-05, + "loss": 2.3187, + "theoretical_loss": 3.30598445167366, + "tokens_seen": 3180288000 + }, + { + "epoch": 10.06, + "learning_rate": 1.847542627883651e-05, + "loss": 2.4315, + "theoretical_loss": 3.3059792756067643, + "tokens_seen": 3180353536 + }, + { + "epoch": 10.06, + "learning_rate": 1.8465396188565696e-05, + "loss": 2.3647, + "theoretical_loss": 3.3059740996763924, + "tokens_seen": 3180419072 + }, + { + "epoch": 10.06, + "learning_rate": 1.8455366098294884e-05, + "loss": 2.6596, + "theoretical_loss": 3.3059689238825385, + "tokens_seen": 3180484608 + }, + { + "epoch": 10.06, + "learning_rate": 1.844533600802407e-05, + "loss": 2.3443, + "theoretical_loss": 3.3059637482251953, + "tokens_seen": 3180550144 + }, + { + "epoch": 10.06, + "learning_rate": 1.8435305917753263e-05, + "loss": 2.3175, + "theoretical_loss": 3.3059585727043572, + "tokens_seen": 3180615680 + }, + { + "epoch": 10.06, + "learning_rate": 1.842527582748245e-05, + "loss": 2.2891, + "theoretical_loss": 3.3059533973200166, + "tokens_seen": 3180681216 + }, + { + "epoch": 10.06, + "learning_rate": 1.8415245737211635e-05, + "loss": 2.3456, + "theoretical_loss": 3.3059482220721685, + "tokens_seen": 3180746752 + }, + { + "epoch": 10.06, + "learning_rate": 1.8405215646940823e-05, + "loss": 2.3821, + "theoretical_loss": 3.3059430469608055, + "tokens_seen": 3180812288 + }, + { + "epoch": 10.06, + "learning_rate": 1.839518555667001e-05, + "loss": 2.2465, + "theoretical_loss": 3.3059378719859214, + "tokens_seen": 3180877824 + }, + { + "epoch": 10.06, + "learning_rate": 1.83851554663992e-05, + "loss": 2.573, + "theoretical_loss": 3.30593269714751, + "tokens_seen": 3180943360 + }, + { + "epoch": 10.06, + "learning_rate": 1.8375125376128383e-05, + "loss": 2.4366, + "theoretical_loss": 3.3059275224455646, + "tokens_seen": 3181008896 + }, + { + "epoch": 10.06, + "learning_rate": 1.836509528585757e-05, + "loss": 2.4582, + "theoretical_loss": 3.305922347880079, + "tokens_seen": 3181074432 + }, + { + "epoch": 10.06, + "learning_rate": 1.835506519558676e-05, + "loss": 2.418, + "theoretical_loss": 3.3059171734510473, + "tokens_seen": 3181139968 + }, + { + "epoch": 10.06, + "learning_rate": 1.834503510531595e-05, + "loss": 2.3608, + "theoretical_loss": 3.3059119991584622, + "tokens_seen": 3181205504 + }, + { + "epoch": 10.06, + "learning_rate": 1.8335005015045138e-05, + "loss": 2.4856, + "theoretical_loss": 3.3059068250023174, + "tokens_seen": 3181271040 + }, + { + "epoch": 10.06, + "learning_rate": 1.8324974924774323e-05, + "loss": 2.479, + "theoretical_loss": 3.3059016509826074, + "tokens_seen": 3181336576 + }, + { + "epoch": 10.06, + "learning_rate": 1.831494483450351e-05, + "loss": 2.4046, + "theoretical_loss": 3.305896477099324, + "tokens_seen": 3181402112 + }, + { + "epoch": 10.06, + "learning_rate": 1.83049147442327e-05, + "loss": 2.3851, + "theoretical_loss": 3.305891303352463, + "tokens_seen": 3181467648 + }, + { + "epoch": 10.06, + "learning_rate": 1.8294884653961886e-05, + "loss": 2.3125, + "theoretical_loss": 3.3058861297420163, + "tokens_seen": 3181533184 + }, + { + "epoch": 10.06, + "learning_rate": 1.8284854563691074e-05, + "loss": 2.3234, + "theoretical_loss": 3.3058809562679787, + "tokens_seen": 3181598720 + }, + { + "epoch": 10.06, + "learning_rate": 1.827482447342026e-05, + "loss": 2.3744, + "theoretical_loss": 3.3058757829303427, + "tokens_seen": 3181664256 + }, + { + "epoch": 10.06, + "learning_rate": 1.8264794383149447e-05, + "loss": 2.2749, + "theoretical_loss": 3.3058706097291024, + "tokens_seen": 3181729792 + }, + { + "epoch": 10.06, + "objective/train/docs_used": 3487141, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.356879234313965, + "objective/train/theoretical_loss": 3.3058680231796287, + "objective/train/tokens_used": 3202222560, + "theoretical_loss": 3.3058680231796287, + "tokens_seen": 3181762560 + }, + { + "epoch": 10.06, + "learning_rate": 1.8254764292878635e-05, + "loss": 2.3591, + "theoretical_loss": 3.3058654366642517, + "tokens_seen": 3181795328 + }, + { + "epoch": 10.06, + "learning_rate": 1.8244734202607826e-05, + "loss": 2.2673, + "theoretical_loss": 3.3058602637357835, + "tokens_seen": 3181860864 + }, + { + "epoch": 10.06, + "learning_rate": 1.8234704112337014e-05, + "loss": 2.3586, + "theoretical_loss": 3.305855090943692, + "tokens_seen": 3181926400 + }, + { + "epoch": 10.06, + "learning_rate": 1.82246740220662e-05, + "loss": 2.5033, + "theoretical_loss": 3.3058499182879704, + "tokens_seen": 3181991936 + }, + { + "epoch": 10.06, + "learning_rate": 1.8214643931795386e-05, + "loss": 2.4865, + "theoretical_loss": 3.3058447457686126, + "tokens_seen": 3182057472 + }, + { + "epoch": 10.06, + "learning_rate": 1.8204613841524574e-05, + "loss": 2.3396, + "theoretical_loss": 3.305839573385612, + "tokens_seen": 3182123008 + }, + { + "epoch": 10.07, + "learning_rate": 1.8194583751253762e-05, + "loss": 2.1889, + "theoretical_loss": 3.3058344011389624, + "tokens_seen": 3182188544 + }, + { + "epoch": 10.07, + "learning_rate": 1.818455366098295e-05, + "loss": 2.4138, + "theoretical_loss": 3.305829229028657, + "tokens_seen": 3182254080 + }, + { + "epoch": 10.07, + "learning_rate": 1.8174523570712134e-05, + "loss": 2.4124, + "theoretical_loss": 3.30582405705469, + "tokens_seen": 3182319616 + }, + { + "epoch": 10.07, + "learning_rate": 1.8164493480441322e-05, + "loss": 2.6654, + "theoretical_loss": 3.3058188852170542, + "tokens_seen": 3182385152 + }, + { + "epoch": 10.07, + "learning_rate": 1.8154463390170514e-05, + "loss": 2.4222, + "theoretical_loss": 3.305813713515744, + "tokens_seen": 3182450688 + }, + { + "epoch": 10.07, + "learning_rate": 1.81444332998997e-05, + "loss": 2.5056, + "theoretical_loss": 3.3058085419507526, + "tokens_seen": 3182516224 + }, + { + "epoch": 10.07, + "learning_rate": 1.813440320962889e-05, + "loss": 2.3608, + "theoretical_loss": 3.3058033705220735, + "tokens_seen": 3182581760 + }, + { + "epoch": 10.07, + "learning_rate": 1.8124373119358074e-05, + "loss": 2.2502, + "theoretical_loss": 3.305798199229701, + "tokens_seen": 3182647296 + }, + { + "epoch": 10.07, + "learning_rate": 1.8114343029087262e-05, + "loss": 2.459, + "theoretical_loss": 3.3057930280736274, + "tokens_seen": 3182712832 + }, + { + "epoch": 10.07, + "learning_rate": 1.810431293881645e-05, + "loss": 2.5307, + "theoretical_loss": 3.3057878570538475, + "tokens_seen": 3182778368 + }, + { + "epoch": 10.07, + "learning_rate": 1.8094282848545638e-05, + "loss": 2.6341, + "theoretical_loss": 3.3057826861703545, + "tokens_seen": 3182843904 + }, + { + "epoch": 10.07, + "learning_rate": 1.8084252758274826e-05, + "loss": 2.3498, + "theoretical_loss": 3.3057775154231415, + "tokens_seen": 3182909440 + }, + { + "epoch": 10.07, + "learning_rate": 1.807422266800401e-05, + "loss": 2.3839, + "theoretical_loss": 3.305772344812203, + "tokens_seen": 3182974976 + }, + { + "epoch": 10.07, + "learning_rate": 1.80641925777332e-05, + "loss": 2.253, + "theoretical_loss": 3.3057671743375323, + "tokens_seen": 3183040512 + }, + { + "epoch": 10.07, + "learning_rate": 1.805416248746239e-05, + "loss": 2.2791, + "theoretical_loss": 3.3057620039991225, + "tokens_seen": 3183106048 + }, + { + "epoch": 10.07, + "learning_rate": 1.8044132397191577e-05, + "loss": 2.397, + "theoretical_loss": 3.3057568337969676, + "tokens_seen": 3183171584 + }, + { + "epoch": 10.07, + "learning_rate": 1.803410230692076e-05, + "loss": 2.5477, + "theoretical_loss": 3.305751663731061, + "tokens_seen": 3183237120 + }, + { + "epoch": 10.07, + "learning_rate": 1.802407221664995e-05, + "loss": 2.57, + "theoretical_loss": 3.305746493801397, + "tokens_seen": 3183302656 + }, + { + "epoch": 10.07, + "learning_rate": 1.8014042126379137e-05, + "loss": 2.2607, + "theoretical_loss": 3.3057413240079683, + "tokens_seen": 3183368192 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3487855, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5130972862243652, + "objective/train/theoretical_loss": 3.3057387391623405, + "objective/train/tokens_used": 3203860960, + "theoretical_loss": 3.3057387391623405, + "tokens_seen": 3183400960 + }, + { + "epoch": 10.07, + "learning_rate": 1.8004012036108325e-05, + "loss": 2.4016, + "theoretical_loss": 3.305736154350769, + "tokens_seen": 3183433728 + }, + { + "epoch": 10.07, + "learning_rate": 1.7993981945837513e-05, + "loss": 2.3423, + "theoretical_loss": 3.305730984829793, + "tokens_seen": 3183499264 + }, + { + "epoch": 10.07, + "learning_rate": 1.7983951855566698e-05, + "loss": 2.3763, + "theoretical_loss": 3.305725815445033, + "tokens_seen": 3183564800 + }, + { + "epoch": 10.07, + "learning_rate": 1.7973921765295886e-05, + "loss": 2.5553, + "theoretical_loss": 3.3057206461964834, + "tokens_seen": 3183630336 + }, + { + "epoch": 10.07, + "learning_rate": 1.7963891675025077e-05, + "loss": 2.4534, + "theoretical_loss": 3.3057154770841373, + "tokens_seen": 3183695872 + }, + { + "epoch": 10.07, + "learning_rate": 1.7953861584754265e-05, + "loss": 2.4949, + "theoretical_loss": 3.3057103081079884, + "tokens_seen": 3183761408 + }, + { + "epoch": 10.07, + "learning_rate": 1.7943831494483453e-05, + "loss": 2.5019, + "theoretical_loss": 3.3057051392680306, + "tokens_seen": 3183826944 + }, + { + "epoch": 10.07, + "learning_rate": 1.7933801404212637e-05, + "loss": 2.3835, + "theoretical_loss": 3.305699970564257, + "tokens_seen": 3183892480 + }, + { + "epoch": 10.07, + "learning_rate": 1.7923771313941825e-05, + "loss": 2.1872, + "theoretical_loss": 3.3056948019966623, + "tokens_seen": 3183958016 + }, + { + "epoch": 10.07, + "learning_rate": 1.7913741223671013e-05, + "loss": 2.5635, + "theoretical_loss": 3.305689633565239, + "tokens_seen": 3184023552 + }, + { + "epoch": 10.07, + "learning_rate": 1.79037111334002e-05, + "loss": 2.5775, + "theoretical_loss": 3.305684465269981, + "tokens_seen": 3184089088 + }, + { + "epoch": 10.07, + "learning_rate": 1.789368104312939e-05, + "loss": 2.361, + "theoretical_loss": 3.305679297110882, + "tokens_seen": 3184154624 + }, + { + "epoch": 10.07, + "learning_rate": 1.7883650952858573e-05, + "loss": 2.566, + "theoretical_loss": 3.3056741290879357, + "tokens_seen": 3184220160 + }, + { + "epoch": 10.07, + "learning_rate": 1.7873620862587765e-05, + "loss": 2.4455, + "theoretical_loss": 3.3056689612011354, + "tokens_seen": 3184285696 + }, + { + "epoch": 10.07, + "learning_rate": 1.7863590772316952e-05, + "loss": 2.2511, + "theoretical_loss": 3.305663793450475, + "tokens_seen": 3184351232 + }, + { + "epoch": 10.07, + "learning_rate": 1.785356068204614e-05, + "loss": 2.4959, + "theoretical_loss": 3.3056586258359477, + "tokens_seen": 3184416768 + }, + { + "epoch": 10.07, + "learning_rate": 1.7843530591775328e-05, + "loss": 2.5271, + "theoretical_loss": 3.305653458357548, + "tokens_seen": 3184482304 + }, + { + "epoch": 10.07, + "learning_rate": 1.7833500501504513e-05, + "loss": 2.3657, + "theoretical_loss": 3.305648291015269, + "tokens_seen": 3184547840 + }, + { + "epoch": 10.07, + "learning_rate": 1.78234704112337e-05, + "loss": 2.3054, + "theoretical_loss": 3.3056431238091037, + "tokens_seen": 3184613376 + }, + { + "epoch": 10.07, + "learning_rate": 1.781344032096289e-05, + "loss": 2.4603, + "theoretical_loss": 3.3056379567390466, + "tokens_seen": 3184678912 + }, + { + "epoch": 10.07, + "learning_rate": 1.7803410230692076e-05, + "loss": 2.5653, + "theoretical_loss": 3.305632789805091, + "tokens_seen": 3184744448 + }, + { + "epoch": 10.07, + "learning_rate": 1.7793380140421264e-05, + "loss": 2.356, + "theoretical_loss": 3.30562762300723, + "tokens_seen": 3184809984 + }, + { + "epoch": 10.07, + "learning_rate": 1.7783350050150452e-05, + "loss": 2.3123, + "theoretical_loss": 3.3056224563454584, + "tokens_seen": 3184875520 + }, + { + "epoch": 10.07, + "learning_rate": 1.777331995987964e-05, + "loss": 2.3338, + "theoretical_loss": 3.305617289819769, + "tokens_seen": 3184941056 + }, + { + "epoch": 10.07, + "learning_rate": 1.7763289869608828e-05, + "loss": 2.4969, + "theoretical_loss": 3.3056121234301554, + "tokens_seen": 3185006592 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3489105, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.709012508392334, + "objective/train/theoretical_loss": 3.305609540286375, + "objective/train/tokens_used": 3205499360, + "theoretical_loss": 3.305609540286375, + "tokens_seen": 3185039360 + }, + { + "epoch": 10.07, + "learning_rate": 1.7753259779338016e-05, + "loss": 2.4486, + "theoretical_loss": 3.3056069571766113, + "tokens_seen": 3185072128 + }, + { + "epoch": 10.07, + "learning_rate": 1.7743229689067204e-05, + "loss": 2.603, + "theoretical_loss": 3.3056017910591304, + "tokens_seen": 3185137664 + }, + { + "epoch": 10.07, + "learning_rate": 1.773319959879639e-05, + "loss": 2.352, + "theoretical_loss": 3.3055966250777065, + "tokens_seen": 3185203200 + }, + { + "epoch": 10.07, + "learning_rate": 1.7723169508525576e-05, + "loss": 2.3751, + "theoretical_loss": 3.3055914592323328, + "tokens_seen": 3185268736 + }, + { + "epoch": 10.07, + "learning_rate": 1.7713139418254764e-05, + "loss": 2.4188, + "theoretical_loss": 3.3055862935230036, + "tokens_seen": 3185334272 + }, + { + "epoch": 10.07, + "learning_rate": 1.7703109327983952e-05, + "loss": 2.2852, + "theoretical_loss": 3.3055811279497114, + "tokens_seen": 3185399808 + }, + { + "epoch": 10.07, + "learning_rate": 1.7693079237713137e-05, + "loss": 2.2963, + "theoretical_loss": 3.3055759625124512, + "tokens_seen": 3185465344 + }, + { + "epoch": 10.07, + "learning_rate": 1.7683049147442328e-05, + "loss": 2.0468, + "theoretical_loss": 3.305570797211215, + "tokens_seen": 3185530880 + }, + { + "epoch": 10.07, + "learning_rate": 1.7673019057171516e-05, + "loss": 2.4186, + "theoretical_loss": 3.305565632045998, + "tokens_seen": 3185596416 + }, + { + "epoch": 10.07, + "learning_rate": 1.7662988966900704e-05, + "loss": 2.4721, + "theoretical_loss": 3.305560467016793, + "tokens_seen": 3185661952 + }, + { + "epoch": 10.07, + "learning_rate": 1.765295887662989e-05, + "loss": 2.3255, + "theoretical_loss": 3.3055553021235937, + "tokens_seen": 3185727488 + }, + { + "epoch": 10.07, + "learning_rate": 1.7642928786359076e-05, + "loss": 2.4348, + "theoretical_loss": 3.3055501373663936, + "tokens_seen": 3185793024 + }, + { + "epoch": 10.07, + "learning_rate": 1.7632898696088264e-05, + "loss": 2.3276, + "theoretical_loss": 3.3055449727451864, + "tokens_seen": 3185858560 + }, + { + "epoch": 10.07, + "learning_rate": 1.7622868605817452e-05, + "loss": 2.2953, + "theoretical_loss": 3.305539808259966, + "tokens_seen": 3185924096 + }, + { + "epoch": 10.07, + "learning_rate": 1.761283851554664e-05, + "loss": 2.2863, + "theoretical_loss": 3.3055346439107263, + "tokens_seen": 3185989632 + }, + { + "epoch": 10.07, + "learning_rate": 1.7602808425275828e-05, + "loss": 2.4526, + "theoretical_loss": 3.30552947969746, + "tokens_seen": 3186055168 + }, + { + "epoch": 10.07, + "learning_rate": 1.7592778335005016e-05, + "loss": 2.4887, + "theoretical_loss": 3.3055243156201612, + "tokens_seen": 3186120704 + }, + { + "epoch": 10.07, + "learning_rate": 1.7582748244734203e-05, + "loss": 2.4754, + "theoretical_loss": 3.3055191516788236, + "tokens_seen": 3186186240 + }, + { + "epoch": 10.07, + "learning_rate": 1.757271815446339e-05, + "loss": 2.6412, + "theoretical_loss": 3.3055139878734403, + "tokens_seen": 3186251776 + }, + { + "epoch": 10.07, + "learning_rate": 1.756268806419258e-05, + "loss": 2.3386, + "theoretical_loss": 3.305508824204006, + "tokens_seen": 3186317312 + }, + { + "epoch": 10.07, + "learning_rate": 1.7552657973921767e-05, + "loss": 2.5453, + "theoretical_loss": 3.3055036606705133, + "tokens_seen": 3186382848 + }, + { + "epoch": 10.07, + "learning_rate": 1.754262788365095e-05, + "loss": 2.4562, + "theoretical_loss": 3.305498497272956, + "tokens_seen": 3186448384 + }, + { + "epoch": 10.07, + "learning_rate": 1.753259779338014e-05, + "loss": 2.1703, + "theoretical_loss": 3.3054933340113286, + "tokens_seen": 3186513920 + }, + { + "epoch": 10.07, + "learning_rate": 1.7522567703109327e-05, + "loss": 2.3971, + "theoretical_loss": 3.3054881708856234, + "tokens_seen": 3186579456 + }, + { + "epoch": 10.07, + "learning_rate": 1.7512537612838515e-05, + "loss": 2.2292, + "theoretical_loss": 3.3054830078958353, + "tokens_seen": 3186644992 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3489468, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.510357618331909, + "objective/train/theoretical_loss": 3.3054804264519078, + "objective/train/tokens_used": 3207137760, + "theoretical_loss": 3.3054804264519078, + "tokens_seen": 3186677760 + }, + { + "epoch": 10.07, + "learning_rate": 1.7502507522567703e-05, + "loss": 2.396, + "theoretical_loss": 3.305477845041957, + "tokens_seen": 3186710528 + }, + { + "epoch": 10.07, + "learning_rate": 1.749247743229689e-05, + "loss": 2.4358, + "theoretical_loss": 3.305472682323982, + "tokens_seen": 3186776064 + }, + { + "epoch": 10.07, + "learning_rate": 1.748244734202608e-05, + "loss": 2.4007, + "theoretical_loss": 3.3054675197419052, + "tokens_seen": 3186841600 + }, + { + "epoch": 10.07, + "learning_rate": 1.7472417251755267e-05, + "loss": 2.4198, + "theoretical_loss": 3.3054623572957187, + "tokens_seen": 3186907136 + }, + { + "epoch": 10.07, + "learning_rate": 1.7462387161484455e-05, + "loss": 2.4421, + "theoretical_loss": 3.3054571949854172, + "tokens_seen": 3186972672 + }, + { + "epoch": 10.07, + "learning_rate": 1.7452357071213643e-05, + "loss": 2.4023, + "theoretical_loss": 3.3054520328109938, + "tokens_seen": 3187038208 + }, + { + "epoch": 10.07, + "learning_rate": 1.7442326980942827e-05, + "loss": 2.5843, + "theoretical_loss": 3.3054468707724425, + "tokens_seen": 3187103744 + }, + { + "epoch": 10.07, + "learning_rate": 1.7432296890672015e-05, + "loss": 2.4013, + "theoretical_loss": 3.3054417088697567, + "tokens_seen": 3187169280 + }, + { + "epoch": 10.07, + "learning_rate": 1.7422266800401203e-05, + "loss": 2.5617, + "theoretical_loss": 3.30543654710293, + "tokens_seen": 3187234816 + }, + { + "epoch": 10.07, + "learning_rate": 1.741223671013039e-05, + "loss": 2.4087, + "theoretical_loss": 3.3054313854719557, + "tokens_seen": 3187300352 + }, + { + "epoch": 10.07, + "learning_rate": 1.7402206619859582e-05, + "loss": 2.5435, + "theoretical_loss": 3.305426223976828, + "tokens_seen": 3187365888 + }, + { + "epoch": 10.07, + "learning_rate": 1.7392176529588767e-05, + "loss": 2.3145, + "theoretical_loss": 3.3054210626175404, + "tokens_seen": 3187431424 + }, + { + "epoch": 10.07, + "learning_rate": 1.7382146439317955e-05, + "loss": 2.3149, + "theoretical_loss": 3.3054159013940865, + "tokens_seen": 3187496960 + }, + { + "epoch": 10.07, + "learning_rate": 1.7372116349047142e-05, + "loss": 2.4299, + "theoretical_loss": 3.30541074030646, + "tokens_seen": 3187562496 + }, + { + "epoch": 10.07, + "learning_rate": 1.736208625877633e-05, + "loss": 2.3458, + "theoretical_loss": 3.3054055793546544, + "tokens_seen": 3187628032 + }, + { + "epoch": 10.07, + "learning_rate": 1.7352056168505518e-05, + "loss": 2.3399, + "theoretical_loss": 3.305400418538663, + "tokens_seen": 3187693568 + }, + { + "epoch": 10.07, + "learning_rate": 1.7342026078234703e-05, + "loss": 2.25, + "theoretical_loss": 3.30539525785848, + "tokens_seen": 3187759104 + }, + { + "epoch": 10.07, + "learning_rate": 1.733199598796389e-05, + "loss": 2.4029, + "theoretical_loss": 3.305390097314099, + "tokens_seen": 3187824640 + }, + { + "epoch": 10.07, + "learning_rate": 1.732196589769308e-05, + "loss": 2.345, + "theoretical_loss": 3.3053849369055133, + "tokens_seen": 3187890176 + }, + { + "epoch": 10.07, + "learning_rate": 1.731193580742227e-05, + "loss": 2.4161, + "theoretical_loss": 3.305379776632717, + "tokens_seen": 3187955712 + }, + { + "epoch": 10.07, + "learning_rate": 1.7301905717151454e-05, + "loss": 2.3344, + "theoretical_loss": 3.305374616495703, + "tokens_seen": 3188021248 + }, + { + "epoch": 10.07, + "learning_rate": 1.7291875626880642e-05, + "loss": 2.4173, + "theoretical_loss": 3.305369456494466, + "tokens_seen": 3188086784 + }, + { + "epoch": 10.07, + "learning_rate": 1.728184553660983e-05, + "loss": 2.4677, + "theoretical_loss": 3.305364296628998, + "tokens_seen": 3188152320 + }, + { + "epoch": 10.07, + "learning_rate": 1.7271815446339018e-05, + "loss": 2.3842, + "theoretical_loss": 3.3053591368992947, + "tokens_seen": 3188217856 + }, + { + "epoch": 10.07, + "learning_rate": 1.7261785356068206e-05, + "loss": 2.2352, + "theoretical_loss": 3.305353977305348, + "tokens_seen": 3188283392 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3490734, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.45648193359375, + "objective/train/theoretical_loss": 3.305351397559282, + "objective/train/tokens_used": 3208776160, + "theoretical_loss": 3.305351397559282, + "tokens_seen": 3188316160 + }, + { + "epoch": 10.07, + "learning_rate": 1.725175526579739e-05, + "loss": 2.4428, + "theoretical_loss": 3.3053488178471526, + "tokens_seen": 3188348928 + }, + { + "epoch": 10.07, + "learning_rate": 1.724172517552658e-05, + "loss": 2.5104, + "theoretical_loss": 3.3053436585247016, + "tokens_seen": 3188414464 + }, + { + "epoch": 10.07, + "learning_rate": 1.7231695085255766e-05, + "loss": 2.3943, + "theoretical_loss": 3.3053384993379886, + "tokens_seen": 3188480000 + }, + { + "epoch": 10.07, + "learning_rate": 1.7221664994984954e-05, + "loss": 2.3832, + "theoretical_loss": 3.305333340287008, + "tokens_seen": 3188545536 + }, + { + "epoch": 10.07, + "learning_rate": 1.7211634904714145e-05, + "loss": 2.2588, + "theoretical_loss": 3.3053281813717525, + "tokens_seen": 3188611072 + }, + { + "epoch": 10.07, + "learning_rate": 1.720160481444333e-05, + "loss": 2.4759, + "theoretical_loss": 3.305323022592216, + "tokens_seen": 3188676608 + }, + { + "epoch": 10.07, + "learning_rate": 1.7191574724172518e-05, + "loss": 2.4525, + "theoretical_loss": 3.3053178639483924, + "tokens_seen": 3188742144 + }, + { + "epoch": 10.07, + "learning_rate": 1.7181544633901706e-05, + "loss": 2.4948, + "theoretical_loss": 3.3053127054402753, + "tokens_seen": 3188807680 + }, + { + "epoch": 10.07, + "learning_rate": 1.7171514543630894e-05, + "loss": 2.1729, + "theoretical_loss": 3.3053075470678586, + "tokens_seen": 3188873216 + }, + { + "epoch": 10.07, + "learning_rate": 1.716148445336008e-05, + "loss": 2.3711, + "theoretical_loss": 3.3053023888311346, + "tokens_seen": 3188938752 + }, + { + "epoch": 10.07, + "learning_rate": 1.7151454363089266e-05, + "loss": 2.5026, + "theoretical_loss": 3.305297230730099, + "tokens_seen": 3189004288 + }, + { + "epoch": 10.07, + "learning_rate": 1.7141424272818454e-05, + "loss": 2.1834, + "theoretical_loss": 3.3052920727647437, + "tokens_seen": 3189069824 + }, + { + "epoch": 10.07, + "learning_rate": 1.7131394182547642e-05, + "loss": 2.3062, + "theoretical_loss": 3.305286914935063, + "tokens_seen": 3189135360 + }, + { + "epoch": 10.07, + "learning_rate": 1.7121364092276833e-05, + "loss": 2.5906, + "theoretical_loss": 3.3052817572410507, + "tokens_seen": 3189200896 + }, + { + "epoch": 10.07, + "learning_rate": 1.711133400200602e-05, + "loss": 2.3797, + "theoretical_loss": 3.3052765996827005, + "tokens_seen": 3189266432 + }, + { + "epoch": 10.07, + "learning_rate": 1.7101303911735205e-05, + "loss": 2.2777, + "theoretical_loss": 3.305271442260006, + "tokens_seen": 3189331968 + }, + { + "epoch": 10.07, + "learning_rate": 1.7091273821464393e-05, + "loss": 2.3605, + "theoretical_loss": 3.30526628497296, + "tokens_seen": 3189397504 + }, + { + "epoch": 10.07, + "learning_rate": 1.708124373119358e-05, + "loss": 2.9125, + "theoretical_loss": 3.305261127821557, + "tokens_seen": 3189463040 + }, + { + "epoch": 10.07, + "learning_rate": 1.707121364092277e-05, + "loss": 2.3431, + "theoretical_loss": 3.3052559708057907, + "tokens_seen": 3189528576 + }, + { + "epoch": 10.07, + "learning_rate": 1.7061183550651957e-05, + "loss": 2.4476, + "theoretical_loss": 3.3052508139256545, + "tokens_seen": 3189594112 + }, + { + "epoch": 10.07, + "learning_rate": 1.705115346038114e-05, + "loss": 2.2223, + "theoretical_loss": 3.3052456571811417, + "tokens_seen": 3189659648 + }, + { + "epoch": 10.07, + "learning_rate": 1.704112337011033e-05, + "loss": 2.4152, + "theoretical_loss": 3.3052405005722467, + "tokens_seen": 3189725184 + }, + { + "epoch": 10.07, + "learning_rate": 1.703109327983952e-05, + "loss": 2.5192, + "theoretical_loss": 3.305235344098963, + "tokens_seen": 3189790720 + }, + { + "epoch": 10.07, + "learning_rate": 1.702106318956871e-05, + "loss": 2.1178, + "theoretical_loss": 3.3052301877612837, + "tokens_seen": 3189856256 + }, + { + "epoch": 10.07, + "learning_rate": 1.7011033099297897e-05, + "loss": 2.2655, + "theoretical_loss": 3.3052250315592024, + "tokens_seen": 3189921792 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3492186, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6151463985443115, + "objective/train/theoretical_loss": 3.3052224535090096, + "objective/train/tokens_used": 3210414560, + "theoretical_loss": 3.3052224535090096, + "tokens_seen": 3189954560 + }, + { + "epoch": 10.07, + "learning_rate": 1.700100300902708e-05, + "loss": 2.5049, + "theoretical_loss": 3.305219875492714, + "tokens_seen": 3189987328 + }, + { + "epoch": 10.07, + "learning_rate": 1.699097291875627e-05, + "loss": 2.4823, + "theoretical_loss": 3.3052147195618105, + "tokens_seen": 3190052864 + }, + { + "epoch": 10.07, + "learning_rate": 1.6980942828485457e-05, + "loss": 2.1835, + "theoretical_loss": 3.3052095637664864, + "tokens_seen": 3190118400 + }, + { + "epoch": 10.07, + "learning_rate": 1.6970912738214645e-05, + "loss": 2.2767, + "theoretical_loss": 3.3052044081067353, + "tokens_seen": 3190183936 + }, + { + "epoch": 10.07, + "learning_rate": 1.696088264794383e-05, + "loss": 2.474, + "theoretical_loss": 3.305199252582551, + "tokens_seen": 3190249472 + }, + { + "epoch": 10.07, + "learning_rate": 1.6950852557673017e-05, + "loss": 2.3361, + "theoretical_loss": 3.305194097193927, + "tokens_seen": 3190315008 + }, + { + "epoch": 10.07, + "learning_rate": 1.6940822467402205e-05, + "loss": 2.0763, + "theoretical_loss": 3.3051889419408567, + "tokens_seen": 3190380544 + }, + { + "epoch": 10.07, + "learning_rate": 1.6930792377131396e-05, + "loss": 2.252, + "theoretical_loss": 3.305183786823334, + "tokens_seen": 3190446080 + }, + { + "epoch": 10.07, + "learning_rate": 1.6920762286860584e-05, + "loss": 2.3499, + "theoretical_loss": 3.3051786318413523, + "tokens_seen": 3190511616 + }, + { + "epoch": 10.07, + "learning_rate": 1.691073219658977e-05, + "loss": 2.3873, + "theoretical_loss": 3.3051734769949057, + "tokens_seen": 3190577152 + }, + { + "epoch": 10.07, + "learning_rate": 1.6900702106318957e-05, + "loss": 2.4669, + "theoretical_loss": 3.3051683222839876, + "tokens_seen": 3190642688 + }, + { + "epoch": 10.07, + "learning_rate": 1.6890672016048145e-05, + "loss": 2.2839, + "theoretical_loss": 3.3051631677085918, + "tokens_seen": 3190708224 + }, + { + "epoch": 10.07, + "learning_rate": 1.6880641925777332e-05, + "loss": 2.5691, + "theoretical_loss": 3.3051580132687115, + "tokens_seen": 3190773760 + }, + { + "epoch": 10.07, + "learning_rate": 1.687061183550652e-05, + "loss": 2.3154, + "theoretical_loss": 3.305152858964341, + "tokens_seen": 3190839296 + }, + { + "epoch": 10.07, + "learning_rate": 1.6860581745235705e-05, + "loss": 2.3145, + "theoretical_loss": 3.3051477047954734, + "tokens_seen": 3190904832 + }, + { + "epoch": 10.07, + "learning_rate": 1.6850551654964893e-05, + "loss": 2.3315, + "theoretical_loss": 3.3051425507621026, + "tokens_seen": 3190970368 + }, + { + "epoch": 10.07, + "learning_rate": 1.6840521564694084e-05, + "loss": 2.3302, + "theoretical_loss": 3.3051373968642226, + "tokens_seen": 3191035904 + }, + { + "epoch": 10.07, + "learning_rate": 1.6830491474423272e-05, + "loss": 2.3068, + "theoretical_loss": 3.305132243101826, + "tokens_seen": 3191101440 + }, + { + "epoch": 10.07, + "learning_rate": 1.682046138415246e-05, + "loss": 2.3112, + "theoretical_loss": 3.3051270894749076, + "tokens_seen": 3191166976 + }, + { + "epoch": 10.07, + "learning_rate": 1.6810431293881644e-05, + "loss": 2.0772, + "theoretical_loss": 3.3051219359834607, + "tokens_seen": 3191232512 + }, + { + "epoch": 10.07, + "learning_rate": 1.6800401203610832e-05, + "loss": 2.2143, + "theoretical_loss": 3.3051167826274788, + "tokens_seen": 3191298048 + }, + { + "epoch": 10.07, + "learning_rate": 1.679037111334002e-05, + "loss": 2.3538, + "theoretical_loss": 3.3051116294069556, + "tokens_seen": 3191363584 + }, + { + "epoch": 10.07, + "learning_rate": 1.6780341023069208e-05, + "loss": 2.4556, + "theoretical_loss": 3.3051064763218845, + "tokens_seen": 3191429120 + }, + { + "epoch": 10.07, + "learning_rate": 1.6770310932798396e-05, + "loss": 2.4417, + "theoretical_loss": 3.3051013233722597, + "tokens_seen": 3191494656 + }, + { + "epoch": 10.07, + "learning_rate": 1.676028084252758e-05, + "loss": 2.3646, + "theoretical_loss": 3.3050961705580746, + "tokens_seen": 3191560192 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3492896, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.166111707687378, + "objective/train/theoretical_loss": 3.30509359420177, + "objective/train/tokens_used": 3212052960, + "theoretical_loss": 3.30509359420177, + "tokens_seen": 3191592960 + }, + { + "epoch": 10.07, + "learning_rate": 1.6750250752256772e-05, + "loss": 2.3433, + "theoretical_loss": 3.305091017879323, + "tokens_seen": 3191625728 + }, + { + "epoch": 10.07, + "learning_rate": 1.674022066198596e-05, + "loss": 2.2637, + "theoretical_loss": 3.305085865335998, + "tokens_seen": 3191691264 + }, + { + "epoch": 10.07, + "learning_rate": 1.6730190571715148e-05, + "loss": 2.5851, + "theoretical_loss": 3.3050807129280937, + "tokens_seen": 3191756800 + }, + { + "epoch": 10.07, + "learning_rate": 1.6720160481444335e-05, + "loss": 2.5739, + "theoretical_loss": 3.305075560655604, + "tokens_seen": 3191822336 + }, + { + "epoch": 10.07, + "learning_rate": 1.671013039117352e-05, + "loss": 2.4652, + "theoretical_loss": 3.3050704085185223, + "tokens_seen": 3191887872 + }, + { + "epoch": 10.07, + "learning_rate": 1.6700100300902708e-05, + "loss": 2.3648, + "theoretical_loss": 3.305065256516842, + "tokens_seen": 3191953408 + }, + { + "epoch": 10.07, + "learning_rate": 1.6690070210631896e-05, + "loss": 2.3982, + "theoretical_loss": 3.305060104650557, + "tokens_seen": 3192018944 + }, + { + "epoch": 10.07, + "learning_rate": 1.6680040120361084e-05, + "loss": 2.3731, + "theoretical_loss": 3.3050549529196616, + "tokens_seen": 3192084480 + }, + { + "epoch": 10.07, + "learning_rate": 1.667001003009027e-05, + "loss": 2.4026, + "theoretical_loss": 3.3050498013241483, + "tokens_seen": 3192150016 + }, + { + "epoch": 10.07, + "learning_rate": 1.6659979939819456e-05, + "loss": 2.4228, + "theoretical_loss": 3.305044649864011, + "tokens_seen": 3192215552 + }, + { + "epoch": 10.07, + "learning_rate": 1.6649949849548647e-05, + "loss": 2.5058, + "theoretical_loss": 3.3050394985392444, + "tokens_seen": 3192281088 + }, + { + "epoch": 10.07, + "learning_rate": 1.6639919759277835e-05, + "loss": 2.6197, + "theoretical_loss": 3.3050343473498414, + "tokens_seen": 3192346624 + }, + { + "epoch": 10.07, + "learning_rate": 1.6629889669007023e-05, + "loss": 2.5135, + "theoretical_loss": 3.305029196295795, + "tokens_seen": 3192412160 + }, + { + "epoch": 10.07, + "learning_rate": 1.661985957873621e-05, + "loss": 2.2754, + "theoretical_loss": 3.3050240453771003, + "tokens_seen": 3192477696 + }, + { + "epoch": 10.07, + "learning_rate": 1.6609829488465395e-05, + "loss": 2.5905, + "theoretical_loss": 3.3050188945937498, + "tokens_seen": 3192543232 + }, + { + "epoch": 10.07, + "learning_rate": 1.6599799398194583e-05, + "loss": 2.3566, + "theoretical_loss": 3.3050137439457377, + "tokens_seen": 3192608768 + }, + { + "epoch": 10.07, + "learning_rate": 1.658976930792377e-05, + "loss": 2.2766, + "theoretical_loss": 3.3050085934330573, + "tokens_seen": 3192674304 + }, + { + "epoch": 10.07, + "learning_rate": 1.657973921765296e-05, + "loss": 2.4127, + "theoretical_loss": 3.305003443055703, + "tokens_seen": 3192739840 + }, + { + "epoch": 10.07, + "learning_rate": 1.6569709127382144e-05, + "loss": 2.3501, + "theoretical_loss": 3.304998292813668, + "tokens_seen": 3192805376 + }, + { + "epoch": 10.07, + "learning_rate": 1.6559679037111335e-05, + "loss": 2.2763, + "theoretical_loss": 3.3049931427069454, + "tokens_seen": 3192870912 + }, + { + "epoch": 10.07, + "learning_rate": 1.6549648946840523e-05, + "loss": 2.4723, + "theoretical_loss": 3.30498799273553, + "tokens_seen": 3192936448 + }, + { + "epoch": 10.07, + "learning_rate": 1.653961885656971e-05, + "loss": 2.1725, + "theoretical_loss": 3.304982842899414, + "tokens_seen": 3193001984 + }, + { + "epoch": 10.07, + "learning_rate": 1.65295887662989e-05, + "loss": 2.4346, + "theoretical_loss": 3.304977693198593, + "tokens_seen": 3193067520 + }, + { + "epoch": 10.07, + "learning_rate": 1.6519558676028083e-05, + "loss": 2.2618, + "theoretical_loss": 3.3049725436330593, + "tokens_seen": 3193133056 + }, + { + "epoch": 10.07, + "learning_rate": 1.650952858575727e-05, + "loss": 2.3051, + "theoretical_loss": 3.304967394202807, + "tokens_seen": 3193198592 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3493555, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.052079916000366, + "objective/train/theoretical_loss": 3.3049648195384087, + "objective/train/tokens_used": 3213691360, + "theoretical_loss": 3.3049648195384087, + "tokens_seen": 3193231360 + }, + { + "epoch": 10.07, + "learning_rate": 1.649949849548646e-05, + "loss": 2.2704, + "theoretical_loss": 3.3049622449078293, + "tokens_seen": 3193264128 + }, + { + "epoch": 10.07, + "learning_rate": 1.6489468405215647e-05, + "loss": 2.3131, + "theoretical_loss": 3.3049570957481205, + "tokens_seen": 3193329664 + }, + { + "epoch": 10.07, + "learning_rate": 1.6479438314944835e-05, + "loss": 2.3533, + "theoretical_loss": 3.3049519467236737, + "tokens_seen": 3193395200 + }, + { + "epoch": 10.07, + "learning_rate": 1.6469408224674023e-05, + "loss": 2.2912, + "theoretical_loss": 3.304946797834483, + "tokens_seen": 3193460736 + }, + { + "epoch": 10.07, + "learning_rate": 1.645937813440321e-05, + "loss": 2.2397, + "theoretical_loss": 3.3049416490805426, + "tokens_seen": 3193526272 + }, + { + "epoch": 10.07, + "learning_rate": 1.64493480441324e-05, + "loss": 2.381, + "theoretical_loss": 3.304936500461845, + "tokens_seen": 3193591808 + }, + { + "epoch": 10.07, + "learning_rate": 1.6439317953861586e-05, + "loss": 2.4523, + "theoretical_loss": 3.304931351978384, + "tokens_seen": 3193657344 + }, + { + "epoch": 10.07, + "learning_rate": 1.6429287863590774e-05, + "loss": 2.4207, + "theoretical_loss": 3.304926203630154, + "tokens_seen": 3193722880 + }, + { + "epoch": 10.07, + "learning_rate": 1.641925777331996e-05, + "loss": 2.3992, + "theoretical_loss": 3.3049210554171484, + "tokens_seen": 3193788416 + }, + { + "epoch": 10.07, + "learning_rate": 1.6409227683049147e-05, + "loss": 2.3139, + "theoretical_loss": 3.3049159073393612, + "tokens_seen": 3193853952 + }, + { + "epoch": 10.07, + "learning_rate": 1.6399197592778335e-05, + "loss": 2.3632, + "theoretical_loss": 3.304910759396785, + "tokens_seen": 3193919488 + }, + { + "epoch": 10.07, + "learning_rate": 1.6389167502507522e-05, + "loss": 2.2032, + "theoretical_loss": 3.3049056115894144, + "tokens_seen": 3193985024 + }, + { + "epoch": 10.07, + "learning_rate": 1.637913741223671e-05, + "loss": 2.3733, + "theoretical_loss": 3.3049004639172432, + "tokens_seen": 3194050560 + }, + { + "epoch": 10.07, + "learning_rate": 1.6369107321965898e-05, + "loss": 2.5484, + "theoretical_loss": 3.3048953163802643, + "tokens_seen": 3194116096 + }, + { + "epoch": 10.07, + "learning_rate": 1.6359077231695086e-05, + "loss": 1.9742, + "theoretical_loss": 3.3048901689784724, + "tokens_seen": 3194181632 + }, + { + "epoch": 10.07, + "learning_rate": 1.6349047141424274e-05, + "loss": 2.3729, + "theoretical_loss": 3.30488502171186, + "tokens_seen": 3194247168 + }, + { + "epoch": 10.07, + "learning_rate": 1.6339017051153462e-05, + "loss": 2.421, + "theoretical_loss": 3.3048798745804215, + "tokens_seen": 3194312704 + }, + { + "epoch": 10.07, + "learning_rate": 1.632898696088265e-05, + "loss": 2.2311, + "theoretical_loss": 3.30487472758415, + "tokens_seen": 3194378240 + }, + { + "epoch": 10.07, + "learning_rate": 1.6318956870611834e-05, + "loss": 2.3743, + "theoretical_loss": 3.3048695807230404, + "tokens_seen": 3194443776 + }, + { + "epoch": 10.07, + "learning_rate": 1.6308926780341022e-05, + "loss": 2.4234, + "theoretical_loss": 3.3048644339970847, + "tokens_seen": 3194509312 + }, + { + "epoch": 10.07, + "learning_rate": 1.629889669007021e-05, + "loss": 2.2687, + "theoretical_loss": 3.3048592874062783, + "tokens_seen": 3194574848 + }, + { + "epoch": 10.07, + "learning_rate": 1.6288866599799398e-05, + "loss": 2.3768, + "theoretical_loss": 3.3048541409506136, + "tokens_seen": 3194640384 + }, + { + "epoch": 10.07, + "learning_rate": 1.627883650952859e-05, + "loss": 2.3672, + "theoretical_loss": 3.3048489946300847, + "tokens_seen": 3194705920 + }, + { + "epoch": 10.07, + "learning_rate": 1.6268806419257774e-05, + "loss": 2.1971, + "theoretical_loss": 3.304843848444685, + "tokens_seen": 3194771456 + }, + { + "epoch": 10.07, + "learning_rate": 1.6258776328986962e-05, + "loss": 2.4225, + "theoretical_loss": 3.304838702394409, + "tokens_seen": 3194836992 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3494597, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.22898268699646, + "objective/train/theoretical_loss": 3.3048361294199404, + "objective/train/tokens_used": 3215329760, + "theoretical_loss": 3.3048361294199404, + "tokens_seen": 3194869760 + }, + { + "epoch": 10.07, + "learning_rate": 1.624874623871615e-05, + "loss": 2.1515, + "theoretical_loss": 3.3048335564792497, + "tokens_seen": 3194902528 + }, + { + "epoch": 10.07, + "learning_rate": 1.6238716148445338e-05, + "loss": 2.1651, + "theoretical_loss": 3.304828410699201, + "tokens_seen": 3194968064 + }, + { + "epoch": 10.07, + "learning_rate": 1.6228686058174522e-05, + "loss": 1.9924, + "theoretical_loss": 3.3048232650542566, + "tokens_seen": 3195033600 + }, + { + "epoch": 10.07, + "learning_rate": 1.621865596790371e-05, + "loss": 2.344, + "theoretical_loss": 3.30481811954441, + "tokens_seen": 3195099136 + }, + { + "epoch": 10.07, + "learning_rate": 1.6208625877632898e-05, + "loss": 2.575, + "theoretical_loss": 3.3048129741696544, + "tokens_seen": 3195164672 + }, + { + "epoch": 10.07, + "learning_rate": 1.6198595787362086e-05, + "loss": 2.4056, + "theoretical_loss": 3.3048078289299845, + "tokens_seen": 3195230208 + }, + { + "epoch": 10.07, + "learning_rate": 1.6188565697091274e-05, + "loss": 2.2415, + "theoretical_loss": 3.3048026838253937, + "tokens_seen": 3195295744 + }, + { + "epoch": 10.07, + "learning_rate": 1.617853560682046e-05, + "loss": 2.6253, + "theoretical_loss": 3.3047975388558757, + "tokens_seen": 3195361280 + }, + { + "epoch": 10.07, + "learning_rate": 1.616850551654965e-05, + "loss": 2.2634, + "theoretical_loss": 3.3047923940214234, + "tokens_seen": 3195426816 + }, + { + "epoch": 10.07, + "learning_rate": 1.6158475426278837e-05, + "loss": 2.3868, + "theoretical_loss": 3.3047872493220316, + "tokens_seen": 3195492352 + }, + { + "epoch": 10.07, + "learning_rate": 1.6148445336008025e-05, + "loss": 2.3681, + "theoretical_loss": 3.3047821047576935, + "tokens_seen": 3195557888 + }, + { + "epoch": 10.07, + "learning_rate": 1.6138415245737213e-05, + "loss": 2.4933, + "theoretical_loss": 3.3047769603284025, + "tokens_seen": 3195623424 + }, + { + "epoch": 10.07, + "learning_rate": 1.6128385155466398e-05, + "loss": 2.3373, + "theoretical_loss": 3.3047718160341524, + "tokens_seen": 3195688960 + }, + { + "epoch": 10.07, + "learning_rate": 1.6118355065195585e-05, + "loss": 2.3804, + "theoretical_loss": 3.3047666718749373, + "tokens_seen": 3195754496 + }, + { + "epoch": 10.07, + "learning_rate": 1.6108324974924773e-05, + "loss": 2.3828, + "theoretical_loss": 3.3047615278507507, + "tokens_seen": 3195820032 + }, + { + "epoch": 10.07, + "learning_rate": 1.609829488465396e-05, + "loss": 2.5363, + "theoretical_loss": 3.3047563839615863, + "tokens_seen": 3195885568 + }, + { + "epoch": 10.07, + "learning_rate": 1.6088264794383153e-05, + "loss": 2.4729, + "theoretical_loss": 3.3047512402074375, + "tokens_seen": 3195951104 + }, + { + "epoch": 10.07, + "learning_rate": 1.6078234704112337e-05, + "loss": 2.452, + "theoretical_loss": 3.304746096588298, + "tokens_seen": 3196016640 + }, + { + "epoch": 10.07, + "learning_rate": 1.6068204613841525e-05, + "loss": 2.4213, + "theoretical_loss": 3.3047409531041616, + "tokens_seen": 3196082176 + }, + { + "epoch": 10.07, + "learning_rate": 1.6058174523570713e-05, + "loss": 2.4374, + "theoretical_loss": 3.3047358097550226, + "tokens_seen": 3196147712 + }, + { + "epoch": 10.07, + "learning_rate": 1.60481444332999e-05, + "loss": 2.5338, + "theoretical_loss": 3.3047306665408738, + "tokens_seen": 3196213248 + }, + { + "epoch": 10.07, + "learning_rate": 1.603811434302909e-05, + "loss": 2.4332, + "theoretical_loss": 3.304725523461709, + "tokens_seen": 3196278784 + }, + { + "epoch": 10.07, + "learning_rate": 1.6028084252758273e-05, + "loss": 2.3994, + "theoretical_loss": 3.304720380517523, + "tokens_seen": 3196344320 + }, + { + "epoch": 10.07, + "learning_rate": 1.601805416248746e-05, + "loss": 2.4942, + "theoretical_loss": 3.3047152377083076, + "tokens_seen": 3196409856 + }, + { + "epoch": 10.07, + "learning_rate": 1.600802407221665e-05, + "loss": 2.3248, + "theoretical_loss": 3.304710095034058, + "tokens_seen": 3196475392 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3495226, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1664018630981445, + "objective/train/theoretical_loss": 3.304707523747543, + "objective/train/tokens_used": 3216968160, + "theoretical_loss": 3.304707523747543, + "tokens_seen": 3196508160 + }, + { + "epoch": 10.07, + "learning_rate": 1.599799398194584e-05, + "loss": 2.3979, + "theoretical_loss": 3.3047049524947676, + "tokens_seen": 3196540928 + }, + { + "epoch": 10.07, + "learning_rate": 1.5987963891675028e-05, + "loss": 2.3756, + "theoretical_loss": 3.3046998100904297, + "tokens_seen": 3196606464 + }, + { + "epoch": 10.07, + "learning_rate": 1.5977933801404213e-05, + "loss": 2.4078, + "theoretical_loss": 3.304694667821038, + "tokens_seen": 3196672000 + }, + { + "epoch": 10.07, + "learning_rate": 1.59679037111334e-05, + "loss": 2.4722, + "theoretical_loss": 3.3046895256865865, + "tokens_seen": 3196737536 + }, + { + "epoch": 10.07, + "learning_rate": 1.595787362086259e-05, + "loss": 2.7018, + "theoretical_loss": 3.3046843836870687, + "tokens_seen": 3196803072 + }, + { + "epoch": 10.07, + "learning_rate": 1.5947843530591776e-05, + "loss": 2.3312, + "theoretical_loss": 3.304679241822478, + "tokens_seen": 3196868608 + }, + { + "epoch": 10.07, + "learning_rate": 1.5937813440320964e-05, + "loss": 2.1861, + "theoretical_loss": 3.304674100092809, + "tokens_seen": 3196934144 + }, + { + "epoch": 10.07, + "learning_rate": 1.592778335005015e-05, + "loss": 2.4483, + "theoretical_loss": 3.304668958498055, + "tokens_seen": 3196999680 + }, + { + "epoch": 10.07, + "learning_rate": 1.5917753259779337e-05, + "loss": 2.3754, + "theoretical_loss": 3.304663817038209, + "tokens_seen": 3197065216 + }, + { + "epoch": 10.07, + "learning_rate": 1.5907723169508525e-05, + "loss": 2.43, + "theoretical_loss": 3.3046586757132657, + "tokens_seen": 3197130752 + }, + { + "epoch": 10.07, + "learning_rate": 1.5897693079237716e-05, + "loss": 2.4075, + "theoretical_loss": 3.304653534523218, + "tokens_seen": 3197196288 + }, + { + "epoch": 10.07, + "learning_rate": 1.5887662988966904e-05, + "loss": 2.5814, + "theoretical_loss": 3.30464839346806, + "tokens_seen": 3197261824 + }, + { + "epoch": 10.07, + "learning_rate": 1.5877632898696088e-05, + "loss": 2.1041, + "theoretical_loss": 3.304643252547786, + "tokens_seen": 3197327360 + }, + { + "epoch": 10.07, + "learning_rate": 1.5867602808425276e-05, + "loss": 2.2267, + "theoretical_loss": 3.304638111762388, + "tokens_seen": 3197392896 + }, + { + "epoch": 10.07, + "learning_rate": 1.5857572718154464e-05, + "loss": 2.3351, + "theoretical_loss": 3.3046329711118614, + "tokens_seen": 3197458432 + }, + { + "epoch": 10.07, + "learning_rate": 1.5847542627883652e-05, + "loss": 2.437, + "theoretical_loss": 3.304627830596199, + "tokens_seen": 3197523968 + }, + { + "epoch": 10.07, + "learning_rate": 1.5837512537612836e-05, + "loss": 2.4492, + "theoretical_loss": 3.3046226902153943, + "tokens_seen": 3197589504 + }, + { + "epoch": 10.07, + "learning_rate": 1.5827482447342024e-05, + "loss": 2.4969, + "theoretical_loss": 3.304617549969442, + "tokens_seen": 3197655040 + }, + { + "epoch": 10.07, + "learning_rate": 1.5817452357071212e-05, + "loss": 2.2168, + "theoretical_loss": 3.304612409858335, + "tokens_seen": 3197720576 + }, + { + "epoch": 10.07, + "learning_rate": 1.5807422266800404e-05, + "loss": 2.3953, + "theoretical_loss": 3.304607269882067, + "tokens_seen": 3197786112 + }, + { + "epoch": 10.07, + "learning_rate": 1.579739217652959e-05, + "loss": 2.3038, + "theoretical_loss": 3.3046021300406325, + "tokens_seen": 3197851648 + }, + { + "epoch": 10.07, + "learning_rate": 1.5787362086258776e-05, + "loss": 2.558, + "theoretical_loss": 3.3045969903340238, + "tokens_seen": 3197917184 + }, + { + "epoch": 10.07, + "learning_rate": 1.5777331995987964e-05, + "loss": 2.4678, + "theoretical_loss": 3.304591850762236, + "tokens_seen": 3197982720 + }, + { + "epoch": 10.07, + "learning_rate": 1.5767301905717152e-05, + "loss": 2.2603, + "theoretical_loss": 3.3045867113252623, + "tokens_seen": 3198048256 + }, + { + "epoch": 10.07, + "learning_rate": 1.575727181544634e-05, + "loss": 2.4683, + "theoretical_loss": 3.3045815720230958, + "tokens_seen": 3198113792 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3496464, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.253554582595825, + "objective/train/theoretical_loss": 3.304579002422564, + "objective/train/tokens_used": 3218606560, + "theoretical_loss": 3.304579002422564, + "tokens_seen": 3198146560 + }, + { + "epoch": 10.07, + "learning_rate": 1.5747241725175527e-05, + "loss": 2.4894, + "theoretical_loss": 3.3045764328557308, + "tokens_seen": 3198179328 + }, + { + "epoch": 10.07, + "learning_rate": 1.5737211634904712e-05, + "loss": 2.451, + "theoretical_loss": 3.3045712938231615, + "tokens_seen": 3198244864 + }, + { + "epoch": 10.07, + "learning_rate": 1.57271815446339e-05, + "loss": 2.3865, + "theoretical_loss": 3.3045661549253804, + "tokens_seen": 3198310400 + }, + { + "epoch": 10.07, + "learning_rate": 1.571715145436309e-05, + "loss": 2.2775, + "theoretical_loss": 3.304561016162382, + "tokens_seen": 3198375936 + }, + { + "epoch": 10.07, + "learning_rate": 1.570712136409228e-05, + "loss": 2.3278, + "theoretical_loss": 3.3045558775341597, + "tokens_seen": 3198441472 + }, + { + "epoch": 10.07, + "learning_rate": 1.5697091273821467e-05, + "loss": 2.3068, + "theoretical_loss": 3.3045507390407076, + "tokens_seen": 3198507008 + }, + { + "epoch": 10.07, + "learning_rate": 1.568706118355065e-05, + "loss": 2.6068, + "theoretical_loss": 3.304545600682019, + "tokens_seen": 3198572544 + }, + { + "epoch": 10.07, + "learning_rate": 1.567703109327984e-05, + "loss": 2.5316, + "theoretical_loss": 3.304540462458088, + "tokens_seen": 3198638080 + }, + { + "epoch": 10.07, + "learning_rate": 1.5667001003009027e-05, + "loss": 2.5084, + "theoretical_loss": 3.3045353243689073, + "tokens_seen": 3198703616 + }, + { + "epoch": 10.07, + "learning_rate": 1.5656970912738215e-05, + "loss": 2.363, + "theoretical_loss": 3.304530186414472, + "tokens_seen": 3198769152 + }, + { + "epoch": 10.07, + "learning_rate": 1.5646940822467403e-05, + "loss": 2.5434, + "theoretical_loss": 3.3045250485947752, + "tokens_seen": 3198834688 + }, + { + "epoch": 10.07, + "learning_rate": 1.5636910732196588e-05, + "loss": 2.3232, + "theoretical_loss": 3.3045199109098102, + "tokens_seen": 3198900224 + }, + { + "epoch": 10.07, + "learning_rate": 1.5626880641925775e-05, + "loss": 2.4905, + "theoretical_loss": 3.3045147733595712, + "tokens_seen": 3198965760 + }, + { + "epoch": 10.07, + "learning_rate": 1.5616850551654963e-05, + "loss": 2.4155, + "theoretical_loss": 3.3045096359440516, + "tokens_seen": 3199031296 + }, + { + "epoch": 10.07, + "learning_rate": 1.560682046138415e-05, + "loss": 2.5815, + "theoretical_loss": 3.3045044986632455, + "tokens_seen": 3199096832 + }, + { + "epoch": 10.07, + "learning_rate": 1.5596790371113343e-05, + "loss": 2.4934, + "theoretical_loss": 3.3044993615171467, + "tokens_seen": 3199162368 + }, + { + "epoch": 10.07, + "learning_rate": 1.5586760280842527e-05, + "loss": 2.2173, + "theoretical_loss": 3.3044942245057483, + "tokens_seen": 3199227904 + }, + { + "epoch": 10.07, + "learning_rate": 1.5576730190571715e-05, + "loss": 2.4277, + "theoretical_loss": 3.304489087629044, + "tokens_seen": 3199293440 + }, + { + "epoch": 10.07, + "learning_rate": 1.5566700100300903e-05, + "loss": 2.6967, + "theoretical_loss": 3.304483950887028, + "tokens_seen": 3199358976 + }, + { + "epoch": 10.07, + "learning_rate": 1.555667001003009e-05, + "loss": 2.5744, + "theoretical_loss": 3.304478814279694, + "tokens_seen": 3199424512 + }, + { + "epoch": 10.07, + "learning_rate": 1.554663991975928e-05, + "loss": 2.305, + "theoretical_loss": 3.304473677807035, + "tokens_seen": 3199490048 + }, + { + "epoch": 10.07, + "learning_rate": 1.5536609829488467e-05, + "loss": 2.3375, + "theoretical_loss": 3.304468541469046, + "tokens_seen": 3199555584 + }, + { + "epoch": 10.07, + "learning_rate": 1.5526579739217654e-05, + "loss": 2.37, + "theoretical_loss": 3.30446340526572, + "tokens_seen": 3199621120 + }, + { + "epoch": 10.07, + "learning_rate": 1.551654964894684e-05, + "loss": 2.4053, + "theoretical_loss": 3.30445826919705, + "tokens_seen": 3199686656 + }, + { + "epoch": 10.07, + "learning_rate": 1.550651955867603e-05, + "loss": 2.3103, + "theoretical_loss": 3.3044531332630305, + "tokens_seen": 3199752192 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3496971, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.480128049850464, + "objective/train/theoretical_loss": 3.304450565346513, + "objective/train/tokens_used": 3220244960, + "theoretical_loss": 3.304450565346513, + "tokens_seen": 3199784960 + }, + { + "epoch": 10.07, + "learning_rate": 1.5496489468405215e-05, + "loss": 2.4694, + "theoretical_loss": 3.3044479974636554, + "tokens_seen": 3199817728 + }, + { + "epoch": 10.07, + "learning_rate": 1.5486459378134403e-05, + "loss": 2.2563, + "theoretical_loss": 3.304442861798918, + "tokens_seen": 3199883264 + }, + { + "epoch": 10.07, + "learning_rate": 1.547642928786359e-05, + "loss": 2.3784, + "theoretical_loss": 3.3044377262688123, + "tokens_seen": 3199948800 + }, + { + "epoch": 10.07, + "learning_rate": 1.546639919759278e-05, + "loss": 2.3372, + "theoretical_loss": 3.3044325908733314, + "tokens_seen": 3200014336 + }, + { + "epoch": 10.07, + "learning_rate": 1.5456369107321966e-05, + "loss": 2.4312, + "theoretical_loss": 3.3044274556124695, + "tokens_seen": 3200079872 + }, + { + "epoch": 10.07, + "learning_rate": 1.5446339017051154e-05, + "loss": 2.328, + "theoretical_loss": 3.3044223204862204, + "tokens_seen": 3200145408 + }, + { + "epoch": 10.07, + "learning_rate": 1.5436308926780342e-05, + "loss": 2.4047, + "theoretical_loss": 3.304417185494578, + "tokens_seen": 3200210944 + }, + { + "epoch": 10.07, + "learning_rate": 1.542627883650953e-05, + "loss": 2.3403, + "theoretical_loss": 3.304412050637535, + "tokens_seen": 3200276480 + }, + { + "epoch": 10.07, + "learning_rate": 1.5416248746238715e-05, + "loss": 2.5189, + "theoretical_loss": 3.304406915915086, + "tokens_seen": 3200342016 + }, + { + "epoch": 10.07, + "learning_rate": 1.5406218655967906e-05, + "loss": 2.4344, + "theoretical_loss": 3.304401781327225, + "tokens_seen": 3200407552 + }, + { + "epoch": 10.07, + "learning_rate": 1.539618856569709e-05, + "loss": 2.459, + "theoretical_loss": 3.304396646873945, + "tokens_seen": 3200473088 + }, + { + "epoch": 10.07, + "learning_rate": 1.5386158475426278e-05, + "loss": 2.582, + "theoretical_loss": 3.30439151255524, + "tokens_seen": 3200538624 + }, + { + "epoch": 10.07, + "learning_rate": 1.5376128385155466e-05, + "loss": 2.4781, + "theoretical_loss": 3.3043863783711034, + "tokens_seen": 3200604160 + }, + { + "epoch": 10.07, + "learning_rate": 1.5366098294884654e-05, + "loss": 2.4297, + "theoretical_loss": 3.3043812443215295, + "tokens_seen": 3200669696 + }, + { + "epoch": 10.07, + "learning_rate": 1.5356068204613842e-05, + "loss": 2.426, + "theoretical_loss": 3.3043761104065115, + "tokens_seen": 3200735232 + }, + { + "epoch": 10.07, + "learning_rate": 1.534603811434303e-05, + "loss": 2.5187, + "theoretical_loss": 3.304370976626043, + "tokens_seen": 3200800768 + }, + { + "epoch": 10.07, + "learning_rate": 1.5336008024072218e-05, + "loss": 2.2256, + "theoretical_loss": 3.3043658429801184, + "tokens_seen": 3200866304 + }, + { + "epoch": 10.07, + "learning_rate": 1.5325977933801402e-05, + "loss": 2.5279, + "theoretical_loss": 3.3043607094687313, + "tokens_seen": 3200931840 + }, + { + "epoch": 10.07, + "learning_rate": 1.5315947843530593e-05, + "loss": 2.231, + "theoretical_loss": 3.304355576091875, + "tokens_seen": 3200997376 + }, + { + "epoch": 10.07, + "learning_rate": 1.530591775325978e-05, + "loss": 2.4057, + "theoretical_loss": 3.304350442849543, + "tokens_seen": 3201062912 + }, + { + "epoch": 10.07, + "learning_rate": 1.5295887662988966e-05, + "loss": 2.6065, + "theoretical_loss": 3.30434530974173, + "tokens_seen": 3201128448 + }, + { + "epoch": 10.07, + "learning_rate": 1.5285857572718154e-05, + "loss": 2.4404, + "theoretical_loss": 3.304340176768429, + "tokens_seen": 3201193984 + }, + { + "epoch": 10.07, + "learning_rate": 1.527582748244734e-05, + "loss": 2.599, + "theoretical_loss": 3.3043350439296337, + "tokens_seen": 3201259520 + }, + { + "epoch": 10.07, + "learning_rate": 1.526579739217653e-05, + "loss": 2.3913, + "theoretical_loss": 3.3043299112253384, + "tokens_seen": 3201325056 + }, + { + "epoch": 10.07, + "learning_rate": 1.5255767301905717e-05, + "loss": 2.5968, + "theoretical_loss": 3.304324778655536, + "tokens_seen": 3201390592 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3498270, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.691161870956421, + "objective/train/theoretical_loss": 3.304322212421068, + "objective/train/tokens_used": 3221883360, + "theoretical_loss": 3.304322212421068, + "tokens_seen": 3201423360 + }, + { + "epoch": 10.07, + "learning_rate": 1.5245737211634905e-05, + "loss": 2.4475, + "theoretical_loss": 3.3043196462202205, + "tokens_seen": 3201456128 + }, + { + "epoch": 10.07, + "learning_rate": 1.5235707121364092e-05, + "loss": 2.5506, + "theoretical_loss": 3.304314513919386, + "tokens_seen": 3201521664 + }, + { + "epoch": 10.07, + "learning_rate": 1.5225677031093281e-05, + "loss": 2.3357, + "theoretical_loss": 3.304309381753026, + "tokens_seen": 3201587200 + }, + { + "epoch": 10.07, + "learning_rate": 1.5215646940822467e-05, + "loss": 2.4167, + "theoretical_loss": 3.304304249721134, + "tokens_seen": 3201652736 + }, + { + "epoch": 10.07, + "learning_rate": 1.5205616850551655e-05, + "loss": 2.4276, + "theoretical_loss": 3.3042991178237044, + "tokens_seen": 3201718272 + }, + { + "epoch": 10.07, + "learning_rate": 1.5195586760280843e-05, + "loss": 2.3974, + "theoretical_loss": 3.3042939860607303, + "tokens_seen": 3201783808 + }, + { + "epoch": 10.07, + "learning_rate": 1.518555667001003e-05, + "loss": 2.3673, + "theoretical_loss": 3.304288854432205, + "tokens_seen": 3201849344 + }, + { + "epoch": 10.07, + "learning_rate": 1.5175526579739219e-05, + "loss": 2.3657, + "theoretical_loss": 3.3042837229381234, + "tokens_seen": 3201914880 + }, + { + "epoch": 10.07, + "learning_rate": 1.5165496489468405e-05, + "loss": 2.7348, + "theoretical_loss": 3.3042785915784787, + "tokens_seen": 3201980416 + }, + { + "epoch": 10.07, + "learning_rate": 1.5155466399197593e-05, + "loss": 2.5565, + "theoretical_loss": 3.304273460353264, + "tokens_seen": 3202045952 + }, + { + "epoch": 10.07, + "learning_rate": 1.5145436308926781e-05, + "loss": 2.51, + "theoretical_loss": 3.3042683292624737, + "tokens_seen": 3202111488 + }, + { + "epoch": 10.07, + "learning_rate": 1.5135406218655967e-05, + "loss": 2.4523, + "theoretical_loss": 3.304263198306102, + "tokens_seen": 3202177024 + }, + { + "epoch": 10.07, + "learning_rate": 1.5125376128385157e-05, + "loss": 2.4782, + "theoretical_loss": 3.3042580674841413, + "tokens_seen": 3202242560 + }, + { + "epoch": 10.07, + "learning_rate": 1.5115346038114343e-05, + "loss": 2.1832, + "theoretical_loss": 3.3042529367965865, + "tokens_seen": 3202308096 + }, + { + "epoch": 10.07, + "learning_rate": 1.5105315947843531e-05, + "loss": 2.339, + "theoretical_loss": 3.304247806243431, + "tokens_seen": 3202373632 + }, + { + "epoch": 10.07, + "learning_rate": 1.5095285857572719e-05, + "loss": 2.4442, + "theoretical_loss": 3.304242675824668, + "tokens_seen": 3202439168 + }, + { + "epoch": 10.07, + "learning_rate": 1.5085255767301907e-05, + "loss": 2.6615, + "theoretical_loss": 3.3042375455402917, + "tokens_seen": 3202504704 + }, + { + "epoch": 10.07, + "learning_rate": 1.5075225677031095e-05, + "loss": 2.5252, + "theoretical_loss": 3.304232415390296, + "tokens_seen": 3202570240 + }, + { + "epoch": 10.07, + "learning_rate": 1.506519558676028e-05, + "loss": 2.4926, + "theoretical_loss": 3.3042272853746746, + "tokens_seen": 3202635776 + }, + { + "epoch": 10.07, + "learning_rate": 1.5055165496489469e-05, + "loss": 2.4475, + "theoretical_loss": 3.304222155493421, + "tokens_seen": 3202701312 + }, + { + "epoch": 10.07, + "learning_rate": 1.5045135406218655e-05, + "loss": 2.3955, + "theoretical_loss": 3.304217025746529, + "tokens_seen": 3202766848 + }, + { + "epoch": 10.07, + "learning_rate": 1.5035105315947844e-05, + "loss": 2.5709, + "theoretical_loss": 3.304211896133992, + "tokens_seen": 3202832384 + }, + { + "epoch": 10.07, + "learning_rate": 1.5025075225677032e-05, + "loss": 2.3127, + "theoretical_loss": 3.304206766655804, + "tokens_seen": 3202897920 + }, + { + "epoch": 10.07, + "learning_rate": 1.5015045135406219e-05, + "loss": 2.583, + "theoretical_loss": 3.3042016373119587, + "tokens_seen": 3202963456 + }, + { + "epoch": 10.07, + "learning_rate": 1.5005015045135406e-05, + "loss": 2.5027, + "theoretical_loss": 3.30419650810245, + "tokens_seen": 3203028992 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3499017, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.916851282119751, + "objective/train/theoretical_loss": 3.30419394354807, + "objective/train/tokens_used": 3223521760, + "theoretical_loss": 3.30419394354807, + "tokens_seen": 3203061760 + }, + { + "epoch": 10.07, + "learning_rate": 1.4994984954864593e-05, + "loss": 2.2804, + "theoretical_loss": 3.304191379027272, + "tokens_seen": 3203094528 + }, + { + "epoch": 10.07, + "learning_rate": 1.4984954864593782e-05, + "loss": 2.498, + "theoretical_loss": 3.3041862500864174, + "tokens_seen": 3203160064 + }, + { + "epoch": 10.07, + "learning_rate": 1.497492477432297e-05, + "loss": 2.3293, + "theoretical_loss": 3.304181121279881, + "tokens_seen": 3203225600 + }, + { + "epoch": 10.07, + "learning_rate": 1.4964894684052156e-05, + "loss": 2.6353, + "theoretical_loss": 3.3041759926076555, + "tokens_seen": 3203291136 + }, + { + "epoch": 10.07, + "learning_rate": 1.4954864593781344e-05, + "loss": 2.3928, + "theoretical_loss": 3.3041708640697354, + "tokens_seen": 3203356672 + }, + { + "epoch": 10.07, + "learning_rate": 1.494483450351053e-05, + "loss": 2.5703, + "theoretical_loss": 3.3041657356661145, + "tokens_seen": 3203422208 + }, + { + "epoch": 10.07, + "learning_rate": 1.493480441323972e-05, + "loss": 2.4726, + "theoretical_loss": 3.3041606073967857, + "tokens_seen": 3203487744 + }, + { + "epoch": 10.07, + "learning_rate": 1.4924774322968908e-05, + "loss": 2.3865, + "theoretical_loss": 3.304155479261744, + "tokens_seen": 3203553280 + }, + { + "epoch": 10.07, + "learning_rate": 1.4914744232698094e-05, + "loss": 2.5351, + "theoretical_loss": 3.304150351260982, + "tokens_seen": 3203618816 + }, + { + "epoch": 10.07, + "learning_rate": 1.4904714142427282e-05, + "loss": 2.4838, + "theoretical_loss": 3.3041452233944932, + "tokens_seen": 3203684352 + }, + { + "epoch": 10.07, + "learning_rate": 1.489468405215647e-05, + "loss": 2.314, + "theoretical_loss": 3.304140095662273, + "tokens_seen": 3203749888 + }, + { + "epoch": 10.07, + "learning_rate": 1.4884653961885658e-05, + "loss": 2.4573, + "theoretical_loss": 3.304134968064314, + "tokens_seen": 3203815424 + }, + { + "epoch": 10.07, + "learning_rate": 1.4874623871614846e-05, + "loss": 2.3669, + "theoretical_loss": 3.3041298406006097, + "tokens_seen": 3203880960 + }, + { + "epoch": 10.07, + "learning_rate": 1.4864593781344032e-05, + "loss": 2.2377, + "theoretical_loss": 3.304124713271154, + "tokens_seen": 3203946496 + }, + { + "epoch": 10.07, + "learning_rate": 1.485456369107322e-05, + "loss": 2.3899, + "theoretical_loss": 3.3041195860759416, + "tokens_seen": 3204012032 + }, + { + "epoch": 10.07, + "learning_rate": 1.4844533600802408e-05, + "loss": 2.3087, + "theoretical_loss": 3.304114459014965, + "tokens_seen": 3204077568 + }, + { + "epoch": 10.07, + "learning_rate": 1.4834503510531596e-05, + "loss": 2.4227, + "theoretical_loss": 3.3041093320882187, + "tokens_seen": 3204143104 + }, + { + "epoch": 10.07, + "learning_rate": 1.4824473420260782e-05, + "loss": 2.2226, + "theoretical_loss": 3.3041042052956957, + "tokens_seen": 3204208640 + }, + { + "epoch": 10.07, + "learning_rate": 1.481444332998997e-05, + "loss": 2.522, + "theoretical_loss": 3.3040990786373907, + "tokens_seen": 3204274176 + }, + { + "epoch": 10.07, + "learning_rate": 1.4804413239719158e-05, + "loss": 2.5915, + "theoretical_loss": 3.3040939521132966, + "tokens_seen": 3204339712 + }, + { + "epoch": 10.07, + "learning_rate": 1.4794383149448345e-05, + "loss": 2.3095, + "theoretical_loss": 3.3040888257234076, + "tokens_seen": 3204405248 + }, + { + "epoch": 10.07, + "learning_rate": 1.4784353059177533e-05, + "loss": 2.5954, + "theoretical_loss": 3.3040836994677174, + "tokens_seen": 3204470784 + }, + { + "epoch": 10.07, + "learning_rate": 1.477432296890672e-05, + "loss": 2.3644, + "theoretical_loss": 3.3040785733462195, + "tokens_seen": 3204536320 + }, + { + "epoch": 10.07, + "learning_rate": 1.4764292878635907e-05, + "loss": 2.3343, + "theoretical_loss": 3.304073447358908, + "tokens_seen": 3204601856 + }, + { + "epoch": 10.07, + "learning_rate": 1.4754262788365097e-05, + "loss": 2.6694, + "theoretical_loss": 3.3040683215057767, + "tokens_seen": 3204667392 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3500374, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4184939861297607, + "objective/train/theoretical_loss": 3.3040657586295263, + "objective/train/tokens_used": 3225160160, + "theoretical_loss": 3.3040657586295263, + "tokens_seen": 3204700160 + }, + { + "epoch": 10.07, + "learning_rate": 1.4744232698094283e-05, + "loss": 2.5115, + "theoretical_loss": 3.3040631957868185, + "tokens_seen": 3204732928 + }, + { + "epoch": 10.07, + "learning_rate": 1.4734202607823471e-05, + "loss": 2.4912, + "theoretical_loss": 3.304058070202028, + "tokens_seen": 3204798464 + }, + { + "epoch": 10.07, + "learning_rate": 1.4724172517552657e-05, + "loss": 2.3634, + "theoretical_loss": 3.304052944751399, + "tokens_seen": 3204864000 + }, + { + "epoch": 10.07, + "learning_rate": 1.4714142427281845e-05, + "loss": 2.5091, + "theoretical_loss": 3.3040478194349245, + "tokens_seen": 3204929536 + }, + { + "epoch": 10.07, + "learning_rate": 1.4704112337011035e-05, + "loss": 2.2543, + "theoretical_loss": 3.304042694252599, + "tokens_seen": 3204995072 + }, + { + "epoch": 10.07, + "learning_rate": 1.4694082246740221e-05, + "loss": 2.4398, + "theoretical_loss": 3.3040375692044157, + "tokens_seen": 3205060608 + }, + { + "epoch": 10.07, + "learning_rate": 1.4684052156469409e-05, + "loss": 2.4761, + "theoretical_loss": 3.304032444290369, + "tokens_seen": 3205126144 + }, + { + "epoch": 10.07, + "learning_rate": 1.4674022066198595e-05, + "loss": 2.3501, + "theoretical_loss": 3.3040273195104515, + "tokens_seen": 3205191680 + }, + { + "epoch": 10.07, + "learning_rate": 1.4663991975927783e-05, + "loss": 2.5218, + "theoretical_loss": 3.3040221948646584, + "tokens_seen": 3205257216 + }, + { + "epoch": 10.07, + "learning_rate": 1.4653961885656971e-05, + "loss": 2.3918, + "theoretical_loss": 3.304017070352982, + "tokens_seen": 3205322752 + }, + { + "epoch": 10.07, + "learning_rate": 1.4643931795386159e-05, + "loss": 2.4399, + "theoretical_loss": 3.3040119459754176, + "tokens_seen": 3205388288 + }, + { + "epoch": 10.07, + "learning_rate": 1.4633901705115347e-05, + "loss": 2.5341, + "theoretical_loss": 3.3040068217319574, + "tokens_seen": 3205453824 + }, + { + "epoch": 10.07, + "learning_rate": 1.4623871614844533e-05, + "loss": 2.231, + "theoretical_loss": 3.3040016976225965, + "tokens_seen": 3205519360 + }, + { + "epoch": 10.07, + "learning_rate": 1.4613841524573723e-05, + "loss": 2.4062, + "theoretical_loss": 3.3039965736473276, + "tokens_seen": 3205584896 + }, + { + "epoch": 10.07, + "learning_rate": 1.4603811434302909e-05, + "loss": 2.4038, + "theoretical_loss": 3.3039914498061447, + "tokens_seen": 3205650432 + }, + { + "epoch": 10.07, + "learning_rate": 1.4593781344032097e-05, + "loss": 2.4142, + "theoretical_loss": 3.303986326099042, + "tokens_seen": 3205715968 + }, + { + "epoch": 10.07, + "learning_rate": 1.4583751253761285e-05, + "loss": 2.4375, + "theoretical_loss": 3.303981202526013, + "tokens_seen": 3205781504 + }, + { + "epoch": 10.07, + "learning_rate": 1.457372116349047e-05, + "loss": 2.3531, + "theoretical_loss": 3.303976079087051, + "tokens_seen": 3205847040 + }, + { + "epoch": 10.07, + "learning_rate": 1.456369107321966e-05, + "loss": 2.4162, + "theoretical_loss": 3.303970955782151, + "tokens_seen": 3205912576 + }, + { + "epoch": 10.07, + "learning_rate": 1.4553660982948847e-05, + "loss": 2.4876, + "theoretical_loss": 3.303965832611305, + "tokens_seen": 3205978112 + }, + { + "epoch": 10.07, + "learning_rate": 1.4543630892678034e-05, + "loss": 2.3377, + "theoretical_loss": 3.3039607095745085, + "tokens_seen": 3206043648 + }, + { + "epoch": 10.07, + "learning_rate": 1.4533600802407222e-05, + "loss": 2.4985, + "theoretical_loss": 3.3039555866717536, + "tokens_seen": 3206109184 + }, + { + "epoch": 10.07, + "learning_rate": 1.4523570712136409e-05, + "loss": 2.4396, + "theoretical_loss": 3.3039504639030355, + "tokens_seen": 3206174720 + }, + { + "epoch": 10.07, + "learning_rate": 1.4513540621865598e-05, + "loss": 2.4405, + "theoretical_loss": 3.303945341268347, + "tokens_seen": 3206240256 + }, + { + "epoch": 10.07, + "learning_rate": 1.4503510531594784e-05, + "loss": 2.3897, + "theoretical_loss": 3.303940218767682, + "tokens_seen": 3206305792 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3501823, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8027145862579346, + "objective/train/theoretical_loss": 3.303937657567607, + "objective/train/tokens_used": 3226798560, + "theoretical_loss": 3.303937657567607, + "tokens_seen": 3206338560 + }, + { + "epoch": 10.07, + "learning_rate": 1.4493480441323972e-05, + "loss": 2.5035, + "theoretical_loss": 3.3039350964010348, + "tokens_seen": 3206371328 + }, + { + "epoch": 10.07, + "learning_rate": 1.4483450351053158e-05, + "loss": 2.5446, + "theoretical_loss": 3.303929974168399, + "tokens_seen": 3206436864 + }, + { + "epoch": 10.07, + "learning_rate": 1.4473420260782348e-05, + "loss": 2.5655, + "theoretical_loss": 3.3039248520697675, + "tokens_seen": 3206502400 + }, + { + "epoch": 10.07, + "learning_rate": 1.4463390170511536e-05, + "loss": 2.3083, + "theoretical_loss": 3.303919730105135, + "tokens_seen": 3206567936 + }, + { + "epoch": 10.07, + "learning_rate": 1.4453360080240722e-05, + "loss": 2.3468, + "theoretical_loss": 3.303914608274495, + "tokens_seen": 3206633472 + }, + { + "epoch": 10.07, + "learning_rate": 1.444332998996991e-05, + "loss": 2.5142, + "theoretical_loss": 3.303909486577841, + "tokens_seen": 3206699008 + }, + { + "epoch": 10.07, + "learning_rate": 1.4433299899699096e-05, + "loss": 2.4206, + "theoretical_loss": 3.3039043650151676, + "tokens_seen": 3206764544 + }, + { + "epoch": 10.07, + "learning_rate": 1.4423269809428286e-05, + "loss": 2.502, + "theoretical_loss": 3.3038992435864674, + "tokens_seen": 3206830080 + }, + { + "epoch": 10.07, + "learning_rate": 1.4413239719157474e-05, + "loss": 2.5503, + "theoretical_loss": 3.3038941222917346, + "tokens_seen": 3206895616 + }, + { + "epoch": 10.07, + "learning_rate": 1.440320962888666e-05, + "loss": 2.6437, + "theoretical_loss": 3.3038890011309636, + "tokens_seen": 3206961152 + }, + { + "epoch": 10.07, + "learning_rate": 1.4393179538615848e-05, + "loss": 2.6506, + "theoretical_loss": 3.3038838801041472, + "tokens_seen": 3207026688 + }, + { + "epoch": 10.07, + "learning_rate": 1.4383149448345034e-05, + "loss": 2.3805, + "theoretical_loss": 3.3038787592112797, + "tokens_seen": 3207092224 + }, + { + "epoch": 10.07, + "learning_rate": 1.4373119358074224e-05, + "loss": 2.4302, + "theoretical_loss": 3.3038736384523544, + "tokens_seen": 3207157760 + }, + { + "epoch": 10.07, + "learning_rate": 1.4363089267803411e-05, + "loss": 2.3283, + "theoretical_loss": 3.3038685178273655, + "tokens_seen": 3207223296 + }, + { + "epoch": 10.07, + "learning_rate": 1.4353059177532598e-05, + "loss": 2.4474, + "theoretical_loss": 3.3038633973363067, + "tokens_seen": 3207288832 + }, + { + "epoch": 10.07, + "learning_rate": 1.4343029087261786e-05, + "loss": 2.5996, + "theoretical_loss": 3.303858276979172, + "tokens_seen": 3207354368 + }, + { + "epoch": 10.07, + "learning_rate": 1.4332998996990973e-05, + "loss": 2.4582, + "theoretical_loss": 3.303853156755954, + "tokens_seen": 3207419904 + }, + { + "epoch": 10.07, + "learning_rate": 1.4322968906720161e-05, + "loss": 2.6184, + "theoretical_loss": 3.3038480366666483, + "tokens_seen": 3207485440 + }, + { + "epoch": 10.07, + "learning_rate": 1.4312938816449348e-05, + "loss": 2.5223, + "theoretical_loss": 3.3038429167112473, + "tokens_seen": 3207550976 + }, + { + "epoch": 10.07, + "learning_rate": 1.4302908726178535e-05, + "loss": 2.3923, + "theoretical_loss": 3.303837796889745, + "tokens_seen": 3207616512 + }, + { + "epoch": 10.07, + "learning_rate": 1.4292878635907723e-05, + "loss": 2.5973, + "theoretical_loss": 3.3038326772021356, + "tokens_seen": 3207682048 + }, + { + "epoch": 10.07, + "learning_rate": 1.4282848545636911e-05, + "loss": 2.3054, + "theoretical_loss": 3.3038275576484124, + "tokens_seen": 3207747584 + }, + { + "epoch": 10.07, + "learning_rate": 1.42728184553661e-05, + "loss": 2.3936, + "theoretical_loss": 3.3038224382285692, + "tokens_seen": 3207813120 + }, + { + "epoch": 10.07, + "learning_rate": 1.4262788365095285e-05, + "loss": 2.5402, + "theoretical_loss": 3.3038173189426003, + "tokens_seen": 3207878656 + }, + { + "epoch": 10.07, + "learning_rate": 1.4252758274824473e-05, + "loss": 2.3484, + "theoretical_loss": 3.3038121997904986, + "tokens_seen": 3207944192 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3502576, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5598480701446533, + "objective/train/theoretical_loss": 3.3038096402646464, + "objective/train/tokens_used": 3228436960, + "theoretical_loss": 3.3038096402646464, + "tokens_seen": 3207976960 + }, + { + "epoch": 10.07, + "learning_rate": 1.4242728184553661e-05, + "loss": 2.5542, + "theoretical_loss": 3.3038070807722586, + "tokens_seen": 3208009728 + }, + { + "epoch": 10.07, + "learning_rate": 1.4232698094282849e-05, + "loss": 2.5004, + "theoretical_loss": 3.303801961887874, + "tokens_seen": 3208075264 + }, + { + "epoch": 10.07, + "learning_rate": 1.4222668004012037e-05, + "loss": 2.3771, + "theoretical_loss": 3.303796843137338, + "tokens_seen": 3208140800 + }, + { + "epoch": 10.07, + "learning_rate": 1.4212637913741223e-05, + "loss": 2.5475, + "theoretical_loss": 3.3037917245206447, + "tokens_seen": 3208206336 + }, + { + "epoch": 10.07, + "learning_rate": 1.4202607823470411e-05, + "loss": 2.5733, + "theoretical_loss": 3.303786606037788, + "tokens_seen": 3208271872 + }, + { + "epoch": 10.07, + "learning_rate": 1.41925777331996e-05, + "loss": 2.4698, + "theoretical_loss": 3.3037814876887612, + "tokens_seen": 3208337408 + }, + { + "epoch": 10.07, + "learning_rate": 1.4182547642928787e-05, + "loss": 2.4521, + "theoretical_loss": 3.3037763694735593, + "tokens_seen": 3208402944 + }, + { + "epoch": 10.07, + "learning_rate": 1.4172517552657975e-05, + "loss": 2.1972, + "theoretical_loss": 3.3037712513921744, + "tokens_seen": 3208468480 + }, + { + "epoch": 10.07, + "learning_rate": 1.4162487462387161e-05, + "loss": 2.5105, + "theoretical_loss": 3.3037661334446016, + "tokens_seen": 3208534016 + }, + { + "epoch": 10.07, + "learning_rate": 1.4152457372116349e-05, + "loss": 2.4001, + "theoretical_loss": 3.3037610156308337, + "tokens_seen": 3208599552 + }, + { + "epoch": 10.07, + "learning_rate": 1.4142427281845538e-05, + "loss": 2.524, + "theoretical_loss": 3.303755897950865, + "tokens_seen": 3208665088 + }, + { + "epoch": 10.07, + "learning_rate": 1.4132397191574725e-05, + "loss": 2.3822, + "theoretical_loss": 3.3037507804046893, + "tokens_seen": 3208730624 + }, + { + "epoch": 10.07, + "learning_rate": 1.4122367101303913e-05, + "loss": 2.6167, + "theoretical_loss": 3.3037456629923003, + "tokens_seen": 3208796160 + }, + { + "epoch": 10.07, + "learning_rate": 1.4112337011033099e-05, + "loss": 2.4019, + "theoretical_loss": 3.3037405457136915, + "tokens_seen": 3208861696 + }, + { + "epoch": 10.07, + "learning_rate": 1.4102306920762287e-05, + "loss": 2.4107, + "theoretical_loss": 3.303735428568857, + "tokens_seen": 3208927232 + }, + { + "epoch": 10.07, + "learning_rate": 1.4092276830491475e-05, + "loss": 2.5312, + "theoretical_loss": 3.30373031155779, + "tokens_seen": 3208992768 + }, + { + "epoch": 10.07, + "learning_rate": 1.4082246740220662e-05, + "loss": 2.3918, + "theoretical_loss": 3.303725194680485, + "tokens_seen": 3209058304 + }, + { + "epoch": 10.07, + "learning_rate": 1.407221664994985e-05, + "loss": 2.4187, + "theoretical_loss": 3.303720077936936, + "tokens_seen": 3209123840 + }, + { + "epoch": 10.07, + "learning_rate": 1.4062186559679037e-05, + "loss": 2.4613, + "theoretical_loss": 3.303714961327136, + "tokens_seen": 3209189376 + }, + { + "epoch": 10.07, + "learning_rate": 1.4052156469408226e-05, + "loss": 2.6369, + "theoretical_loss": 3.3037098448510784, + "tokens_seen": 3209254912 + }, + { + "epoch": 10.07, + "learning_rate": 1.4042126379137412e-05, + "loss": 2.4542, + "theoretical_loss": 3.3037047285087584, + "tokens_seen": 3209320448 + }, + { + "epoch": 10.07, + "learning_rate": 1.40320962888666e-05, + "loss": 2.4833, + "theoretical_loss": 3.3036996123001687, + "tokens_seen": 3209385984 + }, + { + "epoch": 10.07, + "learning_rate": 1.4022066198595788e-05, + "loss": 2.5165, + "theoretical_loss": 3.303694496225303, + "tokens_seen": 3209451520 + }, + { + "epoch": 10.07, + "learning_rate": 1.4012036108324974e-05, + "loss": 2.6114, + "theoretical_loss": 3.303689380284156, + "tokens_seen": 3209517056 + }, + { + "epoch": 10.07, + "learning_rate": 1.4002006018054164e-05, + "loss": 2.5355, + "theoretical_loss": 3.303684264476721, + "tokens_seen": 3209582592 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3504043, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.9048463106155396, + "objective/train/theoretical_loss": 3.303681706623143, + "objective/train/tokens_used": 3230075360, + "theoretical_loss": 3.303681706623143, + "tokens_seen": 3209615360 + }, + { + "epoch": 10.07, + "learning_rate": 1.399197592778335e-05, + "loss": 2.1408, + "theoretical_loss": 3.303679148802991, + "tokens_seen": 3209648128 + }, + { + "epoch": 10.07, + "learning_rate": 1.3981945837512538e-05, + "loss": 2.3457, + "theoretical_loss": 3.3036740332629613, + "tokens_seen": 3209713664 + }, + { + "epoch": 10.07, + "learning_rate": 1.3971915747241726e-05, + "loss": 2.6819, + "theoretical_loss": 3.303668917856624, + "tokens_seen": 3209779200 + }, + { + "epoch": 10.07, + "learning_rate": 1.3961885656970912e-05, + "loss": 2.5926, + "theoretical_loss": 3.303663802583974, + "tokens_seen": 3209844736 + }, + { + "epoch": 10.07, + "learning_rate": 1.3951855566700102e-05, + "loss": 2.404, + "theoretical_loss": 3.303658687445005, + "tokens_seen": 3209910272 + }, + { + "epoch": 10.07, + "learning_rate": 1.3941825476429288e-05, + "loss": 2.4037, + "theoretical_loss": 3.3036535724397105, + "tokens_seen": 3209975808 + }, + { + "epoch": 10.07, + "learning_rate": 1.3931795386158476e-05, + "loss": 2.7602, + "theoretical_loss": 3.3036484575680842, + "tokens_seen": 3210041344 + }, + { + "epoch": 10.07, + "learning_rate": 1.3921765295887662e-05, + "loss": 2.4043, + "theoretical_loss": 3.30364334283012, + "tokens_seen": 3210106880 + }, + { + "epoch": 10.07, + "learning_rate": 1.3911735205616852e-05, + "loss": 2.5787, + "theoretical_loss": 3.303638228225812, + "tokens_seen": 3210172416 + }, + { + "epoch": 10.07, + "learning_rate": 1.390170511534604e-05, + "loss": 2.3462, + "theoretical_loss": 3.3036331137551533, + "tokens_seen": 3210237952 + }, + { + "epoch": 10.07, + "learning_rate": 1.3891675025075226e-05, + "loss": 2.2513, + "theoretical_loss": 3.303627999418138, + "tokens_seen": 3210303488 + }, + { + "epoch": 10.07, + "learning_rate": 1.3881644934804414e-05, + "loss": 2.4259, + "theoretical_loss": 3.30362288521476, + "tokens_seen": 3210369024 + }, + { + "epoch": 10.07, + "learning_rate": 1.38716148445336e-05, + "loss": 2.498, + "theoretical_loss": 3.303617771145013, + "tokens_seen": 3210434560 + }, + { + "epoch": 10.07, + "learning_rate": 1.386158475426279e-05, + "loss": 2.487, + "theoretical_loss": 3.303612657208891, + "tokens_seen": 3210500096 + }, + { + "epoch": 10.07, + "learning_rate": 1.3851554663991977e-05, + "loss": 2.3301, + "theoretical_loss": 3.303607543406388, + "tokens_seen": 3210565632 + }, + { + "epoch": 10.07, + "learning_rate": 1.3841524573721163e-05, + "loss": 2.5828, + "theoretical_loss": 3.303602429737497, + "tokens_seen": 3210631168 + }, + { + "epoch": 10.07, + "learning_rate": 1.3831494483450351e-05, + "loss": 2.2509, + "theoretical_loss": 3.303597316202212, + "tokens_seen": 3210696704 + }, + { + "epoch": 10.07, + "learning_rate": 1.3821464393179538e-05, + "loss": 2.4165, + "theoretical_loss": 3.3035922028005267, + "tokens_seen": 3210762240 + }, + { + "epoch": 10.07, + "learning_rate": 1.3811434302908727e-05, + "loss": 2.4018, + "theoretical_loss": 3.303587089532435, + "tokens_seen": 3210827776 + }, + { + "epoch": 10.07, + "learning_rate": 1.3801404212637915e-05, + "loss": 2.4623, + "theoretical_loss": 3.3035819763979313, + "tokens_seen": 3210893312 + }, + { + "epoch": 10.07, + "learning_rate": 1.3791374122367101e-05, + "loss": 2.2746, + "theoretical_loss": 3.3035768633970086, + "tokens_seen": 3210958848 + }, + { + "epoch": 10.07, + "learning_rate": 1.378134403209629e-05, + "loss": 2.3135, + "theoretical_loss": 3.3035717505296613, + "tokens_seen": 3211024384 + }, + { + "epoch": 10.07, + "learning_rate": 1.3771313941825477e-05, + "loss": 2.3174, + "theoretical_loss": 3.3035666377958823, + "tokens_seen": 3211089920 + }, + { + "epoch": 10.07, + "learning_rate": 1.3761283851554665e-05, + "loss": 2.5076, + "theoretical_loss": 3.3035615251956663, + "tokens_seen": 3211155456 + }, + { + "epoch": 10.07, + "learning_rate": 1.3751253761283851e-05, + "loss": 2.3373, + "theoretical_loss": 3.303556412729007, + "tokens_seen": 3211220992 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3504612, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5325920581817627, + "objective/train/theoretical_loss": 3.3035538565457587, + "objective/train/tokens_used": 3231713760, + "theoretical_loss": 3.3035538565457587, + "tokens_seen": 3211253760 + }, + { + "epoch": 10.07, + "learning_rate": 1.3741223671013039e-05, + "loss": 2.4347, + "theoretical_loss": 3.303551300395897, + "tokens_seen": 3211286528 + }, + { + "epoch": 10.07, + "learning_rate": 1.3731193580742227e-05, + "loss": 2.5187, + "theoretical_loss": 3.3035461881963317, + "tokens_seen": 3211352064 + }, + { + "epoch": 10.07, + "learning_rate": 1.3721163490471415e-05, + "loss": 2.3479, + "theoretical_loss": 3.303541076130304, + "tokens_seen": 3211417600 + }, + { + "epoch": 10.07, + "learning_rate": 1.3711133400200603e-05, + "loss": 2.2388, + "theoretical_loss": 3.303535964197808, + "tokens_seen": 3211483136 + }, + { + "epoch": 10.07, + "learning_rate": 1.3701103309929789e-05, + "loss": 2.2925, + "theoretical_loss": 3.303530852398837, + "tokens_seen": 3211548672 + }, + { + "epoch": 10.07, + "learning_rate": 1.3691073219658977e-05, + "loss": 2.6136, + "theoretical_loss": 3.3035257407333853, + "tokens_seen": 3211614208 + }, + { + "epoch": 10.07, + "learning_rate": 1.3681043129388165e-05, + "loss": 2.0559, + "theoretical_loss": 3.3035206292014463, + "tokens_seen": 3211679744 + }, + { + "epoch": 10.07, + "learning_rate": 1.3671013039117353e-05, + "loss": 2.6748, + "theoretical_loss": 3.3035155178030142, + "tokens_seen": 3211745280 + }, + { + "epoch": 10.07, + "learning_rate": 1.366098294884654e-05, + "loss": 2.5565, + "theoretical_loss": 3.3035104065380825, + "tokens_seen": 3211810816 + }, + { + "epoch": 10.07, + "learning_rate": 1.3650952858575727e-05, + "loss": 2.1849, + "theoretical_loss": 3.3035052954066453, + "tokens_seen": 3211876352 + }, + { + "epoch": 10.07, + "learning_rate": 1.3640922768304915e-05, + "loss": 2.5088, + "theoretical_loss": 3.303500184408696, + "tokens_seen": 3211941888 + }, + { + "epoch": 10.07, + "learning_rate": 1.3630892678034103e-05, + "loss": 2.4588, + "theoretical_loss": 3.3034950735442288, + "tokens_seen": 3212007424 + }, + { + "epoch": 10.07, + "learning_rate": 1.362086258776329e-05, + "loss": 2.1016, + "theoretical_loss": 3.303489962813237, + "tokens_seen": 3212072960 + }, + { + "epoch": 10.07, + "learning_rate": 1.3610832497492478e-05, + "loss": 2.3337, + "theoretical_loss": 3.3034848522157145, + "tokens_seen": 3212138496 + }, + { + "epoch": 10.07, + "learning_rate": 1.3600802407221665e-05, + "loss": 2.3754, + "theoretical_loss": 3.3034797417516555, + "tokens_seen": 3212204032 + }, + { + "epoch": 10.07, + "learning_rate": 1.3590772316950852e-05, + "loss": 2.2935, + "theoretical_loss": 3.3034746314210537, + "tokens_seen": 3212269568 + }, + { + "epoch": 10.07, + "learning_rate": 1.358074222668004e-05, + "loss": 2.4185, + "theoretical_loss": 3.303469521223902, + "tokens_seen": 3212335104 + }, + { + "epoch": 10.07, + "learning_rate": 1.3570712136409228e-05, + "loss": 2.4386, + "theoretical_loss": 3.3034644111601956, + "tokens_seen": 3212400640 + }, + { + "epoch": 10.07, + "learning_rate": 1.3560682046138416e-05, + "loss": 2.2837, + "theoretical_loss": 3.3034593012299274, + "tokens_seen": 3212466176 + }, + { + "epoch": 10.07, + "learning_rate": 1.3550651955867602e-05, + "loss": 2.1297, + "theoretical_loss": 3.3034541914330915, + "tokens_seen": 3212531712 + }, + { + "epoch": 10.07, + "learning_rate": 1.354062186559679e-05, + "loss": 2.3921, + "theoretical_loss": 3.3034490817696813, + "tokens_seen": 3212597248 + }, + { + "epoch": 10.07, + "learning_rate": 1.3530591775325978e-05, + "loss": 2.4171, + "theoretical_loss": 3.303443972239691, + "tokens_seen": 3212662784 + }, + { + "epoch": 10.07, + "learning_rate": 1.3520561685055166e-05, + "loss": 2.4447, + "theoretical_loss": 3.3034388628431146, + "tokens_seen": 3212728320 + }, + { + "epoch": 10.07, + "learning_rate": 1.3510531594784354e-05, + "loss": 2.5312, + "theoretical_loss": 3.303433753579945, + "tokens_seen": 3212793856 + }, + { + "epoch": 10.07, + "learning_rate": 1.350050150451354e-05, + "loss": 2.4364, + "theoretical_loss": 3.303428644450177, + "tokens_seen": 3212859392 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3505250, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3676419258117676, + "objective/train/theoretical_loss": 3.3034260899353165, + "objective/train/tokens_used": 3233352160, + "theoretical_loss": 3.3034260899353165, + "tokens_seen": 3212892160 + }, + { + "epoch": 10.07, + "learning_rate": 1.3490471414242728e-05, + "loss": 2.2914, + "theoretical_loss": 3.303423535453804, + "tokens_seen": 3212924928 + }, + { + "epoch": 10.07, + "learning_rate": 1.3480441323971916e-05, + "loss": 2.4908, + "theoretical_loss": 3.3034184265908197, + "tokens_seen": 3212990464 + }, + { + "epoch": 10.07, + "learning_rate": 1.3470411233701104e-05, + "loss": 2.2709, + "theoretical_loss": 3.3034133178612177, + "tokens_seen": 3213056000 + }, + { + "epoch": 10.07, + "learning_rate": 1.3460381143430292e-05, + "loss": 2.3995, + "theoretical_loss": 3.303408209264992, + "tokens_seen": 3213121536 + }, + { + "epoch": 10.07, + "learning_rate": 1.3450351053159478e-05, + "loss": 2.4186, + "theoretical_loss": 3.303403100802137, + "tokens_seen": 3213187072 + }, + { + "epoch": 10.07, + "learning_rate": 1.3440320962888667e-05, + "loss": 2.4124, + "theoretical_loss": 3.3033979924726458, + "tokens_seen": 3213252608 + }, + { + "epoch": 10.07, + "learning_rate": 1.3430290872617854e-05, + "loss": 2.6037, + "theoretical_loss": 3.303392884276512, + "tokens_seen": 3213318144 + }, + { + "epoch": 10.07, + "learning_rate": 1.3420260782347042e-05, + "loss": 2.5407, + "theoretical_loss": 3.30338777621373, + "tokens_seen": 3213383680 + }, + { + "epoch": 10.07, + "learning_rate": 1.3410230692076228e-05, + "loss": 2.7437, + "theoretical_loss": 3.3033826682842933, + "tokens_seen": 3213449216 + }, + { + "epoch": 10.07, + "learning_rate": 1.3400200601805416e-05, + "loss": 2.3623, + "theoretical_loss": 3.3033775604881956, + "tokens_seen": 3213514752 + }, + { + "epoch": 10.07, + "learning_rate": 1.3390170511534605e-05, + "loss": 2.3985, + "theoretical_loss": 3.3033724528254313, + "tokens_seen": 3213580288 + }, + { + "epoch": 10.07, + "learning_rate": 1.3380140421263791e-05, + "loss": 2.486, + "theoretical_loss": 3.3033673452959933, + "tokens_seen": 3213645824 + }, + { + "epoch": 10.07, + "learning_rate": 1.337011033099298e-05, + "loss": 2.1902, + "theoretical_loss": 3.303362237899876, + "tokens_seen": 3213711360 + }, + { + "epoch": 10.07, + "learning_rate": 1.3360080240722166e-05, + "loss": 2.4256, + "theoretical_loss": 3.3033571306370733, + "tokens_seen": 3213776896 + }, + { + "epoch": 10.07, + "learning_rate": 1.3350050150451353e-05, + "loss": 2.5524, + "theoretical_loss": 3.3033520235075784, + "tokens_seen": 3213842432 + }, + { + "epoch": 10.07, + "learning_rate": 1.3340020060180543e-05, + "loss": 2.353, + "theoretical_loss": 3.3033469165113853, + "tokens_seen": 3213907968 + }, + { + "epoch": 10.07, + "learning_rate": 1.332998996990973e-05, + "loss": 2.2643, + "theoretical_loss": 3.3033418096484883, + "tokens_seen": 3213973504 + }, + { + "epoch": 10.07, + "learning_rate": 1.3319959879638917e-05, + "loss": 2.4861, + "theoretical_loss": 3.303336702918881, + "tokens_seen": 3214039040 + }, + { + "epoch": 10.07, + "learning_rate": 1.3309929789368103e-05, + "loss": 2.4125, + "theoretical_loss": 3.3033315963225567, + "tokens_seen": 3214104576 + }, + { + "epoch": 10.07, + "learning_rate": 1.3299899699097293e-05, + "loss": 2.3998, + "theoretical_loss": 3.3033264898595096, + "tokens_seen": 3214170112 + }, + { + "epoch": 10.07, + "learning_rate": 1.328986960882648e-05, + "loss": 2.4565, + "theoretical_loss": 3.3033213835297337, + "tokens_seen": 3214235648 + }, + { + "epoch": 10.07, + "learning_rate": 1.3279839518555667e-05, + "loss": 2.3485, + "theoretical_loss": 3.3033162773332228, + "tokens_seen": 3214301184 + }, + { + "epoch": 10.07, + "learning_rate": 1.3269809428284855e-05, + "loss": 2.3701, + "theoretical_loss": 3.3033111712699696, + "tokens_seen": 3214366720 + }, + { + "epoch": 10.07, + "learning_rate": 1.3259779338014041e-05, + "loss": 2.5261, + "theoretical_loss": 3.30330606533997, + "tokens_seen": 3214432256 + }, + { + "epoch": 10.07, + "learning_rate": 1.324974924774323e-05, + "loss": 2.4352, + "theoretical_loss": 3.3033009595432157, + "tokens_seen": 3214497792 + }, + { + "epoch": 10.07, + "objective/train/docs_used": 3506259, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.493565320968628, + "objective/train/theoretical_loss": 3.303298406694804, + "objective/train/tokens_used": 3234990560, + "theoretical_loss": 3.303298406694804, + "tokens_seen": 3214530560 + }, + { + "epoch": 10.07, + "learning_rate": 1.3239719157472419e-05, + "loss": 2.4651, + "theoretical_loss": 3.3032958538797015, + "tokens_seen": 3214563328 + }, + { + "epoch": 10.07, + "learning_rate": 1.3229689067201605e-05, + "loss": 2.3577, + "theoretical_loss": 3.303290748349421, + "tokens_seen": 3214628864 + }, + { + "epoch": 10.07, + "learning_rate": 1.3219658976930793e-05, + "loss": 2.216, + "theoretical_loss": 3.3032856429523685, + "tokens_seen": 3214694400 + }, + { + "epoch": 10.07, + "learning_rate": 1.3209628886659979e-05, + "loss": 2.2575, + "theoretical_loss": 3.3032805376885372, + "tokens_seen": 3214759936 + }, + { + "epoch": 10.07, + "learning_rate": 1.3199598796389169e-05, + "loss": 2.2694, + "theoretical_loss": 3.303275432557921, + "tokens_seen": 3214825472 + }, + { + "epoch": 10.07, + "learning_rate": 1.3189568706118355e-05, + "loss": 2.1493, + "theoretical_loss": 3.3032703275605146, + "tokens_seen": 3214891008 + }, + { + "epoch": 10.07, + "learning_rate": 1.3179538615847543e-05, + "loss": 2.3464, + "theoretical_loss": 3.3032652226963104, + "tokens_seen": 3214956544 + }, + { + "epoch": 10.07, + "learning_rate": 1.316950852557673e-05, + "loss": 2.7426, + "theoretical_loss": 3.303260117965303, + "tokens_seen": 3215022080 + }, + { + "epoch": 10.07, + "learning_rate": 1.3159478435305918e-05, + "loss": 2.3357, + "theoretical_loss": 3.303255013367486, + "tokens_seen": 3215087616 + }, + { + "epoch": 10.08, + "learning_rate": 1.3149448345035106e-05, + "loss": 2.5295, + "theoretical_loss": 3.3032499089028535, + "tokens_seen": 3215153152 + }, + { + "epoch": 10.08, + "learning_rate": 1.3139418254764293e-05, + "loss": 2.486, + "theoretical_loss": 3.3032448045713987, + "tokens_seen": 3215218688 + }, + { + "epoch": 10.08, + "learning_rate": 1.312938816449348e-05, + "loss": 2.5536, + "theoretical_loss": 3.303239700373116, + "tokens_seen": 3215284224 + }, + { + "epoch": 10.08, + "learning_rate": 1.3119358074222668e-05, + "loss": 2.5286, + "theoretical_loss": 3.3032345963079988, + "tokens_seen": 3215349760 + }, + { + "epoch": 10.08, + "learning_rate": 1.3109327983951856e-05, + "loss": 2.3894, + "theoretical_loss": 3.3032294923760412, + "tokens_seen": 3215415296 + }, + { + "epoch": 10.08, + "learning_rate": 1.3099297893681044e-05, + "loss": 2.3452, + "theoretical_loss": 3.303224388577237, + "tokens_seen": 3215480832 + }, + { + "epoch": 10.08, + "learning_rate": 1.308926780341023e-05, + "loss": 2.5632, + "theoretical_loss": 3.30321928491158, + "tokens_seen": 3215546368 + }, + { + "epoch": 10.08, + "learning_rate": 1.3079237713139418e-05, + "loss": 2.2554, + "theoretical_loss": 3.3032141813790643, + "tokens_seen": 3215611904 + }, + { + "epoch": 10.08, + "learning_rate": 1.3069207622868606e-05, + "loss": 2.2127, + "theoretical_loss": 3.303209077979683, + "tokens_seen": 3215677440 + }, + { + "epoch": 10.08, + "learning_rate": 1.3059177532597794e-05, + "loss": 2.319, + "theoretical_loss": 3.30320397471343, + "tokens_seen": 3215742976 + }, + { + "epoch": 10.08, + "learning_rate": 1.3049147442326982e-05, + "loss": 2.4294, + "theoretical_loss": 3.3031988715802996, + "tokens_seen": 3215808512 + }, + { + "epoch": 10.08, + "learning_rate": 1.3039117352056168e-05, + "loss": 2.1784, + "theoretical_loss": 3.303193768580286, + "tokens_seen": 3215874048 + }, + { + "epoch": 10.08, + "learning_rate": 1.3029087261785356e-05, + "loss": 2.4696, + "theoretical_loss": 3.3031886657133818, + "tokens_seen": 3215939584 + }, + { + "epoch": 10.08, + "learning_rate": 1.3019057171514544e-05, + "loss": 2.3695, + "theoretical_loss": 3.3031835629795814, + "tokens_seen": 3216005120 + }, + { + "epoch": 10.08, + "learning_rate": 1.3009027081243732e-05, + "loss": 2.3902, + "theoretical_loss": 3.303178460378879, + "tokens_seen": 3216070656 + }, + { + "epoch": 10.08, + "learning_rate": 1.299899699097292e-05, + "loss": 2.4671, + "theoretical_loss": 3.303173357911268, + "tokens_seen": 3216136192 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3506924, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5368123054504395, + "objective/train/theoretical_loss": 3.3031708067273695, + "objective/train/tokens_used": 3236628960, + "theoretical_loss": 3.3031708067273695, + "tokens_seen": 3216168960 + }, + { + "epoch": 10.08, + "learning_rate": 1.2988966900702106e-05, + "loss": 2.5476, + "theoretical_loss": 3.303168255576742, + "tokens_seen": 3216201728 + }, + { + "epoch": 10.08, + "learning_rate": 1.2978936810431294e-05, + "loss": 2.548, + "theoretical_loss": 3.303163153375295, + "tokens_seen": 3216267264 + }, + { + "epoch": 10.08, + "learning_rate": 1.2968906720160482e-05, + "loss": 2.6513, + "theoretical_loss": 3.3031580513069216, + "tokens_seen": 3216332800 + }, + { + "epoch": 10.08, + "learning_rate": 1.295887662988967e-05, + "loss": 2.3167, + "theoretical_loss": 3.3031529493716145, + "tokens_seen": 3216398336 + }, + { + "epoch": 10.08, + "learning_rate": 1.2948846539618857e-05, + "loss": 2.3715, + "theoretical_loss": 3.303147847569368, + "tokens_seen": 3216463872 + }, + { + "epoch": 10.08, + "learning_rate": 1.2938816449348044e-05, + "loss": 2.4547, + "theoretical_loss": 3.3031427459001756, + "tokens_seen": 3216529408 + }, + { + "epoch": 10.08, + "learning_rate": 1.2928786359077232e-05, + "loss": 2.1864, + "theoretical_loss": 3.303137644364032, + "tokens_seen": 3216594944 + }, + { + "epoch": 10.08, + "learning_rate": 1.291875626880642e-05, + "loss": 2.4129, + "theoretical_loss": 3.30313254296093, + "tokens_seen": 3216660480 + }, + { + "epoch": 10.08, + "learning_rate": 1.2908726178535607e-05, + "loss": 2.3391, + "theoretical_loss": 3.3031274416908643, + "tokens_seen": 3216726016 + }, + { + "epoch": 10.08, + "learning_rate": 1.2898696088264795e-05, + "loss": 2.2952, + "theoretical_loss": 3.303122340553828, + "tokens_seen": 3216791552 + }, + { + "epoch": 10.08, + "learning_rate": 1.2888665997993981e-05, + "loss": 2.366, + "theoretical_loss": 3.303117239549815, + "tokens_seen": 3216857088 + }, + { + "epoch": 10.08, + "learning_rate": 1.2878635907723171e-05, + "loss": 2.3367, + "theoretical_loss": 3.303112138678819, + "tokens_seen": 3216922624 + }, + { + "epoch": 10.08, + "learning_rate": 1.2868605817452357e-05, + "loss": 2.469, + "theoretical_loss": 3.3031070379408347, + "tokens_seen": 3216988160 + }, + { + "epoch": 10.08, + "learning_rate": 1.2858575727181545e-05, + "loss": 2.6269, + "theoretical_loss": 3.3031019373358554, + "tokens_seen": 3217053696 + }, + { + "epoch": 10.08, + "learning_rate": 1.2848545636910731e-05, + "loss": 2.1903, + "theoretical_loss": 3.3030968368638747, + "tokens_seen": 3217119232 + }, + { + "epoch": 10.08, + "learning_rate": 1.283851554663992e-05, + "loss": 2.48, + "theoretical_loss": 3.3030917365248866, + "tokens_seen": 3217184768 + }, + { + "epoch": 10.08, + "learning_rate": 1.2828485456369109e-05, + "loss": 2.5899, + "theoretical_loss": 3.3030866363188847, + "tokens_seen": 3217250304 + }, + { + "epoch": 10.08, + "learning_rate": 1.2818455366098295e-05, + "loss": 2.55, + "theoretical_loss": 3.3030815362458634, + "tokens_seen": 3217315840 + }, + { + "epoch": 10.08, + "learning_rate": 1.2808425275827483e-05, + "loss": 2.3028, + "theoretical_loss": 3.303076436305816, + "tokens_seen": 3217381376 + }, + { + "epoch": 10.08, + "learning_rate": 1.2798395185556669e-05, + "loss": 2.3075, + "theoretical_loss": 3.3030713364987365, + "tokens_seen": 3217446912 + }, + { + "epoch": 10.08, + "learning_rate": 1.2788365095285857e-05, + "loss": 2.4142, + "theoretical_loss": 3.3030662368246184, + "tokens_seen": 3217512448 + }, + { + "epoch": 10.08, + "learning_rate": 1.2778335005015047e-05, + "loss": 2.3047, + "theoretical_loss": 3.303061137283456, + "tokens_seen": 3217577984 + }, + { + "epoch": 10.08, + "learning_rate": 1.2768304914744233e-05, + "loss": 2.393, + "theoretical_loss": 3.303056037875243, + "tokens_seen": 3217643520 + }, + { + "epoch": 10.08, + "learning_rate": 1.275827482447342e-05, + "loss": 2.5041, + "theoretical_loss": 3.3030509385999736, + "tokens_seen": 3217709056 + }, + { + "epoch": 10.08, + "learning_rate": 1.2748244734202607e-05, + "loss": 2.5867, + "theoretical_loss": 3.3030458394576407, + "tokens_seen": 3217774592 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3508358, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.755643606185913, + "objective/train/theoretical_loss": 3.303043289936324, + "objective/train/tokens_used": 3238267360, + "theoretical_loss": 3.303043289936324, + "tokens_seen": 3217807360 + }, + { + "epoch": 10.08, + "learning_rate": 1.2738214643931797e-05, + "loss": 2.4101, + "theoretical_loss": 3.303040740448239, + "tokens_seen": 3217840128 + }, + { + "epoch": 10.08, + "learning_rate": 1.2728184553660984e-05, + "loss": 2.3958, + "theoretical_loss": 3.303035641571762, + "tokens_seen": 3217905664 + }, + { + "epoch": 10.08, + "learning_rate": 1.271815446339017e-05, + "loss": 2.5137, + "theoretical_loss": 3.303030542828203, + "tokens_seen": 3217971200 + }, + { + "epoch": 10.08, + "learning_rate": 1.2708124373119359e-05, + "loss": 2.397, + "theoretical_loss": 3.3030254442175564, + "tokens_seen": 3218036736 + }, + { + "epoch": 10.08, + "learning_rate": 1.2698094282848545e-05, + "loss": 2.5829, + "theoretical_loss": 3.3030203457398164, + "tokens_seen": 3218102272 + }, + { + "epoch": 10.08, + "learning_rate": 1.2688064192577734e-05, + "loss": 2.4715, + "theoretical_loss": 3.303015247394976, + "tokens_seen": 3218167808 + }, + { + "epoch": 10.08, + "learning_rate": 1.267803410230692e-05, + "loss": 2.3158, + "theoretical_loss": 3.3030101491830295, + "tokens_seen": 3218233344 + }, + { + "epoch": 10.08, + "learning_rate": 1.2668004012036108e-05, + "loss": 2.4037, + "theoretical_loss": 3.3030050511039706, + "tokens_seen": 3218298880 + }, + { + "epoch": 10.08, + "learning_rate": 1.2657973921765296e-05, + "loss": 2.3676, + "theoretical_loss": 3.3029999531577934, + "tokens_seen": 3218364416 + }, + { + "epoch": 10.08, + "learning_rate": 1.2647943831494483e-05, + "loss": 2.5196, + "theoretical_loss": 3.302994855344491, + "tokens_seen": 3218429952 + }, + { + "epoch": 10.08, + "learning_rate": 1.2637913741223672e-05, + "loss": 2.4132, + "theoretical_loss": 3.302989757664059, + "tokens_seen": 3218495488 + }, + { + "epoch": 10.08, + "learning_rate": 1.2627883650952858e-05, + "loss": 2.3635, + "theoretical_loss": 3.3029846601164885, + "tokens_seen": 3218561024 + }, + { + "epoch": 10.08, + "learning_rate": 1.2617853560682046e-05, + "loss": 2.5, + "theoretical_loss": 3.3029795627017755, + "tokens_seen": 3218626560 + }, + { + "epoch": 10.08, + "learning_rate": 1.2607823470411234e-05, + "loss": 2.4499, + "theoretical_loss": 3.302974465419913, + "tokens_seen": 3218692096 + }, + { + "epoch": 10.08, + "learning_rate": 1.2597793380140422e-05, + "loss": 2.3389, + "theoretical_loss": 3.3029693682708947, + "tokens_seen": 3218757632 + }, + { + "epoch": 10.08, + "learning_rate": 1.258776328986961e-05, + "loss": 2.343, + "theoretical_loss": 3.3029642712547154, + "tokens_seen": 3218823168 + }, + { + "epoch": 10.08, + "learning_rate": 1.2577733199598796e-05, + "loss": 2.3061, + "theoretical_loss": 3.3029591743713675, + "tokens_seen": 3218888704 + }, + { + "epoch": 10.08, + "learning_rate": 1.2567703109327984e-05, + "loss": 2.3057, + "theoretical_loss": 3.302954077620846, + "tokens_seen": 3218954240 + }, + { + "epoch": 10.08, + "learning_rate": 1.2557673019057172e-05, + "loss": 2.3494, + "theoretical_loss": 3.3029489810031443, + "tokens_seen": 3219019776 + }, + { + "epoch": 10.08, + "learning_rate": 1.254764292878636e-05, + "loss": 2.4872, + "theoretical_loss": 3.302943884518256, + "tokens_seen": 3219085312 + }, + { + "epoch": 10.08, + "learning_rate": 1.2537612838515548e-05, + "loss": 2.4243, + "theoretical_loss": 3.302938788166175, + "tokens_seen": 3219150848 + }, + { + "epoch": 10.08, + "learning_rate": 1.2527582748244734e-05, + "loss": 2.4606, + "theoretical_loss": 3.3029336919468957, + "tokens_seen": 3219216384 + }, + { + "epoch": 10.08, + "learning_rate": 1.2517552657973922e-05, + "loss": 2.4548, + "theoretical_loss": 3.3029285958604113, + "tokens_seen": 3219281920 + }, + { + "epoch": 10.08, + "learning_rate": 1.2507522567703108e-05, + "loss": 2.4812, + "theoretical_loss": 3.302923499906716, + "tokens_seen": 3219347456 + }, + { + "epoch": 10.08, + "learning_rate": 1.2497492477432298e-05, + "loss": 2.5458, + "theoretical_loss": 3.3029184040858035, + "tokens_seen": 3219412992 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3508900, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6754791736602783, + "objective/train/theoretical_loss": 3.302915856225139, + "objective/train/tokens_used": 3239905760, + "theoretical_loss": 3.302915856225139, + "tokens_seen": 3219445760 + }, + { + "epoch": 10.08, + "learning_rate": 1.2487462387161485e-05, + "loss": 2.5726, + "theoretical_loss": 3.3029133083976676, + "tokens_seen": 3219478528 + }, + { + "epoch": 10.08, + "learning_rate": 1.2477432296890672e-05, + "loss": 2.4869, + "theoretical_loss": 3.3029082128423024, + "tokens_seen": 3219544064 + }, + { + "epoch": 10.08, + "learning_rate": 1.246740220661986e-05, + "loss": 2.331, + "theoretical_loss": 3.302903117419701, + "tokens_seen": 3219609600 + }, + { + "epoch": 10.08, + "learning_rate": 1.2457372116349047e-05, + "loss": 2.2459, + "theoretical_loss": 3.3028980221298583, + "tokens_seen": 3219675136 + }, + { + "epoch": 10.08, + "learning_rate": 1.2447342026078235e-05, + "loss": 2.4963, + "theoretical_loss": 3.3028929269727674, + "tokens_seen": 3219740672 + }, + { + "epoch": 10.08, + "learning_rate": 1.2437311935807423e-05, + "loss": 2.248, + "theoretical_loss": 3.3028878319484223, + "tokens_seen": 3219806208 + }, + { + "epoch": 10.08, + "learning_rate": 1.242728184553661e-05, + "loss": 2.5112, + "theoretical_loss": 3.3028827370568172, + "tokens_seen": 3219871744 + }, + { + "epoch": 10.08, + "learning_rate": 1.2417251755265797e-05, + "loss": 2.445, + "theoretical_loss": 3.302877642297945, + "tokens_seen": 3219937280 + }, + { + "epoch": 10.08, + "learning_rate": 1.2407221664994985e-05, + "loss": 2.5195, + "theoretical_loss": 3.3028725476718006, + "tokens_seen": 3220002816 + }, + { + "epoch": 10.08, + "learning_rate": 1.2397191574724173e-05, + "loss": 2.5177, + "theoretical_loss": 3.3028674531783775, + "tokens_seen": 3220068352 + }, + { + "epoch": 10.08, + "learning_rate": 1.2387161484453361e-05, + "loss": 2.3604, + "theoretical_loss": 3.302862358817669, + "tokens_seen": 3220133888 + }, + { + "epoch": 10.08, + "learning_rate": 1.2377131394182547e-05, + "loss": 2.4127, + "theoretical_loss": 3.3028572645896697, + "tokens_seen": 3220199424 + }, + { + "epoch": 10.08, + "learning_rate": 1.2367101303911735e-05, + "loss": 2.4081, + "theoretical_loss": 3.3028521704943734, + "tokens_seen": 3220264960 + }, + { + "epoch": 10.08, + "learning_rate": 1.2357071213640923e-05, + "loss": 2.4653, + "theoretical_loss": 3.3028470765317732, + "tokens_seen": 3220330496 + }, + { + "epoch": 10.08, + "learning_rate": 1.2347041123370111e-05, + "loss": 2.649, + "theoretical_loss": 3.302841982701864, + "tokens_seen": 3220396032 + }, + { + "epoch": 10.08, + "learning_rate": 1.2337011033099299e-05, + "loss": 2.3677, + "theoretical_loss": 3.302836889004638, + "tokens_seen": 3220461568 + }, + { + "epoch": 10.08, + "learning_rate": 1.2326980942828485e-05, + "loss": 2.2359, + "theoretical_loss": 3.302831795440091, + "tokens_seen": 3220527104 + }, + { + "epoch": 10.08, + "learning_rate": 1.2316950852557675e-05, + "loss": 2.1501, + "theoretical_loss": 3.3028267020082156, + "tokens_seen": 3220592640 + }, + { + "epoch": 10.08, + "learning_rate": 1.230692076228686e-05, + "loss": 2.4475, + "theoretical_loss": 3.302821608709006, + "tokens_seen": 3220658176 + }, + { + "epoch": 10.08, + "learning_rate": 1.2296890672016049e-05, + "loss": 2.2039, + "theoretical_loss": 3.3028165155424563, + "tokens_seen": 3220723712 + }, + { + "epoch": 10.08, + "learning_rate": 1.2286860581745235e-05, + "loss": 2.5673, + "theoretical_loss": 3.30281142250856, + "tokens_seen": 3220789248 + }, + { + "epoch": 10.08, + "learning_rate": 1.2276830491474423e-05, + "loss": 2.2883, + "theoretical_loss": 3.3028063296073107, + "tokens_seen": 3220854784 + }, + { + "epoch": 10.08, + "learning_rate": 1.2266800401203612e-05, + "loss": 2.4113, + "theoretical_loss": 3.302801236838703, + "tokens_seen": 3220920320 + }, + { + "epoch": 10.08, + "learning_rate": 1.2256770310932799e-05, + "loss": 2.438, + "theoretical_loss": 3.3027961442027296, + "tokens_seen": 3220985856 + }, + { + "epoch": 10.08, + "learning_rate": 1.2246740220661987e-05, + "loss": 2.5105, + "theoretical_loss": 3.302791051699386, + "tokens_seen": 3221051392 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3510329, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3891491889953613, + "objective/train/theoretical_loss": 3.3027885054974475, + "objective/train/tokens_used": 3241544160, + "theoretical_loss": 3.3027885054974475, + "tokens_seen": 3221084160 + }, + { + "epoch": 10.08, + "learning_rate": 1.2236710130391173e-05, + "loss": 2.5278, + "theoretical_loss": 3.3027859593286646, + "tokens_seen": 3221116928 + }, + { + "epoch": 10.08, + "learning_rate": 1.222668004012036e-05, + "loss": 2.2899, + "theoretical_loss": 3.3027808670905596, + "tokens_seen": 3221182464 + }, + { + "epoch": 10.08, + "learning_rate": 1.221664994984955e-05, + "loss": 2.208, + "theoretical_loss": 3.3027757749850655, + "tokens_seen": 3221248000 + }, + { + "epoch": 10.08, + "learning_rate": 1.2206619859578736e-05, + "loss": 2.4575, + "theoretical_loss": 3.3027706830121755, + "tokens_seen": 3221313536 + }, + { + "epoch": 10.08, + "learning_rate": 1.2196589769307924e-05, + "loss": 2.6608, + "theoretical_loss": 3.3027655911718834, + "tokens_seen": 3221379072 + }, + { + "epoch": 10.08, + "learning_rate": 1.218655967903711e-05, + "loss": 2.3633, + "theoretical_loss": 3.3027604994641835, + "tokens_seen": 3221444608 + }, + { + "epoch": 10.08, + "learning_rate": 1.2176529588766298e-05, + "loss": 2.6176, + "theoretical_loss": 3.302755407889069, + "tokens_seen": 3221510144 + }, + { + "epoch": 10.08, + "learning_rate": 1.2166499498495488e-05, + "loss": 2.2619, + "theoretical_loss": 3.3027503164465344, + "tokens_seen": 3221575680 + }, + { + "epoch": 10.08, + "learning_rate": 1.2156469408224674e-05, + "loss": 2.3921, + "theoretical_loss": 3.3027452251365736, + "tokens_seen": 3221641216 + }, + { + "epoch": 10.08, + "learning_rate": 1.2146439317953862e-05, + "loss": 2.5645, + "theoretical_loss": 3.3027401339591798, + "tokens_seen": 3221706752 + }, + { + "epoch": 10.08, + "learning_rate": 1.2136409227683048e-05, + "loss": 2.3046, + "theoretical_loss": 3.3027350429143474, + "tokens_seen": 3221772288 + }, + { + "epoch": 10.08, + "learning_rate": 1.2126379137412238e-05, + "loss": 2.3506, + "theoretical_loss": 3.3027299520020703, + "tokens_seen": 3221837824 + }, + { + "epoch": 10.08, + "learning_rate": 1.2116349047141424e-05, + "loss": 2.2235, + "theoretical_loss": 3.3027248612223414, + "tokens_seen": 3221903360 + }, + { + "epoch": 10.08, + "learning_rate": 1.2106318956870612e-05, + "loss": 2.3229, + "theoretical_loss": 3.302719770575156, + "tokens_seen": 3221968896 + }, + { + "epoch": 10.08, + "learning_rate": 1.20962888665998e-05, + "loss": 2.4751, + "theoretical_loss": 3.302714680060507, + "tokens_seen": 3222034432 + }, + { + "epoch": 10.08, + "learning_rate": 1.2086258776328986e-05, + "loss": 2.1497, + "theoretical_loss": 3.302709589678388, + "tokens_seen": 3222099968 + }, + { + "epoch": 10.08, + "learning_rate": 1.2076228686058176e-05, + "loss": 2.3077, + "theoretical_loss": 3.3027044994287937, + "tokens_seen": 3222165504 + }, + { + "epoch": 10.08, + "learning_rate": 1.2066198595787362e-05, + "loss": 2.5218, + "theoretical_loss": 3.302699409311718, + "tokens_seen": 3222231040 + }, + { + "epoch": 10.08, + "learning_rate": 1.205616850551655e-05, + "loss": 2.4913, + "theoretical_loss": 3.302694319327154, + "tokens_seen": 3222296576 + }, + { + "epoch": 10.08, + "learning_rate": 1.2046138415245738e-05, + "loss": 2.4157, + "theoretical_loss": 3.302689229475096, + "tokens_seen": 3222362112 + }, + { + "epoch": 10.08, + "learning_rate": 1.2036108324974924e-05, + "loss": 2.4611, + "theoretical_loss": 3.302684139755537, + "tokens_seen": 3222427648 + }, + { + "epoch": 10.08, + "learning_rate": 1.2026078234704113e-05, + "loss": 2.4308, + "theoretical_loss": 3.3026790501684724, + "tokens_seen": 3222493184 + }, + { + "epoch": 10.08, + "learning_rate": 1.20160481444333e-05, + "loss": 2.4412, + "theoretical_loss": 3.302673960713895, + "tokens_seen": 3222558720 + }, + { + "epoch": 10.08, + "learning_rate": 1.2006018054162488e-05, + "loss": 2.2436, + "theoretical_loss": 3.3026688713917993, + "tokens_seen": 3222624256 + }, + { + "epoch": 10.08, + "learning_rate": 1.1995987963891675e-05, + "loss": 2.3636, + "theoretical_loss": 3.3026637822021785, + "tokens_seen": 3222689792 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3511094, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2500593662261963, + "objective/train/theoretical_loss": 3.302661237657044, + "objective/train/tokens_used": 3243182560, + "theoretical_loss": 3.302661237657044, + "tokens_seen": 3222722560 + }, + { + "epoch": 10.08, + "learning_rate": 1.1985957873620863e-05, + "loss": 2.4794, + "theoretical_loss": 3.3026586931450264, + "tokens_seen": 3222755328 + }, + { + "epoch": 10.08, + "learning_rate": 1.1975927783350051e-05, + "loss": 2.515, + "theoretical_loss": 3.3026536042203376, + "tokens_seen": 3222820864 + }, + { + "epoch": 10.08, + "learning_rate": 1.1965897693079237e-05, + "loss": 2.379, + "theoretical_loss": 3.3026485154281056, + "tokens_seen": 3222886400 + }, + { + "epoch": 10.08, + "learning_rate": 1.1955867602808425e-05, + "loss": 2.4489, + "theoretical_loss": 3.302643426768324, + "tokens_seen": 3222951936 + }, + { + "epoch": 10.08, + "learning_rate": 1.1945837512537612e-05, + "loss": 2.7974, + "theoretical_loss": 3.3026383382409867, + "tokens_seen": 3223017472 + }, + { + "epoch": 10.08, + "learning_rate": 1.1935807422266801e-05, + "loss": 2.4437, + "theoretical_loss": 3.302633249846088, + "tokens_seen": 3223083008 + }, + { + "epoch": 10.08, + "learning_rate": 1.1925777331995989e-05, + "loss": 2.308, + "theoretical_loss": 3.3026281615836215, + "tokens_seen": 3223148544 + }, + { + "epoch": 10.08, + "learning_rate": 1.1915747241725175e-05, + "loss": 2.3408, + "theoretical_loss": 3.302623073453581, + "tokens_seen": 3223214080 + }, + { + "epoch": 10.08, + "learning_rate": 1.1905717151454363e-05, + "loss": 2.4823, + "theoretical_loss": 3.3026179854559605, + "tokens_seen": 3223279616 + }, + { + "epoch": 10.08, + "learning_rate": 1.189568706118355e-05, + "loss": 2.4964, + "theoretical_loss": 3.3026128975907536, + "tokens_seen": 3223345152 + }, + { + "epoch": 10.08, + "learning_rate": 1.1885656970912739e-05, + "loss": 2.3217, + "theoretical_loss": 3.3026078098579545, + "tokens_seen": 3223410688 + }, + { + "epoch": 10.08, + "learning_rate": 1.1875626880641927e-05, + "loss": 2.3664, + "theoretical_loss": 3.302602722257557, + "tokens_seen": 3223476224 + }, + { + "epoch": 10.08, + "learning_rate": 1.1865596790371113e-05, + "loss": 2.3103, + "theoretical_loss": 3.3025976347895547, + "tokens_seen": 3223541760 + }, + { + "epoch": 10.08, + "learning_rate": 1.1855566700100301e-05, + "loss": 2.581, + "theoretical_loss": 3.3025925474539415, + "tokens_seen": 3223607296 + }, + { + "epoch": 10.08, + "learning_rate": 1.1845536609829489e-05, + "loss": 2.7387, + "theoretical_loss": 3.3025874602507113, + "tokens_seen": 3223672832 + }, + { + "epoch": 10.08, + "learning_rate": 1.1835506519558677e-05, + "loss": 2.399, + "theoretical_loss": 3.3025823731798587, + "tokens_seen": 3223738368 + }, + { + "epoch": 10.08, + "learning_rate": 1.1825476429287865e-05, + "loss": 2.3434, + "theoretical_loss": 3.3025772862413763, + "tokens_seen": 3223803904 + }, + { + "epoch": 10.08, + "learning_rate": 1.181544633901705e-05, + "loss": 2.4181, + "theoretical_loss": 3.3025721994352586, + "tokens_seen": 3223869440 + }, + { + "epoch": 10.08, + "learning_rate": 1.1805416248746239e-05, + "loss": 2.4499, + "theoretical_loss": 3.3025671127614995, + "tokens_seen": 3223934976 + }, + { + "epoch": 10.08, + "learning_rate": 1.1795386158475427e-05, + "loss": 2.6791, + "theoretical_loss": 3.302562026220093, + "tokens_seen": 3224000512 + }, + { + "epoch": 10.08, + "learning_rate": 1.1785356068204615e-05, + "loss": 2.4863, + "theoretical_loss": 3.302556939811033, + "tokens_seen": 3224066048 + }, + { + "epoch": 10.08, + "learning_rate": 1.1775325977933802e-05, + "loss": 2.4047, + "theoretical_loss": 3.302551853534313, + "tokens_seen": 3224131584 + }, + { + "epoch": 10.08, + "learning_rate": 1.1765295887662989e-05, + "loss": 2.4603, + "theoretical_loss": 3.3025467673899267, + "tokens_seen": 3224197120 + }, + { + "epoch": 10.08, + "learning_rate": 1.1755265797392177e-05, + "loss": 2.3378, + "theoretical_loss": 3.3025416813778685, + "tokens_seen": 3224262656 + }, + { + "epoch": 10.08, + "learning_rate": 1.1745235707121364e-05, + "loss": 2.5706, + "theoretical_loss": 3.3025365954981316, + "tokens_seen": 3224328192 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3512424, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.044245481491089, + "objective/train/theoretical_loss": 3.302534052607882, + "objective/train/tokens_used": 3244820960, + "theoretical_loss": 3.302534052607882, + "tokens_seen": 3224360960 + }, + { + "epoch": 10.08, + "learning_rate": 1.1735205616850552e-05, + "loss": 2.4083, + "theoretical_loss": 3.302531509750711, + "tokens_seen": 3224393728 + }, + { + "epoch": 10.08, + "learning_rate": 1.1725175526579739e-05, + "loss": 2.3636, + "theoretical_loss": 3.3025264241355994, + "tokens_seen": 3224459264 + }, + { + "epoch": 10.08, + "learning_rate": 1.1715145436308926e-05, + "loss": 2.5753, + "theoretical_loss": 3.3025213386527916, + "tokens_seen": 3224524800 + }, + { + "epoch": 10.08, + "learning_rate": 1.1705115346038116e-05, + "loss": 2.639, + "theoretical_loss": 3.3025162533022807, + "tokens_seen": 3224590336 + }, + { + "epoch": 10.08, + "learning_rate": 1.1695085255767302e-05, + "loss": 2.3908, + "theoretical_loss": 3.302511168084061, + "tokens_seen": 3224655872 + }, + { + "epoch": 10.08, + "learning_rate": 1.168505516549649e-05, + "loss": 2.5589, + "theoretical_loss": 3.302506082998126, + "tokens_seen": 3224721408 + }, + { + "epoch": 10.08, + "learning_rate": 1.1675025075225676e-05, + "loss": 2.4504, + "theoretical_loss": 3.30250099804447, + "tokens_seen": 3224786944 + }, + { + "epoch": 10.08, + "learning_rate": 1.1664994984954864e-05, + "loss": 2.4552, + "theoretical_loss": 3.302495913223087, + "tokens_seen": 3224852480 + }, + { + "epoch": 10.08, + "learning_rate": 1.1654964894684054e-05, + "loss": 2.3423, + "theoretical_loss": 3.30249082853397, + "tokens_seen": 3224918016 + }, + { + "epoch": 10.08, + "learning_rate": 1.164493480441324e-05, + "loss": 2.3579, + "theoretical_loss": 3.3024857439771136, + "tokens_seen": 3224983552 + }, + { + "epoch": 10.08, + "learning_rate": 1.1634904714142428e-05, + "loss": 2.4528, + "theoretical_loss": 3.302480659552512, + "tokens_seen": 3225049088 + }, + { + "epoch": 10.08, + "learning_rate": 1.1624874623871614e-05, + "loss": 2.4954, + "theoretical_loss": 3.302475575260158, + "tokens_seen": 3225114624 + }, + { + "epoch": 10.08, + "learning_rate": 1.1614844533600802e-05, + "loss": 2.254, + "theoretical_loss": 3.3024704911000464, + "tokens_seen": 3225180160 + }, + { + "epoch": 10.08, + "learning_rate": 1.1604814443329992e-05, + "loss": 2.4751, + "theoretical_loss": 3.302465407072171, + "tokens_seen": 3225245696 + }, + { + "epoch": 10.08, + "learning_rate": 1.1594784353059178e-05, + "loss": 2.4694, + "theoretical_loss": 3.3024603231765246, + "tokens_seen": 3225311232 + }, + { + "epoch": 10.08, + "learning_rate": 1.1584754262788366e-05, + "loss": 2.4999, + "theoretical_loss": 3.302455239413103, + "tokens_seen": 3225376768 + }, + { + "epoch": 10.08, + "learning_rate": 1.1574724172517552e-05, + "loss": 2.2474, + "theoretical_loss": 3.302450155781898, + "tokens_seen": 3225442304 + }, + { + "epoch": 10.08, + "learning_rate": 1.1564694082246741e-05, + "loss": 2.3608, + "theoretical_loss": 3.3024450722829046, + "tokens_seen": 3225507840 + }, + { + "epoch": 10.08, + "learning_rate": 1.1554663991975928e-05, + "loss": 2.4275, + "theoretical_loss": 3.3024399889161167, + "tokens_seen": 3225573376 + }, + { + "epoch": 10.08, + "learning_rate": 1.1544633901705116e-05, + "loss": 2.3932, + "theoretical_loss": 3.302434905681528, + "tokens_seen": 3225638912 + }, + { + "epoch": 10.08, + "learning_rate": 1.1534603811434303e-05, + "loss": 2.4948, + "theoretical_loss": 3.302429822579132, + "tokens_seen": 3225704448 + }, + { + "epoch": 10.08, + "learning_rate": 1.152457372116349e-05, + "loss": 2.177, + "theoretical_loss": 3.3024247396089232, + "tokens_seen": 3225769984 + }, + { + "epoch": 10.08, + "learning_rate": 1.151454363089268e-05, + "loss": 2.2541, + "theoretical_loss": 3.3024196567708954, + "tokens_seen": 3225835520 + }, + { + "epoch": 10.08, + "learning_rate": 1.1504513540621865e-05, + "loss": 2.6842, + "theoretical_loss": 3.3024145740650424, + "tokens_seen": 3225901056 + }, + { + "epoch": 10.08, + "learning_rate": 1.1494483450351053e-05, + "loss": 2.3997, + "theoretical_loss": 3.3024094914913573, + "tokens_seen": 3225966592 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3513152, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6767683029174805, + "objective/train/theoretical_loss": 3.3024069502540763, + "objective/train/tokens_used": 3246459360, + "theoretical_loss": 3.3024069502540763, + "tokens_seen": 3225999360 + }, + { + "epoch": 10.08, + "learning_rate": 1.1484453360080241e-05, + "loss": 2.3793, + "theoretical_loss": 3.302404409049835, + "tokens_seen": 3226032128 + }, + { + "epoch": 10.08, + "learning_rate": 1.1474423269809427e-05, + "loss": 2.4553, + "theoretical_loss": 3.302399326740469, + "tokens_seen": 3226097664 + }, + { + "epoch": 10.08, + "learning_rate": 1.1464393179538617e-05, + "loss": 2.1407, + "theoretical_loss": 3.3023942445632537, + "tokens_seen": 3226163200 + }, + { + "epoch": 10.08, + "learning_rate": 1.1454363089267803e-05, + "loss": 2.3338, + "theoretical_loss": 3.302389162518182, + "tokens_seen": 3226228736 + }, + { + "epoch": 10.08, + "learning_rate": 1.1444332998996991e-05, + "loss": 2.4622, + "theoretical_loss": 3.3023840806052482, + "tokens_seen": 3226294272 + }, + { + "epoch": 10.08, + "learning_rate": 1.1434302908726179e-05, + "loss": 2.3977, + "theoretical_loss": 3.3023789988244463, + "tokens_seen": 3226359808 + }, + { + "epoch": 10.08, + "learning_rate": 1.1424272818455367e-05, + "loss": 2.3385, + "theoretical_loss": 3.3023739171757702, + "tokens_seen": 3226425344 + }, + { + "epoch": 10.08, + "learning_rate": 1.1414242728184555e-05, + "loss": 2.4216, + "theoretical_loss": 3.302368835659214, + "tokens_seen": 3226490880 + }, + { + "epoch": 10.08, + "learning_rate": 1.1404212637913741e-05, + "loss": 2.4864, + "theoretical_loss": 3.302363754274771, + "tokens_seen": 3226556416 + }, + { + "epoch": 10.08, + "learning_rate": 1.1394182547642929e-05, + "loss": 2.438, + "theoretical_loss": 3.302358673022435, + "tokens_seen": 3226621952 + }, + { + "epoch": 10.08, + "learning_rate": 1.1384152457372115e-05, + "loss": 2.4636, + "theoretical_loss": 3.3023535919022007, + "tokens_seen": 3226687488 + }, + { + "epoch": 10.08, + "learning_rate": 1.1374122367101305e-05, + "loss": 2.3599, + "theoretical_loss": 3.3023485109140616, + "tokens_seen": 3226753024 + }, + { + "epoch": 10.08, + "learning_rate": 1.1364092276830493e-05, + "loss": 2.3852, + "theoretical_loss": 3.3023434300580115, + "tokens_seen": 3226818560 + }, + { + "epoch": 10.08, + "learning_rate": 1.1354062186559679e-05, + "loss": 2.2442, + "theoretical_loss": 3.302338349334044, + "tokens_seen": 3226884096 + }, + { + "epoch": 10.08, + "learning_rate": 1.1344032096288867e-05, + "loss": 2.4792, + "theoretical_loss": 3.3023332687421534, + "tokens_seen": 3226949632 + }, + { + "epoch": 10.08, + "learning_rate": 1.1334002006018053e-05, + "loss": 2.4282, + "theoretical_loss": 3.3023281882823334, + "tokens_seen": 3227015168 + }, + { + "epoch": 10.08, + "learning_rate": 1.1323971915747243e-05, + "loss": 2.2316, + "theoretical_loss": 3.302323107954578, + "tokens_seen": 3227080704 + }, + { + "epoch": 10.08, + "learning_rate": 1.131394182547643e-05, + "loss": 2.3769, + "theoretical_loss": 3.302318027758881, + "tokens_seen": 3227146240 + }, + { + "epoch": 10.08, + "learning_rate": 1.1303911735205617e-05, + "loss": 2.4609, + "theoretical_loss": 3.3023129476952366, + "tokens_seen": 3227211776 + }, + { + "epoch": 10.08, + "learning_rate": 1.1293881644934805e-05, + "loss": 2.1407, + "theoretical_loss": 3.3023078677636386, + "tokens_seen": 3227277312 + }, + { + "epoch": 10.08, + "learning_rate": 1.1283851554663992e-05, + "loss": 2.4039, + "theoretical_loss": 3.3023027879640803, + "tokens_seen": 3227342848 + }, + { + "epoch": 10.08, + "learning_rate": 1.127382146439318e-05, + "loss": 2.363, + "theoretical_loss": 3.302297708296556, + "tokens_seen": 3227408384 + }, + { + "epoch": 10.08, + "learning_rate": 1.1263791374122368e-05, + "loss": 2.5742, + "theoretical_loss": 3.3022926287610597, + "tokens_seen": 3227473920 + }, + { + "epoch": 10.08, + "learning_rate": 1.1253761283851554e-05, + "loss": 2.4282, + "theoretical_loss": 3.302287549357585, + "tokens_seen": 3227539456 + }, + { + "epoch": 10.08, + "learning_rate": 1.1243731193580742e-05, + "loss": 2.4515, + "theoretical_loss": 3.302282470086126, + "tokens_seen": 3227604992 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3513751, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.889968156814575, + "objective/train/theoretical_loss": 3.3022799304999, + "objective/train/tokens_used": 3248097760, + "theoretical_loss": 3.3022799304999, + "tokens_seen": 3227637760 + }, + { + "epoch": 10.08, + "learning_rate": 1.123370110330993e-05, + "loss": 2.62, + "theoretical_loss": 3.3022773909466765, + "tokens_seen": 3227670528 + }, + { + "epoch": 10.08, + "learning_rate": 1.1223671013039118e-05, + "loss": 2.6397, + "theoretical_loss": 3.3022723119392303, + "tokens_seen": 3227736064 + }, + { + "epoch": 10.08, + "learning_rate": 1.1213640922768304e-05, + "loss": 2.5561, + "theoretical_loss": 3.3022672330637817, + "tokens_seen": 3227801600 + }, + { + "epoch": 10.08, + "learning_rate": 1.1203610832497492e-05, + "loss": 2.431, + "theoretical_loss": 3.3022621543203243, + "tokens_seen": 3227867136 + }, + { + "epoch": 10.08, + "learning_rate": 1.119358074222668e-05, + "loss": 2.1946, + "theoretical_loss": 3.3022570757088516, + "tokens_seen": 3227932672 + }, + { + "epoch": 10.08, + "learning_rate": 1.1183550651955868e-05, + "loss": 2.3771, + "theoretical_loss": 3.3022519972293582, + "tokens_seen": 3227998208 + }, + { + "epoch": 10.08, + "learning_rate": 1.1173520561685056e-05, + "loss": 2.3033, + "theoretical_loss": 3.3022469188818375, + "tokens_seen": 3228063744 + }, + { + "epoch": 10.08, + "learning_rate": 1.1163490471414242e-05, + "loss": 2.5062, + "theoretical_loss": 3.3022418406662837, + "tokens_seen": 3228129280 + }, + { + "epoch": 10.08, + "learning_rate": 1.115346038114343e-05, + "loss": 2.513, + "theoretical_loss": 3.302236762582691, + "tokens_seen": 3228194816 + }, + { + "epoch": 10.08, + "learning_rate": 1.114343029087262e-05, + "loss": 2.527, + "theoretical_loss": 3.302231684631052, + "tokens_seen": 3228260352 + }, + { + "epoch": 10.08, + "learning_rate": 1.1133400200601806e-05, + "loss": 2.7057, + "theoretical_loss": 3.302226606811362, + "tokens_seen": 3228325888 + }, + { + "epoch": 10.08, + "learning_rate": 1.1123370110330994e-05, + "loss": 2.5347, + "theoretical_loss": 3.302221529123614, + "tokens_seen": 3228391424 + }, + { + "epoch": 10.08, + "learning_rate": 1.111334002006018e-05, + "loss": 2.4712, + "theoretical_loss": 3.3022164515678027, + "tokens_seen": 3228456960 + }, + { + "epoch": 10.08, + "learning_rate": 1.1103309929789368e-05, + "loss": 2.3598, + "theoretical_loss": 3.302211374143921, + "tokens_seen": 3228522496 + }, + { + "epoch": 10.08, + "learning_rate": 1.1093279839518557e-05, + "loss": 2.3938, + "theoretical_loss": 3.3022062968519634, + "tokens_seen": 3228588032 + }, + { + "epoch": 10.08, + "learning_rate": 1.1083249749247744e-05, + "loss": 2.2116, + "theoretical_loss": 3.3022012196919244, + "tokens_seen": 3228653568 + }, + { + "epoch": 10.08, + "learning_rate": 1.1073219658976931e-05, + "loss": 2.6354, + "theoretical_loss": 3.3021961426637967, + "tokens_seen": 3228719104 + }, + { + "epoch": 10.08, + "learning_rate": 1.1063189568706118e-05, + "loss": 2.5124, + "theoretical_loss": 3.302191065767574, + "tokens_seen": 3228784640 + }, + { + "epoch": 10.08, + "learning_rate": 1.1053159478435306e-05, + "loss": 2.2429, + "theoretical_loss": 3.302185989003252, + "tokens_seen": 3228850176 + }, + { + "epoch": 10.08, + "learning_rate": 1.1043129388164495e-05, + "loss": 2.4106, + "theoretical_loss": 3.3021809123708232, + "tokens_seen": 3228915712 + }, + { + "epoch": 10.08, + "learning_rate": 1.1033099297893681e-05, + "loss": 2.5214, + "theoretical_loss": 3.3021758358702815, + "tokens_seen": 3228981248 + }, + { + "epoch": 10.08, + "learning_rate": 1.102306920762287e-05, + "loss": 2.287, + "theoretical_loss": 3.302170759501621, + "tokens_seen": 3229046784 + }, + { + "epoch": 10.08, + "learning_rate": 1.1013039117352055e-05, + "loss": 2.5261, + "theoretical_loss": 3.3021656832648363, + "tokens_seen": 3229112320 + }, + { + "epoch": 10.08, + "learning_rate": 1.1003009027081245e-05, + "loss": 2.3105, + "theoretical_loss": 3.3021606071599203, + "tokens_seen": 3229177856 + }, + { + "epoch": 10.08, + "learning_rate": 1.0992978936810431e-05, + "loss": 2.361, + "theoretical_loss": 3.302155531186867, + "tokens_seen": 3229243392 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3514700, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4262630939483643, + "objective/train/theoretical_loss": 3.3021529932497873, + "objective/train/tokens_used": 3249736160, + "theoretical_loss": 3.3021529932497873, + "tokens_seen": 3229276160 + }, + { + "epoch": 10.08, + "learning_rate": 1.0982948846539619e-05, + "loss": 2.2576, + "theoretical_loss": 3.3021504553456706, + "tokens_seen": 3229308928 + }, + { + "epoch": 10.08, + "learning_rate": 1.0972918756268807e-05, + "loss": 2.3736, + "theoretical_loss": 3.3021453796363254, + "tokens_seen": 3229374464 + }, + { + "epoch": 10.08, + "learning_rate": 1.0962888665997993e-05, + "loss": 2.1692, + "theoretical_loss": 3.302140304058825, + "tokens_seen": 3229440000 + }, + { + "epoch": 10.08, + "learning_rate": 1.0952858575727183e-05, + "loss": 2.3759, + "theoretical_loss": 3.302135228613163, + "tokens_seen": 3229505536 + }, + { + "epoch": 10.08, + "learning_rate": 1.0942828485456369e-05, + "loss": 2.5131, + "theoretical_loss": 3.3021301532993332, + "tokens_seen": 3229571072 + }, + { + "epoch": 10.08, + "learning_rate": 1.0932798395185557e-05, + "loss": 2.4996, + "theoretical_loss": 3.3021250781173297, + "tokens_seen": 3229636608 + }, + { + "epoch": 10.08, + "learning_rate": 1.0922768304914745e-05, + "loss": 2.2465, + "theoretical_loss": 3.302120003067147, + "tokens_seen": 3229702144 + }, + { + "epoch": 10.08, + "learning_rate": 1.0912738214643931e-05, + "loss": 2.4211, + "theoretical_loss": 3.302114928148778, + "tokens_seen": 3229767680 + }, + { + "epoch": 10.08, + "learning_rate": 1.090270812437312e-05, + "loss": 2.2749, + "theoretical_loss": 3.302109853362217, + "tokens_seen": 3229833216 + }, + { + "epoch": 10.08, + "learning_rate": 1.0892678034102307e-05, + "loss": 2.1885, + "theoretical_loss": 3.3021047787074584, + "tokens_seen": 3229898752 + }, + { + "epoch": 10.08, + "learning_rate": 1.0882647943831495e-05, + "loss": 2.4357, + "theoretical_loss": 3.3020997041844957, + "tokens_seen": 3229964288 + }, + { + "epoch": 10.08, + "learning_rate": 1.0872617853560683e-05, + "loss": 2.369, + "theoretical_loss": 3.3020946297933227, + "tokens_seen": 3230029824 + }, + { + "epoch": 10.08, + "learning_rate": 1.0862587763289869e-05, + "loss": 2.3716, + "theoretical_loss": 3.302089555533933, + "tokens_seen": 3230095360 + }, + { + "epoch": 10.08, + "learning_rate": 1.0852557673019058e-05, + "loss": 2.2388, + "theoretical_loss": 3.3020844814063213, + "tokens_seen": 3230160896 + }, + { + "epoch": 10.08, + "learning_rate": 1.0842527582748245e-05, + "loss": 2.3235, + "theoretical_loss": 3.3020794074104813, + "tokens_seen": 3230226432 + }, + { + "epoch": 10.08, + "learning_rate": 1.0832497492477433e-05, + "loss": 2.3908, + "theoretical_loss": 3.3020743335464062, + "tokens_seen": 3230291968 + }, + { + "epoch": 10.08, + "learning_rate": 1.0822467402206619e-05, + "loss": 2.4095, + "theoretical_loss": 3.3020692598140906, + "tokens_seen": 3230357504 + }, + { + "epoch": 10.08, + "learning_rate": 1.0812437311935808e-05, + "loss": 2.4189, + "theoretical_loss": 3.3020641862135283, + "tokens_seen": 3230423040 + }, + { + "epoch": 10.08, + "learning_rate": 1.0802407221664996e-05, + "loss": 2.3557, + "theoretical_loss": 3.302059112744713, + "tokens_seen": 3230488576 + }, + { + "epoch": 10.08, + "learning_rate": 1.0792377131394182e-05, + "loss": 2.3543, + "theoretical_loss": 3.302054039407639, + "tokens_seen": 3230554112 + }, + { + "epoch": 10.08, + "learning_rate": 1.078234704112337e-05, + "loss": 2.4227, + "theoretical_loss": 3.3020489662022996, + "tokens_seen": 3230619648 + }, + { + "epoch": 10.08, + "learning_rate": 1.0772316950852556e-05, + "loss": 2.2115, + "theoretical_loss": 3.302043893128689, + "tokens_seen": 3230685184 + }, + { + "epoch": 10.08, + "learning_rate": 1.0762286860581746e-05, + "loss": 2.3463, + "theoretical_loss": 3.3020388201868016, + "tokens_seen": 3230750720 + }, + { + "epoch": 10.08, + "learning_rate": 1.0752256770310934e-05, + "loss": 2.3152, + "theoretical_loss": 3.3020337473766306, + "tokens_seen": 3230816256 + }, + { + "epoch": 10.08, + "learning_rate": 1.074222668004012e-05, + "loss": 2.4466, + "theoretical_loss": 3.30202867469817, + "tokens_seen": 3230881792 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3515935, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.977961540222168, + "objective/train/theoretical_loss": 3.3020261384083294, + "objective/train/tokens_used": 3251374560, + "theoretical_loss": 3.3020261384083294, + "tokens_seen": 3230914560 + }, + { + "epoch": 10.08, + "learning_rate": 1.0732196589769308e-05, + "loss": 2.5163, + "theoretical_loss": 3.302023602151414, + "tokens_seen": 3230947328 + }, + { + "epoch": 10.08, + "learning_rate": 1.0722166499498494e-05, + "loss": 2.2952, + "theoretical_loss": 3.3020185297363565, + "tokens_seen": 3231012864 + }, + { + "epoch": 10.08, + "learning_rate": 1.0712136409227684e-05, + "loss": 2.4216, + "theoretical_loss": 3.302013457452991, + "tokens_seen": 3231078400 + }, + { + "epoch": 10.08, + "learning_rate": 1.0702106318956872e-05, + "loss": 2.4844, + "theoretical_loss": 3.302008385301312, + "tokens_seen": 3231143936 + }, + { + "epoch": 10.08, + "learning_rate": 1.0692076228686058e-05, + "loss": 2.2311, + "theoretical_loss": 3.302003313281313, + "tokens_seen": 3231209472 + }, + { + "epoch": 10.08, + "learning_rate": 1.0682046138415246e-05, + "loss": 2.314, + "theoretical_loss": 3.301998241392988, + "tokens_seen": 3231275008 + }, + { + "epoch": 10.08, + "learning_rate": 1.0672016048144434e-05, + "loss": 2.5374, + "theoretical_loss": 3.301993169636331, + "tokens_seen": 3231340544 + }, + { + "epoch": 10.08, + "learning_rate": 1.0661985957873622e-05, + "loss": 2.537, + "theoretical_loss": 3.3019880980113356, + "tokens_seen": 3231406080 + }, + { + "epoch": 10.08, + "learning_rate": 1.0651955867602808e-05, + "loss": 2.177, + "theoretical_loss": 3.3019830265179966, + "tokens_seen": 3231471616 + }, + { + "epoch": 10.08, + "learning_rate": 1.0641925777331996e-05, + "loss": 2.4231, + "theoretical_loss": 3.301977955156307, + "tokens_seen": 3231537152 + }, + { + "epoch": 10.08, + "learning_rate": 1.0631895687061184e-05, + "loss": 2.4341, + "theoretical_loss": 3.3019728839262603, + "tokens_seen": 3231602688 + }, + { + "epoch": 10.08, + "learning_rate": 1.0621865596790372e-05, + "loss": 2.4898, + "theoretical_loss": 3.3019678128278516, + "tokens_seen": 3231668224 + }, + { + "epoch": 10.08, + "learning_rate": 1.061183550651956e-05, + "loss": 2.3778, + "theoretical_loss": 3.3019627418610744, + "tokens_seen": 3231733760 + }, + { + "epoch": 10.08, + "learning_rate": 1.0601805416248746e-05, + "loss": 2.4968, + "theoretical_loss": 3.3019576710259226, + "tokens_seen": 3231799296 + }, + { + "epoch": 10.08, + "learning_rate": 1.0591775325977934e-05, + "loss": 2.158, + "theoretical_loss": 3.30195260032239, + "tokens_seen": 3231864832 + }, + { + "epoch": 10.08, + "learning_rate": 1.0581745235707121e-05, + "loss": 2.3042, + "theoretical_loss": 3.3019475297504703, + "tokens_seen": 3231930368 + }, + { + "epoch": 10.08, + "learning_rate": 1.057171514543631e-05, + "loss": 2.29, + "theoretical_loss": 3.301942459310158, + "tokens_seen": 3231995904 + }, + { + "epoch": 10.08, + "learning_rate": 1.0561685055165497e-05, + "loss": 2.5364, + "theoretical_loss": 3.3019373890014463, + "tokens_seen": 3232061440 + }, + { + "epoch": 10.08, + "learning_rate": 1.0551654964894683e-05, + "loss": 2.2517, + "theoretical_loss": 3.30193231882433, + "tokens_seen": 3232126976 + }, + { + "epoch": 10.08, + "learning_rate": 1.0541624874623871e-05, + "loss": 2.4137, + "theoretical_loss": 3.301927248778802, + "tokens_seen": 3232192512 + }, + { + "epoch": 10.08, + "learning_rate": 1.0531594784353061e-05, + "loss": 2.618, + "theoretical_loss": 3.301922178864857, + "tokens_seen": 3232258048 + }, + { + "epoch": 10.08, + "learning_rate": 1.0521564694082247e-05, + "loss": 2.3655, + "theoretical_loss": 3.3019171090824884, + "tokens_seen": 3232323584 + }, + { + "epoch": 10.08, + "learning_rate": 1.0511534603811435e-05, + "loss": 2.3874, + "theoretical_loss": 3.3019120394316905, + "tokens_seen": 3232389120 + }, + { + "epoch": 10.08, + "learning_rate": 1.0501504513540621e-05, + "loss": 2.4822, + "theoretical_loss": 3.3019069699124572, + "tokens_seen": 3232454656 + }, + { + "epoch": 10.08, + "learning_rate": 1.0491474423269809e-05, + "loss": 2.3329, + "theoretical_loss": 3.3019019005247827, + "tokens_seen": 3232520192 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3516371, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5806937217712402, + "objective/train/theoretical_loss": 3.3018993658802778, + "objective/train/tokens_used": 3253012960, + "theoretical_loss": 3.3018993658802778, + "tokens_seen": 3232552960 + }, + { + "epoch": 10.08, + "learning_rate": 1.0481444332998997e-05, + "loss": 2.4884, + "theoretical_loss": 3.30189683126866, + "tokens_seen": 3232585728 + }, + { + "epoch": 10.08, + "learning_rate": 1.0471414242728185e-05, + "loss": 2.4811, + "theoretical_loss": 3.3018917621440833, + "tokens_seen": 3232651264 + }, + { + "epoch": 10.08, + "learning_rate": 1.0461384152457373e-05, + "loss": 2.363, + "theoretical_loss": 3.3018866931510473, + "tokens_seen": 3232716800 + }, + { + "epoch": 10.08, + "learning_rate": 1.0451354062186559e-05, + "loss": 2.3398, + "theoretical_loss": 3.3018816242895452, + "tokens_seen": 3232782336 + }, + { + "epoch": 10.08, + "learning_rate": 1.0441323971915747e-05, + "loss": 2.4768, + "theoretical_loss": 3.301876555559571, + "tokens_seen": 3232847872 + }, + { + "epoch": 10.08, + "learning_rate": 1.0431293881644935e-05, + "loss": 2.5706, + "theoretical_loss": 3.3018714869611188, + "tokens_seen": 3232913408 + }, + { + "epoch": 10.08, + "learning_rate": 1.0421263791374123e-05, + "loss": 2.5965, + "theoretical_loss": 3.3018664184941824, + "tokens_seen": 3232978944 + }, + { + "epoch": 10.08, + "learning_rate": 1.041123370110331e-05, + "loss": 2.5617, + "theoretical_loss": 3.301861350158756, + "tokens_seen": 3233044480 + }, + { + "epoch": 10.08, + "learning_rate": 1.0401203610832497e-05, + "loss": 2.5326, + "theoretical_loss": 3.3018562819548327, + "tokens_seen": 3233110016 + }, + { + "epoch": 10.08, + "learning_rate": 1.0391173520561686e-05, + "loss": 2.4064, + "theoretical_loss": 3.301851213882408, + "tokens_seen": 3233175552 + }, + { + "epoch": 10.08, + "learning_rate": 1.0381143430290873e-05, + "loss": 2.5795, + "theoretical_loss": 3.301846145941474, + "tokens_seen": 3233241088 + }, + { + "epoch": 10.08, + "learning_rate": 1.037111334002006e-05, + "loss": 2.556, + "theoretical_loss": 3.3018410781320258, + "tokens_seen": 3233306624 + }, + { + "epoch": 10.08, + "learning_rate": 1.0361083249749248e-05, + "loss": 2.6238, + "theoretical_loss": 3.301836010454057, + "tokens_seen": 3233372160 + }, + { + "epoch": 10.08, + "learning_rate": 1.0351053159478435e-05, + "loss": 2.1335, + "theoretical_loss": 3.301830942907561, + "tokens_seen": 3233437696 + }, + { + "epoch": 10.08, + "learning_rate": 1.0341023069207624e-05, + "loss": 2.339, + "theoretical_loss": 3.3018258754925327, + "tokens_seen": 3233503232 + }, + { + "epoch": 10.08, + "learning_rate": 1.033099297893681e-05, + "loss": 2.412, + "theoretical_loss": 3.3018208082089657, + "tokens_seen": 3233568768 + }, + { + "epoch": 10.08, + "learning_rate": 1.0320962888665998e-05, + "loss": 2.3212, + "theoretical_loss": 3.3018157410568536, + "tokens_seen": 3233634304 + }, + { + "epoch": 10.08, + "learning_rate": 1.0310932798395184e-05, + "loss": 2.3477, + "theoretical_loss": 3.30181067403619, + "tokens_seen": 3233699840 + }, + { + "epoch": 10.08, + "learning_rate": 1.0300902708124372e-05, + "loss": 2.4572, + "theoretical_loss": 3.30180560714697, + "tokens_seen": 3233765376 + }, + { + "epoch": 10.08, + "learning_rate": 1.0290872617853562e-05, + "loss": 2.3393, + "theoretical_loss": 3.301800540389187, + "tokens_seen": 3233830912 + }, + { + "epoch": 10.08, + "learning_rate": 1.0280842527582748e-05, + "loss": 2.429, + "theoretical_loss": 3.301795473762834, + "tokens_seen": 3233896448 + }, + { + "epoch": 10.08, + "learning_rate": 1.0270812437311936e-05, + "loss": 2.618, + "theoretical_loss": 3.301790407267906, + "tokens_seen": 3233961984 + }, + { + "epoch": 10.08, + "learning_rate": 1.0260782347041122e-05, + "loss": 2.4066, + "theoretical_loss": 3.301785340904397, + "tokens_seen": 3234027520 + }, + { + "epoch": 10.08, + "learning_rate": 1.0250752256770312e-05, + "loss": 2.3789, + "theoretical_loss": 3.3017802746723004, + "tokens_seen": 3234093056 + }, + { + "epoch": 10.08, + "learning_rate": 1.02407221664995e-05, + "loss": 2.5891, + "theoretical_loss": 3.3017752085716103, + "tokens_seen": 3234158592 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3517792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7262346744537354, + "objective/train/theoretical_loss": 3.3017726755705405, + "objective/train/tokens_used": 3254651360, + "theoretical_loss": 3.3017726755705405, + "tokens_seen": 3234191360 + }, + { + "epoch": 10.08, + "learning_rate": 1.0230692076228686e-05, + "loss": 2.4527, + "theoretical_loss": 3.3017701426023205, + "tokens_seen": 3234224128 + }, + { + "epoch": 10.08, + "learning_rate": 1.0220661985957874e-05, + "loss": 2.4123, + "theoretical_loss": 3.301765076764425, + "tokens_seen": 3234289664 + }, + { + "epoch": 10.08, + "learning_rate": 1.021063189568706e-05, + "loss": 2.2502, + "theoretical_loss": 3.301760011057918, + "tokens_seen": 3234355200 + }, + { + "epoch": 10.08, + "learning_rate": 1.020060180541625e-05, + "loss": 2.5823, + "theoretical_loss": 3.3017549454827932, + "tokens_seen": 3234420736 + }, + { + "epoch": 10.08, + "learning_rate": 1.0190571715145438e-05, + "loss": 2.4931, + "theoretical_loss": 3.3017498800390443, + "tokens_seen": 3234486272 + }, + { + "epoch": 10.08, + "learning_rate": 1.0180541624874624e-05, + "loss": 2.4215, + "theoretical_loss": 3.301744814726666, + "tokens_seen": 3234551808 + }, + { + "epoch": 10.08, + "learning_rate": 1.0170511534603812e-05, + "loss": 2.2934, + "theoretical_loss": 3.3017397495456513, + "tokens_seen": 3234617344 + }, + { + "epoch": 10.08, + "learning_rate": 1.0160481444332998e-05, + "loss": 2.2575, + "theoretical_loss": 3.3017346844959947, + "tokens_seen": 3234682880 + }, + { + "epoch": 10.08, + "learning_rate": 1.0150451354062187e-05, + "loss": 2.5069, + "theoretical_loss": 3.3017296195776895, + "tokens_seen": 3234748416 + }, + { + "epoch": 10.08, + "learning_rate": 1.0140421263791375e-05, + "loss": 2.3553, + "theoretical_loss": 3.301724554790731, + "tokens_seen": 3234813952 + }, + { + "epoch": 10.08, + "learning_rate": 1.0130391173520562e-05, + "loss": 2.4964, + "theoretical_loss": 3.3017194901351115, + "tokens_seen": 3234879488 + }, + { + "epoch": 10.08, + "learning_rate": 1.012036108324975e-05, + "loss": 2.5157, + "theoretical_loss": 3.3017144256108257, + "tokens_seen": 3234945024 + }, + { + "epoch": 10.08, + "learning_rate": 1.0110330992978937e-05, + "loss": 2.5238, + "theoretical_loss": 3.301709361217868, + "tokens_seen": 3235010560 + }, + { + "epoch": 10.08, + "learning_rate": 1.0100300902708125e-05, + "loss": 2.3887, + "theoretical_loss": 3.301704296956231, + "tokens_seen": 3235076096 + }, + { + "epoch": 10.08, + "learning_rate": 1.0090270812437311e-05, + "loss": 2.3989, + "theoretical_loss": 3.3016992328259103, + "tokens_seen": 3235141632 + }, + { + "epoch": 10.08, + "learning_rate": 1.00802407221665e-05, + "loss": 2.4389, + "theoretical_loss": 3.3016941688268986, + "tokens_seen": 3235207168 + }, + { + "epoch": 10.08, + "learning_rate": 1.0070210631895687e-05, + "loss": 2.3456, + "theoretical_loss": 3.3016891049591903, + "tokens_seen": 3235272704 + }, + { + "epoch": 10.08, + "learning_rate": 1.0060180541624875e-05, + "loss": 2.282, + "theoretical_loss": 3.3016840412227797, + "tokens_seen": 3235338240 + }, + { + "epoch": 10.08, + "learning_rate": 1.0050150451354063e-05, + "loss": 2.4522, + "theoretical_loss": 3.3016789776176596, + "tokens_seen": 3235403776 + }, + { + "epoch": 10.08, + "learning_rate": 1.004012036108325e-05, + "loss": 2.4948, + "theoretical_loss": 3.301673914143825, + "tokens_seen": 3235469312 + }, + { + "epoch": 10.08, + "learning_rate": 1.0030090270812437e-05, + "loss": 2.4325, + "theoretical_loss": 3.3016688508012697, + "tokens_seen": 3235534848 + }, + { + "epoch": 10.08, + "learning_rate": 1.0020060180541625e-05, + "loss": 2.4534, + "theoretical_loss": 3.3016637875899875, + "tokens_seen": 3235600384 + }, + { + "epoch": 10.08, + "learning_rate": 1.0010030090270813e-05, + "loss": 1.9207, + "theoretical_loss": 3.3016587245099718, + "tokens_seen": 3235665920 + }, + { + "epoch": 10.08, + "learning_rate": 1e-05, + "loss": 2.4734, + "theoretical_loss": 3.3016536615612173, + "tokens_seen": 3235731456 + }, + { + "epoch": 10.08, + "learning_rate": 9.989969909729187e-06, + "loss": 2.5852, + "theoretical_loss": 3.3016485987437174, + "tokens_seen": 3235796992 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3518571, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.528883457183838, + "objective/train/theoretical_loss": 3.301646067384186, + "objective/train/tokens_used": 3256289760, + "theoretical_loss": 3.301646067384186, + "tokens_seen": 3235829760 + }, + { + "epoch": 10.08, + "learning_rate": 9.979939819458375e-06, + "loss": 2.3827, + "theoretical_loss": 3.3016435360574663, + "tokens_seen": 3235862528 + }, + { + "epoch": 10.08, + "learning_rate": 9.969909729187565e-06, + "loss": 2.5567, + "theoretical_loss": 3.301638473502458, + "tokens_seen": 3235928064 + }, + { + "epoch": 10.08, + "learning_rate": 9.95987963891675e-06, + "loss": 2.2256, + "theoretical_loss": 3.3016334110786865, + "tokens_seen": 3235993600 + }, + { + "epoch": 10.08, + "learning_rate": 9.949849548645939e-06, + "loss": 2.4984, + "theoretical_loss": 3.3016283487861458, + "tokens_seen": 3236059136 + }, + { + "epoch": 10.08, + "learning_rate": 9.939819458375125e-06, + "loss": 2.3416, + "theoretical_loss": 3.301623286624829, + "tokens_seen": 3236124672 + }, + { + "epoch": 10.08, + "learning_rate": 9.929789368104313e-06, + "loss": 2.5589, + "theoretical_loss": 3.301618224594731, + "tokens_seen": 3236190208 + }, + { + "epoch": 10.08, + "learning_rate": 9.9197592778335e-06, + "loss": 2.5286, + "theoretical_loss": 3.3016131626958454, + "tokens_seen": 3236255744 + }, + { + "epoch": 10.08, + "learning_rate": 9.909729187562688e-06, + "loss": 2.5029, + "theoretical_loss": 3.301608100928166, + "tokens_seen": 3236321280 + }, + { + "epoch": 10.08, + "learning_rate": 9.899699097291876e-06, + "loss": 2.3199, + "theoretical_loss": 3.3016030392916873, + "tokens_seen": 3236386816 + }, + { + "epoch": 10.08, + "learning_rate": 9.889669007021063e-06, + "loss": 2.4939, + "theoretical_loss": 3.301597977786403, + "tokens_seen": 3236452352 + }, + { + "epoch": 10.08, + "learning_rate": 9.87963891675025e-06, + "loss": 2.3943, + "theoretical_loss": 3.3015929164123063, + "tokens_seen": 3236517888 + }, + { + "epoch": 10.08, + "learning_rate": 9.869608826479438e-06, + "loss": 2.3051, + "theoretical_loss": 3.301587855169392, + "tokens_seen": 3236583424 + }, + { + "epoch": 10.08, + "learning_rate": 9.859578736208626e-06, + "loss": 2.3186, + "theoretical_loss": 3.3015827940576536, + "tokens_seen": 3236648960 + }, + { + "epoch": 10.08, + "learning_rate": 9.849548645937814e-06, + "loss": 2.3929, + "theoretical_loss": 3.3015777330770857, + "tokens_seen": 3236714496 + }, + { + "epoch": 10.08, + "learning_rate": 9.839518555667e-06, + "loss": 2.3954, + "theoretical_loss": 3.3015726722276812, + "tokens_seen": 3236780032 + }, + { + "epoch": 10.08, + "learning_rate": 9.82948846539619e-06, + "loss": 2.4416, + "theoretical_loss": 3.301567611509435, + "tokens_seen": 3236845568 + }, + { + "epoch": 10.08, + "learning_rate": 9.819458375125376e-06, + "loss": 2.2968, + "theoretical_loss": 3.3015625509223403, + "tokens_seen": 3236911104 + }, + { + "epoch": 10.08, + "learning_rate": 9.809428284854564e-06, + "loss": 2.5926, + "theoretical_loss": 3.301557490466392, + "tokens_seen": 3236976640 + }, + { + "epoch": 10.08, + "learning_rate": 9.799398194583752e-06, + "loss": 2.5175, + "theoretical_loss": 3.301552430141583, + "tokens_seen": 3237042176 + }, + { + "epoch": 10.08, + "learning_rate": 9.789368104312938e-06, + "loss": 2.6344, + "theoretical_loss": 3.3015473699479077, + "tokens_seen": 3237107712 + }, + { + "epoch": 10.08, + "learning_rate": 9.779338014042128e-06, + "loss": 2.3115, + "theoretical_loss": 3.3015423098853605, + "tokens_seen": 3237173248 + }, + { + "epoch": 10.08, + "learning_rate": 9.769307923771314e-06, + "loss": 2.3598, + "theoretical_loss": 3.301537249953934, + "tokens_seen": 3237238784 + }, + { + "epoch": 10.08, + "learning_rate": 9.759277833500502e-06, + "loss": 2.3948, + "theoretical_loss": 3.301532190153624, + "tokens_seen": 3237304320 + }, + { + "epoch": 10.08, + "learning_rate": 9.749247743229688e-06, + "loss": 2.2279, + "theoretical_loss": 3.301527130484423, + "tokens_seen": 3237369856 + }, + { + "epoch": 10.08, + "learning_rate": 9.739217652958876e-06, + "loss": 2.2335, + "theoretical_loss": 3.3015220709463255, + "tokens_seen": 3237435392 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3519061, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.659114122390747, + "objective/train/theoretical_loss": 3.3015195412264386, + "objective/train/tokens_used": 3257928160, + "theoretical_loss": 3.3015195412264386, + "tokens_seen": 3237468160 + }, + { + "epoch": 10.08, + "learning_rate": 9.729187562688066e-06, + "loss": 2.6597, + "theoretical_loss": 3.3015170115393255, + "tokens_seen": 3237500928 + }, + { + "epoch": 10.08, + "learning_rate": 9.719157472417252e-06, + "loss": 2.4904, + "theoretical_loss": 3.301511952263416, + "tokens_seen": 3237566464 + }, + { + "epoch": 10.08, + "learning_rate": 9.70912738214644e-06, + "loss": 2.363, + "theoretical_loss": 3.301506893118593, + "tokens_seen": 3237632000 + }, + { + "epoch": 10.08, + "learning_rate": 9.699097291875626e-06, + "loss": 2.3222, + "theoretical_loss": 3.3015018341048488, + "tokens_seen": 3237697536 + }, + { + "epoch": 10.08, + "learning_rate": 9.689067201604815e-06, + "loss": 2.6333, + "theoretical_loss": 3.3014967752221778, + "tokens_seen": 3237763072 + }, + { + "epoch": 10.08, + "learning_rate": 9.679037111334003e-06, + "loss": 2.3965, + "theoretical_loss": 3.3014917164705735, + "tokens_seen": 3237828608 + }, + { + "epoch": 10.08, + "learning_rate": 9.66900702106319e-06, + "loss": 2.5245, + "theoretical_loss": 3.301486657850031, + "tokens_seen": 3237894144 + }, + { + "epoch": 10.08, + "learning_rate": 9.658976930792377e-06, + "loss": 2.3749, + "theoretical_loss": 3.3014815993605433, + "tokens_seen": 3237959680 + }, + { + "epoch": 10.08, + "learning_rate": 9.648946840521564e-06, + "loss": 2.434, + "theoretical_loss": 3.301476541002104, + "tokens_seen": 3238025216 + }, + { + "epoch": 10.08, + "learning_rate": 9.638916750250753e-06, + "loss": 2.5621, + "theoretical_loss": 3.3014714827747085, + "tokens_seen": 3238090752 + }, + { + "epoch": 10.08, + "learning_rate": 9.628886659979941e-06, + "loss": 2.2791, + "theoretical_loss": 3.3014664246783494, + "tokens_seen": 3238156288 + }, + { + "epoch": 10.08, + "learning_rate": 9.618856569709127e-06, + "loss": 2.4436, + "theoretical_loss": 3.3014613667130215, + "tokens_seen": 3238221824 + }, + { + "epoch": 10.08, + "learning_rate": 9.608826479438315e-06, + "loss": 2.4391, + "theoretical_loss": 3.301456308878718, + "tokens_seen": 3238287360 + }, + { + "epoch": 10.08, + "learning_rate": 9.598796389167501e-06, + "loss": 2.3293, + "theoretical_loss": 3.3014512511754335, + "tokens_seen": 3238352896 + }, + { + "epoch": 10.08, + "learning_rate": 9.588766298896691e-06, + "loss": 2.3469, + "theoretical_loss": 3.3014461936031614, + "tokens_seen": 3238418432 + }, + { + "epoch": 10.08, + "learning_rate": 9.578736208625879e-06, + "loss": 2.2787, + "theoretical_loss": 3.3014411361618965, + "tokens_seen": 3238483968 + }, + { + "epoch": 10.08, + "learning_rate": 9.568706118355065e-06, + "loss": 2.5473, + "theoretical_loss": 3.3014360788516317, + "tokens_seen": 3238549504 + }, + { + "epoch": 10.08, + "learning_rate": 9.558676028084253e-06, + "loss": 2.2631, + "theoretical_loss": 3.3014310216723617, + "tokens_seen": 3238615040 + }, + { + "epoch": 10.08, + "learning_rate": 9.54864593781344e-06, + "loss": 2.1624, + "theoretical_loss": 3.3014259646240807, + "tokens_seen": 3238680576 + }, + { + "epoch": 10.08, + "learning_rate": 9.538615847542629e-06, + "loss": 2.5383, + "theoretical_loss": 3.3014209077067815, + "tokens_seen": 3238746112 + }, + { + "epoch": 10.08, + "learning_rate": 9.528585757271815e-06, + "loss": 2.4368, + "theoretical_loss": 3.301415850920459, + "tokens_seen": 3238811648 + }, + { + "epoch": 10.08, + "learning_rate": 9.518555667001003e-06, + "loss": 2.3841, + "theoretical_loss": 3.301410794265107, + "tokens_seen": 3238877184 + }, + { + "epoch": 10.08, + "learning_rate": 9.50852557673019e-06, + "loss": 2.3346, + "theoretical_loss": 3.3014057377407195, + "tokens_seen": 3238942720 + }, + { + "epoch": 10.08, + "learning_rate": 9.498495486459379e-06, + "loss": 2.3811, + "theoretical_loss": 3.30140068134729, + "tokens_seen": 3239008256 + }, + { + "epoch": 10.08, + "learning_rate": 9.488465396188567e-06, + "loss": 1.9283, + "theoretical_loss": 3.301395625084813, + "tokens_seen": 3239073792 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3520349, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.600020408630371, + "objective/train/theoretical_loss": 3.3013930970026797, + "objective/train/tokens_used": 3259566560, + "theoretical_loss": 3.3013930970026797, + "tokens_seen": 3239106560 + }, + { + "epoch": 10.08, + "learning_rate": 9.478435305917753e-06, + "loss": 2.4407, + "theoretical_loss": 3.3013905689532823, + "tokens_seen": 3239139328 + }, + { + "epoch": 10.08, + "learning_rate": 9.46840521564694e-06, + "loss": 2.4187, + "theoretical_loss": 3.3013855129526917, + "tokens_seen": 3239204864 + }, + { + "epoch": 10.08, + "learning_rate": 9.458375125376129e-06, + "loss": 2.5704, + "theoretical_loss": 3.301380457083035, + "tokens_seen": 3239270400 + }, + { + "epoch": 10.08, + "learning_rate": 9.448345035105316e-06, + "loss": 2.0848, + "theoretical_loss": 3.301375401344307, + "tokens_seen": 3239335936 + }, + { + "epoch": 10.08, + "learning_rate": 9.438314944834504e-06, + "loss": 2.3836, + "theoretical_loss": 3.3013703457365002, + "tokens_seen": 3239401472 + }, + { + "epoch": 10.08, + "learning_rate": 9.42828485456369e-06, + "loss": 2.3408, + "theoretical_loss": 3.3013652902596102, + "tokens_seen": 3239467008 + }, + { + "epoch": 10.08, + "learning_rate": 9.418254764292878e-06, + "loss": 2.4303, + "theoretical_loss": 3.3013602349136297, + "tokens_seen": 3239532544 + }, + { + "epoch": 10.08, + "learning_rate": 9.408224674022065e-06, + "loss": 2.0596, + "theoretical_loss": 3.301355179698554, + "tokens_seen": 3239598080 + }, + { + "epoch": 10.08, + "learning_rate": 9.398194583751254e-06, + "loss": 2.2926, + "theoretical_loss": 3.3013501246143755, + "tokens_seen": 3239663616 + }, + { + "epoch": 10.08, + "learning_rate": 9.388164493480442e-06, + "loss": 2.318, + "theoretical_loss": 3.301345069661089, + "tokens_seen": 3239729152 + }, + { + "epoch": 10.08, + "learning_rate": 9.378134403209628e-06, + "loss": 2.5099, + "theoretical_loss": 3.3013400148386887, + "tokens_seen": 3239794688 + }, + { + "epoch": 10.08, + "learning_rate": 9.368104312938816e-06, + "loss": 2.189, + "theoretical_loss": 3.3013349601471678, + "tokens_seen": 3239860224 + }, + { + "epoch": 10.08, + "learning_rate": 9.358074222668004e-06, + "loss": 2.5333, + "theoretical_loss": 3.301329905586521, + "tokens_seen": 3239925760 + }, + { + "epoch": 10.08, + "learning_rate": 9.348044132397192e-06, + "loss": 2.3768, + "theoretical_loss": 3.3013248511567417, + "tokens_seen": 3239991296 + }, + { + "epoch": 10.08, + "learning_rate": 9.33801404212638e-06, + "loss": 2.4843, + "theoretical_loss": 3.301319796857824, + "tokens_seen": 3240056832 + }, + { + "epoch": 10.08, + "learning_rate": 9.327983951855566e-06, + "loss": 2.3673, + "theoretical_loss": 3.301314742689762, + "tokens_seen": 3240122368 + }, + { + "epoch": 10.08, + "learning_rate": 9.317953861584754e-06, + "loss": 2.4485, + "theoretical_loss": 3.30130968865255, + "tokens_seen": 3240187904 + }, + { + "epoch": 10.08, + "learning_rate": 9.307923771313942e-06, + "loss": 2.4307, + "theoretical_loss": 3.301304634746182, + "tokens_seen": 3240253440 + }, + { + "epoch": 10.08, + "learning_rate": 9.29789368104313e-06, + "loss": 2.1967, + "theoretical_loss": 3.3012995809706505, + "tokens_seen": 3240318976 + }, + { + "epoch": 10.08, + "learning_rate": 9.287863590772318e-06, + "loss": 2.4123, + "theoretical_loss": 3.3012945273259513, + "tokens_seen": 3240384512 + }, + { + "epoch": 10.08, + "learning_rate": 9.277833500501504e-06, + "loss": 2.429, + "theoretical_loss": 3.3012894738120773, + "tokens_seen": 3240450048 + }, + { + "epoch": 10.08, + "learning_rate": 9.267803410230692e-06, + "loss": 2.2227, + "theoretical_loss": 3.301284420429023, + "tokens_seen": 3240515584 + }, + { + "epoch": 10.08, + "learning_rate": 9.25777331995988e-06, + "loss": 2.186, + "theoretical_loss": 3.301279367176782, + "tokens_seen": 3240581120 + }, + { + "epoch": 10.08, + "learning_rate": 9.247743229689068e-06, + "loss": 2.3545, + "theoretical_loss": 3.301274314055348, + "tokens_seen": 3240646656 + }, + { + "epoch": 10.08, + "learning_rate": 9.237713139418256e-06, + "loss": 2.0592, + "theoretical_loss": 3.301269261064716, + "tokens_seen": 3240712192 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3520882, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.8807461261749268, + "objective/train/theoretical_loss": 3.3012667346184488, + "objective/train/tokens_used": 3261204960, + "theoretical_loss": 3.3012667346184488, + "tokens_seen": 3240744960 + }, + { + "epoch": 10.08, + "learning_rate": 9.227683049147442e-06, + "loss": 2.2082, + "theoretical_loss": 3.301264208204879, + "tokens_seen": 3240777728 + }, + { + "epoch": 10.08, + "learning_rate": 9.217652958876631e-06, + "loss": 2.4456, + "theoretical_loss": 3.3012591554758317, + "tokens_seen": 3240843264 + }, + { + "epoch": 10.08, + "learning_rate": 9.207622868605818e-06, + "loss": 2.4103, + "theoretical_loss": 3.3012541028775675, + "tokens_seen": 3240908800 + }, + { + "epoch": 10.08, + "learning_rate": 9.197592778335005e-06, + "loss": 2.3052, + "theoretical_loss": 3.3012490504100804, + "tokens_seen": 3240974336 + }, + { + "epoch": 10.08, + "learning_rate": 9.187562688064192e-06, + "loss": 2.4243, + "theoretical_loss": 3.3012439980733648, + "tokens_seen": 3241039872 + }, + { + "epoch": 10.08, + "learning_rate": 9.17753259779338e-06, + "loss": 2.2079, + "theoretical_loss": 3.3012389458674143, + "tokens_seen": 3241105408 + }, + { + "epoch": 10.08, + "learning_rate": 9.167502507522569e-06, + "loss": 2.381, + "theoretical_loss": 3.3012338937922228, + "tokens_seen": 3241170944 + }, + { + "epoch": 10.08, + "learning_rate": 9.157472417251755e-06, + "loss": 2.2223, + "theoretical_loss": 3.3012288418477844, + "tokens_seen": 3241236480 + }, + { + "epoch": 10.08, + "learning_rate": 9.147442326980943e-06, + "loss": 2.3112, + "theoretical_loss": 3.3012237900340935, + "tokens_seen": 3241302016 + }, + { + "epoch": 10.08, + "learning_rate": 9.13741223671013e-06, + "loss": 2.456, + "theoretical_loss": 3.3012187383511433, + "tokens_seen": 3241367552 + }, + { + "epoch": 10.08, + "learning_rate": 9.127382146439317e-06, + "loss": 2.2912, + "theoretical_loss": 3.3012136867989286, + "tokens_seen": 3241433088 + }, + { + "epoch": 10.08, + "learning_rate": 9.117352056168507e-06, + "loss": 2.2382, + "theoretical_loss": 3.3012086353774426, + "tokens_seen": 3241498624 + }, + { + "epoch": 10.08, + "learning_rate": 9.107321965897693e-06, + "loss": 2.4214, + "theoretical_loss": 3.3012035840866796, + "tokens_seen": 3241564160 + }, + { + "epoch": 10.08, + "learning_rate": 9.097291875626881e-06, + "loss": 2.2139, + "theoretical_loss": 3.301198532926634, + "tokens_seen": 3241629696 + }, + { + "epoch": 10.08, + "learning_rate": 9.087261785356067e-06, + "loss": 2.4077, + "theoretical_loss": 3.3011934818972986, + "tokens_seen": 3241695232 + }, + { + "epoch": 10.08, + "learning_rate": 9.077231695085257e-06, + "loss": 2.4148, + "theoretical_loss": 3.3011884309986685, + "tokens_seen": 3241760768 + }, + { + "epoch": 10.08, + "learning_rate": 9.067201604814445e-06, + "loss": 2.0409, + "theoretical_loss": 3.3011833802307375, + "tokens_seen": 3241826304 + }, + { + "epoch": 10.08, + "learning_rate": 9.057171514543631e-06, + "loss": 2.3039, + "theoretical_loss": 3.3011783295934993, + "tokens_seen": 3241891840 + }, + { + "epoch": 10.08, + "learning_rate": 9.047141424272819e-06, + "loss": 2.3552, + "theoretical_loss": 3.301173279086948, + "tokens_seen": 3241957376 + }, + { + "epoch": 10.08, + "learning_rate": 9.037111334002005e-06, + "loss": 2.3509, + "theoretical_loss": 3.3011682287110773, + "tokens_seen": 3242022912 + }, + { + "epoch": 10.08, + "learning_rate": 9.027081243731195e-06, + "loss": 2.4152, + "theoretical_loss": 3.301163178465882, + "tokens_seen": 3242088448 + }, + { + "epoch": 10.08, + "learning_rate": 9.01705115346038e-06, + "loss": 2.2923, + "theoretical_loss": 3.301158128351355, + "tokens_seen": 3242153984 + }, + { + "epoch": 10.08, + "learning_rate": 9.007021063189569e-06, + "loss": 2.2621, + "theoretical_loss": 3.3011530783674905, + "tokens_seen": 3242219520 + }, + { + "epoch": 10.08, + "learning_rate": 8.996990972918757e-06, + "loss": 2.2113, + "theoretical_loss": 3.3011480285142834, + "tokens_seen": 3242285056 + }, + { + "epoch": 10.08, + "learning_rate": 8.986960882647943e-06, + "loss": 2.1476, + "theoretical_loss": 3.301142978791727, + "tokens_seen": 3242350592 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3524434, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2765543460845947, + "objective/train/theoretical_loss": 3.3011404539794404, + "objective/train/tokens_used": 3262843360, + "theoretical_loss": 3.3011404539794404, + "tokens_seen": 3242383360 + }, + { + "epoch": 10.08, + "learning_rate": 8.976930792377132e-06, + "loss": 2.3098, + "theoretical_loss": 3.3011379291998146, + "tokens_seen": 3242416128 + }, + { + "epoch": 10.08, + "learning_rate": 8.966900702106319e-06, + "loss": 2.1596, + "theoretical_loss": 3.3011328797385415, + "tokens_seen": 3242481664 + }, + { + "epoch": 10.08, + "learning_rate": 8.956870611835506e-06, + "loss": 2.517, + "theoretical_loss": 3.301127830407901, + "tokens_seen": 3242547200 + }, + { + "epoch": 10.08, + "learning_rate": 8.946840521564694e-06, + "loss": 2.0956, + "theoretical_loss": 3.301122781207887, + "tokens_seen": 3242612736 + }, + { + "epoch": 10.08, + "learning_rate": 8.936810431293882e-06, + "loss": 2.1918, + "theoretical_loss": 3.301117732138494, + "tokens_seen": 3242678272 + }, + { + "epoch": 10.08, + "learning_rate": 8.92678034102307e-06, + "loss": 2.2966, + "theoretical_loss": 3.3011126831997153, + "tokens_seen": 3242743808 + }, + { + "epoch": 10.08, + "learning_rate": 8.916750250752256e-06, + "loss": 2.2596, + "theoretical_loss": 3.301107634391545, + "tokens_seen": 3242809344 + }, + { + "epoch": 10.08, + "learning_rate": 8.906720160481444e-06, + "loss": 2.3719, + "theoretical_loss": 3.3011025857139775, + "tokens_seen": 3242874880 + }, + { + "epoch": 10.08, + "learning_rate": 8.896690070210632e-06, + "loss": 2.4326, + "theoretical_loss": 3.3010975371670064, + "tokens_seen": 3242940416 + }, + { + "epoch": 10.08, + "learning_rate": 8.88665997993982e-06, + "loss": 2.4089, + "theoretical_loss": 3.3010924887506263, + "tokens_seen": 3243005952 + }, + { + "epoch": 10.08, + "learning_rate": 8.876629889669008e-06, + "loss": 2.2422, + "theoretical_loss": 3.3010874404648303, + "tokens_seen": 3243071488 + }, + { + "epoch": 10.08, + "learning_rate": 8.866599799398194e-06, + "loss": 2.261, + "theoretical_loss": 3.301082392309613, + "tokens_seen": 3243137024 + }, + { + "epoch": 10.08, + "learning_rate": 8.856569709127382e-06, + "loss": 2.2066, + "theoretical_loss": 3.301077344284968, + "tokens_seen": 3243202560 + }, + { + "epoch": 10.08, + "learning_rate": 8.846539618856568e-06, + "loss": 2.3914, + "theoretical_loss": 3.301072296390889, + "tokens_seen": 3243268096 + }, + { + "epoch": 10.08, + "learning_rate": 8.836509528585758e-06, + "loss": 2.4572, + "theoretical_loss": 3.3010672486273713, + "tokens_seen": 3243333632 + }, + { + "epoch": 10.08, + "learning_rate": 8.826479438314946e-06, + "loss": 2.3575, + "theoretical_loss": 3.3010622009944077, + "tokens_seen": 3243399168 + }, + { + "epoch": 10.08, + "learning_rate": 8.816449348044132e-06, + "loss": 2.2387, + "theoretical_loss": 3.3010571534919926, + "tokens_seen": 3243464704 + }, + { + "epoch": 10.08, + "learning_rate": 8.80641925777332e-06, + "loss": 2.3096, + "theoretical_loss": 3.30105210612012, + "tokens_seen": 3243530240 + }, + { + "epoch": 10.08, + "learning_rate": 8.796389167502508e-06, + "loss": 2.2862, + "theoretical_loss": 3.3010470588787837, + "tokens_seen": 3243595776 + }, + { + "epoch": 10.08, + "learning_rate": 8.786359077231696e-06, + "loss": 2.3062, + "theoretical_loss": 3.301042011767978, + "tokens_seen": 3243661312 + }, + { + "epoch": 10.08, + "learning_rate": 8.776328986960884e-06, + "loss": 2.2965, + "theoretical_loss": 3.3010369647876963, + "tokens_seen": 3243726848 + }, + { + "epoch": 10.08, + "learning_rate": 8.76629889669007e-06, + "loss": 2.4077, + "theoretical_loss": 3.3010319179379333, + "tokens_seen": 3243792384 + }, + { + "epoch": 10.08, + "learning_rate": 8.756268806419258e-06, + "loss": 2.3226, + "theoretical_loss": 3.3010268712186823, + "tokens_seen": 3243857920 + }, + { + "epoch": 10.08, + "learning_rate": 8.746238716148446e-06, + "loss": 2.3469, + "theoretical_loss": 3.301021824629938, + "tokens_seen": 3243923456 + }, + { + "epoch": 10.08, + "learning_rate": 8.736208625877633e-06, + "loss": 2.2519, + "theoretical_loss": 3.301016778171694, + "tokens_seen": 3243988992 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.577930212020874, + "objective/train/theoretical_loss": 3.3010142549915074, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.3010142549915074, + "tokens_seen": 3244021760 + }, + { + "epoch": 10.08, + "learning_rate": 8.726178535606821e-06, + "loss": 2.4266, + "theoretical_loss": 3.301011731843944, + "tokens_seen": 3244054528 + }, + { + "epoch": 10.08, + "learning_rate": 8.716148445336008e-06, + "loss": 2.1291, + "theoretical_loss": 3.3010066856466826, + "tokens_seen": 3244120064 + }, + { + "epoch": 10.08, + "learning_rate": 8.706118355065195e-06, + "loss": 2.2643, + "theoretical_loss": 3.301001639579903, + "tokens_seen": 3244185600 + }, + { + "epoch": 10.08, + "learning_rate": 8.696088264794383e-06, + "loss": 2.4982, + "theoretical_loss": 3.3009965936436005, + "tokens_seen": 3244251136 + }, + { + "epoch": 10.08, + "learning_rate": 8.686058174523571e-06, + "loss": 2.1731, + "theoretical_loss": 3.300991547837768, + "tokens_seen": 3244316672 + }, + { + "epoch": 10.08, + "learning_rate": 8.676028084252759e-06, + "loss": 2.4886, + "theoretical_loss": 3.300986502162399, + "tokens_seen": 3244382208 + }, + { + "epoch": 10.08, + "learning_rate": 8.665997993981945e-06, + "loss": 2.3984, + "theoretical_loss": 3.300981456617489, + "tokens_seen": 3244447744 + }, + { + "epoch": 10.08, + "learning_rate": 8.655967903711135e-06, + "loss": 2.1289, + "theoretical_loss": 3.300976411203031, + "tokens_seen": 3244513280 + }, + { + "epoch": 10.08, + "learning_rate": 8.645937813440321e-06, + "loss": 2.4564, + "theoretical_loss": 3.3009713659190196, + "tokens_seen": 3244578816 + }, + { + "epoch": 10.08, + "learning_rate": 8.635907723169509e-06, + "loss": 2.46, + "theoretical_loss": 3.3009663207654483, + "tokens_seen": 3244644352 + }, + { + "epoch": 10.08, + "learning_rate": 8.625877632898695e-06, + "loss": 2.1995, + "theoretical_loss": 3.300961275742311, + "tokens_seen": 3244709888 + }, + { + "epoch": 10.08, + "learning_rate": 8.615847542627883e-06, + "loss": 2.4033, + "theoretical_loss": 3.300956230849602, + "tokens_seen": 3244775424 + }, + { + "epoch": 10.08, + "learning_rate": 8.605817452357073e-06, + "loss": 2.198, + "theoretical_loss": 3.3009511860873153, + "tokens_seen": 3244840960 + }, + { + "epoch": 10.08, + "learning_rate": 8.595787362086259e-06, + "loss": 2.1878, + "theoretical_loss": 3.300946141455445, + "tokens_seen": 3244906496 + }, + { + "epoch": 10.08, + "learning_rate": 8.585757271815447e-06, + "loss": 1.7795, + "theoretical_loss": 3.3009410969539843, + "tokens_seen": 3244972032 + }, + { + "epoch": 10.08, + "learning_rate": 8.575727181544633e-06, + "loss": 2.4248, + "theoretical_loss": 3.3009360525829283, + "tokens_seen": 3245037568 + }, + { + "epoch": 10.08, + "learning_rate": 8.565697091273821e-06, + "loss": 2.3486, + "theoretical_loss": 3.3009310083422703, + "tokens_seen": 3245103104 + }, + { + "epoch": 10.08, + "learning_rate": 8.55566700100301e-06, + "loss": 2.3404, + "theoretical_loss": 3.300925964232005, + "tokens_seen": 3245168640 + }, + { + "epoch": 10.08, + "learning_rate": 8.545636910732197e-06, + "loss": 2.3156, + "theoretical_loss": 3.3009209202521252, + "tokens_seen": 3245234176 + }, + { + "epoch": 10.08, + "learning_rate": 8.535606820461385e-06, + "loss": 2.2562, + "theoretical_loss": 3.300915876402626, + "tokens_seen": 3245299712 + }, + { + "epoch": 10.08, + "learning_rate": 8.52557673019057e-06, + "loss": 2.4168, + "theoretical_loss": 3.300910832683501, + "tokens_seen": 3245365248 + }, + { + "epoch": 10.08, + "learning_rate": 8.51554663991976e-06, + "loss": 2.384, + "theoretical_loss": 3.3009057890947435, + "tokens_seen": 3245430784 + }, + { + "epoch": 10.08, + "learning_rate": 8.505516549648948e-06, + "loss": 2.3194, + "theoretical_loss": 3.3009007456363486, + "tokens_seen": 3245496320 + }, + { + "epoch": 10.08, + "learning_rate": 8.495486459378134e-06, + "loss": 2.5099, + "theoretical_loss": 3.3008957023083103, + "tokens_seen": 3245561856 + }, + { + "epoch": 10.08, + "learning_rate": 8.485456369107322e-06, + "loss": 2.3956, + "theoretical_loss": 3.3008906591106215, + "tokens_seen": 3245627392 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5550336837768555, + "objective/train/theoretical_loss": 3.3008881375606567, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.3008881375606567, + "tokens_seen": 3245660160 + }, + { + "epoch": 10.08, + "learning_rate": 8.475426278836509e-06, + "loss": 2.2892, + "theoretical_loss": 3.3008856160432773, + "tokens_seen": 3245692928 + }, + { + "epoch": 10.08, + "learning_rate": 8.465396188565698e-06, + "loss": 2.3596, + "theoretical_loss": 3.300880573106271, + "tokens_seen": 3245758464 + }, + { + "epoch": 10.08, + "learning_rate": 8.455366098294884e-06, + "loss": 2.4593, + "theoretical_loss": 3.3008755302995976, + "tokens_seen": 3245824000 + }, + { + "epoch": 10.08, + "learning_rate": 8.445336008024072e-06, + "loss": 2.4368, + "theoretical_loss": 3.3008704876232495, + "tokens_seen": 3245889536 + }, + { + "epoch": 10.08, + "learning_rate": 8.43530591775326e-06, + "loss": 2.2733, + "theoretical_loss": 3.300865445077222, + "tokens_seen": 3245955072 + }, + { + "epoch": 10.08, + "learning_rate": 8.425275827482446e-06, + "loss": 2.5285, + "theoretical_loss": 3.3008604026615087, + "tokens_seen": 3246020608 + }, + { + "epoch": 10.08, + "learning_rate": 8.415245737211636e-06, + "loss": 2.3252, + "theoretical_loss": 3.3008553603761035, + "tokens_seen": 3246086144 + }, + { + "epoch": 10.08, + "learning_rate": 8.405215646940822e-06, + "loss": 2.39, + "theoretical_loss": 3.3008503182210003, + "tokens_seen": 3246151680 + }, + { + "epoch": 10.08, + "learning_rate": 8.39518555667001e-06, + "loss": 2.4463, + "theoretical_loss": 3.3008452761961933, + "tokens_seen": 3246217216 + }, + { + "epoch": 10.08, + "learning_rate": 8.385155466399198e-06, + "loss": 2.4287, + "theoretical_loss": 3.3008402343016767, + "tokens_seen": 3246282752 + }, + { + "epoch": 10.08, + "learning_rate": 8.375125376128386e-06, + "loss": 2.392, + "theoretical_loss": 3.300835192537444, + "tokens_seen": 3246348288 + }, + { + "epoch": 10.08, + "learning_rate": 8.365095285857574e-06, + "loss": 2.34, + "theoretical_loss": 3.3008301509034896, + "tokens_seen": 3246413824 + }, + { + "epoch": 10.08, + "learning_rate": 8.35506519558676e-06, + "loss": 2.3801, + "theoretical_loss": 3.3008251093998076, + "tokens_seen": 3246479360 + }, + { + "epoch": 10.08, + "learning_rate": 8.345035105315948e-06, + "loss": 2.5519, + "theoretical_loss": 3.3008200680263915, + "tokens_seen": 3246544896 + }, + { + "epoch": 10.08, + "learning_rate": 8.335005015045136e-06, + "loss": 2.4637, + "theoretical_loss": 3.3008150267832357, + "tokens_seen": 3246610432 + }, + { + "epoch": 10.08, + "learning_rate": 8.335005015045136e-06, + "loss": 2.0993, + "theoretical_loss": 3.3008099856703343, + "tokens_seen": 3246675968 + }, + { + "epoch": 10.08, + "learning_rate": 8.324974924774324e-06, + "loss": 2.4878, + "theoretical_loss": 3.300804944687681, + "tokens_seen": 3246741504 + }, + { + "epoch": 10.08, + "learning_rate": 8.314944834503512e-06, + "loss": 2.3651, + "theoretical_loss": 3.3007999038352698, + "tokens_seen": 3246807040 + }, + { + "epoch": 10.08, + "learning_rate": 8.304914744232698e-06, + "loss": 2.4874, + "theoretical_loss": 3.300794863113095, + "tokens_seen": 3246872576 + }, + { + "epoch": 10.08, + "learning_rate": 8.294884653961886e-06, + "loss": 2.4691, + "theoretical_loss": 3.3007898225211503, + "tokens_seen": 3246938112 + }, + { + "epoch": 10.08, + "learning_rate": 8.284854563691072e-06, + "loss": 2.1477, + "theoretical_loss": 3.30078478205943, + "tokens_seen": 3247003648 + }, + { + "epoch": 10.08, + "learning_rate": 8.274824473420261e-06, + "loss": 2.2661, + "theoretical_loss": 3.3007797417279274, + "tokens_seen": 3247069184 + }, + { + "epoch": 10.08, + "learning_rate": 8.26479438314945e-06, + "loss": 2.2338, + "theoretical_loss": 3.3007747015266378, + "tokens_seen": 3247134720 + }, + { + "epoch": 10.08, + "learning_rate": 8.254764292878636e-06, + "loss": 2.4278, + "theoretical_loss": 3.300769661455554, + "tokens_seen": 3247200256 + }, + { + "epoch": 10.08, + "learning_rate": 8.244734202607823e-06, + "loss": 2.4355, + "theoretical_loss": 3.3007646215146704, + "tokens_seen": 3247265792 + }, + { + "epoch": 10.08, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3271687030792236, + "objective/train/theoretical_loss": 3.300762101593052, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.300762101593052, + "tokens_seen": 3247298560 + }, + { + "epoch": 10.08, + "learning_rate": 8.234704112337011e-06, + "loss": 2.3265, + "theoretical_loss": 3.300759581703981, + "tokens_seen": 3247331328 + }, + { + "epoch": 10.08, + "learning_rate": 8.2246740220662e-06, + "loss": 2.4962, + "theoretical_loss": 3.3007545420234807, + "tokens_seen": 3247396864 + }, + { + "epoch": 10.08, + "learning_rate": 8.214643931795387e-06, + "loss": 2.1746, + "theoretical_loss": 3.300749502473162, + "tokens_seen": 3247462400 + }, + { + "epoch": 10.08, + "learning_rate": 8.204613841524573e-06, + "loss": 2.429, + "theoretical_loss": 3.30074446305302, + "tokens_seen": 3247527936 + }, + { + "epoch": 10.08, + "learning_rate": 8.194583751253761e-06, + "loss": 2.3296, + "theoretical_loss": 3.300739423763048, + "tokens_seen": 3247593472 + }, + { + "epoch": 10.08, + "learning_rate": 8.184553660982949e-06, + "loss": 2.2576, + "theoretical_loss": 3.30073438460324, + "tokens_seen": 3247659008 + }, + { + "epoch": 10.08, + "learning_rate": 8.174523570712137e-06, + "loss": 2.3669, + "theoretical_loss": 3.300729345573591, + "tokens_seen": 3247724544 + }, + { + "epoch": 10.08, + "learning_rate": 8.164493480441325e-06, + "loss": 2.3366, + "theoretical_loss": 3.300724306674094, + "tokens_seen": 3247790080 + }, + { + "epoch": 10.08, + "learning_rate": 8.154463390170511e-06, + "loss": 2.2634, + "theoretical_loss": 3.3007192679047437, + "tokens_seen": 3247855616 + }, + { + "epoch": 10.08, + "learning_rate": 8.144433299899699e-06, + "loss": 2.3303, + "theoretical_loss": 3.3007142292655334, + "tokens_seen": 3247921152 + }, + { + "epoch": 10.08, + "learning_rate": 8.134403209628887e-06, + "loss": 2.3064, + "theoretical_loss": 3.3007091907564576, + "tokens_seen": 3247986688 + }, + { + "epoch": 10.08, + "learning_rate": 8.124373119358075e-06, + "loss": 2.5113, + "theoretical_loss": 3.3007041523775102, + "tokens_seen": 3248052224 + }, + { + "epoch": 10.08, + "learning_rate": 8.114343029087261e-06, + "loss": 2.3739, + "theoretical_loss": 3.300699114128685, + "tokens_seen": 3248117760 + }, + { + "epoch": 10.09, + "learning_rate": 8.104312938816449e-06, + "loss": 2.2658, + "theoretical_loss": 3.300694076009977, + "tokens_seen": 3248183296 + }, + { + "epoch": 10.09, + "learning_rate": 8.094282848545637e-06, + "loss": 2.3319, + "theoretical_loss": 3.3006890380213787, + "tokens_seen": 3248248832 + }, + { + "epoch": 10.09, + "learning_rate": 8.084252758274825e-06, + "loss": 2.4543, + "theoretical_loss": 3.300684000162885, + "tokens_seen": 3248314368 + }, + { + "epoch": 10.09, + "learning_rate": 8.074222668004013e-06, + "loss": 2.2674, + "theoretical_loss": 3.30067896243449, + "tokens_seen": 3248379904 + }, + { + "epoch": 10.09, + "learning_rate": 8.064192577733199e-06, + "loss": 2.118, + "theoretical_loss": 3.3006739248361874, + "tokens_seen": 3248445440 + }, + { + "epoch": 10.09, + "learning_rate": 8.054162487462387e-06, + "loss": 2.3636, + "theoretical_loss": 3.300668887367971, + "tokens_seen": 3248510976 + }, + { + "epoch": 10.09, + "learning_rate": 8.044132397191576e-06, + "loss": 2.2515, + "theoretical_loss": 3.3006638500298355, + "tokens_seen": 3248576512 + }, + { + "epoch": 10.09, + "learning_rate": 8.034102306920762e-06, + "loss": 2.3666, + "theoretical_loss": 3.300658812821774, + "tokens_seen": 3248642048 + }, + { + "epoch": 10.09, + "learning_rate": 8.02407221664995e-06, + "loss": 2.3191, + "theoretical_loss": 3.3006537757437817, + "tokens_seen": 3248707584 + }, + { + "epoch": 10.09, + "learning_rate": 8.014042126379137e-06, + "loss": 2.4674, + "theoretical_loss": 3.300648738795852, + "tokens_seen": 3248773120 + }, + { + "epoch": 10.09, + "learning_rate": 8.004012036108324e-06, + "loss": 2.3042, + "theoretical_loss": 3.3006437019779784, + "tokens_seen": 3248838656 + }, + { + "epoch": 10.09, + "learning_rate": 7.993981945837514e-06, + "loss": 2.1427, + "theoretical_loss": 3.3006386652901556, + "tokens_seen": 3248904192 + }, + { + "epoch": 10.09, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0486295223236084, + "objective/train/theoretical_loss": 3.3006361469950116, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.3006361469950116, + "tokens_seen": 3248936960 + }, + { + "epoch": 10.09, + "learning_rate": 7.9839518555667e-06, + "loss": 2.3152, + "theoretical_loss": 3.3006336287323776, + "tokens_seen": 3248969728 + }, + { + "epoch": 10.09, + "learning_rate": 7.973921765295888e-06, + "loss": 2.2522, + "theoretical_loss": 3.300628592304638, + "tokens_seen": 3249035264 + }, + { + "epoch": 10.09, + "learning_rate": 7.963891675025074e-06, + "loss": 2.2699, + "theoretical_loss": 3.300623556006931, + "tokens_seen": 3249100800 + }, + { + "epoch": 10.09, + "learning_rate": 7.953861584754262e-06, + "loss": 2.3335, + "theoretical_loss": 3.3006185198392513, + "tokens_seen": 3249166336 + }, + { + "epoch": 10.09, + "learning_rate": 7.943831494483452e-06, + "loss": 2.3745, + "theoretical_loss": 3.300613483801592, + "tokens_seen": 3249231872 + }, + { + "epoch": 10.09, + "learning_rate": 7.933801404212638e-06, + "loss": 2.3692, + "theoretical_loss": 3.3006084478939473, + "tokens_seen": 3249297408 + }, + { + "epoch": 10.09, + "learning_rate": 7.923771313941826e-06, + "loss": 2.1091, + "theoretical_loss": 3.300603412116311, + "tokens_seen": 3249362944 + }, + { + "epoch": 10.09, + "learning_rate": 7.913741223671012e-06, + "loss": 2.3503, + "theoretical_loss": 3.300598376468678, + "tokens_seen": 3249428480 + }, + { + "epoch": 10.09, + "learning_rate": 7.903711133400202e-06, + "loss": 2.3836, + "theoretical_loss": 3.3005933409510417, + "tokens_seen": 3249494016 + }, + { + "epoch": 10.09, + "learning_rate": 7.893681043129388e-06, + "loss": 2.5233, + "theoretical_loss": 3.300588305563396, + "tokens_seen": 3249559552 + }, + { + "epoch": 10.09, + "learning_rate": 7.883650952858576e-06, + "loss": 2.3755, + "theoretical_loss": 3.300583270305735, + "tokens_seen": 3249625088 + }, + { + "epoch": 10.09, + "learning_rate": 7.873620862587764e-06, + "loss": 2.3056, + "theoretical_loss": 3.3005782351780537, + "tokens_seen": 3249690624 + }, + { + "epoch": 10.09, + "learning_rate": 7.86359077231695e-06, + "loss": 2.5515, + "theoretical_loss": 3.3005732001803443, + "tokens_seen": 3249756160 + }, + { + "epoch": 10.09, + "learning_rate": 7.85356068204614e-06, + "loss": 2.4293, + "theoretical_loss": 3.3005681653126024, + "tokens_seen": 3249821696 + }, + { + "epoch": 10.09, + "learning_rate": 7.843530591775326e-06, + "loss": 2.4503, + "theoretical_loss": 3.3005631305748215, + "tokens_seen": 3249887232 + }, + { + "epoch": 10.09, + "learning_rate": 7.833500501504514e-06, + "loss": 2.2899, + "theoretical_loss": 3.3005580959669953, + "tokens_seen": 3249952768 + }, + { + "epoch": 10.09, + "learning_rate": 7.823470411233702e-06, + "loss": 2.3312, + "theoretical_loss": 3.300553061489118, + "tokens_seen": 3250018304 + }, + { + "epoch": 10.09, + "learning_rate": 7.813440320962888e-06, + "loss": 2.4651, + "theoretical_loss": 3.300548027141184, + "tokens_seen": 3250083840 + }, + { + "epoch": 10.09, + "learning_rate": 7.803410230692076e-06, + "loss": 2.3922, + "theoretical_loss": 3.300542992923187, + "tokens_seen": 3250149376 + }, + { + "epoch": 10.09, + "learning_rate": 7.793380140421264e-06, + "loss": 2.4603, + "theoretical_loss": 3.300537958835121, + "tokens_seen": 3250214912 + }, + { + "epoch": 10.09, + "learning_rate": 7.783350050150451e-06, + "loss": 2.1142, + "theoretical_loss": 3.30053292487698, + "tokens_seen": 3250280448 + }, + { + "epoch": 10.09, + "learning_rate": 7.77331995987964e-06, + "loss": 2.2628, + "theoretical_loss": 3.300527891048758, + "tokens_seen": 3250345984 + }, + { + "epoch": 10.09, + "learning_rate": 7.763289869608827e-06, + "loss": 2.3228, + "theoretical_loss": 3.3005228573504493, + "tokens_seen": 3250411520 + }, + { + "epoch": 10.09, + "learning_rate": 7.753259779338015e-06, + "loss": 2.2079, + "theoretical_loss": 3.3005178237820476, + "tokens_seen": 3250477056 + }, + { + "epoch": 10.09, + "learning_rate": 7.743229689067201e-06, + "loss": 2.5616, + "theoretical_loss": 3.3005127903435474, + "tokens_seen": 3250542592 + }, + { + "epoch": 10.09, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3505051136016846, + "objective/train/theoretical_loss": 3.3005102736730083, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.3005102736730083, + "tokens_seen": 3250575360 + }, + { + "epoch": 10.09, + "learning_rate": 7.73319959879639e-06, + "loss": 2.3381, + "theoretical_loss": 3.3005077570349424, + "tokens_seen": 3250608128 + }, + { + "epoch": 10.09, + "learning_rate": 7.723169508525577e-06, + "loss": 2.2762, + "theoretical_loss": 3.3005027238562263, + "tokens_seen": 3250673664 + }, + { + "epoch": 10.09, + "learning_rate": 7.713139418254765e-06, + "loss": 2.3156, + "theoretical_loss": 3.300497690807394, + "tokens_seen": 3250739200 + }, + { + "epoch": 10.09, + "learning_rate": 7.703109327983953e-06, + "loss": 2.2622, + "theoretical_loss": 3.3004926578884386, + "tokens_seen": 3250804736 + }, + { + "epoch": 10.09, + "learning_rate": 7.693079237713139e-06, + "loss": 2.3215, + "theoretical_loss": 3.300487625099355, + "tokens_seen": 3250870272 + }, + { + "epoch": 10.09, + "learning_rate": 7.683049147442327e-06, + "loss": 2.4457, + "theoretical_loss": 3.300482592440136, + "tokens_seen": 3250935808 + }, + { + "epoch": 10.09, + "learning_rate": 7.673019057171515e-06, + "loss": 2.2537, + "theoretical_loss": 3.300477559910777, + "tokens_seen": 3251001344 + }, + { + "epoch": 10.09, + "learning_rate": 7.662988966900701e-06, + "loss": 2.3622, + "theoretical_loss": 3.3004725275112716, + "tokens_seen": 3251066880 + }, + { + "epoch": 10.09, + "learning_rate": 7.65295887662989e-06, + "loss": 2.1478, + "theoretical_loss": 3.3004674952416133, + "tokens_seen": 3251132416 + }, + { + "epoch": 10.09, + "learning_rate": 7.642928786359077e-06, + "loss": 2.3456, + "theoretical_loss": 3.3004624631017965, + "tokens_seen": 3251197952 + }, + { + "epoch": 10.09, + "learning_rate": 7.632898696088265e-06, + "loss": 2.2115, + "theoretical_loss": 3.3004574310918153, + "tokens_seen": 3251263488 + }, + { + "epoch": 10.09, + "learning_rate": 7.622868605817453e-06, + "loss": 2.4513, + "theoretical_loss": 3.3004523992116637, + "tokens_seen": 3251329024 + }, + { + "epoch": 10.09, + "learning_rate": 7.612838515546641e-06, + "loss": 2.3158, + "theoretical_loss": 3.300447367461336, + "tokens_seen": 3251394560 + }, + { + "epoch": 10.09, + "learning_rate": 7.602808425275828e-06, + "loss": 2.3593, + "theoretical_loss": 3.3004423358408257, + "tokens_seen": 3251460096 + }, + { + "epoch": 10.09, + "learning_rate": 7.592778335005015e-06, + "loss": 2.4856, + "theoretical_loss": 3.300437304350127, + "tokens_seen": 3251525632 + }, + { + "epoch": 10.09, + "learning_rate": 7.582748244734203e-06, + "loss": 2.3717, + "theoretical_loss": 3.300432272989234, + "tokens_seen": 3251591168 + }, + { + "epoch": 10.09, + "learning_rate": 7.5727181544633905e-06, + "loss": 2.0971, + "theoretical_loss": 3.3004272417581406, + "tokens_seen": 3251656704 + }, + { + "epoch": 10.09, + "learning_rate": 7.562688064192578e-06, + "loss": 2.5359, + "theoretical_loss": 3.3004222106568415, + "tokens_seen": 3251722240 + }, + { + "epoch": 10.09, + "learning_rate": 7.5526579739217654e-06, + "loss": 2.5914, + "theoretical_loss": 3.30041717968533, + "tokens_seen": 3251787776 + }, + { + "epoch": 10.09, + "learning_rate": 7.542627883650953e-06, + "loss": 2.2252, + "theoretical_loss": 3.3004121488436002, + "tokens_seen": 3251853312 + }, + { + "epoch": 10.09, + "learning_rate": 7.53259779338014e-06, + "loss": 2.2402, + "theoretical_loss": 3.3004071181316466, + "tokens_seen": 3251918848 + }, + { + "epoch": 10.09, + "learning_rate": 7.522567703109327e-06, + "loss": 2.4701, + "theoretical_loss": 3.3004020875494624, + "tokens_seen": 3251984384 + }, + { + "epoch": 10.09, + "learning_rate": 7.512537612838516e-06, + "loss": 2.2272, + "theoretical_loss": 3.3003970570970425, + "tokens_seen": 3252049920 + }, + { + "epoch": 10.09, + "learning_rate": 7.502507522567703e-06, + "loss": 2.32, + "theoretical_loss": 3.3003920267743805, + "tokens_seen": 3252115456 + }, + { + "epoch": 10.09, + "learning_rate": 7.492477432296891e-06, + "loss": 2.1444, + "theoretical_loss": 3.3003869965814703, + "tokens_seen": 3252180992 + }, + { + "epoch": 10.09, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.378401756286621, + "objective/train/theoretical_loss": 3.3003844815336705, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.3003844815336705, + "tokens_seen": 3252213760 + }, + { + "epoch": 10.09, + "learning_rate": 7.482447342026078e-06, + "loss": 2.3743, + "theoretical_loss": 3.3003819665183065, + "tokens_seen": 3252246528 + }, + { + "epoch": 10.09, + "learning_rate": 7.472417251755265e-06, + "loss": 2.468, + "theoretical_loss": 3.300376936584883, + "tokens_seen": 3252312064 + }, + { + "epoch": 10.09, + "learning_rate": 7.462387161484454e-06, + "loss": 2.3108, + "theoretical_loss": 3.3003719067811934, + "tokens_seen": 3252377600 + }, + { + "epoch": 10.09, + "learning_rate": 7.452357071213641e-06, + "loss": 2.1781, + "theoretical_loss": 3.3003668771072316, + "tokens_seen": 3252443136 + }, + { + "epoch": 10.09, + "learning_rate": 7.442326980942829e-06, + "loss": 2.1889, + "theoretical_loss": 3.300361847562993, + "tokens_seen": 3252508672 + }, + { + "epoch": 10.09, + "learning_rate": 7.432296890672016e-06, + "loss": 2.347, + "theoretical_loss": 3.3003568181484697, + "tokens_seen": 3252574208 + }, + { + "epoch": 10.09, + "learning_rate": 7.422266800401204e-06, + "loss": 2.4465, + "theoretical_loss": 3.300351788863657, + "tokens_seen": 3252639744 + }, + { + "epoch": 10.09, + "learning_rate": 7.412236710130391e-06, + "loss": 2.3275, + "theoretical_loss": 3.3003467597085487, + "tokens_seen": 3252705280 + }, + { + "epoch": 10.09, + "learning_rate": 7.402206619859579e-06, + "loss": 2.2674, + "theoretical_loss": 3.3003417306831393, + "tokens_seen": 3252770816 + }, + { + "epoch": 10.09, + "learning_rate": 7.392176529588767e-06, + "loss": 1.9177, + "theoretical_loss": 3.3003367017874217, + "tokens_seen": 3252836352 + }, + { + "epoch": 10.09, + "learning_rate": 7.382146439317954e-06, + "loss": 2.3219, + "theoretical_loss": 3.300331673021391, + "tokens_seen": 3252901888 + }, + { + "epoch": 10.09, + "learning_rate": 7.372116349047142e-06, + "loss": 2.3814, + "theoretical_loss": 3.3003266443850405, + "tokens_seen": 3252967424 + }, + { + "epoch": 10.09, + "learning_rate": 7.362086258776329e-06, + "loss": 2.2931, + "theoretical_loss": 3.3003216158783646, + "tokens_seen": 3253032960 + }, + { + "epoch": 10.09, + "learning_rate": 7.3520561685055174e-06, + "loss": 2.3334, + "theoretical_loss": 3.3003165875013574, + "tokens_seen": 3253098496 + }, + { + "epoch": 10.09, + "learning_rate": 7.3420260782347045e-06, + "loss": 2.3744, + "theoretical_loss": 3.300311559254013, + "tokens_seen": 3253164032 + }, + { + "epoch": 10.09, + "learning_rate": 7.3319959879638915e-06, + "loss": 2.4643, + "theoretical_loss": 3.300306531136325, + "tokens_seen": 3253229568 + }, + { + "epoch": 10.09, + "learning_rate": 7.3219658976930794e-06, + "loss": 2.2006, + "theoretical_loss": 3.300301503148288, + "tokens_seen": 3253295104 + }, + { + "epoch": 10.09, + "learning_rate": 7.3119358074222665e-06, + "loss": 2.2698, + "theoretical_loss": 3.300296475289896, + "tokens_seen": 3253360640 + }, + { + "epoch": 10.09, + "learning_rate": 7.301905717151454e-06, + "loss": 2.2878, + "theoretical_loss": 3.3002914475611425, + "tokens_seen": 3253426176 + }, + { + "epoch": 10.09, + "learning_rate": 7.291875626880642e-06, + "loss": 2.2835, + "theoretical_loss": 3.300286419962022, + "tokens_seen": 3253491712 + }, + { + "epoch": 10.09, + "learning_rate": 7.28184553660983e-06, + "loss": 2.2995, + "theoretical_loss": 3.3002813924925283, + "tokens_seen": 3253557248 + }, + { + "epoch": 10.09, + "learning_rate": 7.271815446339017e-06, + "loss": 2.5539, + "theoretical_loss": 3.3002763651526554, + "tokens_seen": 3253622784 + }, + { + "epoch": 10.09, + "learning_rate": 7.261785356068204e-06, + "loss": 2.6378, + "theoretical_loss": 3.300271337942398, + "tokens_seen": 3253688320 + }, + { + "epoch": 10.09, + "learning_rate": 7.251755265797392e-06, + "loss": 2.2851, + "theoretical_loss": 3.3002663108617494, + "tokens_seen": 3253753856 + }, + { + "epoch": 10.09, + "learning_rate": 7.241725175526579e-06, + "loss": 2.3087, + "theoretical_loss": 3.3002612839107037, + "tokens_seen": 3253819392 + }, + { + "epoch": 10.09, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1509740352630615, + "objective/train/theoretical_loss": 3.3002587704837802, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.3002587704837802, + "tokens_seen": 3253852160 + }, + { + "epoch": 10.09, + "learning_rate": 7.231695085255768e-06, + "loss": 2.3147, + "theoretical_loss": 3.3002562570892557, + "tokens_seen": 3253884928 + }, + { + "epoch": 10.09, + "learning_rate": 7.221664994984955e-06, + "loss": 2.3742, + "theoretical_loss": 3.3002512303973988, + "tokens_seen": 3253950464 + }, + { + "epoch": 10.09, + "learning_rate": 7.211634904714143e-06, + "loss": 1.9995, + "theoretical_loss": 3.300246203835127, + "tokens_seen": 3254016000 + }, + { + "epoch": 10.09, + "learning_rate": 7.20160481444333e-06, + "loss": 2.2605, + "theoretical_loss": 3.3002411774024343, + "tokens_seen": 3254081536 + }, + { + "epoch": 10.09, + "learning_rate": 7.191574724172517e-06, + "loss": 2.6969, + "theoretical_loss": 3.3002361510993152, + "tokens_seen": 3254147072 + }, + { + "epoch": 10.09, + "learning_rate": 7.181544633901706e-06, + "loss": 2.32, + "theoretical_loss": 3.3002311249257636, + "tokens_seen": 3254212608 + }, + { + "epoch": 10.09, + "learning_rate": 7.171514543630893e-06, + "loss": 2.2654, + "theoretical_loss": 3.3002260988817733, + "tokens_seen": 3254278144 + }, + { + "epoch": 10.09, + "learning_rate": 7.161484453360081e-06, + "loss": 2.3183, + "theoretical_loss": 3.3002210729673385, + "tokens_seen": 3254343680 + }, + { + "epoch": 10.09, + "learning_rate": 7.151454363089268e-06, + "loss": 2.3403, + "theoretical_loss": 3.3002160471824538, + "tokens_seen": 3254409216 + }, + { + "epoch": 10.09, + "learning_rate": 7.141424272818456e-06, + "loss": 2.2889, + "theoretical_loss": 3.300211021527112, + "tokens_seen": 3254474752 + }, + { + "epoch": 10.09, + "learning_rate": 7.131394182547643e-06, + "loss": 2.4088, + "theoretical_loss": 3.300205996001308, + "tokens_seen": 3254540288 + }, + { + "epoch": 10.09, + "learning_rate": 7.121364092276831e-06, + "loss": 2.2919, + "theoretical_loss": 3.300200970605036, + "tokens_seen": 3254605824 + }, + { + "epoch": 10.09, + "learning_rate": 7.1113340020060185e-06, + "loss": 2.3813, + "theoretical_loss": 3.30019594533829, + "tokens_seen": 3254671360 + }, + { + "epoch": 10.09, + "learning_rate": 7.1013039117352055e-06, + "loss": 2.333, + "theoretical_loss": 3.3001909202010635, + "tokens_seen": 3254736896 + }, + { + "epoch": 10.09, + "learning_rate": 7.0912738214643934e-06, + "loss": 2.4666, + "theoretical_loss": 3.300185895193351, + "tokens_seen": 3254802432 + }, + { + "epoch": 10.09, + "learning_rate": 7.0812437311935805e-06, + "loss": 2.2202, + "theoretical_loss": 3.3001808703151463, + "tokens_seen": 3254867968 + }, + { + "epoch": 10.09, + "learning_rate": 7.071213640922769e-06, + "loss": 2.2198, + "theoretical_loss": 3.3001758455664434, + "tokens_seen": 3254933504 + }, + { + "epoch": 10.09, + "learning_rate": 7.061183550651956e-06, + "loss": 2.4506, + "theoretical_loss": 3.300170820947237, + "tokens_seen": 3254999040 + }, + { + "epoch": 10.09, + "learning_rate": 7.051153460381143e-06, + "loss": 2.1945, + "theoretical_loss": 3.3001657964575206, + "tokens_seen": 3255064576 + }, + { + "epoch": 10.09, + "learning_rate": 7.041123370110331e-06, + "loss": 2.3706, + "theoretical_loss": 3.3001607720972883, + "tokens_seen": 3255130112 + }, + { + "epoch": 10.09, + "learning_rate": 7.031093279839518e-06, + "loss": 2.5782, + "theoretical_loss": 3.300155747866534, + "tokens_seen": 3255195648 + }, + { + "epoch": 10.09, + "learning_rate": 7.021063189568706e-06, + "loss": 2.4021, + "theoretical_loss": 3.300150723765252, + "tokens_seen": 3255261184 + }, + { + "epoch": 10.09, + "learning_rate": 7.011033099297894e-06, + "loss": 2.3222, + "theoretical_loss": 3.3001456997934366, + "tokens_seen": 3255326720 + }, + { + "epoch": 10.09, + "learning_rate": 7.001003009027082e-06, + "loss": 2.3145, + "theoretical_loss": 3.3001406759510816, + "tokens_seen": 3255392256 + }, + { + "epoch": 10.09, + "learning_rate": 6.990972918756269e-06, + "loss": 2.4147, + "theoretical_loss": 3.300135652238181, + "tokens_seen": 3255457792 + }, + { + "epoch": 10.09, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5501060485839844, + "objective/train/theoretical_loss": 3.3001331404302743, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.3001331404302743, + "tokens_seen": 3255490560 + }, + { + "epoch": 10.09, + "learning_rate": 6.980942828485456e-06, + "loss": 2.4802, + "theoretical_loss": 3.3001306286547285, + "tokens_seen": 3255523328 + }, + { + "epoch": 10.09, + "learning_rate": 6.970912738214644e-06, + "loss": 2.1636, + "theoretical_loss": 3.3001256052007193, + "tokens_seen": 3255588864 + }, + { + "epoch": 10.09, + "learning_rate": 6.960882647943831e-06, + "loss": 2.3435, + "theoretical_loss": 3.300120581876146, + "tokens_seen": 3255654400 + }, + { + "epoch": 10.09, + "learning_rate": 6.95085255767302e-06, + "loss": 2.3653, + "theoretical_loss": 3.300115558681004, + "tokens_seen": 3255719936 + }, + { + "epoch": 10.09, + "learning_rate": 6.940822467402207e-06, + "loss": 2.1611, + "theoretical_loss": 3.3001105356152864, + "tokens_seen": 3255785472 + }, + { + "epoch": 10.09, + "learning_rate": 6.930792377131395e-06, + "loss": 2.4031, + "theoretical_loss": 3.3001055126789876, + "tokens_seen": 3255851008 + }, + { + "epoch": 10.09, + "learning_rate": 6.920762286860582e-06, + "loss": 2.5585, + "theoretical_loss": 3.3001004898721016, + "tokens_seen": 3255916544 + }, + { + "epoch": 10.09, + "learning_rate": 6.910732196589769e-06, + "loss": 2.5046, + "theoretical_loss": 3.300095467194623, + "tokens_seen": 3255982080 + }, + { + "epoch": 10.09, + "learning_rate": 6.9007021063189575e-06, + "loss": 2.3761, + "theoretical_loss": 3.300090444646545, + "tokens_seen": 3256047616 + }, + { + "epoch": 10.09, + "learning_rate": 6.890672016048145e-06, + "loss": 2.4178, + "theoretical_loss": 3.300085422227862, + "tokens_seen": 3256113152 + }, + { + "epoch": 10.09, + "learning_rate": 6.8806419257773325e-06, + "loss": 2.3381, + "theoretical_loss": 3.3000803999385684, + "tokens_seen": 3256178688 + }, + { + "epoch": 10.09, + "learning_rate": 6.8706118355065195e-06, + "loss": 2.4261, + "theoretical_loss": 3.300075377778658, + "tokens_seen": 3256244224 + }, + { + "epoch": 10.09, + "learning_rate": 6.860581745235707e-06, + "loss": 2.2974, + "theoretical_loss": 3.300070355748124, + "tokens_seen": 3256309760 + }, + { + "epoch": 10.09, + "learning_rate": 6.8505516549648945e-06, + "loss": 2.357, + "theoretical_loss": 3.3000653338469625, + "tokens_seen": 3256375296 + }, + { + "epoch": 10.09, + "learning_rate": 6.840521564694082e-06, + "loss": 2.6288, + "theoretical_loss": 3.3000603120751655, + "tokens_seen": 3256440832 + }, + { + "epoch": 10.09, + "learning_rate": 6.83049147442327e-06, + "loss": 2.2103, + "theoretical_loss": 3.3000552904327285, + "tokens_seen": 3256506368 + }, + { + "epoch": 10.09, + "learning_rate": 6.820461384152457e-06, + "loss": 2.2894, + "theoretical_loss": 3.3000502689196445, + "tokens_seen": 3256571904 + }, + { + "epoch": 10.09, + "learning_rate": 6.810431293881645e-06, + "loss": 2.6397, + "theoretical_loss": 3.300045247535908, + "tokens_seen": 3256637440 + }, + { + "epoch": 10.09, + "learning_rate": 6.800401203610832e-06, + "loss": 2.3725, + "theoretical_loss": 3.3000402262815136, + "tokens_seen": 3256702976 + }, + { + "epoch": 10.09, + "learning_rate": 6.79037111334002e-06, + "loss": 2.1934, + "theoretical_loss": 3.3000352051564548, + "tokens_seen": 3256768512 + }, + { + "epoch": 10.09, + "learning_rate": 6.780341023069208e-06, + "loss": 2.2342, + "theoretical_loss": 3.3000301841607254, + "tokens_seen": 3256834048 + }, + { + "epoch": 10.09, + "learning_rate": 6.770310932798395e-06, + "loss": 2.3518, + "theoretical_loss": 3.30002516329432, + "tokens_seen": 3256899584 + }, + { + "epoch": 10.09, + "learning_rate": 6.760280842527583e-06, + "loss": 2.3342, + "theoretical_loss": 3.3000201425572326, + "tokens_seen": 3256965120 + }, + { + "epoch": 10.09, + "learning_rate": 6.75025075225677e-06, + "loss": 2.23, + "theoretical_loss": 3.3000151219494573, + "tokens_seen": 3257030656 + }, + { + "epoch": 10.09, + "learning_rate": 6.740220661985958e-06, + "loss": 2.3281, + "theoretical_loss": 3.300010101470988, + "tokens_seen": 3257096192 + }, + { + "epoch": 10.09, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.602235794067383, + "objective/train/theoretical_loss": 3.300007591280241, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.300007591280241, + "tokens_seen": 3257128960 + }, + { + "epoch": 10.09, + "learning_rate": 6.730190571715146e-06, + "loss": 2.2773, + "theoretical_loss": 3.3000050811218182, + "tokens_seen": 3257161728 + }, + { + "epoch": 10.09, + "learning_rate": 6.720160481444334e-06, + "loss": 2.4107, + "theoretical_loss": 3.3000000609019433, + "tokens_seen": 3257227264 + }, + { + "epoch": 10.09, + "learning_rate": 6.710130391173521e-06, + "loss": 2.0009, + "theoretical_loss": 3.299995040811356, + "tokens_seen": 3257292800 + }, + { + "epoch": 10.09, + "learning_rate": 6.700100300902708e-06, + "loss": 2.1746, + "theoretical_loss": 3.2999900208500517, + "tokens_seen": 3257358336 + }, + { + "epoch": 10.09, + "learning_rate": 6.690070210631896e-06, + "loss": 2.222, + "theoretical_loss": 3.2999850010180234, + "tokens_seen": 3257423872 + }, + { + "epoch": 10.09, + "learning_rate": 6.680040120361083e-06, + "loss": 2.2374, + "theoretical_loss": 3.2999799813152655, + "tokens_seen": 3257489408 + }, + { + "epoch": 10.09, + "learning_rate": 6.6700100300902715e-06, + "loss": 2.4762, + "theoretical_loss": 3.299974961741772, + "tokens_seen": 3257554944 + }, + { + "epoch": 10.09, + "learning_rate": 6.659979939819459e-06, + "loss": 2.093, + "theoretical_loss": 3.2999699422975373, + "tokens_seen": 3257620480 + }, + { + "epoch": 10.09, + "learning_rate": 6.6499498495486465e-06, + "loss": 2.3407, + "theoretical_loss": 3.299964922982555, + "tokens_seen": 3257686016 + }, + { + "epoch": 10.09, + "learning_rate": 6.6399197592778335e-06, + "loss": 2.4707, + "theoretical_loss": 3.2999599037968195, + "tokens_seen": 3257751552 + }, + { + "epoch": 10.09, + "learning_rate": 6.6298896690070206e-06, + "loss": 2.4044, + "theoretical_loss": 3.299954884740325, + "tokens_seen": 3257817088 + }, + { + "epoch": 10.09, + "learning_rate": 6.619859578736209e-06, + "loss": 2.3739, + "theoretical_loss": 3.2999498658130655, + "tokens_seen": 3257882624 + }, + { + "epoch": 10.09, + "learning_rate": 6.609829488465396e-06, + "loss": 2.2639, + "theoretical_loss": 3.299944847015034, + "tokens_seen": 3257948160 + }, + { + "epoch": 10.09, + "learning_rate": 6.599799398194584e-06, + "loss": 2.3593, + "theoretical_loss": 3.2999398283462265, + "tokens_seen": 3258013696 + }, + { + "epoch": 10.09, + "learning_rate": 6.589769307923771e-06, + "loss": 2.4238, + "theoretical_loss": 3.299934809806636, + "tokens_seen": 3258079232 + }, + { + "epoch": 10.09, + "learning_rate": 6.579739217652959e-06, + "loss": 1.9844, + "theoretical_loss": 3.299929791396256, + "tokens_seen": 3258144768 + }, + { + "epoch": 10.09, + "learning_rate": 6.569709127382146e-06, + "loss": 2.3619, + "theoretical_loss": 3.299924773115082, + "tokens_seen": 3258210304 + }, + { + "epoch": 10.09, + "learning_rate": 6.559679037111334e-06, + "loss": 2.4427, + "theoretical_loss": 3.2999197549631067, + "tokens_seen": 3258275840 + }, + { + "epoch": 10.09, + "learning_rate": 6.549648946840522e-06, + "loss": 2.1852, + "theoretical_loss": 3.299914736940325, + "tokens_seen": 3258341376 + }, + { + "epoch": 10.09, + "learning_rate": 6.539618856569709e-06, + "loss": 2.4006, + "theoretical_loss": 3.299909719046731, + "tokens_seen": 3258406912 + }, + { + "epoch": 10.09, + "learning_rate": 6.529588766298897e-06, + "loss": 2.1723, + "theoretical_loss": 3.2999047012823186, + "tokens_seen": 3258472448 + }, + { + "epoch": 10.09, + "learning_rate": 6.519558676028084e-06, + "loss": 2.2768, + "theoretical_loss": 3.2998996836470815, + "tokens_seen": 3258537984 + }, + { + "epoch": 10.09, + "learning_rate": 6.509528585757272e-06, + "loss": 2.2388, + "theoretical_loss": 3.299894666141014, + "tokens_seen": 3258603520 + }, + { + "epoch": 10.09, + "learning_rate": 6.49949849548646e-06, + "loss": 2.461, + "theoretical_loss": 3.2998896487641103, + "tokens_seen": 3258669056 + }, + { + "epoch": 10.09, + "learning_rate": 6.489468405215647e-06, + "loss": 2.365, + "theoretical_loss": 3.2998846315163646, + "tokens_seen": 3258734592 + }, + { + "epoch": 10.09, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.6884894371032715, + "objective/train/theoretical_loss": 3.2998821229409243, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.2998821229409243, + "tokens_seen": 3258767360 + }, + { + "epoch": 10.09, + "learning_rate": 6.479438314944835e-06, + "loss": 2.082, + "theoretical_loss": 3.2998796143977707, + "tokens_seen": 3258800128 + }, + { + "epoch": 10.09, + "learning_rate": 6.469408224674022e-06, + "loss": 2.2142, + "theoretical_loss": 3.2998745974083232, + "tokens_seen": 3258865664 + }, + { + "epoch": 10.09, + "learning_rate": 6.45937813440321e-06, + "loss": 2.4685, + "theoretical_loss": 3.299869580548015, + "tokens_seen": 3258931200 + }, + { + "epoch": 10.09, + "learning_rate": 6.449348044132398e-06, + "loss": 2.2587, + "theoretical_loss": 3.2998645638168416, + "tokens_seen": 3258996736 + }, + { + "epoch": 10.09, + "learning_rate": 6.4393179538615855e-06, + "loss": 2.3659, + "theoretical_loss": 3.2998595472147962, + "tokens_seen": 3259062272 + }, + { + "epoch": 10.09, + "learning_rate": 6.429287863590773e-06, + "loss": 2.4425, + "theoretical_loss": 3.2998545307418734, + "tokens_seen": 3259127808 + }, + { + "epoch": 10.09, + "learning_rate": 6.41925777331996e-06, + "loss": 2.5747, + "theoretical_loss": 3.2998495143980664, + "tokens_seen": 3259193344 + }, + { + "epoch": 10.09, + "learning_rate": 6.4092276830491475e-06, + "loss": 2.2488, + "theoretical_loss": 3.2998444981833703, + "tokens_seen": 3259258880 + }, + { + "epoch": 10.09, + "learning_rate": 6.3991975927783346e-06, + "loss": 2.5305, + "theoretical_loss": 3.299839482097779, + "tokens_seen": 3259324416 + }, + { + "epoch": 10.09, + "learning_rate": 6.389167502507523e-06, + "loss": 2.1856, + "theoretical_loss": 3.2998344661412857, + "tokens_seen": 3259389952 + }, + { + "epoch": 10.09, + "learning_rate": 6.37913741223671e-06, + "loss": 2.3818, + "theoretical_loss": 3.2998294503138856, + "tokens_seen": 3259455488 + }, + { + "epoch": 10.09, + "learning_rate": 6.369107321965898e-06, + "loss": 2.3368, + "theoretical_loss": 3.299824434615572, + "tokens_seen": 3259521024 + }, + { + "epoch": 10.09, + "learning_rate": 6.359077231695085e-06, + "loss": 2.4652, + "theoretical_loss": 3.2998194190463397, + "tokens_seen": 3259586560 + }, + { + "epoch": 10.09, + "learning_rate": 6.349047141424272e-06, + "loss": 2.1812, + "theoretical_loss": 3.299814403606182, + "tokens_seen": 3259652096 + }, + { + "epoch": 10.09, + "learning_rate": 6.33901705115346e-06, + "loss": 2.3983, + "theoretical_loss": 3.2998093882950936, + "tokens_seen": 3259717632 + }, + { + "epoch": 10.09, + "learning_rate": 6.328986960882648e-06, + "loss": 2.1491, + "theoretical_loss": 3.299804373113068, + "tokens_seen": 3259783168 + }, + { + "epoch": 10.09, + "learning_rate": 6.318956870611836e-06, + "loss": 2.3731, + "theoretical_loss": 3.2997993580601, + "tokens_seen": 3259848704 + }, + { + "epoch": 10.09, + "learning_rate": 6.308926780341023e-06, + "loss": 2.3765, + "theoretical_loss": 3.299794343136183, + "tokens_seen": 3259914240 + }, + { + "epoch": 10.09, + "learning_rate": 6.298896690070211e-06, + "loss": 2.2094, + "theoretical_loss": 3.299789328341311, + "tokens_seen": 3259979776 + }, + { + "epoch": 10.09, + "learning_rate": 6.288866599799398e-06, + "loss": 2.2629, + "theoretical_loss": 3.299784313675479, + "tokens_seen": 3260045312 + }, + { + "epoch": 10.09, + "learning_rate": 6.278836509528586e-06, + "loss": 2.4638, + "theoretical_loss": 3.299779299138681, + "tokens_seen": 3260110848 + }, + { + "epoch": 10.09, + "learning_rate": 6.268806419257774e-06, + "loss": 2.3951, + "theoretical_loss": 3.29977428473091, + "tokens_seen": 3260176384 + }, + { + "epoch": 10.09, + "learning_rate": 6.258776328986961e-06, + "loss": 2.4094, + "theoretical_loss": 3.2997692704521606, + "tokens_seen": 3260241920 + }, + { + "epoch": 10.09, + "learning_rate": 6.248746238716149e-06, + "loss": 2.3195, + "theoretical_loss": 3.2997642563024274, + "tokens_seen": 3260307456 + }, + { + "epoch": 10.09, + "learning_rate": 6.238716148445336e-06, + "loss": 2.4157, + "theoretical_loss": 3.299759242281704, + "tokens_seen": 3260372992 + }, + { + "epoch": 10.09, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.263901710510254, + "objective/train/theoretical_loss": 3.2997567353197192, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.2997567353197192, + "tokens_seen": 3260405760 + }, + { + "epoch": 10.09, + "learning_rate": 6.228686058174524e-06, + "loss": 2.301, + "theoretical_loss": 3.299754228389985, + "tokens_seen": 3260438528 + }, + { + "epoch": 10.09, + "learning_rate": 6.218655967903712e-06, + "loss": 2.2245, + "theoretical_loss": 3.2997492146272633, + "tokens_seen": 3260504064 + }, + { + "epoch": 10.09, + "learning_rate": 6.208625877632899e-06, + "loss": 2.4743, + "theoretical_loss": 3.2997442009935343, + "tokens_seen": 3260569600 + }, + { + "epoch": 10.09, + "learning_rate": 6.1985957873620866e-06, + "loss": 2.3184, + "theoretical_loss": 3.2997391874887914, + "tokens_seen": 3260635136 + }, + { + "epoch": 10.09, + "learning_rate": 6.188565697091274e-06, + "loss": 2.3688, + "theoretical_loss": 3.299734174113029, + "tokens_seen": 3260700672 + }, + { + "epoch": 10.09, + "learning_rate": 6.1785356068204615e-06, + "loss": 2.0821, + "theoretical_loss": 3.2997291608662405, + "tokens_seen": 3260766208 + }, + { + "epoch": 10.09, + "learning_rate": 6.168505516549649e-06, + "loss": 2.1268, + "theoretical_loss": 3.299724147748421, + "tokens_seen": 3260831744 + }, + { + "epoch": 10.09, + "learning_rate": 6.158475426278837e-06, + "loss": 2.2444, + "theoretical_loss": 3.299719134759564, + "tokens_seen": 3260897280 + }, + { + "epoch": 10.09, + "learning_rate": 6.148445336008024e-06, + "loss": 2.4195, + "theoretical_loss": 3.299714121899664, + "tokens_seen": 3260962816 + }, + { + "epoch": 10.09, + "learning_rate": 6.138415245737211e-06, + "loss": 2.3387, + "theoretical_loss": 3.2997091091687145, + "tokens_seen": 3261028352 + }, + { + "epoch": 10.09, + "learning_rate": 6.128385155466399e-06, + "loss": 2.4032, + "theoretical_loss": 3.29970409656671, + "tokens_seen": 3261093888 + }, + { + "epoch": 10.09, + "learning_rate": 6.118355065195586e-06, + "loss": 2.2337, + "theoretical_loss": 3.2996990840936444, + "tokens_seen": 3261159424 + }, + { + "epoch": 10.09, + "learning_rate": 6.108324974924775e-06, + "loss": 2.4131, + "theoretical_loss": 3.299694071749512, + "tokens_seen": 3261224960 + }, + { + "epoch": 10.09, + "learning_rate": 6.098294884653962e-06, + "loss": 2.35, + "theoretical_loss": 3.2996890595343062, + "tokens_seen": 3261290496 + }, + { + "epoch": 10.09, + "learning_rate": 6.088264794383149e-06, + "loss": 2.4108, + "theoretical_loss": 3.2996840474480225, + "tokens_seen": 3261356032 + }, + { + "epoch": 10.09, + "learning_rate": 6.078234704112337e-06, + "loss": 2.378, + "theoretical_loss": 3.2996790354906538, + "tokens_seen": 3261421568 + }, + { + "epoch": 10.09, + "learning_rate": 6.068204613841524e-06, + "loss": 2.3651, + "theoretical_loss": 3.2996740236621944, + "tokens_seen": 3261487104 + }, + { + "epoch": 10.09, + "learning_rate": 6.058174523570712e-06, + "loss": 2.387, + "theoretical_loss": 3.2996690119626386, + "tokens_seen": 3261552640 + }, + { + "epoch": 10.09, + "learning_rate": 6.0481444332999e-06, + "loss": 2.3106, + "theoretical_loss": 3.2996640003919806, + "tokens_seen": 3261618176 + }, + { + "epoch": 10.09, + "learning_rate": 6.038114343029088e-06, + "loss": 2.4304, + "theoretical_loss": 3.299658988950214, + "tokens_seen": 3261683712 + }, + { + "epoch": 10.09, + "learning_rate": 6.028084252758275e-06, + "loss": 2.2286, + "theoretical_loss": 3.2996539776373335, + "tokens_seen": 3261749248 + }, + { + "epoch": 10.09, + "learning_rate": 6.018054162487462e-06, + "loss": 2.2978, + "theoretical_loss": 3.299648966453333, + "tokens_seen": 3261814784 + }, + { + "epoch": 10.09, + "learning_rate": 6.00802407221665e-06, + "loss": 2.5202, + "theoretical_loss": 3.2996439553982064, + "tokens_seen": 3261880320 + }, + { + "epoch": 10.09, + "learning_rate": 5.997993981945838e-06, + "loss": 2.4541, + "theoretical_loss": 3.2996389444719476, + "tokens_seen": 3261945856 + }, + { + "epoch": 10.09, + "learning_rate": 5.987963891675026e-06, + "loss": 2.3883, + "theoretical_loss": 3.2996339336745515, + "tokens_seen": 3262011392 + }, + { + "epoch": 10.09, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.7806648015975952, + "objective/train/theoretical_loss": 3.299631428324175, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.299631428324175, + "tokens_seen": 3262044160 + }, + { + "epoch": 10.09, + "learning_rate": 5.977933801404213e-06, + "loss": 2.2941, + "theoretical_loss": 3.2996289230060114, + "tokens_seen": 3262076928 + }, + { + "epoch": 10.09, + "learning_rate": 5.9679037111334006e-06, + "loss": 2.2937, + "theoretical_loss": 3.299623912466322, + "tokens_seen": 3262142464 + }, + { + "epoch": 10.09, + "learning_rate": 5.957873620862588e-06, + "loss": 2.3168, + "theoretical_loss": 3.2996189020554767, + "tokens_seen": 3262208000 + }, + { + "epoch": 10.09, + "learning_rate": 5.947843530591775e-06, + "loss": 2.2576, + "theoretical_loss": 3.2996138917734705, + "tokens_seen": 3262273536 + }, + { + "epoch": 10.09, + "learning_rate": 5.937813440320963e-06, + "loss": 2.1675, + "theoretical_loss": 3.2996088816202964, + "tokens_seen": 3262339072 + }, + { + "epoch": 10.09, + "learning_rate": 5.9277833500501505e-06, + "loss": 2.4018, + "theoretical_loss": 3.2996038715959495, + "tokens_seen": 3262404608 + }, + { + "epoch": 10.09, + "learning_rate": 5.917753259779338e-06, + "loss": 2.2704, + "theoretical_loss": 3.2995988617004235, + "tokens_seen": 3262470144 + }, + { + "epoch": 10.09, + "learning_rate": 5.907723169508525e-06, + "loss": 2.3716, + "theoretical_loss": 3.2995938519337127, + "tokens_seen": 3262535680 + }, + { + "epoch": 10.09, + "learning_rate": 5.897693079237713e-06, + "loss": 2.3421, + "theoretical_loss": 3.2995888422958104, + "tokens_seen": 3262601216 + }, + { + "epoch": 10.09, + "learning_rate": 5.887662988966901e-06, + "loss": 2.2199, + "theoretical_loss": 3.2995838327867117, + "tokens_seen": 3262666752 + }, + { + "epoch": 10.09, + "learning_rate": 5.877632898696088e-06, + "loss": 2.3595, + "theoretical_loss": 3.29957882340641, + "tokens_seen": 3262732288 + }, + { + "epoch": 10.09, + "learning_rate": 5.867602808425276e-06, + "loss": 2.4461, + "theoretical_loss": 3.2995738141549005, + "tokens_seen": 3262797824 + }, + { + "epoch": 10.09, + "learning_rate": 5.857572718154463e-06, + "loss": 2.5694, + "theoretical_loss": 3.299568805032176, + "tokens_seen": 3262863360 + }, + { + "epoch": 10.09, + "learning_rate": 5.847542627883651e-06, + "loss": 2.2311, + "theoretical_loss": 3.299563796038231, + "tokens_seen": 3262928896 + }, + { + "epoch": 10.09, + "learning_rate": 5.837512537612838e-06, + "loss": 2.49, + "theoretical_loss": 3.2995587871730594, + "tokens_seen": 3262994432 + }, + { + "epoch": 10.09, + "learning_rate": 5.827482447342027e-06, + "loss": 2.2305, + "theoretical_loss": 3.299553778436656, + "tokens_seen": 3263059968 + }, + { + "epoch": 10.09, + "learning_rate": 5.817452357071214e-06, + "loss": 2.1957, + "theoretical_loss": 3.2995487698290145, + "tokens_seen": 3263125504 + }, + { + "epoch": 10.09, + "learning_rate": 5.807422266800401e-06, + "loss": 2.2546, + "theoretical_loss": 3.299543761350129, + "tokens_seen": 3263191040 + }, + { + "epoch": 10.09, + "learning_rate": 5.797392176529589e-06, + "loss": 2.241, + "theoretical_loss": 3.299538752999994, + "tokens_seen": 3263256576 + }, + { + "epoch": 10.09, + "learning_rate": 5.787362086258776e-06, + "loss": 2.4576, + "theoretical_loss": 3.2995337447786026, + "tokens_seen": 3263322112 + }, + { + "epoch": 10.09, + "learning_rate": 5.777331995987964e-06, + "loss": 2.3383, + "theoretical_loss": 3.29952873668595, + "tokens_seen": 3263387648 + }, + { + "epoch": 10.09, + "learning_rate": 5.767301905717152e-06, + "loss": 2.0957, + "theoretical_loss": 3.2995237287220296, + "tokens_seen": 3263453184 + }, + { + "epoch": 10.09, + "learning_rate": 5.75727181544634e-06, + "loss": 2.2192, + "theoretical_loss": 3.2995187208868355, + "tokens_seen": 3263518720 + }, + { + "epoch": 10.09, + "learning_rate": 5.747241725175527e-06, + "loss": 2.2948, + "theoretical_loss": 3.299513713180363, + "tokens_seen": 3263584256 + }, + { + "epoch": 10.09, + "learning_rate": 5.737211634904714e-06, + "loss": 2.0747, + "theoretical_loss": 3.2995087056026042, + "tokens_seen": 3263649792 + }, + { + "epoch": 10.09, + "objective/train/docs_used": 3526666, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0052497386932373, + "objective/train/theoretical_loss": 3.299506201861991, + "objective/train/tokens_used": 3264366048, + "theoretical_loss": 3.299506201861991, + "tokens_seen": 3263682560 + }, + { + "epoch": 10.09, + "learning_rate": 5.727181544633902e-06, + "loss": 2.429, + "theoretical_loss": 3.299503698153555, + "tokens_seen": 3263715328 + }, + { + "epoch": 10.09, + "learning_rate": 5.7171514543630895e-06, + "loss": 2.3227, + "theoretical_loss": 3.2994986908332082, + "tokens_seen": 3263780864 + }, + { + "epoch": 10.09, + "learning_rate": 5.707121364092277e-06, + "loss": 2.4387, + "theoretical_loss": 3.299493683641559, + "tokens_seen": 3263846400 + }, + { + "epoch": 10.09, + "learning_rate": 5.6970912738214645e-06, + "loss": 2.438, + "theoretical_loss": 3.2994886765786005, + "tokens_seen": 3263911936 + }, + { + "epoch": 10.09, + "learning_rate": 5.687061183550652e-06, + "loss": 2.3502, + "theoretical_loss": 3.299483669644328, + "tokens_seen": 3263977472 + }, + { + "epoch": 10.09, + "learning_rate": 5.677031093279839e-06, + "loss": 2.3993, + "theoretical_loss": 3.2994786628387343, + "tokens_seen": 3264043008 + }, + { + "epoch": 10.09, + "learning_rate": 5.6670010030090265e-06, + "loss": 2.4858, + "theoretical_loss": 3.299473656161814, + "tokens_seen": 3264108544 + }, + { + "epoch": 10.09, + "learning_rate": 5.656970912738215e-06, + "loss": 2.2211, + "theoretical_loss": 3.2994686496135617, + "tokens_seen": 3264174080 + }, + { + "epoch": 10.09, + "learning_rate": 5.646940822467402e-06, + "loss": 2.092, + "theoretical_loss": 3.2994636431939712, + "tokens_seen": 3264239616 + }, + { + "epoch": 10.09, + "learning_rate": 5.63691073219659e-06, + "loss": 2.3975, + "theoretical_loss": 3.299458636903036, + "tokens_seen": 3264305152 + }, + { + "epoch": 10.09, + "learning_rate": 5.626880641925777e-06, + "loss": 2.3597, + "theoretical_loss": 3.2994536307407514, + "tokens_seen": 3264370688 + }, + { + "epoch": 11.0, + "learning_rate": 5.616850551654965e-06, + "loss": 3.1098, + "theoretical_loss": 3.299447451436584, + "tokens_seen": 3264451584 + }, + { + "epoch": 11.0, + "learning_rate": 5.606820461384152e-06, + "loss": 2.5032, + "theoretical_loss": 3.29944244556173, + "tokens_seen": 3264517120 + }, + { + "epoch": 11.0, + "learning_rate": 5.59679037111334e-06, + "loss": 2.4643, + "theoretical_loss": 3.2994374398155073, + "tokens_seen": 3264582656 + }, + { + "epoch": 11.0, + "learning_rate": 5.586760280842528e-06, + "loss": 2.3444, + "theoretical_loss": 3.2994324341979095, + "tokens_seen": 3264648192 + }, + { + "epoch": 11.0, + "learning_rate": 5.576730190571715e-06, + "loss": 2.5267, + "theoretical_loss": 3.2994274287089307, + "tokens_seen": 3264713728 + }, + { + "epoch": 11.0, + "learning_rate": 5.566700100300903e-06, + "loss": 2.4141, + "theoretical_loss": 3.299422423348565, + "tokens_seen": 3264779264 + }, + { + "epoch": 11.0, + "learning_rate": 5.55667001003009e-06, + "loss": 2.393, + "theoretical_loss": 3.299417418116807, + "tokens_seen": 3264844800 + }, + { + "epoch": 11.0, + "learning_rate": 5.546639919759279e-06, + "loss": 2.4797, + "theoretical_loss": 3.29941241301365, + "tokens_seen": 3264910336 + }, + { + "epoch": 11.0, + "learning_rate": 5.536609829488466e-06, + "loss": 2.4218, + "theoretical_loss": 3.299407408039089, + "tokens_seen": 3264975872 + }, + { + "epoch": 11.0, + "learning_rate": 5.526579739217653e-06, + "loss": 2.365, + "theoretical_loss": 3.2994024031931177, + "tokens_seen": 3265041408 + }, + { + "epoch": 11.0, + "learning_rate": 5.516549648946841e-06, + "loss": 2.5463, + "theoretical_loss": 3.29939739847573, + "tokens_seen": 3265106944 + }, + { + "epoch": 11.0, + "learning_rate": 5.506519558676028e-06, + "loss": 2.6017, + "theoretical_loss": 3.2993923938869205, + "tokens_seen": 3265172480 + }, + { + "epoch": 11.0, + "learning_rate": 5.496489468405216e-06, + "loss": 2.517, + "theoretical_loss": 3.299387389426683, + "tokens_seen": 3265238016 + }, + { + "epoch": 11.0, + "learning_rate": 5.4864593781344035e-06, + "loss": 2.5684, + "theoretical_loss": 3.2993823850950115, + "tokens_seen": 3265303552 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 3591475, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6314191818237305, + "objective/train/theoretical_loss": 3.2993811340321813, + "objective/train/tokens_used": 3285779936, + "theoretical_loss": 3.2993811340321813, + "tokens_seen": 3265319936 + }, + { + "epoch": 11.0, + "learning_rate": 5.476429287863591e-06, + "loss": 2.4718, + "theoretical_loss": 3.2993773808919005, + "tokens_seen": 3265369088 + }, + { + "epoch": 11.0, + "learning_rate": 5.4663991975927785e-06, + "loss": 2.4559, + "theoretical_loss": 3.299372376817344, + "tokens_seen": 3265434624 + }, + { + "epoch": 11.0, + "learning_rate": 5.4563691073219655e-06, + "loss": 2.2091, + "theoretical_loss": 3.2993673728713357, + "tokens_seen": 3265500160 + }, + { + "epoch": 11.0, + "learning_rate": 5.446339017051153e-06, + "loss": 2.4434, + "theoretical_loss": 3.29936236905387, + "tokens_seen": 3265565696 + }, + { + "epoch": 11.0, + "learning_rate": 5.436308926780341e-06, + "loss": 2.4194, + "theoretical_loss": 3.2993573653649415, + "tokens_seen": 3265631232 + }, + { + "epoch": 11.0, + "learning_rate": 5.426278836509529e-06, + "loss": 2.3383, + "theoretical_loss": 3.2993523618045435, + "tokens_seen": 3265696768 + }, + { + "epoch": 11.0, + "learning_rate": 5.416248746238716e-06, + "loss": 2.5918, + "theoretical_loss": 3.2993473583726707, + "tokens_seen": 3265762304 + }, + { + "epoch": 11.0, + "learning_rate": 5.406218655967904e-06, + "loss": 2.4888, + "theoretical_loss": 3.299342355069317, + "tokens_seen": 3265827840 + }, + { + "epoch": 11.0, + "learning_rate": 5.396188565697091e-06, + "loss": 2.3451, + "theoretical_loss": 3.2993373518944766, + "tokens_seen": 3265893376 + }, + { + "epoch": 11.0, + "learning_rate": 5.386158475426278e-06, + "loss": 2.4314, + "theoretical_loss": 3.299332348848143, + "tokens_seen": 3265958912 + }, + { + "epoch": 11.0, + "learning_rate": 5.376128385155467e-06, + "loss": 2.4862, + "theoretical_loss": 3.299327345930312, + "tokens_seen": 3266024448 + }, + { + "epoch": 11.0, + "learning_rate": 5.366098294884654e-06, + "loss": 2.4318, + "theoretical_loss": 3.2993223431409753, + "tokens_seen": 3266089984 + }, + { + "epoch": 11.0, + "learning_rate": 5.356068204613842e-06, + "loss": 2.4195, + "theoretical_loss": 3.2993173404801293, + "tokens_seen": 3266155520 + }, + { + "epoch": 11.0, + "learning_rate": 5.346038114343029e-06, + "loss": 2.5173, + "theoretical_loss": 3.2993123379477667, + "tokens_seen": 3266221056 + }, + { + "epoch": 11.0, + "learning_rate": 5.336008024072217e-06, + "loss": 2.5219, + "theoretical_loss": 3.2993073355438822, + "tokens_seen": 3266286592 + }, + { + "epoch": 11.0, + "learning_rate": 5.325977933801404e-06, + "loss": 2.4155, + "theoretical_loss": 3.2993023332684697, + "tokens_seen": 3266352128 + }, + { + "epoch": 11.0, + "learning_rate": 5.315947843530592e-06, + "loss": 2.4, + "theoretical_loss": 3.2992973311215232, + "tokens_seen": 3266417664 + }, + { + "epoch": 11.0, + "learning_rate": 5.30591775325978e-06, + "loss": 2.4109, + "theoretical_loss": 3.2992923291030376, + "tokens_seen": 3266483200 + }, + { + "epoch": 11.0, + "learning_rate": 5.295887662988967e-06, + "loss": 2.4364, + "theoretical_loss": 3.299287327213006, + "tokens_seen": 3266548736 + }, + { + "epoch": 11.0, + "learning_rate": 5.285857572718155e-06, + "loss": 2.4437, + "theoretical_loss": 3.299282325451423, + "tokens_seen": 3266614272 + }, + { + "epoch": 11.0, + "learning_rate": 5.275827482447342e-06, + "loss": 2.4432, + "theoretical_loss": 3.2992773238182833, + "tokens_seen": 3266679808 + }, + { + "epoch": 11.0, + "learning_rate": 5.2657973921765305e-06, + "loss": 2.4985, + "theoretical_loss": 3.2992723223135796, + "tokens_seen": 3266745344 + }, + { + "epoch": 11.0, + "learning_rate": 5.2557673019057175e-06, + "loss": 2.5578, + "theoretical_loss": 3.2992673209373073, + "tokens_seen": 3266810880 + }, + { + "epoch": 11.0, + "learning_rate": 5.2457372116349046e-06, + "loss": 2.411, + "theoretical_loss": 3.2992623196894604, + "tokens_seen": 3266876416 + }, + { + "epoch": 11.0, + "learning_rate": 5.2357071213640925e-06, + "loss": 2.4944, + "theoretical_loss": 3.299257318570032, + "tokens_seen": 3266941952 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 3596481, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4639172554016113, + "objective/train/theoretical_loss": 3.29925606831024, + "objective/train/tokens_used": 3287418336, + "theoretical_loss": 3.29925606831024, + "tokens_seen": 3266958336 + }, + { + "epoch": 11.0, + "learning_rate": 5.2256770310932795e-06, + "loss": 2.4733, + "theoretical_loss": 3.2992523175790174, + "tokens_seen": 3267007488 + }, + { + "epoch": 11.0, + "learning_rate": 5.215646940822467e-06, + "loss": 2.5067, + "theoretical_loss": 3.2992473167164102, + "tokens_seen": 3267073024 + }, + { + "epoch": 11.0, + "learning_rate": 5.205616850551655e-06, + "loss": 2.618, + "theoretical_loss": 3.2992423159822044, + "tokens_seen": 3267138560 + }, + { + "epoch": 11.0, + "learning_rate": 5.195586760280843e-06, + "loss": 2.5821, + "theoretical_loss": 3.2992373153763945, + "tokens_seen": 3267204096 + }, + { + "epoch": 11.0, + "learning_rate": 5.18555667001003e-06, + "loss": 2.3902, + "theoretical_loss": 3.2992323148989744, + "tokens_seen": 3267269632 + }, + { + "epoch": 11.0, + "learning_rate": 5.175526579739217e-06, + "loss": 2.4721, + "theoretical_loss": 3.2992273145499382, + "tokens_seen": 3267335168 + }, + { + "epoch": 11.0, + "learning_rate": 5.165496489468405e-06, + "loss": 2.4377, + "theoretical_loss": 3.2992223143292803, + "tokens_seen": 3267400704 + }, + { + "epoch": 11.0, + "learning_rate": 5.155466399197592e-06, + "loss": 2.3335, + "theoretical_loss": 3.2992173142369943, + "tokens_seen": 3267466240 + }, + { + "epoch": 11.0, + "learning_rate": 5.145436308926781e-06, + "loss": 2.5567, + "theoretical_loss": 3.299212314273075, + "tokens_seen": 3267531776 + }, + { + "epoch": 11.0, + "learning_rate": 5.135406218655968e-06, + "loss": 2.4808, + "theoretical_loss": 3.299207314437516, + "tokens_seen": 3267597312 + }, + { + "epoch": 11.0, + "learning_rate": 5.125376128385156e-06, + "loss": 2.4445, + "theoretical_loss": 3.299202314730312, + "tokens_seen": 3267662848 + }, + { + "epoch": 11.0, + "learning_rate": 5.115346038114343e-06, + "loss": 2.4246, + "theoretical_loss": 3.2991973151514564, + "tokens_seen": 3267728384 + }, + { + "epoch": 11.0, + "learning_rate": 5.10531594784353e-06, + "loss": 2.4697, + "theoretical_loss": 3.2991923157009437, + "tokens_seen": 3267793920 + }, + { + "epoch": 11.0, + "learning_rate": 5.095285857572719e-06, + "loss": 2.5422, + "theoretical_loss": 3.299187316378768, + "tokens_seen": 3267859456 + }, + { + "epoch": 11.0, + "learning_rate": 5.085255767301906e-06, + "loss": 2.5137, + "theoretical_loss": 3.2991823171849233, + "tokens_seen": 3267924992 + }, + { + "epoch": 11.0, + "learning_rate": 5.075225677031094e-06, + "loss": 2.43, + "theoretical_loss": 3.299177318119404, + "tokens_seen": 3267990528 + }, + { + "epoch": 11.0, + "learning_rate": 5.065195586760281e-06, + "loss": 2.4563, + "theoretical_loss": 3.2991723191822047, + "tokens_seen": 3268056064 + }, + { + "epoch": 11.0, + "learning_rate": 5.055165496489469e-06, + "loss": 2.4667, + "theoretical_loss": 3.2991673203733183, + "tokens_seen": 3268121600 + }, + { + "epoch": 11.0, + "learning_rate": 5.045135406218656e-06, + "loss": 2.4804, + "theoretical_loss": 3.2991623216927395, + "tokens_seen": 3268187136 + }, + { + "epoch": 11.0, + "learning_rate": 5.035105315947844e-06, + "loss": 2.3623, + "theoretical_loss": 3.299157323140463, + "tokens_seen": 3268252672 + }, + { + "epoch": 11.0, + "learning_rate": 5.0250752256770315e-06, + "loss": 2.6065, + "theoretical_loss": 3.299152324716482, + "tokens_seen": 3268318208 + }, + { + "epoch": 11.0, + "learning_rate": 5.0150451354062186e-06, + "loss": 2.5511, + "theoretical_loss": 3.2991473264207913, + "tokens_seen": 3268383744 + }, + { + "epoch": 11.0, + "learning_rate": 5.0050150451354065e-06, + "loss": 2.4621, + "theoretical_loss": 3.2991423282533847, + "tokens_seen": 3268449280 + }, + { + "epoch": 11.0, + "learning_rate": 4.9949849548645935e-06, + "loss": 2.5593, + "theoretical_loss": 3.299137330214257, + "tokens_seen": 3268514816 + }, + { + "epoch": 11.0, + "learning_rate": 4.984954864593782e-06, + "loss": 2.4488, + "theoretical_loss": 3.299132332303401, + "tokens_seen": 3268580352 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 3601500, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.286137104034424, + "objective/train/theoretical_loss": 3.299131082845729, + "objective/train/tokens_used": 3289056736, + "theoretical_loss": 3.299131082845729, + "tokens_seen": 3268596736 + }, + { + "epoch": 11.0, + "learning_rate": 4.974924774322969e-06, + "loss": 2.3306, + "theoretical_loss": 3.299127334520812, + "tokens_seen": 3268645888 + }, + { + "epoch": 11.0, + "learning_rate": 4.964894684052156e-06, + "loss": 2.4511, + "theoretical_loss": 3.2991223368664837, + "tokens_seen": 3268711424 + }, + { + "epoch": 11.0, + "learning_rate": 4.954864593781344e-06, + "loss": 2.2802, + "theoretical_loss": 3.2991173393404103, + "tokens_seen": 3268776960 + }, + { + "epoch": 11.0, + "learning_rate": 4.944834503510531e-06, + "loss": 2.2864, + "theoretical_loss": 3.299112341942586, + "tokens_seen": 3268842496 + }, + { + "epoch": 11.0, + "learning_rate": 4.934804413239719e-06, + "loss": 2.3014, + "theoretical_loss": 3.2991073446730046, + "tokens_seen": 3268908032 + }, + { + "epoch": 11.0, + "learning_rate": 4.924774322968907e-06, + "loss": 2.5577, + "theoretical_loss": 3.2991023475316608, + "tokens_seen": 3268973568 + }, + { + "epoch": 11.0, + "learning_rate": 4.914744232698095e-06, + "loss": 2.4302, + "theoretical_loss": 3.299097350518548, + "tokens_seen": 3269039104 + }, + { + "epoch": 11.0, + "learning_rate": 4.904714142427282e-06, + "loss": 2.4581, + "theoretical_loss": 3.2990923536336614, + "tokens_seen": 3269104640 + }, + { + "epoch": 11.0, + "learning_rate": 4.894684052156469e-06, + "loss": 2.5152, + "theoretical_loss": 3.2990873568769943, + "tokens_seen": 3269170176 + }, + { + "epoch": 11.0, + "learning_rate": 4.884653961885657e-06, + "loss": 2.3502, + "theoretical_loss": 3.299082360248541, + "tokens_seen": 3269235712 + }, + { + "epoch": 11.0, + "learning_rate": 4.874623871614844e-06, + "loss": 2.4211, + "theoretical_loss": 3.299077363748296, + "tokens_seen": 3269301248 + }, + { + "epoch": 11.0, + "learning_rate": 4.864593781344033e-06, + "loss": 2.5015, + "theoretical_loss": 3.2990723673762528, + "tokens_seen": 3269366784 + }, + { + "epoch": 11.0, + "learning_rate": 4.85456369107322e-06, + "loss": 2.3804, + "theoretical_loss": 3.299067371132406, + "tokens_seen": 3269432320 + }, + { + "epoch": 11.0, + "learning_rate": 4.844533600802408e-06, + "loss": 2.2963, + "theoretical_loss": 3.29906237501675, + "tokens_seen": 3269497856 + }, + { + "epoch": 11.0, + "learning_rate": 4.834503510531595e-06, + "loss": 2.3126, + "theoretical_loss": 3.299057379029278, + "tokens_seen": 3269563392 + }, + { + "epoch": 11.0, + "learning_rate": 4.824473420260782e-06, + "loss": 2.484, + "theoretical_loss": 3.2990523831699847, + "tokens_seen": 3269628928 + }, + { + "epoch": 11.0, + "learning_rate": 4.8144433299899706e-06, + "loss": 2.2849, + "theoretical_loss": 3.299047387438865, + "tokens_seen": 3269694464 + }, + { + "epoch": 11.0, + "learning_rate": 4.804413239719158e-06, + "loss": 2.4261, + "theoretical_loss": 3.2990423918359113, + "tokens_seen": 3269760000 + }, + { + "epoch": 11.0, + "learning_rate": 4.7943831494483455e-06, + "loss": 2.4231, + "theoretical_loss": 3.2990373963611193, + "tokens_seen": 3269825536 + }, + { + "epoch": 11.0, + "learning_rate": 4.7843530591775326e-06, + "loss": 2.2328, + "theoretical_loss": 3.2990324010144825, + "tokens_seen": 3269891072 + }, + { + "epoch": 11.0, + "learning_rate": 4.77432296890672e-06, + "loss": 2.3571, + "theoretical_loss": 3.299027405795995, + "tokens_seen": 3269956608 + }, + { + "epoch": 11.0, + "learning_rate": 4.7642928786359075e-06, + "loss": 2.5386, + "theoretical_loss": 3.2990224107056516, + "tokens_seen": 3270022144 + }, + { + "epoch": 11.0, + "learning_rate": 4.754262788365095e-06, + "loss": 2.6018, + "theoretical_loss": 3.299017415743445, + "tokens_seen": 3270087680 + }, + { + "epoch": 11.0, + "learning_rate": 4.744232698094283e-06, + "loss": 2.4587, + "theoretical_loss": 3.2990124209093707, + "tokens_seen": 3270153216 + }, + { + "epoch": 11.0, + "learning_rate": 4.73420260782347e-06, + "loss": 2.3401, + "theoretical_loss": 3.299007426203423, + "tokens_seen": 3270218752 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 3606434, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3323018550872803, + "objective/train/theoretical_loss": 3.299006177546955, + "objective/train/tokens_used": 3290695136, + "theoretical_loss": 3.299006177546955, + "tokens_seen": 3270235136 + }, + { + "epoch": 11.0, + "learning_rate": 4.724172517552658e-06, + "loss": 2.47, + "theoretical_loss": 3.2990024316255946, + "tokens_seen": 3270284288 + }, + { + "epoch": 11.0, + "learning_rate": 4.714142427281845e-06, + "loss": 2.4519, + "theoretical_loss": 3.2989974371758812, + "tokens_seen": 3270349824 + }, + { + "epoch": 11.0, + "learning_rate": 4.704112337011032e-06, + "loss": 2.3797, + "theoretical_loss": 3.2989924428542756, + "tokens_seen": 3270415360 + }, + { + "epoch": 11.0, + "learning_rate": 4.694082246740221e-06, + "loss": 2.3601, + "theoretical_loss": 3.298987448660773, + "tokens_seen": 3270480896 + }, + { + "epoch": 11.0, + "learning_rate": 4.684052156469408e-06, + "loss": 2.4858, + "theoretical_loss": 3.298982454595367, + "tokens_seen": 3270546432 + }, + { + "epoch": 11.0, + "learning_rate": 4.674022066198596e-06, + "loss": 2.3789, + "theoretical_loss": 3.298977460658052, + "tokens_seen": 3270611968 + }, + { + "epoch": 11.0, + "learning_rate": 4.663991975927783e-06, + "loss": 2.3275, + "theoretical_loss": 3.298972466848822, + "tokens_seen": 3270677504 + }, + { + "epoch": 11.0, + "learning_rate": 4.653961885656971e-06, + "loss": 2.5811, + "theoretical_loss": 3.298967473167671, + "tokens_seen": 3270743040 + }, + { + "epoch": 11.0, + "learning_rate": 4.643931795386159e-06, + "loss": 2.5301, + "theoretical_loss": 3.2989624796145938, + "tokens_seen": 3270808576 + }, + { + "epoch": 11.0, + "learning_rate": 4.633901705115346e-06, + "loss": 2.5429, + "theoretical_loss": 3.2989574861895834, + "tokens_seen": 3270874112 + }, + { + "epoch": 11.0, + "learning_rate": 4.623871614844534e-06, + "loss": 2.4735, + "theoretical_loss": 3.298952492892635, + "tokens_seen": 3270939648 + }, + { + "epoch": 11.0, + "learning_rate": 4.613841524573721e-06, + "loss": 2.4421, + "theoretical_loss": 3.2989474997237425, + "tokens_seen": 3271005184 + }, + { + "epoch": 11.0, + "learning_rate": 4.603811434302909e-06, + "loss": 2.442, + "theoretical_loss": 3.2989425066828995, + "tokens_seen": 3271070720 + }, + { + "epoch": 11.0, + "learning_rate": 4.593781344032096e-06, + "loss": 2.3501, + "theoretical_loss": 3.298937513770101, + "tokens_seen": 3271136256 + }, + { + "epoch": 11.0, + "learning_rate": 4.5837512537612846e-06, + "loss": 2.4971, + "theoretical_loss": 3.298932520985341, + "tokens_seen": 3271201792 + }, + { + "epoch": 11.0, + "learning_rate": 4.573721163490472e-06, + "loss": 2.4774, + "theoretical_loss": 3.298927528328613, + "tokens_seen": 3271267328 + }, + { + "epoch": 11.0, + "learning_rate": 4.563691073219659e-06, + "loss": 2.5058, + "theoretical_loss": 3.2989225357999112, + "tokens_seen": 3271332864 + }, + { + "epoch": 11.0, + "learning_rate": 4.5536609829488466e-06, + "loss": 2.508, + "theoretical_loss": 3.2989175433992304, + "tokens_seen": 3271398400 + }, + { + "epoch": 11.0, + "learning_rate": 4.543630892678034e-06, + "loss": 2.5401, + "theoretical_loss": 3.2989125511265645, + "tokens_seen": 3271463936 + }, + { + "epoch": 11.0, + "learning_rate": 4.533600802407222e-06, + "loss": 2.3517, + "theoretical_loss": 3.2989075589819077, + "tokens_seen": 3271529472 + }, + { + "epoch": 11.0, + "learning_rate": 4.523570712136409e-06, + "loss": 2.6441, + "theoretical_loss": 3.298902566965254, + "tokens_seen": 3271595008 + }, + { + "epoch": 11.0, + "learning_rate": 4.513540621865597e-06, + "loss": 2.5686, + "theoretical_loss": 3.298897575076597, + "tokens_seen": 3271660544 + }, + { + "epoch": 11.0, + "learning_rate": 4.503510531594784e-06, + "loss": 2.6267, + "theoretical_loss": 3.2988925833159324, + "tokens_seen": 3271726080 + }, + { + "epoch": 11.0, + "learning_rate": 4.493480441323971e-06, + "loss": 2.5366, + "theoretical_loss": 3.2988875916832527, + "tokens_seen": 3271791616 + }, + { + "epoch": 11.0, + "learning_rate": 4.483450351053159e-06, + "loss": 2.4028, + "theoretical_loss": 3.2988826001785534, + "tokens_seen": 3271857152 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 3611442, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.456303596496582, + "objective/train/theoretical_loss": 3.2988813523223746, + "objective/train/tokens_used": 3292333536, + "theoretical_loss": 3.2988813523223746, + "tokens_seen": 3271873536 + }, + { + "epoch": 11.0, + "learning_rate": 4.473420260782347e-06, + "loss": 2.4051, + "theoretical_loss": 3.2988776088018277, + "tokens_seen": 3271922688 + }, + { + "epoch": 11.0, + "learning_rate": 4.463390170511535e-06, + "loss": 2.2281, + "theoretical_loss": 3.29887261755307, + "tokens_seen": 3271988224 + }, + { + "epoch": 11.0, + "learning_rate": 4.453360080240722e-06, + "loss": 2.4365, + "theoretical_loss": 3.2988676264322745, + "tokens_seen": 3272053760 + }, + { + "epoch": 11.0, + "learning_rate": 4.44332998996991e-06, + "loss": 2.4543, + "theoretical_loss": 3.2988626354394355, + "tokens_seen": 3272119296 + }, + { + "epoch": 11.0, + "learning_rate": 4.433299899699097e-06, + "loss": 2.3826, + "theoretical_loss": 3.298857644574547, + "tokens_seen": 3272184832 + }, + { + "epoch": 11.0, + "learning_rate": 4.423269809428284e-06, + "loss": 2.5648, + "theoretical_loss": 3.2988526538376033, + "tokens_seen": 3272250368 + }, + { + "epoch": 11.0, + "learning_rate": 4.413239719157473e-06, + "loss": 2.3753, + "theoretical_loss": 3.2988476632285986, + "tokens_seen": 3272315904 + }, + { + "epoch": 11.0, + "learning_rate": 4.40320962888666e-06, + "loss": 2.3867, + "theoretical_loss": 3.2988426727475266, + "tokens_seen": 3272381440 + }, + { + "epoch": 11.0, + "learning_rate": 4.393179538615848e-06, + "loss": 2.4593, + "theoretical_loss": 3.2988376823943817, + "tokens_seen": 3272446976 + }, + { + "epoch": 11.0, + "learning_rate": 4.383149448345035e-06, + "loss": 2.2919, + "theoretical_loss": 3.2988326921691584, + "tokens_seen": 3272512512 + }, + { + "epoch": 11.0, + "learning_rate": 4.373119358074223e-06, + "loss": 2.3984, + "theoretical_loss": 3.2988277020718506, + "tokens_seen": 3272578048 + }, + { + "epoch": 11.0, + "learning_rate": 4.363089267803411e-06, + "loss": 2.5412, + "theoretical_loss": 3.2988227121024525, + "tokens_seen": 3272643584 + }, + { + "epoch": 11.0, + "learning_rate": 4.353059177532598e-06, + "loss": 2.5808, + "theoretical_loss": 3.2988177222609583, + "tokens_seen": 3272709120 + }, + { + "epoch": 11.0, + "learning_rate": 4.343029087261786e-06, + "loss": 2.3977, + "theoretical_loss": 3.2988127325473617, + "tokens_seen": 3272774656 + }, + { + "epoch": 11.0, + "learning_rate": 4.332998996990973e-06, + "loss": 2.4942, + "theoretical_loss": 3.2988077429616576, + "tokens_seen": 3272840192 + }, + { + "epoch": 11.0, + "learning_rate": 4.3229689067201606e-06, + "loss": 2.4025, + "theoretical_loss": 3.2988027535038396, + "tokens_seen": 3272905728 + }, + { + "epoch": 11.0, + "learning_rate": 4.312938816449348e-06, + "loss": 2.4369, + "theoretical_loss": 3.298797764173902, + "tokens_seen": 3272971264 + }, + { + "epoch": 11.0, + "learning_rate": 4.302908726178536e-06, + "loss": 2.4907, + "theoretical_loss": 3.2987927749718393, + "tokens_seen": 3273036800 + }, + { + "epoch": 11.0, + "learning_rate": 4.292878635907723e-06, + "loss": 2.4404, + "theoretical_loss": 3.2987877858976447, + "tokens_seen": 3273102336 + }, + { + "epoch": 11.0, + "learning_rate": 4.2828485456369105e-06, + "loss": 2.4812, + "theoretical_loss": 3.298782796951314, + "tokens_seen": 3273167872 + }, + { + "epoch": 11.0, + "learning_rate": 4.272818455366098e-06, + "loss": 2.4537, + "theoretical_loss": 3.2987778081328396, + "tokens_seen": 3273233408 + }, + { + "epoch": 11.0, + "learning_rate": 4.262788365095285e-06, + "loss": 2.5571, + "theoretical_loss": 3.298772819442217, + "tokens_seen": 3273298944 + }, + { + "epoch": 11.0, + "learning_rate": 4.252758274824474e-06, + "loss": 2.3515, + "theoretical_loss": 3.2987678308794397, + "tokens_seen": 3273364480 + }, + { + "epoch": 11.0, + "learning_rate": 4.242728184553661e-06, + "loss": 2.5078, + "theoretical_loss": 3.298762842444502, + "tokens_seen": 3273430016 + }, + { + "epoch": 11.0, + "learning_rate": 4.232698094282849e-06, + "loss": 2.5408, + "theoretical_loss": 3.298757854137398, + "tokens_seen": 3273495552 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 3616541, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.521176815032959, + "objective/train/theoretical_loss": 3.298756607080595, + "objective/train/tokens_used": 3293971936, + "theoretical_loss": 3.298756607080595, + "tokens_seen": 3273511936 + }, + { + "epoch": 11.0, + "learning_rate": 4.222668004012036e-06, + "loss": 2.5509, + "theoretical_loss": 3.298752865958122, + "tokens_seen": 3273561088 + }, + { + "epoch": 11.0, + "learning_rate": 4.212637913741223e-06, + "loss": 2.4869, + "theoretical_loss": 3.298747877906668, + "tokens_seen": 3273626624 + }, + { + "epoch": 11.0, + "learning_rate": 4.202607823470411e-06, + "loss": 2.4479, + "theoretical_loss": 3.2987428899830302, + "tokens_seen": 3273692160 + }, + { + "epoch": 11.0, + "learning_rate": 4.192577733199599e-06, + "loss": 2.2684, + "theoretical_loss": 3.298737902187203, + "tokens_seen": 3273757696 + }, + { + "epoch": 11.0, + "learning_rate": 4.182547642928787e-06, + "loss": 2.414, + "theoretical_loss": 3.2987329145191806, + "tokens_seen": 3273823232 + }, + { + "epoch": 11.0, + "learning_rate": 4.172517552657974e-06, + "loss": 2.262, + "theoretical_loss": 3.2987279269789567, + "tokens_seen": 3273888768 + }, + { + "epoch": 11.0, + "learning_rate": 4.162487462387162e-06, + "loss": 2.4161, + "theoretical_loss": 3.2987229395665256, + "tokens_seen": 3273954304 + }, + { + "epoch": 11.0, + "learning_rate": 4.152457372116349e-06, + "loss": 2.4909, + "theoretical_loss": 3.2987179522818817, + "tokens_seen": 3274019840 + }, + { + "epoch": 11.0, + "learning_rate": 4.142427281845536e-06, + "loss": 2.4048, + "theoretical_loss": 3.298712965125019, + "tokens_seen": 3274085376 + }, + { + "epoch": 11.0, + "learning_rate": 4.132397191574725e-06, + "loss": 2.4769, + "theoretical_loss": 3.298707978095932, + "tokens_seen": 3274150912 + }, + { + "epoch": 11.0, + "learning_rate": 4.122367101303912e-06, + "loss": 2.3701, + "theoretical_loss": 3.298702991194615, + "tokens_seen": 3274216448 + }, + { + "epoch": 11.0, + "learning_rate": 4.1123370110331e-06, + "loss": 2.2643, + "theoretical_loss": 3.2986980044210608, + "tokens_seen": 3274281984 + }, + { + "epoch": 11.0, + "learning_rate": 4.102306920762287e-06, + "loss": 2.4611, + "theoretical_loss": 3.298693017775265, + "tokens_seen": 3274347520 + }, + { + "epoch": 11.0, + "learning_rate": 4.0922768304914746e-06, + "loss": 2.3881, + "theoretical_loss": 3.298688031257221, + "tokens_seen": 3274413056 + }, + { + "epoch": 11.0, + "learning_rate": 4.0822467402206625e-06, + "loss": 2.5393, + "theoretical_loss": 3.298683044866924, + "tokens_seen": 3274478592 + }, + { + "epoch": 11.0, + "learning_rate": 4.0722166499498495e-06, + "loss": 2.3838, + "theoretical_loss": 3.298678058604367, + "tokens_seen": 3274544128 + }, + { + "epoch": 11.0, + "learning_rate": 4.062186559679037e-06, + "loss": 2.5224, + "theoretical_loss": 3.2986730724695446, + "tokens_seen": 3274609664 + }, + { + "epoch": 11.0, + "learning_rate": 4.0521564694082245e-06, + "loss": 2.4815, + "theoretical_loss": 3.298668086462451, + "tokens_seen": 3274675200 + }, + { + "epoch": 11.0, + "learning_rate": 4.042126379137412e-06, + "loss": 2.474, + "theoretical_loss": 3.2986631005830804, + "tokens_seen": 3274740736 + }, + { + "epoch": 11.0, + "learning_rate": 4.032096288866599e-06, + "loss": 2.5908, + "theoretical_loss": 3.2986581148314267, + "tokens_seen": 3274806272 + }, + { + "epoch": 11.0, + "learning_rate": 4.022066198595788e-06, + "loss": 2.4917, + "theoretical_loss": 3.2986531292074845, + "tokens_seen": 3274871808 + }, + { + "epoch": 11.0, + "learning_rate": 4.012036108324975e-06, + "loss": 2.4264, + "theoretical_loss": 3.2986481437112483, + "tokens_seen": 3274937344 + }, + { + "epoch": 11.0, + "learning_rate": 4.002006018054162e-06, + "loss": 2.4526, + "theoretical_loss": 3.298643158342711, + "tokens_seen": 3275002880 + }, + { + "epoch": 11.0, + "learning_rate": 3.99197592778335e-06, + "loss": 2.5334, + "theoretical_loss": 3.298638173101868, + "tokens_seen": 3275068416 + }, + { + "epoch": 11.0, + "learning_rate": 3.981945837512537e-06, + "loss": 2.3728, + "theoretical_loss": 3.2986331879887127, + "tokens_seen": 3275133952 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 3621573, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4960453510284424, + "objective/train/theoretical_loss": 3.2986319417303744, + "objective/train/tokens_used": 3295610336, + "theoretical_loss": 3.2986319417303744, + "tokens_seen": 3275150336 + }, + { + "epoch": 11.0, + "learning_rate": 3.971915747241726e-06, + "loss": 2.3263, + "theoretical_loss": 3.29862820300324, + "tokens_seen": 3275199488 + }, + { + "epoch": 11.0, + "learning_rate": 3.961885656970913e-06, + "loss": 2.4182, + "theoretical_loss": 3.298623218145443, + "tokens_seen": 3275265024 + }, + { + "epoch": 11.0, + "learning_rate": 3.951855566700101e-06, + "loss": 2.3684, + "theoretical_loss": 3.298618233415317, + "tokens_seen": 3275330560 + }, + { + "epoch": 11.0, + "learning_rate": 3.941825476429288e-06, + "loss": 2.393, + "theoretical_loss": 3.2986132488128557, + "tokens_seen": 3275396096 + }, + { + "epoch": 11.0, + "learning_rate": 3.931795386158475e-06, + "loss": 2.4312, + "theoretical_loss": 3.2986082643380534, + "tokens_seen": 3275461632 + }, + { + "epoch": 11.0, + "learning_rate": 3.921765295887663e-06, + "loss": 2.3529, + "theoretical_loss": 3.2986032799909037, + "tokens_seen": 3275527168 + }, + { + "epoch": 11.0, + "learning_rate": 3.911735205616851e-06, + "loss": 2.4726, + "theoretical_loss": 3.2985982957714013, + "tokens_seen": 3275592704 + }, + { + "epoch": 11.0, + "learning_rate": 3.901705115346038e-06, + "loss": 2.4463, + "theoretical_loss": 3.2985933116795407, + "tokens_seen": 3275658240 + }, + { + "epoch": 11.0, + "learning_rate": 3.891675025075226e-06, + "loss": 2.3889, + "theoretical_loss": 3.2985883277153154, + "tokens_seen": 3275723776 + }, + { + "epoch": 11.0, + "learning_rate": 3.881644934804414e-06, + "loss": 2.6106, + "theoretical_loss": 3.29858334387872, + "tokens_seen": 3275789312 + }, + { + "epoch": 11.0, + "learning_rate": 3.871614844533601e-06, + "loss": 2.5269, + "theoretical_loss": 3.2985783601697487, + "tokens_seen": 3275854848 + }, + { + "epoch": 11.0, + "learning_rate": 3.8615847542627886e-06, + "loss": 2.4823, + "theoretical_loss": 3.2985733765883953, + "tokens_seen": 3275920384 + }, + { + "epoch": 11.0, + "learning_rate": 3.8515546639919765e-06, + "loss": 2.4155, + "theoretical_loss": 3.298568393134654, + "tokens_seen": 3275985920 + }, + { + "epoch": 11.0, + "learning_rate": 3.8415245737211635e-06, + "loss": 2.1929, + "theoretical_loss": 3.2985634098085197, + "tokens_seen": 3276051456 + }, + { + "epoch": 11.0, + "learning_rate": 3.8314944834503506e-06, + "loss": 2.3596, + "theoretical_loss": 3.298558426609986, + "tokens_seen": 3276116992 + }, + { + "epoch": 11.0, + "learning_rate": 3.8214643931795384e-06, + "loss": 2.2677, + "theoretical_loss": 3.2985534435390464, + "tokens_seen": 3276182528 + }, + { + "epoch": 11.0, + "learning_rate": 3.8114343029087263e-06, + "loss": 2.2368, + "theoretical_loss": 3.2985484605956965, + "tokens_seen": 3276248064 + }, + { + "epoch": 11.0, + "learning_rate": 3.801404212637914e-06, + "loss": 2.3407, + "theoretical_loss": 3.29854347777993, + "tokens_seen": 3276313600 + }, + { + "epoch": 11.0, + "learning_rate": 3.7913741223671013e-06, + "loss": 2.4836, + "theoretical_loss": 3.2985384950917402, + "tokens_seen": 3276379136 + }, + { + "epoch": 11.0, + "learning_rate": 3.781344032096289e-06, + "loss": 2.4791, + "theoretical_loss": 3.2985335125311224, + "tokens_seen": 3276444672 + }, + { + "epoch": 11.0, + "learning_rate": 3.7713139418254767e-06, + "loss": 2.3304, + "theoretical_loss": 3.2985285300980705, + "tokens_seen": 3276510208 + }, + { + "epoch": 11.0, + "learning_rate": 3.7612838515546637e-06, + "loss": 2.4239, + "theoretical_loss": 3.2985235477925783, + "tokens_seen": 3276575744 + }, + { + "epoch": 11.0, + "learning_rate": 3.7512537612838516e-06, + "loss": 2.3848, + "theoretical_loss": 3.29851856561464, + "tokens_seen": 3276641280 + }, + { + "epoch": 11.0, + "learning_rate": 3.741223671013039e-06, + "loss": 2.4655, + "theoretical_loss": 3.29851358356425, + "tokens_seen": 3276706816 + }, + { + "epoch": 11.0, + "learning_rate": 3.731193580742227e-06, + "loss": 2.3564, + "theoretical_loss": 3.298508601641403, + "tokens_seen": 3276772352 + }, + { + "debugging/Self-BLEU-5": 0.7063088359334876, + "debugging/distinct-1-grams": 0.7727579255954624, + "debugging/distinct-2-grams": 0.9571882754371277, + "debugging/entropy-1-grams": 6.693675726442681, + "debugging/entropy-2-grams": 8.224606452622062, + "debugging/length": 449.07462686567163, + "debugging/num_segments": 67, + "debugging/score": 2.627706537733866e-05, + "debugging/score_std": 0.0002134758882866292, + "epoch": 11.0, + "objective/train/docs_used": 3626420, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.813333034515381, + "objective/train/theoretical_loss": 3.298507356180619, + "objective/train/tokens_used": 3297248736, + "theoretical_loss": 3.298507356180619, + "tokens_seen": 3276788736 + }, + { + "epoch": 11.0, + "learning_rate": 3.7211634904714145e-06, + "loss": 2.5, + "theoretical_loss": 3.2985036198460924, + "tokens_seen": 3276837888 + }, + { + "epoch": 11.0, + "learning_rate": 3.711133400200602e-06, + "loss": 2.2413, + "theoretical_loss": 3.298498638178313, + "tokens_seen": 3276903424 + }, + { + "epoch": 11.0, + "learning_rate": 3.7011033099297894e-06, + "loss": 2.5197, + "theoretical_loss": 3.298493656638058, + "tokens_seen": 3276968960 + }, + { + "epoch": 11.0, + "learning_rate": 3.691073219658977e-06, + "loss": 2.4158, + "theoretical_loss": 3.2984886752253226, + "tokens_seen": 3277034496 + }, + { + "epoch": 11.0, + "learning_rate": 3.6810431293881643e-06, + "loss": 2.4642, + "theoretical_loss": 3.2984836939401, + "tokens_seen": 3277100032 + }, + { + "epoch": 11.0, + "learning_rate": 3.6710130391173522e-06, + "loss": 2.3554, + "theoretical_loss": 3.298478712782386, + "tokens_seen": 3277165568 + }, + { + "epoch": 11.0, + "learning_rate": 3.6609829488465397e-06, + "loss": 2.3956, + "theoretical_loss": 3.298473731752173, + "tokens_seen": 3277231104 + }, + { + "epoch": 11.0, + "learning_rate": 3.650952858575727e-06, + "loss": 2.4114, + "theoretical_loss": 3.298468750849456, + "tokens_seen": 3277296640 + }, + { + "epoch": 11.0, + "learning_rate": 3.640922768304915e-06, + "loss": 2.5638, + "theoretical_loss": 3.2984637700742296, + "tokens_seen": 3277362176 + }, + { + "epoch": 11.0, + "learning_rate": 3.630892678034102e-06, + "loss": 2.371, + "theoretical_loss": 3.2984587894264874, + "tokens_seen": 3277427712 + }, + { + "epoch": 11.0, + "learning_rate": 3.6208625877632896e-06, + "loss": 2.3776, + "theoretical_loss": 3.2984538089062236, + "tokens_seen": 3277493248 + }, + { + "epoch": 11.0, + "learning_rate": 3.6108324974924775e-06, + "loss": 2.4767, + "theoretical_loss": 3.2984488285134326, + "tokens_seen": 3277558784 + }, + { + "epoch": 11.0, + "learning_rate": 3.600802407221665e-06, + "loss": 2.3115, + "theoretical_loss": 3.298443848248109, + "tokens_seen": 3277624320 + }, + { + "epoch": 11.0, + "learning_rate": 3.590772316950853e-06, + "loss": 2.2996, + "theoretical_loss": 3.298438868110246, + "tokens_seen": 3277689856 + }, + { + "epoch": 11.0, + "learning_rate": 3.5807422266800403e-06, + "loss": 2.4894, + "theoretical_loss": 3.298433888099838, + "tokens_seen": 3277755392 + }, + { + "epoch": 11.0, + "learning_rate": 3.570712136409228e-06, + "loss": 2.4172, + "theoretical_loss": 3.29842890821688, + "tokens_seen": 3277820928 + }, + { + "epoch": 11.0, + "learning_rate": 3.5606820461384153e-06, + "loss": 2.5081, + "theoretical_loss": 3.2984239284613652, + "tokens_seen": 3277886464 + }, + { + "epoch": 11.0, + "learning_rate": 3.5506519558676028e-06, + "loss": 2.542, + "theoretical_loss": 3.2984189488332882, + "tokens_seen": 3277952000 + }, + { + "epoch": 11.0, + "learning_rate": 3.5406218655967902e-06, + "loss": 2.4271, + "theoretical_loss": 3.2984139693326435, + "tokens_seen": 3278017536 + }, + { + "epoch": 11.0, + "learning_rate": 3.530591775325978e-06, + "loss": 2.5892, + "theoretical_loss": 3.2984089899594253, + "tokens_seen": 3278083072 + }, + { + "epoch": 11.0, + "learning_rate": 3.5205616850551656e-06, + "loss": 2.3962, + "theoretical_loss": 3.2984040107136274, + "tokens_seen": 3278148608 + }, + { + "epoch": 11.0, + "learning_rate": 3.510531594784353e-06, + "loss": 2.5071, + "theoretical_loss": 3.298399031595244, + "tokens_seen": 3278214144 + }, + { + "epoch": 11.0, + "learning_rate": 3.500501504513541e-06, + "loss": 2.637, + "theoretical_loss": 3.2983940526042694, + "tokens_seen": 3278279680 + }, + { + "epoch": 11.0, + "learning_rate": 3.490471414242728e-06, + "loss": 2.4059, + "theoretical_loss": 3.298389073740698, + "tokens_seen": 3278345216 + }, + { + "epoch": 11.0, + "learning_rate": 3.4804413239719155e-06, + "loss": 2.4829, + "theoretical_loss": 3.2983840950045233, + "tokens_seen": 3278410752 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 3631478, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.516972064971924, + "objective/train/theoretical_loss": 3.298382850340385, + "objective/train/tokens_used": 3298887136, + "theoretical_loss": 3.298382850340385, + "tokens_seen": 3278427136 + }, + { + "epoch": 11.0, + "learning_rate": 3.4704112337011034e-06, + "loss": 2.404, + "theoretical_loss": 3.2983791163957403, + "tokens_seen": 3278476288 + }, + { + "epoch": 11.0, + "learning_rate": 3.460381143430291e-06, + "loss": 2.4454, + "theoretical_loss": 3.298374137914343, + "tokens_seen": 3278541824 + }, + { + "epoch": 11.0, + "learning_rate": 3.4503510531594788e-06, + "loss": 2.4664, + "theoretical_loss": 3.2983691595603255, + "tokens_seen": 3278607360 + }, + { + "epoch": 11.0, + "learning_rate": 3.4403209628886662e-06, + "loss": 2.5101, + "theoretical_loss": 3.298364181333682, + "tokens_seen": 3278672896 + }, + { + "epoch": 11.0, + "learning_rate": 3.4302908726178537e-06, + "loss": 2.4467, + "theoretical_loss": 3.2983592032344067, + "tokens_seen": 3278738432 + }, + { + "epoch": 11.0, + "learning_rate": 3.420260782347041e-06, + "loss": 2.4986, + "theoretical_loss": 3.2983542252624933, + "tokens_seen": 3278803968 + }, + { + "epoch": 11.0, + "learning_rate": 3.4102306920762287e-06, + "loss": 2.5225, + "theoretical_loss": 3.298349247417937, + "tokens_seen": 3278869504 + }, + { + "epoch": 11.0, + "learning_rate": 3.400200601805416e-06, + "loss": 2.3403, + "theoretical_loss": 3.2983442697007312, + "tokens_seen": 3278935040 + }, + { + "epoch": 11.0, + "learning_rate": 3.390170511534604e-06, + "loss": 2.4272, + "theoretical_loss": 3.2983392921108705, + "tokens_seen": 3279000576 + }, + { + "epoch": 11.0, + "learning_rate": 3.3801404212637915e-06, + "loss": 2.4724, + "theoretical_loss": 3.298334314648349, + "tokens_seen": 3279066112 + }, + { + "epoch": 11.0, + "learning_rate": 3.370110330992979e-06, + "loss": 2.4512, + "theoretical_loss": 3.2983293373131604, + "tokens_seen": 3279131648 + }, + { + "epoch": 11.0, + "learning_rate": 3.360080240722167e-06, + "loss": 2.394, + "theoretical_loss": 3.2983243601053, + "tokens_seen": 3279197184 + }, + { + "epoch": 11.0, + "learning_rate": 3.350050150451354e-06, + "loss": 2.4374, + "theoretical_loss": 3.2983193830247606, + "tokens_seen": 3279262720 + }, + { + "epoch": 11.0, + "learning_rate": 3.3400200601805414e-06, + "loss": 2.3747, + "theoretical_loss": 3.2983144060715377, + "tokens_seen": 3279328256 + }, + { + "epoch": 11.0, + "learning_rate": 3.3299899699097293e-06, + "loss": 2.4705, + "theoretical_loss": 3.298309429245625, + "tokens_seen": 3279393792 + }, + { + "epoch": 11.0, + "learning_rate": 3.3199598796389168e-06, + "loss": 2.4021, + "theoretical_loss": 3.298304452547016, + "tokens_seen": 3279459328 + }, + { + "epoch": 11.0, + "learning_rate": 3.3099297893681047e-06, + "loss": 2.4078, + "theoretical_loss": 3.298299475975706, + "tokens_seen": 3279524864 + }, + { + "epoch": 11.0, + "learning_rate": 3.299899699097292e-06, + "loss": 2.4884, + "theoretical_loss": 3.298294499531689, + "tokens_seen": 3279590400 + }, + { + "epoch": 11.0, + "learning_rate": 3.2898696088264796e-06, + "loss": 2.474, + "theoretical_loss": 3.298289523214959, + "tokens_seen": 3279655936 + }, + { + "epoch": 11.0, + "learning_rate": 3.279839518555667e-06, + "loss": 2.354, + "theoretical_loss": 3.2982845470255095, + "tokens_seen": 3279721472 + }, + { + "epoch": 11.0, + "learning_rate": 3.2698094282848546e-06, + "loss": 2.5486, + "theoretical_loss": 3.2982795709633357, + "tokens_seen": 3279787008 + }, + { + "epoch": 11.0, + "learning_rate": 3.259779338014042e-06, + "loss": 2.4622, + "theoretical_loss": 3.2982745950284316, + "tokens_seen": 3279852544 + }, + { + "epoch": 11.0, + "learning_rate": 3.24974924774323e-06, + "loss": 2.4006, + "theoretical_loss": 3.298269619220791, + "tokens_seen": 3279918080 + }, + { + "epoch": 11.0, + "learning_rate": 3.2397191574724174e-06, + "loss": 2.5002, + "theoretical_loss": 3.298264643540408, + "tokens_seen": 3279983616 + }, + { + "epoch": 11.0, + "learning_rate": 3.229689067201605e-06, + "loss": 2.415, + "theoretical_loss": 3.2982596679872778, + "tokens_seen": 3280049152 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 3636498, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.803462028503418, + "objective/train/theoretical_loss": 3.298258424118878, + "objective/train/tokens_used": 3300525536, + "theoretical_loss": 3.298258424118878, + "tokens_seen": 3280065536 + }, + { + "epoch": 11.0, + "learning_rate": 3.2196589769307928e-06, + "loss": 2.528, + "theoretical_loss": 3.298254692561394, + "tokens_seen": 3280114688 + }, + { + "epoch": 11.0, + "learning_rate": 3.20962888665998e-06, + "loss": 2.4099, + "theoretical_loss": 3.2982497172627503, + "tokens_seen": 3280180224 + }, + { + "epoch": 11.0, + "learning_rate": 3.1995987963891673e-06, + "loss": 2.284, + "theoretical_loss": 3.2982447420913417, + "tokens_seen": 3280245760 + }, + { + "epoch": 11.0, + "learning_rate": 3.189568706118355e-06, + "loss": 2.3534, + "theoretical_loss": 3.298239767047162, + "tokens_seen": 3280311296 + }, + { + "epoch": 11.0, + "learning_rate": 3.1795386158475427e-06, + "loss": 2.4631, + "theoretical_loss": 3.2982347921302053, + "tokens_seen": 3280376832 + }, + { + "epoch": 11.0, + "learning_rate": 3.16950852557673e-06, + "loss": 2.3571, + "theoretical_loss": 3.298229817340466, + "tokens_seen": 3280442368 + }, + { + "epoch": 11.0, + "learning_rate": 3.159478435305918e-06, + "loss": 2.6004, + "theoretical_loss": 3.2982248426779384, + "tokens_seen": 3280507904 + }, + { + "epoch": 11.0, + "learning_rate": 3.1494483450351055e-06, + "loss": 2.5979, + "theoretical_loss": 3.298219868142617, + "tokens_seen": 3280573440 + }, + { + "epoch": 11.0, + "learning_rate": 3.139418254764293e-06, + "loss": 2.5286, + "theoretical_loss": 3.298214893734495, + "tokens_seen": 3280638976 + }, + { + "epoch": 11.0, + "learning_rate": 3.1293881644934804e-06, + "loss": 2.35, + "theoretical_loss": 3.2982099194535675, + "tokens_seen": 3280704512 + }, + { + "epoch": 11.0, + "learning_rate": 3.119358074222668e-06, + "loss": 2.5921, + "theoretical_loss": 3.298204945299828, + "tokens_seen": 3280770048 + }, + { + "epoch": 11.0, + "learning_rate": 3.109327983951856e-06, + "loss": 2.6235, + "theoretical_loss": 3.2981999712732715, + "tokens_seen": 3280835584 + }, + { + "epoch": 11.01, + "learning_rate": 3.0992978936810433e-06, + "loss": 2.4394, + "theoretical_loss": 3.2981949973738915, + "tokens_seen": 3280901120 + }, + { + "epoch": 11.01, + "learning_rate": 3.0892678034102308e-06, + "loss": 2.3822, + "theoretical_loss": 3.298190023601683, + "tokens_seen": 3280966656 + }, + { + "epoch": 11.01, + "learning_rate": 3.0792377131394187e-06, + "loss": 2.5673, + "theoretical_loss": 3.2981850499566394, + "tokens_seen": 3281032192 + }, + { + "epoch": 11.01, + "learning_rate": 3.0692076228686057e-06, + "loss": 2.4498, + "theoretical_loss": 3.298180076438755, + "tokens_seen": 3281097728 + }, + { + "epoch": 11.01, + "learning_rate": 3.059177532597793e-06, + "loss": 2.5526, + "theoretical_loss": 3.298175103048025, + "tokens_seen": 3281163264 + }, + { + "epoch": 11.01, + "learning_rate": 3.049147442326981e-06, + "loss": 2.521, + "theoretical_loss": 3.2981701297844426, + "tokens_seen": 3281228800 + }, + { + "epoch": 11.01, + "learning_rate": 3.0391173520561685e-06, + "loss": 2.3419, + "theoretical_loss": 3.298165156648002, + "tokens_seen": 3281294336 + }, + { + "epoch": 11.01, + "learning_rate": 3.029087261785356e-06, + "loss": 2.5755, + "theoretical_loss": 3.298160183638698, + "tokens_seen": 3281359872 + }, + { + "epoch": 11.01, + "learning_rate": 3.019057171514544e-06, + "loss": 2.5974, + "theoretical_loss": 3.298155210756524, + "tokens_seen": 3281425408 + }, + { + "epoch": 11.01, + "learning_rate": 3.009027081243731e-06, + "loss": 2.496, + "theoretical_loss": 3.298150238001475, + "tokens_seen": 3281490944 + }, + { + "epoch": 11.01, + "learning_rate": 2.998996990972919e-06, + "loss": 2.4782, + "theoretical_loss": 3.2981452653735452, + "tokens_seen": 3281556480 + }, + { + "epoch": 11.01, + "learning_rate": 2.9889669007021063e-06, + "loss": 2.4258, + "theoretical_loss": 3.2981402928727284, + "tokens_seen": 3281622016 + }, + { + "epoch": 11.01, + "learning_rate": 2.978936810431294e-06, + "loss": 2.4614, + "theoretical_loss": 3.2981353204990187, + "tokens_seen": 3281687552 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3641603, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.629624843597412, + "objective/train/theoretical_loss": 3.2981340774254515, + "objective/train/tokens_used": 3302163936, + "theoretical_loss": 3.2981340774254515, + "tokens_seen": 3281703936 + }, + { + "epoch": 11.01, + "learning_rate": 2.9689067201604817e-06, + "loss": 2.3758, + "theoretical_loss": 3.298130348252411, + "tokens_seen": 3281753088 + }, + { + "epoch": 11.01, + "learning_rate": 2.958876629889669e-06, + "loss": 2.3315, + "theoretical_loss": 3.298125376132899, + "tokens_seen": 3281818624 + }, + { + "epoch": 11.01, + "learning_rate": 2.9488465396188567e-06, + "loss": 2.562, + "theoretical_loss": 3.2981204041404766, + "tokens_seen": 3281884160 + }, + { + "epoch": 11.01, + "learning_rate": 2.938816449348044e-06, + "loss": 2.4936, + "theoretical_loss": 3.2981154322751385, + "tokens_seen": 3281949696 + }, + { + "epoch": 11.01, + "learning_rate": 2.9287863590772316e-06, + "loss": 2.3418, + "theoretical_loss": 3.298110460536879, + "tokens_seen": 3282015232 + }, + { + "epoch": 11.01, + "learning_rate": 2.918756268806419e-06, + "loss": 2.4756, + "theoretical_loss": 3.298105488925692, + "tokens_seen": 3282080768 + }, + { + "epoch": 11.01, + "learning_rate": 2.908726178535607e-06, + "loss": 2.3747, + "theoretical_loss": 3.298100517441572, + "tokens_seen": 3282146304 + }, + { + "epoch": 11.01, + "learning_rate": 2.8986960882647944e-06, + "loss": 2.6012, + "theoretical_loss": 3.2980955460845127, + "tokens_seen": 3282211840 + }, + { + "epoch": 11.01, + "learning_rate": 2.888665997993982e-06, + "loss": 2.5482, + "theoretical_loss": 3.298090574854509, + "tokens_seen": 3282277376 + }, + { + "epoch": 11.01, + "learning_rate": 2.87863590772317e-06, + "loss": 2.5797, + "theoretical_loss": 3.298085603751555, + "tokens_seen": 3282342912 + }, + { + "epoch": 11.01, + "learning_rate": 2.868605817452357e-06, + "loss": 2.4004, + "theoretical_loss": 3.298080632775644, + "tokens_seen": 3282408448 + }, + { + "epoch": 11.01, + "learning_rate": 2.8585757271815448e-06, + "loss": 2.3293, + "theoretical_loss": 3.2980756619267715, + "tokens_seen": 3282473984 + }, + { + "epoch": 11.01, + "learning_rate": 2.8485456369107322e-06, + "loss": 2.5562, + "theoretical_loss": 3.298070691204931, + "tokens_seen": 3282539520 + }, + { + "epoch": 11.01, + "learning_rate": 2.8385155466399197e-06, + "loss": 2.3933, + "theoretical_loss": 3.2980657206101167, + "tokens_seen": 3282605056 + }, + { + "epoch": 11.01, + "learning_rate": 2.8284854563691076e-06, + "loss": 2.3639, + "theoretical_loss": 3.2980607501423234, + "tokens_seen": 3282670592 + }, + { + "epoch": 11.01, + "learning_rate": 2.818455366098295e-06, + "loss": 2.4546, + "theoretical_loss": 3.2980557798015444, + "tokens_seen": 3282736128 + }, + { + "epoch": 11.01, + "learning_rate": 2.8084252758274825e-06, + "loss": 2.3462, + "theoretical_loss": 3.2980508095877745, + "tokens_seen": 3282801664 + }, + { + "epoch": 11.01, + "learning_rate": 2.79839518555667e-06, + "loss": 2.5417, + "theoretical_loss": 3.2980458395010084, + "tokens_seen": 3282867200 + }, + { + "epoch": 11.01, + "learning_rate": 2.7883650952858575e-06, + "loss": 2.4159, + "theoretical_loss": 3.2980408695412393, + "tokens_seen": 3282932736 + }, + { + "epoch": 11.01, + "learning_rate": 2.778335005015045e-06, + "loss": 2.502, + "theoretical_loss": 3.2980358997084616, + "tokens_seen": 3282998272 + }, + { + "epoch": 11.01, + "learning_rate": 2.768304914744233e-06, + "loss": 2.3216, + "theoretical_loss": 3.2980309300026702, + "tokens_seen": 3283063808 + }, + { + "epoch": 11.01, + "learning_rate": 2.7582748244734203e-06, + "loss": 2.491, + "theoretical_loss": 3.2980259604238586, + "tokens_seen": 3283129344 + }, + { + "epoch": 11.01, + "learning_rate": 2.748244734202608e-06, + "loss": 2.3682, + "theoretical_loss": 3.2980209909720215, + "tokens_seen": 3283194880 + }, + { + "epoch": 11.01, + "learning_rate": 2.7382146439317957e-06, + "loss": 2.4925, + "theoretical_loss": 3.298016021647153, + "tokens_seen": 3283260416 + }, + { + "epoch": 11.01, + "learning_rate": 2.7281845536609828e-06, + "loss": 2.6088, + "theoretical_loss": 3.2980110524492474, + "tokens_seen": 3283325952 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3646724, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.213735818862915, + "objective/train/theoretical_loss": 3.2980098101696083, + "objective/train/tokens_used": 3303802336, + "theoretical_loss": 3.2980098101696083, + "tokens_seen": 3283342336 + }, + { + "epoch": 11.01, + "learning_rate": 2.7181544633901707e-06, + "loss": 2.4305, + "theoretical_loss": 3.298006083378299, + "tokens_seen": 3283391488 + }, + { + "epoch": 11.01, + "learning_rate": 2.708124373119358e-06, + "loss": 2.3703, + "theoretical_loss": 3.2980011144343013, + "tokens_seen": 3283457024 + }, + { + "epoch": 11.01, + "learning_rate": 2.6980942828485456e-06, + "loss": 2.4496, + "theoretical_loss": 3.297996145617249, + "tokens_seen": 3283522560 + }, + { + "epoch": 11.01, + "learning_rate": 2.6880641925777335e-06, + "loss": 2.4439, + "theoretical_loss": 3.2979911769271366, + "tokens_seen": 3283588096 + }, + { + "epoch": 11.01, + "learning_rate": 2.678034102306921e-06, + "loss": 2.4186, + "theoretical_loss": 3.297986208363958, + "tokens_seen": 3283653632 + }, + { + "epoch": 11.01, + "learning_rate": 2.6680040120361084e-06, + "loss": 2.3892, + "theoretical_loss": 3.2979812399277075, + "tokens_seen": 3283719168 + }, + { + "epoch": 11.01, + "learning_rate": 2.657973921765296e-06, + "loss": 2.5647, + "theoretical_loss": 3.2979762716183796, + "tokens_seen": 3283784704 + }, + { + "epoch": 11.01, + "learning_rate": 2.6479438314944834e-06, + "loss": 2.5906, + "theoretical_loss": 3.2979713034359683, + "tokens_seen": 3283850240 + }, + { + "epoch": 11.01, + "learning_rate": 2.637913741223671e-06, + "loss": 2.5927, + "theoretical_loss": 3.297966335380467, + "tokens_seen": 3283915776 + }, + { + "epoch": 11.01, + "learning_rate": 2.6278836509528588e-06, + "loss": 2.3992, + "theoretical_loss": 3.2979613674518715, + "tokens_seen": 3283981312 + }, + { + "epoch": 11.01, + "learning_rate": 2.6178535606820462e-06, + "loss": 2.3425, + "theoretical_loss": 3.2979563996501753, + "tokens_seen": 3284046848 + }, + { + "epoch": 11.01, + "learning_rate": 2.6078234704112337e-06, + "loss": 2.5707, + "theoretical_loss": 3.297951431975372, + "tokens_seen": 3284112384 + }, + { + "epoch": 11.01, + "learning_rate": 2.5977933801404216e-06, + "loss": 2.486, + "theoretical_loss": 3.297946464427457, + "tokens_seen": 3284177920 + }, + { + "epoch": 11.01, + "learning_rate": 2.5877632898696087e-06, + "loss": 2.5502, + "theoretical_loss": 3.2979414970064234, + "tokens_seen": 3284243456 + }, + { + "epoch": 11.01, + "learning_rate": 2.577733199598796e-06, + "loss": 2.3492, + "theoretical_loss": 3.297936529712266, + "tokens_seen": 3284308992 + }, + { + "epoch": 11.01, + "learning_rate": 2.567703109327984e-06, + "loss": 2.3892, + "theoretical_loss": 3.297931562544979, + "tokens_seen": 3284374528 + }, + { + "epoch": 11.01, + "learning_rate": 2.5576730190571715e-06, + "loss": 2.5177, + "theoretical_loss": 3.297926595504557, + "tokens_seen": 3284440064 + }, + { + "epoch": 11.01, + "learning_rate": 2.5476429287863594e-06, + "loss": 2.4447, + "theoretical_loss": 3.2979216285909936, + "tokens_seen": 3284505600 + }, + { + "epoch": 11.01, + "learning_rate": 2.537612838515547e-06, + "loss": 2.4984, + "theoretical_loss": 3.2979166618042832, + "tokens_seen": 3284571136 + }, + { + "epoch": 11.01, + "learning_rate": 2.5275827482447343e-06, + "loss": 2.3384, + "theoretical_loss": 3.2979116951444203, + "tokens_seen": 3284636672 + }, + { + "epoch": 11.01, + "learning_rate": 2.517552657973922e-06, + "loss": 2.4293, + "theoretical_loss": 3.2979067286113986, + "tokens_seen": 3284702208 + }, + { + "epoch": 11.01, + "learning_rate": 2.5075225677031093e-06, + "loss": 2.4272, + "theoretical_loss": 3.2979017622052127, + "tokens_seen": 3284767744 + }, + { + "epoch": 11.01, + "learning_rate": 2.4974924774322968e-06, + "loss": 2.5172, + "theoretical_loss": 3.297896795925857, + "tokens_seen": 3284833280 + }, + { + "epoch": 11.01, + "learning_rate": 2.4874623871614847e-06, + "loss": 2.2925, + "theoretical_loss": 3.2978918297733255, + "tokens_seen": 3284898816 + }, + { + "epoch": 11.01, + "learning_rate": 2.477432296890672e-06, + "loss": 2.4706, + "theoretical_loss": 3.297886863747612, + "tokens_seen": 3284964352 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3651736, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.337067127227783, + "objective/train/theoretical_loss": 3.2978856222609987, + "objective/train/tokens_used": 3305440736, + "theoretical_loss": 3.2978856222609987, + "tokens_seen": 3284980736 + }, + { + "epoch": 11.01, + "learning_rate": 2.4674022066198596e-06, + "loss": 2.3101, + "theoretical_loss": 3.2978818978487117, + "tokens_seen": 3285029888 + }, + { + "epoch": 11.01, + "learning_rate": 2.4573721163490475e-06, + "loss": 2.423, + "theoretical_loss": 3.297876932076618, + "tokens_seen": 3285095424 + }, + { + "epoch": 11.01, + "learning_rate": 2.4473420260782345e-06, + "loss": 2.3217, + "theoretical_loss": 3.297871966431326, + "tokens_seen": 3285160960 + }, + { + "epoch": 11.01, + "learning_rate": 2.437311935807422e-06, + "loss": 2.5017, + "theoretical_loss": 3.2978670009128286, + "tokens_seen": 3285226496 + }, + { + "epoch": 11.01, + "learning_rate": 2.42728184553661e-06, + "loss": 2.4069, + "theoretical_loss": 3.297862035521121, + "tokens_seen": 3285292032 + }, + { + "epoch": 11.01, + "learning_rate": 2.4172517552657974e-06, + "loss": 2.5342, + "theoretical_loss": 3.2978570702561973, + "tokens_seen": 3285357568 + }, + { + "epoch": 11.01, + "learning_rate": 2.4072216649949853e-06, + "loss": 2.2647, + "theoretical_loss": 3.297852105118052, + "tokens_seen": 3285423104 + }, + { + "epoch": 11.01, + "learning_rate": 2.3971915747241728e-06, + "loss": 2.4249, + "theoretical_loss": 3.2978471401066782, + "tokens_seen": 3285488640 + }, + { + "epoch": 11.01, + "learning_rate": 2.38716148445336e-06, + "loss": 2.4837, + "theoretical_loss": 3.2978421752220717, + "tokens_seen": 3285554176 + }, + { + "epoch": 11.01, + "learning_rate": 2.3771313941825477e-06, + "loss": 2.2961, + "theoretical_loss": 3.297837210464226, + "tokens_seen": 3285619712 + }, + { + "epoch": 11.01, + "learning_rate": 2.367101303911735e-06, + "loss": 2.3352, + "theoretical_loss": 3.297832245833135, + "tokens_seen": 3285685248 + }, + { + "epoch": 11.01, + "learning_rate": 2.3570712136409226e-06, + "loss": 2.4047, + "theoretical_loss": 3.297827281328793, + "tokens_seen": 3285750784 + }, + { + "epoch": 11.01, + "learning_rate": 2.3470411233701105e-06, + "loss": 2.5793, + "theoretical_loss": 3.297822316951195, + "tokens_seen": 3285816320 + }, + { + "epoch": 11.01, + "learning_rate": 2.337011033099298e-06, + "loss": 2.5735, + "theoretical_loss": 3.2978173527003345, + "tokens_seen": 3285881856 + }, + { + "epoch": 11.01, + "learning_rate": 2.3269809428284855e-06, + "loss": 2.4261, + "theoretical_loss": 3.297812388576206, + "tokens_seen": 3285947392 + }, + { + "epoch": 11.01, + "learning_rate": 2.316950852557673e-06, + "loss": 2.3893, + "theoretical_loss": 3.2978074245788034, + "tokens_seen": 3286012928 + }, + { + "epoch": 11.01, + "learning_rate": 2.3069207622868604e-06, + "loss": 2.593, + "theoretical_loss": 3.2978024607081213, + "tokens_seen": 3286078464 + }, + { + "epoch": 11.01, + "learning_rate": 2.296890672016048e-06, + "loss": 2.4433, + "theoretical_loss": 3.2977974969641544, + "tokens_seen": 3286144000 + }, + { + "epoch": 11.01, + "learning_rate": 2.286860581745236e-06, + "loss": 2.4824, + "theoretical_loss": 3.297792533346896, + "tokens_seen": 3286209536 + }, + { + "epoch": 11.01, + "learning_rate": 2.2768304914744233e-06, + "loss": 2.4602, + "theoretical_loss": 3.297787569856341, + "tokens_seen": 3286275072 + }, + { + "epoch": 11.01, + "learning_rate": 2.266800401203611e-06, + "loss": 2.3591, + "theoretical_loss": 3.2977826064924827, + "tokens_seen": 3286340608 + }, + { + "epoch": 11.01, + "learning_rate": 2.2567703109327987e-06, + "loss": 2.3961, + "theoretical_loss": 3.2977776432553165, + "tokens_seen": 3286406144 + }, + { + "epoch": 11.01, + "learning_rate": 2.2467402206619857e-06, + "loss": 2.442, + "theoretical_loss": 3.297772680144836, + "tokens_seen": 3286471680 + }, + { + "epoch": 11.01, + "learning_rate": 2.2367101303911736e-06, + "loss": 2.4098, + "theoretical_loss": 3.297767717161036, + "tokens_seen": 3286537216 + }, + { + "epoch": 11.01, + "learning_rate": 2.226680040120361e-06, + "loss": 2.41, + "theoretical_loss": 3.29776275430391, + "tokens_seen": 3286602752 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3656731, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6031546592712402, + "objective/train/theoretical_loss": 3.2977615136094207, + "objective/train/tokens_used": 3307079136, + "theoretical_loss": 3.2977615136094207, + "tokens_seen": 3286619136 + }, + { + "epoch": 11.01, + "learning_rate": 2.2166499498495485e-06, + "loss": 2.5736, + "theoretical_loss": 3.297757791573453, + "tokens_seen": 3286668288 + }, + { + "epoch": 11.01, + "learning_rate": 2.2066198595787364e-06, + "loss": 2.6197, + "theoretical_loss": 3.2977528289696583, + "tokens_seen": 3286733824 + }, + { + "epoch": 11.01, + "learning_rate": 2.196589769307924e-06, + "loss": 2.2528, + "theoretical_loss": 3.2977478664925206, + "tokens_seen": 3286799360 + }, + { + "epoch": 11.01, + "learning_rate": 2.1865596790371114e-06, + "loss": 2.1717, + "theoretical_loss": 3.2977429041420345, + "tokens_seen": 3286864896 + }, + { + "epoch": 11.01, + "learning_rate": 2.176529588766299e-06, + "loss": 2.4929, + "theoretical_loss": 3.2977379419181942, + "tokens_seen": 3286930432 + }, + { + "epoch": 11.01, + "learning_rate": 2.1664994984954863e-06, + "loss": 2.4075, + "theoretical_loss": 3.2977329798209936, + "tokens_seen": 3286995968 + }, + { + "epoch": 11.01, + "learning_rate": 2.156469408224674e-06, + "loss": 2.3045, + "theoretical_loss": 3.2977280178504267, + "tokens_seen": 3287061504 + }, + { + "epoch": 11.01, + "learning_rate": 2.1464393179538617e-06, + "loss": 2.4949, + "theoretical_loss": 3.2977230560064883, + "tokens_seen": 3287127040 + }, + { + "epoch": 11.01, + "learning_rate": 2.136409227683049e-06, + "loss": 2.4567, + "theoretical_loss": 3.297718094289172, + "tokens_seen": 3287192576 + }, + { + "epoch": 11.01, + "learning_rate": 2.126379137412237e-06, + "loss": 2.3193, + "theoretical_loss": 3.297713132698473, + "tokens_seen": 3287258112 + }, + { + "epoch": 11.01, + "learning_rate": 2.1163490471414245e-06, + "loss": 2.4916, + "theoretical_loss": 3.297708171234385, + "tokens_seen": 3287323648 + }, + { + "epoch": 11.01, + "learning_rate": 2.1063189568706116e-06, + "loss": 2.5118, + "theoretical_loss": 3.297703209896902, + "tokens_seen": 3287389184 + }, + { + "epoch": 11.01, + "learning_rate": 2.0962888665997995e-06, + "loss": 2.3504, + "theoretical_loss": 3.297698248686019, + "tokens_seen": 3287454720 + }, + { + "epoch": 11.01, + "learning_rate": 2.086258776328987e-06, + "loss": 2.4586, + "theoretical_loss": 3.297693287601729, + "tokens_seen": 3287520256 + }, + { + "epoch": 11.01, + "learning_rate": 2.0762286860581744e-06, + "loss": 2.569, + "theoretical_loss": 3.2976883266440273, + "tokens_seen": 3287585792 + }, + { + "epoch": 11.01, + "learning_rate": 2.0661985957873623e-06, + "loss": 2.4847, + "theoretical_loss": 3.297683365812908, + "tokens_seen": 3287651328 + }, + { + "epoch": 11.01, + "learning_rate": 2.05616850551655e-06, + "loss": 2.6212, + "theoretical_loss": 3.297678405108365, + "tokens_seen": 3287716864 + }, + { + "epoch": 11.01, + "learning_rate": 2.0461384152457373e-06, + "loss": 2.4075, + "theoretical_loss": 3.297673444530393, + "tokens_seen": 3287782400 + }, + { + "epoch": 11.01, + "learning_rate": 2.0361083249749248e-06, + "loss": 2.382, + "theoretical_loss": 3.2976684840789856, + "tokens_seen": 3287847936 + }, + { + "epoch": 11.01, + "learning_rate": 2.0260782347041122e-06, + "loss": 2.5109, + "theoretical_loss": 3.297663523754138, + "tokens_seen": 3287913472 + }, + { + "epoch": 11.01, + "learning_rate": 2.0160481444332997e-06, + "loss": 2.4216, + "theoretical_loss": 3.297658563555843, + "tokens_seen": 3287979008 + }, + { + "epoch": 11.01, + "learning_rate": 2.0060180541624876e-06, + "loss": 2.2866, + "theoretical_loss": 3.297653603484097, + "tokens_seen": 3288044544 + }, + { + "epoch": 11.01, + "learning_rate": 1.995987963891675e-06, + "loss": 2.3096, + "theoretical_loss": 3.297648643538892, + "tokens_seen": 3288110080 + }, + { + "epoch": 11.01, + "learning_rate": 1.985957873620863e-06, + "loss": 2.5463, + "theoretical_loss": 3.2976436837202234, + "tokens_seen": 3288175616 + }, + { + "epoch": 11.01, + "learning_rate": 1.9759277833500504e-06, + "loss": 2.4233, + "theoretical_loss": 3.2976387240280856, + "tokens_seen": 3288241152 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3657404, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.405703544616699, + "objective/train/theoretical_loss": 3.2976374841248206, + "objective/train/tokens_used": 3308717536, + "theoretical_loss": 3.2976374841248206, + "tokens_seen": 3288257536 + }, + { + "epoch": 11.01, + "learning_rate": 1.9658976930792375e-06, + "loss": 2.4049, + "theoretical_loss": 3.297633764462472, + "tokens_seen": 3288306688 + }, + { + "epoch": 11.01, + "learning_rate": 1.9558676028084254e-06, + "loss": 2.4662, + "theoretical_loss": 3.2976288050233777, + "tokens_seen": 3288372224 + }, + { + "epoch": 11.01, + "learning_rate": 1.945837512537613e-06, + "loss": 2.6248, + "theoretical_loss": 3.2976238457107967, + "tokens_seen": 3288437760 + }, + { + "epoch": 11.01, + "learning_rate": 1.9358074222668003e-06, + "loss": 2.3938, + "theoretical_loss": 3.297618886524723, + "tokens_seen": 3288503296 + }, + { + "epoch": 11.01, + "learning_rate": 1.9257773319959882e-06, + "loss": 2.4305, + "theoretical_loss": 3.297613927465151, + "tokens_seen": 3288568832 + }, + { + "epoch": 11.01, + "learning_rate": 1.9157472417251753e-06, + "loss": 2.3802, + "theoretical_loss": 3.2976089685320753, + "tokens_seen": 3288634368 + }, + { + "epoch": 11.01, + "learning_rate": 1.9057171514543632e-06, + "loss": 2.541, + "theoretical_loss": 3.2976040097254895, + "tokens_seen": 3288699904 + }, + { + "epoch": 11.01, + "learning_rate": 1.8956870611835506e-06, + "loss": 2.4723, + "theoretical_loss": 3.2975990510453883, + "tokens_seen": 3288765440 + }, + { + "epoch": 11.01, + "learning_rate": 1.8856569709127383e-06, + "loss": 2.3862, + "theoretical_loss": 3.297594092491766, + "tokens_seen": 3288830976 + }, + { + "epoch": 11.01, + "learning_rate": 1.8756268806419258e-06, + "loss": 2.4118, + "theoretical_loss": 3.2975891340646166, + "tokens_seen": 3288896512 + }, + { + "epoch": 11.01, + "learning_rate": 1.8655967903711135e-06, + "loss": 2.3209, + "theoretical_loss": 3.2975841757639346, + "tokens_seen": 3288962048 + }, + { + "epoch": 11.01, + "learning_rate": 1.855566700100301e-06, + "loss": 2.509, + "theoretical_loss": 3.2975792175897136, + "tokens_seen": 3289027584 + }, + { + "epoch": 11.01, + "learning_rate": 1.8455366098294884e-06, + "loss": 2.6126, + "theoretical_loss": 3.297574259541949, + "tokens_seen": 3289093120 + }, + { + "epoch": 11.01, + "learning_rate": 1.8355065195586761e-06, + "loss": 2.314, + "theoretical_loss": 3.297569301620634, + "tokens_seen": 3289158656 + }, + { + "epoch": 11.01, + "learning_rate": 1.8254764292878636e-06, + "loss": 2.4824, + "theoretical_loss": 3.2975643438257634, + "tokens_seen": 3289224192 + }, + { + "epoch": 11.01, + "learning_rate": 1.815446339017051e-06, + "loss": 2.3696, + "theoretical_loss": 3.2975593861573316, + "tokens_seen": 3289289728 + }, + { + "epoch": 11.01, + "learning_rate": 1.8054162487462388e-06, + "loss": 2.2555, + "theoretical_loss": 3.297554428615332, + "tokens_seen": 3289355264 + }, + { + "epoch": 11.01, + "learning_rate": 1.7953861584754264e-06, + "loss": 2.2736, + "theoretical_loss": 3.2975494711997597, + "tokens_seen": 3289420800 + }, + { + "epoch": 11.01, + "learning_rate": 1.785356068204614e-06, + "loss": 2.1785, + "theoretical_loss": 3.297544513910609, + "tokens_seen": 3289486336 + }, + { + "epoch": 11.01, + "learning_rate": 1.7753259779338014e-06, + "loss": 2.2953, + "theoretical_loss": 3.2975395567478736, + "tokens_seen": 3289551872 + }, + { + "epoch": 11.01, + "learning_rate": 1.765295887662989e-06, + "loss": 2.5441, + "theoretical_loss": 3.297534599711548, + "tokens_seen": 3289617408 + }, + { + "epoch": 11.01, + "learning_rate": 1.7552657973921765e-06, + "loss": 2.475, + "theoretical_loss": 3.2975296428016265, + "tokens_seen": 3289682944 + }, + { + "epoch": 11.01, + "learning_rate": 1.745235707121364e-06, + "loss": 2.4941, + "theoretical_loss": 3.2975246860181033, + "tokens_seen": 3289748480 + }, + { + "epoch": 11.01, + "learning_rate": 1.7352056168505517e-06, + "loss": 2.3968, + "theoretical_loss": 3.297519729360973, + "tokens_seen": 3289814016 + }, + { + "epoch": 11.01, + "learning_rate": 1.7251755265797394e-06, + "loss": 2.3742, + "theoretical_loss": 3.297514772830229, + "tokens_seen": 3289879552 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3658524, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5236475467681885, + "objective/train/theoretical_loss": 3.2975135337172903, + "objective/train/tokens_used": 3310355936, + "theoretical_loss": 3.2975135337172903, + "tokens_seen": 3289895936 + }, + { + "epoch": 11.01, + "learning_rate": 1.7151454363089269e-06, + "loss": 2.4146, + "theoretical_loss": 3.2975098164258663, + "tokens_seen": 3289945088 + }, + { + "epoch": 11.01, + "learning_rate": 1.7051153460381143e-06, + "loss": 2.4894, + "theoretical_loss": 3.297504860147879, + "tokens_seen": 3290010624 + }, + { + "epoch": 11.01, + "learning_rate": 1.695085255767302e-06, + "loss": 2.356, + "theoretical_loss": 3.2974999039962616, + "tokens_seen": 3290076160 + }, + { + "epoch": 11.01, + "learning_rate": 1.6850551654964895e-06, + "loss": 2.3375, + "theoretical_loss": 3.297494947971008, + "tokens_seen": 3290141696 + }, + { + "epoch": 11.01, + "learning_rate": 1.675025075225677e-06, + "loss": 2.5238, + "theoretical_loss": 3.2974899920721126, + "tokens_seen": 3290207232 + }, + { + "epoch": 11.01, + "learning_rate": 1.6649949849548646e-06, + "loss": 2.4811, + "theoretical_loss": 3.2974850362995696, + "tokens_seen": 3290272768 + }, + { + "epoch": 11.01, + "learning_rate": 1.6549648946840523e-06, + "loss": 2.4245, + "theoretical_loss": 3.297480080653373, + "tokens_seen": 3290338304 + }, + { + "epoch": 11.01, + "learning_rate": 1.6449348044132398e-06, + "loss": 2.5195, + "theoretical_loss": 3.2974751251335173, + "tokens_seen": 3290403840 + }, + { + "epoch": 11.01, + "learning_rate": 1.6349047141424273e-06, + "loss": 2.6218, + "theoretical_loss": 3.2974701697399973, + "tokens_seen": 3290469376 + }, + { + "epoch": 11.01, + "learning_rate": 1.624874623871615e-06, + "loss": 2.3705, + "theoretical_loss": 3.2974652144728065, + "tokens_seen": 3290534912 + }, + { + "epoch": 11.01, + "learning_rate": 1.6148445336008024e-06, + "loss": 2.4007, + "theoretical_loss": 3.297460259331939, + "tokens_seen": 3290600448 + }, + { + "epoch": 11.01, + "learning_rate": 1.60481444332999e-06, + "loss": 2.4544, + "theoretical_loss": 3.29745530431739, + "tokens_seen": 3290665984 + }, + { + "epoch": 11.01, + "learning_rate": 1.5947843530591776e-06, + "loss": 2.3477, + "theoretical_loss": 3.297450349429153, + "tokens_seen": 3290731520 + }, + { + "epoch": 11.01, + "learning_rate": 1.584754262788365e-06, + "loss": 2.5655, + "theoretical_loss": 3.297445394667223, + "tokens_seen": 3290797056 + }, + { + "epoch": 11.01, + "learning_rate": 1.5747241725175527e-06, + "loss": 2.4519, + "theoretical_loss": 3.2974404400315933, + "tokens_seen": 3290862592 + }, + { + "epoch": 11.01, + "learning_rate": 1.5646940822467402e-06, + "loss": 2.3426, + "theoretical_loss": 3.297435485522259, + "tokens_seen": 3290928128 + }, + { + "epoch": 11.01, + "learning_rate": 1.554663991975928e-06, + "loss": 2.5845, + "theoretical_loss": 3.297430531139214, + "tokens_seen": 3290993664 + }, + { + "epoch": 11.01, + "learning_rate": 1.5446339017051154e-06, + "loss": 2.5272, + "theoretical_loss": 3.297425576882452, + "tokens_seen": 3291059200 + }, + { + "epoch": 11.01, + "learning_rate": 1.5346038114343029e-06, + "loss": 2.3826, + "theoretical_loss": 3.2974206227519685, + "tokens_seen": 3291124736 + }, + { + "epoch": 11.01, + "learning_rate": 1.5245737211634905e-06, + "loss": 2.4434, + "theoretical_loss": 3.2974156687477567, + "tokens_seen": 3291190272 + }, + { + "epoch": 11.01, + "learning_rate": 1.514543630892678e-06, + "loss": 2.3248, + "theoretical_loss": 3.2974107148698115, + "tokens_seen": 3291255808 + }, + { + "epoch": 11.01, + "learning_rate": 1.5045135406218655e-06, + "loss": 2.3835, + "theoretical_loss": 3.297405761118127, + "tokens_seen": 3291321344 + }, + { + "epoch": 11.01, + "learning_rate": 1.4944834503510532e-06, + "loss": 2.4939, + "theoretical_loss": 3.2974008074926973, + "tokens_seen": 3291386880 + }, + { + "epoch": 11.01, + "learning_rate": 1.4844533600802409e-06, + "loss": 2.3812, + "theoretical_loss": 3.2973958539935166, + "tokens_seen": 3291452416 + }, + { + "epoch": 11.01, + "learning_rate": 1.4744232698094283e-06, + "loss": 2.3665, + "theoretical_loss": 3.29739090062058, + "tokens_seen": 3291517952 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3659893, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5835659503936768, + "objective/train/theoretical_loss": 3.2973896622970704, + "objective/train/tokens_used": 3311994336, + "theoretical_loss": 3.2973896622970704, + "tokens_seen": 3291534336 + }, + { + "epoch": 11.01, + "learning_rate": 1.4643931795386158e-06, + "loss": 2.4451, + "theoretical_loss": 3.2973859473738805, + "tokens_seen": 3291583488 + }, + { + "epoch": 11.01, + "learning_rate": 1.4543630892678035e-06, + "loss": 2.4541, + "theoretical_loss": 3.2973809942534134, + "tokens_seen": 3291649024 + }, + { + "epoch": 11.01, + "learning_rate": 1.444332998996991e-06, + "loss": 2.4683, + "theoretical_loss": 3.297376041259172, + "tokens_seen": 3291714560 + }, + { + "epoch": 11.01, + "learning_rate": 1.4343029087261784e-06, + "loss": 2.5376, + "theoretical_loss": 3.297371088391152, + "tokens_seen": 3291780096 + }, + { + "epoch": 11.01, + "learning_rate": 1.4242728184553661e-06, + "loss": 2.4123, + "theoretical_loss": 3.2973661356493458, + "tokens_seen": 3291845632 + }, + { + "epoch": 11.01, + "learning_rate": 1.4142427281845538e-06, + "loss": 2.6492, + "theoretical_loss": 3.2973611830337495, + "tokens_seen": 3291911168 + }, + { + "epoch": 11.01, + "learning_rate": 1.4042126379137413e-06, + "loss": 2.5908, + "theoretical_loss": 3.297356230544356, + "tokens_seen": 3291976704 + }, + { + "epoch": 11.01, + "learning_rate": 1.3941825476429287e-06, + "loss": 2.5045, + "theoretical_loss": 3.2973512781811603, + "tokens_seen": 3292042240 + }, + { + "epoch": 11.01, + "learning_rate": 1.3841524573721164e-06, + "loss": 2.3427, + "theoretical_loss": 3.2973463259441567, + "tokens_seen": 3292107776 + }, + { + "epoch": 11.01, + "learning_rate": 1.374122367101304e-06, + "loss": 2.2856, + "theoretical_loss": 3.297341373833339, + "tokens_seen": 3292173312 + }, + { + "epoch": 11.01, + "learning_rate": 1.3640922768304914e-06, + "loss": 2.3285, + "theoretical_loss": 3.297336421848702, + "tokens_seen": 3292238848 + }, + { + "epoch": 11.01, + "learning_rate": 1.354062186559679e-06, + "loss": 2.5363, + "theoretical_loss": 3.2973314699902394, + "tokens_seen": 3292304384 + }, + { + "epoch": 11.01, + "learning_rate": 1.3440320962888667e-06, + "loss": 2.3616, + "theoretical_loss": 3.297326518257946, + "tokens_seen": 3292369920 + }, + { + "epoch": 11.01, + "learning_rate": 1.3340020060180542e-06, + "loss": 2.5385, + "theoretical_loss": 3.297321566651816, + "tokens_seen": 3292435456 + }, + { + "epoch": 11.01, + "learning_rate": 1.3239719157472417e-06, + "loss": 2.4934, + "theoretical_loss": 3.297316615171843, + "tokens_seen": 3292500992 + }, + { + "epoch": 11.01, + "learning_rate": 1.3139418254764294e-06, + "loss": 2.6809, + "theoretical_loss": 3.2973116638180224, + "tokens_seen": 3292566528 + }, + { + "epoch": 11.01, + "learning_rate": 1.3039117352056169e-06, + "loss": 2.3946, + "theoretical_loss": 3.2973067125903475, + "tokens_seen": 3292632064 + }, + { + "epoch": 11.01, + "learning_rate": 1.2938816449348043e-06, + "loss": 2.2271, + "theoretical_loss": 3.2973017614888134, + "tokens_seen": 3292697600 + }, + { + "epoch": 11.01, + "learning_rate": 1.283851554663992e-06, + "loss": 2.396, + "theoretical_loss": 3.2972968105134135, + "tokens_seen": 3292763136 + }, + { + "epoch": 11.01, + "learning_rate": 1.2738214643931797e-06, + "loss": 2.3798, + "theoretical_loss": 3.2972918596641425, + "tokens_seen": 3292828672 + }, + { + "epoch": 11.01, + "learning_rate": 1.2637913741223672e-06, + "loss": 2.2079, + "theoretical_loss": 3.297286908940995, + "tokens_seen": 3292894208 + }, + { + "epoch": 11.01, + "learning_rate": 1.2537612838515546e-06, + "loss": 2.531, + "theoretical_loss": 3.2972819583439645, + "tokens_seen": 3292959744 + }, + { + "epoch": 11.01, + "learning_rate": 1.2437311935807423e-06, + "loss": 2.5793, + "theoretical_loss": 3.2972770078730465, + "tokens_seen": 3293025280 + }, + { + "epoch": 11.01, + "learning_rate": 1.2337011033099298e-06, + "loss": 2.5074, + "theoretical_loss": 3.2972720575282337, + "tokens_seen": 3293090816 + }, + { + "epoch": 11.01, + "learning_rate": 1.2236710130391173e-06, + "loss": 2.5377, + "theoretical_loss": 3.297267107309522, + "tokens_seen": 3293156352 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3660474, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2138562202453613, + "objective/train/theoretical_loss": 3.297265869774546, + "objective/train/tokens_used": 3313632736, + "theoretical_loss": 3.297265869774546, + "tokens_seen": 3293172736 + }, + { + "epoch": 11.01, + "learning_rate": 1.213640922768305e-06, + "loss": 2.4569, + "theoretical_loss": 3.297262157216904, + "tokens_seen": 3293221888 + }, + { + "epoch": 11.01, + "learning_rate": 1.2036108324974926e-06, + "loss": 2.5303, + "theoretical_loss": 3.297257207250375, + "tokens_seen": 3293287424 + }, + { + "epoch": 11.01, + "learning_rate": 1.19358074222668e-06, + "loss": 2.2367, + "theoretical_loss": 3.2972522574099297, + "tokens_seen": 3293352960 + }, + { + "epoch": 11.01, + "learning_rate": 1.1835506519558676e-06, + "loss": 2.5222, + "theoretical_loss": 3.2972473076955615, + "tokens_seen": 3293418496 + }, + { + "epoch": 11.01, + "learning_rate": 1.1735205616850553e-06, + "loss": 2.4229, + "theoretical_loss": 3.297242358107265, + "tokens_seen": 3293484032 + }, + { + "epoch": 11.01, + "learning_rate": 1.1634904714142427e-06, + "loss": 2.6355, + "theoretical_loss": 3.2972374086450347, + "tokens_seen": 3293549568 + }, + { + "epoch": 11.01, + "learning_rate": 1.1534603811434302e-06, + "loss": 2.4775, + "theoretical_loss": 3.2972324593088644, + "tokens_seen": 3293615104 + }, + { + "epoch": 11.01, + "learning_rate": 1.143430290872618e-06, + "loss": 2.3192, + "theoretical_loss": 3.2972275100987485, + "tokens_seen": 3293680640 + }, + { + "epoch": 11.01, + "learning_rate": 1.1334002006018056e-06, + "loss": 2.4205, + "theoretical_loss": 3.2972225610146815, + "tokens_seen": 3293746176 + }, + { + "epoch": 11.01, + "learning_rate": 1.1233701103309928e-06, + "loss": 2.5761, + "theoretical_loss": 3.2972176120566576, + "tokens_seen": 3293811712 + }, + { + "epoch": 11.01, + "learning_rate": 1.1133400200601805e-06, + "loss": 2.4, + "theoretical_loss": 3.2972126632246717, + "tokens_seen": 3293877248 + }, + { + "epoch": 11.01, + "learning_rate": 1.1033099297893682e-06, + "loss": 2.5211, + "theoretical_loss": 3.297207714518717, + "tokens_seen": 3293942784 + }, + { + "epoch": 11.01, + "learning_rate": 1.0932798395185557e-06, + "loss": 2.242, + "theoretical_loss": 3.297202765938788, + "tokens_seen": 3294008320 + }, + { + "epoch": 11.01, + "learning_rate": 1.0832497492477432e-06, + "loss": 2.5417, + "theoretical_loss": 3.2971978174848795, + "tokens_seen": 3294073856 + }, + { + "epoch": 11.01, + "learning_rate": 1.0732196589769309e-06, + "loss": 2.3505, + "theoretical_loss": 3.2971928691569854, + "tokens_seen": 3294139392 + }, + { + "epoch": 11.01, + "learning_rate": 1.0631895687061185e-06, + "loss": 2.4247, + "theoretical_loss": 3.2971879209551003, + "tokens_seen": 3294204928 + }, + { + "epoch": 11.01, + "learning_rate": 1.0531594784353058e-06, + "loss": 2.3527, + "theoretical_loss": 3.297182972879218, + "tokens_seen": 3294270464 + }, + { + "epoch": 11.01, + "learning_rate": 1.0431293881644935e-06, + "loss": 2.5641, + "theoretical_loss": 3.297178024929333, + "tokens_seen": 3294336000 + }, + { + "epoch": 11.01, + "learning_rate": 1.0330992978936812e-06, + "loss": 2.6207, + "theoretical_loss": 3.29717307710544, + "tokens_seen": 3294401536 + }, + { + "epoch": 11.01, + "learning_rate": 1.0230692076228686e-06, + "loss": 2.35, + "theoretical_loss": 3.297168129407533, + "tokens_seen": 3294467072 + }, + { + "epoch": 11.01, + "learning_rate": 1.0130391173520561e-06, + "loss": 2.5491, + "theoretical_loss": 3.2971631818356055, + "tokens_seen": 3294532608 + }, + { + "epoch": 11.01, + "learning_rate": 1.0030090270812438e-06, + "loss": 2.429, + "theoretical_loss": 3.297158234389653, + "tokens_seen": 3294598144 + }, + { + "epoch": 11.01, + "learning_rate": 9.929789368104315e-07, + "loss": 2.4164, + "theoretical_loss": 3.2971532870696696, + "tokens_seen": 3294663680 + }, + { + "epoch": 11.01, + "learning_rate": 9.829488465396187e-07, + "loss": 2.6127, + "theoretical_loss": 3.297148339875649, + "tokens_seen": 3294729216 + }, + { + "epoch": 11.01, + "learning_rate": 9.729187562688064e-07, + "loss": 2.4943, + "theoretical_loss": 3.2971433928075857, + "tokens_seen": 3294794752 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3661890, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.039202928543091, + "objective/train/theoretical_loss": 3.29714215606025, + "objective/train/tokens_used": 3315271136, + "theoretical_loss": 3.29714215606025, + "tokens_seen": 3294811136 + }, + { + "epoch": 11.01, + "learning_rate": 9.628886659979941e-07, + "loss": 2.4041, + "theoretical_loss": 3.297138445865474, + "tokens_seen": 3294860288 + }, + { + "epoch": 11.01, + "learning_rate": 9.528585757271816e-07, + "loss": 2.4574, + "theoretical_loss": 3.2971334990493086, + "tokens_seen": 3294925824 + }, + { + "epoch": 11.01, + "learning_rate": 9.428284854563692e-07, + "loss": 2.4671, + "theoretical_loss": 3.2971285523590828, + "tokens_seen": 3294991360 + }, + { + "epoch": 11.01, + "learning_rate": 9.327983951855567e-07, + "loss": 2.3966, + "theoretical_loss": 3.297123605794792, + "tokens_seen": 3295056896 + }, + { + "epoch": 11.01, + "learning_rate": 9.227683049147442e-07, + "loss": 2.359, + "theoretical_loss": 3.29711865935643, + "tokens_seen": 3295122432 + }, + { + "epoch": 11.01, + "learning_rate": 9.127382146439318e-07, + "loss": 2.2599, + "theoretical_loss": 3.2971137130439914, + "tokens_seen": 3295187968 + }, + { + "epoch": 11.01, + "learning_rate": 9.027081243731194e-07, + "loss": 2.3202, + "theoretical_loss": 3.2971087668574697, + "tokens_seen": 3295253504 + }, + { + "epoch": 11.01, + "learning_rate": 8.92678034102307e-07, + "loss": 2.3565, + "theoretical_loss": 3.2971038207968597, + "tokens_seen": 3295319040 + }, + { + "epoch": 11.01, + "learning_rate": 8.826479438314945e-07, + "loss": 2.534, + "theoretical_loss": 3.297098874862156, + "tokens_seen": 3295384576 + }, + { + "epoch": 11.01, + "learning_rate": 8.72617853560682e-07, + "loss": 2.2723, + "theoretical_loss": 3.2970939290533523, + "tokens_seen": 3295450112 + }, + { + "epoch": 11.01, + "learning_rate": 8.625877632898697e-07, + "loss": 2.3955, + "theoretical_loss": 3.2970889833704433, + "tokens_seen": 3295515648 + }, + { + "epoch": 11.01, + "learning_rate": 8.525576730190572e-07, + "loss": 2.49, + "theoretical_loss": 3.297084037813423, + "tokens_seen": 3295581184 + }, + { + "epoch": 11.01, + "learning_rate": 8.425275827482447e-07, + "loss": 2.4699, + "theoretical_loss": 3.297079092382286, + "tokens_seen": 3295646720 + }, + { + "epoch": 11.01, + "learning_rate": 8.324974924774323e-07, + "loss": 2.3907, + "theoretical_loss": 3.2970741470770264, + "tokens_seen": 3295712256 + }, + { + "epoch": 11.01, + "learning_rate": 8.224674022066199e-07, + "loss": 2.5514, + "theoretical_loss": 3.297069201897638, + "tokens_seen": 3295777792 + }, + { + "epoch": 11.01, + "learning_rate": 8.124373119358075e-07, + "loss": 2.46, + "theoretical_loss": 3.2970642568441164, + "tokens_seen": 3295843328 + }, + { + "epoch": 11.01, + "learning_rate": 8.02407221664995e-07, + "loss": 2.3755, + "theoretical_loss": 3.2970593119164544, + "tokens_seen": 3295908864 + }, + { + "epoch": 11.01, + "learning_rate": 7.923771313941825e-07, + "loss": 2.3753, + "theoretical_loss": 3.2970543671146477, + "tokens_seen": 3295974400 + }, + { + "epoch": 11.01, + "learning_rate": 7.823470411233701e-07, + "loss": 2.3334, + "theoretical_loss": 3.2970494224386897, + "tokens_seen": 3296039936 + }, + { + "epoch": 11.01, + "learning_rate": 7.723169508525577e-07, + "loss": 2.5228, + "theoretical_loss": 3.2970444778885746, + "tokens_seen": 3296105472 + }, + { + "epoch": 11.01, + "learning_rate": 7.622868605817453e-07, + "loss": 2.5045, + "theoretical_loss": 3.2970395334642975, + "tokens_seen": 3296171008 + }, + { + "epoch": 11.01, + "learning_rate": 7.522567703109327e-07, + "loss": 2.4706, + "theoretical_loss": 3.2970345891658517, + "tokens_seen": 3296236544 + }, + { + "epoch": 11.01, + "learning_rate": 7.422266800401204e-07, + "loss": 2.2136, + "theoretical_loss": 3.2970296449932324, + "tokens_seen": 3296302080 + }, + { + "epoch": 11.01, + "learning_rate": 7.321965897693079e-07, + "loss": 2.4443, + "theoretical_loss": 3.297024700946433, + "tokens_seen": 3296367616 + }, + { + "epoch": 11.01, + "learning_rate": 7.221664994984955e-07, + "loss": 2.4079, + "theoretical_loss": 3.2970197570254487, + "tokens_seen": 3296433152 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3662714, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.0287671089172363, + "objective/train/theoretical_loss": 3.2970185210648606, + "objective/train/tokens_used": 3316909536, + "theoretical_loss": 3.2970185210648606, + "tokens_seen": 3296449536 + }, + { + "epoch": 11.01, + "learning_rate": 7.121364092276831e-07, + "loss": 2.2175, + "theoretical_loss": 3.2970148132302732, + "tokens_seen": 3296498688 + }, + { + "epoch": 11.01, + "learning_rate": 7.021063189568706e-07, + "loss": 2.269, + "theoretical_loss": 3.297009869560901, + "tokens_seen": 3296564224 + }, + { + "epoch": 11.01, + "learning_rate": 6.920762286860582e-07, + "loss": 2.3691, + "theoretical_loss": 3.297004926017326, + "tokens_seen": 3296629760 + }, + { + "epoch": 11.01, + "learning_rate": 6.820461384152457e-07, + "loss": 2.3685, + "theoretical_loss": 3.2969999825995435, + "tokens_seen": 3296695296 + }, + { + "epoch": 11.01, + "learning_rate": 6.720160481444334e-07, + "loss": 2.1834, + "theoretical_loss": 3.296995039307547, + "tokens_seen": 3296760832 + }, + { + "epoch": 11.01, + "learning_rate": 6.619859578736208e-07, + "loss": 2.3384, + "theoretical_loss": 3.2969900961413305, + "tokens_seen": 3296826368 + }, + { + "epoch": 11.01, + "learning_rate": 6.519558676028084e-07, + "loss": 2.5262, + "theoretical_loss": 3.2969851531008896, + "tokens_seen": 3296891904 + }, + { + "epoch": 11.01, + "learning_rate": 6.41925777331996e-07, + "loss": 2.5222, + "theoretical_loss": 3.296980210186217, + "tokens_seen": 3296957440 + }, + { + "epoch": 11.01, + "learning_rate": 6.318956870611836e-07, + "loss": 2.4981, + "theoretical_loss": 3.296975267397308, + "tokens_seen": 3297022976 + }, + { + "epoch": 11.01, + "learning_rate": 6.218655967903712e-07, + "loss": 2.4237, + "theoretical_loss": 3.296970324734157, + "tokens_seen": 3297088512 + }, + { + "epoch": 11.01, + "learning_rate": 6.118355065195586e-07, + "loss": 2.328, + "theoretical_loss": 3.2969653821967575, + "tokens_seen": 3297154048 + }, + { + "epoch": 11.01, + "learning_rate": 6.018054162487463e-07, + "loss": 2.4767, + "theoretical_loss": 3.2969604397851047, + "tokens_seen": 3297219584 + }, + { + "epoch": 11.01, + "learning_rate": 5.917753259779338e-07, + "loss": 2.3637, + "theoretical_loss": 3.2969554974991917, + "tokens_seen": 3297285120 + }, + { + "epoch": 11.01, + "learning_rate": 5.817452357071214e-07, + "loss": 2.4554, + "theoretical_loss": 3.2969505553390146, + "tokens_seen": 3297350656 + }, + { + "epoch": 11.01, + "learning_rate": 5.71715145436309e-07, + "loss": 2.5866, + "theoretical_loss": 3.296945613304566, + "tokens_seen": 3297416192 + }, + { + "epoch": 11.01, + "learning_rate": 5.616850551654964e-07, + "loss": 2.3424, + "theoretical_loss": 3.296940671395841, + "tokens_seen": 3297481728 + }, + { + "epoch": 11.01, + "learning_rate": 5.516549648946841e-07, + "loss": 2.1149, + "theoretical_loss": 3.2969357296128337, + "tokens_seen": 3297547264 + }, + { + "epoch": 11.01, + "learning_rate": 5.416248746238716e-07, + "loss": 2.5704, + "theoretical_loss": 3.2969307879555387, + "tokens_seen": 3297612800 + }, + { + "epoch": 11.01, + "learning_rate": 5.315947843530593e-07, + "loss": 2.3847, + "theoretical_loss": 3.29692584642395, + "tokens_seen": 3297678336 + }, + { + "epoch": 11.01, + "learning_rate": 5.215646940822467e-07, + "loss": 2.1976, + "theoretical_loss": 3.296920905018062, + "tokens_seen": 3297743872 + }, + { + "epoch": 11.01, + "learning_rate": 5.115346038114343e-07, + "loss": 2.543, + "theoretical_loss": 3.296915963737869, + "tokens_seen": 3297809408 + }, + { + "epoch": 11.01, + "learning_rate": 5.015045135406219e-07, + "loss": 2.511, + "theoretical_loss": 3.296911022583365, + "tokens_seen": 3297874944 + }, + { + "epoch": 11.01, + "learning_rate": 4.914744232698094e-07, + "loss": 2.5762, + "theoretical_loss": 3.2969060815545452, + "tokens_seen": 3297940480 + }, + { + "epoch": 11.01, + "learning_rate": 4.814443329989971e-07, + "loss": 2.3698, + "theoretical_loss": 3.2969011406514026, + "tokens_seen": 3298006016 + }, + { + "epoch": 11.01, + "learning_rate": 4.714142427281846e-07, + "loss": 2.4597, + "theoretical_loss": 3.296896199873933, + "tokens_seen": 3298071552 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3664221, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1083929538726807, + "objective/train/theoretical_loss": 3.296894964699201, + "objective/train/tokens_used": 3318547936, + "theoretical_loss": 3.296894964699201, + "tokens_seen": 3298087936 + }, + { + "epoch": 11.01, + "learning_rate": 4.613841524573721e-07, + "loss": 2.1804, + "theoretical_loss": 3.296891259222129, + "tokens_seen": 3298137088 + }, + { + "epoch": 11.01, + "learning_rate": 4.513540621865597e-07, + "loss": 2.3823, + "theoretical_loss": 3.2968863186959867, + "tokens_seen": 3298202624 + }, + { + "epoch": 11.01, + "learning_rate": 4.4132397191574727e-07, + "loss": 2.0109, + "theoretical_loss": 3.296881378295499, + "tokens_seen": 3298268160 + }, + { + "epoch": 11.01, + "learning_rate": 4.3129388164493485e-07, + "loss": 2.2854, + "theoretical_loss": 3.296876438020661, + "tokens_seen": 3298333696 + }, + { + "epoch": 11.01, + "learning_rate": 4.2126379137412237e-07, + "loss": 2.3385, + "theoretical_loss": 3.2968714978714666, + "tokens_seen": 3298399232 + }, + { + "epoch": 11.01, + "learning_rate": 4.1123370110330995e-07, + "loss": 2.3976, + "theoretical_loss": 3.2968665578479106, + "tokens_seen": 3298464768 + }, + { + "epoch": 11.01, + "learning_rate": 4.012036108324975e-07, + "loss": 2.4002, + "theoretical_loss": 3.2968616179499866, + "tokens_seen": 3298530304 + }, + { + "epoch": 11.01, + "learning_rate": 3.9117352056168506e-07, + "loss": 2.3801, + "theoretical_loss": 3.296856678177689, + "tokens_seen": 3298595840 + }, + { + "epoch": 11.01, + "learning_rate": 3.8114343029087263e-07, + "loss": 2.5368, + "theoretical_loss": 3.296851738531013, + "tokens_seen": 3298661376 + }, + { + "epoch": 11.01, + "learning_rate": 3.711133400200602e-07, + "loss": 2.3616, + "theoretical_loss": 3.2968467990099524, + "tokens_seen": 3298726912 + }, + { + "epoch": 11.01, + "learning_rate": 3.6108324974924774e-07, + "loss": 2.5889, + "theoretical_loss": 3.296841859614501, + "tokens_seen": 3298792448 + }, + { + "epoch": 11.01, + "learning_rate": 3.510531594784353e-07, + "loss": 2.3937, + "theoretical_loss": 3.2968369203446537, + "tokens_seen": 3298857984 + }, + { + "epoch": 11.01, + "learning_rate": 3.4102306920762284e-07, + "loss": 2.4839, + "theoretical_loss": 3.296831981200404, + "tokens_seen": 3298923520 + }, + { + "epoch": 11.01, + "learning_rate": 3.309929789368104e-07, + "loss": 2.4297, + "theoretical_loss": 3.296827042181748, + "tokens_seen": 3298989056 + }, + { + "epoch": 11.01, + "learning_rate": 3.20962888665998e-07, + "loss": 2.4258, + "theoretical_loss": 3.2968221032886778, + "tokens_seen": 3299054592 + }, + { + "epoch": 11.01, + "learning_rate": 3.109327983951856e-07, + "loss": 2.3299, + "theoretical_loss": 3.2968171645211894, + "tokens_seen": 3299120128 + }, + { + "epoch": 11.01, + "learning_rate": 3.0090270812437316e-07, + "loss": 2.6277, + "theoretical_loss": 3.2968122258792762, + "tokens_seen": 3299185664 + }, + { + "epoch": 11.01, + "learning_rate": 2.908726178535607e-07, + "loss": 2.4105, + "theoretical_loss": 3.296807287362933, + "tokens_seen": 3299251200 + }, + { + "epoch": 11.01, + "learning_rate": 2.808425275827482e-07, + "loss": 2.4864, + "theoretical_loss": 3.2968023489721534, + "tokens_seen": 3299316736 + }, + { + "epoch": 11.01, + "learning_rate": 2.708124373119358e-07, + "loss": 2.4381, + "theoretical_loss": 3.2967974107069327, + "tokens_seen": 3299382272 + }, + { + "epoch": 11.01, + "learning_rate": 2.6078234704112337e-07, + "loss": 2.3298, + "theoretical_loss": 3.2967924725672644, + "tokens_seen": 3299447808 + }, + { + "epoch": 11.01, + "learning_rate": 2.5075225677031095e-07, + "loss": 2.3699, + "theoretical_loss": 3.296787534553143, + "tokens_seen": 3299513344 + }, + { + "epoch": 11.01, + "learning_rate": 2.4072216649949853e-07, + "loss": 2.4824, + "theoretical_loss": 3.296782596664564, + "tokens_seen": 3299578880 + }, + { + "epoch": 11.01, + "learning_rate": 2.3069207622868605e-07, + "loss": 2.5774, + "theoretical_loss": 3.2967776589015196, + "tokens_seen": 3299644416 + }, + { + "epoch": 11.01, + "learning_rate": 2.2066198595787363e-07, + "loss": 2.0439, + "theoretical_loss": 3.2967727212640057, + "tokens_seen": 3299709952 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 3665048, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3698160648345947, + "objective/train/theoretical_loss": 3.2967714868742406, + "objective/train/tokens_used": 3320186336, + "theoretical_loss": 3.2967714868742406, + "tokens_seen": 3299726336 + }, + { + "epoch": 11.01, + "learning_rate": 2.1063189568706119e-07, + "loss": 2.5858, + "theoretical_loss": 3.2967677837520157, + "tokens_seen": 3299775488 + }, + { + "epoch": 11.01, + "learning_rate": 2.0060180541624874e-07, + "loss": 2.426, + "theoretical_loss": 3.2967628463655445, + "tokens_seen": 3299841024 + }, + { + "epoch": 11.01, + "learning_rate": 1.9057171514543632e-07, + "loss": 2.5417, + "theoretical_loss": 3.2967579091045867, + "tokens_seen": 3299906560 + }, + { + "epoch": 11.01, + "learning_rate": 1.8054162487462387e-07, + "loss": 2.4561, + "theoretical_loss": 3.2967529719691355, + "tokens_seen": 3299972096 + }, + { + "epoch": 11.01, + "learning_rate": 1.7051153460381142e-07, + "loss": 2.2859, + "theoretical_loss": 3.2967480349591862, + "tokens_seen": 3300037632 + }, + { + "epoch": 11.01, + "learning_rate": 1.60481444332999e-07, + "loss": 2.3687, + "theoretical_loss": 3.2967430980747325, + "tokens_seen": 3300103168 + }, + { + "epoch": 11.01, + "learning_rate": 1.5045135406218658e-07, + "loss": 2.2316, + "theoretical_loss": 3.296738161315769, + "tokens_seen": 3300168704 + } + ], + "max_steps": 50354, + "num_train_epochs": 9223372036854775807, + "total_flos": 1.684194895724544e+18, + "trial_name": null, + "trial_params": null +}